https://github.com/Microsoft/CNTK
Raw File
Tip revision: c18212eeac755c586f2dbf29fac00eee47bba1f4 authored by Peyman Manikashani on 02 August 2018, 23:13:08 UTC
BN FP16 fix
Tip revision: c18212e
LatticeIndexBuilder.cpp
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#define _CRT_SECURE_NO_WARNINGS
#define _SCL_SECURE_NO_WARNINGS
#include "LatticeIndexBuilder.h"
#include "ReaderUtil.h"

namespace CNTK {

    using namespace std;

    // Constructs an index builder for a lattice file.
    //   latticeFile    - input file, forwarded to the IndexBuilder base class.
    //   latticeToc     - TOC lines, stored in m_latticeToc for Populate() to walk.
    //   corpus         - corpus descriptor, handed to IndexBuilder::SetCorpus.
    //   lastChunkInTOC - stored in m_lastChunkInTOC; Populate() consults it after the last TOC line.
    // Raises InvalidArgument when m_corpus is still null after SetCorpus.
    LatticeIndexBuilder::LatticeIndexBuilder(const FileWrapper& latticeFile, const std::vector<std::string>& latticeToc, CorpusDescriptorPtr corpus, bool lastChunkInTOC)
        : IndexBuilder(latticeFile),
          m_latticeToc(latticeToc),
          m_lastChunkInTOC(lastChunkInTOC)
    {
        IndexBuilder::SetCorpus(corpus);
        IndexBuilder::SetChunkSize(g_64MB);

        if (!m_corpus)
        {
            InvalidArgument("LatticeIndexBuilder: corpus descriptor was not specified.");
        }

        // The deserializer maps sequence keys to locations itself, so this builder
        // marks itself as primary to skip that mapping step.
        m_primary = true;
    }

    // Composes the name of the index cache file: the input file name followed by dot-separated
    // fields for the numeric-keys flag ("1"/"0"), the hash version (or "0" when hashing is
    // disabled), the index builder version prefixed with "v", and the literal "cache".
    // Raises InvalidArgument when caching is enabled but the corpus uses neither numeric
    // sequence keys nor hashing.
    /*virtual*/ wstring LatticeIndexBuilder::GetCacheFilename() /*override*/
    {
        if (m_isCacheEnabled)
        {
            const bool keysAreCacheable = m_corpus->IsNumericSequenceKeys() || m_corpus->IsHashingEnabled();
            if (!keysAreCacheable)
                InvalidArgument("Index caching is not supported for non-numeric sequence keys "
                    "using in a corpus with disabled hashing.");
        }

        const auto numericKeysField = m_corpus->IsNumericSequenceKeys() ? "1" : "0";
        const wstring hashVersionField = m_corpus->IsHashingEnabled() ? std::to_wstring(CorpusDescriptor::s_hashVersion) : L"0";

        wstringstream cacheName;
        cacheName << m_input.Filename() << ".";
        cacheName << numericKeysField << ".";
        cacheName << hashVersionField << ".";
        cacheName << L"v" << IndexBuilder::s_version << ".";
        cacheName << L"cache";
        return cacheName.str();
    }

    // Adds one lattice to the index.
    //   index                   - index that receives the new sequence.
    //   id                      - key stored on the sequence.
    //   byteOffset              - offset at which the lattice ends.
    //   prevSequenceStartOffset - offset at which the lattice starts.
    //   seqKey                  - key reported in the error message (the key that follows this lattice in the TOC).
    // The sample count is the lattice size in bytes divided by sizeof(float), rounded up.
    // Raises LogicError when the lattice is 50000000 bytes or larger.
    void LatticeIndexBuilder::AddSequence(shared_ptr<Index>& index, size_t id, size_t byteOffset, size_t prevSequenceStartOffset, const string& seqKey)
    {
        const size_t latticeBytes = byteOffset - prevSequenceStartOffset;

        // Sanity check: a single lattice is expected to stay below 50MB.
        if (latticeBytes >= 50000000)
        {
            LogicError("ERROR: Lattice with the next key '%s' inside the TOC file '%ls' is larger than 50MB\n", seqKey.c_str(), m_input.Filename().c_str());
        }
        else
        {
            // Express the byte size in floats; a trailing partial float counts as a whole one.
            auto sampleCount = static_cast<uint32_t>(latticeBytes);
            const bool hasPartialFloat = (sampleCount % sizeof(float)) != 0;
            sampleCount = sampleCount / sizeof(float);
            if (hasPartialFloat)
                ++sampleCount;

            IndexedSequence entry;
            entry.SetKey(id)
                .SetNumberOfSamples(sampleCount)
                .SetOffset(prevSequenceStartOffset)
                .SetSize(latticeBytes);
            index->AddSequence(entry);
        }
    }

    // Builds the index of the lattice file from the TOC lines held in m_latticeToc.
    //
    // Every non-empty TOC line must contain '=', '[' and a ']' located after the '['. The text
    // before '=' is the sequence key, and the unsigned integer between the brackets is the byte
    // offset at which that lattice starts. The end of a lattice is only known once the following
    // line has been read, so each line (except the first) registers the *previous* sequence,
    // spanning from its start offset up to the current offset. An offset of 0 on a later line
    // marks a new chunk in the same TOC file; the previous lattice then runs to the end of the
    // input file. When m_lastChunkInTOC is set, the final lattice is likewise closed at the end
    // of the input file.
    //
    // Raises RuntimeError when the TOC is empty, the corpus descriptor is missing, or a line is
    // malformed (missing delimiter, no ']' after '[', or no parsable offset between the brackets).
    /*virtual*/ void LatticeIndexBuilder::Populate(shared_ptr<Index>& index) /*override*/
    {
        m_input.CheckIsOpenOrDie();

        // The size of the input file does not change while indexing, so query it once.
        const auto inputFileSize = filesize(m_input.File());
        index->Reserve(inputFileSize);

        if (m_latticeToc.empty())
            RuntimeError("Lattice TOC is empty");

        if (!m_corpus)
            RuntimeError("LatticeIndexBuilder: corpus descriptor was not specified.");

        size_t prevId{ 0 };
        bool firstLine = true;
        size_t prevSequenceStartOffset{ 0 };
        for (string const& line : m_latticeToc)
        {
            if (line.empty())
                continue;

            const size_t eqLoc = line.find('=');
            const size_t openBracketLoc = line.find('[');
            // Only a ']' that follows the '[' can close the offset field.
            const size_t closeBracketLoc = (openBracketLoc == string::npos)
                ? string::npos
                : line.find(']', openBracketLoc + 1);

            if (eqLoc == string::npos || openBracketLoc == string::npos || closeBracketLoc == string::npos)
                RuntimeError("The lattice TOC line is malformed: %s", line.c_str());

            string seqKey = line.substr(0, eqLoc);

            // substr takes (position, count), so pass the length of the bracketed text
            // rather than the position of the closing bracket.
            const string offsetText = line.substr(openBracketLoc + 1, closeBracketLoc - openBracketLoc - 1);

            // Reject lines whose bracketed text holds no number instead of reading an
            // indeterminate offset.
            size_t byteOffset = 0;
            if (sscanf(offsetText.c_str(), "%zu", &byteOffset) != 1)
                RuntimeError("The lattice TOC line is malformed: %s", line.c_str());

            if (firstLine)
            {
                // Nothing to register yet: the first line only opens the first lattice.
                firstLine = false;
            }
            else if (byteOffset == 0)
            {
                // New chunk in the same toc file: the previous lattice runs to the end of the input file.
                AddSequence(index, prevId, inputFileSize, prevSequenceStartOffset, seqKey);
            }
            else
            {
                AddSequence(index, prevId, byteOffset, prevSequenceStartOffset, seqKey);
            }

            prevId = m_corpus->KeyToId(seqKey);
            prevSequenceStartOffset = byteOffset;
        }

        if (m_lastChunkInTOC)
        {
            // Close the last lattice at the end of the input file.
            AddSequence(index, prevId, inputFileSize, prevSequenceStartOffset, "last_sequence");
        }
    }

}
back to top