Content - 9cb397f969b2458857191702a9cd74c358e43dd4 - ad68f9a/Source/Readers/ReaderLib/Indexer.h

swh:1:snp:f50ab94432af916b5fb8b4ad831e8dddded77084

Tip revision: 91c566fd2c99bbe4730106752c24298ee8e285da authored by Amit Agarwal on 04 April 2017, 20:25:02 UTC
CNTK v2 library: Fix trainer to save the model instead of the combined
Tip revision: 91c566f
Indexer.h
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#pragma once

#include <stdint.h>
#include <memory>
#include <vector>
#include "DataDeserializer.h"
#include "CorpusDescriptor.h"

namespace Microsoft { namespace MSR { namespace CNTK {

// Sequence metadata. This text-reader specific descriptor adds two additional
// fields: file offset and size in bytes. Both are required to efficiently
// locate and retrieve a sequence from file, given a sequence descriptor.
struct SequenceDescriptor : SequenceDescription
{
    // Inherited: size_t m_numberOfSamples -- number of samples in the sequence
    // (largest count among all inputs). For text data this equals the number
    // of rows the sequence spans.

    // Offset of the sequence in the input file (in bytes).
    int64_t m_fileOffsetBytes = 0;

    // Size of the sequence in bytes.
    size_t m_byteSize = 0;

    // Initializes the base description from an empty braced list; the two
    // fields declared here start out at their in-class defaults (zero).
    SequenceDescriptor() : SequenceDescription({})
    {
    }
};

// Chunk metadata, similar to the sequence descriptor above,
// but used to facilitate indexing and retrieval of blobs of input data of
// some user-specified size.
struct ChunkDescriptor : ChunkDescription
{
    // Descriptors of the sequences that belong to this chunk.
    // TODO: if we don't want to keep the whole index (metadata for all
    // sequences) in memory, this vector should be left empty while the chunk
    // index is being built and only be filled in when the chunk actually
    // needs to be loaded (the indexer will then have to do a second pass
    // over this chunk).
    std::vector<SequenceDescriptor> m_sequences;

    // Size of the chunk in bytes.
    size_t m_byteSize = 0;

    // Initializes the base description from an empty braced list; the
    // sequence list starts empty and the byte size at zero.
    ChunkDescriptor() : ChunkDescription({})
    {
    }
};

typedef shared_ptr<ChunkDescriptor> ChunkDescriptorPtr;

// A collection of chunk descriptors, each containing
// a collection of sequence descriptors for the corresponding
// chunk of the input data.
// It also stores a mapping of keys into sequence descriptors.
struct Index
{
    std::vector<ChunkDescriptor> m_chunks;                                  // chunks
    std::map<size_t, std::pair<size_t, size_t>> m_keyToSequenceInChunk;     // sequence key -> sequence location in chunk
    const size_t m_maxChunkSize;                                            // maximum chunk size in bytes
    bool m_primary;                                                         // index for primary deserializer

    Index(size_t chunkSize, bool primary) : m_maxChunkSize(chunkSize), m_primary(primary)
    {}

    // Adds sequence (metadata) to the index. Additionally, it
    // assigns an appropriate chunk id to the sequence descriptor,
    // ensures that chunks do not exceed the maximum allowed size
    // (except when a sequence size is greater than the maximum chunk size)
    void AddSequence(SequenceDescriptor& sd)
    {
        assert(!m_chunks.empty());
        ChunkDescriptor* chunk = &m_chunks.back();
        if (chunk->m_byteSize > 0 && (chunk->m_byteSize + sd.m_byteSize) > m_maxChunkSize)
        {
            // Creating a new chunk if the size is exceeded.
            chunk->m_sequences.shrink_to_fit();
            m_chunks.push_back({});
            chunk = &m_chunks.back();
            chunk->m_id = (ChunkIdType)(m_chunks.size() - 1);
            if (CHUNKID_MAX < m_chunks.size())
            {
                RuntimeError("Maximum number of chunks exceeded");
            }
        }

        chunk->m_byteSize += sd.m_byteSize;
        chunk->m_numberOfSequences++;
        chunk->m_numberOfSamples += sd.m_numberOfSamples;
        sd.m_chunkId = chunk->m_id;
        sd.m_indexInChunk = chunk->m_sequences.size();
        if (!m_primary)
        {
            auto location = std::make_pair(chunk->m_id, sd.m_indexInChunk);
            auto sequenceId = sd.m_key.m_sequence;
            m_keyToSequenceInChunk.insert(std::make_pair(sequenceId, location));
        }
        chunk->m_sequences.push_back(sd);
    }

    // Reserves inner structures for the specified number of bytes.
    void Reserve(size_t sizeInBytes)
    {
        if (m_maxChunkSize > 0)
        {
            m_chunks.reserve((sizeInBytes + m_maxChunkSize - 1) / m_maxChunkSize);
        }

        m_chunks.push_back({});
    }

    // Checks if the index is empty.
    bool IsEmpty() const
    {
        return m_chunks.empty();
    }

    DISABLE_COPY_AND_MOVE(Index);
};

// A helper class that does a pass over the input file building up
// an index consisting of sequence and chunk descriptors (which among 
// others specify size and file offset of the respective structure).
// As opposed to the data deserializer, the indexer performs almost no parsing
// and is therefore several orders of magnitude faster.
class Indexer
{
public:
    // file            -- input file to index.
    //                    NOTE(review): ownership/closing of the handle is not visible in this header.
    // isPrimary       -- presumably forwarded to Index (index for the primary deserializer) -- confirm.
    // skipSequenceIds -- when true, the sequence id column is ignored during indexing.
    // streamPrefix    -- character that prefixes a stream (input) name, '|' by default.
    // chunkSize       -- defaults to 32 * 1024 * 1024; presumably the maximum chunk size in bytes -- confirm.
    // bufferSize      -- defaults to 2 * 1024 * 1024; presumably the size of the read buffer in bytes -- confirm.
    Indexer(FILE* file, bool isPrimary, bool skipSequenceIds = false, char streamPrefix = '|', size_t chunkSize = 32 * 1024 * 1024, size_t bufferSize = 2 * 1024 * 1024);

    // Reads the input file, building an index of chunks and corresponding
    // sequences.
    void Build(CorpusDescriptorPtr corpus);

    // Returns input data index (chunk and sequence metadata)
    const Index& GetIndex() const { return m_index; }

    // Returns the m_hasSequenceIds flag.
    // Original description: "True, when input does not have the sequence id column
    // or when sequence id column was ignored during indexing
    // (by passing skipSequenceIds = true to the constructor)."
    // NOTE(review): that wording looks inverted relative to the accessor's name
    // (one would expect true only when ids are present and were not skipped);
    // the flag is set in the implementation file -- confirm and fix the wording.
    bool HasSequenceIds() const { return m_hasSequenceIds; }

private:
    FILE* m_file; // input file being indexed

    int64_t m_fileOffsetStart; // presumably the file offset of the first buffered byte (see GetFileOffset) -- confirm
    int64_t m_fileOffsetEnd;   // presumably the file offset just past the buffered data -- confirm

    std::unique_ptr<char[]> m_buffer; // storage for the read buffer
    const size_t m_bufferSize;
    const char* m_bufferStart;
    const char* m_bufferEnd;
    const char* m_pos; // current read position (a pointer into the buffer)

    bool m_done; // true, when all input was processed

    bool m_hasSequenceIds; // Original description: "true, when input contains one sequence per line
                           // or when sequence id column was ignored during indexing."
                           // NOTE(review): appears inverted relative to the name -- confirm.

    // a collection of chunk descriptors and sequence keys.
    Index m_index;

    const char m_streamPrefix;

    // Fills up the buffer with data from file, all previously buffered data
    // will be overwritten.
    void RefillBuffer();

    // Moves the buffer position to the beginning of the next line.
    void SkipLine();

    // Tries to get numeric sequence id.
    // Throws an exception if a non-numerical character is read before the pipe
    // character, or if EOF is reached without hitting the pipe character.
    // Returns false if no numerical characters are found preceding the pipe.
    // Otherwise, writes sequence id value to the provided reference, returns true.
    bool TryGetNumericSequenceId(size_t& id);

    // Same as above but for symbolic ids.
    // It reads a symbolic key and converts it to numeric id using provided keyToId function.
    bool TryGetSymbolicSequenceId(size_t& id, std::function<size_t(const std::string&)> keyToId);


    // Build a chunk/sequence index, treating each line as an individual sequence.
    // Does not do any sequence parsing, instead uses line number as
    // the corresponding sequence id.
    void BuildFromLines();

    // Returns current offset in the input file (in bytes):
    // m_fileOffsetStart plus the distance of m_pos from the start of the buffer.
    int64_t GetFileOffset() const { return m_fileOffsetStart + (m_pos - m_bufferStart); }

    DISABLE_COPY_AND_MOVE(Indexer);
};

}}}