Content - a55727adae6260e6856e124a123d4995a303384e

Permalink
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// This is the main header of the CNTK library API containing the entire public API definition. 
//

#pragma once

#ifdef SWIG
#define final
#define explicit
#define static_assert(condition, message)
#endif

#include "CNTKLibrary.h"

///
/// Experimental features in CNTK library. 
/// Please be aware that these are subject to frequent changes and even removal.
///

namespace CNTK {
    ///
    ///  Sequence key, used to correlate sequences across different deserializers.
    ///
    struct SequenceKey
    {
        SequenceKey() : SequenceKey(0, 0) {}
        SequenceKey(size_t sequence, unsigned int sample) : m_sequence(sequence), m_sample(sample) {}

        size_t m_sequence;    /// Sequence id - identifies sequence across different deserializers.
        unsigned int m_sample;/// Sample id - identifies a sample inside the sequence.
                              /// Used when the deserializer operates in sample mode.
    };

    typedef unsigned int ChunkIdType;

    static const ChunkIdType ChunkIdMax = (ChunkIdType)(-1);
    static const unsigned int SequenceLenMax = (unsigned int)(-1);

    ///
    /// Defines main properties of a sequence.
    /// Sequence information is used by the randomizer to establish a global timeline for complete input.
    /// A sequence is defined as an ordered collection of samples.
    ///
    struct SequenceInfo
    {
        size_t m_indexInChunk;                     /// Sequence index in chunk.
        unsigned int m_numberOfSamples;            /// Number of samples in a sequence.
        ChunkIdType m_chunkId;                     /// Each sequence belongs to an I/O chunk, how chunk is defined is specific to a
                                                   /// particular data deserializer (or bundler). The randomizer guarantees to request
                                                   /// sequences from only limited subset of chunks at any moment in time.
        SequenceKey m_key;                         /// Sequence key, uniquely identifies the sequence.
                                                   /// When data is coming from different deserializers it is used to correlated sequences
                                                   /// (the reader will perform a form of SQL join operation on the m_key
                                                   /// to correlated the data between different streams).
    };

    ///
    /// Defines sequence data and its layout.
    /// Currently CNTK supports dense and sparse sequences (csc).
    /// The storageType in the corresponding stream information identifies what type of SequenceData
    /// data deserializer or transformer provides.
    /// The layout of samples are described in the sampleLayout.
    /// All samples in the sequence should have the same layout.
    ///
    struct SequenceDataBase
    {
        SequenceDataBase(unsigned int numberOfSamples, bool isValid)
            : m_numberOfSamples(numberOfSamples), m_elementType(DataType::Unknown), m_isValid(isValid)
        {}

        virtual ~SequenceDataBase() = default;

        // Returns the shape of samples in the sequence.
        virtual const NDShape& GetSampleShape() = 0;

        // Returns a pointer to internal data buffer.
        virtual const void* GetDataBuffer() = 0;

        unsigned int m_numberOfSamples;  /// Number of samples in the sequence

        DataType m_elementType;   /// Sequence element type.
        bool m_isValid;           /// Flag indicating if sequence is valid.
        SequenceKey m_key;        /// Sequence key.
    };
    typedef std::shared_ptr<SequenceDataBase> SequenceDataPtr;

    ///
    /// Dense sequence. Should be returned by the deserializer for streams with storage type StorageType::dense.
    /// All samples are stored in the 'data' member as a contiguous array.
    ///
    struct DenseSequenceData : SequenceDataBase
    {
        DenseSequenceData(unsigned int numberOfSamples = 0, bool isValid = true)
            : SequenceDataBase(numberOfSamples, isValid)
        {}
    };
    typedef std::shared_ptr<DenseSequenceData> DenseSequenceDataPtr;

    ///
    /// Sparse sequence. Should be returned by the deserializer for streams with storage type StorageType::csc_sparse.
    /// All non zero values are stored in the m_data member as a contiguous array.
    /// The corresponding row indices are stored in m_indices.
    /// All samples in the sequence should have the same layout.
    ///
    struct SparseSequenceData : SequenceDataBase
    {
        SparseSequenceData(unsigned int numberOfSamples = 0, bool isValid = true)
            : SequenceDataBase(numberOfSamples, isValid)
        {}

        SparseIndexType* m_indices {0};            /// an index for every value in the m_data array
        std::vector<SparseIndexType> m_nnzCounts;  /// nnz count for each sample in the sequence
        SparseIndexType m_totalNnzCount {0};       /// sum of all nzzCounts of all samples
                                                   /// Using IndexType for both properties above since the nnzCount should fit inside
                                                   /// the index type (in CSC format, the last value in the column index array == nnzCount)
    };
    typedef std::shared_ptr<SparseSequenceData> SparseSequenceDataPtr;

    ///
    /// A chunk represents a set of sequences.
    /// In order to enable efficient IO, the deserializer is asked to load a complete chunk in memory.
    /// Which chunks to load is controlled by the randomizer. The randomizer guarantees that at any point in time
    /// only a limited number of chunks are kept in memory. It uses only sequences from these chunks for randomization.
    ///
    /// In case when several deserializers provide data, the chunking of the "primary" deserializer defines
    /// which chunks are requested by the randomizer. If the order of sequences in the deserializers is the same,
    /// the randomizer will access only a limited set of chunks at any point in time. 
    /// If the sequences across the deserializers are not aligned, it can lead to increased memory pressure
    /// caused by randomly accessed sequences in different chunks in secondary deserializers.
    ///
    /// The lifetime of a chunk is controlled by the randomizer - when all sequences of the chunk are consumed, the randomizer
    /// releases the shared pointer to the chunk releasing the associated memory.
    ///
    class Chunk
    {
    public:
        ///
        /// Gets data for the sequence with the given index.
        /// result contains a SequenceDataPtr for every input stream declared by the
        /// deserializer that produced this chunk.
        ///
        virtual void GetSequence(size_t sequenceIndex, std::vector<SequenceDataPtr>& result) = 0;

        virtual ~Chunk() = default;

    protected:
        Chunk() = default;
    };
    typedef std::shared_ptr<Chunk> ChunkPtr;

    ///
    /// Meta information of a chunk.
    ///
    struct ChunkInfo
    {
        ChunkIdType m_id;            /// Chunk id.
        size_t m_numberOfSamples;    /// Number of samples in the chunk.
        size_t m_numberOfSequences;  /// Number of sequences in the chunk.
    };

    ///
    /// Interface all data deserializers should implement.
    /// Data deserializers are intimately familiar with a particular input formats and responsible for bringing
    /// the serialized data into sequences in memory. Very often data for different streams (i.e. features/lattices)
    /// reside in the same physical storage (file), so the data deserializer can expose not a single but several
    /// streams. Examples of data include image data deserializer or htkmlf data deserializer.
    ///
    class DataDeserializer
    {
    public:
        ///
        /// Gets stream information for all streams this deserializer exposes.
        ///
        virtual std::vector<StreamInformation> StreamInfos() = 0;

        ///
        /// Gets metadata for chunks this deserializer exposes.
        ///
        virtual std::vector<ChunkInfo> ChunkInfos() = 0;

        ///
        /// Gets sequence infos for a given a chunk.
        ///
        virtual void SequenceInfosForChunk(ChunkIdType chunkId, std::vector<SequenceInfo>& result) = 0;

        ///
        /// Gets sequence information given the one of the primary deserializer.
        /// Used for non-primary deserializers.
        /// Returns false if the corresponding secondary sequence is not valid.
        ///
        virtual bool GetSequenceInfo(const SequenceInfo& primary, SequenceInfo& result) = 0;

        ///
        /// Gets chunk data given its id.
        ///
        virtual ChunkPtr GetChunk(ChunkIdType chunkId) = 0;

        virtual ~DataDeserializer() = default;

    protected:
        DataDeserializer() = default;
    };

    typedef std::shared_ptr<DataDeserializer> DataDeserializerPtr;
}