https://github.com/Microsoft/CNTK
Raw File
Tip revision: ea5a77ebd44c2efef7abe96aabd46024fb52e140 authored by Eldar Akchurin on 08 June 2017, 13:39:36 UTC
Exposing user deserializer in Python
Tip revision: ea5a77e
CNTKLibraryExperimental.h
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// This header holds experimental additions to the CNTK library API; the main public API is declared in CNTKLibrary.h.
//

#pragma once

// When SWIG is defined, `final` and `explicit` expand to nothing and
// static_assert(condition, message) expands to an empty statement.
// NOTE(review): presumably a workaround for constructs the SWIG parser in use cannot handle -- confirm.
#ifdef SWIG
#define final
#define explicit
#define static_assert(condition, message)
#endif

#include "CNTKLibrary.h"

///
/// Experimental features in CNTK library. 
/// Please be aware that these are subject to frequent changes and even removal.
///

// The CNTK::Experimental namespace is opened and closed here without declaring anything.
namespace CNTK { namespace Experimental {
}}

namespace CNTK {
    ///
    /// User defined deserializers.
    ///

    ///
    /// Sequence key, used to correlate sequences between different deserializers.
    ///
    struct KeyType
    {
        /// Creates a key with both the sequence id and the sample id set to zero.
        KeyType() : KeyType(0, 0) {}

        /// Creates a key from an explicit sequence id and sample id.
        KeyType(size_t sequence, unsigned int sample)
            : m_sequence(sequence),
              m_sample(sample)
        {
        }

        /// Sequence id - identifies the sequence across different deserializers.
        size_t m_sequence;

        /// Sample id - identifies a sample inside the sequence.
        /// Used when the deserializer operates in sample mode.
        unsigned int m_sample;
    };

    // Forward declaration so that ChunkPtr can be named before Chunk is defined.
    class Chunk;
    typedef std::shared_ptr<Chunk> ChunkPtr;

    // Integral type used to identify a chunk.
    typedef unsigned int ChunkIdType;

    // Largest values representable by the chunk id type and by a sequence length
    // (converting -1 to an unsigned type yields its maximum value).
    static const ChunkIdType ChunkIdMax = static_cast<ChunkIdType>(-1);
    static const unsigned int SequenceLenMax = static_cast<unsigned int>(-1);

    ///
    /// Defines main properties of a sequence.
    /// Sequence descriptions are used by the randomizer to establish a global timeline for complete input.
    /// A sequence is defined as an ordered set of samples (size == 1 is used for sample training).
    ///
    /// NOTE: the struct declares no constructor and no member initializers, so apart from m_key
    /// (zeroed by the KeyType default constructor) the members are indeterminate unless the creator
    /// sets them explicitly or value-initializes the object.
    ///
    struct SequenceDescription
    {
        size_t m_indexInChunk;                     /// Sequence index in chunk.
        unsigned int m_numberOfSamples;            /// Number of samples in a sequence.
        ChunkIdType m_chunkId;                     /// Each sequence belongs to an I/O chunk; how a chunk is defined is specific to a
                                                   /// particular data deserializer (or bundler). The randomizer guarantees to request
                                                   /// sequences from only a limited subset of chunks at any moment in time.
        KeyType m_key;                             /// Sequence key, uniquely identifies the sequence.
                                                   /// When data is coming from different deserializers it is used to correlate sequences
                                                   /// (the reader will perform a form of SQL join operation on the m_key
                                                   /// to correlate the data between different streams).
    };

    /// Shared-ownership handle to a SequenceDescription.
    typedef std::shared_ptr<SequenceDescription> SequenceDescriptionPtr;

    ///
    /// Common base of all sequence data: describes a sequence and gives access to its buffer and layout.
    /// Currently CNTK supports dense and sparse sequences (csc).
    /// The storageType in the corresponding stream description identifies what type of SequenceData
    /// a data deserializer or transformer provides.
    /// The layout of the samples is reported by GetSampleShape().
    /// All samples in the sequence should have the same layout.
    ///
    struct SequenceDataBase
    {
        /// Creates a sequence with zero samples, an unknown element type and the valid flag set.
        /// The key is default-constructed.
        SequenceDataBase()
            : m_numberOfSamples(0),
              m_elementType(DataType::Unknown),
              m_isValid(true)
        {
        }

        virtual ~SequenceDataBase() = default;

        /// Returns a pointer to the data buffer.
        /// The actual size is provided for particular sequences, i.e. see DenseSequenceData or SparseSequenceData.
        virtual const void* GetDataBuffer() = 0;

        /// Returns the shape of samples in the sequence.
        virtual const NDShape& GetSampleShape() = 0;

        /// Number of samples in the sequence.
        unsigned int m_numberOfSamples;

        /// Sequence element type.
        DataType m_elementType;

        /// Flag indicating if sequence is valid.
        bool m_isValid;

        /// Sequence key.
        KeyType m_key;
    };
    typedef std::shared_ptr<SequenceDataBase> SequenceDataPtr;

    ///
    /// Dense sequence. Should be returned by the deserializer for streams with storage type StorageType::dense.
    /// This struct adds no members of its own; the sample data is reached through the
    /// GetDataBuffer() function inherited from SequenceDataBase.
    /// NOTE(review): earlier wording said all samples are stored in a 'data' member as a contiguous
    /// array, but no such member is declared here -- the contiguous layout of the buffer is assumed
    /// from that wording, confirm against implementations.
    ///
    struct DenseSequenceData : SequenceDataBase
    {
    };
    typedef std::shared_ptr<DenseSequenceData> DenseSequenceDataPtr;

    ///
    /// Sparse sequence. Should be returned by the deserializer for streams with storage type StorageType::csc_sparse.
    /// All non zero values are store in the 'data' member as a contiguous array.
    /// The corresponding row indices are stored in 'indices' per sample.
    /// All samples in the sequence should have the same layout.
    ///
    struct SparseSequenceData : SequenceDataBase
    {
        SparseIndexType* m_indices;                /// an index for every value in the m_data array
        std::vector<SparseIndexType> m_nnzCounts;  /// nnz count for each sample in the sequence
        SparseIndexType m_totalNnzCount;           /// sum of all nzzCounts of all samples
                                                   /// Using IndexType for both properties above since the nnzCount should fit inside
                                                   /// the index type (in CSC format, the last value in the column index array == nnzCount)
    };
    typedef std::shared_ptr<SparseSequenceData> SparseSequenceDataPtr;

    ///
    /// A chunk represents a set of sequences.
    /// To enable efficient IO, the deserializer is asked to load a complete chunk into memory.
    /// Which chunks to load is controlled by the randomizer. The randomizer guarantees that at any point in time
    /// only a limited number of chunks is requested from the deserializer, and it uses only sequences
    /// from these chunks for randomization.
    ///
    /// When several deserializers provide data, the chunking of the "primary" deserializer defines
    /// which chunks are requested by the randomizer. Thus, if the deserializers are "aligned" in how they see chunks,
    /// the randomizer will access only a limited set. If the data between different deserializers is not aligned, this
    /// could lead to memory pressure caused by randomly accessed sequences in different chunks in secondary deserializers.
    ///
    /// The lifetime of a chunk is controlled by the randomizer: when all sequences of the chunk are consumed, the randomizer
    /// releases its shared pointer to the chunk, thereby freeing the associated memory.
    /// Sequences are only pointers to the real data, which is allocated on a per-chunk basis.
    ///
    class Chunk
    {
    public:
        /// Gets a sequence per input by its index inside the chunk; the sequences are handed back through 'result'.
        virtual void GetSequence(size_t sequenceIndex, std::vector<SequenceDataPtr>& result) = 0;

        virtual ~Chunk() = default;

    protected:
        /// Only derived classes may construct a chunk.
        Chunk() = default;
    };

    ///
    /// Represents a chunk description: the chunk's id together with its sample and sequence counts.
    /// NOTE: no constructor or member initializers are declared, so all members are indeterminate
    /// unless the creator sets them explicitly or value-initializes the object.
    ///
    struct ChunkDescription
    {
        ChunkIdType m_id;            /// Chunk id.
        size_t m_numberOfSamples;    /// Number of samples in the chunk.
        size_t m_numberOfSequences;  /// Number of sequences in the chunk.
    };

    /// A list of chunk descriptions.
    typedef std::vector<ChunkDescription> ChunkDescriptions;

    ///
    /// Interface that all data deserializers should implement.
    /// Data deserializers are intimately familiar with a particular input format and are responsible for bringing
    /// the serialized data into sequences in memory. Very often data for different streams (i.e. features/lattices)
    /// resides in the same physical storage (file), so a data deserializer can expose not a single but several
    /// streams. Examples include the image data deserializer and the htkmlf data deserializer.
    ///
    class DataDeserializer
    {
    public:
        /// Gets stream descriptions for all streams this deserializer exposes.
        virtual std::vector<StreamInformation> GetStreamDescriptions() = 0;

        /// Gets descriptions of the chunks this deserializer exposes.
        virtual ChunkDescriptions GetChunkDescriptions() = 0;

        /// Gets sequence descriptions for a given chunk; they are handed back through 'descriptions'.
        virtual void GetSequencesForChunk(ChunkIdType chunkId, std::vector<SequenceDescription>& descriptions) = 0;

        /// Gets the sequence description that corresponds to the given sequence description of the
        /// primary deserializer. Used for deserializers not in driving/primary mode.
        /// Returns false if the corresponding secondary sequence is not valid.
        virtual bool GetSequenceDescription(const SequenceDescription& primary, SequenceDescription& description) = 0;

        /// Gets chunk data given its id.
        virtual ChunkPtr GetChunk(ChunkIdType chunkId) = 0;

        virtual ~DataDeserializer() = default;

    protected:
        /// Only derived classes may construct a deserializer.
        DataDeserializer() = default;
    };

    typedef std::shared_ptr<DataDeserializer> DataDeserializerPtr;
}
back to top