// // Copyright (c) Microsoft. All rights reserved. // Licensed under the MIT license. See LICENSE.md file in the project root for full license information. // // This is the main header of the CNTK library API containing the entire public API definition. // #pragma once #ifdef SWIG #define final #define explicit #define static_assert(condition, message) #endif #include "CNTKLibrary.h" /// /// Experimental features in CNTK library. /// Please be aware that these are subject to frequent changes and even removal. /// namespace CNTK { namespace Experimental { }} namespace CNTK { /// /// User defined deserializers. /// /// /// Sequence key, used for correlation of sequences between different deserializers. /// struct KeyType { KeyType() : m_sequence(0), m_sample(0) {} KeyType(size_t sequence, unsigned int sample) : m_sequence(sequence), m_sample(sample) {} size_t m_sequence; /// Sequence id - identifies sequence across different deserializers. unsigned int m_sample;/// Sample id - identifies a sample inside the sequence. /// Used when the deserializer operates in sample mode. }; class Chunk; typedef std::shared_ptr ChunkPtr; typedef unsigned int ChunkIdType; static const ChunkIdType ChunkIdMax = (ChunkIdType)(-1); static const unsigned int SequenceLenMax = (unsigned int)(-1); /// /// Defines main properties of a sequence. /// Sequence descriptions are used by the randomizer to establish a global timeline for complete input. /// A sequence is defined as an ordered set of samples (size == 1 is used for sample training). /// struct SequenceDescription { size_t m_indexInChunk; /// Sequence index in chunk. unsigned int m_numberOfSamples; /// Number of samples in a sequence. ChunkIdType m_chunkId; /// Each sequence belongs to an I/O chunk, how chunk is defined is specific to a /// particular data deserializer (or bundler). The randomizer guarantees to request /// sequences from only limited subset of chunks at any moment in time. KeyType m_key; /// Sequence key, uniquely identifies the sequence. /// When data is coming from different deserializers it is used to correlated sequences /// (the reader will perform a form of SQL join operation on the m_key /// to correlated the data between different streams). }; typedef std::shared_ptr SequenceDescriptionPtr; /// /// Defines sequence data and its layout. /// Currently CNTK supports dense and sparse sequences (csc). /// The storageType in the corresponding stream description identifies what type of SequenceData /// data deserializer or transformer provides. /// The layout of samples are described in the sampleLayout. /// All samples in the sequence should have the same layout. /// struct SequenceDataBase { SequenceDataBase() : m_numberOfSamples(0), m_elementType(DataType::Unknown), m_isValid(true) {} virtual ~SequenceDataBase() = default; unsigned int m_numberOfSamples; /// Number of samples in the sequence /// Returns a pointer to the data buffer. /// The actual size is provided for particular sequences,i.e. see DenseSequenceData, or SparseSequenceData. virtual const void* GetDataBuffer() = 0; // Returns the shape of samples in the sequence. virtual const NDShape& GetSampleShape() = 0; DataType m_elementType; /// Sequence element type. bool m_isValid; /// Flag indicating if sequence is valid. KeyType m_key; /// Sequence key. }; typedef std::shared_ptr SequenceDataPtr; /// /// Dense sequence. Should be returned by the deserializer for streams with storage type StorageType::dense. /// All samples are stored in the 'data' member as a contiguous array. /// struct DenseSequenceData : SequenceDataBase { }; typedef std::shared_ptr DenseSequenceDataPtr; /// /// Sparse sequence. Should be returned by the deserializer for streams with storage type StorageType::csc_sparse. /// All non zero values are store in the 'data' member as a contiguous array. /// The corresponding row indices are stored in 'indices' per sample. /// All samples in the sequence should have the same layout. /// struct SparseSequenceData : SequenceDataBase { SparseIndexType* m_indices; /// an index for every value in the m_data array std::vector m_nnzCounts; /// nnz count for each sample in the sequence SparseIndexType m_totalNnzCount; /// sum of all nzzCounts of all samples /// Using IndexType for both properties above since the nnzCount should fit inside /// the index type (in CSC format, the last value in the column index array == nnzCount) }; typedef std::shared_ptr SparseSequenceDataPtr; /// /// A chunk represents a set of sequences. /// In order to enable efficient IO, the deserializer is asked to load a complete chunk in memory. /// Which chunks to load are controlled by the randomizer. The randomizer guarantees that at any point in time /// only a limited number of chunks is requested from the deserializer and uses for randomization only sequences /// from these chunks. /// /// In case when several deserializers provide data, the chunking of the "primary" deserializer defines /// which chunks are requested by the randomizer. Thus, if the deserializers are "aligned" as how they see chunks, /// the randomizer will access only a limited set. If the data between different randomizers is not aligned - this /// could lead to memory pressure caused by randomly accessed sequences in different chunks in secondary deserializers. /// /// The lifetime of chunk is controlled by the randomizer - when all sequences of the chunk are consumed, the randomizer /// releases the shared pointer to the chunk by that freeing the associated memory. /// Sequences are only pointers to the real data which is allocated on chunk basis. /// class Chunk { public: /// Gets a sequence per input by its index inside the chunk. virtual void GetSequence(size_t sequenceIndex, std::vector& result) = 0; virtual ~Chunk() {}; protected: Chunk() {} }; /// /// Represents a chunk description. /// struct ChunkDescription { ChunkIdType m_id; /// Chunk id. size_t m_numberOfSamples; /// Number of samples in the chunk. size_t m_numberOfSequences; /// Number of sequences in the chunk. }; typedef std::vector ChunkDescriptions; /// /// Interface all data deserializers should implement. /// Data deserializers are intimately familiar with a particular input formats and responsible for bringing /// the serialized data into sequences in memory. Very often data for different streams (i.e. features/lattices) /// reside in the same physical storage (file), so the data deserializer can expose not a single but several /// streams. Examples of data include image data deserializer or htkmlf data deserializer. /// class DataDeserializer { public: /// /// Gets stream descriptions for all streams this deserializer exposes. /// virtual std::vector GetStreamDescriptions() = 0; /// /// Gets chunk descriptions this deserializer exposes. /// virtual ChunkDescriptions GetChunkDescriptions() = 0; /// /// Gets sequence descriptions for a given a chunk. /// virtual void GetSequencesForChunk(ChunkIdType chunkId, std::vector& descriptions) = 0; /// /// Gets sequence description given the sequence description of the primary deserializer. /// Used for deserializers not in driving/primary mode. /// Returns false if the corresponding secondary sequence is not valid. /// virtual bool GetSequenceDescription(const SequenceDescription& primary, SequenceDescription& description) = 0; /// /// Gets chunk data given its id. /// virtual ChunkPtr GetChunk(ChunkIdType chunkId) = 0; virtual ~DataDeserializer() {}; protected: DataDeserializer() {} }; typedef std::shared_ptr DataDeserializerPtr; }