https://github.com/Microsoft/CNTK
Tip revision: 6f5de8dab6ae70c320e087914813477273b3d46a authored by Vadim Mazalov on 16 December 2016, 08:15:40 UTC
Make momentum(PerMB|PerSample|AsTimeConstant) double for increased precision
Make momentum(PerMB|PerSample|AsTimeConstant) double for increased precision
Tip revision: 6f5de8d
DataDeserializer.h
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include <vector>
#include "Reader.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// Sequence key, used to correlate sequences between different deserializers.
// TODO: In many cases sequence keys share the same prefix. Splitting the sequence key into a
// sequence prefix and suffix would allow us to store keys more efficiently.
// m_sample identifies a particular sample inside the sequence. In the future it will be hidden, so that
// deserializers won't know about sequence or sample mode, exposing only sequences.
// The two bit-fields below are declared 40 and 24 bits wide, i.e. 64 bits in total.
struct KeyType
{
// Possible sequence common prefix.
// size_t m_prefix;
// Identifies sequence between different deserializers (40-bit field).
size_t m_sequence : 40;
// Sample id (24-bit field).
size_t m_sample : 24;
};
class Chunk;

// Shared-ownership handle to a chunk.
using ChunkPtr = std::shared_ptr<Chunk>;

// Integral type used to identify chunks.
using ChunkIdType = uint32_t;

// All-ones values, i.e. the largest value representable by ChunkIdType / uint32_t respectively.
#define CHUNKID_MAX (static_cast<ChunkIdType>(-1))
#define SEQUENCELEN_MAX (static_cast<uint32_t>(-1))
// Describes the main properties of a single sequence.
// The randomizer uses sequence descriptions to establish a global timeline over the complete input.
// A sequence is an ordered set of samples; a sequence of size 1 is used for sample-based training.
struct SequenceDescription
{
    // Sequence id, uniquely identifies the sequence.
    size_t m_id;

    // Number of samples in the sequence.
    uint32_t m_numberOfSamples;

    // Id of the I/O chunk the sequence belongs to. How a chunk is defined is specific to a
    // particular data deserializer (or bundler). The randomizer guarantees to request
    // sequences from only a limited subset of chunks at any moment in time.
    ChunkIdType m_chunkId;

    // Sequence key, used to correlate sequences between different deserializers.
    KeyType m_key;
};

using SequenceDescriptionPtr = std::shared_ptr<SequenceDescription>;
// Base type for sequence data and its layout.
// Currently CNTK supports dense and sparse (csc) sequences.
// The storage type in the corresponding stream description identifies which kind of
// SequenceData a data deserializer or transformer provides.
// The layout of the samples is described by m_sampleLayout;
// all samples in a sequence should have the same layout.
// TODO: add type casts (As<T>() or AsRef<>() or AsPtr<>()) to subclasses as members here.
struct SequenceDataBase
{
    // Starts with id 0, zero samples and the tvariant element type;
    // the chunk and sample layout pointers are left default-constructed.
    SequenceDataBase()
        : m_id(0),
          m_numberOfSamples(0),
          m_elementType(ElementType::tvariant)
    {
    }

    virtual ~SequenceDataBase() = default;

    // Returns a pointer to the data buffer.
    // The actual size is provided by the particular sequence type,
    // see e.g. DenseSequenceData or SparseSequenceData.
    virtual const void* GetDataBuffer() = 0;

    // Sequence id.
    size_t m_id;

    // Number of samples in the sequence.
    uint32_t m_numberOfSamples;

    // Shared pointer to the chunk associated with this sequence.
    ChunkPtr m_chunk;

    // Sequence element type.
    ElementType m_elementType;

    // Sample layout, can be shared by several sequences.
    TensorShapePtr m_sampleLayout;
};

using SequenceDataPtr = std::shared_ptr<SequenceDataBase>;
// Dense sequence. Should be returned by the deserializer for streams with storage type StorageType::dense.
// This type declares no members of its own; the samples are reached through the inherited
// GetDataBuffer() and are expected to be stored there as one contiguous array.
struct DenseSequenceData : SequenceDataBase {};

using DenseSequenceDataPtr = std::shared_ptr<DenseSequenceData>;
// Sparse sequence. Should be returned by the deserializer for streams with storage type StorageType::csc_sparse.
// All non-zero values are expected to be stored as one contiguous array in the data buffer
// (see the inherited GetDataBuffer()).
// The corresponding row indices are stored in m_indices, per sample.
// All samples in the sequence should have the same layout.
//
// m_indices and m_totalNnzCount carry in-class initializers so that a subclass which does not
// set them explicitly starts from well-defined values (null / zero) instead of indeterminate ones.
struct SparseSequenceData : SequenceDataBase
{
    // An index for every value in the data buffer.
    // Raw pointer: ownership is not expressed by this type.
    IndexType* m_indices = nullptr;

    // nnz (number of non-zero values) count for each sample in the sequence.
    std::vector<IndexType> m_nnzCounts;

    // Sum of the nnz counts of all samples.
    IndexType m_totalNnzCount = 0;

    // IndexType is used for both counts above since an nnz count should fit inside
    // the index type (in CSC format, the last value in the column index array == nnzCount).
};

using SparseSequenceDataPtr = std::shared_ptr<SparseSequenceData>;
// A chunk represents a set of sequences.
// To enable efficient I/O, the deserializer is asked to load a complete chunk into memory.
// Which chunks get loaded is controlled by the randomizer. The randomizer guarantees that at any point
// in time only a limited number of chunks is requested from the deserializer, and that only sequences
// from these chunks are used for randomization.
//
// When several deserializers provide data, the chunking of the "primary" deserializer defines
// which chunks are requested by the randomizer. Thus, if the deserializers are "aligned" in how they
// see chunks, the randomizer will access only a limited set. If the data of the different deserializers
// is not aligned, this could lead to memory pressure caused by randomly accessed sequences in different
// chunks of the secondary deserializers.
//
// The lifetime of a chunk is controlled by the randomizer: when all sequences of the chunk are consumed,
// the randomizer releases its shared pointer to the chunk, thereby freeing the associated memory.
// Sequences are only pointers to the real data, which is allocated on a per-chunk basis.
class Chunk
{
public:
    // Gets a sequence per input by its identifier.
    //   sequenceId - identifier of the requested sequence.
    //   result     - receives the sequence data, one entry per input.
    // Each returned sequence holds a reference to the corresponding chunk, so the chunk is not
    // deallocated until all of its sequences have been released.
    virtual void GetSequence(size_t sequenceId, std::vector<SequenceDataPtr>& result) = 0;

    virtual ~Chunk() = default;

protected:
    // Only subclasses may construct a chunk.
    Chunk() {}

private:
    DISABLE_COPY_AND_MOVE(Chunk);
};
// Describes a single chunk.
struct ChunkDescription
{
    ChunkIdType m_id;           // Chunk id.
    size_t m_numberOfSamples;   // Number of samples in the chunk.
    size_t m_numberOfSequences; // Number of sequences in the chunk.
};

using ChunkDescriptionPtr = std::shared_ptr<ChunkDescription>;
using ChunkDescriptions = std::vector<ChunkDescriptionPtr>;
//////////////////////////////////////////////////////////////////////////////////////////////////
// Interface that all data deserializers should implement.
// A data deserializer is intimately familiar with a particular input format and is responsible for
// bringing the serialized data into sequences in memory. Very often the data for different streams
// (e.g. features/lattices) resides in the same physical storage (file), so a data deserializer can
// expose not just a single stream but several.
// Examples include the image data deserializer and the htkmlf data deserializer.
// TODO: This interface will become ABI and deserializers can be implemented in different languages, i.e. Python.
//////////////////////////////////////////////////////////////////////////////////////////////////
class IDataDeserializer
{
public:
    // Returns the descriptions of all streams this deserializer exposes.
    virtual std::vector<StreamDescriptionPtr> GetStreamDescriptions() const = 0;

    // Returns the descriptions of the chunks this deserializer exposes.
    virtual ChunkDescriptions GetChunkDescriptions() = 0;

    // Retrieves the sequence descriptions for the given chunk.
    //   chunkId      - id of the chunk to describe.
    //   descriptions - receives the sequence descriptions of that chunk.
    virtual void GetSequencesForChunk(ChunkIdType chunkId, std::vector<SequenceDescription>& descriptions) = 0;

    // Retrieves the sequence description that corresponds to the given sequence description
    // of the primary deserializer. Used for deserializers not in driving/primary mode.
    //   primary     - sequence description coming from the primary deserializer.
    //   description - receives the corresponding description of this deserializer.
    // Returns false if the corresponding secondary sequence is not valid.
    // TODO: Possibly move this out into a separate interface.
    virtual bool GetSequenceDescription(const SequenceDescription& primary, SequenceDescription& description) = 0;

    // Returns the chunk data for the given chunk id.
    virtual ChunkPtr GetChunk(ChunkIdType chunkId) = 0;

    virtual ~IDataDeserializer() = default;
};

using IDataDeserializerPtr = std::shared_ptr<IDataDeserializer>;
}}}