// MatrixQuantizerGPU.h
#pragma once
#include "QuantizedMatrix.h" // TODO: strangely, this must be included first, although it is the first thing MatrixQuantizer.h includes. Without, nvcc fails.
#include "MatrixQuantizerImpl.h"
#include "ColumnQuantizer.h"
#include "GPUMatrix.h"
#ifndef CPUONLY
#include <cuda_runtime_api.h>
#include <cuda.h>
#endif // !CPUONLY
#include <vector>
#include <memory>
namespace Microsoft { namespace MSR { namespace CNTK {
// MatrixQuantizerImpl subclass that declares the CUDA streams and events used
// to run quantization/unquantization and the associated memory copies.
template <typename ElemType>
class MatrixQuantizerGPU : public MatrixQuantizerImpl<ElemType>
{
public:
    MatrixQuantizerGPU(int deviceId, bool useDedicatedComputeStream, bool forceSync = false);
    ~MatrixQuantizerGPU();

    // Copy/move construction and copy/move assignment are not permitted
    DISABLE_COPY_AND_MOVE(MatrixQuantizerGPU);

    void QuantizeAsync(const Matrix<ElemType>& inMatrix,
                       const Matrix<ElemType>& inResidual,
                       QuantizedMatrix<ElemType>& outQMatrix,
                       Matrix<ElemType>& outResidual,
                       bool zeroThresholdFor1Bit) override;
    void WaitQuantizeAsyncDone() override;

    void UnquantizeAsync(QuantizedMatrix<ElemType>& inQMatrix,
                         Matrix<ElemType>& outMatrix,
                         bool add = false) override;
    void WaitUnquantizeAsyncDone() override;

private:
    // Returns a temporary intermediate matrix on the GPU that holds quantization results
    QuantizedMatrix<ElemType>& GetTempGPUQuantizedMatrix(size_t numRows,
                                                         size_t numCols,
                                                         size_t nBits,
                                                         bool& newlyAllocated);

#ifndef CPUONLY
    // Records an event on the compute stream that flags completion of the
    // quantization/unquantization kernel
    void RecordQuantizeCompleteEvent(cudaStream_t computestream) const;

    // Makes the fetch stream wait for the quantization completion event, then records
    // an event on the fetch stream that flags completion of fetching the quantization
    // results from the GPU
    void SyncQuantizeCompleEventAndFetchAndRecordFetchCompleteEvent(char* cpuBuffer,
                                                                    char* gpuBuffer,
                                                                    size_t size) const;

    // Makes the compute stream wait for the assign completion event, so that subsequent
    // compute stream operations wait for the assign stream operations scheduled so far
    void SyncAssignCompleteEvent(cudaStream_t computestream) const;

    // Scheme for running computation and memcpy concurrently:
    //  - assign to GPU (CPU-to-GPU): started by the CPU once data is read; flags assigncomplete
    //  - GPU-side operation: waits for assigncomplete; flags quantizecomplete
    //  - fetch from GPU: waits for quantizecomplete; flags fetchcomplete
    //  - CPU-side access of buffer: a read waits for fetchcomplete, a write waits for assigncomplete

public:
    static cudaStream_t GetComputeStream(); // the compute stream
    static cudaStream_t GetFetchStream();   // the copy streams (fetch ...
    static cudaStream_t GetAssignStream();  // ... and assign)

private:
    // GPU helper routines
    static void Sync();
    static void SyncStream(cudaStream_t stream);
    static void SyncEvent(cudaEvent_t ev);

    // Streams are static members of the class template
    static cudaStream_t m_computeStream;
    static cudaStream_t m_fetchStream;
    static cudaStream_t m_assignStream;

    // Per-object completion events
    mutable cudaEvent_t m_tempMatrixZeroingCompleteEvent;
    mutable cudaEvent_t m_quantizeCompleteEvent;
    mutable cudaEvent_t m_fetchCompleteEvent;
    mutable cudaEvent_t m_assignCompleteEvent;
#endif // !CPUONLY

private:
    bool m_forceSync;
    bool m_quantizeOpIncludedFetch;

    // Temporary intermediate QuantizedMatrix buffer residing on the GPU
    QuantizedMatrix<ElemType>* m_tempGPUQuantizedMatrix;
};
// Records and synchronizes events on the main GPU matrix
// computation work stream.
class MATH_API GPUMatrixComputeStreamEvent : public MatrixComputeStreamEvent
{
public:
    GPUMatrixComputeStreamEvent(int deviceId);
    ~GPUMatrixComputeStreamEvent();

    void SynchronizeEvent() override;

    template <typename ElemType>
    void SynchronizeQuantizationComputeStreamWithEvent();

    template <typename ElemType>
    void SynchronizeDataTransferFetchStreamWithEvent();

#ifndef CPUONLY
private:
    // CUDA event associated with the main GPU compute stream
    cudaEvent_t m_mainGPUComputeStreamCUDAEvent;
#endif
};
} } }