#include "stdafx.h"
#include "MatrixQuantizerCPU.h"

namespace Microsoft { namespace MSR { namespace CNTK {

// Constructs a CPU quantizer; the base implementation is bound to CPUDEVICE.
template <class ElemType>
MatrixQuantizerCPU<ElemType>::MatrixQuantizerCPU()
    : MatrixQuantizerImpl<ElemType>(CPUDEVICE)
{
}

// Quantizes inMatrix column by column into outQMatrix.
//
// For every column j, the range statistics are computed from inMatrix and
// inResidual and stored in the quantized column's lower/upper fields, then a
// ColumnQuantizer built from that range writes the column's bits and
// outResidual.
//
// Parameters:
//   inMatrix, inResidual  - inputs handed to the column quantizer
//   outQMatrix            - destination quantized matrix; must be on CPUDEVICE
//   outResidual           - buffer handed to ColumnQuantizer::Quantize
//                           (presumably receives the quantization error - confirm
//                           against ColumnQuantizer)
//   zeroThresholdFor1Bit  - selects the <true>/<false> instantiation of the
//                           range computation and of the quantization kernel
//
// All four matrices must have identical dimensions (checked with assert only).
// Despite the name, the work is done synchronously before this function returns.
template <class ElemType>
void MatrixQuantizerCPU<ElemType>::QuantizeAsync(const Matrix<ElemType>& inMatrix, const Matrix<ElemType>& inResidual, QuantizedMatrix<ElemType>& outQMatrix, Matrix<ElemType>& outResidual, bool zeroThresholdFor1Bit)
{
    // The outQMatrix should be on the CPU
    // TODO: Support transferring the quantization output to a quantized matrix on the GPU
    assert(outQMatrix.GetDeviceId() == CPUDEVICE);

    size_t nBits = outQMatrix.GetNumBits();
    size_t nRow = inMatrix.GetNumRows();
    size_t nCol = inMatrix.GetNumCols();

    // Verify that the different matrix parameters have matching dimensions
    assert((outQMatrix.GetNumRows() == nRow) && (outQMatrix.GetNumCols() == nCol));
    assert((inResidual.GetNumRows() == nRow) && (inResidual.GetNumCols() == nCol));
    assert((outResidual.GetNumRows() == nRow) && (outResidual.GetNumCols() == nCol));

    const size_t ldNbits = ValueQuantizer<ElemType>::ld(nBits);

    // Each iteration touches only column j, so the loop body is shared between
    // the serial loop and the optional PPL parallel_for. Both iterate [0, nCol).
#ifdef QUANTUSEPPL
    Concurrency::parallel_for(static_cast<size_t>(0), nCol, [&](size_t j)
#else
    for (size_t j = 0; j < nCol; j++)
#endif
    {
        auto& qcol = *(outQMatrix.GetQuantizedColumn(j));

        // Range statistics for column j go into qcol.lower / qcol.upper.
        if (zeroThresholdFor1Bit)
        {
            // Explicit use of 'template' keyword is needed to compile with GCC
            ColumnQuantizer<ElemType>::template ComputeRangeStatColj<true>(inMatrix.Data(), inResidual.Data(), static_cast<long>(nRow), j, nBits, qcol.lower, qcol.upper);
        }
        else
        {
            // Explicit use of 'template' keyword is needed to compile with GCC
            ColumnQuantizer<ElemType>::template ComputeRangeStatColj<false>(inMatrix.Data(), inResidual.Data(), static_cast<long>(nRow), j, nBits, qcol.lower, qcol.upper);
        }

        // Quantize column j using the range just computed.
        ColumnQuantizer<ElemType> q(ldNbits, qcol.lower, qcol.upper);
        if (zeroThresholdFor1Bit)
        {
            // Explicit use of 'template' keyword is needed to compile with GCC
            q.template Quantize<true>(inMatrix.Data(), inResidual.Data(), static_cast<long>(nRow), j, qcol.bits, outResidual.Data());
        }
        else
        {
            // Explicit use of 'template' keyword is needed to compile with GCC
            q.template Quantize<false>(inMatrix.Data(), inResidual.Data(), static_cast<long>(nRow), j, qcol.bits, outResidual.Data());
        }
    }
#ifdef QUANTUSEPPL
    );
#endif
}

// Completion hook paired with QuantizeAsync.
template <class ElemType>
void MatrixQuantizerCPU<ElemType>::WaitQuantizeAsyncDone()
{
    // TODO: Currently this is a no-op since the actual quantization is synchronous
}

// Unquantizes an entire matrix, calling Unquantize() for each column.
//
// Parameters:
//   inQMatrix - quantized input; must be on CPUDEVICE
//   outMatrix - destination; must be on CPUDEVICE and match inQMatrix's dimensions
//   add       - forwarded unchanged to ColumnQuantizer::Unquantize (presumably
//               accumulate into outMatrix instead of overwriting - confirm)
//
// Despite the name, the work is done synchronously before this function returns.
template <class ElemType>
void MatrixQuantizerCPU<ElemType>::UnquantizeAsync(QuantizedMatrix<ElemType>& inQMatrix, Matrix<ElemType>& outMatrix, bool add /*= false*/)
{
    // The inQMatrix and outMatrix should be on the CPU
    assert(inQMatrix.GetDeviceId() == CPUDEVICE);
    assert(outMatrix.GetDeviceId() == CPUDEVICE);

    size_t nBits = inQMatrix.GetNumBits();
    size_t nRow = inQMatrix.GetNumRows();
    size_t nCol = inQMatrix.GetNumCols();

    // Verify that the different matrix parameters have matching dimensions
    assert((outMatrix.GetNumRows() == nRow) && (outMatrix.GetNumCols() == nCol));

    const size_t ldNbits = ValueQuantizer<ElemType>::ld(nBits);

    // Same serial/PPL loop shape as QuantizeAsync; both iterate [0, nCol).
#ifdef QUANTUSEPPL
    Concurrency::parallel_for(static_cast<size_t>(0), nCol, [&](size_t j)
#else
    for (size_t j = 0; j < nCol; j++)
#endif
    {
        const auto& qcol = *(inQMatrix.GetQuantizedColumn(j));
        ColumnQuantizer<ElemType> q(ldNbits, qcol.lower, qcol.upper);
        q.Unquantize(outMatrix.Data(), static_cast<long>(nRow), j, qcol.bits, add);
    }
#ifdef QUANTUSEPPL
    );
#endif
}

// Completion hook paired with UnquantizeAsync.
template <class ElemType>
void MatrixQuantizerCPU<ElemType>::WaitUnquantizeAsyncDone()
{
    // TODO: Currently this is a no-op since the actual quantization is synchronous
}

// Top-K selection is not available on the CPU; calls RuntimeError.
// The parameters are left unnamed under MSVC because the body does not use them.
template <class ElemType>
#if defined(_MSC_VER)
void MatrixQuantizerCPU<ElemType>::TopKAsync(const Matrix<ElemType>& , const Matrix<ElemType>& , struct stream &, Matrix<ElemType>& , int )
#else
void MatrixQuantizerCPU<ElemType>::TopKAsync(const Matrix<ElemType>& inMatrix, const Matrix<ElemType>& inResidual, struct stream &sendbuf, Matrix<ElemType>& outResidual, int topK)
#endif
{
    RuntimeError("Not implemented.");
}

// Not available on the CPU; calls RuntimeError.
template <class ElemType>
void MatrixQuantizerCPU<ElemType>::WaitTopKAsyncDone()
{
    RuntimeError("Not implemented.");
}

// Inverse of TopKAsync; not available on the CPU, calls RuntimeError.
// The parameters are left unnamed under MSVC because the body does not use them.
template <class ElemType>
#if defined(_MSC_VER)
void MatrixQuantizerCPU<ElemType>::UnTopKAsync(struct stream &, Matrix<ElemType>& )
#else
void MatrixQuantizerCPU<ElemType>::UnTopKAsync(struct stream &recvbuf, Matrix<ElemType>& outMatrix)
#endif
{
    RuntimeError("Not implemented.");
}

// Not available on the CPU; calls RuntimeError.
template <class ElemType>
void MatrixQuantizerCPU<ElemType>::WaitUnTopKAsyncDone()
{
    RuntimeError("Not implemented.");
}

// The explicit instantiation part will make the linker happy
template class MatrixQuantizerCPU<float>;
template class MatrixQuantizerCPU<double>;

}}}