https://github.com/Microsoft/CNTK
Tip revision: 8fef21be1d80b02849548dd05993abcebafd4837 authored by Dong Yu on 09 August 2016, 21:51:45 UTC
support minibatchSize (for larger models that will be out of memory without it) and frameMode (for RNNNode based Windowed RNN) for HTKMLFWriter (and HTKMLFReader).
ColumnQuantizer.h
#ifndef __COLUMN_QUANTIZER_H__
#define __COLUMN_QUANTIZER_H__
#include "ValueQuantizer.h"
#include <math.h>
#pragma warning(disable : 4127) // conditional expression is constant
namespace Microsoft { namespace MSR { namespace CNTK {
#define ColMIDX(i, j, numRow) (((j) * (numRow)) + (i)) // 0 based indexing for column major
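// e.g., for a 4-row column-major matrix, element (i = 2, j = 1) lives at flat offset ColMIDX(2, 1, 4) = 1 * 4 + 2 = 6 (illustrative numbers)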
// ---------------------------------------------------------------------------
// Class to perform columnwise quantization/unquantization
//
// The quantization of a column is performed in 2 steps
// a) Compute the values used for unquantizing/reconstructing the quantized values. This is done by computing a pair of
// values that specify the range of the reconstructed unquantized values, such that the aggregate quantization error is minimized.
// b) Perform the actual quantization by quantizing each value in the column to an integer of the size of
// the specified number of bits and then packing these integer bits into the quantized matrix storage
// ---------------------------------------------------------------------------
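// Illustrative usage sketch (not part of this header; 'mat', 'residual', 'recon', 'qColBits',
// 'bits', 'logNbits', 'M' and 'j' are hypothetical caller-side variables):
//
//   float lower, upper;
//   // step (a): determine the reconstruction range for column j
//   ColumnQuantizer<float>::ComputeRangeStatColj<false>(mat, residual, M, j, bits, lower, upper);
//   ColumnQuantizer<float> q(logNbits, lower, upper); // logNbits == log2(bits)
//   // step (b): pack column j into qColBits (QWordsPerCol(M, bits) QWords) and update the residual
//   q.Quantize<false>(mat, residual, M, j, qColBits, residual);
//   q.Unquantize(recon, M, j, qColBits, /*add=*/false); // reconstruct the column on the receiver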
template <class ElemType>
class ColumnQuantizer
{
typedef typename ValueQuantizer<ElemType>::QWord QWord;
typedef typename ValueQuantizer<ElemType>::QWordVal QWordVal;
static const size_t QWordNumBits = ValueQuantizer<ElemType>::QWordNumBits;
public:
cudacode ColumnQuantizer(size_t logNbits, ElemType lower, ElemType upper)
: valQ(logNbits, lower, upper)
{
}
// compute #QWords per column of a given height
static size_t QWordsPerCol(size_t rows, size_t Nbits)
{
const size_t valsPerQWord = QWordNumBits / Nbits;
return (rows + valsPerQWord - 1) / valsPerQWord;
}
size_t QWordsPerCol(size_t rows) const
{
return QWordsPerCol(rows, valQ.NBits());
}
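// Worked example, assuming a 32-bit QWord (a 64-bit QWord roughly halves these counts):
//   at 1 bit, valsPerQWord = 32, so a 1100-row column needs (1100 + 31) / 32 = 35 QWords
//   (the "35 ints" mentioned in Quantize() below); at 2 bits, valsPerQWord = 16 and the
//   same column needs (1100 + 15) / 16 = 69 QWords.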
// quantize a matrix column into qcoldata
// The current value of 'inResidual' is added to the matrix, and 'outResidual' gets updated with the new residual;
// inResidual = outResidual is allowed (intended)
template <bool ZeroThresholdFor1Bit>
cudacode void Quantize(const ElemType* inMat, const ElemType* inResidual, long M, size_t j, QWord* qColBits, ElemType* outResidual) const
{
// we loop over QWord values
// E.g. there are 35 ints for a 1100-dim column (at 1-bit quantization).
// For better CUDA memory coalescing, we interleave memory such that computing consecutive ints triggers consecutive memory accesses
// (although for the CPU side, it breaks caching; we could do in-place op)
// E.g., int 0 accesses elements 0, 35, 70, etc.
// while int 1 accesses elements 1, 36, 71, etc
// up to int 34 accesses elements 34, 69, 104, etc.
const size_t numQWordsPerCol = QWordsPerCol(M);
for (size_t iQWord = 0; iQWord < numQWordsPerCol; iQWord++)
{
qColBits[iQWord] = QuantizeOneQWord<ZeroThresholdFor1Bit>(inMat, inResidual, M, iQWord, M, numQWordsPerCol, j, outResidual);
}
}
// unquantize a matrix column from qcoldata
// If 'add' then add to the existing content of the matrix (this is a common thing to do; saves a buffer).
cudacode void Unquantize(ElemType* outMat, long M, size_t j, const QWord* qColBits, bool add) const
{
// loop over QWord values
const size_t numQWordsPerCol = QWordsPerCol(M);
for (size_t iQWord = 0; iQWord < numQWordsPerCol; iQWord++)
{
UnquantizeOneQWord(outMat, M, iQWord, M, numQWordsPerCol, j, qColBits[iQWord], add);
}
}
// workaround for not being able to declare a default argument for lambda parameters
template <bool ZeroThresholdFor1Bit>
static cudacode void ComputeRangeStatColj(const ElemType* inMat, const ElemType* inResidual, long M, size_t j, size_t bits, ElemType& lower, ElemType& upper)
{
/*dummy reducers do nothing in linear CPU version*/
ComputeRangeStatColjSubset<ZeroThresholdFor1Bit>(inMat, inResidual, M, j, bits, lower, upper, 0, 1, [](ElemType&)
{
},
[](unsigned int&)
{
});
}
public:
// quantize the values inMat[rowStart, colIdx], inMat[rowStart + rowStride, colIdx], inMat[rowStart + 2 * rowStride, colIdx], ... and pack them into a QWord
// Note: this strided access pattern may look unintuitive, but it is what makes the GPU accesses coalesced (see the interleaving comment in Quantize() above)
template <bool ZeroThresholdFor1Bit>
cudacode QWord QuantizeOneQWord(
const ElemType* inMat, const ElemType* inResidual,
long M,
size_t rowStart, size_t rowEnd, size_t rowStride,
size_t colIdx,
ElemType* outResidual) const
{
QWord bitBuf = 0;
if ((valQ.NBits() == 1) && (inResidual == outResidual) /*in-place*/)
{
ElemType val0 = valQ.Unquantize(0);
ElemType val1 = valQ.Unquantize(1);
size_t ij = ColMIDX(rowStart, colIdx, M);
const ElemType* usibj = inMat + ij;
const ElemType* usibjend = usibj + (rowEnd - rowStart);
ElemType* resibj = outResidual + ij;
// we know that the range covers at most the number of bits in a 'QWord'
for (QWord bitmask = 1; usibj < usibjend; bitmask <<= 1, usibj += rowStride, resibj += rowStride)
{
// quantize --we access element (i,j) through the three increasing pointers
ElemType val = *usibj + *resibj;
// Explicit use of 'template' keyword is needed to compile with GCC
bool qval = valQ.template Quantize1<ZeroThresholdFor1Bit>(val);
if (qval)
{
bitBuf |= bitmask;
}
// compute residual
ElemType uval = valQ.Unquantize1(qval, val0, val1);
*resibj = val - uval;
}
}
else
{
// pack Nbits-wide values into the QWord until it is full or the column ends
size_t i = rowStart;
for (size_t k = 0; (k < QWordNumBits) && (i < rowEnd); k += valQ.NBits(), i += rowStride)
{
// quantize
size_t ij = ColMIDX(i, colIdx, M);
ElemType val = inMat[ij] + inResidual[ij];
// Explicit use of 'template' keyword is needed to compile with GCC
QWordVal qval = valQ.template Quantize<ZeroThresholdFor1Bit>(val);
// compute residual
ElemType uval = valQ.Unquantize(qval);
ElemType r = val - uval;
outResidual[ij] = r;
bitBuf = bitBuf | (qval << k);
}
}
return bitBuf;
}
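// Packing illustration for the loop above (hypothetical 2-bit quantization): the non-1-bit
// branch lays the quantized values out as bitBuf = q0 | (q1 << 2) | (q2 << 4) | ..., i.e.
// quantized value number n occupies bits [n * Nbits, (n + 1) * Nbits) of the QWord.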
// unquantize one QWord of a quantized matrix column
cudacode void UnquantizeOneQWord(
ElemType* us, long M,
size_t rowStart, size_t rowEnd, size_t rowStride,
size_t colIdx, QWord bitBuf, bool add) const
{
// special case for 1 bit
if (valQ.NBits() == 1)
{
ElemType val0 = valQ.Unquantize(0);
ElemType val1 = valQ.Unquantize(1);
size_t ij = ColMIDX(rowStart, colIdx, M);
ElemType* usibj = us + ij;
const ElemType* usibjend = usibj + (rowEnd - rowStart);
for (; usibj < usibjend; usibj += rowStride)
{
// get value
// bitbuf is shifted in-place
bool qval = (bitBuf & 1) != 0;
// and get bitbuf into next position
bitBuf >>= 1;
// unquantize
ElemType val = ValueQuantizer<ElemType>::Unquantize1(qval, val0, val1);
if (add)
{
val += *usibj;
}
*usibj = val;
}
}
else
{
// (QuanRangeEnd() MUST be a power of two; this is ensured by constructing from logNbits)
const QWordVal bitmask = valQ.QuanRangeEnd() - 1;
size_t i = rowStart;
for (size_t k = 0; (k < QWordNumBits) && (i < rowEnd); k += valQ.NBits(), i += rowStride)
{
// get value
const QWordVal qval = (bitBuf >> k) & bitmask; // % 2^Nbits
// unquantize
ElemType val = valQ.Unquantize(qval);
size_t ij = ColMIDX(i, colIdx, M);
if (add)
{
val += us[ij];
}
us[ij] = val;
}
}
}
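// Extraction example for the loop above (hypothetical Nbits == 2): QuanRangeEnd() == 4 and
// bitmask == 0x3, so for bitBuf == ...11011001 (binary), k == 0 yields qval == 0b01 and
// k == 2 yields qval == 0b10.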
// determine quantization range of one column
// This code is written so that it can run in parallel threads on CUDA with coalesced memory access;
// set 'subsets' to >1 and pass cross-thread reducer functions for 'float' and 'size_t' (which would reduce across threads using CUDA __shared__ memory).
// TODO: further opportunity for speed-up: use 'mean' from last round for 1-bit and stddev calc
template <bool ZeroThresholdFor1Bit, class F1, class F2>
static cudacode void ComputeRangeStatColjSubset(
const ElemType* inMat,
const ElemType* inResidual, long M,
size_t j,
size_t bits,
ElemType& lower, ElemType& upper,
size_t subset, size_t subsets,
F1 allReduceElem, F2 allReduceUint)
{
size_t rows = M;
// compute mean
// computing the mean is expensive; we assume there is no reason for asymmetry and thus use a zero mean
// an initial experiment showed that this is significantly worse at the start (36.0% vs. 37.7% frame accuracy), but seems to nearly recover later (minor gap remains)
// thought:
// - we could set the threshold at 0
// - but keep the quantization values for 0 and 1 separate
// i.e.
// - do not symmetrize/pool the quantization values for 0 and 1
// - but hard-code the quantization threshold to be 0 instead of the mean of the two bounds
// This should give us the best of all--fast operation yet ability to be asymmetric within a column
ElemType mean = 0.0f;
if (!ZeroThresholdFor1Bit || (bits != 1))
{
ElemType meanacc = 0.0f;
// (subset: compute subset sum)
for (size_t i = subset; i < rows; i += subsets)
{
size_t ij = ColMIDX(i, j, M);
meanacc += inMat[ij] + inResidual[ij];
}
// multi-subset (CUDA): reduce to one thread
allReduceElem(meanacc);
mean = meanacc / rows;
}
if (bits == 1)
{
// 1-bit case:
// We want to minimize the (squared) reconstruction error within the two levels.
// I.e. we should reconstruct to the respective means of each level.
// To be able to express the range by two floats, we approximate the level threshold as the average of the two level means.
// compute the two level means
ElemType meanacc0 = 0.0f, meanacc1 = 0.0f;
unsigned int num0 = 0, num1 = 0;
// (subset: compute subset sum)
for (size_t i = subset; i < rows; i += subsets)
{
size_t ij = ColMIDX(i, j, M);
ElemType val = inMat[ij] + inResidual[ij];
if (val < mean)
{
meanacc0 += val;
num0++;
}
else
{
meanacc1 += val;
num1++;
}
}
// multi-subset (CUDA): reduce to one thread
allReduceElem(meanacc0);
allReduceElem(meanacc1);
allReduceUint(num0);
allReduceUint(num1);
ElemType radius;
ElemType newmean;
if (!ZeroThresholdFor1Bit)
{
// we minimize the error jointly across positive and negative numbers to make things
// symmetrical around the mean (which may be non-zero), tying the two sides together
ElemType devacc0 = (num0 * mean) - meanacc0;
ElemType devacc1 = meanacc1 - (num1 * mean);
// both deviations tied, to ensure consistent mean
ElemType dev = (devacc0 + devacc1) / rows;
radius = 2.0f * dev;
newmean = mean;
}
else
{
// we keep two separate reconstruction values to allow for asymmetries--but we
// instead hard-code that the threshold is 0
// guard against empty sides; this happens for all-zero columns, which do exist (mean0 is 0 in that case)
if (num0 == 0)
num0 = 1;
if (num1 == 0)
num1 = 1;
ElemType mean0 = meanacc0 / num0;
ElemType mean1 = meanacc1 / num1;
// approximate by using their average as the threshold between 0 and 1
// with these values, bits 0 and 1 (reconstructed at index positions 0.5 and 1.5 of the 2-level scale) map back to mean0 and mean1
newmean = 0.5f * (mean0 + mean1);
radius = 2.0f * (mean1 - newmean);
}
if (subset == 0)
{
lower = newmean - radius;
upper = newmean + radius;
}
}
else
{
// quantization range: cut off after this many standard deviations (make this a parameter if we care)
ElemType stddevs = 5.0f;
// >1 bit:
// We linearly quantize between 'stddevs' standard deviations.
ElemType varacc = 0.0f;
// (subset: compute subset sum)
for (size_t i = subset; i < rows; i += subsets)
{
size_t ij = ColMIDX(i, j, M);
ElemType val = inMat[ij] + inResidual[ij];
varacc += (val - mean) * (val - mean);
}
// multi-subset (CUDA): reduce to one thread
allReduceElem(varacc);
ElemType stddev = sqrt(varacc / rows);
if (subset == 0)
{
// stddevs = how many stddevs from the mean until outside of quantization range
lower = mean - (stddevs * stddev);
upper = mean + (stddevs * stddev);
}
}
}
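// Worked example (1-bit, ZeroThresholdFor1Bit == false, hypothetical column {-3, -1, +1, +3}):
//   mean = 0; num0 = num1 = 2; meanacc0 = -4, meanacc1 = +4;
//   devacc0 = 0 - (-4) = 4, devacc1 = 4 - 0 = 4, dev = (4 + 4) / 4 = 2, radius = 4,
//   so (lower, upper) = (-4, +4); 1-bit reconstruction at the quarter points of that range
//   then lands on the per-side means -2 and +2.
// For bits > 1, the range is simply mean +/- 5 standard deviations, linearly quantized.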
private:
ValueQuantizer<ElemType> valQ;
template <typename T>
friend class QuantizedMatrix;
};
} } }
#endif