//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// TensorView.cpp -- implements the TensorView class, which is a layer around Matrix that reinterprets its content as a generic tensor
//
// TODO:
// - dimension inference in nodes
// - reduction on GPU is highly inefficient; test cases: Image/QuickE2E PlusNode::BackpropTo() and ScaleNode::BackpropTo()
// - accuracy deviation in FullUtterance and SequenceTraining
// - TimesNode --needs to specify reduction dimensions
// - ConvolutionNode --needs to specify reduction dimensions
// - some nodes create new "dimensions" such as RowStack. Should that be an actual new tensor dimension?
//
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#include "stdafx.h"
#include "Basics.h"
#include "TensorView.h"
#include <array>
#ifndef let
#define let const auto
#endif
namespace Microsoft { namespace MSR { namespace CNTK {
using namespace std;
// -------------------------------------------------------------------
// construction
// -------------------------------------------------------------------
// main constructor (all constructors except the default one route through this)
template <class ElemType>
TensorView<ElemType>::TensorView(const MatrixBasePtr& sob, const TensorShape& shape)
: m_sob(dynamic_pointer_cast<Matrix<ElemType>>(sob)), m_shape(shape)
{
if (!m_sob)
LogicError("TensorView: Attempted to create a TensorView<ElemType> on a storage object of a different ElemType.");
#ifdef _DEBUG
// check bounds of TensorShape against underlying storage object
// This is useful to detect errors like passing a matrix from the wrong input.
const auto r = shape.GetLocationRange();
const auto n = m_sob->GetNumElements();
if (r.first < 0 || (size_t)r.second > n)
LogicError("TensorView: Shape bounds [%d,%d) exceed bounds of underlying storage object [0,%d).", (int) r.first, (int) r.second, (int) n);
#endif
}
// -------------------------------------------------------------------
// elementwise operations
// -------------------------------------------------------------------
static bool Matches(size_t d1, size_t d2)
{
return d1 == 1 || d2 == 1 || d1 == d2;
} // do two dimensions match?
template <class ElemType, size_t N>
static void PrepareTensorOperands(array<TensorShape, N> shapes, array<size_t, N>& offsets,
SmallVector<size_t>& regularOpDims,
array<SmallVector<ptrdiff_t>, N>& regularStrides,
SmallVector<size_t>& reducingOpDims,
array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
// massage TensorShapes
// Note that TensorShapes here may be shapes as stored or shapes with stride magic applied.
// expand ones to make tensors compatible
// Trailing dimensions broadcast.
// E.g. A(J) vs. B(J x T) will broadcast A(:) to all T columns.
// To broadcast an A(T) to all J rows of B, use TensorShape editing to insert a dimension to get A(1,T).
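// Illustrative example (added here for clarity, not part of the original comment): for a binary op with shapes
// a=[J], b=[J x T], c=[J x T], the shapes are first padded to a common rank as [J x 1], [J x T], [J x T];
// the operation dimensions become [J x T]; and a's trailing dimension (1 vs. T) is later treated as broadcasting
// by giving it stride 0.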
size_t dims = 0;
for (size_t i = 0; i < N; i++)
if (dims < shapes[i].GetRank())
dims = shapes[i].GetRank();
for (size_t i = 0; i < N; i++)
if (shapes[i].GetRank() < dims)
shapes[i].PadRankInPlace(dims);
// all shapes[] now have the same rank
// determine operation shape (max over all dimensions)
SmallVector<size_t> opDims(shapes[0].GetDims());
for (size_t k = 0; k < dims; k++)
for (size_t i = 1; i < N; i++)
opDims[k] = max(opDims[k], shapes[i][k]);
// dimension compatibility check
// Each participant can broadcast. Non-broadcasting dimensions must match the operation dimension.
for (size_t k = 0; k < dims; k++)
for (size_t i = 0; i < N; i++)
if (!Matches(shapes[i][k], opDims[k]))
InvalidArgument("Binary tensor operation: Dimension %d of input [%d] is incompatible with operation dimensions (%s vs. %s)", (int) k, (int) i, string(shapes[i]).c_str(), string(TensorShape(opDims)).c_str());
// flatten consecutive dimensions
// Two consecutive dimensions can be flattened if, for each input, they are stored consecutively in memory and are either both broadcasting or both non-broadcasting.
// After this, the flattened shapes no longer match the original TensorShape objects.
// fprintf(stderr, "Pre-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str());
for (size_t k = 1; k < dims; k++)
{
for (size_t i = 0; i < N; i++)
{
// check if stored without gaps to skip
if (!shapes[i].CanFlatten(k))
goto nope;
// check if they are either all broadcasting or all not broadcasting
if ((shapes[i][k] != opDims[k] || shapes[i][k - 1] != opDims[k - 1]) && (shapes[i][k] != 1 || shapes[i][k - 1] != 1))
goto nope;
}
// these dimensions can be merged
for (size_t i = 0; i < N; i++)
shapes[i].FlattenInPlace(k); // TODO: overdoing the immutable thingy much?
opDims = TensorShape(opDims).FlattenInPlace(k).GetDims(); // (ugh)
nope:;
}
// fprintf(stderr, "Post-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str());
// remove singleton dimensions
SmallVector<bool> toDrop(dims, false);
bool anyToDrop = false;
for (size_t k = 0; k < dims; k++)
{
for (size_t i = 0; i < N; i++)
if (shapes[i][k] != 1)
goto neither;
toDrop[k] = true; // found an all-singleton dimension
anyToDrop = true;
neither:;
}
if (anyToDrop)
{
for (size_t i = 0; i < N; i++)
shapes[i].DropDimsInPlace(toDrop);
opDims = TensorShape(opDims).DropDimsInPlace(toDrop).GetDims(); // (ugh)
dims = opDims.size(); // #dims has changed
}
for (size_t i = 0; i < N; i++)
assert(dims == shapes[i].size());
// note: if op is a scalar, then we end up with 0 dimensions here, which is allowed
// fprintf(stderr, "Post-drop: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str());
// determine broadcasting; that is, set strides to 0 for 1-dimensions
// To be more precise, we should only set actually broadcasting dimensions to 0.
// But since dimensions that are 1 across all args are eliminated, any 1 must be some form of broadcasting.
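// Illustrative example (added for clarity): with operation dimensions [3 x 5], an input of shape [1 x 5] gets the
// stride of its first dimension set to 0, so all 3 "rows" of the operation read the same 5 input elements.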
for (size_t i = 0; i < N; i++) // TODO: do we need to test output tensor here as well?
for (size_t k = 0; k < dims; k++)
if (shapes[i][k] < opDims[k])
{
shapes[i].SetBroadcastStrides();
break;
}
// fprintf(stderr, "%s op %s -> %s via %s\n", string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str());
// determine inverse-broadcasting (reduction) dimensions
// Inverse-broadcasting dims are actual for-loops in the kernel, whereas broadcasting input dims are handled by the thread index.
// For regular input dims:
// - determine number of steps (product over opDims[.])
// - launch that many kernels
// - pass in:
// - total number of steps
// - strides for all inputs (with stride magic), separated by regular and inverse broadcasting dimensions
// - opDim (no stride magic allowed) for regular broadcasting dimensions
// - inverse-broadcasting dimensions
// - opcodes for elementwise op and reduction op
// - in each kernel:
// - map thread index to dimensions (regular broadcasting ones)
// - for-loop over inverse broadcasting dimensions
// - map dimensions (including inverse broadcasting) for every input
// - perform op on the input values
// - accumulate
// - map dimensions (regular) for output
// - save result
// separate out the inverse-broadcasting dimensions
// Any singleton dimension in the result tensor is inverse-broadcasting, because there must be at least one non-1 dimension
// in one of the inputs, otherwise the entire dimension would have been optimized away above.
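// Illustrative example (added for clarity): summing a [3 x 5] input into a [1 x 5] output makes dimension 0 a
// reducing (inverse-broadcasting) dimension: regularOpDims becomes [5] (one thread per output element) and
// reducingOpDims becomes [3] (the for-loop each thread runs to accumulate its output value).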
SmallVector<bool> isReducingDim(dims); // true for each inverse-broadcasting dimension
bool isAnyReducingDim = false;
for (size_t k = 0; k < dims; k++)
{
bool isRed = shapes.back()[k] == 1;
isReducingDim[k] = isRed;
isAnyReducingDim |= isRed;
}
// form the regular (non-inverse-broadcasting) dims
if (isAnyReducingDim)
{
for (size_t i = 0; i < N; i++)
regularStrides[i] = shapes[i].DropDims(isReducingDim).GetStrides();
regularOpDims = TensorShape(opDims).DropDims(isReducingDim).GetDims(); // (ugh)
// form the inverse-broadcasting dims
SmallVector<bool> isRegularDim(dims); // true for each regular (non-reducing) dimension
for (size_t k = 0; k < dims; k++)
isRegularDim[k] = !isReducingDim[k]; // (no way to do this more nicely?)
for (size_t i = 0; i < N; i++)
reducingStrides[i] = shapes[i].DropDims(isRegularDim).GetStrides();
reducingOpDims = TensorShape(opDims).DropDims(isRegularDim).GetDims(); // (ugh)
}
else // no reduction: things are simpler
{
for (size_t i = 0; i < N; i++)
regularStrides[i] = shapes[i].GetStrides();
regularOpDims = opDims;
for (size_t i = 0; i < N; i++)
reducingStrides[i].clear();
reducingOpDims.clear();
}
for (size_t i = 0; i < N; i++)
offsets[i] = shapes[i].GetOffset();
}
// enforce that in case of inverse broadcasting (reduction), the output must not also be an input
template <class ElemType>
static bool CheckDifferentObject(const TensorView<ElemType>& a, const TensorView<ElemType>& b)
{
if (&a == &b)
LogicError("Do{U,Bi,Ter}naryOpOf: When inverse broadcasting, output must not be an input.");
return true;
}
template <class ElemType>
void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp)
{
// static int cc = 0; if (cc++ == 0)
// fprintf(stderr, "Tensor Op: Op %d: %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(GetShape()).c_str());
// prepare all tensor descriptor information as needed for execution
array<size_t, 2> offsets;
array<SmallVector<ptrdiff_t>, 2> regularStrides, reducingStrides;
SmallVector<size_t> regularOpDims, reducingOpDims;
PrepareTensorOperands<ElemType, 2>(array<TensorShape, 2>{a.GetShape(), GetShape()}, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// output cannot be input when reducing
if (reducingOpDims.size() > 0)
CheckDifferentObject(a, *this);
// now perform the operation
GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
template <class ElemType>
void TensorView<ElemType>::DoBinaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp)
{
// static int cc = 0; if (cc++ == 0)
// fprintf(stderr, "Tensor Op: Op %d: %s op %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(GetShape()).c_str());
array<size_t, 3> offsets;
array<SmallVector<ptrdiff_t>, 3> regularStrides, reducingStrides;
SmallVector<size_t> regularOpDims, reducingOpDims;
PrepareTensorOperands<ElemType, 3>(array<TensorShape, 3>{a.GetShape(), b.GetShape(), GetShape()}, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// output cannot be input when reducing
if (reducingOpDims.size() > 0)
CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this);
GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
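// Usage sketch (illustrative only; assumes an opSum opcode in the ElementWiseOperator enum): computing c = a + b
// with broadcasting comes down to
//   c.DoBinaryOpOf(0, a, b, 1, ElementWiseOperator::opSum, ElementWiseOperator::opSum);
// where beta=0 overwrites c, alpha=1 leaves the result unscaled, and the reduction opcode only takes effect when
// c has fewer elements than the operation (inverse broadcasting).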
template <class ElemType>
void TensorView<ElemType>::DoTernaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp)
{
// static int cc = 0; if (cc++ == 0)
// fprintf(stderr, "Tensor Op: Op %d: %s, %s, %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(c.GetShape()).c_str(), string(GetShape()).c_str());
array<size_t, 4> offsets;
array<SmallVector<ptrdiff_t>, 4> regularStrides, reducingStrides;
SmallVector<size_t> regularOpDims, reducingOpDims;
PrepareTensorOperands<ElemType, 4>(array<TensorShape, 4>{a.GetShape(), b.GetShape(), c.GetShape(), GetShape()}, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// output cannot be input when reducing
if (reducingOpDims.size() > 0)
CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this) && CheckDifferentObject(c, *this);
GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), c.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
// -------------------------------------------------------------------
// matrix product -- GEMM for flattened tensors
// -------------------------------------------------------------------
// print the dimensions of a matrix-product operation, for pretty error reporting
static string MatrixProductFormat(const TensorShape& shapeA, bool transA, const TensorShape& shapeB, bool transB, const TensorShape& shapeC, bool transC)
{
string result = "[" + string(shapeA) + "]"; if (transA) result.append("'");
result += " * ";
result += "[" + string(shapeB) + "]"; if (transB) result.append("'");
result += " -> ";
result += "[" + string(shapeC) + "]"; if (transC) result.append("'");
return result;
}
// flatten a tensor into a 2D tensor, where splitPoint is the first index to go into the second dimension
// The tensor must be flattenable this way, i.e. each of the two index ranges must be dense.
static void FlattenToMatrix(TensorShape& shape, bool trans, size_t splitPoint)
{
if (trans)
splitPoint = shape.GetRank() - splitPoint;
shape.FlattenTo2DInPlace(splitPoint, "DoMatrixProductOf");
}
// convert tensor into a Matrix object
template <class ElemType>
shared_ptr<Matrix<ElemType>> TensorView<ElemType>::AsMatrix() const
{
assert(m_shape.GetRank() == 2);
if (m_shape.GetStrides()[0] != 1 && m_shape[0] != 1)
InvalidArgument("AsMatrix: Flattened [%s] matrix is not dense (it has a stride).", string(m_shape).c_str());
// create a Matrix view into the TensorView (which in turn is a view over a Matrix...)
// The way to do this is to use a ColumnSlice.
// express the TensorView's storage in m_sob's coordinates
let firstColumn = m_shape.GetOffset() / m_sob->GetNumRows();
let numColumns = m_shape.GetNumElements() / m_sob->GetNumRows();
if (firstColumn * m_sob->GetNumRows() != m_shape.GetOffset() || numColumns * m_sob->GetNumRows() != m_shape.GetNumElements())
InvalidArgument("AsMatrix: Flattened [%s] matrix has an offset or width that is not a multiple of the storage object's row dimension.", string(m_shape).c_str());
// now reinterpret this slice according to the new tensor shape
// Example:
// - each sob column contains a set of vectors stored as a 2D tensor [I x J], and [S x T] samples
// - we want to apply a [K x I] matrix to all vectors in each set
// - so we reinterpret the [(I * J) x (S * T)] storage object as a [I x (J * S * T)] matrix
// and apply the matrix product to this (by calling GEMM)
// - which in turn yields a [K x (J * S * T)] matrix
// which gets reinterpreted back as a [K x J x S x T] tensor
// In the special case of sparse matrices, this split cannot be done. E.g. in the above example, we could only multiply with a [K x I x J] tensor.
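// Concrete instance of the example above (illustrative numbers, not from the original comment): with I=4, J=2,
// S=3, T=5 the storage object is an [(4*2) x (3*5)] = [8 x 15] matrix; reinterpreting it as [4 x (2*3*5)] = [4 x 30]
// lets a [K x 4] matrix be applied to every length-4 vector with a single GEMM call.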
let needsSlicing = firstColumn != 0 || numColumns != m_sob->GetNumCols();
let needsReshaping = m_shape[0] != m_sob->GetNumRows() || m_shape[1] != numColumns;
// Note: If an output matrix is a view and needs to move to a different device, we will fail later, since the current structure cannot support that.
// As a consequence, some configurations will simply not work currently.
// We minimize the chance of this by using the original storage object whenever possible.
if (!needsSlicing && !needsReshaping) // no need to mess with the storage object: pass it on as it is. Full support for moving devices.
return m_sob;
else if (needsSlicing && !needsReshaping) // slicing is supported for sparse as well
return make_shared<Matrix<ElemType>>(m_sob->ColumnSlice(firstColumn, numColumns));
else if (m_sob->GetMatrixType() != MatrixType::DENSE) // needsReshaping: not allowed for sparse matrices
RuntimeError("AsMatrix: Sparse tensors are not supported unless they are 1D or 2D matrices.");
else // dense can slice and reshape neutrally, but will also fail if output matrix needs to move devices
return make_shared<Matrix<ElemType>>(m_sob->ColumnSlice(firstColumn, numColumns).Reshaped(m_shape[0], m_shape[1]));
}
template <class ElemType>
void TensorView<ElemType>::DoMatrixProductOf(ElemType beta, bool transC, const TensorView& a, bool transA, const TensorView& b, bool transB, ElemType alpha)
{
// determine integration dimension offset
auto shapeA = a.m_shape;
auto shapeB = b.m_shape;
auto shapeC = m_shape;
if (shapeA.GetRank() + shapeB.GetRank() < shapeC.GetRank())
InvalidArgument("DoMatrixProductOf: Ranks %s don't match, output must have a non-reduced output dimension.", MatrixProductFormat(shapeA, transA, shapeB, transB, shapeC, transC).c_str());
let removedDims = shapeA.GetRank() + shapeB.GetRank() - shapeC.GetRank();
let numReducedDims = removedDims / 2;
if (numReducedDims * 2 != removedDims)
InvalidArgument("DoMatrixProductOf: Ranks %s mismatch.", MatrixProductFormat(shapeA, transA, shapeB, transB, shapeC, transC).c_str());
let firstReducedDim = shapeA.GetRank() - numReducedDims;
// flatten. This updates shapeA etc.
FlattenToMatrix(shapeA, transA, firstReducedDim);
FlattenToMatrix(shapeB, transB, numReducedDims);
FlattenToMatrix(shapeC, transC, firstReducedDim);
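// Illustrative example (added for clarity): A=[K x I x J], B=[I x J x S x T], C=[K x S x T], no transposes.
// Then removedDims = 3 + 4 - 3 = 4, numReducedDims = 2, firstReducedDim = 3 - 2 = 1, and the shapes are
// flattened to A=[K x (I*J)], B=[(I*J) x (S*T)], C=[K x (S*T)] before being handed to GEMM.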
// check dimensions
// shapeX[transX] and shapeX[1-transX] are row and column dim, respectively, or swapped if transposed
if (shapeA[transA] != shapeC[transC] || // output dim
shapeB[1-transB] != shapeC[1-transC] || // input dim
shapeA[1-transA] != shapeB[transB]) // reduction dim
{
InvalidArgument("DoMatrixProductOf: Flattened tensor dimensions %s mismatch.", MatrixProductFormat(shapeA, transA, shapeB, transB, shapeC, transC).c_str());
}
// create Matrix objects out of this
let A = a.Reshaped(shapeA).AsMatrix();
let B = b.Reshaped(shapeB).AsMatrix();
auto C = Reshaped(shapeC).AsMatrix();
// and go
if (!transC)
Matrix<ElemType>::MultiplyAndWeightedAdd(alpha, *A, transA, *B, transB, beta, *C);
else // C' = A * B <==> C = (A * B)' = B' * A'
Matrix<ElemType>::MultiplyAndWeightedAdd(alpha, *B, !transB, *A, !transA, beta, *C);
}
template class TensorView<float>;
template class TensorView<double>;
}}}