https://github.com/Microsoft/CNTK
Tip revision: 0e208365be18021f25c19a24ea34bf8e187926e7 authored by liqfu on 26 August 2018, 15:41:20 UTC
CNTK splice allows broadcast. This case is handled in the change. For noop (identity) ops, their input and output types shall be set according to upstream ops. ToBatch/ToSequence and Unpack Batch/Sequence ops added during model importing need to be skipped. Model import needs to handle ops with multiple outputs
CNTK splice allows broadcast. This case is handled in the change. For noop (identity) ops, their input and output types shall be set according to upstream ops. ToBatch/ToSequence and Unpack Batch/Sequence ops added during model importing need to be skipped. Model import needs to handle ops with multiple outputs
Tip revision: 0e20836
Criterion.h
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// Criterion.h -- helper classes for accumulating criteria
#pragma once
#include "Basics.h"
#include "Matrix.h"
#include "TensorView.h"
#include <algorithm> // for find()
#include <cmath>     // for isnan()
#include <cstdio>    // for fprintf() and snprintf()
#include <limits>    // for numeric_limits
#include <memory>    // for shared_ptr and make_shared
#include <string>
#include <utility>   // for pair
#include <vector>
namespace Microsoft { namespace MSR { namespace CNTK {
// Helper for criterion pretty printing.
// Returns a printf-style format string (not the formatted value) suitable for printing 'value':
//  - "%<padSize>.<precision>e" if printing 'value' with "%.<precision>g" would use exponent notation,
//  - "%<padSize>.<precision>f" otherwise.
// The returned format expects exactly one 'double' argument.
static inline std::string GeneratePaddedFloatOrExpFormat(int padSize, int precision, double value)
{
    // Worst case is '%' + two ints of at most 11 characters each + '.' + conversion char + NUL = 26 bytes,
    // so 32 bytes can never be exceeded; snprintf() additionally bounds every write.
    char format[32];

    // Probe how "%g" would render the value at this precision. The first call only measures the
    // required length, so the probe buffer is always large enough regardless of the precision.
    snprintf(format, sizeof(format), "%%.%dg", precision);
    bool usesExponent = false;
    const int probeLength = snprintf(nullptr, 0, format, value);
    if (probeLength > 0)
    {
        std::string probe(static_cast<size_t>(probeLength) + 1, '\0'); // +1 for the terminating NUL
        snprintf(&probe[0], probe.size(), format, value);
        usesExponent = probe.find_first_of("eE") != std::string::npos;
    }

    // Build the actual format; only the conversion character depends on the probe result.
    snprintf(format, sizeof(format), "%%%d.%d%c", padSize, precision, usesExponent ? 'e' : 'f');
    return format;
}
// helper class for passing accumulated epoch-level criteria around while retaining their sample counts
// Criteria are represented as a tuple (aggregate criterion, sample count). The average criterion value is their ratio.
struct EpochCriterion : public std::pair<double, size_t>
{
// construction
explicit EpochCriterion(double aggregateCriterionValue = 0.0, size_t aggregateSampleCount = 0) : std::pair<double, size_t>(aggregateCriterionValue, aggregateSampleCount) { }
EpochCriterion(const std::pair<double, size_t>& other) : std::pair<double, size_t>(other) { }
// main way of reading this out: compute the actual average criterion value from the aggregate and sample count
double Average() const { return second > 0 ? first / second : 0.0; } // compute the epoch-average
// a few more handy operations that occurred multiple times
bool IsNan() const { return std::isnan(first); }
EpochCriterion operator-(const EpochCriterion& other) const { return EpochCriterion(first - other.first, second - other.second); }
void operator+=(const EpochCriterion& other) { first += other.first; second += other.second; }
static EpochCriterion Infinity() { return EpochCriterion(std::numeric_limits<double>::infinity()); }
bool IsInfinity() const { return first == std::numeric_limits<double>::infinity(); }
// log a criterion value in a form like 'av * count; '
void LogCriterion(const wstring& name, bool addSemicolon = true) const
{
double evalErrorSinceLastLogged = Average();
int evalSamplesSinceLastLogged = (int)second;
fprintf(stderr, "%ls = ", name.c_str());
string format;
bool asPercentage = name.back() == 's'; // heuristic: plural forms are error counters
if (asPercentage)
fprintf(stderr, (GeneratePaddedFloatOrExpFormat(2, 3, 100*evalErrorSinceLastLogged) + "%%").c_str(), 100*evalErrorSinceLastLogged);
else
fprintf(stderr, GeneratePaddedFloatOrExpFormat(0, 8, evalErrorSinceLastLogged).c_str(), evalErrorSinceLastLogged);
fprintf(stderr, " * %d", evalSamplesSinceLastLogged);
if (addSemicolon) // if no more numbers follow, then use addSemicolon = false
fprintf(stderr, "; ");
}
};
// Accumulator for criterion values.
// Each criterion is accumulated together with its own sample count, since counts depend on sequence
// lengths and different criteria may have different sequence lengths.
// All criteria share one [1 x N] matrix; slot 'i' belongs to criterion node 'i'.
template <class ElemType>
struct CriterionAccumulator
{
    // constructor params:
    //  criterionNodes                 - list of criterion nodes; one accumulator slot is created per node
    //  deviceId                       - device id passed to the accumulator matrix
    //  accumulatorCriterionNodesNodes - list of criterion nodes that already accumulate results
    CriterionAccumulator(const std::vector<ComputationNodeBasePtr>& criterionNodes, DEVICEID_TYPE deviceId,
                         const std::vector<ComputationNodeBasePtr>& accumulatorCriterionNodesNodes = {})
        : m_aggregateCriterionValues(make_shared<Matrix<ElemType>>(1, criterionNodes.size(), deviceId)),
          m_criterionNodes(criterionNodes),
          m_accumulatorCriterionNodes(accumulatorCriterionNodesNodes)
    {
        // start out with all numerators and all denominators at zero
        m_aggregateCriterionValues->SetValue(0);
        m_aggregateSampleCounts.assign(criterionNodes.size(), 0);
    }

    // Add the current value of criterion node 'i' to its slot.
    // 'i' is the index of the element we add into (multiple eval criteria share the same matrix object).
    const CriterionAccumulator& Add(size_t i, size_t numSamplesInMinibatch)
    {
        const bool overwrite = false;
        return Accumulate(i, numSamplesInMinibatch, overwrite);
    }

    // Like Add(), but overwrites the slot of criterion node 'i' instead of accumulating into it.
    const CriterionAccumulator& Assign(size_t i, size_t numSamplesInMinibatch)
    {
        const bool overwrite = true;
        return Accumulate(i, numSamplesInMinibatch, overwrite);
    }

    // retrieve the accumulated result of slot 'i' as a pair (numerator, denominator)
    EpochCriterion GetCriterion(size_t i) const
    {
        // BUGBUG: For unknown reasons, this (or the other below) check makes a difference for MPI configs.
        //         If it is left out, then training and test configs end up being scaled by the same factor close to 1.
        const size_t sampleCount = m_aggregateSampleCounts[i];
        if (sampleCount != 0)
            return EpochCriterion(m_aggregateCriterionValues->GetValue(0, i), sampleCount);
        return EpochCriterion(0, 0); // nothing counted: avoid unnecessary GPU access
    }

private:
    // Shared implementation of Add() (reset = false) and Assign() (reset = true).
    // This code assumes that if the number of samples is 0, the criterion value is invalid and must not
    // be fetched from the GPU or otherwise looked at.
    const CriterionAccumulator& Accumulate(size_t i, size_t numSamplesInMinibatch, bool reset)
    {
        const auto& node = m_criterionNodes[i]; // this struct manages several nodes; pick ours

        // Accumulator nodes have already summed up the error of all samples that passed through the
        // network in the forward pass. For those we count 1 sample (so that nothing gets averaged again)
        // and always overwrite (so that nothing gets accumulated again).
        const bool alreadyAccumulated = ContainsAccumulatedResult(node);
        const bool overwriteSlot = alreadyAccumulated || reset;
        const size_t keepFactor = overwriteSlot ? 0 : 1; // 1 keeps the previous content of the slot, 0 drops it

        // Note: the sample count is 0 if numSamplesInMinibatch == 0, i.e. for an empty minibatch.
        const size_t samplesToAdd = GetNumSamples(node, numSamplesInMinibatch, alreadyAccumulated);

        // Criterion nodes that emit one criterion per frame get masked here; the copy below then
        // performs an implicit reduction.
        // TODO: Verify that node->GetSampleLayout().GetNumElements() == 1. Require explicit summation to declare intent that this is a criterion.
        FrameRange fr(node->GetMBLayout());
        node->MaskMissingValueColumnsToZero(fr); // zero out the gaps so that they do not contribute

        // TensorView onto the one element of the shared matrix that is the accumulator for this node
        TensorShape shape{ m_aggregateCriterionValues->GetNumRows(), m_aggregateCriterionValues->GetNumCols() };
        shape.NarrowTo(1, i, i + 1);
        auto criterionAccumulator = TensorView<ElemType>(m_aggregateCriterionValues, shape);

        size_t& sampleCount = m_aggregateSampleCounts[i];

        // accumulate (for an empty minibatch the node's matrix must not be looked at)
        if (samplesToAdd > 0)
        {
            auto criterionValue = node->As<ComputationNode<ElemType>>()->ValueTensorFor(SIZE_MAX, fr);
            // As long as nothing has been counted yet, the numerator is lazily considered zero as well,
            // hence the previous content is then dropped regardless of 'keepFactor'.
            const float previousWeight = (sampleCount != 0) ? static_cast<float>(keepFactor) : 0.0f;
            // Note: If the criterion is larger than [1 x 1], inverse broadcasting kicks in and aggregates.
            criterionAccumulator.DoCopyOf(previousWeight, criterionValue, 1);
        }

        sampleCount = sampleCount * keepFactor + samplesToAdd;
        return *this;
    }

    // A node contains an accumulated result if it is one of the accumulation nodes that were passed
    // to the constructor.
    bool ContainsAccumulatedResult(const ComputationNodeBasePtr& node) const
    {
        const auto last = m_accumulatorCriterionNodes.end();
        return std::find(m_accumulatorCriterionNodes.begin(), last, node) != last;
    }

public:
    // Determine the number of samples to count for 'node'.
    //  numSamplesInMinibatch            - the "generic" number of samples in the minibatch; used if the node
    //                                     has no MBLayout. 0 means that 'node' is invalid and should not be looked at.
    //  nodeContainsAccumulatedCriterion - true if the node already holds an accumulated error
    static size_t GetNumSamples(const ComputationNodeBasePtr& node, size_t numSamplesInMinibatch,
                                bool nodeContainsAccumulatedCriterion = false)
    {
        // empty MB: the node is invalid, its MBLayout must not be looked at
        if (numSamplesInMinibatch == 0)
            return 0;

        if (nodeContainsAccumulatedCriterion)
        {
            // Such a node holds the error of all samples that passed through the network in the forward
            // pass rather than a per-sample error, so it must not have a minibatch layout. We count it
            // as 1 sample to avoid averaging again.
            if (node->HasMBLayout())
                LogicError("Node %ls is marked as aggregation, but has minibatch layout.", node->GetName().c_str());
            return 1;
        }

        if (node->HasMBLayout())
            return node->GetMBLayout()->GetActualNumSamples();

        return numSamplesInMinibatch;
    }

    CriterionAccumulator& operator=(const CriterionAccumulator&) = delete;

private:
    shared_ptr<Matrix<ElemType>> m_aggregateCriterionValues; // [1 x N]
    vector<size_t> m_aggregateSampleCounts;                  // [N]
    const std::vector<ComputationNodeBasePtr> m_criterionNodes;
    // Criterion nodes that accumulate result themselves.
    const std::vector<ComputationNodeBasePtr> m_accumulatorCriterionNodes;
};
}}}