// Provenance: https://github.com/Microsoft/CNTK
// Tip revision: a7cfb7bce52880e55d8e8eb7a0ce60e31dddface, authored by Vadim Mazalov
// on 01 October 2018, 18:27:25 UTC ("Nan log")
// File: ProgressTracing.h
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once

#include "Basics.h"
#include "TimerUtility.h"

#include <chrono>
#include <cstdio> // fprintf/printf used by the logging macros and trace methods
#include <ctime>  // std::time_t, std::strftime, std::localtime used by Timestamp()
#include <string>
namespace Microsoft { namespace MSR { namespace CNTK {
// If the timestamping flag is set, print a timestamp to 'stream' with no newline at the end.
// (Note: this checks GetTimestampingFlag(), not GetTracingFlag().)
// No comments inside the macro body: a '//' after a line-continuation backslash would break it.
#define PREPENDTS(stream) \
do \
{ \
if (ProgressTracing::GetTimestampingFlag()) \
{ \
char mbstr[30]; \
fprintf(stream, "%s: ", ProgressTracing::Timestamp(mbstr)); \
} \
} while(0)
// TODO: make this proper C++ functions with variadic templates and a name that reflects their difference to fprintf(stderr) which already implies printing to log
// Print out a log message to 'stream' with printf-style formatting.
// If the timestamping flag is set, the message is prepended with a timestamp (via PREPENDTS).
#define LOGPRINTF(stream, ...) \
do \
{ \
PREPENDTS(stream); \
fprintf(stream, __VA_ARGS__); \
} while(0)
// ---------------------------------------------------------------------------
// ProgressTracing -- static helper class for logging a progress indicator
//
// This is for use by the cluster management tools for indicating global progress to the user.
//
// This logs to stdout (not stderr) in a specific format, e.g. understood by the Philly cluster. The format is:
// PROGRESS xx.xx%
// EVALERR xx.xx%
//
// Specifically, this class handles a two-level progress computation:
// - outer level: loop over multiple training phases, each running multiple steps (epochs)
// - inner level in one training phase: loop over multiple steps, *without* knowledge about the other training phases
//
// In order for the inner level to log correctly in the global context, the outer loop
// must inform this class about the total number of steps and the current offset to apply in the inner level.
// ---------------------------------------------------------------------------
/*static*/ class ProgressTracing
{
    bool m_enabled;               // true once TraceTotalNumberOfSteps() was called with a count > 0
    bool m_tracingFlag;           // enables the fake-progress heartbeat (TraceFakeProgress)
    bool m_timestampFlag;         // if set, PREPENDTS/LOGPRINTF prefix log lines with a timestamp
    size_t m_totalNumberOfSteps;  // total number of epochs in entire training run
    size_t m_currentStepOffset;   // epoch index of the first epoch of the current training phase
    Timer m_progressTracingTimer; // rate-limits PROGRESS output to roughly one line per 10 seconds

    ProgressTracing()
        : m_enabled(false), m_tracingFlag(false), m_timestampFlag(false), m_totalNumberOfSteps(0), m_currentStepOffset(0)
    {
    }

    // Meyers singleton: wrap the static state in an accessor, so we won't need a CPP file.
    static ProgressTracing& GetStaticInstance()
    {
        static ProgressTracing us;
        return us;
    }

public:
    // Returns true if fake-progress tracing (TraceFakeProgress) has been enabled.
    static bool GetTracingFlag()
    {
        return GetStaticInstance().m_tracingFlag;
    }

    // Returns true if log messages should be prefixed with a timestamp.
    static bool GetTimestampingFlag()
    {
        return GetStaticInstance().m_timestampFlag;
    }

    // Format the current local time as "MM/DD/YYYY HH:MM:SS" into the caller-provided buffer
    // and return that buffer. The buffer size N is deduced from the array type at compile time.
    // Throws LogicError if the buffer is too small (this format needs 20 bytes incl. NUL).
    // NOTE(review): std::localtime returns a pointer to shared static storage and is not
    // thread-safe -- acceptable only if logging is serialized; confirm before concurrent use.
    template <unsigned int N>
    static const char* Timestamp(char (&buf)[N])
    {
        const std::time_t tt = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
        if (!std::strftime(buf, N, "%m/%d/%Y %H:%M:%S", std::localtime(&tt)))
            LogicError("Timestamp: Buffer too small.");
        return buf;
    }

    // Helper to return a time-stamp prefix if time-stamping is enabled, complete with ": "
    // at its end; returns the empty string otherwise.
    static std::wstring GetTimeStampPrefix()
    {
        char mbstr[30];
        return GetTimestampingFlag() ? msra::strfun::wstrprintf(L"%s: ", Timestamp(mbstr)) : L"";
    }

    // Enable the fake-progress heartbeat (see TraceFakeProgress). There is no way to turn it off again.
    static void SetTracingFlag()
    {
        GetStaticInstance().m_tracingFlag = true;
    }

    // Enable timestamp prefixes on log messages. There is no way to turn it off again.
    static void SetTimestampingFlag()
    {
        GetStaticInstance().m_timestampFlag = true;
    }

    // Set the total number of steps (epochs) of the entire training run.
    // Calling this with totalNumberOfSteps > 0 enables progress tracing and starts the
    // rate-limit timer; calling it with 0 disables tracing (the previous total is kept).
    static void TraceTotalNumberOfSteps(size_t totalNumberOfSteps)
    {
        auto& us = GetStaticInstance();
        us.m_enabled = totalNumberOfSteps > 0;
        if (us.m_enabled)
        {
            us.m_totalNumberOfSteps = totalNumberOfSteps;
            us.m_progressTracingTimer.Start();
        }
    }

    // Call at the start of a multi-epoch training to set the index of the first epoch in that
    // training phase. This value is added to the local epoch index in TraceProgressPercentage().
    static void SetStepOffset(size_t currentStepOffset)
    {
        GetStaticInstance().m_currentStepOffset = currentStepOffset;
    }

    // Emit the trace message for global progress.
    // 'epochNumber' is offset by m_currentStepOffset; 'mbProg' (0..100) is progress within the
    // current epoch. Only prints if >10s elapsed since the last print, or if 'isTimerPaced'
    // overrides the rate limit.
    // NOTE(review): the return value reflects only the 10-second timer condition -- a print
    // forced by 'isTimerPaced' alone still returns false. Kept as-is for caller compatibility.
    static bool TraceProgressPercentage(size_t epochNumber, double mbProg /*0..100*/, bool isTimerPaced)
    {
        auto& us = GetStaticInstance();
        if (!us.m_enabled)
            return false;

        // compute global progress: completed epochs plus the fraction of the current one
        const bool needToPrint = us.m_progressTracingTimer.ElapsedSeconds() > 10;
        if (needToPrint || isTimerPaced)
        {
            const double epochProg = (100.0 * static_cast<double>(us.m_currentStepOffset + epochNumber)) / static_cast<double>(us.m_totalNumberOfSteps);
            mbProg = (mbProg * 100.0) / static_cast<double>(us.m_totalNumberOfSteps);
            printf("PROGRESS: %.2f%%\n", epochProg + mbProg);
            us.m_progressTracingTimer.Restart();
        }
        return needToPrint;
    }

    // Emit a trace message for the train loss value. The value is printed in percent.
    static void TraceTrainLoss(double err)
    {
        auto& us = GetStaticInstance();
        if (!us.m_enabled)
            return;

        printf("EVALERR: %.7f%%\n", err);
    }

    // Prints a PROGRESS message with a percentage value of 0 to prevent timeouts on Philly
    // when executing long-running non-training operations like PreCompute, CV, Eval, and Write.
    // Returns the updated iteration counter (pass it back in on the next call); the counter
    // resets to 0 whenever a heartbeat line was printed.
    static size_t TraceFakeProgress(size_t numIterationsBeforePrintingProgress, size_t numItersSinceLastPrintOfProgress)
    {
        size_t newNumItersSinceLastPrintOfProgress = numItersSinceLastPrintOfProgress;
        if (GetTracingFlag())
        {
            newNumItersSinceLastPrintOfProgress++;
            if (newNumItersSinceLastPrintOfProgress >= numIterationsBeforePrintingProgress)
            {
                printf("PROGRESS: %.2f%%\n", 0.0);
                newNumItersSinceLastPrintOfProgress = 0;
            }
        }
        return newNumItersSinceLastPrintOfProgress;
    }
};
}}}