// // Copyright (c) Microsoft. All rights reserved. // Licensed under the MIT license. See LICENSE.md file in the project root for full license information. // // Real-time thread-safe profiler that generates a summary report and a detail profile log. // The profiler is highly performant and lightweight. Profiling a single event introduces an overhead // of approximately 100 ns. // #ifndef _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings #endif #define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _ #include "PerformanceProfiler.h" #include "Basics.h" #include "fileutil.h" #include "TimerUtility.h" #include #include #include #include #ifndef CPUONLY #include #endif #ifdef _WIN32 #include #else #include #endif namespace Microsoft { namespace MSR { namespace CNTK { // // Fixed profiler event descriptions // enum FixedEventType { profilerEvtTime = 0, profilerEvtThroughput, profilerEvtSeparator }; struct FixedEventDesc { char eventDescription[64]; FixedEventType eventType; bool syncGpu; }; static const FixedEventDesc c_fixedEvtDesc[profilerEvtMax] = { { "Main Thread", profilerEvtSeparator, false }, // profilerSepMainThread { "", profilerEvtSeparator, false }, // profilerSepSpace0 { "Epoch", profilerEvtTime, false }, // profilerEvtMainEpoch { "_Minibatch Iteration", profilerEvtTime, false }, // profilerEvtMainMinibatch { "__Get Minibatch", profilerEvtTime, true }, // profilerEvtMainGetMinibatch { "__Forward + Backward", profilerEvtTime, true }, // profilerEvtMainFB { "__Gradient Aggregation", profilerEvtTime, true }, // profilerEvtMainGradient { "__Weight Update", profilerEvtTime, true }, // profilerEvtMainWeights { "__Post Processing", profilerEvtTime, true }, // profilerEvtMainPost { "", profilerEvtSeparator, false }, // profilerSepSpace1 { "Data Reader", profilerEvtSeparator, false }, // profilerSepDataReader { "", profilerEvtSeparator, false }, // profilerSepSpace2 { "Prefetch Minibatch", profilerEvtTime, false }, // profilerEvtPrefetchMinibatch }; struct FixedEventRecord { int cnt; // event count long long sum; // time (ns) or throughput (kB/s) double sumsq; // sum of squares long long min; // time (ns) or throughput (kB/s) long long max; // time (ns) or throughput (kB/s) long long totalBytes; // used only for throughput events }; // // The custom event record is a variable size datastructure in memory: // NULL terminated description string, followed by CustomEventRecord struct // struct CustomEventRecord { long long beginClock; long long endClock; unsigned int threadId; }; // // Global state of the profiler // struct ProfilerState { bool enabled; // Profiler enabled (active) bool syncGpu; // Sync GPU per each profiling event bool cudaSyncEnabled; // Runtime state of CUDA kernel sync std::wstring profilerDir; // Directory where reports/logs are saved std::wstring logSuffix; // Suffix to append to report/log file names FixedEventRecord fixedEvents[profilerEvtMax]; // Profiling data for each fixed event bool customEventBufferFull; // Is custom event buffer full? unsigned long long customEventBufferBytes; // Number of bytes allocated for the custom event buffer unsigned long long customEventOffset; // Offset to current place in buffer unique_ptr customEventBuffer; // Pointer to custom event buffer long long startClock; }; // We support one global instance of the profiler static unique_ptr g_profilerState; // Mutex controlling access to g_profilerState static std::mutex g_mutex; // Forward declarations unsigned int GetThreadId(); void ProfilerGenerateReport(const std::wstring& fileName, struct tm* timeInfo); void FormatTimeStr(char* str, size_t strLen, double value); void FormatThroughputStr(char* str, size_t strLen, double value); void FormatBytesStr(char* str, size_t strLen, long long bytes); void ProfilerGenerateDetailFile(const std::wstring& fileName); double TicksToSeconds(long long ticks) { long long ticksPerSec = Clock::GetTicksPerSecond(); return static_cast(ticks) / ticksPerSec; } double TicksSqToSecondsSq(double ticksSq) { long long ticksPerSec = Clock::GetTicksPerSecond(); return ticksSq / ticksPerSec / ticksPerSec; } // // Initialize all resources to enable profiling. // profilerDir: Directory where the profiler logs will be saved. // customEventBufferBytes: Size of the custom event buffer. // logSuffix: Suffix string to append to log file names. // syncGpu: Wait for GPU to complete processing for each profiling event with syncGpu flag set. // void PERF_PROFILER_API ProfilerInit(const std::wstring& profilerDir, const unsigned long long customEventBufferBytes, const std::wstring& logSuffix, const bool syncGpu) { if (g_profilerState != nullptr) { RuntimeError("Error: ProfilerInit: Profiler already initialized.\n"); } g_profilerState.reset(new ProfilerState()); g_profilerState->profilerDir = profilerDir; g_profilerState->logSuffix = logSuffix; g_profilerState->customEventBufferFull = false; g_profilerState->customEventBufferBytes = customEventBufferBytes; g_profilerState->customEventOffset = 0ull; g_profilerState->customEventBuffer.reset(new char[customEventBufferBytes]); g_profilerState->syncGpu = syncGpu; g_profilerState->enabled = false; if (_wmkdir(g_profilerState->profilerDir.c_str()) == -1 && errno != EEXIST) { RuntimeError("Error: ProfilerInit: Cannot create directory <%ls>.\n", g_profilerState->profilerDir.c_str()); } } // // Enable/disable profiling. // By default, profiling is disabled after a ProfilerInit call. // This can be used to temporarily turn profiling on/off during execution. // void PERF_PROFILER_API ProfilerEnable(bool enable) { // A nullptr state indicates that the profiler is globally disabled, and not initialized if (g_profilerState == nullptr) return; g_profilerState->enabled = enable; if (enable) { g_profilerState->startClock = Clock::GetTimeStamp(); } } // // Internal helper functions to record fixed and custom profiling events. // void ProfilerTimeRecordFixedEvent(const int eventId, const long long beginClock, const long long endClock) { std::lock_guard lock(g_mutex); if (!g_profilerState->enabled) return; long long delta = endClock - beginClock; if (g_profilerState->fixedEvents[eventId].cnt == 0) { g_profilerState->fixedEvents[eventId].min = delta; g_profilerState->fixedEvents[eventId].max = delta; } g_profilerState->fixedEvents[eventId].min = std::min(delta, g_profilerState->fixedEvents[eventId].min); g_profilerState->fixedEvents[eventId].max = std::max(delta, g_profilerState->fixedEvents[eventId].max); g_profilerState->fixedEvents[eventId].sum += delta; g_profilerState->fixedEvents[eventId].sumsq += (double)delta * (double)delta; g_profilerState->fixedEvents[eventId].cnt++; } void ProfilerTimeRecordToBuffer(const char* eventDescription, const long long beginClock, const long long endClock) { std::lock_guard lock(g_mutex); if (!g_profilerState->enabled) return; auto eventDescriptionBytes = strlen(eventDescription) + 1; auto requiredBufferBytes = eventDescriptionBytes + sizeof(CustomEventRecord); if ((g_profilerState->customEventOffset + requiredBufferBytes) > g_profilerState->customEventBufferBytes) { if (!g_profilerState->customEventBufferFull) { fprintf(stderr, "Warning: Performance Profiler: Buffer is full, no more events will be recorded.\n"); g_profilerState->customEventBufferFull = true; } return; } strcpy(g_profilerState->customEventBuffer.get() + g_profilerState->customEventOffset, eventDescription); g_profilerState->customEventOffset += eventDescriptionBytes; CustomEventRecord eventRecord; eventRecord.beginClock = beginClock; eventRecord.endClock = endClock; eventRecord.threadId = GetThreadId(); memcpy(g_profilerState->customEventBuffer.get() + g_profilerState->customEventOffset, &eventRecord, sizeof(CustomEventRecord)); g_profilerState->customEventOffset += sizeof(CustomEventRecord); } // // Measure either a fixed or custom event time. // ProfilerTimeBegin() returns a stateId that is passed to ProfilerTimeEnd(). // If ProfilerTimeEnd() is not called, the event is not recorded. // long long PERF_PROFILER_API ProfilerTimeBegin() { return Clock::GetTimeStamp(); } void PERF_PROFILER_API ProfilerTimeEnd(const long long stateId, const int eventId) { // A nullptr state indicates that the profiler is globally disabled, and not initialized if (g_profilerState == nullptr) return; if (c_fixedEvtDesc[eventId].syncGpu) ProfilerSyncGpu(); long long endClock = Clock::GetTimeStamp(); ProfilerTimeRecordFixedEvent(eventId, stateId, endClock); ProfilerTimeRecordToBuffer(c_fixedEvtDesc[eventId].eventDescription, stateId, endClock); } void PERF_PROFILER_API ProfilerTimeEnd(const long long stateId, const char* eventDescription) { // A nullptr state indicates that the profiler is globally disabled, and not initialized if (g_profilerState == nullptr) return; ProfilerTimeRecordToBuffer(eventDescription, stateId, Clock::GetTimeStamp()); } // // Conditionally sync the GPU if the syncGPU flag is set. This only needs to be excplicitly // called for custom events. // void PERF_PROFILER_API ProfilerSyncGpu() { #ifndef CPUONLY // A nullptr state indicates that the profiler is globally disabled, and not initialized if (g_profilerState == nullptr) return; if(!g_profilerState->enabled) return; if (g_profilerState->syncGpu) cudaDeviceSynchronize(); #endif } // // Measure throughput given the number of bytes. // ProfilerThroughputBegin() returns a stateId that is passed to ProfilerThroughputEnd(). // If ProfilerThroughputEnd() is not called, the event is not recorded. // long long PERF_PROFILER_API ProfilerThroughputBegin() { return Clock::GetTimeStamp(); } void PERF_PROFILER_API ProfilerThroughputEnd(const long long stateId, const int eventId, const long long bytes) { long long endClock = Clock::GetTimeStamp(); // A nullptr state indicates that the profiler is globally disabled, and not initialized if (g_profilerState == nullptr) return; std::lock_guard lock(g_mutex); if (!g_profilerState->enabled) return; auto beginClock = stateId; if (endClock == beginClock) return; // Use kB rather than bytes to prevent overflow long long kBytesPerSec = Clock::GetTicksPerSecond() * bytes / 1000 / (endClock - beginClock); if (g_profilerState->fixedEvents[eventId].cnt == 0) { g_profilerState->fixedEvents[eventId].min = kBytesPerSec; g_profilerState->fixedEvents[eventId].max = kBytesPerSec; } g_profilerState->fixedEvents[eventId].min = std::min(kBytesPerSec, g_profilerState->fixedEvents[eventId].min); g_profilerState->fixedEvents[eventId].max = std::max(kBytesPerSec, g_profilerState->fixedEvents[eventId].max); g_profilerState->fixedEvents[eventId].sum += kBytesPerSec; g_profilerState->fixedEvents[eventId].sumsq += (double)kBytesPerSec * (double)kBytesPerSec; g_profilerState->fixedEvents[eventId].totalBytes += bytes; g_profilerState->fixedEvents[eventId].cnt++; } // // Generate reports and release all resources. // void PERF_PROFILER_API ProfilerClose() { // A nullptr state indicates that the profiler is globally disabled, and not initialized if (g_profilerState == nullptr) return; // Get current time as yyyy-mm-dd_hh-mm-ss time_t currentTime; time(¤tTime); struct tm* timeInfo = localtime(¤tTime); wchar_t timeStr[32]; wcsftime(timeStr, sizeof(timeStr) / sizeof(timeStr[0]), L"%Y-%m-%d_%H-%M-%S", timeInfo); // Generate summary report std::wstring fileName = g_profilerState->profilerDir + L"/" + std::wstring(timeStr) + L"_summary_" + g_profilerState->logSuffix + L".txt"; ProfilerGenerateReport(fileName, timeInfo); // Generate detailed event file fileName = g_profilerState->profilerDir + L"/" + std::wstring(timeStr) + L"_detail_" + g_profilerState->logSuffix + L".json"; ProfilerGenerateDetailFile(fileName); g_profilerState.reset(); } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Utility functions. /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // // Get current thread id // unsigned int GetThreadId() { #ifdef _WIN32 return (unsigned int)GetCurrentThreadId(); #else return (unsigned int)syscall(SYS_gettid); #endif } // // Get current process id // unsigned int GetProcessId() { #ifdef _WIN32 return (unsigned int)GetCurrentProcessId(); #else return (unsigned int)syscall(SYS_getpid); #endif } // // Generate summary report. // void ProfilerGenerateReport(const std::wstring& fileName, struct tm* timeInfo) { FILE* f = _wfopen(fileName.c_str(), L"wt"); if (f == NULL) { RuntimeError("Error: ProfilerGenerateReport: Cannot create file <%ls>.\n", fileName.c_str()); } fprintfOrDie(f, "CNTK Performance Profiler Summary Report\n\n"); char timeStr[32]; strftime(timeStr, sizeof(timeStr), "%Y/%m/%d %H:%M:%S", timeInfo); fprintfOrDie(f, "Time Stamp: %s\n\n", timeStr); fprintfOrDie(f, "Description................ ............Mean ..........StdDev .............Min .............Max ...........Count ...........Total\n\n"); for (int evtIdx = 0; evtIdx < profilerEvtMax; evtIdx++) { bool printLine = false; switch (c_fixedEvtDesc[evtIdx].eventType) { case profilerEvtTime: if (g_profilerState->fixedEvents[evtIdx].cnt > 0) { printLine = true; fprintfOrDie(f, "%-26s: ", c_fixedEvtDesc[evtIdx].eventDescription); char str[32]; double mean = TicksToSeconds(g_profilerState->fixedEvents[evtIdx].sum) / g_profilerState->fixedEvents[evtIdx].cnt; FormatTimeStr(str, sizeof(str), mean); fprintfOrDie(f, "%s ", str); double sum = TicksToSeconds(g_profilerState->fixedEvents[evtIdx].sum); double sumsq = TicksSqToSecondsSq(g_profilerState->fixedEvents[evtIdx].sumsq); double stdDev = sumsq - (pow(sum, 2.0) / g_profilerState->fixedEvents[evtIdx].cnt); if (stdDev < 0.0) stdDev = 0.0; stdDev = sqrt(stdDev / (double)g_profilerState->fixedEvents[evtIdx].cnt); FormatTimeStr(str, sizeof(str), stdDev); fprintfOrDie(f, "%s ", str); FormatTimeStr(str, sizeof(str), TicksToSeconds(g_profilerState->fixedEvents[evtIdx].min)); fprintfOrDie(f, "%s ", str); FormatTimeStr(str, sizeof(str), TicksToSeconds(g_profilerState->fixedEvents[evtIdx].max)); fprintfOrDie(f, "%s ", str); fprintfOrDie(f, "%16d ", g_profilerState->fixedEvents[evtIdx].cnt); FormatTimeStr(str, sizeof(str), TicksToSeconds(g_profilerState->fixedEvents[evtIdx].sum)); fprintfOrDie(f, "%s", str); } break; case profilerEvtThroughput: if (g_profilerState->fixedEvents[evtIdx].cnt > 0) { printLine = true; fprintfOrDie(f, "%-26s: ", c_fixedEvtDesc[evtIdx].eventDescription); char str[32]; double mean = ((double)g_profilerState->fixedEvents[evtIdx].sum / (double)g_profilerState->fixedEvents[evtIdx].cnt); FormatThroughputStr(str, sizeof(str), mean); fprintfOrDie(f, "%s ", str); double stdDev = g_profilerState->fixedEvents[evtIdx].sumsq - (pow((double)g_profilerState->fixedEvents[evtIdx].sum, 2.0) / (double)g_profilerState->fixedEvents[evtIdx].cnt); if (stdDev < 0.0) stdDev = 0.0; stdDev = sqrt(stdDev / (double)g_profilerState->fixedEvents[evtIdx].cnt); FormatThroughputStr(str, sizeof(str), stdDev); fprintfOrDie(f, "%s ", str); FormatThroughputStr(str, sizeof(str), (double)g_profilerState->fixedEvents[evtIdx].min); fprintfOrDie(f, "%s ", str); FormatThroughputStr(str, sizeof(str), (double)g_profilerState->fixedEvents[evtIdx].max); fprintfOrDie(f, "%s ", str); fprintfOrDie(f, "%16d ", g_profilerState->fixedEvents[evtIdx].cnt); FormatBytesStr(str, sizeof(str), g_profilerState->fixedEvents[evtIdx].totalBytes); fprintfOrDie(f, "%s", str); } break; case profilerEvtSeparator: printLine = true; fprintfOrDie(f, "%s", c_fixedEvtDesc[evtIdx].eventDescription); break; } if (printLine) fprintfOrDie(f, "\n"); } fclose(f); } // // String formatting helpers for reporting. // void FormatTimeStr(char* str, size_t strLen, double seconds) { if (seconds < 60.0) { sprintf_s(str, strLen, "%13.3f ms", seconds * 1000.0); } else { sprintf_s(str, strLen, " %02d:%02d:%06.3f", (int)seconds / 3600, ((int)seconds / 60) % 60, fmod(seconds, 60.0)); } } void FormatThroughputStr(char* str, size_t strLen, double kbps) { // MBps = 1000000 bytes per second sprintf_s(str, strLen, "%11.3f MBps", kbps / 1000.0); } void FormatBytesStr(char* str, size_t strLen, long long bytes) { // kB = 1024 bytes, MB = 1024*1024 bytes if (bytes < (1024ll * 1024ll)) { sprintf_s(str, strLen, "%13lld kB", bytes >> 10); } else { sprintf_s(str, strLen, "%13lld MB", bytes >> 20); } } // // Generate detail event file in chrome://tracing format (https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview#heading=h.yr703knxre9f) // void ProfilerGenerateDetailFile(const std::wstring& fileName) { FILE* f = _wfopen(fileName.c_str(), L"wt"); if (f == NULL) { RuntimeError("Error: ProfilerGenerateDetailFile: Cannot create file <%ls>.\n", fileName.c_str()); } fprintfOrDie(f, "[\n"); char* eventPtr = g_profilerState->customEventBuffer.get(); bool firstRecord = true; unsigned int pid = GetProcessId(); while (eventPtr < (g_profilerState->customEventBuffer.get() + g_profilerState->customEventOffset)) { char* descriptionStr = eventPtr; eventPtr += strlen(descriptionStr) + 1; CustomEventRecord* eventRecord = (CustomEventRecord*)eventPtr; eventPtr += sizeof(CustomEventRecord); fprintfOrDie(f, "%s {\"pid\":%u, \"tid\":%u, \"name\":\"%s\", \"cat\":\"PERF\", \"ph\":\"B\", \"ts\":%llu}", firstRecord ? "" : ",\n", pid, eventRecord->threadId, descriptionStr, (unsigned long long)(1000000.0 * TicksToSeconds(eventRecord->beginClock - g_profilerState->startClock))); firstRecord = false; fprintfOrDie(f, ",\n {\"pid\":%u, \"tid\":%u, \"name\":\"%s\", \"cat\":\"PERF\", \"ph\":\"E\", \"ts\":%llu}", pid, eventRecord->threadId, descriptionStr, (unsigned long long)(1000000.0 * TicksToSeconds(eventRecord->endClock - g_profilerState->startClock))); } fprintfOrDie(f, "\n]\n"); fclose(f); } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Scoped helpers. /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void ProfilerContext::Init(const std::wstring& profilerDir, const unsigned long long customEventBufferBytes, const std::wstring& logSuffix, const bool syncGpu) { ProfilerInit(profilerDir, customEventBufferBytes, logSuffix, syncGpu); } ProfilerContext::~ProfilerContext() { ProfilerClose(); } ScopeProfile::ScopeProfile(int eventId) { m_eventId = eventId; m_description = nullptr; m_stateId = ProfilerTimeBegin(); } ScopeProfile::ScopeProfile(const char* description) { m_description = description; m_stateId = ProfilerTimeBegin(); } ScopeProfile::~ScopeProfile() { if (m_description) { ProfilerTimeEnd(m_stateId, m_description); } else { ProfilerTimeEnd(m_stateId, m_eventId); } } ScopeThroughput::ScopeThroughput(int eventId, long long bytes) { m_bytes = bytes; m_eventId = eventId; m_stateId = ProfilerThroughputBegin(); } ScopeThroughput::~ScopeThroughput() { ProfilerThroughputEnd(m_stateId, m_eventId, m_bytes); } }}}