//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#include "stdafx.h"
#include "Learner.h"
#include "TensorView.h"
#include "Utils.h"
#include "Serialization.h"

#define UPDATE_FUNCTION                                                                       \
    switch (smoothedGradientValue->GetDataType())                                            \
    {                                                                                         \
    case DataType::Float:                                                                     \
        Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);  \
        break;                                                                                \
    case DataType::Double:                                                                    \
        Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
        break;                                                                                \
    default:                                                                                  \
        NOT_IMPLEMENTED;                                                                      \
    }

using namespace Microsoft::MSR::CNTK;
using namespace std;

namespace CNTK
{
    template <typename ElementType>
    /*static*/ shared_ptr<const Matrix<ElementType>> LearnerBase::GetMatrix(const NDArrayViewPtr& arrayView)
    {
        return arrayView->GetMatrix<ElementType>();
    }

    template <typename ElementType>
    /*static*/ shared_ptr<Matrix<ElementType>> LearnerBase::GetWritableMatrix(const NDArrayViewPtr& arrayView)
    {
        return arrayView->GetWritableMatrix<ElementType>();
    }

    template <typename ElementType>
    /*static*/ const TensorView<ElementType>* LearnerBase::GetTensorView(const NDArrayViewPtr& arrayView)
    {
        return arrayView->GetTensorView<ElementType>();
    }

    /*static*/ bool LearnerBase::HasNan(const NDArrayViewPtr& value, const char* name)
    {
        switch (value->GetDataType())
        {
        case DataType::Float:
            return value->GetMatrix<float>()->HasNan(name);
        case DataType::Double:
            return value->GetMatrix<double>()->HasNan(name);
        default:
            LogicError("Unsupported DataType %s", DataTypeName(value->GetDataType()));
        }
    }

    /*static*/ void LearnerBase::Print(const NDArrayViewPtr& value, const char* msg)
    {
        switch (value->GetDataType())
        {
        case DataType::Float:
            value->GetMatrix<float>()->Print(msg);
            break;
        case DataType::Double:
            value->GetMatrix<double>()->Print(msg);
            break;
        default:
            LogicError("Unsupported DataType %s", DataTypeName(value->GetDataType()));
        }
    }

    void LearnerBase::ResetSmoothedGradients()
    {
        for (auto v : m_smoothedGradientValues)
        {
            if (v.second->GetDataType() == DataType::Float)
                v.second->SetValue(0.0f);
            else if (v.second->GetDataType() == DataType::Double)
                v.second->SetValue(0.0);
            else
                LogicError("Unsupported DataType %s", DataTypeName(v.second->GetDataType()));
        }
    }

    // Clips gradients to prevent outliers.
    template <typename ElementType>
    void LearnerBase::ClipGradient(Matrix<ElementType>& gradient, size_t actualMBSize) const
    {
        if (m_additionalOptions.gradientClippingThresholdPerSample != numeric_limits<double>::infinity())
        {
            double maxGradientPerMB = m_additionalOptions.gradientClippingThresholdPerSample * actualMBSize;
            if (m_additionalOptions.gradientClippingWithTruncation)
                gradient.InplaceTruncate(ElementType(maxGradientPerMB));
            else
            {
                // norm2 normalized
                double gradientNorm = gradient.FrobeniusNorm();
                if (gradientNorm > maxGradientPerMB)
                {
                    double normFactor = maxGradientPerMB / gradientNorm;
                    gradient *= ElementType(normFactor);
                }
            }
        }
    }
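
    // Worked example of the clipping logic above (the numbers are illustrative only):
    // with gradientClippingThresholdPerSample = 0.1 and actualMBSize = 64, the per-minibatch
    // threshold is 6.4. With truncation enabled, every gradient element is clamped to
    // [-6.4, 6.4]; with truncation disabled, a gradient whose Frobenius norm is, say, 12.8
    // is rescaled as a whole by 6.4 / 12.8 = 0.5 so that its norm meets the threshold.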
    // Performs additional preprocessing before calling the update method
    // (gradient clipping and L2 regularization depending on the additional learning parameters).
    template <typename ElementType>
    void LearnerBase::PreProcess(const NDArrayViewPtr& parameterValue, const NDArrayViewPtr& gradientValue, size_t actualMBSize) const
    {
        const auto& gradientMatrix = gradientValue->GetWritableMatrix<ElementType>();

        // clip gradients to prevent outliers
        ClipGradient<ElementType>(*gradientMatrix, actualMBSize);

        // L2 regularizer
        if (m_additionalOptions.l2RegularizationWeight > 0)
        {
            // multiply by actualMBSize so that it's invariant to minibatch size, since the learning rate is per sample
            const auto weight = m_additionalOptions.l2RegularizationWeight * actualMBSize;
            const auto& parameterMatrix = parameterValue->GetWritableMatrix<ElementType>();
            Matrix<ElementType>::ScaleAndAdd(ElementType(weight), *parameterMatrix, *gradientMatrix);
        }
    }

    // Performs additional postprocessing after the update method has been executed
    // (noise injection and L1 regularization specified by the additional learning parameters).
    template <typename ElementType>
    void LearnerBase::PostProcess(const Parameter& parameter, const NDArrayViewPtr& gradientValue, size_t actualMBSize) const
    {
        const auto& parameterValue = parameter.Value();
        const auto& parameterMatrix = parameterValue->GetWritableMatrix<ElementType>();
        const auto gaussianNoiseInjectionStdDev = GetCurrentTrainingParameterValue(m_additionalOptions.gaussianNoiseInjectionStdDev);
        if (gaussianNoiseInjectionStdDev > 0)
        {
            const auto& gradientMatrix = gradientValue->GetWritableMatrix<ElementType>();

            Matrix<ElementType> sgdUpdateNoise((DEVICEID_TYPE)parameterMatrix->GetDeviceId());

            // get the gradient structure, since the gradient is sparse
            sgdUpdateNoise.SetValue(*gradientMatrix);

            const auto noiseStdDev = gaussianNoiseInjectionStdDev;

            // reset its value to random
            sgdUpdateNoise.SetGaussianRandomValue(ElementType(0.0), ElementType(noiseStdDev));

            Matrix<ElementType>::ScaleAndAdd(ElementType(1.0), sgdUpdateNoise, *parameterMatrix);
        }

        // L1 regularizer with proximal gradient descent method
        if (m_additionalOptions.l1RegularizationWeight > 0)
        {
            const auto learningRate = LearningRate(actualMBSize);
            // multiply by actualMBSize so that it's invariant to minibatch size, since the learning rate is per sample
            const auto weight = learningRate * m_additionalOptions.l1RegularizationWeight * actualMBSize;
            parameterValue->GetWritableMatrix<ElementType>()->InplaceSoftThreshold(ElementType(weight));
        }
    }
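
    // Summary of the regularization terms above (restating the code, not altering it):
    // - L2: the gradient is augmented in place, g <- g + (l2RegularizationWeight * actualMBSize) * w,
    //   i.e. the derivative of the squared-norm penalty, scaled so it stays invariant to minibatch size.
    // - L1: after the main update, a proximal step w <- sign(w) * max(|w| - weight, 0)
    //   (soft-thresholding) is applied via InplaceSoftThreshold, with
    //   weight = learningRate * l1RegularizationWeight * actualMBSize.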
    template <typename ElementType>
    /*static*/ TensorView<ElementType>* LearnerBase::GetWritableTensorView(const NDArrayViewPtr& arrayView)
    {
        return arrayView->GetWritableTensorView<ElementType>();
    }

    LearnerBase::LearnerBase(const vector<Parameter>& parameters,
                             const LearningRateSchedule& learningRateSchedule,
                             AdditionalLearningOptions additionalOptions,
                             bool allocateSmoothGradients /* = true */)
        : Learner(parameters, learningRateSchedule),
          m_minibatchCount(0),
          m_additionalOptions(additionalOptions)
    {
        std::unordered_set<Parameter> uniqueParameters(parameters.begin(), parameters.end());

        if (uniqueParameters.size() != parameters.size())
        {
            LogicError("Learner parameters contain duplicates.");
        }

        for (const auto& parameter : parameters)
        {
            if (!allocateSmoothGradients)
            {
                continue;
            }
            NDArrayViewPtr view = AllocateNDArrayView(parameter, parameter.Shape());
            m_smoothedGradientValues.insert(make_pair(parameter, view));
        }
    }

    /*static*/ NDArrayViewPtr LearnerBase::AllocateNDArrayView(const Parameter& parameter, const NDShape& shape)
    {
        if (parameter.GetDataType() == DataType::Float)
        {
            return MakeSharedObject<NDArrayView>(float(0.0), shape, parameter.Value()->Device());
        }
        else
        {
            return MakeSharedObject<NDArrayView>(0.0, shape, parameter.Value()->Device());
        }
    }

    /*static*/ NDShape LearnerBase::GetMatrixShape(const Parameter& parameter)
    {
        if (parameter.GetDataType() == DataType::Float)
        {
            auto matrix = GetMatrix<float>(parameter.Value());
            return { matrix->GetNumRows(), matrix->GetNumCols() };
        }
        else
        {
            auto matrix = GetMatrix<double>(parameter.Value());
            return { matrix->GetNumRows(), matrix->GetNumCols() };
        }
    }

    /*virtual*/ bool LearnerBase::Update(const unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount) /*override*/
    {
        if (LearningRate(trainingSampleCount) == 0.0)
        {
            return false;
        }

        // make sure trainingSampleCount is a valid value
        assert(trainingSampleCount > 0);

        for (const auto& parameter : Parameters())
        {
            const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
            const auto& gradientValue = gradientValues.at(parameter);
            // TODO: make this a runtime parameter.
#if DUMPOUTPUT
            LOGPRINTF(stderr, "Update_%ls\n", parameter.Uid().c_str());
#endif

#ifdef _DEBUG
            if (HasNan(smoothedGradientValue, "TrainOneEpoch/UpdateWeights/Learner::Update(): "))
                LogicError("%ls has NaNs in smoothedGradient.", parameter.Uid().c_str());
#endif

#if DUMPOUTPUT
            const auto learningRate = LearningRate(trainingSampleCount);
            const auto momentum = MomentumValueForMB(trainingSampleCount);
            LOGPRINTF(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
                      learningRate, momentum, trainingSampleCount);
            LOGPRINTF(stderr, "GradUpdateType()=%s, GradientUpdateNoiseStd()=%0.8f\n",
                      LearnerType().c_str(), m_additionalOptions.gaussianNoiseInjectionStdDev);
            Print(gradientValue, "Gradient Update");
            Print(smoothedGradientValue, "Smoothed Gradient Input");
#endif
            UPDATE_FUNCTION;

#if DUMPOUTPUT
            Print(parameter.Value(), "Parameter Update");
#endif

#ifdef _DEBUG
            const auto& parameterValue = parameter.Value();
            if (HasNan(parameterValue, "TrainOneEpoch/UpdateWeights/Learner::Update(): "))
                LogicError("%ls has NaNs in parameter values after parameter update.", parameter.Uid().c_str());
#endif
        }
        m_sampleCount += trainingSampleCount;
        m_minibatchCount++;
        // TODO: sweep count also needs to be updated.
        return true;
    }

    template <typename ElementType>
    void LearnerBase::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
    {
        const auto& parameterValue = parameter.Value();
        PreProcess<ElementType>(parameterValue, gradientValue, trainingSampleCount);
        Update(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
        PostProcess<ElementType>(parameter, gradientValue, trainingSampleCount);

        auto paramRef = parameter;
        paramRef.RecordValueUpdate();
    }

    string LearnerBase::LearnerType() const
    {
        return Typename(this);
    }

    static const std::wstring s_learnerTypeValue = L"Learner";
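
    // Checkpoint layout produced by Serialize() below: alongside the version, type,
    // sample count, minibatch count, and learning rate schedule entries, the dictionary
    // stores one entry per parameter, keyed by the parameter's UID and holding that
    // parameter's smoothed-gradient NDArrayView. RestoreFromCheckpoint() expects the same
    // keys and validates the data type and shape of each saved value before copying it
    // back into m_smoothedGradientValues.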
    /*virtual*/ Dictionary LearnerBase::Serialize() const /*override*/
    {
        Dictionary checkpoint;

        checkpoint[versionKey] = CurrentVersion();
        checkpoint[typeKey] = s_learnerTypeValue;
        checkpoint[sampleCountKey] = m_sampleCount;
        checkpoint[minibatchCountKey] = m_minibatchCount;
        checkpoint[learningRateScheduleKey] = m_learningRateSchedule.Serialize();

        // TODO: should we also save the momentum schedule into the checkpoint?
        // If that is the case, we need to be able to override this method in subclasses.
        // TODO: we now store a mapping from UID to Parameter value in the checkpoint;
        // if UIDs turn out to be too fragile, this can easily be changed to a vector
        // of parameters, so that on restore we don't have to look up based on UIDs
        // and can simply restore parameters one by one in the original (DFS) order.
        for (const auto& parameter : Parameters())
        {
            if (checkpoint.Contains(parameter.Uid()))
            {
                LogicError("Parameter uids must be unique");
            }

            const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
            checkpoint[parameter.Uid()] = *smoothedGradientValue;
        }
        return checkpoint;
    }

    /*virtual*/ void LearnerBase::RestoreFromCheckpoint(const Dictionary& checkpoint) /*override*/
    {
        static const vector<std::wstring> s_requiredDictionaryKeys = { typeKey, sampleCountKey, minibatchCountKey, learningRateScheduleKey };

        ValidateDictionary<LearnerBase>(checkpoint, s_requiredDictionaryKeys, s_learnerTypeValue, CurrentVersion());

        m_sampleCount = checkpoint[sampleCountKey].Value<size_t>();
        m_minibatchCount = checkpoint[minibatchCountKey].Value<size_t>();

        // TODO: which learning rate schedule should take precedence here?
        // The one given at construction time or the one loaded from a checkpoint?
        m_learningRateSchedule = TrainingParameterSchedule<double>::Deserialize(checkpoint[learningRateScheduleKey].Value<Dictionary>());

        const auto parameters = Parameters();

        for (const auto& parameter : parameters)
        {
            if (!checkpoint.Contains(parameter.Uid()))
            {
                LogicError("Checkpoint does not contain state for parameter %ls", parameter.Uid().c_str());
            }

            const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
            const NDArrayView& checkpointedValue = checkpoint[parameter.Uid()].Value<NDArrayView>();

            if (smoothedGradientValue->GetDataType() != checkpointedValue.GetDataType())
            {
                LogicError("A value restored from a checkpoint for the smoothed gradient data type for parameter %ls does not match the expected value",
                           parameter.Uid().c_str());
            }

            if (smoothedGradientValue->Shape() != checkpointedValue.Shape())
            {
                LogicError("A value restored from a checkpoint for the smoothed gradient shape for parameter %ls does not match the expected value",
                           parameter.Uid().c_str());
            }

            smoothedGradientValue->CopyFrom(checkpointedValue);
        }
    }
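
    // The SGD-family updates below delegate the arithmetic to Matrix<ElemType>::NormalGrad.
    // Conceptually (up to CNTK's internal scaling conventions) this is the usual momentum update,
    //     v <- momentum * v + learningRate * g,    w <- w - v,
    // with plain SGD as the special case momentum == 0 and the Nesterov flag switching to the
    // lookahead (NAG) form. This is a descriptive sketch only; the authoritative behavior is
    // whatever NormalGrad implements for the given element type and device.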
    /*virtual*/ void LearnerSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
    {
        UPDATE_FUNCTION;
    }

    template <typename ElementType>
    void LearnerSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
    {
        const auto& parameterValue = parameter.Value();
        const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);

        const auto learningRate = ElementType(LearningRate(trainingSampleCount));
        const auto momentum = ElementType(MomentumValueForMB(trainingSampleCount));

        // TODO: break up NormalGrad into 3 different functions, each with its own set of parameters
        // (one for vanilla SGD, another for momentum SGD, and a third one for NAG).
        // Also, come up with a better name for NormalGrad (Default? Regular? Plain?).
        smoothedGradientMatrix->NormalGrad(*gradientMatrix, *parameterMatrix,
                                           learningRate, momentum, UseNesterovMomentum());
    }

    double LearnerMomentumSGD::MomentumValueForMB(const MomentumSchedule& schedule, size_t minibatchSize) const
    {
        double currentMomentum = GetCurrentTrainingParameterValue(schedule);
        if (schedule.Unit() == MomentumSchedule::UnitType::Minibatch)
        {
            return currentMomentum;
        }
        return std::pow(currentMomentum, minibatchSize);
    }

    /*virtual*/ void LearnerAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
    {
        UPDATE_FUNCTION;
    }

    template <typename ElementType>
    void LearnerAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
    {
        UNUSED(trainingSampleCount);

        const auto& parameterValue = parameter.Value();
        const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);

        const auto learningRate = LearningRate(trainingSampleCount);

        const auto aveMultiplier = smoothedGradientMatrix->Adagrad(*gradientMatrix, m_needAveMultiplier);
        Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
    }

    /*static*/ const double LearnerFSAdaGrad::s_targetAdagradAvDenom = 1.0;

    LearnerFSAdaGrad::LearnerFSAdaGrad(const vector<Parameter>& parameters,
                                       const LearningRateSchedule& learningRateSchedule,
                                       const MomentumSchedule& momentumSchedule,
                                       const MomentumSchedule& varianceMomentumSchedule,
                                       AdditionalLearningOptions additionalOptions)
        : LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule, additionalOptions, /*allocateSmoothGradients*/ false),
          m_varianceMomentumSchedule(varianceMomentumSchedule)
    {
        for (const auto& parameter : parameters)
        {
            const auto shape = GetMatrixShape(parameter);
            NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], 2 * shape[1] });
            m_smoothedGradientValues.insert(make_pair(parameter, view));
            m_smoothedCounts.insert(make_pair(parameter, 0.0));
        }
    }

    /*virtual*/ void LearnerFSAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
    {
        UPDATE_FUNCTION;
    }

    template <typename ElementType>
    void LearnerFSAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
    {
        const auto& parameterValue = parameter.Value();
        const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);

        const auto learningRate = LearningRate(trainingSampleCount);
        const auto momentum = MomentumValueForMB(trainingSampleCount);
        const auto varMomentum = VarianceMomentumValueForMB(trainingSampleCount);

        double& smoothedCount = m_smoothedCounts.at(parameter);

        smoothedGradientMatrix->FSAdagradUpdate(trainingSampleCount,
                                                *gradientMatrix, *parameterMatrix, smoothedCount,
                                                learningRate, s_targetAdagradAvDenom, momentum, varMomentum);
    }
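
    // LearnerRMSProp below wraps Matrix<ElemType>::RmsProp (the CNTK V1 RMSProp variant):
    // a per-element running average of squared gradients scales each step, while gamma
    // controls the decay of that average and inc/dec/max/min bound the multiplicative
    // per-element rate adjustments. The constructor allocates the extra columns of
    // per-element state that this routine maintains alongside the smoothed gradient.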
    LearnerRMSProp::LearnerRMSProp(const vector<Parameter>& parameters,
                                   const LearningRateSchedule& learningRateSchedule,
                                   double gamma, double inc, double dec, double max, double min,
                                   bool needAveMultiplier,
                                   AdditionalLearningOptions additionalOptions)
        : LearnerBase(parameters, learningRateSchedule, additionalOptions, /*allocateSmoothGradients*/ false),
          m_gamma(gamma), m_inc(inc), m_dec(dec), m_max(max), m_min(min),
          m_needAveMultiplier(needAveMultiplier)
    {
        for (const auto& parameter : parameters)
        {
            // When needAveMultiplier == true, the CPU and GPU implementations of RMSProp require different numbers of columns.
            size_t factor = 3;
            if (needAveMultiplier && parameter.Value()->Device().Type() == DeviceKind::GPU)
            {
                factor = 4;
            }

            const auto shape = GetMatrixShape(parameter);
            NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], factor * shape[1] });

            m_smoothedGradientValues.insert(make_pair(parameter, view));
        }
    }

    /*virtual*/ void LearnerRMSProp::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
    {
        UPDATE_FUNCTION;
    }

    template <typename ElementType>
    void LearnerRMSProp::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
    {
        UNUSED(trainingSampleCount);

        const auto& parameterValue = parameter.Value();
        const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);

        const auto learningRate = LearningRate(trainingSampleCount);

        const auto aveMultiplier = smoothedGradientMatrix->RmsProp(*gradientMatrix,
                                                                   ElementType(m_gamma), ElementType(m_inc), ElementType(m_max),
                                                                   ElementType(m_dec), ElementType(m_min),
                                                                   m_needAveMultiplier);
        Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
    }

    // Explicit template instantiations
    template shared_ptr<Matrix<float>> LearnerBase::GetWritableMatrix<float>(const NDArrayViewPtr& arrayView);
    template shared_ptr<Matrix<double>> LearnerBase::GetWritableMatrix<double>(const NDArrayViewPtr& arrayView);
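
    // Illustrative usage of the factory functions below (a sketch only; the construction of
    // the parameter vector and of the schedules is omitted):
    //
    //     std::vector<Parameter> parameters = ...;   // parameters of the model being trained
    //     LearningRateSchedule lrSchedule = ...;     // per-sample learning rate schedule
    //     MomentumSchedule momentumSchedule = ...;   // momentum schedule
    //     auto learner = MomentumSGDLearner(parameters, lrSchedule, momentumSchedule);
    //     // The returned LearnerPtr is typically handed to a Trainer, which calls
    //     // learner->Update(gradientValues, sampleCount) once per minibatch.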
    LearnerPtr SGDLearner(const vector<Parameter>& parameters,
                          const LearningRateSchedule& learningRateSchedule,
                          AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
    {
        return MakeSharedObject<LearnerSGD>(parameters, learningRateSchedule, additionalOptions);
    }

    LearnerPtr MomentumSGDLearner(const vector<Parameter>& parameters,
                                  const LearningRateSchedule& learningRateSchedule,
                                  const MomentumSchedule& momentumSchedule,
                                  AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
    {
        return MakeSharedObject<LearnerMomentumSGD>(parameters, learningRateSchedule, momentumSchedule, additionalOptions);
    }

    LearnerPtr NesterovLearner(const vector<Parameter>& parameters,
                               const LearningRateSchedule& learningRateSchedule,
                               const MomentumSchedule& momentumSchedule,
                               AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
    {
        return MakeSharedObject<LearnerNesterov>(parameters, learningRateSchedule, momentumSchedule, additionalOptions);
    }

    LearnerPtr AdamLearner(const vector<Parameter>& parameters,
                           const LearningRateSchedule& learningRateSchedule,
                           const MomentumSchedule& momentumSchedule,
                           const MomentumSchedule& varianceMomentumSchedule, /*= MomentumAsTimeConstantSchedulePerSample(2 * 3600 * 100)*/
                           bool lowMemory, /*= true*/
                           AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
    {
        if (!lowMemory)
        {
            LogicError("AdamLearner: only the low-memory variant is supported at the moment.");
        }
        return MakeSharedObject<LearnerFSAdaGrad>(parameters, learningRateSchedule, momentumSchedule, varianceMomentumSchedule, additionalOptions);
    }

    LearnerPtr AdaGradLearner(const vector<Parameter>& parameters,
                              const LearningRateSchedule& learningRateSchedule,
                              bool needAveMultiplier /*= true*/,
                              AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
    {
        return MakeSharedObject<LearnerAdaGrad>(parameters, learningRateSchedule, needAveMultiplier, additionalOptions);
    }

    LearnerPtr RMSPropLearner(const vector<Parameter>& parameters,
                              const LearningRateSchedule& learningRateSchedule,
                              double gamma, double inc, double dec, double max, double min,
                              bool needAveMultiplier /*= true*/,
                              AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
    {
        return MakeSharedObject<LearnerRMSProp>(parameters, learningRateSchedule,
                                                gamma, inc, dec, max, min, needAveMultiplier, additionalOptions);
    }
}