// // Copyright (c) Microsoft. All rights reserved. // Licensed under the MIT license. See LICENSE.md file in the project root for full license information. // #pragma once #include "Basics.h" #include "Globals.h" #include "Matrix.h" #include "ComputationNode.h" #include "ConvolutionEngine.h" namespace Microsoft { namespace MSR { namespace CNTK { // ----------------------------------------------------------------------- // ConvolutionNodeBase // ----------------------------------------------------------------------- // ConvolutionNodeBase is a base class for ND-convolution(ConvolutionNode) and ND-pooling(PoolingNode). // // 2D convolutions (incl. pooling) support two different storage formats: // // * legacy ("HWC") mode: Channels are tuples of scalars // // This follows "high performance convolutional neural networks for document processing" by Kumar Chellapilla, Sidde Puri, and Patrice Simard. // Each sample is stored as a column-major matrix (height, width) of float[numChannels] (r00, g00, b00, r10, g10, b10, r01, g01, b01, r11, g11, b11). 
//
//     - input : [C x W x H x T] or ARRAY[1..T] OF ARRAY[1..H] OF ARRAY[1..W] OF ARRAY[1..C]
//     - output : [K x W' x H' x T] or ARRAY[1..T] OF ARRAY[1..H'] OF ARRAY[1..W'] OF ARRAY[1..K]
//     - filter : [K x W" x H" x C ] or ARRAY[1..C] OF ARRAY[1..H"] OF ARRAY[1..W"] OF ARRAY[1..K]
//
// * cudnn ("CHW") mode (works on both GPU and CPU): Channels are planes
//
//     - input : [W x H x C x T] or ARRAY[1..T] OF ARRAY[1..C] OF ARRAY[1..H] OF ARRAY[1..W]
//     - output : [W' x H' x K x T] or ARRAY[1..T] OF ARRAY[1..K] OF ARRAY[1..H'] OF ARRAY[1..W']
//     - filter : [W" x H" x C x K ] or ARRAY[1..K] OF ARRAY[1..C] OF ARRAY[1..H"] OF ARRAY[1..W"]
//
// where:
//  - using ' for output and " for filter
//  - T = samples (NVidia calls this N)
//  - W, H = width, height (W', H' for output, W", H" for kernel)
//  - C = input channels
//     - 3 for color images, 1 for B&W images
//     - for hidden layer: dimension of activation vector for each pixel
//  - K = output channels = dimension of activation vector for each pixel (also called N by NVidia, inconsistently)
//
// For ND-convolution/pooling only the second format ('cudnn') is supported.
// template class ConvolutionNodeBase : public ComputationNode { typedef ComputationNode Base; UsingComputationNodeMembers; public: ConvolutionNodeBase(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name), m_poolKind(PoolKind::None), m_poolIncludePad(false), m_transpose(false), m_outputShape(TensorShape(0)), m_ceilOutDim(false), m_maxTempMemSizeInSamples(0) { } ConvolutionNodeBase(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape, const std::vector& sharing, const std::vector& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad, PoolKind poolKind, bool poolIncludePad, bool transpose, const TensorShape& outputShape, bool ceilOutDim, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples) : Base(deviceId, name), m_kernelShape(kernelShape), m_mapCount(mapCount), m_stride(strideShape), m_sharing(sharing), m_autoPad(autoPadding), m_lowerPad(lowerPad), m_upperPad(upperPad), m_poolKind(poolKind), m_poolIncludePad(poolIncludePad), m_transpose(transpose), m_outputShape(outputShape), m_ceilOutDim(ceilOutDim), m_imageLayout(imageLayout), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples) { } public: void Save(File& fstream) const override { Base::Save(fstream); m_kernelShape.Save(fstream); m_mapCount.Save(fstream); m_stride.Save(fstream); fstream << m_sharing; fstream << m_autoPad; m_lowerPad.Save(fstream); m_upperPad.Save(fstream); fstream << (int32_t)m_poolKind; fstream << (int32_t)m_imageLayout; fstream << m_maxTempMemSizeInSamples; fstream << m_transpose; m_outputShape.Save(fstream); fstream << m_ceilOutDim; fstream << m_poolIncludePad; } void Load(File& fstream, size_t modelVersion) override { Base::Load(fstream, modelVersion); // Let ConvolutionNode handle older models. 
if (modelVersion >= CNTK_MODEL_VERSION_5) { m_kernelShape.Load(fstream); m_mapCount.Load(fstream); m_stride.Load(fstream); fstream >> m_sharing; fstream >> m_autoPad; m_lowerPad.Load(fstream); m_upperPad.Load(fstream); int32_t k; fstream >> k; m_poolKind = (PoolKind)k; int32_t layout; fstream >> layout; m_imageLayout = (ImageLayoutKind)layout; fstream >> m_maxTempMemSizeInSamples; } if (modelVersion >= CNTK_MODEL_VERSION_9) { fstream >> m_transpose; } if (modelVersion >= CNTK_MODEL_VERSION_20) { m_outputShape.Load(fstream); } if (modelVersion >= CNTK_MODEL_VERSION_21) { fstream >> m_ceilOutDim; } if (modelVersion >= CNTK_MODEL_VERSION_23) { fstream >> m_poolIncludePad; } } void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override { Base::CopyTo(nodeP, newName, flags); if (flags & CopyNodeFlags::copyNodeValue) { auto node = dynamic_pointer_cast>(nodeP); node->m_kernelShape = m_kernelShape; node->m_mapCount = m_mapCount; node->m_stride = m_stride; node->m_sharing = m_sharing; node->m_autoPad = m_autoPad; node->m_lowerPad = m_lowerPad; node->m_upperPad = m_upperPad; node->m_poolKind = m_poolKind; node->m_transpose = m_transpose; node->m_outputShape = m_outputShape; node->m_ceilOutDim = m_ceilOutDim; node->m_poolIncludePad = m_poolIncludePad; node->m_imageLayout = m_imageLayout; node->m_maxTempMemSizeInSamples = m_maxTempMemSizeInSamples; } } void DumpNodeInfo(const bool printValues, const bool printMetadata, File& fstream) const override { Base::DumpNodeInfo(printValues, printMetadata, fstream); if (m_convEng != nullptr) fstream << "Geometry: " << string(*m_convEng->Geometry()) << "\n"; fstream << "PoolKind: " << (int)m_poolKind << "\n"; } TensorShape KernelShape() const { return m_kernelShape; } TensorShape MapCount() const { return m_mapCount; } TensorShape Strides() const { return m_stride; } std::vector Sharing() const { return m_sharing; } std::vector AutoPad() const { return m_autoPad; } TensorShape 
LowerPad() const { return m_lowerPad; } TensorShape UpperPad() const { return m_upperPad; } bool Transpose() const { return m_transpose; } TensorShape OutputShape() const { return m_outputShape; } size_t MaxTempMemSizeInSamples() const { return m_maxTempMemSizeInSamples; } PoolKind PoolingKind() const { return m_poolKind; } bool CeilOutDim() const { return m_ceilOutDim; } bool PoolIncludePad() const { return m_poolIncludePad; } // bottomlessly expand shape to filterRank, then expand to inputRank using defaults or given 'from' values template static void FixVectorShape(size_t filterRank, size_t inputRank, V& shape, T deflt, const V& from = V()) { if (shape.size() == 0) return; // let ComputeOutputShape() deal with this special case // repeat the last value until we have the same rank as the filter while (shape.size() < filterRank) shape.push_back(shape.back()); // increase to input rank // If 'from' is given then clone the value from there. This is meant to be the input dimensions for convolution. while (shape.size() < inputRank) shape.push_back(shape.size() < from.size() ? 
from[shape.size()] : deflt); } private: static void FixTensorShape(size_t filterRank, size_t inputRank, TensorShape& shape, size_t deflt, const TensorShape& from = TensorShape()) { auto dims = shape.GetDims(); FixVectorShape(filterRank, inputRank, dims, deflt, from.GetDims()); shape = TensorShape(dims); } protected: // infer reduction dimensions if m_convolution2D is true, for legacy NDL branch void InferConvolution2DReductionDims(const TensorShape& inputShape, size_t numChannels) { size_t kW = m_kernelShape[0]; size_t kH = m_kernelShape[1]; size_t sW = m_stride[0]; size_t sH = m_stride[1]; m_kernelShape = TensorShape(kW, kH, numChannels); m_stride = TensorShape(sW, sH, numChannels); size_t filterRank = 2; FixVectorShape(filterRank, inputShape.size(), m_autoPad, false); FixTensorShape(filterRank, inputShape.size(), m_lowerPad, 0); FixTensorShape(filterRank, inputShape.size(), m_upperPad, 0); FixVectorShape(filterRank, inputShape.size(), m_sharing, true); } // infer reduction dimensions if not given void InferReductionDims(const TensorShape& inputShape, const TensorShape& fromShape) { // If kernel has a lower rank than the input then the remaining dimensions are to be reduced over. size_t filterRank = m_kernelShape.size(); FixTensorShape(filterRank, inputShape.size(), m_kernelShape, 1, fromShape); // convolve over red dim; pool over 1 FixTensorShape(filterRank, inputShape.size(), m_stride, 1, fromShape); // stride for reduction dims is red dim or 1 FixVectorShape(filterRank, inputShape.size(), m_autoPad, false); // no padding for reduction dims FixTensorShape(filterRank, inputShape.size(), m_lowerPad, 0); FixTensorShape(filterRank, inputShape.size(), m_upperPad, 0); FixVectorShape(filterRank, inputShape.size(), m_sharing, true); } // Derived classes implement transforms calculation. Since all derived classes are filter based we consolidate common // filter transform calculation here to be reused by derived classes. 
For example convolution and de-convolution // have same transform but inversed, hence both of them may reuse this method and one will call inverse in addition // (similar holds for pooling nodes). SpaceTransform ComputeFilterTransform() { std::shared_ptr geometry = m_convEng->Geometry(); SpaceTransform result; result.m_axisTransforms.resize(2); result.m_axisTransforms[0].scale = (float)(geometry->GetStride(0)); result.m_axisTransforms[0].translate = (float)((geometry->KernelShape()[0] - 1) / 2 - geometry->GetLowerPad(0)); result.m_axisTransforms[1].scale = (float)(geometry->GetStride(1)); result.m_axisTransforms[1].translate = (float)((geometry->KernelShape()[1] - 1) / 2 - geometry->GetLowerPad(1)); return result; } virtual TensorShape ComputeOutputShape(const TensorShape& inputShape, const TensorShape& dilate, bool ceilOutDim, bool isFinalValidationPass) { const size_t DEAFULT_NUM_GROUPS = 1; return ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride, m_sharing, m_autoPad, m_lowerPad, m_upperPad, dilate, DEAFULT_NUM_GROUPS, ceilOutDim, Base::NeedsDynamicValidation(), isFinalValidationPass); } protected: TensorShape m_kernelShape; TensorShape m_mapCount; TensorShape m_stride; std::vector m_sharing; std::vector m_autoPad; TensorShape m_lowerPad; TensorShape m_upperPad; PoolKind m_poolKind; bool m_transpose; TensorShape m_outputShape; bool m_ceilOutDim; bool m_poolIncludePad; ImageLayoutKind m_imageLayout; size_t m_maxTempMemSizeInSamples; shared_ptr> m_tempMatrixForward; shared_ptr> m_tempMatrixBackward; std::unique_ptr> m_convEng; }; #define UsingConvolutionNodeBaseMembers \ UsingComputationNodeMembersBoilerplate; \ protected: \ using Base::m_kernelShape; \ using Base::m_mapCount; \ using Base::m_stride; \ using Base::m_sharing; \ using Base::m_autoPad; \ using Base::m_lowerPad; \ using Base::m_upperPad; \ using Base::m_poolKind; \ using Base::m_transpose; \ using Base::m_outputShape; \ using Base::m_ceilOutDim; \ using 
Base::m_poolIncludePad; \ using Base::m_imageLayout; \ using Base::m_maxTempMemSizeInSamples; \ using Base::m_tempMatrixForward; \ using Base::m_tempMatrixBackward; \ using Base::m_convEng; \ using Base::InferConvolution2DReductionDims; \ using Base::InferReductionDims; \ public: // ----------------------------------------------------------------------- // ConvolutionNode (convolutionWeights, inputFeature) // ----------------------------------------------------------------------- template class ConvolutionNode : public ConvolutionNodeBase, public NumInputs<2>, public TransformerNode { typedef ConvolutionNodeBase Base; UsingConvolutionNodeBaseMembers; static const std::wstring TypeName() { return L"Convolution"; } public: ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name), m_dilation(TensorShape(1)), m_groups(1) { } ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape, const std::vector& sharing, const std::vector& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad, bool transpose, const TensorShape &outputShape, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, const TensorShape& dilation=TensorShape(1), size_t groups=1) : Base(deviceId, name, kernelShape, mapCount, strideShape, sharing, autoPadding, lowerPad, upperPad, PoolKind::None, false, transpose, outputShape, false, imageLayout, maxTempMemSizeInSamples), m_convolution2D(false), m_dilation(dilation), m_groups(groups) { // Make sure not using dilation on CPU if(deviceId < 0) { for(int i = 0; i < dilation.size(); i++) { if(1 != dilation[i]) RuntimeError("Dilated convolution on CPU is not yet implemented."); } } } ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayout, bool 
zeroPadding, size_t maxTempMemSizeInSamples) : ConvolutionNode(deviceId, name, TensorShape(kernelWidth, kernelHeight, 1), TensorShape(1, 1, outputChannels), TensorShape(horizontalSubsample, verticalSubsample, 1), vector{true}, vector{zeroPadding}, TensorShape(0), TensorShape(0), false, TensorShape(0), imageLayout, maxTempMemSizeInSamples) { m_convolution2D = true; } ConvolutionNode(const ScriptableObjects::IConfigRecordPtr configp) : ConvolutionNode(configp->Get(L"deviceId"), L"", configp->Get(L"kernelShape"), configp->Get(L"mapCount"), configp->Get(L"strideShape"), configp->Get(L"dimSharing"), configp->Get(L"dimPadding"), configp->Get(L"dimPadLower"), configp->Get(L"dimPadUpper"), configp->Get(L"transpose"), configp->Get(L"dimOutputShape"), ImageLayoutKindFrom(configp->Get(L"imageLayout")), configp->Get(L"maxTempMemSizeInSamples"), configp->Get(L"dimDilation")) { AttachInputsFromConfig(configp, GetExpectedNumInputs()); } // TODO: the check for NeedsDynamicValidation() is a temporary resolution and needs to be properly handled when we look at support for free dimension convolution inputs. virtual ParentGradientOptimization ImplementsGradientOptimization(const ComputationNodeBase*) const override { bool overwrite = Base::NeedsDynamicValidation() ? false : m_convEng->ImplementsGradientOverwriteOptimization(); return overwrite ? ParentGradientOptimization::Overwrite : ParentGradientOptimization::None; } public: void Save(File& fstream) const override { Base::Save(fstream); fstream << m_convolution2D; m_dilation.Save(fstream); } void Load(File& fstream, size_t modelVersion) override { Base::Load(fstream, modelVersion); // Back compat: load pre-ND convolution models. 
if (modelVersion < CNTK_MODEL_VERSION_5) { size_t kW, kH, sW, sH; fstream >> kW; fstream >> kH; fstream >> sW; fstream >> sH; uint32_t imageLayout, mapCount; fstream >> mapCount; fstream >> imageLayout; m_imageLayout = (ImageLayoutKind)imageLayout; bool pad; fstream >> pad; fstream >> m_maxTempMemSizeInSamples; m_poolKind = PoolKind::None; m_convolution2D = true; m_kernelShape = TensorShape(kW, kH, 1); m_mapCount = TensorShape(mapCount); m_stride = TensorShape(sW, sH, 1); m_sharing = vector{true}; m_autoPad = vector{pad}; m_lowerPad = TensorShape(0); m_upperPad = TensorShape(0); } else { fstream >> m_convolution2D; if (modelVersion >= CNTK_MODEL_VERSION_18) { m_dilation.Load(fstream); } else { m_dilation = TensorShape(1); } } } void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override { Base::CopyTo(nodeP, newName, flags); if (flags & CopyNodeFlags::copyNodeValue) { auto node = dynamic_pointer_cast>(nodeP); node->m_convolution2D = m_convolution2D; } } void ForwardProp(const FrameRange& fr) override { Matrix sliceOutputValue = ValueFor(fr); const Matrix& input0 = InputRef(0).ValueAsMatrix(); Matrix sliceInput1Value = InputRef(1).ValueFor(fr); if (!m_transpose) m_convEng->Forward(sliceInput1Value, input0, sliceOutputValue, *m_tempMatrixForward); else { // BackwardData adds results to the output so need to zero them out first. // REVIEW alexeyk: should be rolled into BackwardData itself. 
sliceOutputValue.SetValue(0); m_convEng->BackwardData(sliceInput1Value, input0, sliceOutputValue, /*accumulateGradient =*/ true, *m_tempMatrixForward); } } void BackpropTo(const size_t inputIndex, const FrameRange& fr) override { auto sliceOutputGrad = GradientFor(fr); // this potentially computes over time, so we must mask gaps to 0 if (Input(inputIndex)->ReducesInTimeWrt(shared_from_this())) MaskMissingGradientColumnsToZero(fr); if (Input(inputIndex)->ReducesInTimeWrt(Input(1 - inputIndex))) Input(1 - inputIndex)->MaskMissingValueColumnsToZero(fr); if (inputIndex == 0) // derivative with respect to the weight matrix { auto& grad = InputRef(0).GradientAsMatrix(); auto sliceInput1Value = InputRef(1).ValueFor(fr); if (!m_transpose) m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, !Input(inputIndex)->IsGradientInitializedBy(this), fr.IsAllFrames(), *m_tempMatrixBackward); else m_convEng->BackwardKernel(sliceInput1Value, sliceOutputGrad, grad, !Input(inputIndex)->IsGradientInitializedBy(this), fr.IsAllFrames(), *m_tempMatrixBackward); } else if (inputIndex == 1) // derivative with respect to the input feature { auto& input0 = InputRef(0).ValueAsMatrix(); auto sliceInput1Grad = InputRef(1).GradientFor(fr); if (!m_transpose) m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, !Input(inputIndex)->IsGradientInitializedBy(this), *m_tempMatrixBackward); else { // REVIEW alexeyk: Forward overwrites values in sliceInput1Grad. Should handle correctly instead. m_convEng->Forward(sliceOutputGrad, input0, sliceInput1Grad, *m_tempMatrixBackward); } } } void Validate(bool isFinalValidationPass) override { Base::Validate(isFinalValidationPass); InferMBLayoutFromInputsForStandardCase(isFinalValidationPass); size_t inputIdx = GetExpectedNumInputs() - 1; TensorShape inputShape; TensorShape outputShape; // If 2D convolution syntax is used then some of the tensor dimensions need to be inferred. 
if (m_convolution2D) // NOTE: when m_convolution2D is true, it's a legacy branch. Code should not enter here any more. { // Need to update some tensors with correct input dims. auto inDims = ImageDimensions(GetInputSampleLayout(inputIdx), m_imageLayout); // inputShape is used in ConvolveGeometry which supports only CHW layout. inputShape = inDims.AsTensorShape(ImageLayoutKind::CHW); InferConvolution2DReductionDims(inputShape, inDims.m_numChannels); size_t kW = m_kernelShape[0]; size_t kH = m_kernelShape[1]; size_t mapCount = m_mapCount.GetNumElements(); size_t weightCols = kW * kH * inDims.m_numChannels; // if mapCount is 0 then take it from the input matrix if (mapCount == 0) Input(0)->GetAsMatrixNumRows(); // check/infer input [0] (weights) // BUGBUG: For now, we treat the weights as a 2D matrix. They should be a tensor proper. Input(0)->ValidateInferInputDimsFrom(TensorShape(mapCount, weightCols)); if (isFinalValidationPass && (Input(0)->GetAsMatrixNumCols() != weightCols || Input(0)->GetAsMatrixNumRows() != mapCount)) { LogicError("Convolution weight matrix %ls should have dimension [%d, %d] which is [outputChannels, kernelWidth * kernelHeight * inputChannels]", Input(0)->NodeName().c_str(), (int)mapCount, (int)weightCols); } outputShape = this->ComputeOutputShape(inputShape, TensorShape(1), /*ceilOutDim*/false, isFinalValidationPass); // ConvolveGeometry always uses CHW. 
SetDims(ImageDimensions(outputShape, ImageLayoutKind::CHW).AsTensorShape(m_imageLayout), HasMBLayout()); } else { inputShape = GetInputSampleLayout(inputIdx); // infer reduction dimensions if not given InferReductionDims(inputShape, inputShape); if (!m_transpose) { outputShape = this->ComputeOutputShape(inputShape, m_dilation, /*ceilOutDim*/false, isFinalValidationPass); if (m_outputShape.GetRank() > 0 && m_outputShape != TensorShape(0)) // user have explicitly set m_outputShape, we check if it's the same as outputShape { if (m_outputShape != outputShape) { InvalidArgument("%ls %ls the shape of the specified convolution output %ls is different from " "the result of convoluting the input argument using the provided options %ls. It is recommended " "that the output shape is not specified for convolution.", NodeName().c_str(), OperationName().c_str(), static_cast(m_outputShape).c_str(), static_cast(outputShape).c_str()); } } } else { if (m_outputShape.GetRank() <= 0 || m_outputShape == TensorShape(0)) { // In case of convolution transpose (deconvolution), node input (inputShape) is really the output of the convolution // and node output (outDims) is convolution input. ConvolveGeometry does not care about deconvolutions (it does not have to). 
const size_t DEAFULT_NUM_GROUPS = 1; outputShape = ConvolveGeometry::ComputeInputShape(inputShape, m_kernelShape, m_mapCount, m_stride, m_sharing, m_autoPad, m_lowerPad, m_upperPad, TensorShape(1), DEAFULT_NUM_GROUPS, false, Base::NeedsDynamicValidation(), isFinalValidationPass); } else { // in case the user specifies the output shape, we make sure the input shape can be the result of // convolution from the specified output shape auto inferredShape = this->ComputeOutputShape(m_outputShape, TensorShape(1), false, isFinalValidationPass); if (inputShape != inferredShape) InvalidArgument("%ls %ls the shape of the convolution transpose operand %ls is different from " "the result of convoluting the specified output argument using " "the provided options %ls", NodeName().c_str(), OperationName().c_str(), static_cast(inputShape).c_str(), static_cast(inferredShape).c_str()); outputShape = m_outputShape; } } if (m_imageLayout == ImageLayoutKind::CHW) SetDims(outputShape, HasMBLayout()); else // legacy format SetDims(ImageDimensions(outputShape, ImageLayoutKind::CHW).AsTensorShape(m_imageLayout), HasMBLayout()); } // update LearnableParameter if it has 0 dimensions (to be inferred) // Typically this would be the #inputChannels (C). if (Input(0)->GetSampleLayout().GetNumElements() == 0) { // BUGBUG: Inference does not support sharing. Problem is that we have the information too late. // In this case, users will have to specify the correct dimensions. Good luck. #if 1 // old style for back compat with previous results. Randomization will differ. 
if (Input(0)->GetSampleLayout().GetRank() == 2) Input(0)->ValidateInferInputDimsFrom(TensorShape(m_mapCount.GetNumElements(), m_kernelShape.GetNumElements())); else #endif { auto weightShape = m_kernelShape.GetDims(); for (auto outDim : m_mapCount.GetDims()) weightShape.push_back(outDim); Input(0)->ValidateInferInputDimsFrom(TensorShape(weightShape)); } } if (isFinalValidationPass) { bool recomputeConvGeometry = (m_convEng == nullptr) ? false : // For first minibatch, this flag must be false, so initial mem allocation can happen. (outputShape != m_convEng->Geometry()->OutputShape()) || (inputShape != m_convEng->Geometry()->InputShape()); if ((m_convEng == nullptr) || ((m_convEng != nullptr) && recomputeConvGeometry)) { auto geometry = std::make_shared(!m_transpose ? inputShape : outputShape, m_kernelShape, m_mapCount, m_stride, m_sharing, m_autoPad, m_lowerPad, m_upperPad, m_dilation, false, m_groups); m_convEng = ConvolutionEngine::Create(geometry, m_deviceId, m_imageLayout, m_maxTempMemSizeInSamples, m_poolKind, ConvolutionEngineKind::All, NodeName(), Globals::ShouldForceDeterministicAlgorithms(), false, recomputeConvGeometry); } if (Input(0)->GetSampleLayout().GetNumElements() != m_kernelShape.GetNumElements() * m_convEng->Geometry()->KernelCount()) { LogicError("Convolution weight matrix %ls should have dimension [(filter shape) x (input channels) x (output channels)]", Input(0)->NodeName().c_str()); } } } void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override { Base::RequestMatricesBeforeForwardProp(matrixPool); RequestMatrixFromPool(m_tempMatrixForward, matrixPool, 0, false, true); } // m_tempMatrixForward is only used as workspace for convolution, we can release it immediately afterwards void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool) override { Base::ReleaseMatricesAfterForwardProp(matrixPool); ReleaseMatrixToPool(m_tempMatrixForward, matrixPool); } void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override { 
Base::RequestMatricesBeforeBackprop(matrixPool); RequestMatrixFromPool(m_tempMatrixBackward, matrixPool, 0, false, true); } void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override { Base::ReleaseMatricesAfterBackprop(matrixPool); ReleaseMatrixToPool(m_tempMatrixBackward, matrixPool); } void SetmMaxTempMemSizeInSamples(const size_t maxTempMemSizeInSamples) { m_maxTempMemSizeInSamples = maxTempMemSizeInSamples; if (m_convEng != nullptr) m_convEng->SetmMaxTempMemSizeInSamples(maxTempMemSizeInSamples); } bool IsConvolution2D() const { return m_convolution2D; } bool OutputUsedInComputingInputNodesGradients() const override { return false; } private: using TransformerNode::m_transforms; using ConvolutionNodeBase::ComputeFilterTransform; virtual void /*TransformerNode::*/ComputeTransforms() override { if (m_transforms[1].m_axisTransforms.empty()) { m_transforms[1] = ComputeFilterTransform(); if (!m_transpose) { // Convolution, need to inverse transform. m_transforms[1] = m_transforms[1].Inverse(); } // else: Deconvolution, nothing to do. } // else: transform already computed, no need to do computation again. } virtual bool /*TransformerNode::*/SupportsTransformOnInput(size_t inputIndex) override { // We support transforms just on convolution input. return (inputIndex == 1); } virtual TensorShape /*ConvolutionNode::*/ComputeOutputShape(const TensorShape& inputShape, const TensorShape& dilate, bool ceilOutDim, bool isFinalValidationPass) { return ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride, m_sharing, m_autoPad, m_lowerPad, m_upperPad, dilate, m_groups, ceilOutDim, Base::NeedsDynamicValidation(), isFinalValidationPass); } TensorShape m_dilation; size_t m_groups; protected: // Flag that indicates whether the node is created using 2D-syntax. bool m_convolution2D; }; // ----------------------------------------------------------------------- // ROIPoolingNode (inputFeatures, inputROIs)--pooling for object detection. 
// // Each input image has a fixed number of regions of interest (ROIs), // specified as bounding boxes (x, y, w, h) that are relative to the // image size [W x H]. This node is meant as a replacement for the // final pooling layer of an image classification network. The first // fully-connected layer expects a fixed size input, but for object // detection we want each ROI to look like an image to the network so // we can get a label for it. The ROIs have different spatial sizes, // so this node does Max Pooling, but with an adaptive pooling window, // so that each ROI output has the spatial size expected by the first // fully-connected layer. Images are Input(0). ROIs are Input(1). // // Input0: Images [W x H x C x N] // Input1: ROIs [4 x roisPerImage x N], // output: Pooled ROIs [PW x PH x C x roisPerImage x N] // where PW = Pooled Width, PH = Pooled Height, C = Channels, N = Batch Size // // See http://arxiv.org/abs/1504.08083 // ----------------------------------------------------------------------- template class ROIPoolingNode : public ComputationNode, public NumInputs<2> { typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; static const std::wstring TypeName() { return L"ROIPooling"; } public: ROIPoolingNode(DEVICEID_TYPE deviceId, const wstring& name, PoolKind poolKind = PoolKind::Max, const TensorShape& roiOutputShape = TensorShape(), double spatialScale = 1.0/16.0) : Base(deviceId, name), m_poolKind(poolKind), m_roiOutputShape(roiOutputShape), m_spatialScale(spatialScale), m_argmaxData(Matrix::Zeros(1, 1, deviceId)) { } ROIPoolingNode(const ScriptableObjects::IConfigRecordPtr configp) : ROIPoolingNode(configp->Get(L"deviceId"), L"", PoolKindFrom(configp->Get(L"pool")), configp->Get(L"roiOutputShape"), configp->Get(L"featureScale")) { AttachInputsFromConfig(configp, GetExpectedNumInputs()); } void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override { Base::RequestMatricesBeforeForwardProp(matrixPool); size_t matrixSize = 
m_sampleLayout.GetNumElements(); RequestMatrixFromPool(m_tempMatrix, matrixPool, matrixSize, true); } // m_tempMatrix cannot be released after Forward Prop because its content (argmax) is used for back prop. void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override { Base::ReleaseMatricesAfterBackprop(matrixPool); ReleaseMatrixToPool(m_tempMatrix, matrixPool); } // Input0: Images [W x H x C x N] // Input1: ROIs [4 x roisPerImage x N], // output: Pooled ROIs [PW x PH x C x roisPerImage x N] // where PW = Pooled Width, PH = Pooled Height, C = Channels, N = Batch Size // // Explanation: this node has a target output shape of // [Pooled Width x Pooled Height x Channels], as does any pooling // layer. However, we want each /ROI/ to have that output size, // not each image. After this node, operations in the network // should be on ROIs, not on the full images. The forward pass // loops over images and the ROIs associated with each image; for // every ROI, it treats the subset of the image specified by that // ROI as a full image and does max pooling over that subset, // using whatever window size will correspond to an output of // [Pooled Width x Pooled Height x Channels]. Hence, // the output tensor is [PW x PH x C x roisPerImage x N] // An example validation output looks like this: // Validating --> z.roiOut = ROIPooling (z.conv5Out.conv5.y, rois) : [61 x 61 x 256 x *], [4 x 64 x *] -> [6 x 6 x 256 x 64 x *] void ForwardProp(const FrameRange& fr) override { // [4 x roisPerImage x N] -- first dimension is roiSize (4), second is rois-per-image, third is mb size size_t roisPerImage = (size_t)GetInputSampleLayout(1)[1]; auto inputShape = GetInputSampleLayout(0); Matrix inputSlice = Input(0)->ValueFor(fr); Matrix ROIs = Input(1)->ValueFor(fr); // our output slice for this minibatch. Matrix outputSlice = ValueFor(fr); // input slice is [W x H x C x N]; cols are images. // ROIs is [4 x roisPerImage x N]; cols are ROIs for different images. 
// each ROI is (x, y, w, h) relative to original image size. size_t inputW = (size_t)inputShape[0]; size_t inputH = (size_t)inputShape[1]; size_t numChannels = (size_t)inputShape[2]; size_t outW = m_roiOutputShape[0]; size_t outH = m_roiOutputShape[1]; m_tempMatrix->Resize(outW * outH * numChannels * roisPerImage, inputSlice.GetNumCols()); if (m_poolKind == PoolKind::Max) inputSlice.MaxROIPoolingForward(roisPerImage, inputSlice.GetNumCols(), numChannels, inputW, inputH, outW, outH, ROIs, outputSlice, *m_tempMatrix, m_spatialScale); else LogicError("Average ROI pooling is not supported."); } // similar to usual MaxPooling backpropagation. Send gradients // back through to the locations that were used as the "max." Only // difference: needs to sum gradients over all the ROIs that may // have used that location. One image location could be in // multiple ROIs--in that case each ROI may contribute a gradient term. void BackpropTo(const size_t inputIndex, const FrameRange& fr) override { if (inputIndex != 0) return; auto inputShape = GetInputSampleLayout(0); Matrix inputSlice = Input(0)->ValueFor(fr); int inputW = inputShape[0]; int inputH = inputShape[1]; int numChannels = inputShape[2]; auto inputGrad = Input(0)->GradientFor(fr); auto pooledGrad = GradientFor(fr); int roisPerImage = GetInputSampleLayout(1)[1]; auto roiData = Input(1)->ValueFor(fr); if (m_poolKind == PoolKind::Max) pooledGrad.MaxROIPoolingBackward(roisPerImage, inputSlice.GetNumCols(), numChannels, inputW, inputH, m_roiOutputShape[0], m_roiOutputShape[1], roiData, inputGrad, *m_tempMatrix, m_spatialScale); else LogicError("Average ROI pooling is not supported."); } void Save(File& fstream) const override { Base::Save(fstream); m_roiOutputShape.Save(fstream); fstream << (int32_t)m_poolKind; fstream << m_spatialScale; } void Load(File& fstream, size_t modelVersion) override { Base::Load(fstream, modelVersion); m_roiOutputShape.Load(fstream); if (modelVersion < CNTK_MODEL_VERSION_26) { // There are 2 
problems here: // 1. m_spatialScale value depends on your location in the network, for current R-CNN and its family it is 1/16. // 2. roiData format also has changed from ratio to absolute values and those are given as input. m_poolKind = PoolKind::Max; m_spatialScale = 1.0/16.0; } else { int32_t k; fstream >> k; m_poolKind = (PoolKind)k; fstream >> m_spatialScale; } } void Validate(bool isFinalValidationPass) override { Base::Validate(isFinalValidationPass); InferMBLayoutFromInputsForStandardCase(isFinalValidationPass); auto inShape = GetInputSampleLayout(0); // layout of input shape is width x height x numChannels auto roiShape = GetInputSampleLayout(1); // layout of ROI shape is 4 x roisPerImage if (isFinalValidationPass && (m_roiOutputShape.size() != 2)) InvalidArgument("ROIPoolingNode: roi output shape must have two dimensions ([W x H])."); if (isFinalValidationPass && (inShape[0] < m_roiOutputShape[0] || inShape[1] < m_roiOutputShape[1])) InvalidArgument("ROIPoolingNode: inputWidth must >= windowWidth and inputHeight must >= windowHeight."); if (isFinalValidationPass && (inShape[2] < 1)) InvalidArgument("ROIPoolingNode: input must have at least one channel ([W x H x C])."); if (isFinalValidationPass && (roiShape[0] != 4)) InvalidArgument("ROIPoolingNode: ROI input must have the following shape: [4 x roisPerImage]."); if (isFinalValidationPass && (roiShape[1] < 1)) InvalidArgument("ROIPoolingNode: ROI input must contain at least one ROI ([4 x roisPerImage])."); // set output dimensions to [W x H x C x roisPerImage] SetDims(TensorShape(m_roiOutputShape[0], m_roiOutputShape[1], inShape[2], roiShape[1]), HasMBLayout()); } void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override { Base::CopyTo(nodeP, newName, flags); if (flags & CopyNodeFlags::copyNodeValue) { auto node = dynamic_pointer_cast>(nodeP); node->m_poolKind = m_poolKind; node->m_roiOutputShape = m_roiOutputShape; node->m_spatialScale = 
m_spatialScale; } } PoolKind PoolingKind() const { return m_poolKind; } TensorShape ROIOutputShape() const { return m_roiOutputShape; } double SpatialScale() const { return m_spatialScale; } protected: PoolKind m_poolKind; TensorShape m_roiOutputShape; double m_spatialScale; shared_ptr> m_tempMatrix; Matrix m_argmaxData; }; // ----------------------------------------------------------------------- // PoolingNode (inputFeature) // Performs max or average ND pooling. // ----------------------------------------------------------------------- template class PoolingNode : public ConvolutionNodeBase, public NumInputs<1>, public TransformerNode { typedef ConvolutionNodeBase Base; UsingConvolutionNodeBaseMembers; static const std::wstring TypeName() { return L"Pooling"; } public: PoolingNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name) { } PoolingNode(DEVICEID_TYPE deviceId, const wstring& name, PoolKind pool, const TensorShape& kernelShape, const TensorShape& strideShape, const std::vector& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad, bool ceilOutDim, const bool poolIncludePad, ImageLayoutKind imageLayout) : Base(deviceId, name, kernelShape, TensorShape(1), strideShape, vector{true}, autoPadding, lowerPad, upperPad, pool, poolIncludePad, false, TensorShape(0), ceilOutDim, imageLayout, 0) { } PoolingNode(const ScriptableObjects::IConfigRecordPtr configp) : PoolingNode(configp->Get(L"deviceId"), L"", PoolKindFrom(configp->Get(L"pool")), configp->Get(L"kernelShape"), configp->Get(L"strideShape"), configp->Get(L"dimPadding"), configp->Get(L"dimPadLower"), configp->Get(L"dimPadUpper"), configp->Get(L"ceilOut"), configp->Get(L"poolIncludePad"), ImageLayoutKindFrom(configp->Get(L"imageLayout"))) { AttachInputsFromConfig(configp, GetExpectedNumInputs()); } public: void ForwardProp(const FrameRange& fr) override { Matrix sliceOutputValue = ValueFor(fr); const Matrix& input0 = InputRef(0).ValueFor(fr); 
m_convEng->ForwardPooling(input0, sliceOutputValue); } void BackpropTo(const size_t inputIndex, const FrameRange& fr) override { auto sliceOutputGrad = GradientFor(fr); Matrix sliceInput0Grad = InputRef(0).GradientFor(fr); Matrix sliceInput0Value = InputRef(0).ValueFor(fr); Matrix sliceOutputValue = ValueFor(fr); m_convEng->BackwardPooling(sliceOutputValue, sliceOutputGrad, sliceInput0Value, sliceInput0Grad, !InputRef(0).IsGradientInitializedBy(this)); } bool OutputUsedInComputingInputNodesGradients() const override { // The PoolingNode requires output values only for max pooling. return m_poolKind == PoolKind::Max; } virtual ParentGradientOptimization ImplementsGradientOptimization(const ComputationNodeBase*) const override { return ParentGradientOptimization::Overwrite; } public: void Validate(bool isFinalValidationPass) override { Base::Validate(isFinalValidationPass); InferMBLayoutFromInputsForStandardCase(isFinalValidationPass); if (m_imageLayout != ImageLayoutKind::CHW) { InvalidArgument( "%ls %ls supports only cuDNN (CHW) data layout. " "Please specify imageLayout=\"cudnn\" in %ls node in your script " "and make sure input data layout is CHW", NodeName().c_str(), OperationName().c_str(), NodeName().c_str()); } const auto& inputShape = GetInputSampleLayout(0); // infer reduction dimensions if not given InferReductionDims(inputShape, TensorShape()); auto outDims = this->ComputeOutputShape(inputShape, TensorShape(1), m_ceilOutDim, isFinalValidationPass); SetDims(outDims, HasMBLayout()); if (isFinalValidationPass) { bool recomputeConvGeometry = (m_convEng == nullptr) ? false : // For first minibatch, this flag must be false, so initial mem allocation can happen. 
(outDims != m_convEng->Geometry()->OutputShape()) || (inputShape != m_convEng->Geometry()->InputShape()); if ((m_convEng == nullptr) || ((m_convEng != nullptr) && recomputeConvGeometry)) { auto geometry = std::make_shared(inputShape, m_kernelShape, m_mapCount, m_stride, m_sharing, m_autoPad, m_lowerPad, m_upperPad, TensorShape(1), m_ceilOutDim); m_convEng = ConvolutionEngine::Create(geometry, m_deviceId, m_imageLayout, m_maxTempMemSizeInSamples, m_poolKind, ConvolutionEngineKind::All, NodeName(), Globals::ShouldForceDeterministicAlgorithms(), m_poolIncludePad, recomputeConvGeometry); } } } private: using TransformerNode::m_transforms; using ConvolutionNodeBase::ComputeFilterTransform; virtual void /*TransformerNode::*/ComputeTransforms() override { if (m_transforms[0].m_axisTransforms.empty()) { m_transforms[0] = ComputeFilterTransform(); m_transforms[0] = m_transforms[0].Inverse(); } // else: transform already computed, no need to do it again. } virtual bool /*TransformerNode::*/SupportsTransformOnInput(size_t /*inputIndex*/) override { // We support transforms on all inputs (one here). return true; } }; // ----------------------------------------------------------------------- // MaxUnpoolingNode (unpoolInputValues, poolInputValues) // Performs "max unpooling" operation. Max unpooling mirrors the operation // performed by max pooling node and depends on the values provided to // the max pooling node (so unlike deconvolution operation, it is not // completely independent). Unpooling takes 2 inputs: features to be unpooled, // which tensor has the same shape as corresponding max pooling node output // and inputs for the original pooling node. Unpooling node // produces an output which has the same dimensions as input to the // corresponding max pooling node (i.e. poolInputValues). // TODO: need to add support for other pooling types, for example, // average unpooling. 
Note that in this case, generic unpooling operation // will take different number of inputs depending on pooling type. // ----------------------------------------------------------------------- template class MaxUnpoolingNode : public ConvolutionNodeBase, public NumInputs<2>, public TransformerNode { typedef ConvolutionNodeBase Base; UsingConvolutionNodeBaseMembers; static const std::wstring TypeName() { return L"MaxUnpooling"; } public: MaxUnpoolingNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name) { } MaxUnpoolingNode(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& kernelShape, const TensorShape& strideShape, const std::vector& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad, ImageLayoutKind imageLayout) : Base(deviceId, name, kernelShape, TensorShape(1), strideShape, vector{true}, autoPadding, lowerPad, upperPad, PoolKind::Max, false, true, TensorShape(0), false, imageLayout, 0) { } MaxUnpoolingNode(const ScriptableObjects::IConfigRecordPtr configp) : MaxUnpoolingNode(configp->Get(L"deviceId"), L"", configp->Get(L"kernelShape"), configp->Get(L"strideShape"), configp->Get(L"dimPadding"), configp->Get(L"dimPadLower"), configp->Get(L"dimPadUpper"), ImageLayoutKindFrom(configp->Get(L"imageLayout"))) { AttachInputsFromConfig(configp, GetExpectedNumInputs()); } public: void ForwardProp(const FrameRange& fr) override { const Matrix& unpoolInput = InputRef(0).ValueFor(fr); const Matrix& poolInput = InputRef(1).ValueFor(fr); Matrix sliceOutputValue = ValueFor(fr); m_convEng->MaxUnpooling(unpoolInput, poolInput, sliceOutputValue); } void BackpropTo(const size_t inputIndex, const FrameRange& fr) override { if (inputIndex != 0) return; auto sliceOutputGrad = GradientFor(fr); Matrix sliceInput0Grad = InputRef(0).GradientFor(fr); // BUGBUG: ForwardPooling overwrites values in sliceInput1Grad. Should handle correctly instead. 
m_convEng->ForwardPooling(sliceOutputGrad, sliceInput0Grad); } bool OutputUsedInComputingInputNodesGradients() const override { return false; } void Validate(bool isFinalValidationPass) override { Base::Validate(isFinalValidationPass); InferMBLayoutFromInputsForStandardCase(isFinalValidationPass); if (m_imageLayout != ImageLayoutKind::CHW) { InvalidArgument( "%ls %ls supports only cuDNN (CHW) data layout. " "Please specify imageLayout=\"cudnn\" in %ls node in your script " "and make sure input data layout is CHW", NodeName().c_str(), OperationName().c_str(), NodeName().c_str()); } auto inputShape = GetInputSampleLayout(0); // infer reduction dimensions if not given InferReductionDims(inputShape, TensorShape()); // Same as in case of deconvolution, node input (inputShape) is really the output of the max pooling // and node output (outDims) is pooling input. auto outputShape = GetInputSampleLayout(1); auto inferredShape = this->ComputeOutputShape(outputShape, TensorShape(1), false, isFinalValidationPass); if (inputShape != inferredShape) InvalidArgument("%ls %ls the shape of the unpooling operand %ls is different from " "the result of pooling the poolingInput argument using" "the provided options %ls", NodeName().c_str(), OperationName().c_str(), static_cast(inputShape).c_str(), static_cast(inferredShape).c_str()); SetDims(outputShape, HasMBLayout()); if (isFinalValidationPass) { bool recomputeConvGeometry = (m_convEng == nullptr) ? false : // For first minibatch, this flag must be false, so initial mem allocation can happen. (outputShape != m_convEng->Geometry()->OutputShape()) || (inputShape != m_convEng->Geometry()->InputShape()); if ((m_convEng == nullptr) || ((m_convEng != nullptr) && recomputeConvGeometry)) { auto geometry = std::make_shared(outputShape, m_kernelShape, m_mapCount, m_stride, m_sharing, m_autoPad, m_lowerPad, m_upperPad); // Create reference engine as it's the only engine that implements unpooling. 
m_convEng = ConvolutionEngine::Create(geometry, m_deviceId, m_imageLayout, m_maxTempMemSizeInSamples, m_poolKind, ConvolutionEngineKind::Reference, NodeName(), false, false, recomputeConvGeometry); } } } private: using TransformerNode::m_transforms; using ConvolutionNodeBase::ComputeFilterTransform; virtual void /*TransformerNode::*/ComputeTransforms() override { if (m_transforms.empty()) { m_transforms[0] = ComputeFilterTransform(); } // else: transform already computed, no need to do it again. } virtual bool /*TransformerNode::*/SupportsTransformOnInput(size_t inputIndex) override { // We support transform for just unpool input. return (inputIndex == 0); } }; // ----------------------------------------------------------------------- // Legacy PoolingNodeBase (input) // ----------------------------------------------------------------------- template class PoolingNodeBase : public ComputationNode, public NumInputs<1> { typedef ComputationNode Base; UsingComputationNodeMembers; public: PoolingNodeBase(DEVICEID_TYPE deviceId, const wstring& name, PoolKind poolKind) : Base(deviceId, name), m_windowWidth(SIZE_MAX), m_windowHeight(SIZE_MAX), m_horizontalSubsample(SIZE_MAX), m_verticalSubsample(SIZE_MAX), m_imageLayoutKind(ImageLayoutKind::HWC), m_poolKind(poolKind) { } PoolingNodeBase(DEVICEID_TYPE deviceId, const wstring& name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind, PoolKind poolKind) : Base(deviceId, name), m_windowWidth(windowWidth), m_windowHeight(windowHeight), m_horizontalSubsample(horizontalSubsample), m_verticalSubsample(verticalSubsample), m_imageLayoutKind(imageLayoutKind), m_poolKind(poolKind) { ConvertToTensorShape(); } PoolingNodeBase(const ScriptableObjects::IConfigRecordPtr configp, PoolKind poolKind) : PoolingNodeBase(configp->Get(L"deviceId"), L"", configp->Get(L"windowWidth"), configp->Get(L"windowHeight"), 
configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"), ImageLayoutKindFrom(configp->Get(L"imageLayout")), poolKind) { // input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample AttachInputsFromConfig(configp, this->GetExpectedNumInputs()); } void Save(File& fstream) const override { Base::Save(fstream); uint32_t imageLayoutKind = (uint32_t)m_imageLayoutKind; uint32_t windowWidth = (uint32_t)m_windowWidth; fstream << windowWidth << imageLayoutKind << m_windowHeight << m_horizontalSubsample << m_verticalSubsample; } void Load(File& fstream, size_t modelVersion) override { Base::Load(fstream, modelVersion); uint32_t imageLayoutKind, windowWidth; fstream >> windowWidth >> imageLayoutKind >> m_windowHeight >> m_horizontalSubsample >> m_verticalSubsample; m_windowWidth = windowWidth; m_imageLayoutKind = (ImageLayoutKind)imageLayoutKind; ConvertToTensorShape(); } void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override { Base::CopyTo(nodeP, newName, flags); if (flags & CopyNodeFlags::copyNodeValue) { auto node = dynamic_pointer_cast>(nodeP); node->m_windowWidth = m_windowWidth; node->m_windowHeight = m_windowHeight; node->m_horizontalSubsample = m_horizontalSubsample; node->m_verticalSubsample = m_verticalSubsample; node->m_inputSizePerSample = m_inputSizePerSample; node->m_outputSizePerSample = m_outputSizePerSample; node->m_imageLayoutKind = m_imageLayoutKind; node->ConvertToTensorShape(); } } void ForwardProp(const FrameRange& fr) override { Matrix sliceInput0Value = InputRef(0).ValueFor(fr); Matrix sliceOutputValue = ValueFor(fr); m_convEng->ForwardPooling(sliceInput0Value, sliceOutputValue); } void BackpropTo(const size_t /*inputIndex*/, const FrameRange& fr) override { Matrix sliceInput0Grad = InputRef(0).GradientFor(fr); Matrix sliceOutputGrad = GradientFor(fr); Matrix sliceInput0Value = InputRef(0).ValueFor(fr); Matrix sliceOutputValue = ValueFor(fr); 
m_convEng->BackwardPooling(sliceOutputValue, sliceOutputGrad, sliceInput0Value, sliceInput0Grad, !InputRef(0).IsGradientInitializedBy(this)); } virtual ParentGradientOptimization ImplementsGradientOptimization(const ComputationNodeBase*) const override { return ParentGradientOptimization::Overwrite; } void Validate(bool isFinalValidationPass) override { Base::Validate(isFinalValidationPass); InferMBLayoutFromInputsForStandardCase(isFinalValidationPass); // get input tensor shape and interpret as image dimensions auto inDims = ImageDimensions(GetInputSampleLayout(0), m_imageLayoutKind); if (isFinalValidationPass && (inDims.m_width < m_windowWidth || inDims.m_height < m_windowHeight)) InvalidArgument("PoolingNodeBase: inputWidth must >= windowWidth and inputHeight must >= windowHeight."); // determine output tensor shape auto outDims = ImageDimensions( (inDims.m_width - m_windowWidth) / m_horizontalSubsample + 1, (inDims.m_height - m_windowHeight) / m_verticalSubsample + 1, inDims.m_numChannels); m_inputSizePerSample = inDims.m_width * inDims.m_height * inDims.m_numChannels; SetDims(outDims.AsTensorShape(m_imageLayoutKind), HasMBLayout()); if (isFinalValidationPass) { // set up various engines and descriptor objects m_geometry = std::make_shared(inDims.AsTensorShape(m_imageLayoutKind), ImageDimensions(m_windowWidth, m_windowHeight, 1).AsTensorShape(m_imageLayoutKind), TensorShape(1), ImageDimensions(m_horizontalSubsample, m_verticalSubsample, 1).AsTensorShape(m_imageLayoutKind), ConvolveGeometry::BoolVec{true}, ConvolveGeometry::BoolVec{false}, TensorShape(0), TensorShape(0)); } } void DumpNodeInfo(const bool printValues, const bool printMetadata, File& fstream) const override { Base::DumpNodeInfo(printValues, printMetadata, fstream); if (printMetadata) { auto inputSampleLayout = GetInputSampleLayout(0); char str[4096]; sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", (unsigned long)inputSampleLayout[1], (unsigned long)inputSampleLayout[2], (unsigned 
long)inputSampleLayout[0]); fstream << string(str); sprintf(str, "PoolingWindow[Width:%lu, Height:%lu] SubSampling[Horizontal:%lu, Vertical:%lu]\n", (unsigned long)m_windowWidth, (unsigned long)m_windowHeight, (unsigned long)m_horizontalSubsample, (unsigned long)m_verticalSubsample); fstream << string(str); sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", (unsigned long)m_sampleLayout[1], (unsigned long)m_sampleLayout[2], (unsigned long)m_sampleLayout[0]); fstream << string(str); sprintf(str, "TotalSizePerSample[Input:%lu, Output:%lu] \n", (unsigned long)m_inputSizePerSample, (unsigned long)m_outputSizePerSample); fstream << string(str); } } bool IsImageLayoutCHW() const { return m_imageLayoutKind == ImageLayoutKind::CHW; } TensorShape KernelShape() const { return m_kernelShape; } TensorShape Strides() const { return m_stride; } std::vector Sharing() const { return m_sharing; } std::vector AutoPad() const { return m_autoPad; } TensorShape LowerPad() const { return m_lowerPad; } TensorShape UpperPad() const { return m_upperPad; } PoolKind PoolingKind() const { return m_poolKind; } protected: void ConvertToTensorShape() { m_kernelShape = ImageDimensions(m_windowWidth, m_windowHeight, 1).AsTensorShape(m_imageLayoutKind); m_stride = ImageDimensions(m_horizontalSubsample, m_verticalSubsample, 1).AsTensorShape(m_imageLayoutKind); m_sharing = { true }; m_autoPad = { false }; m_lowerPad = TensorShape(0); m_upperPad = TensorShape(0); } protected: size_t m_windowWidth, m_windowHeight; size_t m_horizontalSubsample, m_verticalSubsample; size_t m_inputSizePerSample, m_outputSizePerSample; ImageLayoutKind m_imageLayoutKind; // how to interpret the tensor (which dimensions are X/Y and C) // Mapping to V2 PoolingNode description.. 
PoolKind m_poolKind; TensorShape m_kernelShape; TensorShape m_stride; std::vector m_sharing; std::vector m_autoPad; TensorShape m_lowerPad; TensorShape m_upperPad; ConvolveGeometryPtr m_geometry; std::unique_ptr> m_convEng; }; // add this at the start of each derived class, to get access to the members of ComputationNode // See #define of 'UsingComputationNodeMembersBoilerplate' for more explanation. #define UsingPoolingNodeBaseMembers \ UsingComputationNodeMembersBoilerplate; \ \ protected: \ using Base::m_geometry; \ using Base::m_convEng; \ using Base::m_windowWidth; \ using Base::m_windowHeight; \ using Base::m_horizontalSubsample; \ using Base::m_verticalSubsample; \ using Base::m_inputSizePerSample; \ using Base::m_outputSizePerSample; \ using Base::m_imageLayoutKind; \ \ public: // ----------------------------------------------------------------------- // Legacy MaxPoolingNode // ----------------------------------------------------------------------- template class MaxPoolingNode : public PoolingNodeBase { typedef PoolingNodeBase Base; UsingPoolingNodeBaseMembers; static const std::wstring TypeName() { return L"MaxPooling"; } public: MaxPoolingNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name, PoolKind::Max) { } MaxPoolingNode(DEVICEID_TYPE deviceId, const wstring& name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind) : Base(deviceId, name, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind, PoolKind::Max) { } MaxPoolingNode(const ScriptableObjects::IConfigRecordPtr configp) : Base(configp, PoolKind::Max) { } void Validate(bool isFinalValidationPass) override { Base::Validate(isFinalValidationPass); if (isFinalValidationPass && m_convEng == nullptr) { m_convEng = ConvolutionEngine::Create(m_geometry, m_deviceId, m_imageLayoutKind, 0, PoolKind::Max, ConvolutionEngineKind::All, NodeName()); } } }; // 
----------------------------------------------------------------------- // Legacy AveragePoolingNode // ----------------------------------------------------------------------- template class AveragePoolingNode : public PoolingNodeBase { typedef PoolingNodeBase Base; UsingPoolingNodeBaseMembers; static const std::wstring TypeName() { return L"AveragePooling"; } public: AveragePoolingNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name, PoolKind::Average) { } AveragePoolingNode(DEVICEID_TYPE deviceId, const wstring& name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind) : Base(deviceId, name, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind, PoolKind::Average) { } AveragePoolingNode(const ScriptableObjects::IConfigRecordPtr configp) : Base(configp, PoolKind::Average) { } void Validate(bool isFinalValidationPass) override { Base::Validate(isFinalValidationPass); if (isFinalValidationPass && m_convEng == nullptr) { m_convEng = ConvolutionEngine::Create(m_geometry, m_deviceId, m_imageLayoutKind, 0, PoolKind::Average, ConvolutionEngineKind::All, NodeName()); } } }; } } }