//
// <copyright file="SimpleNetworkBuilder.h" company="Microsoft">
//     Copyright (c) Microsoft Corporation.  All rights reserved.
// </copyright>
//
#pragma once

#include "Basics.h"
#include "Matrix.h"
#include "BestGpu.h"

#include "ComputationNetwork.h"
#include "Config.h"

// TODO: giving up moving stuff for now, running out of time. The following #includes should not be necessary once the hard-working code in here gets moved to .cpp
#include "InputAndParamNodes.h"

#include <stdexcept>
#include <regex>
#include <string>

#pragma warning (disable: 4661) // 'identifier': no suitable definition provided for explicit template instantiation request

using namespace std;    // TODO: ugh!

/// This is for sparse input, which is useful when the input dimension is very large and sparse,
/// e.g., in language modeling.
/// TODO: enable this via a config argument instead of a compile-time define.
#define SPARSE_INPUT

namespace Microsoft { namespace MSR { namespace CNTK {

#define MAX_DEPTH 20

    enum RNNTYPE
    {
        SIMPLENET = 0, // no recurrent connections
        SIMPLERNN = 1,
        LSTM = 2,
        DEEPRNN = 4,
        CLASSLM = 8,
        LBLM = 16,
        LSTMENCODER = 18,
        NPLM = 32,
        CLASSLSTM = 64,
        NCELSTM = 128,
        CLSTM = 256,
        RCRF = 512,
        UNIDIRECTIONALLSTM = 19,
        BIDIRECTIONALLSTM = 20,
        ALIGNMENTSIMILARITYGENERATOR = 21,
        ALIGNMENTSIMILARITYGFORWARDDECODER = 22
        // note: most values follow a power-of-two (bit-flag) pattern, but 18-22 break it;
        // InitRecurrentConfig() below only ever assigns a single value, never a combination
    };

    enum class TrainingCriterion : int  // TODO: camel-case these
    {
        CrossEntropyWithSoftmax,
        CrossEntropy,
        SquareError,
        Logistic,
        ClassCrossEntropyWithSoftmax,
        NCECrossEntropyWithSoftmax,
        CRF,
        SequenceWithSoftmax
    };

    enum class EvalCriterion : int
    {
        CrossEntropyWithSoftmax,
        CrossEntropy,
        SquareError,
        Logistic,
        ErrorPrediction,
        ClassCrossEntropyWithSoftmax,
        NCECrossEntropyWithSoftmax,
        CRF,
        SequenceWithSoftmax
    };

    extern TrainingCriterion ParseTrainingCriterionString(wstring s);
    extern EvalCriterion ParseEvalCriterionString(wstring s);
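
    // A minimal sketch of what these parsers do (illustrative only -- the real implementations
    // live in the corresponding .cpp file, the exact set of accepted spellings may differ, and
    // EqualCI is assumed here to be a case-insensitive string compare):
    //
    //   TrainingCriterion ParseTrainingCriterionString(wstring s)
    //   {
    //       if      (EqualCI(s, L"crossEntropyWithSoftmax")) return TrainingCriterion::CrossEntropyWithSoftmax;
    //       else if (EqualCI(s, L"squareError"))             return TrainingCriterion::SquareError;
    //       // ... one case per TrainingCriterion enumerator ...
    //       else InvalidArgument("ParseTrainingCriterionString: unknown criterion '%ls'", s.c_str());
    //   }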

    template<class ElemType>
    class SimpleNetworkBuilder
    {
    protected:
        typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;

    private:
        SimpleNetworkBuilder() // prevent the default constructor from being called
        {
        }

    public:
        SimpleNetworkBuilder(const ConfigParameters& config) : m_net(nullptr)
        {
            Init(config);
        }
        SimpleNetworkBuilder(const ScriptableObjects::IConfigRecord &) { NOT_IMPLEMENTED; }

        // full parameter Init routine
        void Init(const intargvector& layerSizes, const TrainingCriterion trainCriterion, const EvalCriterion evalCriterion,
            int outputLayerSize = -1,
            const stringargvector nonLinearFunctions = L"Sigmoid",
            const bool addDropoutNodes = false,
            const bool uniformInit = true, const ElemType initValueScale = 1.0f,
            const bool applyMeanVarNorm = false, bool needPrior = false, DEVICEID_TYPE deviceId = AUTOPLACEMATRIX)
        {
            if (deviceId == AUTOPLACEMATRIX)
                deviceId = Matrix<ElemType>::GetBestGPUDeviceId();
            deviceId = EnforceOneGPUOnly(deviceId);      // see EnforceOneGPUOnly() for comment on what this is

            m_deviceId = deviceId;
            m_net = make_shared<ComputationNetwork>(m_deviceId);

            if (m_deviceId < 0)
                fprintf(stderr, "SimpleNetworkBuilder Using CPU\n");
            else
                fprintf(stderr, "SimpleNetworkBuilder Using GPU %d\n", m_deviceId);

            m_outputLayerSize = outputLayerSize;
            m_layerSizes = layerSizes;
            m_applyMeanVarNorm = applyMeanVarNorm;
            m_trainCriterion = trainCriterion;
            m_evalCriterion = evalCriterion;
            m_addDropoutNodes = addDropoutNodes;
            m_needPrior = needPrior;
            m_nonLinearFunctions = nonLinearFunctions;
            m_uniformInit = uniformInit;
            m_initValueScale = initValueScale;
            if (m_layerSizes.size() < 2)
                InvalidArgument("A network should have at least two layers (one input and one output)");
        }

        void InitAttentionNetworkConfig(const ConfigParameters& config)
        {
            m_auxFeatDim = config("auxfeatdim", "20");
        }

        virtual void InitRecurrentConfig(const ConfigParameters& config)
        {
            ConfigArray rLayerSizes = config("recurrentLayer", "");
            intargvector recurrentLayers = rLayerSizes;
            m_recurrentLayers = recurrentLayers;
            m_defaultHiddenActivity = config("defaultHiddenActivity", "0.1");
            ConfigArray str_rnnType = config("rnnType", L"SIMPLENET");

            m_maOrder = config("maOrder", "0");
            m_lookupTableOrder = config("lookupTableOrder", "0");

            ConfigArray sSizes = config("streamSizes", "");
            m_streamSizes = sSizes;
            sSizes = config("lookupTableOrderSizes", "");  /// allows multiple input streams with different
            /// lookup-table order sizes; the older lookupTableOrder is kept for backward compatibility
            m_lookupTabelOrderSizes = sSizes;

            m_labelEmbeddingSize = config("labelEmbeddingSize", "10");
            m_constForgetGateValue = config("constForgetGateValue", "false");
            m_constInputGateValue = config("constInputGateValue", "false");
            m_constOutputGateValue = config("constOutputGateValue", "false");

            m_forgetGateInitVal = config("forgetGateInitVal", "-1");
            m_inputGateInitVal = config("inputGateInitVal", "-1");
            m_outputGateInitVal = config("outputGateInitVal", "-1");

            m_sparse_input = config("sparseinput", "false");

            stringargvector strType = str_rnnType;
            // map the rnnType string(s) onto an RNNTYPE value; some types have legacy aliases
            auto has = [&strType](const wchar_t* name) { return std::find(strType.begin(), strType.end(), name) != strType.end(); };
            if      (has(L"SIMPLENET"))                            m_rnnType = SIMPLENET;
            else if (has(L"SIMPLERNN"))                            m_rnnType = SIMPLERNN;
            else if (has(L"LSTM"))                                 m_rnnType = LSTM;
            else if (has(L"DEEPRNN"))                              m_rnnType = DEEPRNN;
            else if (has(L"CLASSLM"))                              m_rnnType = CLASSLM;
            else if (has(L"LBLM"))                                 m_rnnType = LBLM;
            else if (has(L"NPLM"))                                 m_rnnType = NPLM;
            else if (has(L"CLASSLSTM"))                            m_rnnType = CLASSLSTM;
            else if (has(L"NCELSTM"))                              m_rnnType = NCELSTM;
            else if (has(L"CLSTM"))                                m_rnnType = CLSTM;
            else if (has(L"CRF"))                                  m_rnnType = RCRF;
            else if (has(L"LSTMENCODER"))                          m_rnnType = LSTMENCODER;
            else if (has(L"TRANSDUCER") ||
                     has(L"UNIDIRECTIONALLSTMWITHPASTPREDICTION")) m_rnnType = UNIDIRECTIONALLSTM;
            else if (has(L"JOINTCONDITIONALBILSTMSTREAMS") ||
                     has(L"BIDIRECTIONALLSTMWITHPASTPREDICTION"))  m_rnnType = BIDIRECTIONALLSTM;
            else if (has(L"ALIGNMENTSIMILARITYGENERATOR"))         m_rnnType = ALIGNMENTSIMILARITYGENERATOR;
            else if (has(L"ALIGNMENTSIMILARITYGFORWARDDECODER"))   m_rnnType = ALIGNMENTSIMILARITYGFORWARDDECODER;
            else
                InvalidArgument("InitRecurrentConfig: unknown value for rnnType parameter '%ls'", strType[0].c_str());
        }
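
        // Illustrative config fragment for InitRecurrentConfig() (the parameter names are exactly
        // the ones read above; the values are made-up examples, not defaults):
        //
        //   rnnType = "LSTM"
        //   recurrentLayer = 2:3           # presumably the indices of the layers made recurrent
        //   defaultHiddenActivity = 0.1
        //   lookupTableOrder = 2
        //   constForgetGateValue = false
        //   forgetGateInitVal = -1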

        // Init - Builder Initialize for multiple data sets
        // config - [in] configuration parameters for the network builder
        virtual void Init(const ConfigParameters& config)
        {
            DEVICEID_TYPE deviceId = DeviceFromConfig(config);

            ElemType initValueScale = config("initValueScale", "1.0");

            ConfigArray layerTypes = config("layerTypes", L"Sigmoid");
            stringargvector nonlinearFunctions = layerTypes;

            bool uniformInit = config("uniformInit", "true");
            bool applyMeanVarNorm = config("applyMeanVarNorm", "false");
            bool needPrior = config("needPrior", "false");

            bool addDropoutNodes = config("addDropoutNodes", "false");

            int outputLayerSize;
            ConfigArray layerSizes;
            intargvector layers;
            TrainingCriterion trainingCriterion;
            EvalCriterion evalCriterion;

            outputLayerSize = config("outputLayerSize", "-1");
            layerSizes = config("layerSizes", "100");
            layers = layerSizes;
            trainingCriterion = ParseTrainingCriterionString(config("trainingCriterion"));
            evalCriterion = ParseEvalCriterionString(config("evalCriterion"));

            ConfigArray rDirect = config("directConnect", "");
            m_directConnect = rDirect;

            m_word2class = config("word2cls", "");
            m_cls2index = config("cls2index", "");
            m_vocabSize = (int)config("vocabSize", "-1");
            m_nbrCls = (int)config("nbrClass", "-1");
            nce_noises = (int)config("noise_number", "-1"); // number of noise samples for NCE

            Init(layers, trainingCriterion, evalCriterion, outputLayerSize,
                nonlinearFunctions, addDropoutNodes,
                uniformInit, initValueScale, applyMeanVarNorm, needPrior, deviceId);

            InitRecurrentConfig(config);

            InitAttentionNetworkConfig(config);

        }
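
        // Illustrative config fragment for Init() (a sketch of typical usage -- section syntax and
        // values are examples only; note that trainingCriterion and evalCriterion have no defaults above):
        //
        //   SimpleNetworkBuilder = [
        //       layerSizes = 784:512:512:10
        //       layerTypes = "Sigmoid"
        //       trainingCriterion = "CrossEntropyWithSoftmax"
        //       evalCriterion = "ErrorPrediction"
        //       uniformInit = true
        //       initValueScale = 1.0
        //       applyMeanVarNorm = false
        //       addDropoutNodes = false
        //       needPrior = false
        //   ]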

        ComputationNetworkPtr BuildNetworkFromDescription();
        ComputationNetworkPtr BuildNetworkFromDescription(ComputationNetwork* encoderNet);      // legacy support of deprecated sequence-to-sequence implementation

        ComputationNetworkPtr BuildNetworkFromDbnFile(const std::wstring& dbnModelFileName);    // legacy support for fseide's Microsoft-internal tool "DBN.exe"

        RNNTYPE RnnType() { return m_rnnType; }

    protected:

        ComputationNetworkPtr BuildSimpleDNN();

        ComputationNetworkPtr BuildSimpleRNN(size_t mbSize = 1);

        ComputationNetworkPtr BuildClassEntropyNetwork(size_t mbSize = 1);

        ComputationNodePtr BuildLSTMComponent(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input);

        ComputationNodePtr BuildLSTMNodeComponent(ULONG &randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input);

        ComputationNodePtr BuildLSTMComponentWithMultiInputs(ULONG &randomSeed, size_t mbSize, size_t iLayer, const vector<size_t>& inputDim, size_t outputDim, const vector<ComputationNodePtr>& inputObs, bool inputWeightSparse = false);

        ComputationNodePtr BuildDirectConnect(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, ComputationNodePtr toNode);

        ComputationNetworkPtr BuildLogBilinearNetworkFromDescription(size_t mbSize = 1);

        ComputationNetworkPtr BuildNeuralProbNetworkFromDescription(size_t mbSize = 1);

        ComputationNetworkPtr BuildLSTMNetworkFromDescription(size_t mbSize = 1);

        ComputationNetworkPtr BuildSeqTrnLSTMNetworkFromDescription(size_t mbSize = 1);

        ComputationNetworkPtr BuildLSTMEncoderNetworkFromDescription(size_t mbSize = 1);

        ComputationNetworkPtr BuildUnidirectionalLSTMNetworksFromDescription(size_t mbSize = 1);

        ComputationNetworkPtr BuildBiDirectionalLSTMNetworksFromDescription(size_t mbSize = 1);

        ComputationNetworkPtr BuildCLASSLSTMNetworkFromDescription(size_t mbSize = 1);

        ComputationNetworkPtr BuildConditionalLSTMNetworkFromDescription(size_t mbSize = 1);

        ComputationNetworkPtr BuildNCELSTMNetworkFromDescription(size_t mbSize = 1);

        ComputationNetworkPtr BuildAlignmentForwardDecoderNetworkFromDescription(ComputationNetwork* encoderNet, size_t mbSize = 1);

        ComputationNetworkPtr BuildAlignmentDecoderNetworkFromDescription(ComputationNetwork* encoderNet, size_t mbSize = 1);

        // layer is 0-based
        ComputationNodePtr ApplyNonlinearFunction(ComputationNodePtr input, const size_t layer, const std::wstring nodeName = L"");
        ComputationNodePtr AddTrainAndEvalCriterionNodes(ComputationNodePtr input, ComputationNodePtr label, ComputationNodePtr matrix = nullptr, const std::wstring trainNodeName = L"", const std::wstring evalNodeName = L"", ComputationNodePtr clspostprob = nullptr, ComputationNodePtr trans = nullptr);

        // reads a 4-character tag from the stream and compares it against the expected tag
        static bool CheckDbnTag(File &fstream, const std::string expectedTag)
        {
            char tag[5];
            for (int i = 0; i < 4; i++)
                fstream >> tag[i];
            tag[4] = 0;
            return std::string(tag) == expectedTag;
        }
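
        // On-disk framing implied by CheckDbnTag() and ReadMatrixFromDbnFile() below:
        // each matrix is stored as
        //
        //   "BMAT" <name> <numRows> <numCols> <numRows*numCols values> "EMAT"
        //
        // with 4-character begin/end tags bracketing the payload.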

        Matrix<ElemType> ReadMatrixFromDbnFile(File &fstream, const std::string expectedName)
        {
            int numRows, numCols;
            std::string name;
            if (!CheckDbnTag(fstream, "BMAT"))
                RuntimeError("Error reading DBN file - did not find expected tag BMAT\n");
            fstream >> name >> numRows >> numCols;
            if (name != expectedName)
            {
                InvalidArgument("ERROR reading pretrained DBN file, expected name %s, found name %s\n", expectedName.c_str(), name.c_str());
            }

            if (numCols > 1) // DBN.exe stores W transposed (it computes x*W + b on row vectors), so swap the dimensions
            {
                int origRows = numRows;
                numRows = numCols;
                numCols = origRows;
            }

            Matrix<ElemType> mat(numRows, numCols, m_deviceId);

            // dbn operates on row vectors (x*W + b) rather than column vectors, so the matrix is read in as W'
            float tmp;
            for (long i = 0; i < numRows; i++)
                for (long j = 0; j < numCols; j++)
                {
                    fstream >> tmp;
                    mat(i, j) = tmp;
                }
            if (!CheckDbnTag(fstream, "EMAT"))
                RuntimeError("Error reading DBN file - did not find expected tag EMAT\n");

            return mat;
        }
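
        // Minimal usage sketch (the matrix names "W0"/"b0" are hypothetical -- the names actually
        // expected by BuildNetworkFromDbnFile() depend on how DBN.exe wrote the file):
        //
        //   File fstream(dbnModelFileName, fileOptionsBinary | fileOptionsRead);
        //   Matrix<ElemType> W0 = ReadMatrixFromDbnFile(fstream, "W0"); // layer-0 weights, read as W'
        //   Matrix<ElemType> b0 = ReadMatrixFromDbnFile(fstream, "b0"); // layer-0 bias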

    protected:

        ComputationNetworkPtr m_net;

        int m_outputLayerSize;
        intargvector m_layerSizes;
        bool m_applyMeanVarNorm;
        bool m_needPrior;

        DEVICEID_TYPE m_deviceId;
        bool m_uniformInit;

        ElemType m_initValueScale;
        bool m_addDropoutNodes;

        stringargvector m_nonLinearFunctions;

        TrainingCriterion m_trainCriterion;
        EvalCriterion m_evalCriterion;

        intargvector m_directConnect; /// connect these layers directly, in sequence order;
        /// for example, 1:2:3 connects layer 1 to layer 2 and layer 2 to layer 3

        /// recurrent network 
        intargvector m_recurrentLayers;
        float m_defaultHiddenActivity;
        RNNTYPE m_rnnType;
        int   m_maOrder; /// MA model order

        bool m_constForgetGateValue;
        bool m_constInputGateValue;
        bool m_constOutputGateValue;

        ElemType m_forgetGateInitVal;
        ElemType m_inputGateInitVal;
        ElemType m_outputGateInitVal;

        intargvector m_streamSizes;  /// sizes of the multiple input streams
        intargvector m_lookupTabelOrderSizes; /// each stream has its own projection, so the lookup-table
        /// order size must be provided for each stream

        int m_lookupTableOrder;
        int m_labelEmbeddingSize;

        /// file names for the word-to-class mapping and the class-to-word-index mapping;
        /// these are used for class-based language modeling
        string m_cls2index;
        string m_word2class;
        int m_nbrCls;    /// number of word classes
        int m_vocabSize; /// vocabulary size
        int nce_noises;  /// number of noise samples for NCE

        bool m_sparse_input;

        /// auxiliary feature dimension, for attention network development
        size_t m_auxFeatDim;
    };

}}}