//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms  --add this at the top of all CPP files that give "function or variable may be unsafe" warnings

#include "SimpleNetworkBuilder.h"
#include "ComputationNetworkBuilder.h"

#include "ComputationNode.h"
#include "InputAndParamNodes.h"
#include "LinearAlgebraNodes.h"
#include "NonlinearityNodes.h"
#include "ConvolutionalNodes.h"
#include "RecurrentNodes.h"
#include "PreComputeNodes.h"

#pragma warning(disable : 4189) // (we have lots of unused variables to show how variables can be set up)

namespace Microsoft { namespace MSR { namespace CNTK {

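// BuildNetworkFromDescription() is the single entry point of SimpleNetworkBuilder: it dispatches
// on m_standardNetworkKind to one of the Build*FromDescription() helpers below, then compiles the
// resulting network.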
template <class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNetworkFromDescription()
{
    ComputationNetworkPtr net;
    switch (m_standardNetworkKind)
    {
    case FFDNNKind:
        net = BuildFFDNNFromDescription();
        break;
    case RNNKind:
        net = BuildRNNFromDescription();
        break;
    case LSTMKind:
        net = BuildLSTMNetworkFromDescription();
        break;
    case ClassLSTMNetworkKind:
        net = BuildClassLSTMNetworkFromDescription();
        break;
    case NCELSTMNetworkKind:
        net = BuildNCELSTMNetworkFromDescription();
        break;
    case ClassEntropyRNNKind:
        net = BuildClassEntropyRNNFromDescription();
        break;
    case LogBilinearNetworkKind:
        net = BuildLogBilinearNetworkFromDescription();
        break;
    case DNNLMNetworkKind:
        net = BuildDNNLMNetworkFromDescription();
        break;
    case ConditionalLSTMNetworkKind:
        net = BuildConditionalLSTMNetworkFromDescription();
        break;
#ifdef COMING_SOON
    case CRFLSTMNetworkKind:
        net = BuildCRFLSTMNetworkFromDescription();
        break;
#endif
    default:
        LogicError("BuildNetworkFromDescription: invalid m_standardNetworkKind %d", (int) m_standardNetworkKind);
    }

    // post-process the network
    net->CompileNetwork();

    return net;
}

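// Builds a plain feed-forward DNN from m_layerSizes:
//   features -> [optional per-dim mean/var normalization] -> numHiddenLayers x (affine + nonlinearity [+ dropout])
//   -> affine output layer ("HLast") -> train/eval criteria, plus an optional prior-scaled log-likelihood
//   output and a softmax node ("PosteriorProb").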
template <class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildFFDNNFromDescription()
{

    ComputationNetworkBuilder<ElemType> builder(*m_net);
    if (m_net->GetTotalNumberOfNodes() < 1) // not built yet
    {
        unsigned long randomSeed = 1;

        size_t numHiddenLayers = m_layerSizes.size() - 2;
        ComputationNodePtr input, w, b, output, label, prior, scaledLogLikelihood;

        input = builder.CreateInputNode(L"features", m_layerSizes[0]);
        m_net->AddToNodeGroup(L"feature", input);

        if (m_applyMeanVarNorm)
        {
            w = builder.Mean(input, L"MeanOfFeatures");
            b = builder.InvStdDev(input, L"InvStdOfFeatures");
            output = builder.PerDimMeanVarNormalization(input, w, b, L"MVNormalizedFeatures");

            input = output;
        }

        if (numHiddenLayers > 0)
        {
            w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[0]);
            m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
            b = builder.CreateLearnableParameter(L"B0", m_layerSizes[1], 1);
            m_net->InitLearnableParameters(b, L"fixedValue", 0);
            output = ApplyNonlinearFunction(builder.Plus(builder.Times(w, input, 1, L"W0*features"), b, L"W0*features+B0"), 0, L"H1");

            if (m_addDropoutNodes)
                input = builder.Dropout(output, L"DropH1");
            else
                input = output;

            for (int i = 1; i < numHiddenLayers; i++)
            {
                wstring nameOfW = msra::strfun::wstrprintf(L"W%d", i);
                wstring nameOfB = msra::strfun::wstrprintf(L"B%d", i);
                wstring nameOfPrevH = msra::strfun::wstrprintf(L"H%d", i);
                wstring nameOfTimes = nameOfW + L"*" + nameOfPrevH;
                wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
                wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);

                w = builder.CreateLearnableParameter(nameOfW, m_layerSizes[i + 1], m_layerSizes[i]);
                m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
                b = builder.CreateLearnableParameter(nameOfB, m_layerSizes[i + 1], 1);
                m_net->InitLearnableParameters(b, L"fixedValue", 0);
                output = ApplyNonlinearFunction(builder.Plus(builder.Times(w, input, 1, nameOfTimes), b, nameOfPlus), i, nameOfH);

                if (m_addDropoutNodes)
                    input = builder.Dropout(output, L"Drop" + nameOfH);
                else
                    input = output;
            }
        }

        wstring nameOfW = msra::strfun::wstrprintf(L"W%d", numHiddenLayers);
        wstring nameOfB = msra::strfun::wstrprintf(L"B%d", numHiddenLayers);
        wstring nameOfPrevH = msra::strfun::wstrprintf(L"H%d", numHiddenLayers - 1);
        wstring nameOfTimes = nameOfW + L"*" + nameOfPrevH;
        wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;

        w = builder.CreateLearnableParameter(nameOfW, m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
        m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
        b = builder.CreateLearnableParameter(nameOfB, m_layerSizes[numHiddenLayers + 1], 1);
        m_net->InitLearnableParameters(b, L"fixedValue", 0);
        output = builder.Plus(builder.Times(w, input, 1, nameOfTimes), b, nameOfPlus);
        m_net->RenameNode(output, L"HLast");

        label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);

        AddTrainAndEvalCriterionNodes(output, label);

        if (m_needPrior)
        {
            prior = builder.Mean(label, L"Prior");
            input = builder.Log(prior, L"LogOfPrior");

            // following two lines are needed only if true probability is needed
            // output = builder.Softmax(output);
            // output = builder.Log(output);

            scaledLogLikelihood = builder.Minus(output, input, L"ScaledLogLikelihood");
            m_net->AddToNodeGroup(L"output", scaledLogLikelihood);
        }
        else
        {
            m_net->AddToNodeGroup(L"output", output);
        }

        // add softmax layer (if prob is needed or KL reg adaptation is needed)
        output = builder.Softmax(output, L"PosteriorProb");
        // m_net->AddToNodeGroup(L"output", output);
    }

    return m_net;
}

// Note: while ComputationNode and ComputationNetwork are (supposed to be) independent of ElemType, it is OK to keep this class dependent.
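// Builds a simple (Elman-style) recurrent network. For each layer index listed in m_recurrentLayers,
// the hidden activation is roughly h_t = nonlinearity(U * x_t + W * h_{t-1}); the recurrence is realized
// with a PastValue node whose input is attached to the layer output after the loop body has been built.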
template <class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildRNNFromDescription()
{
    ComputationNetworkBuilder<ElemType> builder(*m_net);
    if (m_net->GetTotalNumberOfNodes() < 1) // not built yet
    {
        unsigned long randomSeed = 1;

        size_t numHiddenLayers = m_layerSizes.size() - 2;

        size_t numRecurrentLayers = m_recurrentLayers.size();

        ComputationNodePtr input, w, b, u, pastValue, output, label, prior;

        input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
        m_net->AddToNodeGroup(L"feature", input);

        if (m_applyMeanVarNorm)
        {
            w = builder.Mean(input);
            b = builder.InvStdDev(input);
            output = builder.PerDimMeanVarNormalization(input, w, b);

            input = output;
        }

        int recur_idx = 0;
        if (numHiddenLayers > 0)
        {
            // TODO: to figure out sparse matrix size
            u = builder.CreateLearnableParameter(L"U0", m_layerSizes[1], m_layerSizes[0]);
            m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);

            if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == 1)
            {
                w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[1]);
                m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

                pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], 1);
                // unless there is a good algorithm to detect loops, use this explicit setup
                output = ApplyNonlinearFunction(
                    builder.Plus(
                        builder.Times(u, input), builder.Times(w, pastValue)),
                    0);
                pastValue->AttachInputs({ output });
                recur_idx++;
            }
            else
            {
                // b is not created on this path (it only holds InvStdDev when mean/var normalization is on),
                // so create a zero-initialized bias here, as the other builders do
                b = builder.CreateLearnableParameter(L"B0", m_layerSizes[1], 1);
                m_net->InitLearnableParameters(b, L"fixedValue", 0);
                output = SimpleNetworkBuilder<ElemType>::ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), 0);
                // TODO: Why the ^^ namespace?
                // output = builder.Times(u, input);
            }

            if (m_addDropoutNodes)
                input = builder.Dropout(output);
            else
                input = output;

            for (int i = 1; i < numHiddenLayers; i++)
            {
                // TODO: to figure out sparse matrix size
                u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
                m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);

                if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i + 1)
                {
                    w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", i), m_layerSizes[i + 1], m_layerSizes[i + 1]);
                    m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

                    pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t) m_layerSizes[i + 1], 1);
                    // unless there is a good algorithm to detect loops, use this explicit setup
                    output = ApplyNonlinearFunction(
                        builder.Plus(
                            builder.Times(u, input), builder.Times(w, pastValue)),
                        0);
                    pastValue->AttachInputs({ output });
                    recur_idx++;
                }
                else
                {
                    // the bias from the first layer may be missing or have the wrong dimension here; create a per-layer bias instead
                    b = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"B%d", i), m_layerSizes[i + 1], 1);
                    m_net->InitLearnableParameters(b, L"fixedValue", 0);
                    output = SimpleNetworkBuilder<ElemType>::ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), i);
                }

                if (m_addDropoutNodes)
                    input = builder.Dropout(output);
                else
                    input = output;
            }
        }

        w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
        m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
        /*m_net->MatrixL2Reg(w , L"L1w");*/

        label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
        AddTrainAndEvalCriterionNodes(input, label, w, L"criterion", L"eval");

        output = builder.Times(w, input, 1, L"outputs");

        m_net->AddToNodeGroup(L"output", output);

        if (m_needPrior)
            prior = builder.Mean(label);
    }

    return m_net;
}

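// Builds an RNN language model trained with class-based cross entropy: the recurrent body matches
// BuildRNNFromDescription, but the output is factored through a class posterior (clslogpostprob),
// which requires m_vocabSize to equal the output layer size and a 4-row label input.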
template <class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyRNNFromDescription()
{
    ComputationNetworkBuilder<ElemType> builder(*m_net);

    if (m_net->GetTotalNumberOfNodes() < 1) // not built yet
    {
        unsigned long randomSeed = 1;

        size_t numHiddenLayers = m_layerSizes.size() - 2;

        size_t numRecurrentLayers = m_recurrentLayers.size();

        ComputationNodePtr input, w, b, u, pastValue, output, label, prior;
        ComputationNodePtr wrd2cls, cls2idx, clslogpostprob, clsweight;

        if (m_vocabSize != m_layerSizes[numHiddenLayers + 1])
            RuntimeError("BuildClassEntropyRNNFromDescription : vocabulary size should be the same as the output layer size");

        input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
        m_net->AddToNodeGroup(L"feature", input);

        if (m_applyMeanVarNorm)
        {
            w = builder.Mean(input);
            b = builder.InvStdDev(input);
            output = builder.PerDimMeanVarNormalization(input, w, b);

            input = output;
        }

        int recur_idx = 0;
        if (numHiddenLayers > 0)
        {
            u = builder.CreateLearnableParameter(L"U0", m_layerSizes[1], m_layerSizes[0]);
            m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);

            if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == 1)
            {
                w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[1]);
                m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

                pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], 1);
                // unless there is a good algorithm to detect loops, use this explicit setup
                output = ApplyNonlinearFunction(
                    builder.Plus(
                        builder.Times(u, input), builder.Times(w, pastValue)),
                    0);
                pastValue->AttachInputs({ output });
                recur_idx++;
            }
            else
            {
                b = builder.CreateLearnableParameter(L"B0", m_layerSizes[1], 1);
                m_net->RandomInitLearnableParameters(b, m_uniformInit, randomSeed++, m_initValueScale);
                output = SimpleNetworkBuilder<ElemType>::ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), 0);
            }

            if (m_addDropoutNodes)
                input = builder.Dropout(output);
            else
                input = output;

            for (int i = 1; i < numHiddenLayers; i++)
            {
                u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
                m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
                if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i + 1)
                {
                    w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", i), m_layerSizes[i + 1], m_layerSizes[i + 1]);
                    m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

                    pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t) m_layerSizes[i + 1], 1);
                    // unless there is a good algorithm to detect loops, use this explicit setup
                    output = ApplyNonlinearFunction(
                        builder.Plus(
                            builder.Times(u, input), builder.Times(w, pastValue)),
                        0);
                    pastValue->AttachInputs({ output });
                    recur_idx++;
                }
                else
                {
                    // the bias from the first layer may be missing or have the wrong dimension here; create a per-layer bias instead
                    b = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"B%d", i), m_layerSizes[i + 1], 1);
                    m_net->InitLearnableParameters(b, L"fixedValue", 0);
                    output = SimpleNetworkBuilder<ElemType>::ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), i);
                }

                if (m_addDropoutNodes)
                    input = builder.Dropout(output);
                else
                    input = output;
            }
        }

        // The output weight is stored as an [input_dim x output_dim] matrix,
        // e.g., [200 x 10000], where 10000 is the vocabulary size.
        // This orientation is a speed optimization: the vector for a given word is obtained with a simple column slice.
        w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
        m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

        // the label is a dense 4-row matrix: each column carries the word index plus the class
        // information required by the class-based cross-entropy criterion
        label = builder.CreateInputNode(L"labels", 4);

        clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
        m_net->RandomInitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
        clslogpostprob = builder.Times(clsweight, input, 1, L"ClassPostProb");

        output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeClassBasedCrossEntropy", L"EvalNodeClassBasedCrossEntrpy",
                                               clslogpostprob);

        m_net->AddToNodeGroup(L"output", output);

        if (m_needPrior)
        {
            prior = builder.Mean(label);
        }
    }

    return m_net;
}

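// Builds a class-based LSTM language model that is additionally conditioned on an auxiliary binary
// feature input ("binaryFeature"): the auxiliary features are projected and added to the topmost
// LSTM output as a global bias before the class-factored output layer.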
template <class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildConditionalLSTMNetworkFromDescription()
{
    ComputationNetworkBuilder<ElemType> builder(*m_net);
    if (m_net->GetTotalNumberOfNodes() < 1) // not built yet
    {
        unsigned long randomSeed = 1;

        size_t numHiddenLayers = m_layerSizes.size() - 2;

        size_t numRecurrentLayers = m_recurrentLayers.size();

        ComputationNodePtr input, w, b, u, e, pastValue, output, label, prior;
        ComputationNodePtr gt;
        ComputationNodePtr clslogpostprob;
        ComputationNodePtr clsweight;

        input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
        m_net->AddToNodeGroup(L"feature", input);

        if (m_applyMeanVarNorm)
        {
            w = builder.Mean(input);
            b = builder.InvStdDev(input);
            output = builder.PerDimMeanVarNormalization(input, w, b);

            input = output;
        }

        if (m_lookupTableOrder > 0)
        {
            e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
            m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
            output = builder.LookupTable(e, input, L"LookupTable");

            if (m_addDropoutNodes)
                input = builder.Dropout(output);
            else
                input = output;
        }
        else
        {
            LogicError("BuildClassLSTMNetworkFromDescription: LSTMNode cannot take sparse input. Need to project sparse input to continuous vector using LookupTable. Suggest using setups below\n layerSizes=$VOCABSIZE$:100:$HIDDIM$:$VOCABSIZE$ \nto have 100 dimension projection, and lookupTableOrder=1\n to project to a single window. To use larger context window, set lookupTableOrder=3 for example with width-3 context window.\n ");
        }

        int recur_idx = 0;
        int offset = m_lookupTableOrder > 0 ? 1 : 0;
        if (numHiddenLayers > 0)
        {
            //           output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
            output = (ComputationNodePtr) BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
            // (BuildLSTMNodeComponent, commented out above, is no longer available; BuildLSTMComponent builds the LSTM from explicit nodes.)
            input = output;
            for (int i = 1 + offset; i < numHiddenLayers; i++)
            {
                //                    output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
                output = (ComputationNodePtr) BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);

                if (m_addDropoutNodes)
                    input = builder.Dropout(output);
                else
                    input = output;
            }
        }

        // serve as a global bias term
        gt = builder.CreateInputNode(L"binaryFeature", m_auxFeatDim);
        m_net->AddToNodeGroup(L"feature", gt);
        e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"AuxTrans%d", 0), m_layerSizes[numHiddenLayers], m_auxFeatDim);
        m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
        u = ApplyNonlinearFunction(builder.Times(e, gt), numHiddenLayers, L"TimesToGetGlobalBias");
        output = builder.Plus(input, u, L"PlusGlobalBias");
        input = output;

        // The output weight is stored as an [input_dim x output_dim] matrix,
        // e.g., [200 x 10000], where 10000 is the vocabulary size.
        // This orientation is a speed optimization: the vector for a given word is obtained with a simple column slice.
        w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
        m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

        // the label is a dense 4-row matrix: each column carries the word index plus the class
        // information required by the class-based cross-entropy criterion
        label = builder.CreateInputNode(L"labels", 4);

        clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
        m_net->RandomInitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
        clslogpostprob = builder.Times(clsweight, input, 1, L"ClassPostProb");

        output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeClassBasedCrossEntropy", L"EvalNodeClassBasedCrossEntrpy",
                                               clslogpostprob);

        output = builder.TransposeTimes(w, input, L"outputs");

        m_net->AddToNodeGroup(L"output", output);

        // add softmax layer (if prob is needed or KL reg adaptation is needed)
        output = builder.Softmax(output, L"PosteriorProb");
    }

    return m_net;
}

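// Builds a log-bilinear style language model: the (optionally embedded) input is combined with up to
// m_maOrder delayed copies of itself through per-delay weight matrices (DD%d), followed by optional
// recurrent layers and a linear output layer.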
template <class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFromDescription()
{
    ComputationNetworkBuilder<ElemType> builder(*m_net);
    if (m_net->GetTotalNumberOfNodes() < 1) // not built yet
    {
        unsigned long randomSeed = 1;

        size_t numHiddenLayers = m_layerSizes.size() - 2;

        size_t numRecurrentLayers = m_recurrentLayers.size();

        ComputationNodePtr input, w, b, u, pastValue, output, label, prior, featin, e;
        ComputationNodePtr bi = nullptr;
        ComputationNodePtr Wxi1 = nullptr, Wxi = nullptr;
        ComputationNodePtr Wxi2 = nullptr, Wxi3 = nullptr, Wxi4 = nullptr;
        ComputationNodePtr ot = nullptr, it = nullptr, ft = nullptr, gt = nullptr, ct = nullptr, ht = nullptr;
        ComputationNodePtr pastValueXI, pastValueXII, pastValueXIII, pastValueXIV;

        //                input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
        input = builder.CreateInputNode(L"features", m_layerSizes[0]);
        featin = input;
        m_net->AddToNodeGroup(L"feature", input);

        if (m_applyMeanVarNorm)
        {
            w = builder.Mean(input);
            b = builder.InvStdDev(input);
            output = builder.PerDimMeanVarNormalization(input, w, b);

            input = output;
        }

        // used for the LookupTable node unit test; to be deleted
        if (m_lookupTableOrder > 0)
        {
            e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
            m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
            output = builder.LookupTable(e, input, L"Lookuptatble");

            if (m_addDropoutNodes)
                input = builder.Dropout(output);
            else
                input = output;
        }

        int recur_idx = 0;
        // unless there is a good algorithm to detect loops, use this explicit setup
        int ik = 1;
        output = input;
        while (ik <= m_maOrder)
        {
            pastValueXI =
                builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], ik, msra::strfun::wstrprintf(L"pastValue%d", ik));
            pastValueXI->SetLearningRateMultiplier(0);
            pastValueXI->AttachInputs({ input });
            // TODO: to figure out sparse matrix size
            Wxi = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"DD%d", ik), m_layerSizes[0], m_layerSizes[0]);
            m_net->RandomInitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);

            it = builder.Plus(output, builder.Times(Wxi, pastValueXI));
            output = it;

            ik++;
        }

        if (m_addDropoutNodes)
            input = builder.Dropout(output);
        else
            input = output;

        for (int i = m_lookupTableOrder > 0 ? 1 : 0; i < numHiddenLayers; i++)
        {
            u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i] * (m_lookupTableOrder > 0 ? m_lookupTableOrder : 1));
            m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
            output = builder.Times(u, input);
            input = output;
            if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i + 1)
            {
                w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"R%d", i + 1), m_layerSizes[i + 1], m_layerSizes[i + 1]);
                m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
                pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i + 1], 1);
                output = builder.Plus(builder.Times(w, pastValue), input);

                pastValue->AttachInputs({ output });
                input = output;
                recur_idx++;
            }

            bi = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bi%d", i), m_layerSizes[i + 1], 1);
            m_net->InitLearnableParameters(bi, L"fixedValue", 0);
            output = builder.Plus(input, bi);

            if (m_addDropoutNodes)
                input = builder.Dropout(output);
            else
                input = output;
        }

        w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
        m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

        label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
        AddTrainAndEvalCriterionNodes(input, label, w);

        output = builder.Times(w, input, 1, L"outputs");

        m_net->AddToNodeGroup(L"output", output);

        if (m_needPrior)
        {
            prior = builder.Mean(label);
        }
    }

    return m_net;
}

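// Builds a feed-forward (n-gram style) neural language model: the first layer combines the current
// input with delayed copies at delays 1..4 through separate weight matrices and a Tanh nonlinearity;
// subsequent layers are standard affine (optionally recurrent) layers feeding a linear output layer.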
template <class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildDNNLMNetworkFromDescription()
{
    ComputationNetworkBuilder<ElemType> builder(*m_net);
    if (m_net->GetTotalNumberOfNodes() < 1) // not built yet
    {
        unsigned long randomSeed = 1;

        size_t numHiddenLayers = m_layerSizes.size() - 2;

        size_t numRecurrentLayers = m_recurrentLayers.size();

        ComputationNodePtr input = nullptr, w = nullptr, b = nullptr, u = nullptr, pastValue, output = nullptr, label = nullptr, prior = nullptr;
        ComputationNodePtr bi = nullptr;
        ComputationNodePtr Wxi1 = nullptr, Wxi = nullptr;
        ComputationNodePtr Wxi2 = nullptr, Wxi3 = nullptr, Wxi4 = nullptr;
        ComputationNodePtr ot = nullptr, it = nullptr, ft = nullptr, gt = nullptr, ct = nullptr, ht = nullptr;
        ComputationNodePtr pastValueXI, pastValueXII, pastValueXIII, pastValueXIV;

        input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
        m_net->AddToNodeGroup(L"feature", input);

        if (m_applyMeanVarNorm)
        {
            w = builder.Mean(input);
            b = builder.InvStdDev(input);
            output = builder.PerDimMeanVarNormalization(input, w, b);

            input = output;
        }

        int recur_idx = 0;
        if (numHiddenLayers > 0)
        {
            bi = builder.CreateLearnableParameter(L"bi0", m_layerSizes[1], 1);
            m_net->InitLearnableParameters(bi, L"fixedValue", 0);

            pastValueXI = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 1);
            pastValueXII = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 2);
            pastValueXIII = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 3);
            pastValueXIV = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 4);
            pastValueXI->AttachInputs({ input });
            pastValueXII->AttachInputs({ input });
            pastValueXIII->AttachInputs({ input });
            pastValueXIV->AttachInputs({ input });

            if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == 1)
            {
                // TODO: to figure out sparse matrix size
                Wxi2 = builder.CreateLearnableParameter(L"WXI2", m_layerSizes[1], m_layerSizes[0]);
                m_net->RandomInitLearnableParameters(Wxi2, m_uniformInit, randomSeed++, m_initValueScale);
                // TODO: to figure out sparse matrix size
                Wxi3 = builder.CreateLearnableParameter(L"WXI3", m_layerSizes[1], m_layerSizes[0]);
                m_net->RandomInitLearnableParameters(Wxi3, m_uniformInit, randomSeed++, m_initValueScale);
                // TODO: to figure out sparse matrix size
                Wxi4 = builder.CreateLearnableParameter(L"WXI4", m_layerSizes[1], m_layerSizes[0]);
                m_net->RandomInitLearnableParameters(Wxi4, m_uniformInit, randomSeed++, m_initValueScale);
                // TODO: to figure out sparse matrix size
                Wxi1 = builder.CreateLearnableParameter(L"WXI1", m_layerSizes[1], m_layerSizes[0]);
                m_net->RandomInitLearnableParameters(Wxi1, m_uniformInit, randomSeed++, m_initValueScale);
                // TODO: to figure out sparse matrix size
                Wxi = builder.CreateLearnableParameter(L"WXI", m_layerSizes[1], m_layerSizes[0]);
                m_net->RandomInitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);

                // unless there is a good algorithm to detect loops, use this explicit setup
                it = builder.Plus(
                    builder.Tanh(
                        builder.Plus(
                            builder.Times(Wxi4, pastValueXIV),
                            builder.Plus(
                                builder.Times(Wxi3, pastValueXIII),
                                builder.Plus(
                                    builder.Times(Wxi2, pastValueXII),
                                    builder.Plus(
                                        builder.Times(Wxi1, pastValueXI),
                                        builder.Times(Wxi, input)))))),
                    bi);
                output = it;
                pastValueXI->SetLearningRateMultiplier(0);
                pastValueXII->SetLearningRateMultiplier(0);
                pastValueXIII->SetLearningRateMultiplier(0);
                pastValueXIV->SetLearningRateMultiplier(0);
                recur_idx++;
            }
            else
            {
                u = builder.CreateLearnableParameter(L"U0", m_layerSizes[1], m_layerSizes[0]); // u and b are not created on this path; create them here
                m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
                b = builder.CreateLearnableParameter(L"B0", m_layerSizes[1], 1);
                m_net->InitLearnableParameters(b, L"fixedValue", 0);
                output = SimpleNetworkBuilder<ElemType>::ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), 0);
            }

            if (m_addDropoutNodes)
                input = builder.Dropout(output);
            else
                input = output;

            for (int i = 1; i < numHiddenLayers; i++)
            {
                u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
                m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
                if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i + 1)
                {
                    w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", i), m_layerSizes[i + 1], m_layerSizes[i + 1]);
                    m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
                    std::list<ComputationNodeBasePtr> recurrent_loop;
                    pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i + 1], 1);
                    output = SimpleNetworkBuilder<ElemType>::ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), builder.Times(w, pastValue)), i);
                    pastValue->AttachInputs({ output });
                    recur_idx++;
                }
                else
                {
                    // the bias from the first layer may be missing or have the wrong dimension here; create a per-layer bias instead
                    b = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"B%d", i), m_layerSizes[i + 1], 1);
                    m_net->InitLearnableParameters(b, L"fixedValue", 0);
                    output = SimpleNetworkBuilder<ElemType>::ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), i);
                }

                if (m_addDropoutNodes)
                    input = builder.Dropout(output);
                else
                    input = output;
            }
        }

        // TODO: to figure out sparse matrix size
        w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
        m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
        //b = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"B%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], 1);
        //m_net->InitLearnableParameters(b, L"fixedValue", 0);
        label = builder.CreateSparseInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
        AddTrainAndEvalCriterionNodes(input, label, w);

        output = builder.Times(w, input);

        m_net->AddToNodeGroup(L"output", output);

        if (m_needPrior)
        {
            prior = builder.Mean(label);
        }
    }

    return m_net;
}

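// Adds an optional "direct connect" (skip connection) from 'input' to 'toNode' for every entry of
// m_directConnect that equals iLayer: the input is projected, passed through the layer nonlinearity,
// scaled by a small learnable scalar (initialized to 0.01), and added to toNode.
// Returns the merged node, or an empty pointer if iLayer has no direct connection.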
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildDirectConnect(unsigned long& randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, ComputationNodePtr toNode)
{
    ComputationNetworkBuilder<ElemType> builder(*m_net);

    ComputationNodePtr directOutput, mergedNode;

    for (size_t i = 0; i < m_directConnect.size(); i++)
    {
        if (m_directConnect[i] == iLayer)
        {
            ComputationNodePtr directWIO = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"D%d", i), outputDim, inputDim);
            m_net->RandomInitLearnableParameters(directWIO, m_uniformInit, randomSeed++, m_initValueScale);
            directOutput = ApplyNonlinearFunction(builder.Times(directWIO, input), i);

            ComputationNodePtr scalar = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"SV%d", i), 1, 1);
            m_net->InitLearnableParameters(scalar, L"fixedValue", (ElemType) 0.01);
            ComputationNodePtr scaled = builder.ElementTimes(scalar, directOutput, msra::strfun::wstrprintf(L"S%d", i));

            mergedNode = builder.Plus(toNode, scaled);
        }
    }

    return mergedNode;
}

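// Builds one LSTM layer from explicit nodes, with peephole connections realized via DiagTimes.
// Ignoring the const-gate shortcuts below, the per-step computation is approximately
//   i_t = sigma(Wxi x_t + Whi h_{t-1} + Wci . c_{t-1} + bi)        (input gate)
//   f_t = sigma(Wxf x_t + Whf h_{t-1} + Wcf . c_{t-1} + bf)        (forget gate)
//   c_t = f_t . c_{t-1} + i_t . tanh(Wxc x_t + Whc h_{t-1} + bc)   (cell state)
//   o_t = sigma(Wxo x_t + Who h_{t-1} + Wco . c_t + bo)            (output gate)
//   h_t = o_t . tanh(c_t)
// where '.' is the element-wise product and sigma is the nonlinearity chosen by ApplyNonlinearFunction.
// The PastValue nodes supply h_{t-1} and c_{t-1}; their inputs are attached after the loop body is built.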
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildLSTMComponent(unsigned long& randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr inputObs)
{
    ComputationNetworkBuilder<ElemType> builder(*m_net);

    size_t numHiddenLayers = m_layerSizes.size() - 2;

    ComputationNodePtr input, w, b, u, e, pastValue, output, label, prior;
    ComputationNodePtr Wxo, Who, Wco, bo, Wxi, Whi, Wci, bi;
    ComputationNodePtr Wxf, Whf, Wcf, bf, Wxc, Whc, bc;
    ComputationNodePtr ot, it, ft, ct, ht;
    ComputationNodePtr pastValueHI, pastValueCI, pastValueHO, pastValueHF, pastValueHC, pastValueCF, pastValueCC;
    ComputationNodePtr directWIO, directInput, directOutput;
    ComputationNodePtr bit, bft, bct;

    input = inputObs;
    Wxo = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WXO%d", iLayer), outputDim, inputDim);
    Wxi = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WXI%d", iLayer), outputDim, inputDim);
    Wxf = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WXF%d", iLayer), outputDim, inputDim);
    Wxc = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WXC%d", iLayer), outputDim, inputDim);

    m_net->RandomInitLearnableParameters(Wxo, m_uniformInit, randomSeed++, m_initValueScale);
    m_net->RandomInitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);
    m_net->RandomInitLearnableParameters(Wxf, m_uniformInit, randomSeed++, m_initValueScale);
    m_net->RandomInitLearnableParameters(Wxc, m_uniformInit, randomSeed++, m_initValueScale);

    bo = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bo%d", iLayer), outputDim, 1);
    bc = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bc%d", iLayer), outputDim, 1);
    bi = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bi%d", iLayer), outputDim, 1);
    bf = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bf%d", iLayer), outputDim, 1);

    m_net->InitLearnableParameters(bi, L"fixedValue", m_inputGateInitVal);
    m_net->InitLearnableParameters(bc, L"fixedValue", 0);
    m_net->InitLearnableParameters(bo, L"fixedValue", m_outputGateInitVal);
    m_net->InitLearnableParameters(bf, L"fixedValue", m_forgetGateInitVal);

    Whi = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WHI%d", iLayer), outputDim, outputDim);
    m_net->RandomInitLearnableParameters(Whi, m_uniformInit, randomSeed++, m_initValueScale);
    Wci = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WCI%d", iLayer), outputDim, 1);
    m_net->RandomInitLearnableParameters(Wci, m_uniformInit, randomSeed++, m_initValueScale);

    Whf = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WHF%d", iLayer), outputDim, outputDim);
    m_net->RandomInitLearnableParameters(Whf, m_uniformInit, randomSeed++, m_initValueScale);
    Wcf = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WCF%d", iLayer), outputDim, 1);
    m_net->RandomInitLearnableParameters(Wcf, m_uniformInit, randomSeed++, m_initValueScale);

    Who = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WHO%d", iLayer), outputDim, outputDim);
    m_net->RandomInitLearnableParameters(Who, m_uniformInit, randomSeed++, m_initValueScale);
    Wco = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WCO%d", iLayer), outputDim, 1);
    m_net->RandomInitLearnableParameters(Wco, m_uniformInit, randomSeed++, m_initValueScale);

    Whc = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WHC%d", iLayer), outputDim, outputDim);
    m_net->RandomInitLearnableParameters(Whc, m_uniformInit, randomSeed++, m_initValueScale);

    size_t layer1 = outputDim;

    pastValueHI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
    pastValueHF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
    pastValueHO = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
    pastValueHC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
    pastValueCI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
    pastValueCF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
    pastValueCC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);

    if (m_constInputGateValue)
    {
        // it = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"CONSTIT%d", iLayer), outputDim);
        // m_net->InitLearnableParameters(it, L"fixedValue", m_constInputGateValue);
        // it->SetLearningRateMultiplier(0);
        it = nullptr;
    }
    else
        it = ApplyNonlinearFunction(
            builder.Plus(
                builder.Plus(
                    builder.Plus(
                        builder.Times(Wxi, input),
                        bi),
                    builder.Times(Whi, pastValueHI)),
                builder.DiagTimes(Wci, pastValueCI)),
            0);

    if (it == nullptr)
    {
        bit = builder.Tanh(
            builder.Plus(
                builder.Times(Wxc, input),
                builder.Plus(
                    builder.Times(Whc, pastValueHC),
                    bc)));
    }
    else
    {
        bit = builder.ElementTimes(it,
                                   builder.Tanh(
                                       builder.Plus(
                                           builder.Times(Wxc, input),
                                           builder.Plus(
                                               builder.Times(Whc, pastValueHC),
                                               bc))));
    }

    if (m_constForgetGateValue)
    {
        ft = nullptr;
    }
    else
        ft = ApplyNonlinearFunction(
            builder.Plus(
                builder.Plus(
                    builder.Plus(
                        builder.Times(Wxf, input),
                        bf),
                    builder.Times(Whf, pastValueHF)),
                builder.DiagTimes(Wcf, pastValueCF)),
            0);

    if (ft == nullptr)
    {
        bft = pastValueCC;
    }
    else
    {
        bft = builder.ElementTimes(ft, pastValueCC);
    }

    ct = builder.Plus(bft, bit);

    if (m_constOutputGateValue)
    {
        ot = nullptr;
    }
    else
        ot = ApplyNonlinearFunction(
            builder.Plus(
                builder.Plus(
                    builder.Plus(
                        builder.Times(Wxo, input),
                        bo),
                    builder.Times(Who, pastValueHO)),
                builder.DiagTimes(Wco, ct)),
            0);

    if (ot == nullptr)
    {
        output = builder.Tanh(ct);
    }
    else
    {
        output = builder.ElementTimes(ot, builder.Tanh(ct));
    }

    pastValueHO->AttachInputs({ output });
    pastValueHI->AttachInputs({ output });
    pastValueHF->AttachInputs({ output });
    pastValueHC->AttachInputs({ output });
    pastValueCI->AttachInputs({ ct });
    pastValueCF->AttachInputs({ ct });
    pastValueCC->AttachInputs({ ct });

    if (m_addDropoutNodes)
        input = builder.Dropout(output);
    else
        input = output;
    output = input;

    return output;
}

#ifdef COMING_SOON

template <class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildCRFLSTMNetworkFromDescription()
{
    ComputationNetworkBuilder<ElemType> builder(*m_net);
    if (m_net->GetTotalNumberOfNodes() < 1) // not built yet
    {
        ULONG randomSeed = 1;

        size_t numHiddenLayers = m_layerSizes.size() - 2;

        size_t numRecurrentLayers = m_recurrentLayers.size();

        ComputationNodePtr input, w, b, u, e, pastValue, output, label, prior;
        ComputationNodePtr Wxo, Who, Wco, bo, Wxi, Whi, Wci, bi;
        ComputationNodePtr Wxf, Whf, Wcf, bf, Wxc, Whc, bc;
        ComputationNodePtr ot, it, ft, ct, ht;
        ComputationNodePtr pastValueHI, pastValueCI, pastValueHO, pastValueHF, pastValueHC, pastValueCF, pastValueCC;
        ComputationNodePtr directWIO, directInput, directOutput;
        ComputationNodePtr outputFromEachLayer[MAX_DEPTH] = {nullptr};
        ComputationNodePtr trans;

        input = builder.CreateInputNode(L"features", m_layerSizes[0]);
        m_net->AddToNodeGroup(L"feature", input);

        if (m_applyMeanVarNorm)
        {
            w = builder.Mean(input);
            b = builder.InvStdDev(input);
            output = builder.PerDimMeanVarNormalization(input, w, b);

            input = output;
        }

        if (m_lookupTableOrder > 0)
        {
            e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
            m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
            output = builder.LookupTable(e, input, L"LookupTable");

            if (m_addDropoutNodes)
                input = builder.Dropout(output);
            else
                input = output;

            outputFromEachLayer[1] = input;
        }

        // direct connect from input node to output node

        int recur_idx = 0;
        int offset = m_lookupTableOrder > 0 ? 1 : 0;
        if (numHiddenLayers > 0)
        {
            for (int i = offset; i < numHiddenLayers; i++)
            {
                if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i + 1)
                {
                    output = (ComputationNodePtr) BuildLSTMComponent(randomSeed, i, m_layerSizes[i] * (offset ? m_lookupTableOrder : 1), m_layerSizes[i + 1], input);
                    input = output;

                    recur_idx++;
                }
                else
                {
                    u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i] * (offset ? m_lookupTableOrder : 1));
                    m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
                    b = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"B%d", i), m_layerSizes[i + 1], 1);
                    m_net->InitLearnableParameters(b, L"fixedValue", 0);
                    output = ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), i);
                }

                if (m_addDropoutNodes)
                    input = builder.Dropout(output);
                else
                    input = output;
            }
        }

        w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"TimesBeforeSoftMax%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
        m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

        output = builder.Times(w, input, L"outputsBeforeSoftmax");

        trans = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"TransProb%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers + 1]);
        m_net->InitLearnableParameters(trans, L"fixedValue", (ElemType) 1.0 / m_layerSizes[numHiddenLayers + 1]);
        //          m_net->RandomInitLearnableParameters(trans, m_uniformInit, randomSeed++, m_initValueScale);
        trans->SetLearningRateMultiplier(1.0f);
        label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
        AddTrainAndEvalCriterionNodes(output, label, nullptr, L"CRFTrainCriterion", L"CRFEvalCriterion", nullptr, trans);

        input = output;
        output = builder.SequenceDecoder(label, input, trans, L"outputs");
        m_net->AddToNodeGroup(L"output", output);

        output = builder.Softmax(input, L"PosteriorProb");
    }

    return m_net;
}

#endif

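// Builds a class-based LSTM language model: a LookupTable projection of the sparse input (required),
// one or more LSTM layers, and a class-factored output layer trained with class-based cross entropy
// (see the 4-row label input and clslogpostprob below).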
template <class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassLSTMNetworkFromDescription()
{
    ComputationNetworkBuilder<ElemType> builder(*m_net);
    if (m_net->GetTotalNumberOfNodes() < 1) // not built yet
    {
        unsigned long randomSeed = 1;

        size_t numHiddenLayers = m_layerSizes.size() - 2;

        size_t numRecurrentLayers = m_recurrentLayers.size();

        ComputationNodePtr input, w, b, u, e, pastValue, output, label, prior;
        ComputationNodePtr Wxo, Who, Wco, bo, Wxi, Whi, Wci, bi;
        ComputationNodePtr clslogpostprob;
        ComputationNodePtr clsweight;

        input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
        m_net->AddToNodeGroup(L"feature", input);

        if (m_applyMeanVarNorm)
        {
            w = builder.Mean(input);
            b = builder.InvStdDev(input);
            output = builder.PerDimMeanVarNormalization(input, w, b);

            input = output;
        }

        if (m_lookupTableOrder > 0)
        {
            e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
            m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
            output = builder.LookupTable(e, input, L"LookupTable");

            if (m_addDropoutNodes)
                input = builder.Dropout(output);
            else
                input = output;
        }
        else
        {
            LogicError("BuildClassLSTMNetworkFromDescription: LSTMNode cannot take sparse input. Need to project sparse input to continuous vector using LookupTable. Suggest using setups below\n layerSizes=$VOCABSIZE$:100:$HIDDIM$:$VOCABSIZE$ \nto have 100 dimension projection, and lookupTableOrder=1\n to project to a single window. To use larger context window, set lookupTableOrder=3 for example with width-3 context window.\n ");
        }

        int recur_idx = 0;
        int offset = m_lookupTableOrder > 0 ? 1 : 0;
        if (numHiddenLayers > 0)
        {
            //                output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
            output = (ComputationNodePtr) BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
            // (BuildLSTMNodeComponent, commented out above, is no longer available; BuildLSTMComponent builds the LSTM from explicit nodes.)
            input = output;
            for (int i = 1 + offset; i < numHiddenLayers; i++)
            {
                //                    output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
                output = (ComputationNodePtr) BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);

                if (m_addDropoutNodes)
                    input = builder.Dropout(output);
                else
                    input = output;
            }
        }

        // The output weight is stored as an [input_dim x output_dim] matrix,
        // e.g., [200 x 10000], where 10000 is the vocabulary size.
        // This orientation is a speed optimization: the vector for a given word is obtained with a simple column slice.
        w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
        m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

        // the label is a dense 4-row matrix: each column carries the word index plus the class
        // information required by the class-based cross-entropy criterion
        label = builder.CreateInputNode(L"labels", 4);

        clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
        m_net->RandomInitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
        clslogpostprob = builder.Times(clsweight, input, 1, L"ClassPostProb");

        output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeClassBasedCrossEntropy", L"EvalNodeClassBasedCrossEntrpy",
                                               clslogpostprob);

        output = builder.TransposeTimes(w, input, L"outputs");

        m_net->AddToNodeGroup(L"output", output);

        // add softmax layer (if prob is needed or KL reg adaptation is needed)
        output = builder.Softmax(output, L"PosteriorProb");
    }

    return m_net;
}

#if 1
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildLSTMNodeComponent(ULONG&, size_t, size_t, size_t, ComputationNodePtr)
{
    InvalidArgument("BuildLSTMNodeComponent: LSTMNode is no longer available. You should not get here.");
}
#else
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildLSTMNodeComponent(ULONG& randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr inputObs)
{
    ComputationNetworkBuilder<ElemType> builder(*m_net);
    size_t numHiddenLayers = m_layerSizes.size() - 2;

    ComputationNodePtr input, output;
    ComputationNodePtr wInputGate, wForgetGate, wOutputGate, wMemoryCellMatrix;

    input = inputObs;
    size_t nDim = inputDim + outputDim + 2;
    wInputGate = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WINPUTGATE%d", iLayer), outputDim, nDim);
    m_net->RandomInitLearnableParameters(wInputGate, m_uniformInit, randomSeed++, m_initValueScale);
    wInputGate->Value().ColumnSlice(0, 1).SetValue(m_inputGateInitVal); // init to input gate bias
    wForgetGate = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WFORGETGATE%d", iLayer), outputDim, nDim);
    m_net->RandomInitLearnableParameters(wForgetGate, m_uniformInit, randomSeed++, m_initValueScale);
    wForgetGate->Value().ColumnSlice(0, 1).SetValue(m_forgetGateInitVal); // init to forget gate bias
    wOutputGate = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WOUTPUTGATE%d", iLayer), outputDim, nDim);
    m_net->RandomInitLearnableParameters(wOutputGate, m_uniformInit, randomSeed++, m_initValueScale);
    wOutputGate->Value().ColumnSlice(0, 1).SetValue(m_outputGateInitVal); // init to output gate bias
    wMemoryCellMatrix = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WMEMORYCELLWEIGHT%d", iLayer), outputDim, inputDim + outputDim + 1);
    m_net->RandomInitLearnableParameters(wMemoryCellMatrix, m_uniformInit, randomSeed++, m_initValueScale);
    wMemoryCellMatrix->Value().ColumnSlice(0, 1).SetValue(0); // init to memory cell bias

    output = builder.LSTM(inputObs, wInputGate, wForgetGate, wOutputGate, wMemoryCellMatrix, msra::strfun::wstrprintf(L"LSTM%d", iLayer));

#ifdef DEBUG_DECODER
    wInputGate->Value().SetValue((ElemType) 0.01);
    wForgetGate->Value().SetValue((ElemType) 0.01);
    wOutputGate->Value().SetValue((ElemType) 0.01);
    wMemoryCellMatrix->Value().SetValue((ElemType) 0.01);
#endif

    if (m_addDropoutNodes)
        input = builder.Dropout(output);
    else
        input = output;
    output = input;

    return output;
}
#endif

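// Builds a general (deep) LSTM network: optional mean/var normalization and lookup-table projection,
// LSTM layers for the indices listed in m_recurrentLayers (plain affine layers otherwise), a linear
// output layer with train/eval criteria, an optional prior-scaled log-likelihood output, and a softmax node.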
template <class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMNetworkFromDescription()
{
    ComputationNetworkBuilder<ElemType> builder(*m_net);
    if (m_net->GetTotalNumberOfNodes() < 1) // not built yet
    {
        ULONG randomSeed = 1;

        size_t numHiddenLayers = m_layerSizes.size() - 2;

        size_t numRecurrentLayers = m_recurrentLayers.size();

        ComputationNodePtr input, w, b, u, e, pastValue, output, label, prior;
        ComputationNodePtr Wxo, Who, Wco, bo, Wxi, Whi, Wci, bi;
        ComputationNodePtr Wxf, Whf, Wcf, bf, Wxc, Whc, bc;
        ComputationNodePtr ot, it, ft, ct, ht;
        ComputationNodePtr pastValueHI, pastValueCI, pastValueHO, pastValueHF, pastValueHC, pastValueCF, pastValueCC;
        ComputationNodePtr directWIO, directInput, directOutput;
        ComputationNodePtr outputFromEachLayer[MAX_DEPTH] = {nullptr};

        if (m_sparse_input)
            input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
        else
            input = builder.CreateInputNode(L"features", m_layerSizes[0]);

        m_net->AddToNodeGroup(L"feature", input);

        if (m_applyMeanVarNorm)
        {
            w = builder.Mean(input);
            b = builder.InvStdDev(input);
            output = builder.PerDimMeanVarNormalization(input, w, b);

            input = output;
        }

        if (m_lookupTableOrder > 0)
        {
            e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
            m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
            output = builder.LookupTable(e, input, L"LookupTable");
#ifdef DEBUG_DECODER
            e->Value().SetValue((ElemType) 0.01);
#endif

            if (m_addDropoutNodes)
                input = builder.Dropout(output);
            else
                input = output;

            outputFromEachLayer[1] = input;
        }

        // direct connect from input node to output node

        int recur_idx = 0;
        int offset = m_lookupTableOrder > 0 ? 1 : 0;
        if (numHiddenLayers > 0)
        {

            // output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
            output = (ComputationNodePtr) BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
            // (BuildLSTMNodeComponent, commented out above, is no longer available; BuildLSTMComponent builds the LSTM from explicit nodes.)
            input = output;
            outputFromEachLayer[offset + 1] = input;

            for (int i = 1 + offset; i < numHiddenLayers; i++)
            {
                if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i)
                {

                    // output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
                    output = (ComputationNodePtr) BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
                    // BuildLSTMComponent replaces the previously used BuildLSTMNodeComponent above; it uses the LSTM node, which is correct and fast

                    recur_idx++;
                }
                else
                {
                    u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
                    m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
                    b = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"B%d", i), m_layerSizes[i + 1], 1);
                    m_net->InitLearnableParameters(b, L"fixedValue", 0);
                    output = ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), i);
                }

                if (m_addDropoutNodes)
                    input = builder.Dropout(output);
                else
                    input = output;

                outputFromEachLayer[i + 1] = input;
            }
        }

        w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
        m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
#ifdef DEBUG_DECODER
        w->Value().SetValue((ElemType) 0.01);
#endif
        label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
        AddTrainAndEvalCriterionNodes(input, label, w);

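        // the criterion nodes created above consume w themselves (e.g. as Times(w, input)),
        // so a separate "outputs" Times node is created here for scoring/output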
        output = builder.Times(w, input, 1, L"outputs");

        if (m_needPrior)
        {
            prior = builder.Mean(label);
            input = builder.Log(prior, L"LogOfPrior");
            ComputationNodePtr scaledLogLikelihood = builder.Minus(output, input, L"ScaledLogLikelihood");
            m_net->AddToNodeGroup(L"output", scaledLogLikelihood);
        }
        else
            m_net->AddToNodeGroup(L"output", output);

        // add a softmax layer (needed if posterior probabilities are required or for KL-regularized adaptation)
        output = builder.Softmax(output, L"PosteriorProb");
    }

    return m_net;
}

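// Builds an LSTM language model trained with noise-contrastive estimation (NCE):
// the same feature/embedding/LSTM stack as BuildLSTMNetworkFromDescription, plus direct connections from each
// hidden layer to the input of the output layer, and an NCE-based cross-entropy criterion with an output
// weight matrix stored as [hidden_dim x vocab_size] and a per-word bias vector initialized to -log(vocab_size).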
template <class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNCELSTMNetworkFromDescription()
{
    ComputationNetworkBuilder<ElemType> builder(*m_net);
    if (m_net->GetTotalNumberOfNodes() < 1) // not built yet
    {
        unsigned long randomSeed = 1;

        size_t numHiddenLayers = m_layerSizes.size() - 2;
        size_t numRecurrentLayers = m_recurrentLayers.size();

        ComputationNodePtr input, w, b, u, e, pastValue, output, label, prior;
        ComputationNodePtr Wxo, Who, Wco, bo, Wxi, Whi, Wci, bi;
        ComputationNodePtr clslogpostprob;
        ComputationNodePtr bias;
        ComputationNodePtr outputFromEachLayer[MAX_DEPTH] = {nullptr};

        input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
        m_net->AddToNodeGroup(L"feature", input);

        if (m_applyMeanVarNorm)
        {
            w = builder.Mean(input);
            b = builder.InvStdDev(input);
            output = builder.PerDimMeanVarNormalization(input, w, b);

            input = output;
        }

        if (m_lookupTableOrder > 0)
        {
            e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
            m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
            output = builder.LookupTable(e, input, L"LookupTable");

            if (m_addDropoutNodes)
                input = builder.Dropout(output);
            else
                input = output;

            outputFromEachLayer[1] = input;
        }

        // direct connect from input node to output node

        int recur_idx = 0;
        int offset = m_lookupTableOrder > 0 ? 1 : 0;
        if (numHiddenLayers > 0)
        {
            output = (ComputationNodePtr) BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
            input = output;
            outputFromEachLayer[offset + 1] = input;

            for (int i = 1 + offset; i < numHiddenLayers; i++)
            {
                if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i)
                {
                    output = (ComputationNodePtr) BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);

                    recur_idx++;
                }
                else
                {
                    u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
                    m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
                    b = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"B%d", i), m_layerSizes[i + 1], 1);
                    m_net->InitLearnableParameters(b, L"fixedValue", 0);
                    output = ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), i);
                }

                if (m_addDropoutNodes)
                    input = builder.Dropout(output);
                else
                    input = output;

                outputFromEachLayer[i + 1] = input;
            }
        }

        for (size_t i = offset; i < m_layerSizes.size(); i++)
        {
            // add a direct connection from each layer's output to the input of the output layer
            output = BuildDirectConnect(randomSeed, i, (i > 1) ? m_layerSizes[i] : ((offset == 0) ? m_layerSizes[i] : m_layerSizes[i] * m_lookupTableOrder), m_layerSizes[numHiddenLayers], outputFromEachLayer[i], input);
            if (output != nullptr)
                input = output;
        }

        // the output weight matrix is stored as [input_dim x output_dim],
        // e.g., [200 x 10000], where 10000 is the vocabulary size;
        // this layout is for speed: the weight vector of each word can be obtained with a simple column slice
        w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
        m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

        // the label is a dense matrix; each element is a word index
        label = builder.CreateInputNode(L"labels", 2 * (this->nce_noises + 1));

        bias = builder.CreateLearnableParameter(L"BiasVector", 1, m_layerSizes[m_layerSizes.size() - 1]);
        m_net->InitLearnableParameters(bias, L"fixedValue", (ElemType) -std::log(m_layerSizes[m_layerSizes.size() - 1]));
        // m_net->RandomInitLearnableParameters(bias, m_uniformInit, randomSeed++, std::log(m_layerSizes[m_layerSizes.size() - 1])* m_initValueScale);
        // clslogpostprob = builder.Times(clsweight, input, 1, L"ClassPostProb");

        output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeNCEBasedCrossEntropy", L"EvalNodeNCEBasedCrossEntropy", bias);

        m_net->AddToNodeGroup(L"output", output);

        if (m_needPrior)
        {
            prior = builder.Mean(label);
        }
    }

    return m_net;
}

// load a model file from Frank Seide's Microsoft-internal legacy tool "DBN.exe"
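// Expected file layout (as parsed below): a "DBN\n" tag, a comment string, a "BDBN" tag with version and layer count,
// global feature mean/stddev ("gmean"/"gstddev"), a "BNET" tag, then per layer a type string ("perceptron",
// "rbmisalinearbernoulli", or an rbmbernoullibernoulli-style nonlinear layer) with matrices "W", "a", "b",
// followed by "ENET", the class priors "Pu" (perceptron top layer only), and a closing "EDBN" tag.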
template <class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNetworkFromDbnFile(const std::wstring& dbnModelFileName)
{
    ComputationNetworkBuilder<ElemType> builder(*m_net);

    std::string hdr, comment, name;
    int version;
    int numLayers, i;
    std::string layerType;

    unsigned long randomSeed = 1;

    ComputationNodePtr input, w, b, output, label, prior, scaledLogLikelihood;
    shared_ptr<PreComputedNodeBase<ElemType>> pcNodePtr;

    File fstream(dbnModelFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead);

    if (!CheckDbnTag(fstream, "DBN\n"))
        RuntimeError("Error reading DBN file - did not find expected tag DBN\n");
    fstream >> comment;
    if (!CheckDbnTag(fstream, "BDBN"))
        RuntimeError("Error reading DBN file - did not find expected tag BDBN\n");
    fstream >> version >> numLayers;

    Matrix<ElemType> globalMean = ReadMatrixFromDbnFile(fstream, std::string("gmean"));
    Matrix<ElemType> globalStdDev = ReadMatrixFromDbnFile(fstream, std::string("gstddev"));
    assert(globalMean.GetNumCols() == 1);
    assert(globalStdDev.GetNumCols() == 1);

    // move to CPU since element-wise operation is expensive and can go wrong in GPU
    int curDevId = globalStdDev.GetDeviceId();
    globalStdDev.TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false);
    for (int i2 = 0; i2 < globalStdDev.GetNumRows(); i2++)
        globalStdDev(i2, 0) = (ElemType) 1.0 / (const ElemType) globalStdDev(i2, 0);
    globalStdDev.TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false);

    if (!CheckDbnTag(fstream, "BNET"))
        RuntimeError("Error reading DBN file - did not find expected tag BNET\n");

    for (i = 0; i < numLayers; i++) // 0th index is for input layer,
    {
        fstream >> layerType;

        Matrix<ElemType> wts = ReadMatrixFromDbnFile(fstream, std::string("W"));
        Matrix<ElemType> bias = ReadMatrixFromDbnFile(fstream, std::string("a")); // remnant from pretraining, not needed
        Matrix<ElemType> A = ReadMatrixFromDbnFile(fstream, std::string("b"));
        if (i == 0)
        {
            input = builder.CreateInputNode(L"features", wts.GetNumCols());
            m_net->AddToNodeGroup(L"feature", input);

            size_t frameDim = globalMean.GetNumRows();
            size_t numContextFrames = wts.GetNumCols() / frameDim;
            size_t contextDim = numContextFrames * frameDim;
            Matrix<ElemType> contextMean(contextDim, 1, m_deviceId);
            Matrix<ElemType> contextStdDev(contextDim, 1, m_deviceId);

            // move to CPU since element-wise operation is expensive and can go wrong in GPU
            contextMean.TransferFromDeviceToDevice(m_deviceId, CPUDEVICE, true, false, false);
            contextStdDev.TransferFromDeviceToDevice(m_deviceId, CPUDEVICE, true, false, false);
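            // tile the per-frame global mean and (already inverted) stddev across all context frames of the input window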
            for (size_t j = 0; j < frameDim; j++)
            {
                for (size_t k = 0; k < numContextFrames; k++)
                {
                    contextMean(j + k * frameDim, 0) = (const ElemType) globalMean(j, 0);
                    contextStdDev(j + k * frameDim, 0) = (const ElemType) globalStdDev(j, 0);
                }
            }
            contextMean.TransferFromDeviceToDevice(CPUDEVICE, m_deviceId, true, false, false);
            contextStdDev.TransferFromDeviceToDevice(CPUDEVICE, m_deviceId, true, false, false);

            w = builder.Mean(input, L"MeanOfFeatures");
            static_pointer_cast<PreComputedNodeBase<ElemType>>(w)->SideLoadFromMatrix(contextMean);
            w->SetLearningRateMultiplier(0);

            b = builder.InvStdDev(input, L"InvStdOfFeatures");
            static_pointer_cast<PreComputedNodeBase<ElemType>>(b)->SideLoadFromMatrix(contextStdDev);
            b->SetLearningRateMultiplier(0);

            output = builder.PerDimMeanVarNormalization(input, w, b, L"MVNormalizedFeatures");
            input = output;
        }
        if (i == numLayers - 1)
        {
            m_outputLayerSize = wts.GetNumRows();
        }
        wstring nameOfW = msra::strfun::wstrprintf(L"W%d", i);
        wstring nameOfB = msra::strfun::wstrprintf(L"B%d", i);
        wstring nameOfPrevH = msra::strfun::wstrprintf(L"H%d", i);
        wstring nameOfTimes = nameOfW + L"*" + nameOfPrevH;
        wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
        wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);

        w = builder.CreateLearnableParameter(nameOfW, wts.GetNumRows(), wts.GetNumCols());
        m_net->InitLearnableParameters(w, L"fixedValue", 0); // follow protocol
        w->Value().SetValue(wts); // and overwrite

        b = builder.CreateLearnableParameter(nameOfB, bias.GetNumRows(), 1);
        m_net->InitLearnableParameters(b, L"fixedValue", 0); // follow protocol
        b->Value().SetValue(bias); // and overwrite

        if (layerType == "perceptron")
        {
            fprintf(stderr, "DBN: Reading (%lu x %lu) perceptron\n", (unsigned long) wts.GetNumRows(), (unsigned long) wts.GetNumCols());
            output = builder.Plus(builder.Times(w, input, 1, nameOfTimes), b, nameOfPlus);
        }
        else if (layerType == "rbmisalinearbernoulli")
        {
            fprintf(stderr, "DBN: Reading (%lu x %lu) linear layer\n", (unsigned long) wts.GetNumRows(), (unsigned long) wts.GetNumCols());
            output = builder.Plus(builder.Times(w, input, 1, nameOfTimes), b, nameOfPlus);
        }
        else // assume rbmbernoullibernoulli
        {
            fprintf(stderr, "DBN: Reading (%lu x %lu) non-linear layer\n", (unsigned long) wts.GetNumRows(), (unsigned long) wts.GetNumCols());
            output = ApplyNonlinearFunction(builder.Plus(builder.Times(w, input, 1, nameOfTimes), b, nameOfPlus), i, nameOfH);
            if (m_addDropoutNodes)
                input = builder.Dropout(output, L"Drop" + nameOfH);
        }

        input = output;
    }

    if (!CheckDbnTag(fstream, "ENET"))
        RuntimeError("Error reading DBN file - did not find expected tag ENET\n");
    // size_t outputLayerSize =  m_layerSizes[m_layerSizes.size()-1];

    label = builder.CreateInputNode(L"labels", m_outputLayerSize);

    if (layerType == "perceptron") // complete network
    {
        m_net->RenameNode(output, L"HLast");

        Matrix<ElemType> priorVals = ReadMatrixFromDbnFile(fstream, std::string("Pu"));
        assert(priorVals.GetNumCols() == 1 && priorVals.GetNumRows() == m_outputLayerSize);

        prior = builder.Mean(label, L"Prior");
        static_pointer_cast<PreComputedNodeBase<ElemType>>(prior)->SideLoadFromMatrix(priorVals);
        prior->SetLearningRateMultiplier(0);
    }
    else // pretrained network - need to add output layer, initialize
    {
        size_t outputLayerSize = 0;
        if (this->m_outputLayerSize >= 0)
            outputLayerSize = this->m_outputLayerSize;
        else if (m_layerSizes.size() > 0)
            outputLayerSize = m_layerSizes[m_layerSizes.size() - 1];
        else
            RuntimeError("Output layer size must be specified when converting a pre-trained network, use outputLayerSize=");

        size_t penultimateSize = input->GetSampleMatrixNumRows();

        wstring nameOfW = msra::strfun::wstrprintf(L"W%d", i);
        wstring nameOfB = msra::strfun::wstrprintf(L"B%d", i);
        wstring nameOfPrevH = msra::strfun::wstrprintf(L"H%d", i);
        wstring nameOfTimes = nameOfW + L"*" + nameOfPrevH;
        wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
        wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);

        w = builder.CreateLearnableParameter(nameOfW, outputLayerSize, penultimateSize);
        m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
        b = builder.CreateLearnableParameter(nameOfB, outputLayerSize, 1);
        m_net->InitLearnableParameters(b, L"fixedValue", 0);
        output = builder.Plus(builder.Times(w, input, 1, nameOfTimes), b, nameOfPlus);
        m_net->RenameNode(output, L"HLast");

        if (m_needPrior)
        {
            Matrix<ElemType> zeros = Matrix<ElemType>::Zeros(outputLayerSize, 1, m_deviceId);
            prior = builder.Mean(label, L"Prior");
            static_pointer_cast<PreComputedNodeBase<ElemType>>(prior)->MarkComputed(false);
            prior->Value().SetValue(zeros);
        }
    }

    AddTrainAndEvalCriterionNodes(output, label);

    if (layerType == "perceptron" || m_needPrior)
    {
        input = builder.Log(prior, L"LogOfPrior");

        // the following two lines are needed only if true probabilities are needed
        // output = builder.Softmax(output);
        // output = builder.Log(output);

        scaledLogLikelihood = builder.CreateComputationNode(OperationNameOf(MinusNode), L"ScaledLogLikelihood");
        scaledLogLikelihood->AttachInputs({ output, input });
        m_net->AddToNodeGroup(L"output", scaledLogLikelihood);
    }
    else
    {
        m_net->AddToNodeGroup(L"output", output);
    }

    if (!CheckDbnTag(fstream, "EDBN"))
        RuntimeError("Error reading DBN file - did not find expected tag ENET\n");

    // perform necessary validation and post-processing
    m_net->CompileNetwork();

    return m_net;
}

// layer is 0 based
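// Applies the nonlinearity configured for the given layer (m_nonLinearFunctions[layer]) to 'input':
// Sigmoid, RectifiedLinear, Tanh, or none (pass-through, optionally renaming the node).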
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> SimpleNetworkBuilder<ElemType>::ApplyNonlinearFunction(ComputationNodePtr input, const size_t layer, const std::wstring nodeName)
{
    ComputationNetworkBuilder<ElemType> builder(*m_net);

    ComputationNodePtr output;
    wstring nonLinearFunction = m_nonLinearFunctions[layer];
    if (nonLinearFunction == OperationNameOf(SigmoidNode))
        output = builder.Sigmoid(input, nodeName);
    else if (nonLinearFunction == OperationNameOf(RectifiedLinearNode))
        output = builder.RectifiedLinear(input, nodeName);
    else if (nonLinearFunction == OperationNameOf(TanhNode))
        output = builder.Tanh(input, nodeName);
    else if (nonLinearFunction == L"None" || nonLinearFunction == L"none" || nonLinearFunction == L"")
    {
        output = input; // linear layer
        if (nodeName != L"")
            m_net->RenameNode(output, nodeName);
    }
    else
        LogicError("Unsupported nonlinear function.");

    return output;
}

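// Creates the training-criterion node (per m_trainCriterion) and, if the evaluation criterion differs,
// a separate evaluation node (per m_evalCriterion). If 'matrix' is given, it is applied to 'input' on demand
// (tinput = matrix * input) for criteria that expect the projected output; 'clspostprob' is used by the
// class-based and NCE criteria, and 'trans' by the CRF criterion. Returns the evaluation node if one was
// created, otherwise the training-criterion node.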
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> SimpleNetworkBuilder<ElemType>::AddTrainAndEvalCriterionNodes(ComputationNodePtr input, ComputationNodePtr label, ComputationNodePtr matrix,
                                                                                                    const std::wstring trainNodeName, const std::wstring evalNodeName,
                                                                                                    ComputationNodePtr clspostprob, ComputationNodePtr trans)
{
    ComputationNetworkBuilder<ElemType> builder(*m_net);

    m_net->AddToNodeGroup(L"label", label);

    ComputationNodePtr output;

    // BUGBUG: Use of 'tinput' conflicts with some criteria that expect their top weight matrix transposed, e.g. [200 x 10000] with vocab size of 10000 instead of [10000 x 200].
    //         E.g. ClassCrossEntropyWithSoftmax uses this, but that is incompatible with 'tinput.' Now 'tinput' is computed on demand, but if a criterion node is
    //         used that needs it, we will still have this incompatibility.
    ComputationNodePtr tinput = input;

    switch (m_trainCriterion)
    {
    case TrainingCriterion::CrossEntropyWithSoftmax:
        if (matrix != nullptr)
            tinput = builder.Times(matrix, input);
        output = builder.CrossEntropyWithSoftmax(label, tinput, (trainNodeName == L"") ? L"CrossEntropyWithSoftmax" : trainNodeName);
        break;
    case TrainingCriterion::SquareError:
        if (matrix != nullptr)
            tinput = builder.Times(matrix, input);
        output = builder.SquareError(label, tinput, (trainNodeName == L"") ? L"SquareError" : trainNodeName);
        break;
    case TrainingCriterion::Logistic:
        if (matrix != nullptr)
            tinput = builder.Times(matrix, input);
        output = builder.Logistic(label, tinput, (trainNodeName == L"") ? L"Logistic" : trainNodeName);
        break;
#ifdef COMING_SOON
    case TrainingCriterion::CRF:
        assert(trans != nullptr);
        output = builder.CRF(label, input, trans, (trainNodeName == L"") ? L"CRF" : trainNodeName);
        break;
#endif
    case TrainingCriterion::ClassCrossEntropyWithSoftmax:
        output = builder.ClassCrossEntropyWithSoftmax(label, input, matrix, clspostprob, (trainNodeName == L"") ? L"ClassCrossEntropyWithSoftmax" : trainNodeName);
        break;
    case TrainingCriterion::NCECrossEntropyWithSoftmax:
        output = builder.NoiseContrastiveEstimation(label, input, matrix, clspostprob, (trainNodeName == L"") ? L"NoiseContrastiveEstimationNode" : trainNodeName);
        break;
    default:
        LogicError("Unsupported training criterion.");
    }
    m_net->AddToNodeGroup(L"criterion", output);

    if (!((m_evalCriterion == EvalCriterion::CrossEntropyWithSoftmax && m_trainCriterion == TrainingCriterion::CrossEntropyWithSoftmax) ||
          (m_evalCriterion == EvalCriterion::SquareError && m_trainCriterion == TrainingCriterion::SquareError) ||
          (m_evalCriterion == EvalCriterion::Logistic && m_trainCriterion == TrainingCriterion::Logistic) ||
          (m_evalCriterion == EvalCriterion::CRF && m_trainCriterion == TrainingCriterion::CRF) ||
          (m_evalCriterion == EvalCriterion::ClassCrossEntropyWithSoftmax && m_trainCriterion == TrainingCriterion::ClassCrossEntropyWithSoftmax) ||
          (m_evalCriterion == EvalCriterion::NCECrossEntropyWithSoftmax && m_trainCriterion == TrainingCriterion::NCECrossEntropyWithSoftmax)))
    {
        switch (m_evalCriterion)
        {
        case EvalCriterion::CrossEntropyWithSoftmax:
            if (matrix != nullptr && tinput == input)
                tinput = builder.Times(matrix, input);
            // output = builder.CrossEntropyWithSoftmax(label, tinput, (evalNodeName == L"")?L"EvalCrossEntropyWithSoftmax":evalNodeName);
            output = builder.CrossEntropyWithSoftmax(label, tinput, (evalNodeName == L"") ? L"CrossEntropyWithSoftmax" : evalNodeName);
            break;
        case EvalCriterion::ClassCrossEntropyWithSoftmax:
            // output = builder.ClassCrossEntropyWithSoftmax(label, input, matrix, clspostprob, (evalNodeName == L"") ? L"EvalClassCrossEntropyWithSoftmax" : evalNodeName);
            output = builder.ClassCrossEntropyWithSoftmax(label, input, matrix, clspostprob, (evalNodeName == L"") ? L"ClassCrossEntropyWithSoftmax" : evalNodeName);
            break;
        case EvalCriterion::NCECrossEntropyWithSoftmax:
            output = builder.NoiseContrastiveEstimation(label, input, matrix, clspostprob, (evalNodeName == L"") ? L"NoiseContrastiveEstimationNode" : evalNodeName);
            break;
        case EvalCriterion::SquareError:
            if (matrix != nullptr && tinput == input)
                tinput = builder.Times(matrix, input);
            // output = builder.SquareError(label, tinput, (evalNodeName == L"")?L"EvalSquareError":evalNodeName);
            output = builder.SquareError(label, tinput, (evalNodeName == L"") ? L"SquareError" : evalNodeName);
            break;
        case EvalCriterion::Logistic:
            if (matrix != nullptr && tinput == input)
                tinput = builder.Times(matrix, input);
            // output = builder.SquareError(label, tinput, (evalNodeName == L"")?L"EvalSquareError":evalNodeName);
            output = builder.Logistic(label, tinput, (evalNodeName == L"") ? L"Logistic" : evalNodeName);
            break;
        case EvalCriterion::ClassificationError:
            if (matrix != nullptr && tinput == input)
                tinput = builder.Times(matrix, input);
            output = builder.ClassificationError(label, tinput, (evalNodeName == L"") ? L"EvalClassificationError" : evalNodeName);
            break;
#ifdef COMING_SOON
        case EvalCriterion::CRF:
            assert(trans != nullptr);
            if (matrix != nullptr && tinput == input)
                tinput = builder.Times(matrix, input);
            output = builder.CRF(label, tinput, trans, (evalNodeName == L"") ? L"EvalCRF" : evalNodeName);
            break;
#endif
        default:
            LogicError("Unsupported training criterion.");
        }
        output->SetLearningRateMultiplier(0);
    }

    m_net->AddToNodeGroup(L"evaluation", output);

    return output;
}

template class SimpleNetworkBuilder<float>;
template class SimpleNetworkBuilder<double>;

// -----------------------------------------------------------------------
// and some helpers
// -----------------------------------------------------------------------

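// map case-insensitive config strings to the criterion enums used above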
TrainingCriterion ParseTrainingCriterionString(wstring s)
{
    if      (EqualCI(s, L"crossEntropyWithSoftmax"))      return TrainingCriterion::CrossEntropyWithSoftmax;
    else if (EqualCI(s, L"squareError"))                  return TrainingCriterion::SquareError;
    else if (EqualCI(s, L"logistic"))                     return TrainingCriterion::Logistic;
    else if (EqualCI(s, L"noiseContrastiveEstimation"))   return TrainingCriterion::NCECrossEntropyWithSoftmax;
    // legacy/deprecated
    else if (EqualCI(s, L"classCrossEntropyWithSoftmax")) return TrainingCriterion::ClassCrossEntropyWithSoftmax;
    else if (EqualCI(s, L"sequenceWithSoftmax"))          return TrainingCriterion::SequenceWithSoftmax;
    else if (EqualCI(s, L"latticeSequenceWithSoftmax"))   return TrainingCriterion::LatticeSequenceWithSoftmax;
    else LogicError("trainingCriterion: Invalid trainingCriterion value. Valid values are (crossEntropyWithSoftmax | squareError | logistic | classCrossEntropyWithSoftmax| sequenceWithSoftmax | sequenceWithLattice)");
}

EvalCriterion ParseEvalCriterionString(wstring s)
{
    if      (EqualCI(s, L"classificationError"))          return EvalCriterion::ClassificationError;
    else if (EqualCI(s, L"crossEntropyWithSoftmax"))      return EvalCriterion::CrossEntropyWithSoftmax;
    else if (EqualCI(s, L"logistic"))                     return EvalCriterion::Logistic;
    else if (EqualCI(s, L"noiseContrastiveEstimation"))   return EvalCriterion::NCECrossEntropyWithSoftmax;
    else if (EqualCI(s, L"squareError"))                  return EvalCriterion::SquareError;
    // legacy/deprecated
    else if (EqualCI(s, L"classCrossEntropyWithSoftmax")) return EvalCriterion::ClassCrossEntropyWithSoftmax;
    else if (EqualCI(s, L"sequenceWithSoftmax"))          return EvalCriterion::SequenceWithSoftmax;
    else if (EqualCI(s, L"errorPrediction"))              return EvalCriterion::ClassificationError;
    else LogicError("evalCriterion: Invalid trainingCriterion value. Valid values are (errorPrediction | crossEntropyWithSoftmax | squareError | logistic | sequenceWithSoftmax)");
}

}}}