Content - f2f5bbabf294d90b6cf67b5e209d274a08891d86 - 6f630cb/Documentation/CNTK-TechReport/lyx/CNTKBook_CN_Chapter.lyx

visit type:
Tip revision: d67eba806018248667f135a19386a668d5798e02 authored by Vadim Mazalov on 15 August 2018, 23:12:34 UTC
Remove template definition
Tip revision: d67eba8
CNTKBook_CN_Chapter.lyx
#LyX 2.1 created this file. For more info see http://www.lyx.org/
\lyxformat 474
\begin_document
\begin_header
\textclass extbook
\begin_preamble
\usepackage{algorithm}
\usepackage{algpseudocode}  
\end_preamble
\use_default_options false
\master CNTKBook-master.lyx
\maintain_unincluded_children false
\language english
\language_package default
\inputencoding auto
\fontencoding global
\font_roman default
\font_sans default
\font_typewriter default
\font_math auto
\font_default_family default
\use_non_tex_fonts false
\font_sc false
\font_osf false
\font_sf_scale 100
\font_tt_scale 100
\graphics default
\default_output_format default
\output_sync 0
\bibtex_command default
\index_command default
\paperfontsize 11
\spacing single
\use_hyperref false
\papersize default
\use_geometry false
\use_package amsmath 1
\use_package amssymb 2
\use_package cancel 0
\use_package esint 1
\use_package mathdots 1
\use_package mathtools 0
\use_package mhchem 1
\use_package stackrel 0
\use_package stmaryrd 0
\use_package undertilde 0
\cite_engine basic
\cite_engine_type default
\biblio_style plain
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date false
\justification true
\use_refstyle 0
\index Index
\shortcut idx
\color #008000
\end_index
\secnumdepth 3
\tocdepth 3
\paragraph_separation indent
\paragraph_indentation default
\quotes_language english
\papercolumns 1
\papersides 1
\paperpagestyle default
\tracking_changes false
\output_changes false
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\end_header

\begin_body

\begin_layout Chapter
Computational Network
\begin_inset Index idx
status open

\begin_layout Plain Layout
Computational Network
\end_layout

\end_inset


\begin_inset CommandInset label
LatexCommand label
name "chap:CN"

\end_inset


\end_layout

\begin_layout Section
Computational Network
\end_layout

\begin_layout Standard
There is a common property in key machine learning models, such as deep
 neural networks (DNNs) 
\begin_inset CommandInset citation
LatexCommand cite
key "CD-DNN-HMM-Trans-Dahl+2012,DNN-SWB-seide+2011,FeatEngInDNN-Seide+2011,DNN-Bottlenec-Yu:2011,DNN-LVSR-Jaitly+2012,DNN-4Groups-Hinton+2012"

\end_inset

, convolutional neural networks (CNNs) 
\begin_inset CommandInset citation
LatexCommand cite
key "CNN-lecun:1995,CNN-FastComputing-chellapilla+2006,CNN-FeatureHierarchy-kavukcuoglu+2010,DCNN-ImageNet-krizhevsky+2012,CNN-ASR-abdel+2012,TransferLearning-DNN-Ciresan+2012,CNN-LVCSR-sainath+2013,DCNN-LVCSR-sainath+2013,CNN-ASR-Abdel+2013,CNN-ASR-deng+2013"

\end_inset

, and recurrent neural networks (RNNs) 
\begin_inset CommandInset citation
LatexCommand cite
key "LSTM-hochreiter:1997,ParseLanguageWithRNN-Socher+2011,GenTextWithRNN-Sutskever+2011,CD-RNN-LM-Mikolov:2012,RNNLMWithLinguisticFeature-Shi+2012"

\end_inset

.
 All these models can be described as a series of computational steps.
 For example, a one-hidden-layer sigmoid neural network can be described
 as the computational steps listed in Algorithm 
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:Computation-Steps-One-Hidden-Layer"

\end_inset

.
 If we know how to compute each step and in which order the steps are computed
 we have an implementation of the neural network.
 This observation suggests that we can unify all these models under the
 framework of computational network (CN
\begin_inset Index idx
status open

\begin_layout Plain Layout
CN
\end_layout

\end_inset

), part of which has been implemented in toolkits such as Theano 
\begin_inset CommandInset citation
LatexCommand cite
key "Theano-bergstra+2010"

\end_inset

, CNTK 
\begin_inset CommandInset citation
LatexCommand cite
key "SGD-CNTK-Guenter+2013"

\end_inset

 and RASR/NN 
\begin_inset CommandInset citation
LatexCommand cite
key "RASR-NN-RWTH-Toolkit-Wiesler+2014"

\end_inset

..
\end_layout

\begin_layout Standard
\begin_inset Float algorithm
wide false
sideways false
status open

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{algorithmic}[1]
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Procedure
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset

OneHiddenLayerNNComputation
\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset


\begin_inset Formula $\mathbf{X}$
\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset


\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
Each column of 
\begin_inset Formula $\mathbf{X}$
\end_inset

 is an observation vector
\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit

\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset

 
\begin_inset Formula $\mathbf{T}^{(1)}\leftarrow\mathbf{W^{\mathrm{(1)}}\mathbf{X}}$
\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset

 
\begin_inset Formula $\mathbf{P}^{(1)}\leftarrow\mathbf{T^{\mathrm{(1)}}}+\mathbf{B}^{(1)}$
\end_inset

 
\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset


\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
Each column of 
\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit

\begin_inset Formula $\mathbf{B}^{(1)}$
\end_inset

 is the bias 
\begin_inset Formula $\mathbf{b}^{(1)}$
\end_inset

 
\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset

 
\begin_inset Formula $\mathbf{S}^{(1)}\leftarrow\sigma\left(\mathbf{P}^{(1)}\right)$
\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset


\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $\sigma\left(.\right)$
\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
 is the sigmoid function applied element-wise
\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset

 
\begin_inset Formula $\mathbf{T}^{(2)}\leftarrow\mathbf{W\mathbf{^{\mathrm{(2)}}S^{\mathrm{(1)}}}}$
\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset

 
\begin_inset Formula $\mathbf{P}^{(2)}\leftarrow\mathbf{T}^{(2)}+\mathbf{B}^{(2)}$
\end_inset

 
\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset


\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
Each column of 
\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit

\begin_inset Formula $\mathbf{B}^{(2)}$
\end_inset

 is the bias 
\begin_inset Formula $\mathbf{b}^{(2)}$
\end_inset

 
\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset


\begin_inset Formula $\mathbf{O}\leftarrow\textnormal{softmax}\left(\mathbf{P}^{(2)}\right)$
\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit

\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset


\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
Apply softmax column-wise
\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
 to get output 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $\mathbf{O}$
\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit

\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
EndProcedure
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
end{algorithmic}
\end_layout

\end_inset

 
\begin_inset Caption Standard

\begin_layout Plain Layout
Computational Steps Involved in an One-Hidden-Layer Sigmoid Neural Network
\begin_inset CommandInset label
LatexCommand label
name "alg:Computation-Steps-One-Hidden-Layer"

\end_inset


\end_layout

\end_inset


\end_layout

\end_inset


\end_layout

\begin_layout Standard
A computational network
\begin_inset Index idx
status open

\begin_layout Plain Layout
computational network
\end_layout

\end_inset

 is a directed graph 
\begin_inset Formula $\left\{ \mathbf{\mathbb{V}},\mathbf{\mathbb{E}}\right\} $
\end_inset

, where 
\begin_inset Formula $\mathbf{\mathbb{V}}$
\end_inset

 is a set of vertices and 
\begin_inset Formula $\mathbf{\mathbb{E}}$
\end_inset

 is a set of directed edges.
 Each vertex, called a computation node
\begin_inset Index idx
status open

\begin_layout Plain Layout
computation node
\end_layout

\end_inset

, represents a computation.
 Vertices with edges toward a computation node are the operands of the associate
d computation and sometimes called the children of the computation node.
 Here the order of operands matters for some operations such as matrix multiplic
ation.
 Leaf nodes do not have children and are used to represent input values
 or model parameters that are not result of some computation.
 A CN can be easily represented as a set of computation nodes 
\begin_inset Formula $n$
\end_inset

 and their children 
\begin_inset Formula $\left\{ n:c_{1},\cdots,c_{K_{n}}\right\} $
\end_inset

, where 
\begin_inset Formula $K_{n}$
\end_inset

 is the number of children of node 
\begin_inset Formula $n$
\end_inset

.
 For leaf nodes 
\begin_inset Formula $K_{n}=0$
\end_inset

.
 Each computation node knows how to compute the value of itself given the
 input operands (children).
\end_layout

\begin_layout Standard
Figure 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CN-1HiddenNN"

\end_inset

 is the one-hidden-layer sigmoid neural network of Algorithm 
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:Computation-Steps-One-Hidden-Layer"

\end_inset

 represented as a CN in which each node 
\begin_inset Formula $n$
\end_inset

 is identified by a 
\begin_inset Formula $\{nodename:operatortype\}$
\end_inset

 pair and takes its ordered children as the operator's inputs.
 From the figure, we can observe that in CN there is no concept of layers.
 Instead, a computation node is the basic element of operations.
 This makes the description of a simple model such as DNN more cumbersome,
 but this can be alleviated by grouping computation nodes together with
 macros.
 In return, CN provides us with greater flexibility in describing arbitrary
 networks and allows us to build almost all models we are interested in
 within the same unified framework.
 For example, we can easily modify the network illustrated in Figure 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CN-1HiddenNN"

\end_inset

 to use rectified linear unit instead of a sigmoid nonlinearity.
 We can also build a network that has two input nodes as shown in Figure
 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CN-2Inputs"

\end_inset

 or a network with shared model parameters as shown in Figure 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CN-ShareWeights"

\end_inset

.
 
\end_layout

\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open

\begin_layout Plain Layout
\align center
\begin_inset Graphics
	filename ../figures/CN-1HiddenNN.png
	scale 70

\end_inset


\begin_inset Caption Standard

\begin_layout Plain Layout
\begin_inset CommandInset label
LatexCommand label
name "fig:CN-1HiddenNN"

\end_inset

Represent the one-hidden-layer sigmoid neural network of Algorithm 
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:Computation-Steps-One-Hidden-Layer"

\end_inset

 with a computational network
\end_layout

\end_inset


\end_layout

\end_inset


\end_layout

\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open

\begin_layout Plain Layout
\align center
\begin_inset Graphics
	filename ../figures/CN-2Inputs.png
	scale 70

\end_inset


\begin_inset Caption Standard

\begin_layout Plain Layout
\begin_inset CommandInset label
LatexCommand label
name "fig:CN-2Inputs"

\end_inset

A computational network with two input nodes (highlighted).
\end_layout

\end_inset


\end_layout

\end_inset


\end_layout

\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open

\begin_layout Plain Layout
\align center
\begin_inset Graphics
	filename ../figures/CN-ShareWeight.png
	scale 70

\end_inset


\begin_inset Caption Standard

\begin_layout Plain Layout
\begin_inset CommandInset label
LatexCommand label
name "fig:CN-ShareWeights"

\end_inset

A computational network with shared model parameters (highlighted).
 Here we assume the input 
\begin_inset Formula $\mathbf{X}$
\end_inset

, hidden layer 
\begin_inset Formula $\mathbf{S}^{(1)}$
\end_inset

, and output layer 
\begin_inset Formula $\mathbf{O}$
\end_inset

 have the same dimension.
\end_layout

\end_inset


\end_layout

\end_inset


\end_layout

\begin_layout Section
Forward Computation
\begin_inset Index idx
status open

\begin_layout Plain Layout
Forward Computation
\end_layout

\end_inset


\end_layout

\begin_layout Standard
When the model parameters (i.e., weight nodes in Figure 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CN-1HiddenNN"

\end_inset

) are known, we can compute the value of any node given the new input values.
 Unlike in the DNN case, where the computation order can be trivially determined
 as layer-by-layer computation from bottom up, in CN different network structure
 comes with a different computation order.
 When the CN is a directed acyclic graph
\begin_inset Index idx
status open

\begin_layout Plain Layout
directed acyclic graph
\end_layout

\end_inset

 (DAG
\begin_inset Index idx
status open

\begin_layout Plain Layout
DAG
\end_layout

\end_inset

) the computation order can be determined with a depth-first traverse
\begin_inset Index idx
status open

\begin_layout Plain Layout
depth-first traverse
\end_layout

\end_inset

 over the DAG.
 Note that in a DAG there is no directed cycles (i.e., no recurrent loop).
 However, there might be loops if we don't consider edge directions, for
 which Figure 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CN-ShareWeights"

\end_inset

 is an example.
 This is because the same computation node may be a child of several other
 nodes.
 Algorithm 
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:CN-ForwardOrder-DAG"

\end_inset

.
 determines the computation order of a DAG and takes care of this condition.
 Once the order is decided, it will remain the same for all the subsequent
 runs, regardless of the computational environment.
 In other words, this algorithm only needs to be executed per output node
 and then cache the computation order.
 Following the order determined by Algorithm 
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:CN-ForwardOrder-DAG"

\end_inset

, the forward computation
\begin_inset Index idx
status open

\begin_layout Plain Layout
Forward Computation
\end_layout

\end_inset

 of the CN is carried out synchronously.
 The computation of the next node starts only after the computation of the
 previous node has finished.
 It is suitable for environments where single computing device, such as
 one GPGPU or one CPU host, is used, or the CN itself is inherently sequential,
 e.g., when the CN represents a DNN.
\end_layout

\begin_layout Standard
\begin_inset Float algorithm
wide false
sideways false
status open

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{algorithmic}[1]
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Procedure
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset

DecideForwardComputationOrder
\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset


\begin_inset Formula $root$
\end_inset

, 
\begin_inset Formula $visited$
\end_inset

, 
\begin_inset Formula $order$
\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset

Enumerate nodes in the DAG in the depth-first order.
\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset


\begin_inset Formula $visited$
\end_inset

 is initialized as an empty set.
 
\begin_inset Formula $order$
\end_inset

 is initialized as an empty queue
\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
If
\end_layout

\end_inset

 
\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset


\begin_inset Formula $root\notin visited$
\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset


\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
the same node may be a child of several nodes.
\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit

\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
 
\begin_inset Formula $visited\leftarrow visited\cup root$
\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
For
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset

each 
\begin_inset Formula $c\in root.children$
\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset


\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
apply to children recursive
\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
ly
\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset

 call 
\noun on
DecideForwardComputationOrder
\noun default
(
\begin_inset Formula $c$
\end_inset

, 
\begin_inset Formula $visited$
\end_inset

, 
\begin_inset Formula $order$
\end_inset

)
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
EndFor
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset

 
\begin_inset Formula $order\leftarrow order+root$
\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset


\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
Add 
\begin_inset Formula $root$
\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
 to the end of 
\begin_inset Formula $order$
\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
EndIf
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
EndProcedure
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
end{algorithmic}
\end_layout

\end_inset

 
\begin_inset Caption Standard

\begin_layout Plain Layout
Synchronous forward computation of a CN.
 The computation order is determined by a depth-first traverse over the
 DAG.
 
\begin_inset CommandInset label
LatexCommand label
name "alg:CN-ForwardOrder-DAG"

\end_inset


\end_layout

\end_inset


\end_layout

\end_inset


\end_layout

\begin_layout Standard
The forward computation can also be carried out asynchronously with which
 the order of the computation is determined dynamically.
 This can be helpful when the CN has many parallel branches and there are
 more than one computing device to compute these branches in parallel.
 Algorithm 
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:CN-ForwardAsync-DAG"

\end_inset

 shows an algorithm that carries out the forward computation of a CN asynchronou
sly.
 In this algorithm, all the nodes whose children have not been computed
 are in the waiting set and those whose children are computed are in the
 ready set.
 At the beginning, all non-leaf descendants of 
\begin_inset Formula $root$
\end_inset

 are in the waiting set and all leaf descendants  are in the ready set.
 The scheduler picks a node from the ready set based on some policy, removes
 it from the ready set, and dispatches it for computation.
 Popular policies include first-come/first-serve, shortest task first, and
 least data movement.
 When the computation of the node finishes, the system calls the 
\noun on
SignalComplete
\noun default
 method to signal to all its parent nodes.
 If all the children of a node have been computed, the node is moved from
 the waiting set to the ready set.
 The algorithm stops when all nodes are computed.
 Although not explicitly indicated in the Algorithm 
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:CN-ForwardAsync-DAG"

\end_inset

, the 
\noun on
SignalComplete
\noun default
 procedure is called under concurrent threads and should be guarded for
 thread safety.
 This algorithm can be exploited to carry out computation on any DAG instead
 of just a CN.
\end_layout

\begin_layout Standard
\begin_inset Float algorithm
wide false
sideways false
status open

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{algorithmic}[1]
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Procedure
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset

SignalComplete
\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset


\begin_inset Formula $node$
\end_inset

, 
\begin_inset Formula $waiting$
\end_inset

, 
\begin_inset Formula $ready$
\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset

Called when the computation of the 
\begin_inset Formula $node$
\end_inset

 is finished.
 Needs to be thread safe.
\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset


\begin_inset Formula $waiting$
\end_inset

 is initialized to include all non-leaf descendants of 
\begin_inset Formula $root$
\end_inset

.
\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset

 
\begin_inset Formula $ready$
\end_inset

 is initialized to include all leaf descendants of 
\begin_inset Formula $root$
\end_inset

.
\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
For
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset

each 
\begin_inset Formula $p\in node.parents\wedge p\in waiting$
\end_inset

 
\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
 
\begin_inset Formula $p.numFinishedChildren++$
\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
If
\end_layout

\end_inset

 
\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset


\begin_inset Formula $p.numFinishedChildren==p.numChildren$
\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset


\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
all ch
\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
ildren have been computed
\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
 
\begin_inset Formula $waiting\leftarrow waiting-node$
\end_inset


\end_layout

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
 
\begin_inset Formula $ready\leftarrow ready\cup node$
\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
EndIf
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
EndFor
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
EndProcedure
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Procedure
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset

ScheduleComputation
\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset


\begin_inset Formula $ready$
\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset

Called by the job scheduler when a new node is ready or computation resource
 is available.
\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset

 pick
\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
 
\begin_inset Formula $node\in ready$
\end_inset

 according to some policy
\end_layout

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset

 
\begin_inset Formula $ready\leftarrow ready-node$
\end_inset


\end_layout

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset

 dispatch 
\begin_inset Formula $node$
\end_inset

 for computation.
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
EndProcedure
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
end{algorithmic}
\end_layout

\end_inset

 
\begin_inset Caption Standard

\begin_layout Plain Layout
Asynchronous forward computation of a CN.
 A node is moved to the ready set when all its children have been computed.
 A scheduler monitors the ready set and decides where to compute each node
 in the set.
 
\begin_inset CommandInset label
LatexCommand label
name "alg:CN-ForwardAsync-DAG"

\end_inset


\end_layout

\end_inset


\end_layout

\end_inset


\end_layout

\begin_layout Standard
In many cases, we may need to compute the value for a node with changing
 input values.
 To avoid duplicate computation of shared branches, we can add a time stamp
\begin_inset Index idx
status open

\begin_layout Plain Layout
time stamp
\end_layout

\end_inset

 to each node and only recompute the value of the node if at least one of
 the children has newer value.
 This can be easily implemented by updating the time stamp whenever a new
 value is provided or computed, and by excluding nodes whose children is
 older from the actual computation.
\end_layout

\begin_layout Standard
In both Algorithms 
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:CN-ForwardOrder-DAG"

\end_inset

 and 
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:CN-ForwardAsync-DAG"

\end_inset

 each computation node needs to know how to compute its value when the operands
 are known.
 The computation can be as simple as matrix summation or element-wise applicatio
n of sigmoid function or as complex as whatever it may be.
 We will describes the evaluation functions for popular computation node
 types in Section 
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:Typical-Computation-Nodes"

\end_inset

.
\end_layout

\begin_layout Section
Model Training
\begin_inset Index idx
status open

\begin_layout Plain Layout
Model Training
\end_layout

\end_inset


\end_layout

\begin_layout Standard
To train a CN, we need to define a training criterion 
\begin_inset Formula $J$
\end_inset

.
 Popular criteria include cross-entropy
\begin_inset Index idx
status open

\begin_layout Plain Layout
cross-entropy
\end_layout

\end_inset

 (CE) for classification and mean square error
\begin_inset Index idx
status open

\begin_layout Plain Layout
mean square error
\end_layout

\end_inset

 (MSE) for regression.
 Since the training criterion is also a result of some computation, it can
 be represented as a computation node and inserted into the CN.
 Figure 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CN-1HiddenNN+CE"

\end_inset

 illustrates a CN that represents an one-hidden-layer sigmoid neural network
 augmented with a CE training criterion node.
 If the training criterion contains regularization terms the regularization
 terms can also be implemented as computation nodes and the final training
 criterion node is a weighted sum of the main criterion and the regularization
 term.
\end_layout

\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open

\begin_layout Plain Layout
\align center
\begin_inset Graphics
	filename ../figures/CN+TrainingCriterion.png
	scale 70

\end_inset


\begin_inset Caption Standard

\begin_layout Plain Layout
\begin_inset CommandInset label
LatexCommand label
name "fig:CN-1HiddenNN+CE"

\end_inset

The one-hidden-layer sigmoid neural network augmented with a cross entropy
 training criterion node 
\begin_inset Formula $J$
\end_inset

 and a label node 
\begin_inset Formula $L$
\end_inset

.
\end_layout

\end_inset


\end_layout

\end_inset


\end_layout

\begin_layout Standard
The model parameters in a CN can be optimized over a training set
\begin_inset Newline newline
\end_inset

 
\begin_inset Formula $\mathbb{S=}\left\{ \left(\mathbf{x}^{m},\mathbf{y}^{m}\right)|0\leq m<M\right\} $
\end_inset

 using the minibatch based backpropagation (BP) algorithm.
 More specifically, we improve the model parameter 
\begin_inset Formula $\mathbf{W}$
\end_inset

 at each step 
\begin_inset Formula $t+1$
\end_inset

 as
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{equation}
\mathbf{W}_{t+1}\leftarrow\mathbf{\mathbf{W}_{\textrm{t}}}-\varepsilon\triangle\mathbf{W}_{t},\label{eq:CN-weightupdate}
\end{equation}

\end_inset

where 
\begin_inset Formula 
\begin{equation}
\triangle\mathbf{W}_{t}=\frac{1}{M_{b}}\sum_{m=1}^{M_{b}}\nabla_{\mathbf{W}_{t}}J\left(\mathbf{W};\mathbf{x}^{m},\mathbf{y}^{m}\right),\label{eq:CN-delta-W}
\end{equation}

\end_inset

and 
\begin_inset Formula $M_{b}$
\end_inset

 is the minibatch size.
 The key here is the computation of 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $\nabla_{\mathbf{W}_{t}}J\left(\mathbf{W};\mathbf{x}^{m},\mathbf{y}^{m}\right)$
\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
 which we will simplify as 
\begin_inset Formula $\nabla_{\mathbf{W}}^{J}$
\end_inset

.
 Since a CN can have arbitrary structure, we cannot use the exact same BP
 algorithm described in Algorithm 
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:DNN-Backpropagation"

\end_inset

 to compute 
\begin_inset Formula $\nabla_{\mathbf{W}}^{J}$
\end_inset

.
 
\end_layout

\begin_layout Standard
A naive solution to compute 
\begin_inset Formula $\nabla_{\mathbf{W}}^{J}$
\end_inset

 is illustrated in Figure 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CN-NaiveGradient"

\end_inset

, in which 
\begin_inset Formula $\mathbf{W}^{(1)}$
\end_inset

 and 
\begin_inset Formula $\mathbf{W}^{(2)}$
\end_inset

 are model parameters.
 In this solution, each edge is associated with a partial derivative, and
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\nabla_{\mathbf{W^{(1)}}}^{J} & = & \frac{\partial J}{\partial\mathbf{V}^{(1)}}\frac{\partial\mathbf{V}^{(1)}}{\partial\mathbf{V}^{(2)}}\frac{\partial\mathbf{V}^{(2)}}{\partial\mathbf{W^{\mathrm{(1)}}}}+\frac{\partial J}{\partial\mathbf{V}^{(3)}}\frac{\partial\mathbf{V}^{(3)}}{\partial\mathbf{V}^{(4)}}\frac{\partial\mathbf{V}^{(4)}}{\partial\mathbf{W^{\mathrm{(1)}}}}\\
\nabla_{\mathbf{\mathbf{W^{\mathrm{(2)}}}}}^{J} & = & \frac{\partial J}{\partial\mathbf{V}^{(1)}}\frac{\partial\mathbf{V}^{(1)}}{\partial\mathbf{V}^{(2)}}\frac{\partial\mathbf{V}^{(2)}}{\partial\mathbf{W^{\mathrm{(2)}}}}.
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
This solution comes with two major drawbacks.
 First, each derivative can have very high dimension.
 If 
\begin_inset Formula $\mathbf{V}\in\mathbb{R}^{N_{1}\times N_{2}}$
\end_inset

 and 
\begin_inset Formula $\mathbf{W}\in\mathbb{R}^{N_{3}\times N_{4}}$
\end_inset

 then 
\begin_inset Formula $\mathbf{\frac{\partial\mathbf{V}}{\partial\mathbf{W}}}\in\mathbb{R}^{\left(N_{1}\times N_{2}\right)\times\left(N_{3}\times N_{4}\right)}$
\end_inset

.
 This means a large amount of memory is needed to keep the derivatives.
 Second, there are many duplicated computations.
 For example, 
\begin_inset Formula $\frac{\partial J}{\partial\mathbf{V}^{(1)}}\frac{\partial\mathbf{V}^{(1)}}{\partial\mathbf{V}^{(2)}}$
\end_inset

 is computed twice in this example, once for 
\begin_inset Formula $\nabla_{\mathbf{W^{(1)}}}^{J}$
\end_inset

 and once for 
\begin_inset Formula $\nabla_{\mathbf{\mathbf{W^{\mathrm{(2)}}}}}^{J}$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open

\begin_layout Plain Layout
\align center
\begin_inset Graphics
	filename ../figures/CN-NaiveGradient.png
	scale 70

\end_inset


\begin_inset Caption Standard

\begin_layout Plain Layout
\begin_inset CommandInset label
LatexCommand label
name "fig:CN-NaiveGradient"

\end_inset

The naive gradient computation in CN.
 
\begin_inset Formula $\mathbf{W}^{(1)}$
\end_inset

 and 
\begin_inset Formula $\mathbf{W}^{(2)}$
\end_inset

 are model parameters and each edge is associated with a partial derivative.
\end_layout

\end_inset


\end_layout

\end_inset


\end_layout

\begin_layout Standard
Fortunately, there is a much simpler and more efficient approach to compute
 the gradient as illustrated in Figure 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CN-EfficientGradient"

\end_inset

.
 In this approach, each node 
\begin_inset Formula $n$
\end_inset

 keeps two values: the evaluation (forward computation) result 
\begin_inset Formula $\boldsymbol{\mathbf{V}}_{n}$
\end_inset

 and the gradient 
\begin_inset Formula $\nabla_{n}^{J}$
\end_inset

 .
 Note that the training criterion 
\begin_inset Formula $J$
\end_inset

 is always a scalar, If 
\begin_inset Formula $\boldsymbol{\mathbf{V}}_{n}\in\mathbb{R}^{N_{1}\times N_{2}}$
\end_inset

 then 
\begin_inset Formula $\nabla_{n}^{J}\in\mathbb{R}^{N_{1}\times N_{2}}$
\end_inset

.
 This requires significantly less memory than that required in the naive
 solution illustrated in Figure 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CN-NaiveGradient"

\end_inset

.
 This approach also allows for factorizing out the common prefix terms and
 making computation linear in the number of nodes in the graph.
 For example, 
\begin_inset Formula $\frac{\partial J}{\partial\mathbf{V}^{(2)}}$
\end_inset

 is computed only once and used twice when computing 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $\frac{\partial J}{\partial\mathbf{W}^{(1)}}$
\end_inset

 and
\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $\frac{\partial J}{\partial\mathbf{W}^{(2)}}$
\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
 with this approach.
 This is analogous to common subexpression elimination in a conventional
 expression graph, only here the common subexpressions are the parents of
 the nodes, rather than the children.
 
\end_layout

\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open

\begin_layout Plain Layout
\align center
\begin_inset Graphics
	filename ../figures/CN-EfficientGradient.png
	scale 70

\end_inset


\begin_inset Caption Standard

\begin_layout Plain Layout
\begin_inset CommandInset label
LatexCommand label
name "fig:CN-EfficientGradient"

\end_inset

An efficient gradient computation algorithm in CN.
 
\begin_inset Formula $\mathbf{W}^{(1)}$
\end_inset

 and 
\begin_inset Formula $\mathbf{W}^{(2)}$
\end_inset

 are model parameters.
 Each node 
\begin_inset Formula $n$
\end_inset

 stores both the value of the node 
\begin_inset Formula $\boldsymbol{\upsilon}_{n}$
\end_inset

 and the gradient 
\begin_inset Formula $\nabla_{n}^{J}$
\end_inset

 .
 
\end_layout

\end_inset


\end_layout

\end_inset


\end_layout

\begin_layout Standard
Automatic differentiation has been an active research area for decades and
 many techniques have been proposed ( 
\begin_inset CommandInset citation
LatexCommand cite
key "EvaluatingDerivatives-griewank+2008"

\end_inset

).
 A simple recursive algorithm for deciding the gradient computation order
\begin_inset Index idx
status open

\begin_layout Plain Layout
gradient computation order
\end_layout

\end_inset

 so that the gradient computation can be efficiently carried out is shown
 in Algorithm 
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:CN-Gradient"

\end_inset

 to which a similar recursive algorithm for scalar functions has been provided
 
\begin_inset CommandInset citation
LatexCommand cite
key "AutomaticDifferentiation-bischof+1997,SymbolicDifferentiation-guenter2007"

\end_inset

.
 This algorithm assumes that each node has a 
\begin_inset Formula $\mathrm{ComputePartialGradient}(child)$
\end_inset

 function which computes the gradient of the training criterion with regard
 to the node's child 
\begin_inset Formula $child$
\end_inset

 and is called in the order that is decided by the algorithm.
 Before the algorithm is computed, the gradient 
\begin_inset Formula $\nabla_{n}^{J}$
\end_inset

 at each node 
\begin_inset Formula $n$
\end_inset

 is set to 0, the queue 
\begin_inset Formula $order$
\end_inset

 is set to empty, and 
\begin_inset Formula $parentsLeft$
\end_inset

 is set to the number of parents of each node.
 This function is then called on a criterion node that evaluates to a scalar.
 Similar to the forward computation, an asynchronous algorithm can be derived.
 Once the gradient is known, the minibatch based stochastic gradient descent
 (SGD) algorithm  and other training algorithms that depend on gradient
 only can be used to train the model.
\end_layout

\begin_layout Standard
Alternatively, the gradients can be computed by following the reverse order
 of the forward computation and calling each node's parents' 
\begin_inset Formula $\mathrm{ComputePartialGradient}(child)$
\end_inset

 function and passing itself as the 
\emph on
child
\emph default
 argument.
 This approach, however, requires additional book keeping, e.g., keeping pointers
 to a node's parents which can introduce additional overhead when manipulating
 the network architecture.
\end_layout

\begin_layout Standard
\begin_inset Float algorithm
wide false
sideways false
status open

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{algorithmic}[1]
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Procedure
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset

DecideGradientComputationOrder
\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset


\begin_inset Formula $node$
\end_inset

, 
\begin_inset Formula $parentsLeft$
\end_inset

, 
\begin_inset Formula $order$
\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset

Decide the order to compute the gradient at all descendants of 
\begin_inset Formula $node$
\end_inset

.
\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset


\begin_inset Formula $parentsLeft$
\end_inset

 is initialized to the number of parents of each node.
\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset


\begin_inset Formula $order$
\end_inset

 is initialized to an empty queue.
\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
If
\end_layout

\end_inset

 
\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset


\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
IsNotLeaf
\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit

\begin_inset Formula $(node)$
\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset

 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $parentsLeft[node]--$
\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
If
\end_layout

\end_inset

 
\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset


\begin_inset Formula $parentsLeft[node]$
\end_inset

 == 
\begin_inset Formula $0$
\end_inset

 
\begin_inset Formula $\wedge$
\end_inset

 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $node\notin order$
\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit

\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset


\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
All parents have been computed.
\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit

\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset

 
\begin_inset Formula $order\leftarrow order+node$
\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset


\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
Add 
\begin_inset Formula $node$
\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
 to the end of 
\begin_inset Formula $order$
\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
For
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset

each 
\begin_inset Formula $c\in node.children$
\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset

 call 
\noun on
DecideGradientComputationOrder
\noun default
(
\begin_inset Formula $c$
\end_inset

, 
\begin_inset Formula $parentsLeft$
\end_inset

, 
\begin_inset Formula $order$
\end_inset

)
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
EndFor
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
EndIf
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
EndIf
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
EndProcedure
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
end{algorithmic}
\end_layout

\end_inset

 
\begin_inset Caption Standard

\begin_layout Plain Layout
Reverse automatic gradient computation algorithm.
 At the top level 
\begin_inset Formula $node$
\end_inset

 must be a training criterion node that evaluates to a scalar.
\begin_inset CommandInset label
LatexCommand label
name "alg:CN-Gradient"

\end_inset


\end_layout

\end_inset


\end_layout

\end_inset


\end_layout

\begin_layout Standard
In many cases, not all the gradients need to be computed.
 For example, the gradient with regard to the input value is never needed.
 When adapting the model, some of the model parameters don't need to be
 updated and thus it is unnecessary to compute the gradients with regard
 to these parameters.
 We can reduce the gradient computation by keeping a 
\begin_inset Formula $needsGradient$
\end_inset

 flag for each node.
 Once the flags of leaf nodes (either input values or model parameters)
 are known, the flags of the non-leaf nodes can be determined using Algorithm
 
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:CN-needsGradient"

\end_inset

, which is essentially a depth-first traversal over the DAG.
 Since both Algorithms 
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:CN-ForwardOrder-DAG"

\end_inset

 and 
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:CN-needsGradient"

\end_inset

 are essentially depth-first traversal over the DAG and both only need to
 be executed once they may be combined in one function.
\end_layout

\begin_layout Standard
\begin_inset Float algorithm
wide false
sideways false
status open

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{algorithmic}[1]
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Procedure
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset

UpdateneedsGradientFlag
\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset


\begin_inset Formula $root$
\end_inset

, 
\begin_inset Formula $visited$
\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset

Enumerate nodes in the DAG in the depth-first order.
\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset


\begin_inset Formula $visited$
\end_inset

 is initialized as an empty set.
\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
If
\end_layout

\end_inset

 
\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset


\begin_inset Formula $root\notin visited$
\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
Comment
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset


\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
The same node may be a child of several nodes and revisited.
\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit

\begin_inset ERT
status collapsed

\begin_layout Plain Layout

}
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
 
\begin_inset Formula $visited\leftarrow visited\cup root$
\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
For
\end_layout

\end_inset


\begin_inset ERT
status collapsed

\begin_layout Plain Layout

{
\end_layout

\end_inset

each 
\begin_inset Formula $c\in root.children$
\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset

 call 
\noun on
UpdateneedsGradientFlag
\noun default
(
\begin_inset Formula $c$
\end_inset

, 
\begin_inset Formula $visited$
\end_inset

, 
\begin_inset Formula $order$
\end_inset

)
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
If
\end_layout

\end_inset

 
\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset


\begin_inset Formula $IsNotLeaf(node)$
\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
If
\end_layout

\end_inset

 
\begin_inset ERT
status open

\begin_layout Plain Layout

{
\end_layout

\end_inset


\begin_inset Formula $node.AnyChildneedsGradient()$
\end_inset


\begin_inset ERT
status open

\begin_layout Plain Layout

}
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
 
\begin_inset Formula $node.needsGradient\leftarrow true$
\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
Else
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
State
\end_layout

\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
 
\begin_inset Formula $node.needsGradient\leftarrow false$
\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
EndIf
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
EndIf
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
EndFor
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
EndIf
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
EndProcedure
\end_layout

\end_inset

 
\end_layout

\begin_layout Plain Layout
\begin_inset ERT
status collapsed

\begin_layout Plain Layout


\backslash
end{algorithmic}
\end_layout

\end_inset

 
\begin_inset Caption Standard

\begin_layout Plain Layout
Update the 
\begin_inset Formula $needsGradient$
\end_inset

 flag recursively.
 
\begin_inset CommandInset label
LatexCommand label
name "alg:CN-needsGradient"

\end_inset


\end_layout

\end_inset


\end_layout

\end_inset


\end_layout

\begin_layout Standard
Since every instantiation of a CN is task dependent and different, it is
 critical to have a way to check and verify the gradients computed automatically.
 A simple technique to estimate the gradient numerically
\begin_inset Index idx
status open

\begin_layout Plain Layout
gradient checker
\end_layout

\end_inset

 is:
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial w_{ij}}\approx\frac{J\left(w_{ij}+\epsilon\right)-J\left(w_{ij}-\epsilon\right)}{2\epsilon},
\end{equation}

\end_inset

where 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $w_{ij}$
\end_inset

 is the 
\begin_inset Formula $\left(i,j\right)$
\end_inset

-th element of a model parameter 
\begin_inset Formula $\mathbf{W}$
\end_inset

, 
\begin_inset Formula $\epsilon$
\end_inset

 is a small constant typically set to 
\begin_inset Formula $10^{-4}$
\end_inset

, and 
\begin_inset Formula $J\left(w_{ij}+\epsilon\right)$
\end_inset

 and 
\begin_inset Formula $J\left(w_{ij}-\epsilon\right)$
\end_inset

 are the objective function values evaluated with all other parameters fixed
 and 
\begin_inset Formula $w_{ij}$
\end_inset

 changed to 
\begin_inset Formula $w_{ij}+\epsilon$
\end_inset

 and 
\begin_inset Formula $w_{ij}-\epsilon$
\end_inset

, respectively.
 In most cases the numerically estimated gradient and the gradient computed
 from the automatic gradient computation agree to at least 4 significant
 digits if double precision computation is used.
 Note that this technique works well with a large range of 
\begin_inset Formula $\epsilon$
\end_inset

 values, except extremely small values such as 
\begin_inset Formula $10^{-20}$
\end_inset

 which would lead to numerical roundoff errors.
 
\end_layout

\begin_layout Section
Typical Computation
\begin_inset Index idx
status open

\begin_layout Plain Layout
Node! Typical Computation
\end_layout

\end_inset

 Nodes
\begin_inset CommandInset label
LatexCommand label
name "sec:Typical-Computation-Nodes"

\end_inset


\end_layout

\begin_layout Standard
For the forward computation and gradient calculation algorithms described
 above to work we assume that each type of computation node implements a
 function 
\begin_inset Formula $Evaluate$
\end_inset

 to evaluate the value of the node given the values of its child nodes,
 and the function 
\begin_inset Formula $ComputePartialGradient(child)$
\end_inset

 to compute the gradient of the training criterion with regard to the child
 node 
\begin_inset Formula $child$
\end_inset

 given the node value 
\begin_inset Formula $\boldsymbol{\mathbf{V}}_{n}$
\end_inset

 and the gradient 
\begin_inset Formula $\nabla_{n}^{J}$
\end_inset

 of the node 
\begin_inset Formula $n$
\end_inset

 and values of all its child nodes.
 For simplicity we will remove the subscript in the following discussion.
\end_layout

\begin_layout Standard
In this section we introduce the most widely used computation node types
 and the corresponding 
\begin_inset Formula $Evaluate$
\end_inset

 and 
\begin_inset Formula $ComputePartialGradient(child)$
\end_inset

 functions.
 In the following discussion we use symbols listed in Table 
\begin_inset CommandInset ref
LatexCommand ref
reference "tab:CN-Notations"

\end_inset

 to describe the computations.
 We treat each minibatch of input values as a matrix in which each column
 is a sample.
 In all the derivations of the gradients we use the identity
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial x_{ij}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial x_{ij}},
\end{equation}

\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
where 
\begin_inset Formula $\upsilon_{mn}$
\end_inset

 is the 
\begin_inset Formula $\left(m,n\right)$
\end_inset

-th element of the matrix 
\begin_inset Formula $\mathbf{V}$
\end_inset

, and 
\begin_inset Formula $x_{ij}$
\end_inset

 is the 
\begin_inset Formula $\left(i,j\right)$
\end_inset

-th element of matrix 
\begin_inset Formula $\mathbf{X}$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset Float table
wide false
sideways false
status open

\begin_layout Plain Layout
\align center
\begin_inset Tabular
<lyxtabular version="3" rows="17" columns="2">
<features rotate="0" tabularvalignment="middle">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
Symbols
\end_layout

\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
Description
\end_layout

\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula $\lambda$
\end_inset

 
\end_layout

\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
a scalar
\end_layout

\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
\begin_inset Formula $\mathbf{d}$
\end_inset

 
\end_layout

\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
column vector that represents the diagonal of a square matrix
\end_layout

\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
\begin_inset Formula $\mathbf{X}$
\end_inset


\end_layout

\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
matrix of the first operand
\end_layout

\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
\begin_inset Formula $\mathbf{Y}$
\end_inset


\end_layout

\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
matrix of the second operand
\end_layout

\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula $\mathbf{\boldsymbol{V}}$
\end_inset


\end_layout

\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
value of current node
\end_layout

\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
\begin_inset Formula $\nabla_{n}^{J}$
\end_inset

 (or 
\begin_inset Formula $\nabla_{\mathbf{V}}^{J}$
\end_inset

)
\end_layout

\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
gradient of the current node
\end_layout

\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula $\nabla_{\mathbf{X}}^{J}$
\end_inset


\end_layout

\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
gradient of the child node (operand) 
\begin_inset Formula $\mathbf{X}$
\end_inset


\end_layout

\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
\begin_inset Formula $\nabla_{\mathbf{\mathbf{Y}}}^{J}$
\end_inset

 
\end_layout

\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
gradient of the child node (operand) 
\begin_inset Formula $\mathbf{Y}$
\end_inset


\end_layout

\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
\begin_inset Formula $\bullet$
\end_inset

 
\end_layout

\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
element-wise product
\end_layout

\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
\begin_inset Formula $\oslash$
\end_inset


\end_layout

\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
element-wise division
\end_layout

\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
\begin_inset Formula $\circ$
\end_inset

 
\end_layout

\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
inner product of vectors applied on matrices column-wise
\end_layout

\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
\begin_inset Formula $\circledcirc$
\end_inset

 
\end_layout

\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
inner product applied to each row
\end_layout

\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
\begin_inset Formula $\delta\left(.\right)$
\end_inset


\end_layout

\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
Kronecker delta
\end_layout

\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
\begin_inset Formula $\mathbf{1}_{m,n}$
\end_inset


\end_layout

\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
an 
\begin_inset Formula $m\times n$
\end_inset

 matrix with all 1's
\end_layout

\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula $\mathbf{\boldsymbol{X}^{\alpha}}$
\end_inset

 
\end_layout

\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
element-wise power
\end_layout

\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
\begin_inset Formula $\mathrm{vec}\left(\mathbf{X}\right)$
\end_inset

 
\end_layout

\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text

\begin_layout Plain Layout
vector formed by concatenating columns of 
\begin_inset Formula $\mathbf{X}$
\end_inset


\end_layout

\end_inset
</cell>
</row>
</lyxtabular>

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset Caption Standard

\begin_layout Plain Layout
Symbols used in describing the computation nodes
\begin_inset CommandInset label
LatexCommand label
name "tab:CN-Notations"

\end_inset


\end_layout

\end_inset


\end_layout

\end_inset


\end_layout

\begin_layout Subsection
Computation Node Types with No Operand
\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! No Operand
\end_layout

\end_inset


\end_layout

\begin_layout Standard
The values of a computation node that has no operand are given instead of
 computed.
 As a result both 
\begin_inset Index idx
status open

\begin_layout Plain Layout
Evaluate 
\end_layout

\end_inset


\begin_inset Formula $Evaluate$
\end_inset

 and 
\begin_inset Index idx
status open

\begin_layout Plain Layout
ComputePartialGradient 
\end_layout

\end_inset


\begin_inset Formula $ComputePartialGradient(child)$
\end_inset

 functions for these computation node types are empty.
\end_layout

\begin_layout Itemize

\emph on
Parameter
\begin_inset Index idx
status collapsed

\begin_layout Plain Layout
Parameter ! computation node
\end_layout

\end_inset


\noun on
: u
\emph default
\noun default
sed to represent model parameters that need to be saved as part of the model.
 
\end_layout

\begin_layout Itemize

\emph on
InputValue
\emph default
: used to represent features, labels, or control parameters that are provided
 by users at run time.
\end_layout

\begin_layout Subsection
Computation Node Types with One Operand
\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! One Operand
\end_layout

\end_inset


\end_layout

\begin_layout Standard
In these computation node types, 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $Evaluate=\mathbf{\boldsymbol{V}}\left(\mathbf{X}\right)$
\end_inset

 and 
\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit

\begin_inset Formula $ComputePartialGradient(\mathbf{X})=\nabla_{\mathbf{\mathbf{X}}}^{J}$
\end_inset

 
\end_layout

\begin_layout Itemize

\emph on
Negate
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Negate
\end_layout

\end_inset

: reverse the sign of each element in the operand 
\begin_inset Formula $\mathbf{X}$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{V}}\left(\mathbf{X}\right) & \leftarrow & -\mathbf{X}\\
\nabla_{\mathbf{\mathbf{X}}}^{J} & \leftarrow & \nabla_{\mathbf{\mathbf{X}}}^{J}-\mathbf{\nabla_{n}^{\mathit{J}}}.
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The gradient can be derived by observing
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\begin{cases}
-1 & m=i\wedge n=j\\
0 & else
\end{cases}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial x_{ij}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=-\frac{\partial J}{\partial\upsilon{}_{ij}}.
\end{equation}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
Sigmoid
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Sigmoid
\end_layout

\end_inset

: apply the sigmoid function element-wise to the operand 
\begin_inset Formula $\mathbf{X}$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{V}}\left(\mathbf{X}\right) & \leftarrow & \frac{1}{1+e^{-\mathbf{X}}}\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}}\bullet\left[\mathbf{\boldsymbol{V}}\bullet\left(1-\mathbf{\boldsymbol{V}}\right)\right].
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The gradient can be derived by observing
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\begin{cases}
\upsilon_{ij}\left(1-\upsilon_{ij}\right) & m=i\wedge n=j\\
0 & else
\end{cases}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial x_{ij}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\frac{\partial J}{\partial\upsilon{}_{ij}}\upsilon_{ij}\left(1-\upsilon_{ij}\right).
\end{equation}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
Tanh
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Tanh
\end_layout

\end_inset

: apply the tanh function element-wise to the operand 
\begin_inset Formula $\mathbf{X}$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{V}}\left(\mathbf{X}\right) & \leftarrow & \frac{e^{\mathbf{X}}-e^{-\mathbf{X}}}{e^{\mathbf{X}}+e^{-\mathbf{X}}}\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}}\bullet\left(1-\mathbf{\boldsymbol{V}}\bullet\mathbf{\boldsymbol{V}}\right).
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The gradient can be derived by observing
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\begin{cases}
1-\upsilon_{ij}^{2} & m=i\wedge n=j\\
0 & else
\end{cases}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial x_{ij}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\frac{\partial J}{\partial\upsilon{}_{ij}}\left(1-\upsilon_{ij}^{2}\right)
\end{equation}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
ReLU
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
ReLU
\end_layout

\end_inset

: apply the rectified linear operation element-wise to the operand 
\begin_inset Formula $\mathbf{X}$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{V}}\left(\mathbf{X}\right) & \leftarrow & \max\left(0,\mathbf{X}\right)\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}}\bullet\delta\left(\mathbf{X}>0\right).
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The gradient can be derived by observing
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\begin{cases}
\delta\left(x_{ij}>0\right) & m=i\wedge n=j\\
0 & else
\end{cases}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
we have
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial x_{ij}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\frac{\partial J}{\partial\upsilon{}_{ij}}\delta\left(x_{ij}>0\right).
\end{equation}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
Log
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Log
\end_layout

\end_inset

: apply the log function element-wise to the operand 
\begin_inset Formula $\mathbf{X}$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{V}}\left(\mathbf{X}\right) & \leftarrow & \log\left(\mathbf{X}\right)\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}}\bullet\frac{1}{\mathbf{X}}.
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The gradient can be derived by observing
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\begin{cases}
\frac{1}{x_{ij}} & m=i\wedge n=j\\
0 & else
\end{cases}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial x_{ij}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\frac{\partial J}{\partial\upsilon{}_{ij}}\frac{1}{x_{ij}}.
\end{equation}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
Exp
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Exp
\end_layout

\end_inset

: apply the exponent function element-wise to the operand 
\begin_inset Formula $\mathbf{X}$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{V}}\left(\mathbf{X}\right) & \leftarrow & \exp\left(\mathbf{X}\right)\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}}\bullet\mathbf{\boldsymbol{V}}.
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The gradient can be derived by observing
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\begin{cases}
\upsilon_{ij} & m=i\wedge n=j\\
0 & else
\end{cases}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
we have
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial x_{ij}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\frac{\partial J}{\partial\upsilon{}_{ij}}\upsilon_{ij}.
\end{equation}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
Softmax
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Softmax
\end_layout

\end_inset

: apply the softmax function column-wise to the operand 
\begin_inset Formula $\mathbf{X}$
\end_inset

.
 Each column is treated as a separate sample.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
m_{j}\left(\mathbf{X}\right) & \leftarrow & \max_{i}x_{ij}\\
e_{ij}\left(\mathbf{X}\right) & \leftarrow & e^{x_{ij-m_{j}\left(\mathbf{X}\right)}}\\
s_{j}\left(\mathbf{X}\right) & \leftarrow & \sum_{i}e_{ij}\left(\mathbf{X}\right)\\
\upsilon_{ij}\left(\mathbf{X}\right) & \leftarrow & \frac{e_{ij}\left(\mathbf{X}\right)}{s_{j}\left(\mathbf{X}\right)}\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+\left[\mathbf{\nabla_{n}^{\mathit{J}}}-\mathbf{\nabla_{n}^{\mathit{J}}}\circ\mathbf{\boldsymbol{V}}\right]\bullet\mathbf{\boldsymbol{V}}.
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The gradient can be derived by observing
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\begin{cases}
\upsilon_{ij}\left(1-\upsilon_{ij}\right) & m=i\wedge n=j\\
-\upsilon_{mj}\upsilon_{ij} & m\neq i\wedge n=j\\
0 & else
\end{cases}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
we have
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial x_{ij}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\left(\frac{\partial J}{\partial\upsilon{}_{ij}}-\sum_{m}\frac{\partial J}{\partial\upsilon_{mj}}\upsilon_{mj}\right)\upsilon_{ij}.
\end{equation}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
LogSoftmax
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
LogSoftmax
\end_layout

\end_inset

: apply the log of softmax function column-wise to the operand 
\begin_inset Formula $\mathbf{X}$
\end_inset

.
 Each column is treated as a separate sample.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
m_{j}\left(\mathbf{X}\right) & \leftarrow & \max_{i}x_{ij}\\
e_{ij}\left(\mathbf{X}\right) & \leftarrow & e^{x_{ij-m_{j}\left(\mathbf{X}\right)}}\\
s_{j}\left(\mathbf{X}\right) & \leftarrow & \sum_{i}e_{ij}\left(\mathbf{X}\right)\\
\upsilon_{ij}\left(\mathbf{X}\right) & \leftarrow log & \frac{e_{ij}\left(\mathbf{X}\right)}{s_{j}\left(\mathbf{X}\right)}\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+\left[\mathbf{\nabla_{n}^{\mathit{J}}}-exp\left(\mathbf{\boldsymbol{V}}\right)Diag\left(VectorSum\left(\mathbf{\nabla_{n}^{\mathit{J}}}\right)\right)\right],
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
where 
\begin_inset Formula $VectorSum\left(\mathbf{.}\right)$
\end_inset

 sums each column vector, and 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $Diag\left(\mathbf{x}\right)$
\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
 is a diagonal matrix formed by vector 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $\mathbf{x}$
\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
.
 The gradient can be derived by observing
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\begin{cases}
1-exp\left(\upsilon_{ij}\right) & m=i\wedge n=j\\
-exp\left(\upsilon_{ij}\right) & m\neq i\wedge n=j\\
0 & else
\end{cases}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
we have
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial x_{ij}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\left(\frac{\partial J}{\partial\upsilon{}_{ij}}-\sum_{m}\frac{\partial J}{\partial\upsilon_{mj}}exp\left(\upsilon_{ij}\right)\right).
\end{equation}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
Sum
\emph default
Elements
\begin_inset Index idx
status open

\begin_layout Plain Layout
SumElements
\end_layout

\end_inset

: sum over all elements in the operand 
\begin_inset Formula $\mathbf{X}$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{\upsilon}}\left(\mathbf{X}\right) & \leftarrow & \sum_{i,j}x_{ij}\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}}.
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The gradient can be derived by noting that 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $\upsilon$
\end_inset

 and 
\begin_inset Formula $\mathbf{\nabla_{n}^{\mathit{J}}}$
\end_inset

 are scalars,
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon}{\partial x_{ij}}=1
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial x_{ij}}=\frac{\partial J}{\partial\upsilon}\frac{\partial\upsilon}{\partial x_{ij}}=\frac{\partial J}{\partial\upsilon}.
\end{equation}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
L1Norm
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
L1Norm
\end_layout

\end_inset

: take the matrix 
\begin_inset Formula $L_{1}$
\end_inset

 norm of the operand X.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{\upsilon}}\left(\mathbf{X}\right) & \leftarrow & \sum_{i,j}|x_{ij}|\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}}\mathrm{sgn}\left(\mathbf{X}\right).
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The gradient can be derived by noting that 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $\upsilon$
\end_inset

 and 
\begin_inset Formula $\mathbf{\nabla_{n}^{\mathit{J}}}$
\end_inset

 are scalars,
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon}{\partial x_{ij}}=\mathrm{sgn}\left(x_{ij}\right)
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial x_{ij}}=\frac{\partial J}{\partial\upsilon}\frac{\partial\upsilon}{\partial x_{ij}}=\frac{\partial J}{\partial\upsilon}\mathrm{sgn}\left(x_{ij}\right).
\end{equation}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
L2Norm
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
L2Norm
\end_layout

\end_inset

: take the matrix 
\begin_inset Formula $L_{2}$
\end_inset

 norm (Frobenius norm)of the operand 
\begin_inset Formula $\mathbf{X}$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{\mathit{\upsilon}}}\left(\mathbf{X}\right) & \leftarrow & \sqrt{\sum_{i,j}\left(x_{ij}\right)^{2}}\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+\frac{1}{\upsilon}\mathbf{\nabla_{n}^{\mathit{J}}}\mathbf{X}.
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The gradient can be derived by noting that 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $\upsilon$
\end_inset

 and 
\begin_inset Formula $\mathbf{\nabla_{n}^{\mathit{J}}}$
\end_inset

 are scalars,
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon}{\partial x_{ij}}=\frac{x_{ij}}{\upsilon}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial x_{ij}}=\frac{\partial J}{\partial\upsilon}\frac{\partial\upsilon}{\partial x_{ij}}=\frac{1}{\upsilon}\frac{\partial J}{\partial\upsilon}x_{ij}.
\end{equation}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
\begin_inset Note Comment
status open

\begin_layout Itemize

\emph on
TimeReverse 
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
TimeReverse
\end_layout

\end_inset

: does time reverse operation on the input matrix 
\begin_inset Formula $\mathbf{X}$
\end_inset

.
\end_layout

\begin_layout Plain Layout
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{\mathit{\upsilon}}}\left(\mathbf{X}(:,1:T\right) & \leftarrow & \mathbf{\boldsymbol{\mathit{\upsilon}}}\left(\mathbf{X}(:,T:-1:1\right)\\
\nabla_{\mathbf{X}}^{J}(:,T:-1:1) & \leftarrow & \mathbf{\nabla_{n}^{\mathit{J}}}\mathbf{X(:,}1:T\mathbf{)}.
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Plain Layout

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
The time reverse node is usually used for bi-directional model.
 To build a bi-directional, firstly use a TimeReverse node to do time reverse
 operation on the input.
 Then, do process on the processed input.
 Finally, use another TimeReverse node on the output of the processed data.
 
\end_layout

\end_inset


\end_layout

\begin_layout Subsection
Computation Node Types with 
\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! Two Operands
\end_layout

\end_inset

Two Operands
\end_layout

\begin_layout Standard
In these computation node types, 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $Evaluate=\mathbf{\boldsymbol{V}}\left(a,\mathbf{Y}\right)$
\end_inset

, where 
\begin_inset Formula $a$
\end_inset

 can be 
\begin_inset Formula $\mathbf{X}$
\end_inset

, 
\begin_inset Formula $\lambda$
\end_inset

 or 
\begin_inset Formula $\mathbf{d}$
\end_inset

, and 
\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit

\begin_inset Formula $ComputePartialGradient(\mathbf{b})=\nabla_{\mathbf{b}}^{J}$
\end_inset

 where 
\begin_inset Formula $\mathbf{b}$
\end_inset

 can be 
\begin_inset Formula $ $
\end_inset


\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $\mathbf{X}$
\end_inset

, 
\begin_inset Formula $\mathbf{Y}$
\end_inset

 or 
\begin_inset Formula $\mathbf{d}$
\end_inset

.
\end_layout

\begin_layout Itemize

\emph on
Scale
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! Scale
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
Scale
\end_layout

\end_inset

: scale each element of 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $\mathbf{\mathbf{Y}}$
\end_inset

 by 
\begin_inset Formula $\lambda$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{V}}\left(\lambda,\mathbf{\mathbf{Y}}\right) & \leftarrow & \lambda\mathbf{Y}\\
\nabla_{\mathbf{\lambda}}^{J} & \leftarrow & \nabla_{\mathbf{\lambda}}^{J}+\mathrm{vec}\left(\mathbf{\mathbf{\nabla_{n}^{\mathit{J}}}}\right)\circ\mathrm{vec}\left(\mathbf{\mathbf{Y}}\right)\\
\nabla_{\mathbf{Y}}^{J} & \leftarrow & \nabla_{\mathbf{Y}}^{J}+\lambda\mathbf{\nabla_{n}^{\mathit{J}}}.
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The gradient 
\begin_inset Formula $\nabla_{\mathbf{\lambda}}^{J}$
\end_inset

 can be derived by observing
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial\lambda}=y_{mn}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial\lambda}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial\lambda}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}y_{mn}.
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
Similarly to derive the gradient 
\begin_inset Formula $\nabla_{\mathbf{y}}^{J}$
\end_inset

, we note that
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial y_{ij}}=\begin{cases}
\lambda & m=i\wedge n=j\\
0 & else
\end{cases}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and get
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial y_{ij}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial y_{ij}}=\lambda\frac{\partial J}{\partial\upsilon{}_{ij}}
\end{equation}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
Times
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! Times
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
Times
\end_layout

\end_inset

: matrix product of operands 
\begin_inset Formula $\mathbf{X}$
\end_inset

 and 
\begin_inset Formula $\mathbf{Y}$
\end_inset

.
 Must satisfy 
\begin_inset Formula $\mathbf{X}.cols=\mathbf{Y}.rows$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{V}}\left(\mathbf{X},\mathbf{Y}\right) & \leftarrow & \mathbf{XY}\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+\mathbf{\mathbf{\nabla_{n}^{\mathit{J}}}}\mathbf{\mathbf{Y}}^{T}\\
\nabla_{\mathbf{\mathbf{Y}}}^{J} & \leftarrow & \nabla_{\mathbf{Y}}^{J}+\mathbf{\mathbf{X}}^{T}\mathbf{\nabla_{n}^{\mathit{J}}}.
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The gradient 
\begin_inset Formula $\nabla_{\mathbf{X}}^{J}$
\end_inset

 can be derived by observing
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\begin{cases}
y_{jn} & m=i\\
0 & else
\end{cases}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial x_{ij}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\sum_{n}\frac{\partial J}{\partial\upsilon_{in}}y_{jn}.
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
Similarly to derive the gradient 
\begin_inset Formula $\nabla_{\mathbf{\mathbf{Y}}}^{J}$
\end_inset

, we note that
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial y_{ij}}=\begin{cases}
x_{mi} & n=j\\
0 & else
\end{cases}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and get
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial y_{ij}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial y_{ij}}=\sum_{m}\frac{\partial J}{\partial\upsilon_{mj}}x_{mi}
\end{equation}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
ElementTimes
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! ElementTimes
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
ElementTimes
\end_layout

\end_inset

: element-wise product of two matrices.
 Must satisfy 
\begin_inset Formula $\mathbf{X}.rows=\mathbf{Y}.rows$
\end_inset

 and 
\begin_inset Formula $\mathbf{X}.cols=\mathbf{Y}.cols$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\upsilon_{ij}\left(\mathbf{X},\mathbf{Y}\right) & \leftarrow & x_{ij}y_{ij}\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}}\bullet\mathbf{Y}\\
\nabla_{\mathbf{\mathbf{Y}}}^{J} & \leftarrow & \nabla_{\mathbf{\mathbf{Y}}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}}\bullet\mathbf{X}.
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The gradient 
\begin_inset Formula $\nabla_{\mathbf{X}}^{J}$
\end_inset

 can be derived by observing
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\begin{cases}
y_{ij} & m=i\wedge n=j\\
0 & else
\end{cases}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial x_{ij}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial y_{ij}}=\frac{\partial J}{\partial\upsilon{}_{ij}}y_{ij}.
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
The gradient 
\begin_inset Formula $\nabla_{\mathbf{\mathbf{Y}}}^{J}$
\end_inset

 can be derived exactly the same way due to symmetry.
\end_layout

\begin_layout Itemize

\emph on
Plus
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! Plus
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
Plus
\end_layout

\end_inset

: sum of two matrices 
\begin_inset Formula $\mathbf{X}$
\end_inset

 and 
\begin_inset Formula $\mathbf{Y}$
\end_inset

.
 Must satisfy 
\begin_inset Formula $\mathbf{X}.rows=\mathbf{Y}.rows$
\end_inset

.
 If 
\begin_inset Formula $\mathbf{X}.cols\neq\mathbf{Y}.cols$
\end_inset

 but one of them is a multiple of the other, the smaller matrix needs to
 be expanded by repeating itself.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{V}}\left(\mathbf{X},\mathbf{Y}\right) & \leftarrow & \mathbf{X+Y}\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \begin{cases}
\nabla_{\mathbf{X}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}} & \mathbf{X}.rows=\mathbf{V}.rows\wedge\mathbf{X}.cols=\mathbf{V}.cols\\
\nabla_{\mathbf{X}}^{J}+\mathbf{1_{\mathit{1,\mathbf{V}.rows}}\nabla_{n}^{\mathit{J}}} & \mathbf{X}.rows=1\wedge\mathbf{X}.cols=\mathbf{V}.cols\\
\nabla_{\mathbf{X}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}}\mathbf{1}_{\mathbf{V}.cols,1} & \mathbf{X}.rows=\mathbf{V}.rows\wedge\mathbf{X}.cols=1\\
\nabla_{\mathbf{X}}^{J}+\mathbf{1}_{\mathit{1,\mathbf{V}.rows}}\mathbf{\nabla_{n}^{\mathit{J}}}\mathbf{1}_{v.cols,1} & \mathbf{X}.rows=1\wedge\mathbf{X}.cols=1
\end{cases}\\
\nabla_{\mathbf{Y}}^{J} & \leftarrow & \begin{cases}
\nabla_{\mathbf{Y}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}} & \mathbf{Y}.rows=\mathbf{V}.rows\wedge\mathbf{Y}.cols=\mathbf{V}.cols\\
\nabla_{\mathbf{Y}}^{J}+\mathbf{1_{\mathit{1,\mathbf{V}.rows}}\nabla_{n}^{\mathit{J}}} & \mathbf{Y}.rows=1\wedge\mathbf{Y}.cols=\mathbf{V}.cols\\
\nabla_{\mathbf{Y}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}}\mathbf{1}_{\mathbf{V}.cols,1} & \mathbf{Y}.rows=\mathbf{V}.rows\wedge\mathbf{Y}.cols=1\\
\nabla_{\mathbf{Y}}^{J}+\mathbf{1}_{\mathit{1,\mathbf{V}.rows}}\mathbf{\nabla_{n}^{\mathit{J}}}\mathbf{1}_{\mathbf{V}.cols,1} & \mathbf{Y}.rows=1\wedge\mathbf{Y}.cols=1
\end{cases}
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The gradient 
\begin_inset Formula $\nabla_{\mathbf{X}}^{J}$
\end_inset

 can be derived by observing that when 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $\mathbf{X}$
\end_inset

 has the same dimension as 
\begin_inset Formula $\mathbf{\mathbf{V}}$
\end_inset

, we have 
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\begin{cases}
1 & m=i\wedge n=j\\
0 & else
\end{cases}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial x_{ij}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\frac{\partial J}{\partial\upsilon_{ij}}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
If 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $\mathbf{X}.rows=1\wedge\mathbf{X}.cols=\mathbf{V}.cols$
\end_inset

 we have
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\begin{cases}
1 & n=j\\
0 & else
\end{cases}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial x_{ij}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\sum_{m}\frac{\partial J}{\partial\upsilon_{mj}}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
We can derive 
\begin_inset Formula $\nabla_{\mathbf{X}}^{J}$
\end_inset

 and 
\begin_inset Formula $\nabla_{\mathbf{Y}}^{J}$
\end_inset

 under other conditions similarly.
\end_layout

\begin_layout Itemize

\emph on
Minus
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! Minus
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
Minus
\end_layout

\end_inset

: difference of two matrices 
\begin_inset Formula $\mathbf{X}$
\end_inset

 and 
\begin_inset Formula $\mathbf{Y}$
\end_inset

.
 Must satisfy 
\begin_inset Formula $\mathbf{X}.rows=\mathbf{Y}.rows$
\end_inset

.
 If 
\begin_inset Formula $\mathbf{X}.cols\neq\mathbf{Y}.cols$
\end_inset

 but one of them is a multiple of the other, the smaller matrix needs to
 be expanded by repeating itself.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{V}}\left(\mathbf{X},\mathbf{Y}\right) & \leftarrow & \mathbf{X-Y}\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \begin{cases}
\nabla_{\mathbf{X}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}} & \mathbf{X}.rows=\mathbf{V}.rows\wedge\mathbf{X}.cols=\mathbf{V}.cols\\
\nabla_{\mathbf{X}}^{J}+\mathbf{1_{\mathit{1,\mathbf{V}.rows}}\nabla_{n}^{\mathit{J}}} & \mathbf{X}.rows=1\wedge\mathbf{X}.cols=\mathbf{V}.cols\\
\nabla_{\mathbf{X}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}}\mathbf{1}_{\mathbf{V}.cols,1} & \mathbf{X}.rows=\mathbf{V}.rows\wedge\mathbf{X}.cols=1\\
\nabla_{\mathbf{X}}^{J}+\mathbf{1}_{\mathit{1,\mathbf{V}.rows}}\mathbf{\nabla_{n}^{\mathit{J}}}\mathbf{1}_{v.cols,1} & \mathbf{X}.rows=1\wedge\mathbf{X}.cols=1
\end{cases}\\
\nabla_{\mathbf{Y}}^{J} & \leftarrow & \begin{cases}
\nabla_{\mathbf{Y}}^{J}-\mathbf{\nabla_{n}^{\mathit{J}}} & \mathbf{Y}.rows=\mathbf{V}.rows\wedge\mathbf{Y}.cols=\mathbf{V}.cols\\
\nabla_{\mathbf{Y}}^{J}-\mathbf{1_{\mathit{1,\mathbf{V}.rows}}\nabla_{n}^{\mathit{J}}} & \mathbf{Y}.rows=1\wedge\mathbf{Y}.cols=\mathbf{V}.cols\\
\nabla_{\mathbf{Y}}^{J}-\mathbf{\nabla_{n}^{\mathit{J}}}\mathbf{1}_{\mathbf{V}.cols,1} & \mathbf{Y}.rows=\mathbf{V}.rows\wedge\mathbf{Y}.cols=1\\
\nabla_{\mathbf{Y}}^{J}-\mathbf{1}_{\mathit{1,\mathbf{V}.rows}}\mathbf{\nabla_{n}^{\mathit{J}}}\mathbf{1}_{\mathbf{V}.cols,1} & \mathbf{Y}.rows=1\wedge\mathbf{Y}.cols=1
\end{cases}
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The derivation of the gradients is similar to that for the 
\emph on
Plus
\emph default
 node.
\end_layout

\begin_layout Itemize

\emph on
DiagTimes
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! DiagTimes
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
DiagTimes
\end_layout

\end_inset

: the product of a diagonal matrix (whose diagonal equals to 
\begin_inset Formula $\mathbf{d}$
\end_inset

) and an arbitrary matrix 
\begin_inset Formula $\mathbf{Y}$
\end_inset

.
 Must satisfy 
\begin_inset Formula $\mathbf{d}.rows=\mathbf{Y}.rows$
\end_inset

.
 
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\upsilon_{ij}\left(\mathbf{d},\mathbf{Y}\right) & \leftarrow & d_{i}y_{ij}\\
\nabla_{\mathbf{d}}^{J} & \leftarrow & \nabla_{\mathbf{d}}^{J}+\mathbf{\mathbf{\nabla_{n}^{\mathit{J}}}}\circledcirc\mathbf{Y}\\
\nabla_{\mathbf{\mathbf{Y}}}^{J} & \leftarrow & \nabla_{\mathbf{Y}}^{J}+\mathrm{DiagTimes}\left(\mathbf{d},\mathbf{\nabla_{n}^{\mathit{J}}}\right).
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The gradient 
\begin_inset Formula $\nabla_{\mathbf{d}}^{J}$
\end_inset

 can be derived by observing 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial d_{i}}=\begin{cases}
y_{in} & m=i\\
0 & else
\end{cases}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial d_{i}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial d_{i}}=\sum_{n}\frac{\partial J}{\partial\upsilon_{in}}y_{in}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
Similarly to derive the gradient 
\begin_inset Formula $\nabla_{\mathbf{\mathbf{Y}}}^{J}$
\end_inset

 we note that
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial y_{ij}}=\begin{cases}
d_{i} & m=i\wedge n=j\\
0 & else
\end{cases}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and get
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial y_{ij}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial y_{ij}}=\frac{\partial J}{\partial\upsilon_{ij}}d_{i}
\end{equation}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
Dropout
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! Dropout
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
Dropout
\end_layout

\end_inset

: randomly set 
\begin_inset Formula $\lambda$
\end_inset

 percentage of values of 
\begin_inset Formula $\mathbf{\mathbf{Y}}$
\end_inset

 to be zero and scale the rest so that the expectation of the sum is not
 changed: 
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
m_{ij}\left(\lambda\right) & \leftarrow & \begin{cases}
0 & rand\left(0,1\right)\leq\lambda\\
\frac{1}{1-\lambda} & else
\end{cases}\\
v_{ij}\left(\lambda,\mathbf{\mathbf{Y}}\right) & \leftarrow & m_{ij}y_{ij}\\
\nabla_{\mathbf{\mathbf{Y}}}^{J} & \leftarrow & \nabla_{\mathbf{Y}}^{J}+\begin{cases}
\mathbf{\nabla_{n}^{\mathit{J}}} & \lambda=0\\
\mathbf{\nabla_{n}^{\mathit{J}}}\bullet\mathbf{M} & else
\end{cases}
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
Note that 
\begin_inset Formula $\lambda$
\end_inset

 is a given value instead of part of the model.
 We only need to get the gradient with regard to 
\begin_inset Formula $\mathbf{\mathbf{Y}}$
\end_inset

.
 If 
\begin_inset Formula $\lambda=0$
\end_inset

 then 
\begin_inset Formula $\mathbf{V}=\mathbf{X}$
\end_inset

 which is a trivial case.
 Otherwise it's equivalent to the 
\emph on
ElementTimes
\emph default
 node with a randomly set mask 
\begin_inset Formula $\mathbf{M}$
\end_inset

.
 
\end_layout

\begin_layout Itemize

\emph on
KhatriRaoProduct
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! KhatriRaoProduct
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
KhatriRaoProduct
\end_layout

\end_inset

: column-wise cross product of two matrices 
\begin_inset Formula $\mathbf{X}$
\end_inset

 and 
\begin_inset Formula $\mathbf{Y}$
\end_inset

.
 Must satisfy 
\begin_inset Formula $\mathbf{X}.cols=\mathbf{Y}.cols$
\end_inset

.
 Useful for constructing tensor networks.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{\upsilon}}_{.j}\left(\mathbf{X},\mathbf{Y}\right) & \leftarrow & \mathbf{x_{\mathit{.j}}\otimes y}_{\mathit{.j}}\\
\left[\nabla_{\mathbf{X}}^{J}\right]_{.j} & \leftarrow & \left[\nabla_{\mathbf{X}}^{J}\right]_{.j}+\left[\left[\mathbf{\mathbf{\nabla_{n}^{\mathit{J}}}}\right]_{.j}\right]_{\mathbf{X}.rows,\mathbf{Y}.rows}\mathbf{Y}\\
\left[\nabla_{\mathbf{Y}}^{J}\right]_{.j} & \leftarrow & \left[\nabla_{\mathbf{Y}}^{J}\right]_{.j}+\left[\left[\left[\mathbf{\mathbf{\nabla_{n}^{\mathit{J}}}}\right]_{.j}\right]_{\mathbf{X}.rows,\mathbf{Y}.rows}\right]^{T}\mathbf{X},
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
where 
\begin_inset Formula $\left[\mathbf{X}\right]_{m,n}$
\end_inset

 reshapes 
\begin_inset Formula $\mathbf{X}$
\end_inset

 to become an 
\begin_inset Formula $m\times n$
\end_inset

 matrix.
 The gradient 
\begin_inset Formula $\nabla_{\mathbf{X}}^{J}$
\end_inset

 can be derived by observing
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\begin{cases}
y_{kj} & n=j\wedge i=m/\mathbf{Y}.rows\wedge k=modulus(m,\mathbf{Y}.rows)\\
0 & else
\end{cases}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial x_{ij}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial x_{ij}}=\sum_{i,k}\frac{\partial J}{\partial\upsilon_{i\times y.rows+k,j}}y_{kj}.
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
The gradient 
\begin_inset Formula $\nabla_{\mathbf{y}}^{J}$
\end_inset

 can be derived similarly.
\end_layout

\begin_layout Itemize

\emph on
Cos
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! Cos
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
Cos
\end_layout

\end_inset

: column-wise cosine distance of two matrices 
\begin_inset Formula $\mathbf{X}$
\end_inset

 and 
\begin_inset Formula $\mathbf{Y}$
\end_inset

.
 Must satisfy 
\begin_inset Formula $\mathbf{X}.cols=\mathbf{Y}.cols$
\end_inset

.
 The result is a row vector.
 Frequently used in natural language processing tasks.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{\mathit{\upsilon}}}_{.j}\left(\mathbf{X},\mathbf{Y}\right) & \leftarrow & \frac{\mathbf{x_{\mathit{.j}}^{\mathit{T}}y}_{\mathit{.j}}}{\mathbf{\left\Vert x_{\mathit{.j}}\right\Vert \left\Vert y_{\mathit{.j}}\right\Vert }}\\
\left[\nabla_{\mathbf{X}}^{J}\right]_{.j} & \leftarrow & \left[\nabla_{\mathbf{X}}^{J}\right]_{.j}+\left[\mathbf{\mathbf{\nabla_{n}^{\mathit{J}}}}\right]_{.j}\bullet\left[\frac{y_{ij}}{\mathbf{\left\Vert x_{\mathit{.j}}\right\Vert \left\Vert y_{\mathit{.j}}\right\Vert }}-\frac{x_{ij}\upsilon_{.,j}}{\mathbf{\left\Vert x_{\mathit{.j}}\right\Vert ^{2}}}\right]\\
\left[\nabla_{\mathbf{Y}}^{J}\right]_{.j} & \leftarrow & \left[\nabla_{\mathbf{Y}}^{J}\right]_{.j}+\left[\mathbf{\mathbf{\nabla_{n}^{\mathit{J}}}}\right]_{.j}\bullet\left[\frac{x_{ij}}{\mathbf{\left\Vert x_{\mathit{.j}}\right\Vert \left\Vert y_{\mathit{.j}}\right\Vert }}-\frac{y_{ij}\upsilon_{.,j}}{\mathbf{\left\Vert y_{\mathit{.j}}\right\Vert ^{2}}}\right].
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The gradient 
\begin_inset Formula $\nabla_{\mathbf{X}}^{J}$
\end_inset

 can be derived by observing
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{.n}}{\partial x_{ij}}=\begin{cases}
\frac{y_{ij}}{\mathbf{\left\Vert x_{\mathit{.j}}\right\Vert \left\Vert y_{\mathit{.j}}\right\Vert }}-\frac{x_{ij}\left(\mathbf{x_{\mathit{.j}}^{\mathit{T}}y}_{\mathit{.j}}\right)}{\mathbf{\left\Vert x_{\mathit{.j}}\right\Vert ^{3}}\left\Vert \mathbf{y}_{\mathit{.j}}\right\Vert } & n=j\\
0 & else
\end{cases}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{eqnarray}
\frac{\partial J}{\partial x_{ij}} & = & \sum_{n}\frac{\partial J}{\partial\upsilon_{.n}}\frac{\partial\upsilon_{.n}}{\partial x_{ij}}=\frac{\partial J}{\partial\upsilon_{.,j}}\left[\frac{y_{ij}}{\mathbf{\left\Vert x_{\mathit{.j}}\right\Vert \left\Vert y_{\mathit{.j}}\right\Vert }}-\frac{x_{ij}\left(\mathbf{x_{\mathit{.j}}^{\mathit{T}}y}_{\mathit{.j}}\right)}{\mathbf{\left\Vert x_{\mathit{.j}}\right\Vert ^{3}}\left\Vert \mathbf{y}_{\mathit{.j}}\right\Vert }\right].\\
 & = & \frac{\partial J}{\partial\upsilon_{.,j}}\left[\frac{y_{ij}}{\mathbf{\left\Vert x_{\mathit{.j}}\right\Vert \left\Vert y_{\mathit{.j}}\right\Vert }}-\frac{x_{ij}\upsilon_{.,j}}{\mathbf{\left\Vert x_{\mathit{.j}}\right\Vert ^{2}}}\right].
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The gradient 
\begin_inset Formula $\nabla_{\mathbf{y}}^{J}$
\end_inset

 can be derived similarly.
\end_layout

\begin_layout Itemize

\emph on
ClassificationError
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! ClassificationError
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
ClassificationError
\end_layout

\end_inset

: compute the total number of columns in which the indexes of the maximum
 values disagree.
 Each column is considered as a sample and 
\begin_inset Formula $\delta$
\end_inset

 is the Kronecker delta.
 Must satisfy 
\begin_inset Formula $\mathbf{X}.cols=\mathbf{Y}.cols$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
a_{j}\left(\mathbf{X}\right) & \leftarrow & \arg\max_{i}x_{ij}\\
b_{j}\left(\mathbf{Y}\right) & \leftarrow & \arg\max_{i}y_{ij}\\
v\left(\mathbf{X},\mathbf{\mathbf{Y}}\right) & \leftarrow & \sum_{j}\delta\left(a_{j}\left(\mathbf{X}\right)\neq b_{j}\left(\mathbf{Y}\right)\right)
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
This node type is only used to compute classification errors during the
 decoding time and is not involved in the model training.
 For this reason, calling 
\begin_inset Formula $ComputePartialGradient(\mathbf{b})$
\end_inset

 should just raise an error.
\end_layout

\begin_layout Itemize

\emph on
SquareError
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! SquareError
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
SquareError
\end_layout

\end_inset

: compute the square of Frobenius norm of the difference 
\begin_inset Formula $\mathbf{X}-\mathbf{Y}$
\end_inset

.
 Must satisfy 
\begin_inset Formula $\mathbf{X}.rows=\mathbf{Y}.rows$
\end_inset

 and 
\begin_inset Formula $\mathbf{X}.cols=\mathbf{Y}.cols$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
v\left(\mathbf{X},\mathbf{Y}\right) & \leftarrow & \mathrm{Tr}\left(\left(\mathbf{X}-\mathbf{Y}\right)\left(\mathbf{X}-\mathbf{Y}\right)^{T}\right)\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+2\mathbf{\nabla_{n}^{\mathit{J}}}\left(\mathbf{X}-\mathbf{Y}\right)\\
\nabla_{\mathbf{Y}}^{J} & \leftarrow & \nabla_{\mathbf{Y}}^{J}-2\mathbf{\nabla_{n}^{\mathit{J}}}\left(\mathbf{X}-\mathbf{Y}\right).
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
Note that 
\begin_inset Formula $v$
\end_inset

 is a scalar.
 The derivation of the gradients is trivial given
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{eqnarray}
\frac{\partial v}{\partial\mathbf{X}} & = & +2\left(\mathbf{X}-\mathbf{Y}\right)\\
\frac{\partial v}{\partial\mathbf{Y}} & = & -2\left(\mathbf{X}-\mathbf{Y}\right).
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
CrossEntropy
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! CrossEntropy
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
CrossEntropy
\end_layout

\end_inset

:
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
 compute the sum of cross entropy computed column-wise (over samples) where
 each column of 
\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit

\begin_inset Formula $\mathbf{X}$
\end_inset

 and 
\begin_inset Formula $\mathbf{Y}$
\end_inset

 is a probability distribution.
 Must satisfy 
\begin_inset Formula $\mathbf{X}.rows=\mathbf{Y}.rows$
\end_inset

 and 
\begin_inset Formula $\mathbf{X}.cols=\mathbf{Y}.cols$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{R}\left(\mathbf{\mathbf{Y}}\right) & \leftarrow & \log\left(\mathbf{Y}\right)\\
v\left(\mathbf{X},\mathbf{\mathbf{Y}}\right) & \leftarrow & -\mathrm{vec}\left(\mathbf{X}\right)\circ\mathrm{vec}\left(\mathbf{R}\left(\mathbf{\mathbf{Y}}\right)\right)\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}-\mathbf{\nabla_{n}^{\mathit{J}}}\mathbf{R}\left(\mathbf{\mathbf{Y}}\right)\\
\nabla_{\mathbf{\mathbf{Y}}}^{J} & \leftarrow & \nabla_{\mathbf{\mathbf{Y}}}^{J}-\mathbf{\nabla_{n}^{\mathit{J}}}\left(\mathbf{X}\oslash\mathbf{Y}\right).
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
Note that 
\begin_inset Formula $v$
\end_inset

 is a scalar.
 The gradient 
\begin_inset Formula $\nabla_{\mathbf{X}}^{J}$
\end_inset

 can be derived by observing
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
 
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon}{\partial x_{ij}}=-\log\left(y_{ij}\right)=-r_{ij}\left(\mathbf{\mathbf{Y}}\right)
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial x_{ij}}=\frac{\partial J}{\partial\upsilon}\frac{\partial\upsilon}{\partial x_{ij}}=-\frac{\partial J}{\partial\upsilon}r_{ij}\left(\mathbf{\mathbf{Y}}\right)
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
Similarly to derive the gradient 
\begin_inset Formula $\nabla_{\mathbf{\mathbf{Y}}}^{J}$
\end_inset

 we note that
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon}{\partial y_{ij}}=-\frac{x_{ij}}{y_{ij}}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and get
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial y_{ij}}=\frac{\partial J}{\partial\upsilon}\frac{\partial\upsilon}{\partial y_{ij}}=-\frac{\partial J}{\partial\upsilon}\frac{x_{ij}}{y_{ij}}.
\end{equation}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
CrossEntropyWithSoftmax
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! CrossEntropyWithSoftmax
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
CrossEntropyWithSoftmax
\end_layout

\end_inset

:
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
 same as 
\family default
\series default
\shape default
\size default
\emph on
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
CrossEntropy
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
 except that 
\begin_inset Formula $\mathbf{Y}$
\end_inset

 contains values before the softmax operation (i.e., unnormalized).
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{P}\left(\mathbf{\mathbf{Y}}\right) & \leftarrow & \mathrm{Softmax}\left(\mathbf{\mathbf{Y}}\right)\\
\mathbf{R}\left(\mathbf{\mathbf{Y}}\right) & \leftarrow & \log\left(\mathbf{P}\left(\mathbf{\mathbf{Y}}\right)\right)\\
v\left(\mathbf{X},\mathbf{\mathbf{Y}}\right) & \leftarrow & \mathrm{vec}\left(\mathbf{X}\right)\circ\mathrm{vec}\left(\mathbf{R}\left(\mathbf{\mathbf{Y}}\right)\right)\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}-\mathbf{\nabla_{n}^{\mathit{J}}}\mathbf{R}\left(\mathbf{\mathbf{Y}}\right)\\
\nabla_{\mathbf{\mathbf{Y}}}^{J} & \leftarrow & \nabla_{\mathbf{\mathbf{Y}}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}}\left(\mathbf{\mathbf{P}\left(\mathbf{\mathbf{Y}}\right)}-\mathbf{X}\right)
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
The gradient 
\begin_inset Formula $\nabla_{\mathbf{X}}^{J}$
\end_inset

 is the same as in the 
\emph on
CrossEntropy
\emph default
 node.
 To derive the gradient 
\begin_inset Formula $\nabla_{\mathbf{\mathbf{Y}}}^{J}$
\end_inset

 we note that
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon}{\partial y_{ij}}=\mathbf{p}_{ij}\left(\mathbf{\mathbf{Y}}\right)-x_{ij}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
and get
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial y_{ij}}=\frac{\partial J}{\partial\upsilon}\frac{\partial\upsilon}{\partial y_{ij}}=\frac{\partial J}{\partial\upsilon}\left(\mathbf{p}_{ij}\left(\mathbf{\mathbf{Y}}\right)-x_{ij}\right).
\end{equation}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
ClassBasedCrossEntropyWithSoftmax
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! ClassBasedCrossEntropyWithSoftmax
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
ClassBasedCrossEntropyWithSoftmax
\end_layout

\end_inset

:
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
 same as 
\family default
\series default
\shape default
\size default
\emph on
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
CrossEntropyWithSoftmax
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
 except that label 
\begin_inset Formula $\mathbf{Y}$
\end_inset

 is a 4 bye T dense matrix.
 The first row is the word id, the second row is the class id of this word
 id, the third row is the first word id in this class, and the last row
 is the first word id of the next class.
 The last two rows provide the left and right boundary of a slice that correspon
d to softmax of label given its observation 
\begin_inset Formula $\mathbf{X}$
\end_inset

.
 It also takes a weight matrix 
\begin_inset Formula $\mathbf{W}$
\end_inset

 for computing with observation 
\begin_inset Formula $\mathbf{X}$
\end_inset

.
 It also takes outputs 
\begin_inset Formula $\mathbf{Q}$
\end_inset

 from Times node that provides inputs for computing posterior probability
 given class.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
for & t=1 & :T\\
z_{t} & \leftarrow & Y(0,t)\\
\mathbf{Z_{t}} & \leftarrow & [0\cdots\delta(i=z_{t})\cdots0]^{T}\\
c_{t} & \leftarrow & Y(1,t)\\
\mathbf{C_{t}} & \leftarrow & [0\cdots\delta(i=c_{t})\cdots0]^{T}\\
l_{t} & \leftarrow & Y(2,t)\\
r_{t} & \leftarrow & Y(3,t)\\
\mathbf{W_{t}} & \leftarrow & W[:,l_{t}:r_{t}-1]\\
\mathbf{P_{t}} & \leftarrow & \mathrm{Softmax}\left(\mathbf{W_{t}^{T}X}\right)\\
\mathbf{R_{t}} & \leftarrow & \log\left(\mathbf{P_{t}}\right)\\
v\left(\mathbf{X},\mathbf{\mathbf{Y}}\right) & += & \mathbf{Z_{t}}\circ\mathbf{R_{t}}\\
\mathbf{U_{t}} & \leftarrow & logSoftmax(\mathbf{Q_{t}})\\
v\left(\mathbf{X},\mathbf{\mathbf{Y}}\right) & += & \mathbf{C_{t}}\mathbf{\circ U_{t}}\\
\nabla_{\mathbf{Q}}^{J} & \leftarrow\nabla_{\mathbf{Q}}^{J}+ & \mathbf{\nabla_{n}^{\mathit{J}}}\left(\mathbf{\mathbf{exp(U}_{t})}-\mathbf{C_{t}}\right)\\
\nabla_{\mathbf{\mathbf{P}}}^{J} & \leftarrow & \mathbf{\nabla_{n}^{\mathit{J}}}\left(\mathbf{\mathbf{P}_{t}}-\mathbf{Z_{t}}\right)\\
\nabla_{\mathbf{W}}^{J} & \leftarrow\nabla_{\mathbf{W}}^{J}+ & \mathbf{X_{t}\nabla_{P}^{\mathit{J}T}}\\
\nabla_{\mathbf{X}}^{J} & \leftarrow\nabla_{\mathbf{X}}^{J}+ & \mathbf{W_{t}}\mathbf{\nabla_{P}^{\mathit{J}}}
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
\begin_inset Note Comment
status open

\begin_layout Itemize

\emph on
CRF
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! CRF
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
CRF
\end_layout

\end_inset

:
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
 CRF stands for conditional random fields.
 This node does sequence-level training, using CRF criterion.
 This node has three inputs.
 The first is the label 
\family default
\series bold
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
L
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
.
 The second is the position dependent score 
\family default
\series bold
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
H
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
.
 The third input is transition score 
\family default
\series bold
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
T
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
.
 A typical usage of CRF node is using 
\family default
\series bold
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
L
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
 same as from labels used in cross-entropy node, computing 
\family default
\series bold
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
H
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
 from child nodes using RNNs, and computing transition score 
\family default
\series bold
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
A
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
 from a matrix.
 The matrix for 
\family default
\series bold
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
A
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
 can be initialized with values of 1/|
\family default
\series bold
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
L
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
|, where |
\family default
\series bold
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
L
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
| is the cardinality of the label 
\family default
\series bold
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
L
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
.
 A forward pass computes the following accumulated score for each time t
 for label i.
 A backward pass computes gradients of the final accumulated score with
 respect to the position dependent input 
\begin_inset Formula $\frac{\partial R}{\partial h_{t}(i)}$
\end_inset

 at time 
\begin_inset Formula $t$
\end_inset

 for label 
\begin_inset Formula $i$
\end_inset

 as follows, and similarly for the gradient to the transition weight 
\begin_inset Formula $\frac{\partial R}{\partial a_{i,j}}$
\end_inset

 for transition from label 
\begin_inset Formula $i$
\end_inset

 to label 
\begin_inset Formula $j$
\end_inset

: 
\end_layout

\begin_layout Plain Layout
\begin_inset Formula 
\begin{eqnarray}
\alpha_{t}\left(i\right) & \leftarrow & h_{it}+LogAdd{k}\left(\delta_{t-1}(k)+\eta a_{ki}\right)\\
\mathbf{\frac{\partial R}{\partial\delta_{t-1}(i)}} & \leftarrow & \sum_{j}\frac{\partial C_{logadd}}{\partial\delta_{t}(j)}\frac{\exp(\delta_{t-1}(i)+a_{i,j})}{\sum_{k}\exp(\delta_{t-1}(k)+a_{k,j})}\\
\mathbf{\frac{\partial R}{\partial\delta_{T}(i)}} & \leftarrow & \frac{\exp(\delta_{T}(i))}{\sum_{k}\exp(\delta_{T}(k))}\\
\frac{\partial R}{\partial h_{t}(i)} & \leftarrow & l_{t}(i)-\frac{\partial R}{\partial\delta_{t}(i)}\\
\frac{\partial R}{\partial a_{i,j}} & \leftarrow & \sum_{t}[1(l_{t-1}(i)=1\&\&l_{t}(j)=1)-\frac{\partial R}{\partial\delta_{t}(j)}\frac{\exp(\alpha_{t-1}(i)+\eta a_{i,j})}{\sum_{k}\exp(\alpha_{t-1}(k)+\eta a_{k,j})}]
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Plain Layout
Notice that the gradient to the transition weights need to be summed over
 the whole observation sequence, which is the current minibatch.
 
\end_layout

\end_inset


\end_layout

\begin_layout Subsection
Computation Node Types for Computing Statistics
\end_layout

\begin_layout Standard
Sometimes we only want to get some statistics of the input values (either
 input features or labels).
 For example, to normalize the input features we need to compute the mean
 and standard deviation of the input feature.
 In speech recognition we need to compute the frequencies (mean) of the
 state labels to convert state posterior probability to the scaled likelihood.
 Unlike other computation node types we just described, computation node
 types for computing statistics do not require a gradient computation function
 (i.e., this function should not be called for these types of nodes) because
 they are not learned and often need to be precomputed before model training
 starts.
 Here we list the most popular computation node types in this category.
\end_layout

\begin_layout Itemize

\emph on
Mean
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! Mean
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
Mean
\end_layout

\end_inset

: compute the mean of the operand 
\begin_inset Formula $\mathbf{X}$
\end_inset

 across the whole training set.
 When the computation is finished, it needs to be marked so to avoid recomputati
on.
 When a minibatch of input 
\begin_inset Formula $\mathbf{X}$
\end_inset

 is fed in
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
k & \leftarrow & k+\mathbf{X}.cols\\
\mathbf{\boldsymbol{\upsilon}}\left(\mathbf{X}\right) & \leftarrow & \mathbf{\mathit{\frac{1}{k}}\mathbf{X}1_{X\mathit{.cols,1}}}+\mathit{\frac{k-\mathbf{X}.cols}{k}}\mathbf{\boldsymbol{\upsilon}}\left(\mathbf{X}\right)
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
Note here
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
 
\begin_inset Formula $\mathbf{X}.cols$
\end_inset

 is the number of samples in the minibatch.
\end_layout

\begin_layout Itemize

\emph on
InvStdDev
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! InvStdDev
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
InvStdDev
\end_layout

\end_inset

: compute the invert standard deviation of the operand 
\begin_inset Formula $\mathbf{X}$
\end_inset

 element-wise across the whole training set.
 When the computation is finished, it needs to be marked so to avoid recomputati
on.
 In the accumulation step
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
k & \leftarrow & k+\mathbf{X}.cols\\
\mathbf{\boldsymbol{\upsilon}}\left(\mathbf{X}\right) & \leftarrow & \mathbf{\mathit{\frac{1}{k}}\mathbf{X}1_{X\mathit{.cols,1}}}+\mathit{\frac{k-\mathbf{X}.cols}{k}}\mathbf{\boldsymbol{\upsilon}}\left(\mathbf{X}\right)\\
\mathbf{\omega}\left(\mathbf{X}\right) & \leftarrow & \mathbf{\mathit{\frac{1}{k}}\left(X\bullet X\right)1}+\mathit{\frac{k-\mathbf{X}.cols}{k}}\mathbf{\omega}\left(\mathbf{X}\right)
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
When the end of the training set is reached, 
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{\upsilon}} & \leftarrow & \left(\mathbf{\omega}-\left(\upsilon\bullet\upsilon\right)\right)^{1/2}\\
\mathbf{\boldsymbol{\upsilon}} & \leftarrow & 1\oslash\mathbf{\boldsymbol{\upsilon}}.
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Itemize

\emph on
PerDimMeanVarNorm
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! PerDimMeanVarNorm
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
PerDimMeanVarNorm
\end_layout

\end_inset

: compute the normalized operand 
\begin_inset Formula $\mathbf{X}$
\end_inset

 using mean 
\begin_inset Formula $\mathbf{m}$
\end_inset

 and invert standard deviation 
\begin_inset Formula $\mathbf{s}$
\end_inset

 for each sample.
 Here 
\begin_inset Formula $\mathbf{X}$
\end_inset

 is matrix whose number of columns equals to the number of samples in the
 minibatch and 
\begin_inset Formula $\mathbf{m}$
\end_inset

 and 
\begin_inset Formula $\mathbf{s}$
\end_inset

 are vectors that needs to be expanded before element-wise product is applied.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{V}}\left(\mathbf{X}\right) & \leftarrow & \left(\mathbf{X}\mathit{-\mathbf{m}}\right)\mathbf{\bullet s}.
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Section
Convolutional Neural Network
\end_layout

\begin_layout Standard
A 
\begin_inset Index idx
status open

\begin_layout Plain Layout
convolutional neural network
\end_layout

\end_inset

Convolutional neural network (CNN
\begin_inset Index idx
status open

\begin_layout Plain Layout
CNN
\end_layout

\end_inset

) 
\begin_inset CommandInset citation
LatexCommand cite
key "CNN-lecun:1995,CNN-FastComputing-chellapilla+2006,CNN-FeatureHierarchy-kavukcuoglu+2010,DCNN-ImageNet-krizhevsky+2012,CNN-ASR-abdel+2012,TransferLearning-DNN-Ciresan+2012,CNN-LVCSR-sainath+2013,DCNN-LVCSR-sainath+2013,CNN-ASR-Abdel+2013,CNN-ASR-deng+2013,CNN-atlas1988"

\end_inset

 provides shift invariance over time and space and is critical to achieve
 state-of-the-art performance on image recognition.
 It has also been shown to improve speech recognition accuracy over pure
 DNNs on some tasks 
\begin_inset CommandInset citation
LatexCommand cite
key "CNN-ASR-abdel+2012,CNN-ASR-Abdel+2013,CNN-ASR-deng+2013,CNN-LVCSR-sainath+2013,DCNN-LVCSR-sainath+2013"

\end_inset

.
 To support CNN we need to implement several new computation nodes.
\end_layout

\begin_layout Itemize

\emph on
Convolution
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! Convolution
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
Convolution
\end_layout

\end_inset

: convolve element-wise products of a kernel to an image.
 An example of a convolution operation is shown in Figure 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CNN-Computation"

\end_inset

, where the input to the convolution node has three channels (represented
 by three 
\begin_inset Formula $3\times3$
\end_inset

 matrices) and the output has two channels (represented by two 
\begin_inset Formula $2\times2$
\end_inset

 matrices at top).
 A channel is a view of the same image.
 For example, an RGB image can be represented with three channels: R, G,
 B.
 Each channel is of the same size.
 
\end_layout

\begin_layout Standard
There is a kernel
\begin_inset Index idx
status open

\begin_layout Plain Layout
kernel ! convolutional neural network
\end_layout

\end_inset

 for each output and input channel pair.
 The total number of kernels equals to the product of the number of input
 channels 
\begin_inset Formula $C_{x}$
\end_inset

 and the number of output channels 
\begin_inset Formula $C_{v}$
\end_inset

.
 In Figure 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CNN-Computation"

\end_inset

 
\begin_inset Formula $C_{x}=3$
\end_inset

, 
\begin_inset Formula $C_{v}=2$
\end_inset

 and the total number of kernels is 6.
 Each kernel 
\begin_inset Formula $\mathbf{K}_{k\ell}$
\end_inset

 of input channel 
\begin_inset Formula $k$
\end_inset

 and output channel 
\begin_inset Formula $\ell$
\end_inset

 is a matrix.
 The kernel moves along (and thus shared across) the input with strides
 (or subsampling rate) 
\begin_inset Formula $S_{r}$
\end_inset

 and 
\begin_inset Formula $S_{c}$
\end_inset

 at the vertical (row) and horizontal (column) direction, respectively.
 For each output channel 
\begin_inset Formula $\ell$
\end_inset

 and input slice 
\begin_inset Formula $\left(i,j\right)$
\end_inset

 (the 
\begin_inset Formula $i$
\end_inset

-th step along the vertical direction and 
\begin_inset Formula $j$
\end_inset

-th step along the horizontal direction)
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\upsilon_{\ell ij}\left(\mathbf{K,Y}\right) & = & \sum_{k}\mathrm{vec}\left(\mathbf{K}_{k\ell}\right)\circ\mathrm{vec}\left(\mathbf{Y}_{kij}\right),
\end{eqnarray}

\end_inset

where 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $\mathbf{Y}_{kij}$
\end_inset

 has the same size as 
\begin_inset Formula $\mathbf{K}_{k\ell}$
\end_inset

.
\end_layout

\begin_layout Standard
This evaluation function involves many small matrix operations and can be
 slow.
 Chellapilla et al.
 
\begin_inset CommandInset citation
LatexCommand cite
key "CNN-FastComputing-chellapilla+2006"

\end_inset

 proposed a technique to convert all these small matrix operations to a
 large matrix product as shown at the bottom of Figure 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CNN-Computation"

\end_inset

.
 With this trick all the kernel parameters are combined into a big kernel
 matrix 
\begin_inset Formula $\mathbf{W}$
\end_inset

 as shown at left bottom of Figure 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CNN-Computation"

\end_inset

.
 Note that to allow for the output of the convolution node to be used by
 another convolution node, in Figure 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CNN-Computation"

\end_inset

 we have organized the conversion slightly differently from what proposed
 by Chellapilla et al.
 
\begin_inset CommandInset citation
LatexCommand cite
key "CNN-FastComputing-chellapilla+2006"

\end_inset

 by transposing both the kernel matrix and the input features matrix as
 well as the order of the matrices in the product.
 By doing so, each sample in the output can be represented by 
\begin_inset Formula $O_{r}$
\end_inset


\begin_inset Formula $\times O_{c}$
\end_inset

 columns of 
\begin_inset Formula $C_{v}\times1$
\end_inset


\begin_inset Formula $ $
\end_inset

 vectors which can be reshaped to become a single column, where
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{equation}
O_{r}=\begin{cases}
\frac{I_{r}-K_{r}}{S_{r}}+1 & \mathrm{no\:padding}\\
\frac{\left(I_{r}-\mathrm{mod}\left(K_{r},2\right)\right)}{S_{r}}+1 & \mathrm{zero\:padding}
\end{cases}
\end{equation}

\end_inset

 is the number of rows in the output image, and 
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{equation}
O_{c}=\begin{cases}
\frac{I_{c}-K_{c}}{S_{c}}+1 & \mathrm{no\:padding}\\
\frac{\left(I_{c}-\mathrm{mod}\left(K_{c},2\right)\right)}{S_{c}}+1 & \mathrm{zero\:padding}
\end{cases}
\end{equation}

\end_inset

is the number of rows in the output image, where 
\begin_inset Formula $I_{r}$
\end_inset

 and 
\begin_inset Formula $I_{c}$
\end_inset

 are, respectively, the number of rows and columns of the input image, 
\begin_inset Formula $K_{r}$
\end_inset

 and 
\begin_inset Formula $K_{c}$
\end_inset

 are, respectively, the number of rows and columns in each kernel.
 The combined kernel matrix is of size 
\begin_inset Formula $C_{v}\times\left(O_{r}\times O_{c}\times C_{x}\right)$
\end_inset

, and the packed input feature matrix is of size 
\begin_inset Formula $\left(O_{r}\times O_{c}\times C_{x}\right)\times\left(K_{r}\times K_{c}\right)$
\end_inset

.
\end_layout

\begin_layout Standard
With this conversion the related computations of the convolution node with
 operands 
\begin_inset Formula $\mathbf{W},\mathbf{Y}$
\end_inset

 become
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{X}\left(\mathit{\mathbf{Y}}\right) & \leftarrow & \mathrm{Pack}\left(\mathbf{Y}\right)\\
\mathbf{\boldsymbol{V}}\left(\mathbf{W\mathit{,}Y}\right) & \leftarrow & \mathbf{W\mathbf{X}^{\mathit{}}}\\
\nabla_{\mathbf{W}}^{J} & \leftarrow & \nabla_{W}^{J}+\mathbf{\mathbf{\nabla_{n}^{\mathit{J}}}}\mathbf{\mathbf{X}}^{T}\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \mathbf{\mathbf{W}}^{T}\mathbf{\nabla_{n}^{\mathit{J}}}\\
\nabla_{\mathbf{\mathbf{Y}}}^{J} & \leftarrow & \nabla_{\mathbf{Y}}^{J}+\mathrm{Unpack}\left(\nabla_{\mathbf{X}}^{J}\right).
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
Note that this technique enables better parallelization with large matrix
 operations but introduces additional cost to pack and unpack the matrices.
 In most conditions, the gain outweighs the cost.
 By composing convolution node with plus nodes and element-wise nonlinear
 functions we can add bias and nonlinearity to the convolution operations.
\end_layout

\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open

\begin_layout Plain Layout
\align center
\begin_inset Graphics
	filename ../figures/CNNComputation.png
	lyxscale 90
	scale 45

\end_inset


\begin_inset Caption Standard

\begin_layout Plain Layout
\begin_inset CommandInset label
LatexCommand label
name "fig:CNN-Computation"

\end_inset

Example convolution operations.
 Top: the original convolution operations.
 Bottom: the same operations represented as a large matrix product.
 Our matrix organization is different from what proposed by Chellapilla
 et al.
 
\begin_inset CommandInset citation
LatexCommand cite
key "CNN-FastComputing-chellapilla+2006"

\end_inset

 to allow for stacked convolution operations.
 (Modified from the figure in Chellapilla et al.
 
\begin_inset CommandInset citation
LatexCommand cite
key "CNN-FastComputing-chellapilla+2006"

\end_inset

, permitted to use by Simard)
\end_layout

\end_inset


\end_layout

\end_inset


\end_layout

\begin_layout Itemize

\emph on
MaxPooling
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! MaxPooling
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
MaxPooling
\end_layout

\end_inset

: apply the maximum pooling operation to input 
\begin_inset Formula $\mathbf{X}$
\end_inset

 inside a window with size 
\begin_inset Formula $K_{r}\times K_{c}$
\end_inset

 for each channel.
 The operation window moves along the input with strides (or subsampling
 rate) 
\begin_inset Formula $S_{r}$
\end_inset

 and 
\begin_inset Formula $S_{c}$
\end_inset

 at the vertical (row) and horizontal (column) direction, respectively.
 The pooling operation does not change the number of channels and so 
\begin_inset Formula $C_{v}=C_{x}$
\end_inset

.
 For each output channel 
\begin_inset Formula $\ell$
\end_inset

 and the 
\begin_inset Formula $\left(i,j\right)$
\end_inset

-th input slice 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $\mathbf{X}_{\ell ij}$
\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
 of size 
\begin_inset Formula $K_{r}\times K_{c}$
\end_inset

 we have
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\upsilon_{\ell ij}\left(\mathbf{X}\right) & \leftarrow & \max\left(\mathbf{X}_{\ell ij}\right)\\
\left[\nabla_{\mathbf{X}}^{J}\right]_{\ell,i_{m},j_{n}} & \leftarrow & \begin{cases}
\left[\nabla_{\mathbf{X}}^{J}\right]_{\ell,i_{m},j_{n}}+\left[\nabla_{\mathbf{n}}^{J}\right]_{\ell,i_{m},j_{n}} & \left(m,n\right)=arg\max_{m,n}x_{\ell,i_{m},j_{n}}\\
\left[\nabla_{\mathbf{X}}^{J}\right]_{\ell,i_{m},j_{n}} & else
\end{cases}
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
where 
\begin_inset Formula $i_{m}=i\times S_{r}+m$
\end_inset

, and 
\begin_inset Formula $j_{n}=j\times S_{c}+n$
\end_inset

.
\end_layout

\begin_layout Itemize

\emph on
AveragePooling
\emph default

\begin_inset Index idx
status open

\begin_layout Plain Layout
Node ! AveragePooling
\end_layout

\end_inset


\begin_inset Index idx
status open

\begin_layout Plain Layout
AveragePooling
\end_layout

\end_inset

: same as 
\emph on
MaxPooling
\emph default
 except average instead of maximum is applied to input 
\begin_inset Formula $\mathbf{X}$
\end_inset

 inside a window with size 
\begin_inset Formula $K_{r}\times K_{c}$
\end_inset

 for each channel.
 The operation window moves along the input with strides (or subsampling
 rate) 
\begin_inset Formula $S_{r}$
\end_inset

 and 
\begin_inset Formula $S_{c}$
\end_inset

 at the vertical (row) and horizontal (column) direction, respectively.
 For each output channel 
\begin_inset Formula $\ell$
\end_inset

 and the 
\begin_inset Formula $\left(i,j\right)$
\end_inset

-th input slice 
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $\mathbf{X}_{\ell ij}$
\end_inset


\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
 of size 
\begin_inset Formula $K_{r}\times K_{c}$
\end_inset

 we have
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\upsilon_{\ell ij}\left(\mathbf{X}\right) & \leftarrow & \frac{1}{K_{r}\times K_{c}}\sum_{m,n}x_{\ell,i_{m},j_{n}}\\
\left[\nabla_{\mathbf{X}}^{J}\right]_{\ell ij} & \leftarrow & \left[\nabla_{\mathbf{X}}^{J}\right]_{\ell ij}+\frac{1}{K_{r}\times K_{c}}\left[\nabla_{\mathbf{n}}^{J}\right]_{\ell ij},
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard
where 
\begin_inset Formula $i_{m}=i\times S_{r}+m$
\end_inset

, and 
\begin_inset Formula $j_{n}=j\times S_{c}+n$
\end_inset

.
 
\end_layout

\begin_layout Section
Recurrent Connections
\end_layout

\begin_layout Standard
In the above sections, we assumed that the CN is a DAG.
 However, when there are recurrent connections in the CN, this assumption
 is no longer true.
 The recurrent connection
\begin_inset Index idx
status open

\begin_layout Plain Layout
Recurrent ! connection
\end_layout

\end_inset

 can be implemented using a 
\emph on
Delay
\begin_inset Index idx
status open

\begin_layout Plain Layout
Delay
\end_layout

\end_inset


\emph default
 node that retrieves the value 
\begin_inset Formula $\lambda$
\end_inset

 samples to the past where each column of 
\begin_inset Formula $\mathbf{Y}$
\end_inset

 is a separate sample stored in the ascending order of time.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{\boldsymbol{\upsilon}}_{.j}\left(\mathbf{\lambda},\mathbf{Y}\right) & \leftarrow & \mathbf{Y}_{\mathit{.\left(j-\lambda\right)}}\\
\left[\nabla_{\mathbf{\mathbf{Y}}}^{J}\right]_{.j} & \leftarrow & \left[\nabla_{\mathbf{\mathbf{Y}}}^{J}\right]_{.j}+\left[\nabla_{\mathbf{\mathbf{n}}}^{J}\right]_{.j+\lambda}.
\end{eqnarray}

\end_inset

 When 
\begin_inset Formula $j-\lambda<0$
\end_inset

 some default values need to be set for 
\begin_inset Formula $\mathbf{Y}_{\mathit{.\left(j-\lambda\right)}}$
\end_inset

.
 The gradient can be derived by observing
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial\upsilon_{mn}}{\partial y_{ij}}=\begin{cases}
1 & m=i\wedge n=j+\lambda\\
0 & else
\end{cases}
\end{equation}

\end_inset

and
\end_layout

\begin_layout Standard

\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula 
\begin{equation}
\frac{\partial J}{\partial y_{ij}}=\sum_{m,n}\frac{\partial J}{\partial\upsilon_{mn}}\frac{\partial\upsilon_{mn}}{\partial y_{ij}}=\frac{\partial J}{\partial\upsilon{}_{ij+\lambda}}.
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
An example CN that contains a delay node is shown in Figure 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CN-WithDelayNode"

\end_inset

.
 Different from the CN without a directed loop, a CN with a loop cannot
 be computed for a sequence of samples as a batch since the next sample's
 value depends on the previous samples.
 A simple way to do forward computation and backpropagation in a recurrent
 network is to unroll all samples in the sequence over time.
 Once unrolled, the graph is expanded into a DAG and the forward computation
 and gradient calculation algorithms we just discussed can be directly
 used.
 This means, however, all computation nodes in the CN need to be computed
 sample by sample and this significantly reduces the potential of parallelizatio
n.
\end_layout

\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open

\begin_layout Plain Layout
\align center
\begin_inset Graphics
	filename ../figures/CN-WithDelayNode.png
	scale 70

\end_inset


\begin_inset Caption Standard

\begin_layout Plain Layout
\begin_inset CommandInset label
LatexCommand label
name "fig:CN-WithDelayNode"

\end_inset

An example CN with a delay node.
 The shaded nodes form a recurrent loop which can be treated as a composite
 node.
\end_layout

\end_inset


\end_layout

\end_inset


\end_layout

\begin_layout Standard
There are two approaches to speedup the computation of a CN with directed
 loops.
 In the next two subsections, we will discuss them.
\end_layout

\begin_layout Subsection
Sample by Sample Processing Only Within Loops
\end_layout

\begin_layout Standard
The first approach identifies the loops in the CN and only applies the sample-by
-sample computation for nodes inside the loops.
 For the rest of the computation nodes all samples in the sequence can be
 computed in parallel as a single matrix operation.
 For example, in Figure 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CN-WithDelayNode"

\end_inset

 all the nodes included in the loop of 
\begin_inset Formula $\mathbf{T}^{(3)}\rightarrow\mathbf{P}^{(3)}\rightarrow\mathbf{S}^{(1)}\rightarrow\mathbf{D}\rightarrow\mathbf{T}^{(3)}$
\end_inset

 need to be computed sample by sample.
 All the rest of the nodes can be computed in batches.
 A popular technique is to identify the strongly connected components (SCC)
 
\begin_inset Index idx
status open

\begin_layout Plain Layout
strongly connected components
\end_layout

\end_inset

 in the graph, in which there is a path between each pair of vertices and
 adding any edges or vertices would violate this property, using algorithms
 such as Tarjan's strongly connected components algorithm 
\begin_inset CommandInset citation
LatexCommand cite
key "StronglyConnectedComponents-Tarjan-1972"

\end_inset


\begin_inset Foot
status open

\begin_layout Plain Layout
Tarjan's algorithm is favored over others such as Kosaraju's algorithm 
\begin_inset CommandInset citation
LatexCommand cite
key "StronglyConnectedComponents-Hopcroft+1983"

\end_inset

 since it only requires one depth-first traverse, has a complexity of 
\begin_inset Formula $O\left(\mathbf{\mathbb{\left|V\right|}}+\left|\mathbf{\mathbb{E}}\right|\right)$
\end_inset

, and does not require reversing arcs in the graph.
\end_layout

\end_inset

.
 Once the loops are identified, they can be treated as a composite node
 in the CN and the CN is reduced to a DAG.
 All the nodes inside each loop (or composite node) can be unrolled over
 time and also reduced to a DAG.
 For all these DAGs the forward computation and backpropagation algorithms
 we discussed in the previous sections can be applied.
 The detailed procedure in determining the forward computation order in
 the CN with arbitrary recurrent connections is described in Algorithm 
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:Forward-computation-of-arbitrary-CN"

\end_inset

.
 Since the input to the delay nodes are computed in the past they can be
 considered as the leaf if we only consider one time slice.
 This makes the order decision inside loops much easier.
\end_layout

\begin_layout Standard
\begin_inset Float algorithm
wide false
sideways false
status open

\begin_layout Plain Layout
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{algorithmic}[1] 
\end_layout

\begin_layout Plain Layout


\backslash
Procedure{DecideForwardComputationOrderWithReccurentLoop}{$G=(V,E)$} 
\backslash
State{StronglyConnectedComponentsDetection($G,G'$)} 
\end_layout

\begin_layout Plain Layout


\backslash
Comment{$G'$ is a DAG of strongly connected components (SCC)} 
\end_layout

\begin_layout Plain Layout


\backslash
State{Call DecideForwardComputationOrder on $G'$ $
\backslash
rightarrow$ $order$ for DAG} 
\end_layout

\begin_layout Plain Layout


\backslash
For {$v
\backslash
in G, v
\backslash
in V$} 
\end_layout

\begin_layout Plain Layout


\backslash
State{Set the $order$ of $v$ equal the max order of the SCC $V$} 
\end_layout

\begin_layout Plain Layout


\backslash
Comment{This guarantee the forward order of SCC is correct} 
\end_layout

\begin_layout Plain Layout


\backslash
EndFor 
\end_layout

\begin_layout Plain Layout


\backslash
For {each SCC $V$ in G} 
\end_layout

\begin_layout Plain Layout


\backslash
State{Call GetLoopForwardOrder($root$ of V)} 
\end_layout

\begin_layout Plain Layout

$
\backslash
rightarrow$ $order$ for each SCC 
\end_layout

\begin_layout Plain Layout


\backslash
EndFor 
\end_layout

\begin_layout Plain Layout


\backslash
State{
\backslash
Return {$order$ for DAG and $order$ for each SCC (loop)}} 
\end_layout

\begin_layout Plain Layout


\backslash
EndProcedure
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout


\backslash
Procedure{GetLoopForwardOrder}{$root$} 
\end_layout

\begin_layout Plain Layout


\backslash
State{Treat all the $delayNode$ as leaf and Call DecideForwardComponentionOrder}
 
\end_layout

\begin_layout Plain Layout


\backslash
EndProcedure
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout


\backslash
Procedure{StronglyConnectedComponentsDetection}{$G=(V,E),DAG$}
\end_layout

\begin_layout Plain Layout


\backslash
State{$index=0, S=empty$} 
\end_layout

\begin_layout Plain Layout


\backslash
For {$v 
\backslash
in V$} 
\end_layout

\begin_layout Plain Layout


\backslash
If{$v.index$ is undefined} 	
\end_layout

\begin_layout Plain Layout

StrongConnectComponents($v,DAG$) 
\end_layout

\begin_layout Plain Layout


\backslash
EndIf 
\end_layout

\begin_layout Plain Layout


\backslash
EndFor 
\end_layout

\begin_layout Plain Layout


\backslash
EndProcedure
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout


\backslash
Procedure{StrongConnectComponent}{$v, DAG$} 
\end_layout

\begin_layout Plain Layout


\backslash
State{$v.index=index,v.lowlink=index,index= index+1$} 
\end_layout

\begin_layout Plain Layout


\backslash
State{$S.push(v)$} 
\end_layout

\begin_layout Plain Layout


\backslash
For {$(v,w) 
\backslash
in E$} 
\end_layout

\begin_layout Plain Layout


\backslash
If {$w.index$ is undefined} 
\end_layout

\begin_layout Plain Layout


\backslash
State{StrongConnectComponent($w$)} 
\end_layout

\begin_layout Plain Layout


\backslash
State{$v.lowlink=
\backslash
min(v.lowlink, w.lowlink)$} 
\end_layout

\begin_layout Plain Layout


\backslash
ElsIf{$w
\backslash
in S$} 
\end_layout

\begin_layout Plain Layout


\backslash
State{$v.lowlink=
\backslash
min(v.lowlink,w.index)$} 
\end_layout

\begin_layout Plain Layout


\backslash
EndIf 
\backslash
EndFor
\end_layout

\begin_layout Plain Layout


\backslash
If {$v.lowlink=v.index$}
\end_layout

\begin_layout Plain Layout


\backslash
Comment{If v is a root node, pop the stack and generate an SCC}
\end_layout

\begin_layout Plain Layout


\backslash
State{start a new strongly connected component} 
\end_layout

\begin_layout Plain Layout


\backslash
Repeat 
\end_layout

\begin_layout Plain Layout


\backslash
State{$w=S.pop()$} 
\end_layout

\begin_layout Plain Layout


\backslash
State{add $w$ to current strongly connected component} 
\end_layout

\begin_layout Plain Layout


\backslash
Until{$w==v$} 
\end_layout

\begin_layout Plain Layout


\backslash
State{Save current strongly connected component to $DAG$} 
\end_layout

\begin_layout Plain Layout


\backslash
EndIf 
\end_layout

\begin_layout Plain Layout


\backslash
EndProcedure 
\end_layout

\begin_layout Plain Layout


\backslash
end{algorithmic}
\end_layout

\end_inset


\end_layout

\begin_layout Plain Layout
\begin_inset Caption Standard

\begin_layout Plain Layout
Forward computation of an arbitrary CN
\begin_inset CommandInset label
LatexCommand label
name "alg:Forward-computation-of-arbitrary-CN"

\end_inset


\end_layout

\end_inset


\end_layout

\end_inset


\end_layout

\begin_layout Subsection
Processing Multiple Utterances Simultaneously
\end_layout

\begin_layout Standard
The second approach to speed up the processing in the recurrent CN is to
 process multiple sequences at a time.
 To implement this, we need to organize sequences in a way that the frames
 with the same frame id from different sequences are grouped together as
 shown in Figure 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CN-SequenceBatch"

\end_inset

.
 By organizing sequences in this way we can compute frames from different
 sequences in batch when inside a loop and compute all samples in all utterances
 in one batch outside loops.
 For example, in Figure 
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:CN-SequenceBatch"

\end_inset

 we can compute 4 frames together for each time step.
 If sequences have different lengths, we can truncate them to the same length
 and save the final state of the sequences that are not finished yet.
 The remaining frames can be grouped with other sequences for further processing.
\end_layout

\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open

\begin_layout Plain Layout
\align center
\begin_inset Graphics
	filename ../figures/SequenceBatch.png
	lyxscale 80
	scale 70

\end_inset


\begin_inset Caption Standard

\begin_layout Plain Layout
\begin_inset CommandInset label
LatexCommand label
name "fig:CN-SequenceBatch"

\end_inset

Process multiple sequences in a batch.
 Shown in the figure is an example with 4 sequences.
 Each color represents one sequence.
 Frames with the same frame id from different sequences are grouped together
 and computed in batch.
 The right matrix is a reshape of the left matrix.
\end_layout

\end_inset


\end_layout

\end_inset


\end_layout

\begin_layout Subsection
Building Arbitrary Recurrent Neural Networks
\begin_inset Index idx
status open

\begin_layout Plain Layout
Recurrent ! neural network (RNN)
\end_layout

\end_inset


\end_layout

\begin_layout Standard
With the inclusion of delay nodes, we can easily construct complicated recurrent
 networks and dynamic systems.
 For example, the long-short-term-memory
\begin_inset Index idx
status open

\begin_layout Plain Layout
long-short-term memory
\end_layout

\end_inset

 (LSTM
\begin_inset Index idx
status open

\begin_layout Plain Layout
LSTM
\end_layout

\end_inset

) 
\begin_inset CommandInset citation
LatexCommand cite
key "LSTM-hochreiter:1997,LSTM-GenSequence-Graves-2013"

\end_inset

 neural network that is widely used to recognize and generate hand-written
 characters involves the following operations:
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{i}_{t} & = & \sigma\left(\mathbf{W}^{(xi)}\mathbf{x}_{t}+\mathbf{W}^{(hi)}\mathbf{h}_{t-1}+\mathbf{W}^{(ci)}\mathbf{c}_{t-1}+\mathbf{b}^{(i)}\right)\\
\mathbf{f}_{t} & = & \sigma\left(\mathbf{W}^{(xf)}\mathbf{x}_{t}+\mathbf{W}^{(hf)}\mathbf{h}_{t-1}+\mathbf{W}^{(cf)}\mathbf{c}_{t-1}+\mathbf{b}^{(f)}\right)\\
\mathbf{c}_{t} & = & \mathbf{f}_{t}\bullet\mathbf{c}_{t-1}+\mathbf{i}_{t}\bullet\mathrm{tanh}\left(\mathbf{W}^{(xc)}\mathbf{x}_{t}+\mathbf{W}^{(hc)}\mathbf{h}_{t-1}+\mathbf{b}^{(c)}\right)\\
\mathbf{o}_{t} & = & \sigma\left(\mathbf{W}^{(xo)}\mathbf{x}_{t}+\mathbf{W}^{(ho)}\mathbf{h}_{t-1}+\mathbf{W}^{(co)}\mathbf{c}_{t}+\mathbf{b}^{(o)}\right)\\
\mathbf{h}_{t} & = & \mathbf{o}_{t}\bullet\mathrm{tanh}\left(\mathbf{c}_{t}\right),
\end{eqnarray}

\end_inset

where 
\begin_inset Formula $σ\left(.\right)$
\end_inset

 is the logistic sigmoid function, 
\begin_inset Formula $\mathbf{i}_{t}$
\end_inset

, 
\begin_inset Formula $\mathbf{f}_{t}$
\end_inset

, 
\begin_inset Formula $\mathbf{o}_{t}$
\end_inset

, 
\begin_inset Formula $\mathbf{c}_{t}$
\end_inset

 and 
\begin_inset Formula $\mathbf{h}_{t}$
\end_inset

 are vectors with same size (since there is one and only one of each in
 every cell) to represent values at time 
\begin_inset Formula $t$
\end_inset

 of the input gate, forget gate, output gate, cell, cell input activation,
 and hidden layer, respectively, 
\begin_inset Formula $\mathbf{W}$
\end_inset

's are the weight matrices connecting different gates, and 
\begin_inset Formula $\mathbf{b}$
\end_inset

's are the corresponding bias vectors.
 All the weight matrices are full except the weight matrix 
\begin_inset Formula $\mathbf{W}^{(ci)}$
\end_inset

 from the cell to gate vectors which is diagonal.
 It is obvious that the whole LSTM can be described as a CN with following
 node types: 
\emph on
Times
\emph default
, 
\emph on
Plus
\emph default
, 
\emph on
Sigmoid
\emph default
, 
\emph on
Tanh
\emph default
, 
\emph on
DiagTimes
\emph default
, 
\emph on
ElementTimes
\emph default
 and 
\emph on
Delay
\emph default
.
 More specifically, using these computational nodes, the LSTM can be described
 as:
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{H}^{(d)} & = & Delay\left(\mathbf{H}\right)\\
\mathbf{C}^{(d)} & = & Delay\left(\mathbf{C}\right)\\
\mathbf{T}^{\left(1\right)} & = & Macro2W1b(\mathbf{W}^{(xi)},\mathbf{X},\mathbf{W}^{(hi)},\mathbf{H}^{(d)},\mathbf{b}^{(i)})\\
\mathbf{I} & = & Sigmoid\left(Plus\left(\mathbf{T}^{\left(1\right)},DiagTimes\left(\mathbf{d}^{(ci)},\mathbf{C}^{(d)}\right)\right)\right)\\
\mathbf{T}^{\left(2\right)} & = & Macro3W1b\left(\mathbf{W}^{(xf)},\mathbf{X},\mathbf{W}^{(hf)},\mathbf{H}^{(d)},\mathbf{W}^{(cf)}\mathbf{C}^{(d)},\mathbf{b}^{(f)}\right)\\
\mathbf{F} & = & Sigmoid\left(\mathbf{T}^{\left(2\right)}\right)\\
\mathbf{T}^{\left(3\right)} & = & Macro2W1b\left(\mathbf{W}^{(xc)},\mathbf{X},\mathbf{W}^{(hc)},\mathbf{H}^{(d)},\mathbf{b}^{(c)}\right)\\
\mathbf{T}^{\left(4\right)} & = & ElementTime\left(\mathbf{F},\mathbf{C}^{(d)}\right)\\
\mathbf{T}^{\left(5\right)} & = & ElementTimes\left(\mathbf{I},Tanh\left(\mathbf{T}^{\left(3\right)}\right)\right)\\
\mathbf{C} & = & Plus\left(\mathbf{T}^{\left(4\right)},\mathbf{T}^{\left(5\right)}\right)\\
\mathbf{T}^{\left(6\right)} & = & Macro3W1b\left(\mathbf{W}^{(xo)},\mathbf{X},\mathbf{W}^{(ho)},\mathbf{H}^{(d)},\mathbf{W}^{(co)}\mathbf{C},\mathbf{b}^{(o)}\right)\\
\mathbf{O} & = & Sigmoid\left(\mathbf{T}^{\left(6\right)}\right)\\
\mathbf{H} & = & ElementTime\left(\mathbf{\mathbf{O}},Tanh\left(\mathbf{C}\right)\right),
\end{eqnarray}

\end_inset

where we have defined the macro 
\begin_inset Formula $Macro2W1b\left(\mathbf{W}^{\left(1\right)},\mathbf{I}^{\left(1\right)},\mathbf{W}^{\left(2\right)},\mathbf{I}^{\left(2\right)},\mathbf{b}\right)$
\end_inset

 as
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{S}^{\left(1\right)} & = & Plus\left(Times\left(\mathbf{W}^{\left(1\right)},\mathbf{I}^{\left(1\right)}\right),Times\left(\mathbf{W}^{\left(2\right)},\mathbf{I}^{\left(2\right)}\right)\right)\\
Macro2W1b & = & Plus\left(\mathbf{S}^{\left(1\right)},\mathbf{b}\right)
\end{eqnarray}

\end_inset

and macro 
\begin_inset Formula $Macro3W1b\left(\mathbf{W}^{\left(1\right)},\mathbf{I}^{\left(1\right)},\mathbf{W}^{\left(2\right)},\mathbf{I}^{\left(2\right)},\mathbf{\mathbf{W}^{\mathit{\left(3\right)}},\mathbf{I}^{\mathit{\left(3\right)}},b}\right)$
\end_inset

 as
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{eqnarray}
\mathbf{S}^{\left(1\right)} & = & Macro2W1b\left(\mathbf{W}^{\left(1\right)},\mathbf{I}^{\left(1\right)},\mathbf{W}^{\left(2\right)},\mathbf{I}^{\left(2\right)},\mathbf{b}\right)\\
Macro3W1b & = & Plus\left(\mathbf{S}^{\left(1\right)},Times\left(\mathbf{W}^{\left(3\right)},\mathbf{I}^{\left(3\right)}\right)\right)
\end{eqnarray}

\end_inset


\end_layout

\begin_layout Standard

\end_layout

\end_body
\end_document
Browse the archive

https://github.com/Microsoft/CNTK