%!TEX root = lecture_NN.tex
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section {The perceptron}
Let's suppose, as with regression, that we are given $(\tb{x}_i,y_i)$, $i=1,\dots,m$, the data points and labels. This is a classification problem with two classes $y\in\{-1,1\}$.
Like with regression we are looking for a linear predictor (classifier)
\[
h_\tb{w}(\tb{x}) = \tb{x}\tb{w} = \sum_{d=0}^D {x^dw^d}
\]
(we added the $x^0=1$ component so we can get the free term $w^0$) such that $h_\tb{w}(\tb{x})\geq 0$ when $y=1$ and $h_\tb{w}(\tb{x})\leq 0$ when $y=-1$.
On $y=-1$ data points: given that all \tb{x} and $y$ are numerical, we will make the following transformation: when $y=-1$, we will reverse the sign of the input; that is, replace \tb{x} with $-\tb{x}$ and $y$ with $-y$. Then the condition $h_\tb{w}(\tb{x})\leq 0$ becomes $h_\tb{w}(\tb{x})\geq 0$ for all data points.
\begin{figure}[h!]
\begin{center}
\includegraphics[width=0.8\columnwidth]{make_all_positive}
\vspace{-3ex}
\end{center}
\caption{transforming $y=-1$ datapoints into $y=1$ datapoints} \label{all_positive}
\end{figure}
The perceptron objective function is a combination of the number of misclassified points and how bad each misclassification is.
\[
J(\tb{w}) = \sum_{\tb{x}\in M}{-h_\tb{w}(\tb{x})} = \sum_{\tb{x}\in M}{-\tb{x}\tb{w}}
\]
where $M$ is the set of misclassified data points. Note that each term of the sum is positive, since misclassification implies $\tb{x}\tb{w} <0$. Using gradient descent, we first differentiate $J$
\[
\nabla_\tw J(\tb{w}) = \sum_{\tx \in M}{-\tx^T}
\]
then we write down the gradient descent update rule
\[
\tw := \tw + \lambda \sum_{\tx \in M}{\tx^T}
\]
($\lambda$ is the learning rate). The batch version looks like\\
\\
1. init \tw \\
2. LOOP\\
3. \hspace{8ex} get $M$ = set of misclassified data points\\
4. \hspace{8ex} $\tw = \tw + \lambda \sum_{\tx \in M}{\tx^T}$\\
5. UNTIL $|\lambda \sum_{\tx \in M}{\tx}|<\epsilon $\\
Assume the instances are linearly separable. Then we can modify the algorithm\\
\\
1. init \tw \\
2. LOOP\\
3. \hspace{8ex} get $M$ = set of misclassified data points\\
4. \hspace{8ex} for each $\tx \in M$ do $\tw = \tw + \lambda \tx^T$\\
5. UNTIL $M$ is empty\\
\begin{figure}[h!]
\begin{center}
\includegraphics[width=0.8\columnwidth]{perceptPict-1}
\vspace{-3ex}
\end{center}
\caption{perceptron update: the plane normal $w$ moves in the direction of misclassified $x$ until the $x$ is on the correct side.} \label{perceptron_update}
\end{figure}
Intuitively, the update $w^{new}=w^{old} + x$ for misclassified points $x$ is the following: if $x$ is on the wrong side of the plane $=0$, it means that the normal vector to the plane, $w$, is on the opposite side to $x$. The update essentially moves $w$ in the direction of $x$; as long as $x$ remains on the wrong side, $w$ moves towards it until $w$ and $x$ are on the same side of the plane (thus $x$ is correctly classified).
\textbf{Proof of perceptron convergence}
Assume the data is linearly separable, i.e., there is a solution $\bar{\tw}$ such that $\tx\bar{\tw}>0$ for all \tx.\\
Lets call $\tw_k$ the $\tw$ obtained at the $k$-th iteration (update). Fix an $\alpha>0$. Then
\[
\tw_{k+1} - \alpha\bar{\tw} = (\tw_k - \alpha\bar{\tw}) + \tx_k^T
\]
where $\tx_k$ is the datapoint that updated $\tw$ at iteration $k$.
Then
\[
||\tw_{k+1}-\alpha\bar{\tw}||^2 = ||\tw_{k}-\alpha\bar{\tw}||^2 + 2\tx_k(\tw_k-\alpha\bar{\tw}) + ||\tx_k||^2 \leq
||\tw_{k}-\alpha\bar{\tw}||^2 - 2\tx_k\alpha\bar{\tw} + ||\tx_k||^2
\]
The inequality holds because $\tx_k\tw_k\leq 0$ ($\tx_k$ was misclassified at iteration $k$). Since $\tx_k\bar{\tw}>0$, all we need is an $\alpha$ sufficiently large so that each update decreases $||\tw_{k}-\alpha\bar{\tw}||^2$ by a fixed positive amount; hence this update process cannot go on forever. When it stops, all datapoints must be classified correctly.
\begin{figure}[h!]
\begin{center}
\includegraphics[width=0.8\columnwidth]{img100}
\vspace{-3ex}
\end{center}
\caption{bias unit} \label{bias}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Multilayer perceptrons}
\begin{figure}[h!]
\begin{center}
\includegraphics[width=0.7\columnwidth]{img101}
\vspace{-3ex}
\end{center}
\caption{multilayer perceptron} \label{multilayer_perceptron}
\end{figure}
\newpage
\subsection{More than linear functions, example: XOR}
\begin{center}
\begin{figure}[h!]
\includegraphics[width=0.8\columnwidth]{img099}
\caption{XOR NNet} \label{XOR_nnet}
\end{figure}
\end{center}
\vspace{-3ex}
Perceptrons have been shown to have limited processing power. The decision boundary of a perceptron is a line (hyperplane), which means perceptrons can only classify objects that are linearly separable. Early researchers discovered that perceptrons are not able to learn an XOR function. As can be seen in Figure~\ref{XOR_nnet}, there is no single line that can separate the red points from the black points. Since XOR is an important function in circuit design, this shows the limited abilities of perceptrons in practice.
However, perceptrons can implement {\bf AND}, {\bf OR} and {\bf NOT} gates fairly easily, since the corresponding problems are linearly separable. We know {\bf XOR} gates can be constructed using {\bf AND}, {\bf OR} and {\bf NOT} gates:
\[
XOR(x,y)=OR(x,y) ~AND~ (NOT (AND(x,y)))
\]
This gives us the hint that by composing perceptrons together, we can get greater processing power. This leads to multi-layer neural networks (also called multi-layer perceptrons).
\subsection{ Construction and structure of NNets}
Typical neural networks have the following structure.
\begin{figure}[h!]
\begin{center}
\includegraphics[width=0.8\columnwidth]{img102}
\vspace{-3ex}
\end{center}
\caption{NNet fully connected} \label{nnet}
\end{figure}
This is a 3-layer neural network. It consists of one input layer, one hidden layer and one output layer. Each node in the input layer represents a component of the feature vector. Each hidden unit performs the weighted sum of inputs to form a net activation:
\[net_j = \sum _{i=1}^d x_i w_{ji} +w_{j0}=\sum _{i=0} ^d x_i w_{ji} ={ \tw_j^t \tx}
\]
where $w_{ji}$ denotes the weight between input node $i$ and hidden node $j$. Each hidden unit emits an output
\[y_j = f(net_j)
\]
Each output unit computes a net activation based on hidden unit outputs
\[net_k = \sum _{j=1}^{n_H} y_jw_{kj} +w_{k0} = \sum_{j=0}^{n_H}y_jw_{kj} = \tw_k^t \mathbf {y},
\]
and emits an output
\[z_k = f(net_k)
\]
For a 3-layer neural network, the outputs can be written as
\[
g_k(\tx) = z_k = f\left(\sum_j {w_{kj} f\left(\sum_i{w_{ji} x_i+ w_{j0}} \right) +w_{k0} }\right) = F\left( F(\tx\tw_j)\tw_k \right)
\]
Given these discriminant functions, we can predict each datapoint's label as
\[\arg\max_k g_k(x)
\]
\subsection{ Kolmogorov theorem, expressive power of NNet}
Any function $g$ can be written
\[
g(\tx) = \sum_j \Xi_j\left(\sum_d\Psi_{dj}(x^d)\right)
\]
but there is no practical way to use this theorem. Usually $\Xi$ and $\Psi$ are very complex and not smooth.
It can be shown that neural networks are universal approximators: a feed-forward network with a single hidden layer containing a finite number of neurons can approximate any continuous function, under mild assumptions.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Training, Error backpropagation}
In this section, we'll discuss how to learn a multi-layer neural network using a special kind of gradient descent algorithm -- the back-propagation algorithm.
Similarly to linear regression, we try to minimize the squared error between the true labels and the predicted values. To make gradient descent easier, we often choose the sigmoid (logistic) function as the activation function, because it is differentiable.
Let's consider a 3-layer neural network:
- error
For one datapoint x:
\[
J(w)=\frac{1}{2}\sum_k(t_k-z_k)^2
\]
\[
\Delta w_{pq}=-\lambda \frac{\partial J}{\partial w_{pq}},
\]
where $\lambda$ is the learning rate.
- propagation to last set of weights (close to output)
\[
\frac{\partial J}{\partial w_{kj}}=\frac{\partial J}{\partial net_k}\frac{\partial net_k}{\partial w_{kj}}=-\delta_k \frac{\partial net_k}{\partial w_{kj}}
\]
\[
\delta_k = -\frac{\partial J}{\partial net_k}=-\frac{\partial J}{\partial z_k}\frac{\partial z_k}{\partial net_k} = (t_k-z_k)f'(net_k)
\]
\[
\frac{\partial net_k}{\partial w_{kj}}=y_j
\]
So we get the update rule for hidden-to-output weights:
\[
w_{kj}=w_{kj}+\lambda(t_k-z_k)f'(net_k)y_j
\]
- propagation to first set of weights (close to input)
Using chain rule, we get
\[
\frac{\partial J}{\partial w_{ji}}=\frac{\partial J}{\partial y_j}\frac{\partial y_j}{\partial net_j}\frac{\partial net_j}{\partial w_{ji}}
\]
\begin{align*}
\frac{\partial J}{\partial y_j}=&\frac{\partial [\frac{1}{2}\sum_{k}(t_k-z_k)^2]}{\partial y_j}\\
=&-\sum_k(t_k-z_k)\frac{\partial z_k}{\partial net_k}\frac{\partial net_k}{\partial y_j}\\
=&-\sum_k(t_k-z_k)f'(net_k)w_{kj}
\end{align*}
\[
\frac{\partial y_j}{\partial net_j}=f'(net_j)
\]
\[
\frac{\partial net_j}{\partial w_{ji}}=x_i
\]
\[
w_{ji}\leftarrow w_{ji}+\lambda[\sum_k(t_k-z_k)f'(net_k)w_{kj}]f'(net_j)x_i
\]
- stochastic VS batch
Based on the above derivation, we get the following two algorithms:
\vbox{%
\begin{tabbing}
12 \= 12 \= 12 \= 12 \= 12 \= 12 \= 12 \= \kill
\sc{Stochastic training}\\
Select $x_t$ (randomly chosen)\\
\> $w_{ij}=w_{ij}+\lambda \delta_j x_i$\\
\> $w_{jk}=w_{jk}+\lambda \delta_ky_j$\\
until $||\nabla _w J||<\epsilon$
\end{tabbing}}
\vbox{%
\begin{tabbing}
12 \= 12 \= 12 \= 12 \= 12 \= 12 \= 12 \= \kill
\sc{Batch training}\\
for each iteration:\\
\> for each $x_t$\\
\> \> $\delta w_{ij} = \delta w_{ij}+\lambda \delta_j x_i$\\
\> \> $\delta w_{jk} = \delta w_{jk}+\lambda \delta_k y_j$\\
\> $w_{ij}\leftarrow w_{ij}+\delta w_{ij}$\\
\> $w_{jk}\leftarrow w_{jk}+\delta w_{jk}$\\
until $||\nabla _w J||<\epsilon$
\end{tabbing}}
\section{Toy Example}
\begin{figure}[h!]
\begin{center}
\includegraphics[width=1.0\columnwidth]{toyexample}
\vspace{-3ex}
\end{center}
\caption{NN Example} \label{toy}
\end{figure}
Now we will give an example of feed forward and back propagation in a neural network, under the following assumptions:
\begin{itemize}
\item Two inputs $x^1 = -3$ and $x^2 = 1$.
\item One output $y = 0$.
\item We need to learn 9 parameters $w^{ij}$, for nodes $i \in \{1, 2, 3\}$ and inputs $j \in \{0, 1, 2\}$. For each node $i$, the bias weight $w^{i0}$ multiplies an implicit constant input $x^{i0} = 1$. We initialize each $w^{ij} = i$.
\item Let $s_i$ be output of $\Sigma$ for node $i$.
\item Let $o_i$ be output of sigmoid $\int$ for node $i$.
\item Add $p^{ij} = w^{ij} * (input)$.
\item For each $\delta_z = \frac{\partial E}{\partial z}$.
\end{itemize}
With these assumptions in place, we can now walk through the feed-forward and back-propagation computations.
\subsection{Feed Forward}
\begin{itemize}
\item $s_1 = w^{10} \cdot x^0 + w^{11} \cdot x^1 + w^{12} \cdot x^2 = p^{10} + p^{11} + p^{12} = 1\cdot 1 + 1 \cdot (-3) + 1 \cdot 1 = -1$
\item $s_2 = w^{20} \cdot x^0 + w^{21} \cdot x^1 + w^{22} \cdot x^2 = p^{20} + p^{21} + p^{22} = 2\cdot 1 + 2 \cdot (-3) + 2 \cdot 1 = -2$
\item $o_1 = \frac{1}{1 + e^{-s_1}} = 0.268941421$
\item $o_2 = \frac{1}{1 + e^{-s_2}} = 0.119202922$
\item $s_3 = w^{30} \cdot x^0 + w^{31} \cdot o_1 + w^{32} \cdot o_2 = p^{30} + p^{31} + p^{32} = 4.164433029$
\item $o_3 = \frac{1}{1 + e^{-s_3}} = 0.984699229$
\item $E = (y - o_3)^2 = 0.969632521$
\end{itemize}
\subsection{Back Propagation}
\begin{itemize}
\item $\delta_{o_{3}} = \frac{\partial E}{\partial o_3} = 2 \cdot (y - o_3) \cdot (-1) = 1.969398458$
\item $\delta_{s_{3}} = \frac{\partial E}{\partial s_3} = \frac{\partial E}{\partial o_3} \cdot \frac{\partial o_3}{\partial s_3} = \delta_{o_{3}} \cdot o_3 (1- o_3) = 0.029672351$
\item $\delta_{p^{30}} = \frac{\partial E}{\partial p^{30}} = \frac{\partial E}{\partial s_3} \cdot \frac{\partial s_3}{\partial p^{30}} = \delta_{s_{3}} \cdot 1 = \delta_{s_{3}}$
\item $\delta_{p^{31}} = \frac{\partial E}{\partial p^{31}} = \frac{\partial E}{\partial s_3} \cdot \frac{\partial s_3}{\partial p^{31}} = \delta_{s_{3}} \cdot 1 = \delta_{s_{3}} $
\item $\delta_{p^{32}} = \frac{\partial E}{\partial p^{32}} = \frac{\partial E}{\partial s_3} \cdot \frac{\partial s_3}{\partial p^{32}} = \delta_{s_{3}} \cdot 1 = \delta_{s_{3}} $
\item $\delta_{w^{30}} = \frac{\partial E}{\partial p^{30}} \cdot \frac{\partial p^{30}}{\partial w^{30}} = \delta_{p^{30}} \cdot 1 = 0.029672351$
\item $\delta_{w^{31}} = \frac{\partial E}{\partial p^{31}} \cdot \frac{\partial p^{31}}{\partial w^{31}} = \delta_{p^{31}} \cdot o_1 = 0.007980097348$
\item $\delta_{w^{32}} = \frac{\partial E}{\partial p^{32}} \cdot \frac{\partial p^{32}}{\partial w^{32}} = \delta_{p^{32}} \cdot o_2 = 0.003537019022$
\item $\delta_{o_{1}} = \delta_{p^{31}} \cdot w^{31} = 0.089016753$
\item \dots
\item Then you can update the weights $w^{ij}$ by each $\delta$
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\section{How to improve backpropagation}
%\subsection {Activation function}
%
%- continuous, differentiable (smoothness)
%
%- nonlinear
%
%- saturation
%
%- monotonicity
%
%\subsection {Scale input}
%\subsection{Target values}
%
%\subsection {Noise}
%\subsection{Number of hidden units}
%
%\subsection {Weights initialization}
%\subsection {Learning rates}
%\begin{figure}[h!]
%\begin{center}
%\includegraphics[width=0.8\columnwidth]{learn_rates.jpg}
%\vspace{-3ex}
%\end{center}
%\caption{learning rates} \label{learn_rates}
%\end{figure}
%\newpage
%\subsection{ Construction and structure of NNets}
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\section {Network size and structure. Regularization}
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\section {Jacobian and Hessian}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%\section {}