\documentclass[11pt]{article}
\usepackage{fullpage}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{graphicx}
\usepackage[]{algorithm2e}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{array}
\usepackage{multirow}
\usepackage{enumitem}
%, amsthm, amssymb, bm, natbib, longtable}
\input{CS6140_Assignment_Style.tex}
\newcommand\MyBox[2]{
\fbox{\lower0.75cm
\vbox to 1.7cm{\vfil
\hbox to 2.5cm{\hfil\parbox{5.0cm}{#1\\#2}\hfil}
\vfil}%
}%
}
\begin{document}
% 1st argument is the assignment number
% 2nd argument is the assignment release date
% 3rd argument is the assignment due date
% 4th argument is the assignment topic
\myheader{7}{April, 16}{April, 26}{Boosting}
This is an optional assignment, and if you choose to submit a solution your final grade will be determined based on best six out of the seven assignments.
If you submit, you should be able to do the codewalk on or before April~30.
\section{Boosting}
Algorithm~\ref{algo:GenericBoost} lists a generic boosting algorithm.
Boosting relies on training a weak (base) classifier on weighted data.
In general, boosting provides a clean interface between boosting and the underlying weak learning algorithm: in each round, the boosting algorithm provides a weighted data set to the weak learner, and the weak learner provides a predictor in return.
You may choose to keep this clean interface (which would allow you to run boosting using any weak learning algorithm) or you may choose to more simply incorporate the weak learning algorithm inside your boosting code.
As boosting remains agnostic to the weak learner, it is known as a \textit{meta-learning} algorithm.
The weighting scheme (how $\alpha$ is defined in Algorithm~\ref{algo:GenericBoost}) can be varied to modify the overall boosting algorithm.
In this question you will implement AdaBoost which chooses $\alpha$ as:
\[
\alpha_{t}=\frac{1}{2}\ln{\frac{1+\gamma_{t}}{1-\gamma_{t}}}
\]
where $\gamma$ is the so-called \emph{edge} of $h_{t}(x)$, and defined as:
\[
\gamma_{t}=\sum_{i=1}^{N}D_{t}(i)y_{i}h_{t}(x_{i})
\]
\begin{algorithm}
\KwData{Training data: $(x_{1},y_{1}),\ldots,(x_{N},y_{N})$, where $x_{i}\in \mathbb{R}^{m}$ and $y_{i}\in \{-1,1\}$}
\KwResult{Boosted Classifier: $H(x)=\text{sign}(\sum_{t=1}^{T}{\alpha_{t}h_{t}(x)})$}
Initialize $D_{1}(i) \leftarrow \frac{1}{N}$\;
\For{$t=1$ \KwTo $T$}{
Train base classifier using distribution $D_{t}$\;
Get base classifier, $h_{t}:X \rightarrow \{-1,1\}$\;
Choose $\alpha_{t} \in \mathbb{R}$\;
Update: $D_{t+1}(i)=\frac{D_{t}(i)\exp(-\alpha_{t}y_{i}h_{t}(x_{i}))}{Z_{t}}$, where $Z_{t}$ is chosen so that $\sum_{i}D_{t+1}(i)=1$\;}
\caption{A generic algorithm for boosting.}
\label{algo:GenericBoost}
\end{algorithm}
\subsection{Decision Stumps}
Each predictor will correspond to a decision stump, which is just a feature-threshold pair. Note that for each feature $f_i$, you may have many possible thresholds which we shall denote as $\tau$.
Given an input instance to classify, a decision stump corresponding to feature $f_i$ and threshold $\tau$ will predict $+1$ if the input instance has a feature $f_i$ value greater than the threshold $\tau$; otherwise, it predicts $-1$.
\noindent \textbf{Determining suitable threshold: }
To create the various thresholds for each feature $f_i$, you should:
\begin{enumerate}
\item{sort the training examples by their $f_i$ values}
\item{remove duplicate values}
\item{construct thresholds that are midway between successive feature values}
\end{enumerate}
You should also add two thresholds for each feature: one below all observed values for that feature and one above all values for that feature. By removing duplicate values we are trying to cut down the number of thresholds we need to check. (\textit{Note:} The search procedure is the same as the one used for selecting a threshold when defining a binary split on a continuous feature in decision trees, except that here we are maximizing $\left|\frac{1}{2}-\mathit{error}(\tau)\right|$ instead of the information gain or Gini index.)
\noindent \textbf{Optimal Decision Stump: } The optimal threshold $\tau^{*}$ for feature $f_{i}$ corresponds to the threshold that maximizes:
\[
\tau^{*} =\operatorname*{argmax}_{\tau} \left|\frac{1}{2}-\mathit{error}(\tau)\right|
\]
where $\mathit{error}(\cdot)$ is calculated as the sum of the weights ($D_{t}(i)$) of the misclassified instances. Since we are dealing with absolute values, if one threshold has an error of $0.2$ and another threshold produces an error of $0.9$, we will prefer the latter because it has a larger difference from $0.5$. (For such decision stumps you would need to invert the prediction, so that for instances with a feature value greater than $\tau$ you predict $-1$ and $+1$ otherwise. You would need to remember this when you combine such decision stumps with others to evaluate $H(x)$.)
\subsection{Programming Tasks}
\begin{enumerate}
\item{Implement the AdaBoost algorithm, using optimal decision stumps.}
\item{Implement the AdaBoost algorithm, using random decision stumps (see below).}
\end{enumerate}
Random decision stumps on the other hand are completely random, where you will select the feature and threshold randomly.
\noindent Run your AdaBoost implementation on the Spambase, Diabetes and Breast Cancer datasets.
For each dataset partition it into a training set $(80\%)$ and a test set $(20\%)$.
After each round, you should compute:
\begin{enumerate}
\item{The local ``round'' error for the decision stump returned, i.e., the error using $h_{t}$ measured using $D_{t}$}
\item{The training error for the linear combination of decision stumps, i.e., the error using $H_{t}$ measured on training data (ignoring $D_{t}$)}
\item{The test error for the linear combination of decision stumps, i.e., the error using $H_{t}$ measured on test data}
\end{enumerate}
\subsection{Deliverables:}
\begin{enumerate}
\item {\textit{Boosting with optimal decision stumps:}
For each dataset provide three plots that show the local error, training error and test error.}
\item{\textit{Boosting with random decision stumps:}
For each dataset provide three plots that show the local error, training error and test error.}
\item{In both cases, how did you determine that boosting has converged, i.e., how did you set the parameter $T$ in Algorithm~\ref{algo:GenericBoost}?}
\end{enumerate}
\end{document}