\documentclass[11pt]{article}
\usepackage{amssymb} %maths
\usepackage{amsmath} %maths
\newcommand{\twopartdef}[4]
{
\left\{
\begin{array}{ll}
#1 & \mbox{if } #2 \\
#3 & \mbox{if } #4
\end{array}
\right.
}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\E}{\mathbb{E}}
\title{EM for Gaussian Mixture Models}
\date{October 17, 2013}
\author{Jesse Anderton\\ Northeastern University}
\begin{document}
\maketitle
\section{Problem}
\begin{itemize}
\item You have $n$ data samples with $d$ real-valued features in each sample
\item You want to find clusters of data points, but you also want a probabilistic description of each cluster
\begin{itemize}
\item Maybe you want to assume your data set is representative of future data, and quickly decide which cluster future data points fit in
\item Maybe you want to look at the correlations between your features on a cluster by cluster basis
\item Maybe you want to whiten your data---transform your features so they're decorrelated by removing shared information
\item Maybe you want a compact representation of your clusters for a classification or IR task
\end{itemize}
\item One way to set up this task is as a GMM, using EM to find the distributions which define each cluster and the cluster membership of each data point
\end{itemize}
\section{GMMs}
\begin{itemize}
\item A Gaussian Mixture Model assumes that your data is generated by first choosing a ``cluster'' and then generating the point from the distribution for that cluster
\item Each cluster is a multivariate Gaussian, which is just a Normal with multiple dimensions (``features'')
\item A univariate Gaussian has density:
\[
\phi_1(x|\mu, \sigma) \triangleq \frac{ \exp\left( -\frac{1}{2} (x-\mu)^2 \sigma^{-2} \right) } {\sqrt{ 2 \pi \sigma^2 }}
\]
\item A $d$-dimensional multivariate Gaussian has density:
\[
\phi_d(\vec{x}|\vec{\mu}, \Sigma) \triangleq \frac{ \exp\left( -\frac{1}{2} (\vec{x}-\vec{\mu})^T \Sigma^{-1} (\vec{x}-\vec{\mu}) \right) } {\sqrt{ (2 \pi)^d |\Sigma| }}
\]
\item A GMM with $k$ components has a weight $w_i>0$ for each component s.t. $\sum_{i=1}^k w_i = 1$, so its density is:
\[
p(\vec{x}|\theta) = \sum_{i=1}^{k}{ w_i \phi_d(\vec{x}|\vec{\mu}_i, \Sigma_i) }
\]
\end{itemize}
\section{EM for GMMs}
\subsection{Setup}
\begin{itemize}
\item Given observed data $\mathbf{y} = \vec{y_1}, \dots, \vec{y_n}$
\item Want cluster membership $\mathbf{z} = z_1, \dots, z_n, \forall i, z_i \in \mathbb{Z}, 1 \le z_i \le k$
\item Want model parameters $\theta = \{( w_i, \mu_i, \Sigma_i )\}_{i=1}^{k}$
\item Define complete data $\mathbf{x} = (\mathbf{y}, \mathbf{z})$
\end{itemize}
\subsection{EM}
\begin{enumerate}
\item Initialize: Choose $w_j^{(0)}, \mu_j^{(0)}, \Sigma_j^{(0)}, j = 1, \dots, k$ and compute likelihood:
\[
L^{(0)} = \frac{1}{n} \sum_{i=1}^{n} { \log \left( \sum_{j=1}^{k} w_j^{(0)} \phi_d \left(\vec{y_i} | \vec{\mu_{j}}^{(0)}, \Sigma_j^{(0)} \right) \right) }
\]
\item E-Step:
\begin{align*}
\gamma_{i,j}^{(m)} &= \frac{ w_j^{(m)} \phi_d \left(\vec{y_i} | \vec{\mu_{j}}^{(m)}, \Sigma_j^{(m)} \right) }{\sum_{l=1}^{k} w_l^{(m)} \phi_d \left(\vec{y_i} | \vec{\mu_{l}}^{(m)}, \Sigma_l^{(m)} \right) } && i=1,\dots,n ;j=1,\dots,k \text{ (prob.\ $\vec{y}_i$ came from component $j$)} \\
n_j^{(m)} &= \sum_{i=1}^{n} \gamma_{i,j}^{(m)} && j=1,\dots,k \text{ (effective number of points in component $j$)}
\end{align*}
\item M-Step:
\begin{align*}
w_j^{(m+1)} &= \frac{ n_j^{(m)} }{n} && j=1,\dots,k \\
\vec{\mu_j}^{(m+1)} &= \frac{1}{n_j^{(m)}} \sum_{i=1}^{n} \gamma_{i,j}^{(m)} \vec{y}_i && j=1,\dots,k \\
\Sigma_j^{(m+1)} &= \frac{1}{n_j^{(m)}} \sum_{i=1}^{n} \gamma_{i,j}^{(m)} \left(\vec{y_i}-\vec{\mu_j}^{(m+1)}\right)\left(\vec{y_i}-\vec{\mu_j}^{(m+1)}\right)^T && j=1,\dots,k
\end{align*}
\item Convergence check:
\begin{itemize}
\item Compute $L^{(m+1)} = \frac{1}{n} \sum_{i=1}^{n} { \log \left( \sum_{j=1}^{k} w_j^{(m+1)} \phi_d \left(\vec{y_i} | \vec{\mu_{j}}^{(m+1)}, \Sigma_j^{(m+1)} \right) \right) }$
\item If $|L^{(m+1)} - L^{(m)}| < \delta$, stop
\end{itemize}
\end{enumerate}
\section{Initializing from a clustering}
This model is not convex, so EM is going to find some local maximum. One way to find a reasonable maximum is to use many random initializations. Another is to use some other clustering algorithm, such as k-means, and build your initial parameters from there. Using some prior clustering:
\[
\gamma_{i,j}^{(-1)} = \twopartdef{1}{\vec{y}_i \text{ in cluster } j}{0}{\text{otherwise}}
\]
You can then compute $n_j^{(0)}$ and use the M-step to get $w_j^{(0)}, \vec{\mu_j}^{(0)}, \text{ and } \Sigma_j^{(0)}$.
\section{The Singularity Problem}
For some initializations, the nearest local maximum has infinite likelihood. This happens because certain initializations prefer components with zero covariance whose mean is on one of the data points. A zero volume one-point cluster with infinite likelihood is a singularity, and isn't usually interesting to us. We want to filter them out. (At least) two ways we can do this: re-initialize, or compute MAP instead of ML using a careful prior.
\subsection{Using MAP for EM}
Given a parameter prior $p(\theta)$, we wish to compute:
\begin{align*}
\hat{\theta}_{MAP} &= \argmax_{\theta \in \Theta} {\log p(\theta|\mathbf{y})} \\
&= \argmax_{\theta \in \Theta} \left( \log p(\mathbf{y}|\theta) + \log p(\theta) \right) \\
&= \argmax_{\theta \in \Theta} \left( n L(\theta) + \log p(\theta) \right)
\end{align*}
\noindent So the E-step is the same, and the M-step becomes:
\[
\theta^{(m+1)} = \argmax_{\theta \in \Theta} {\left( Q(\theta|\theta^{(m)}) + \log p(\theta) \right)}
\]
\noindent This version has the same monotonicity guarantees as the ML version.
\section{Ways to play with GMM EM}
\begin{itemize}
\item Using MAP, you can transition gracefully from some prior belief to what the data is telling you as you get more data
\item If you choose a careful prior, you can constrain the solutions to favor more useful distributions or certain modeling assumptions, while still moving away from these assumptions if the data just doesn't fit
\item Modify the algorithm so all components share a covariance matrix, if you think covariance doesn't change based on cluster. This eliminates singularities, if the number of components is much smaller than $n$.
\item Constrain the covariance matrix so the component contours are spherical (proportional to the identity matrix) or axis-aligned (diagonal). This helps avoid failures of estimation, like singularities.
\end{itemize}
\end{document}