## 高斯混合模型與最大期望算法

$\displaystyle \mathcal{L}(\boldsymbol{\theta}|\mathcal{X})=p(\mathcal{X}|\boldsymbol{\theta})=\prod_{i=1}^np(\mathbf{x}_i|\boldsymbol{\theta})$

$\displaystyle \boldsymbol{\theta}^\ast=\arg\max_{\boldsymbol{\theta}}\mathcal{L}(\boldsymbol{\theta}|\mathcal{X})$

$\displaystyle p(\mathbf{x}|\boldsymbol{\Theta})=\sum_{k=1}^m\alpha_kp(\mathbf{x}|\boldsymbol{\theta}_k)$

$\displaystyle \int p(\mathbf{x}|\boldsymbol{\Theta})d\mathbf{x}=\sum_{k=1}^m\alpha_k\int p(\mathbf{x}|\boldsymbol{\theta}_k)d\mathbf{x}=\sum_{k=1}^m\alpha_k=1$

$\displaystyle \mathcal{L}(\boldsymbol{\Theta}|\mathcal{X})=p(\mathcal{X}|\boldsymbol{\Theta})=\prod_{i=1}^np(\mathbf{x}_i|\boldsymbol{\Theta})=\prod_{i=1}^n\sum_{k=1}^m\alpha_kp(\mathbf{x}_i|\boldsymbol{\theta}_k)$

$\displaystyle \log\mathcal{L}(\boldsymbol{\Theta}|\mathcal{X})=\log\left\{\prod_{i=1}^np(\mathbf{x}_i|\boldsymbol{\Theta})\right\}=\sum_{i=1}^n\log\left\{\sum_{k=1}^m\alpha_kp(\mathbf{x}_i|\boldsymbol{\theta}_k)\right\}$

$\displaystyle p(\mathbf{z})=\prod_{k=1}^mp(z_k=1)^{z_k}=\prod_{k=1}^m\alpha_k^{z_k}$

$\displaystyle p(\mathbf{x}|z_k=1)=p(\mathbf{x}|\boldsymbol{\theta}_k),~~~k=1,\ldots,m$

$\displaystyle p(\mathbf{x}|\mathbf{z})=\prod_{k=1}^mp(\mathbf{x}|z_k=1)^{z_k}=\prod_{k=1}^mp(\mathbf{x}|\boldsymbol{\theta}_k)^{z_k}$

$\displaystyle \begin{aligned} p(\mathbf{x}|\boldsymbol{\Theta})&=\sum_{\mathbf{z}}p(\mathbf{x}|\mathbf{z})p(\mathbf{z})\\ &=\sum_{k=1}^mp(\mathbf{x}|z_k=1)p(z_k=1)\\ &=\sum_{k=1}^m\alpha_kp(\mathbf{x}|\boldsymbol{\theta}_k). \end{aligned}$

$\displaystyle \begin{aligned} r_{k}(\mathbf{x})&=p(z_k=1|\mathbf{x})\\ &=\frac{p(\mathbf{x}|z_k=1)p(z_k=1)}{p(\mathbf{x})}\\ &=\frac{p(\mathbf{x}|z_k=1)p(z_k=1)}{\sum_{l=1}^mp(\mathbf{x}|z_l=1)p(z_l=1)}\\ &=\frac{\alpha_k\,p(\mathbf{x}|\boldsymbol{\theta}_k)}{\sum_{l=1}^m\alpha_l\,p(\mathbf{x}|\boldsymbol{\theta}_l)}. \end{aligned}$

$\displaystyle \log\mathcal{L}(\boldsymbol{\Theta})=\sum_{i=1}^n\log p(\mathbf{x}_i|\boldsymbol{\Theta})=\sum_{i=1}^n\log\left\{\sum_{k=1}^mp(\mathbf{x}_i|z_k=1)p(z_k=1)\right\}$

$\displaystyle \begin{aligned} \log\mathcal{L}'-\log\mathcal{L}&=\sum_{i=1}^n\log\frac{p'(\mathbf{x}_i)}{p(\mathbf{x}_i)}\\ &=\sum_{i=1}^n\log\left\{\frac{\sum_{k=1}^mp'(\mathbf{x}_i|z_k=1)p'(z_k=1)}{p(\mathbf{x}_i)}\right\}\\ &=\sum_{i=1}^n\log\left\{\sum_{k=1}^mp(z_k=1|\mathbf{x}_i)\frac{p'(\mathbf{x}_i|z_k=1)p'(z_k=1)}{p(\mathbf{x}_i)p(z_k=1|\mathbf{x}_i)}\right\}. \end{aligned}$

$\displaystyle \log\left\{\sum_{k=1}^m\lambda_ky_k\right\}\ge\sum_{k=1}^m\lambda_k\log y_k$

$\displaystyle \begin{aligned} \log\mathcal{L}'-\log\mathcal{L}&\ge \sum_{i=1}^n\sum_{k=1}^mp(z_k=1|\mathbf{x}_i)\log\left\{\frac{p'(\mathbf{x}_i|z_k=1)p'(z_k=1)}{p(\mathbf{x}_i)p(z_k=1|\mathbf{x}_i)}\right\}\\ &=\sum_{i=1}^n\sum_{k=1}^mp(z_k=1|\mathbf{x}_i)\log\left\{p'(\mathbf{x}_i|z_k=1)p'(z_k=1)\right\}\\ &~~~-\sum_{i=1}^n\sum_{k=1}^mp(z_k=1|\mathbf{x}_i)\log\left\{p(\mathbf{x}_i)p(z_k=1|\mathbf{x}_i)\right\}, \end{aligned}$

$\displaystyle \begin{aligned} Q(\boldsymbol{\Theta}',\boldsymbol{\Theta})&=\sum_{i=1}^n\sum_{k=1}^mp(z_k=1|\mathbf{x}_i)\log\left\{p'(\mathbf{x}_i|z_k=1)p'(z_k=1)\right\}\\ &=\sum_{i=1}^n\sum_{k=1}^mp(z_k=1|\mathbf{x}_i)\log\left\{\alpha'_k p(\mathbf{x}_i|\boldsymbol{\theta}'_k)\right\}. \end{aligned}$

1. E─步驟. 使用 $\boldsymbol{\Theta}^t=\{\alpha_k^t,\boldsymbol{\theta}^t_k\}_{k=1}^m$ 定義 $Q(\boldsymbol{\Theta}',\boldsymbol{\Theta}^t)$。具體地說，計算每個樣本的責任，對於 $i=1,\ldots,n$，$k=1,\ldots,m$，

$\displaystyle r^t_{ik}=\frac{\alpha^t_k\,p(\mathbf{x}_i|\boldsymbol{\theta}^t_k)}{\sum_{l=1}^m\alpha^t_l\,p(\mathbf{x}_i|\boldsymbol{\theta}^t_l)}$

2. M─步驟. 解這個優化問題：

$\displaystyle \begin{aligned} \boldsymbol{\Theta}^{t+1}&=\arg\max_{\boldsymbol{\Theta}'}Q(\boldsymbol{\Theta}',\boldsymbol{\Theta}^t)\\ &=\arg\max_{\boldsymbol{\Theta}'}\sum_{i=1}^n\sum_{k=1}^mr^t_{ik}\log\left\{\alpha'_k\,p(\mathbf{x}_i|\boldsymbol{\theta}'_k)\right\}\\ &=\arg\max_{\boldsymbol{\Theta}'}\left\{Q_1\left(\{\alpha'_k\},\boldsymbol{\Theta}^t\right)+Q_2\left(\{\boldsymbol{\theta}'_k\},\boldsymbol{\Theta}^t\right)\right\}, \end{aligned}$

其中

$\displaystyle \begin{aligned} Q_1\left(\{\alpha'_k\},\boldsymbol{\Theta}^t\right)&=\sum_{i=1}^n\sum_{k=1}^mr^t_{ik}\log\alpha'_k\\ Q_2\left(\{\boldsymbol{\theta}'_k\},\boldsymbol{\Theta}^t\right)&=\sum_{i=1}^n\sum_{k=1}^mr^t_{ik}\log p(\mathbf{x}_i|\boldsymbol{\theta}'_k).\end{aligned}$

$\displaystyle \begin{aligned} \{\alpha^{t+1}_k\}&=\arg\max_{\{\alpha'_k\}}Q_1\left(\{\alpha'_k\},\boldsymbol{\Theta}^t\right)\\ \{\boldsymbol{\theta}^{t+1}_k\}&=\arg\max_{\{\boldsymbol{\theta}'_k\}}Q_2\left(\{\boldsymbol{\theta}'_k\},\boldsymbol{\Theta}^t\right).\end{aligned}$

$\displaystyle \begin{array}{ll} \hbox{maximize} & Q_1\left(\{\alpha'_k\},\boldsymbol{\Theta}^t\right)\\ \hbox{subject to}& \sum_{k=1}^m\alpha'_k=1. \end{array}$

$\displaystyle L\left(\{\alpha'_k\},\lambda\right)=\sum_{i=1}^n\sum_{k=1}^mr^t_{ik}\log\alpha'_k+\lambda\left(\sum_{k=1}^m\alpha'_k-1\right)$

$\displaystyle \frac{\partial L}{\partial\alpha'_k}=\frac{1}{\alpha'_k}\sum_{i=1}^nr^t_{ik}+\lambda$

$\displaystyle \lambda=\sum_{k=1}^m\lambda\alpha'_k=-\sum_{i=1}^n\sum_{k=1}^mr^t_{ik}=-n$

$\displaystyle \alpha'_k=\frac{n^t_k}{n}$

$\displaystyle \mathcal{N}(\mathbf{x}|\boldsymbol{\mu}_k,\Sigma_k)=\frac{1}{(2\pi)^{d/2}|\Sigma_k|^{1/2}}\exp\left\{-\frac{1}{2}(\mathbf{x}-\boldsymbol{\mu}_k)^T\Sigma_k^{-1}(\mathbf{x}-\boldsymbol{\mu}_k)\right\}$

$\displaystyle\begin{aligned} Q_2(\{\boldsymbol{\mu}'_k,\Sigma'_k\},\boldsymbol{\Theta}^t) &=\sum_{i=1}^n\sum_{k=1}^mr^t_{ik}\log\mathcal{N}(\mathbf{x}_i|\boldsymbol{\mu}'_k,\Sigma'_k)\\ &=\sum_{i=1}^n\sum_{k=1}^mr^t_{ik}\left(-\frac{d}{2}\log(2\pi)-\frac{1}{2}\log|\Sigma'_k|-\frac{1}{2}(\mathbf{x}_i-\boldsymbol{\mu}'_k)^T(\Sigma'_k)^{-1}(\mathbf{x}_i-\boldsymbol{\mu}'_k)\right). \end{aligned}$

$\displaystyle \frac{\partial Q_2}{\partial \boldsymbol{\mu}'_k}=\sum_{i=1}^nr^t_{ik}(\Sigma'_k)^{-1}(\mathbf{x}_i-\boldsymbol{\mu}'_k)=(\Sigma'_k)^{-1}\sum_{i=1}^nr^t_{ik}(\mathbf{x}_i-\boldsymbol{\mu}'_k)$

$\displaystyle \boldsymbol{\mu}'_k=\frac{1}{n^t_k}\sum_{i=1}^nr^t_{ik}\mathbf{x}_i$

$\displaystyle\begin{aligned} Q_2(\{\boldsymbol{\mu}'_k,\Sigma'_k\},\boldsymbol{\Theta}^t) &=\sum_{i=1}^n\sum_{k=1}^mr^t_{ik}\left(-\frac{d}{2}\log(2\pi)-\frac{1}{2}\log|\Sigma'_k|-\frac{1}{2}\hbox{trace}\left\{(\mathbf{x}_i-\boldsymbol{\mu}'_k)^T(\Sigma'_k)^{-1}(\mathbf{x}_i-\boldsymbol{\mu}'_k)\right\}\right)\\ &=\sum_{i=1}^n\sum_{k=1}^mr^t_{ik}\left(-\frac{d}{2}\log(2\pi)-\frac{1}{2}\log|\Sigma'_k|-\frac{1}{2}\hbox{trace}\left\{(\Sigma'_k)^{-1}(\mathbf{x}_i-\boldsymbol{\mu}'_k)(\mathbf{x}_i-\boldsymbol{\mu}'_k)^T\right\}\right). \end{aligned}$

$\displaystyle \frac{\partial Q_2}{\partial \Sigma'_k}=-\frac{1}{2}\sum_{i=1}^nr^t_{ik}\left((\Sigma'_k)^{-1}-(\Sigma'_k)^{-1}(\mathbf{x}_i-\boldsymbol{\mu}'_k)(\mathbf{x}_i-\boldsymbol{\mu}'_k)^T(\Sigma'_k)^{-1}\right)$

$\displaystyle \Sigma'_k=\frac{1}{n^t_k}\sum_{i=1}^nr^t_{ik}(\mathbf{x}_i-\boldsymbol{\mu}'_k)(\mathbf{x}_i-\boldsymbol{\mu}'_k)^T$

1. E─步驟. 對於 $i=1,\ldots,n$，$k=1,\ldots,m$，

$\displaystyle r^t_{ik}=\frac{\alpha^t_k\mathcal{N}(\mathbf{x}_i|\boldsymbol{\mu}^t_k,\Sigma^t_k)}{\sum_{l=1}^m\alpha^t_l\mathcal{N}(\mathbf{x}_i|\boldsymbol{\mu}^t_l,\Sigma^t_l)}$

2. M─步驟. 對於 $k=1,\ldots,m$，令 $n^t_k=\sum_{i=1}^nr^t_{ik}$，且

$\displaystyle \begin{aligned} \alpha^{t+1}_k&=\frac{n^t_k}{n}\\ \boldsymbol{\mu}^{t+1}_k&=\frac{1}{n^t_k}\sum_{i=1}^nr^t_{ik}\mathbf{x}_i\\ \Sigma^{t+1}_k&=\frac{1}{n^t_k}\sum_{i=1}^nr^t_{ik}(\mathbf{x}_i-\boldsymbol{\mu}^{t+1}_k)(\mathbf{x}_i-\boldsymbol{\mu}^{t+1}_k)^T. \end{aligned}$

[1] 在數理統計的計算，我們常運用跡數化簡，例如，

$\displaystyle \mathbf{x}^TA\mathbf{x}=\hbox{trace}(\mathbf{x}^TA\mathbf{x})=\hbox{trace}(A\mathbf{x}\mathbf{x}^T)$

[2] 維基百科：Expectation-maximization algorithm

### 2 Responses to 高斯混合模型與最大期望算法

1. Ricky Li 說道：

周老师，有机会能讲一讲 SVM 的推导吗？

2. ccjou 說道：

全面性的理解svm需要知道優化理論的duality, KKT, SMO等概念與方法。等有時間把這些背景知識介紹後再講svm。