1
0
Quellcode durchsuchen

Merge pull request #17 from huaqing89/master

第9章 聚类 公式推导
)s vor 7 Jahren
Ursprung
Commit
dcff493bc3
1 geänderte Dateien mit 155 neuen und 0 gelöschten Zeilen
  1. 155 0
      docs/chapter9.md

+ 155 - 0
docs/chapter9.md

@@ -0,0 +1,155 @@
+## 9.33
+
+$$
+\sum_{j=1}^m \frac{\alpha_{i}\cdot p\left(\boldsymbol{x_{j}}|\boldsymbol\mu _{i},\boldsymbol\Sigma_{i}\right)}{\sum_{l=1}^k \alpha_{l}\cdot p(\boldsymbol{x_{j}}|\boldsymbol\mu_{l},\boldsymbol\Sigma_{l})}(\boldsymbol{x_{j}-\mu_{i}})=0
+$$
+
+[推导]:根据公式(9.28)可知:
+$$
+p(\boldsymbol{x_{j}|\boldsymbol\mu_{i},\boldsymbol\Sigma_{i}})=\frac{1}{(2\pi)^\frac{n}{2}\left| \boldsymbol\Sigma_{i}\right |^\frac{1}{2}}e^{-\frac{1}{2}(\boldsymbol{x_{j}}-\boldsymbol\mu_{i})^T\boldsymbol\Sigma_{i}^{-1}(\boldsymbol{x_{j}-\mu_{i}})}
+$$
+
+
+又根据公式(9.32),由
+$$
+\frac {\partial LL(D)}{\partial \boldsymbol\mu_{i}}=0
+$$
+可得
+$$\begin{aligned}
+\frac {\partial LL(D)}{\partial\boldsymbol\mu_{i}}&=\frac {\partial}{\partial \boldsymbol\mu_{i}}\sum_{j=1}^m\ln\Bigg(\sum_{l=1}^k \alpha_{l}\cdot p(\boldsymbol{x_{j}}|\boldsymbol\mu_{l},\boldsymbol\Sigma_{l})\Bigg) \\
+&=\sum_{j=1}^m\frac{\partial}{\partial\boldsymbol\mu_{i}}\ln\Bigg(\sum_{l=1}^k \alpha_{l}\cdot p(\boldsymbol{x_{j}}|\boldsymbol\mu_{l},\boldsymbol\Sigma_{l})\Bigg) \\
+&=\sum_{j=1}^m\frac{\alpha_{i}\cdot \frac{\partial}{\partial\boldsymbol{\mu_{i}}}(p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}}))}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})} \\
+&=\sum_{j=1}^m\frac{\alpha_{i}\cdot \frac{1}{(2\pi)^\frac{n}{2}\left| \boldsymbol\Sigma_{i}\right |^\frac{1}{2}}e^{-\frac{1}{2}(\boldsymbol{x_{j}}-\boldsymbol\mu_{i})^T\boldsymbol\Sigma_{i}^{-1}(\boldsymbol{x_{j}-\mu_{i}})}}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}\frac{\partial}{\partial \boldsymbol\mu_{i}}\left(-\frac{1}{2}\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\right) \\
+&=\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}(-\frac{1}{2})\left(\left(\boldsymbol\Sigma_{i}^{-1}+\left(\boldsymbol\Sigma_{i}^{-1}\right)^T\right)\cdot\left(\boldsymbol{x_{j}-\mu_{i}}\right)\cdot(-1)\right) \\
+&=\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}(-\frac{1}{2})\left(-\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)-\left(\boldsymbol\Sigma_{i}^{-1}\right)^T\left(\boldsymbol{x_{j}-\mu_{i}}\right)\right) \\
+&=\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}(-\frac{1}{2})\left(-\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)-\left(\boldsymbol\Sigma_{i}^T\right)^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\right) \\
+&=\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}(-\frac{1}{2})\left(-\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)-\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\right) \\
+&=\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}(-\frac{1}{2})\left(-2\cdot\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\right) \\
+&=\sum_{j=1}^m \frac{\alpha_{i}\cdot p\left(\boldsymbol{x_{j}}|\boldsymbol\mu _{i},\boldsymbol\Sigma_{i}\right)}{\sum_{l=1}^k \alpha_{l}\cdot p(\boldsymbol{x_{j}}|\boldsymbol\mu_{l},\boldsymbol\Sigma_{l})}\boldsymbol\Sigma_{i}^{-1}(\boldsymbol{x_{j}-\mu_{i}})=0 \\
+\Rightarrow\quad&\sum_{j=1}^m \frac{\alpha_{i}\cdot p\left(\boldsymbol{x_{j}}|\boldsymbol\mu _{i},\boldsymbol\Sigma_{i}\right)}{\sum_{l=1}^k \alpha_{l}\cdot p(\boldsymbol{x_{j}}|\boldsymbol\mu_{l},\boldsymbol\Sigma_{l})}(\boldsymbol{x_{j}-\mu_{i}})=0
+\end{aligned}$$
+其中最后一步是因为$\boldsymbol\Sigma_{i}^{-1}$与求和下标$j$无关,可提到求和号之外,再在等式两边同时左乘$\boldsymbol\Sigma_{i}$即可消去。
+
+## 9.35
+
+$$
+\boldsymbol\Sigma_{i}=\frac{\sum_{j=1}^m\gamma_{ji}(\boldsymbol{x_{j}-\mu_{i}})(\boldsymbol{x_{j}-\mu_{i}})^T}{\sum_{j=1}^m\gamma_{ji}}
+$$
+
+[推导]:根据公式(9.28)可知:
+$$
+p(\boldsymbol{x_{j}|\boldsymbol\mu_{i},\boldsymbol\Sigma_{i}})=\frac{1}{(2\pi)^\frac{n}{2}\left| \boldsymbol\Sigma_{i}\right |^\frac{1}{2}}e^{-\frac{1}{2}(\boldsymbol{x_{j}}-\boldsymbol\mu_{i})^T\boldsymbol\Sigma_{i}^{-1}(\boldsymbol{x_{j}-\mu_{i}})}
+$$
+又根据公式(9.32),由
+$$
+\frac {\partial LL(D)}{\partial \boldsymbol\Sigma_{i}}=0
+$$
+可得
+$$\begin{aligned}
+\frac {\partial LL(D)}{\partial\boldsymbol\Sigma_{i}}&=\frac {\partial}{\partial \boldsymbol\Sigma_{i}}\sum_{j=1}^m\ln\Bigg(\sum_{l=1}^k \alpha_{l}\cdot p(\boldsymbol{x_{j}}|\boldsymbol\mu_{l},\boldsymbol\Sigma_{l})\Bigg) \\
+&=\sum_{j=1}^m\frac{\partial}{\partial\boldsymbol\Sigma_{i}}\ln\Bigg(\sum_{l=1}^k \alpha_{l}\cdot p(\boldsymbol{x_{j}}|\boldsymbol\mu_{l},\boldsymbol\Sigma_{l})\Bigg) \\
+&=\sum_{j=1}^m \frac{\alpha_{i}\cdot \frac{\partial}{\partial\boldsymbol\Sigma_{i}}p(\boldsymbol x_{j}|\boldsymbol \mu_{i},\boldsymbol\Sigma_{i})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{l},\boldsymbol\Sigma_{l})} \\
+&=\sum_{j=1}^m \frac{\alpha_{i}\cdot \frac{\partial}{\partial\boldsymbol\Sigma_{i}}\frac{1}{(2\pi)^\frac{n}{2}\left| \boldsymbol\Sigma_{i}\right |^\frac{1}{2}}e^{-\frac{1}{2}(\boldsymbol{x_{j}}-\boldsymbol\mu_{i})^T\boldsymbol\Sigma_{i}^{-1}(\boldsymbol{x_{j}-\mu_{i}})}}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{l},\boldsymbol\Sigma_{l})}\\
+&=\sum_{j=1}^m \frac{\alpha_{i}\cdot \frac{\partial}{\partial\boldsymbol\Sigma_{i}}e^{\ln\left(\frac{1}{(2\pi)^\frac{n}{2}\left| \boldsymbol\Sigma_{i}\right |^\frac{1}{2}}e^{-\frac{1}{2}(\boldsymbol{x_{j}}-\boldsymbol\mu_{i})^T\boldsymbol\Sigma_{i}^{-1}(\boldsymbol{x_{j}-\mu_{i}})}\right)}}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{l},\boldsymbol\Sigma_{l})} \\
+&=\sum_{j=1}^m \frac{\alpha_{i}\cdot \frac{\partial}{\partial\boldsymbol\Sigma_{i}}e^{-\frac{n}{2}\ln\left(2\pi\right)-\frac{1}{2}\ln\left(|\boldsymbol\Sigma_{i}|\right)-\frac{1}{2}\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)}}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{l},\boldsymbol\Sigma_{l})} \\
+&=\sum_{j=1}^m \frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{i},\boldsymbol\Sigma_{i})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{l},\boldsymbol\Sigma_{l})}\frac{\partial}{\partial\boldsymbol\Sigma_{i}}\left(-\frac{n}{2}\ln\left(2\pi\right)-\frac{1}{2}\ln\left(|\boldsymbol\Sigma_{i}|\right)-\frac{1}{2}\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\right) \\
+&=\sum_{j=1}^m \frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{i},\boldsymbol\Sigma_{i})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{l},\boldsymbol\Sigma_{l})}\left(-\frac{1}{2}\left(\boldsymbol\Sigma_{i}^{-1}\right)^T-\frac{1}{2}\frac{\partial}{\partial\boldsymbol\Sigma_{i}}\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\right)
+\end{aligned}$$
+
+为求得
+$$
+\frac{\partial}{\partial\boldsymbol\Sigma_{i}}\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)
+$$
+
+首先分析对$\boldsymbol \Sigma_{i}$中单一元素的求导,用$r$代表矩阵$\boldsymbol\Sigma_{i}$的行索引,$c$代表矩阵$\boldsymbol\Sigma_{i}$的列索引,则
+$$\begin{aligned}
+\frac{\partial}{\partial\Sigma_{i_{rc}}}\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)&=\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\frac{\partial\boldsymbol\Sigma_{i}^{-1}}{\partial\Sigma_{i_{rc}}}\left(\boldsymbol{x_{j}-\mu_{i}}\right) \\
+&=-\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\frac{\partial\boldsymbol\Sigma_{i}}{\partial\Sigma_{i_{rc}}}\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)
+\end{aligned}$$
+设$B=\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)$,则
+$$\begin{aligned}
+B^T&=\left(\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\right)^T \\
+&=\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\left(\boldsymbol\Sigma_{i}^{-1}\right)^T \\
+&=\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}
+\end{aligned}$$
+所以
+$$\begin{aligned}
+\frac{\partial}{\partial\Sigma_{i_{rc}}}\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)=-B^T\frac{\partial\boldsymbol\Sigma_{i}}{\partial\Sigma_{i_{rc}}}B\end{aligned}$$
+其中$B$为$n\times1$阶矩阵,$\frac{\partial\boldsymbol\Sigma_{i}}{\partial\Sigma_{i_{rc}}}$为$n$阶方阵,且$\frac{\partial\boldsymbol\Sigma_{i}}{\partial\Sigma_{i_{rc}}}$仅在$\left(r,c\right)$位置处的元素值为1,其它位置处的元素值均为$0$,所以
+$$\begin{aligned}
+\frac{\partial}{\partial\Sigma_{i_{rc}}}\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)=-B^T\frac{\partial\boldsymbol\Sigma_{i}}{\partial\Sigma_{i_{rc}}}B=-B_{r}\cdot B_{c}=-\left(B\cdot B^T\right)_{rc}=\left(-B\cdot B^T\right)_{rc}\end{aligned}$$
+即对$\boldsymbol\Sigma_{i}$中特定位置的元素的求导结果对应于$\left(-B\cdot B^T\right)$中相同位置的元素值,所以
+$$\begin{aligned}
+\frac{\partial}{\partial\boldsymbol\Sigma_{i}}\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)&=-B\cdot B^T\\
+&=-\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\left(\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\right)^T\\
+&=-\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}
+\end{aligned}$$
+
+因此最终结果为
+$$
+\frac {\partial LL(D)}{\partial \boldsymbol\Sigma_{i}}=\sum_{j=1}^m \frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{i},\boldsymbol\Sigma_{i})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{l},\boldsymbol\Sigma_{l})}\left( -\frac{1}{2}\left(\boldsymbol\Sigma_{i}^{-1}-\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\right)\right)=0
+$$
+
+整理可得
+$$
+\boldsymbol\Sigma_{i}=\frac{\sum_{j=1}^m\gamma_{ji}(\boldsymbol{x_{j}-\mu_{i}})(\boldsymbol{x_{j}-\mu_{i}})^T}{\sum_{j=1}^m\gamma_{ji}}
+$$
+
+## 9.38
+
+$$
+\alpha_{i}=\frac{1}{m}\sum_{j=1}^m\gamma_{ji}
+$$
+
+[推导]:基于公式(9.37)进行恒等变形:
+$$
+\sum_{j=1}^m\frac{p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}+\lambda=0
+$$
+
+$$
+\Rightarrow\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}+\alpha_{i}\lambda=0
+$$
+
+对所有混合成分进行求和:
+$$
+\Rightarrow\sum_{i=1}^k\left(\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}+\alpha_{i}\lambda\right)=0
+$$
+
+$$
+\Rightarrow\sum_{i=1}^k\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}+\sum_{i=1}^k\alpha_{i}\lambda=0
+$$
+
+$$
+\Rightarrow\lambda=-\sum_{i=1}^k\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}=-m
+$$
+其中用到了约束$\sum_{i=1}^k\alpha_{i}=1$;并且交换求和次序后,对每个样本$\boldsymbol x_{j}$,内层关于$i$的求和恰好等于$1$,故双重求和的结果为$m$。
+
+又
+$$
+\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}+\alpha_{i}\lambda=0
+$$
+
+$$
+\Rightarrow\sum_{j=1}^m\gamma_{ji}+\alpha_{i}\lambda=0
+$$
+
+$$
+\Rightarrow\alpha_{i}=-\frac{\sum_{j=1}^m\gamma_{ji}}{\lambda}=\frac{1}{m}\sum_{j=1}^m\gamma_{ji}
+$$
+
+
+
+## 附录
+参考公式
+$$
+\frac{\partial\boldsymbol x^TB\boldsymbol x}{\partial\boldsymbol x}=\left(B+B^T\right)\boldsymbol x 
+$$
+$$
+\frac{\partial}{\partial A}\ln|A|=\left(A^{-1}\right)^T
+$$
+$$
+\frac{\partial}{\partial x}\left(A^{-1}\right)=-A^{-1}\frac{\partial A}{\partial x}A^{-1}
+$$
+参考资料
+Petersen, K. B., & Pedersen, M. S. *The Matrix Cookbook*.
+Bishop, C. M. (2006). *Pattern recognition and machine learning*. Springer.
+
+