$$ \sum_{j=1}^m \frac{\alpha_{i}\cdot p\left(\boldsymbol{x_{j}}|\boldsymbol\mu_{i},\boldsymbol\Sigma_{i}\right)}{\sum_{l=1}^k \alpha_{l}\cdot p(\boldsymbol{x_{j}}|\boldsymbol\mu_{l},\boldsymbol\Sigma_{l})}(\boldsymbol{x_{j}-\mu_{i}})=0 $$
[推导]:根据公式(9.28)可知: $$ p(\boldsymbol{x_{j}}|\boldsymbol\mu_{i},\boldsymbol\Sigma_{i})=\frac{1}{(2\pi)^{\frac{n}{2}}\left| \boldsymbol\Sigma_{i}\right |^{\frac{1}{2}}}e^{-\frac{1}{2}(\boldsymbol{x_{j}}-\boldsymbol\mu_{i})^T\boldsymbol\Sigma_{i}^{-1}(\boldsymbol{x_{j}-\mu_{i}})} $$
又根据公式(9.32),由 $$ \frac {\partial LL(D)}{\partial \boldsymbol\mu_{i}}=0 $$ 可得 $$\begin{aligned}
\frac {\partial LL(D)}{\partial\boldsymbol\mu_{i}}&=\frac {\partial}{\partial \boldsymbol\mu_{i}}\sum_{j=1}^m\ln\Bigg(\sum_{l=1}^k \alpha_{l}\cdot p(\boldsymbol{x_{j}}|\boldsymbol\mu_{l},\boldsymbol\Sigma_{l})\Bigg) \\
&=\sum_{j=1}^m\frac{\partial}{\partial\boldsymbol\mu_{i}}\ln\Bigg(\sum_{l=1}^k \alpha_{l}\cdot p(\boldsymbol{x_{j}}|\boldsymbol\mu_{l},\boldsymbol\Sigma_{l})\Bigg) \\
&=\sum_{j=1}^m\frac{\alpha_{i}\cdot \frac{\partial}{\partial\boldsymbol{\mu_{i}}}(p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}}))}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})} \\
&=\sum_{j=1}^m\frac{\alpha_{i}\cdot \frac{1}{(2\pi)^{\frac{n}{2}}\left| \boldsymbol\Sigma_{i}\right |^{\frac{1}{2}}}e^{-\frac{1}{2}(\boldsymbol{x_{j}}-\boldsymbol\mu_{i})^T\boldsymbol\Sigma_{i}^{-1}(\boldsymbol{x_{j}-\mu_{i}})}}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}\frac{\partial}{\partial \boldsymbol\mu_{i}}\left(-\frac{1}{2}\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\right) \\
&=\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}(-\frac{1}{2})\left(\left(\boldsymbol\Sigma_{i}^{-1}+\left(\boldsymbol\Sigma_{i}^{-1}\right)^T\right)\cdot\left(\boldsymbol{x_{j}-\mu_{i}}\right)\cdot(-1)\right) \\
&=\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}(-\frac{1}{2})\left(-\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)-\left(\boldsymbol\Sigma_{i}^{-1}\right)^T\left(\boldsymbol{x_{j}-\mu_{i}}\right)\right) \\
&=\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}(-\frac{1}{2})\left(-\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)-\left(\boldsymbol\Sigma_{i}^T\right)^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\right) \\
&=\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}(-\frac{1}{2})\left(-\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)-\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\right) \\
&=\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}(-\frac{1}{2})\left(-2\cdot\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\right) \\
&=\sum_{j=1}^m \frac{\alpha_{i}\cdot p\left(\boldsymbol{x_{j}}|\boldsymbol\mu_{i},\boldsymbol\Sigma_{i}\right)}{\sum_{l=1}^k \alpha_{l}\cdot p(\boldsymbol{x_{j}}|\boldsymbol\mu_{l},\boldsymbol\Sigma_{l})}\boldsymbol\Sigma_{i}^{-1}(\boldsymbol{x_{j}-\mu_{i}})=0
\end{aligned}$$ 由于$\boldsymbol\Sigma_{i}^{-1}$与求和下标$j$无关,可将其提到求和号之外,再在等式两边同时左乘$\boldsymbol\Sigma_{i}$,即得 $$ \sum_{j=1}^m \frac{\alpha_{i}\cdot p\left(\boldsymbol{x_{j}}|\boldsymbol\mu_{i},\boldsymbol\Sigma_{i}\right)}{\sum_{l=1}^k \alpha_{l}\cdot p(\boldsymbol{x_{j}}|\boldsymbol\mu_{l},\boldsymbol\Sigma_{l})}(\boldsymbol{x_{j}-\mu_{i}})=0 $$
$$ \boldsymbol\Sigma_{i}=\frac{\sum_{j=1}^m\gamma_{ji}(\boldsymbol{x_{j}-\mu_{i}})(\boldsymbol{x_{j}-\mu_{i}})^T}{\sum_{j=1}^m\gamma_{ji}} $$
[推导]:根据公式(9.28)可知: $$ p(\boldsymbol{x_{j}}|\boldsymbol\mu_{i},\boldsymbol\Sigma_{i})=\frac{1}{(2\pi)^{\frac{n}{2}}\left| \boldsymbol\Sigma_{i}\right |^{\frac{1}{2}}}e^{-\frac{1}{2}(\boldsymbol{x_{j}}-\boldsymbol\mu_{i})^T\boldsymbol\Sigma_{i}^{-1}(\boldsymbol{x_{j}-\mu_{i}})} $$ 又根据公式(9.32),由 $$ \frac {\partial LL(D)}{\partial \boldsymbol\Sigma_{i}}=0 $$ 可得 $$\begin{aligned}
\frac {\partial LL(D)}{\partial\boldsymbol\Sigma_{i}}&=\frac {\partial}{\partial \boldsymbol\Sigma_{i}}\sum_{j=1}^m\ln\Bigg(\sum_{l=1}^k \alpha_{l}\cdot p(\boldsymbol{x_{j}}|\boldsymbol\mu_{l},\boldsymbol\Sigma_{l})\Bigg) \\
&=\sum_{j=1}^m\frac{\partial}{\partial\boldsymbol\Sigma_{i}}\ln\Bigg(\sum_{l=1}^k \alpha_{l}\cdot p(\boldsymbol{x_{j}}|\boldsymbol\mu_{l},\boldsymbol\Sigma_{l})\Bigg) \\
&=\sum_{j=1}^m \frac{\alpha_{i}\cdot \frac{\partial}{\partial\boldsymbol\Sigma_{i}}p(\boldsymbol x_{j}|\boldsymbol \mu_{i},\boldsymbol\Sigma_{i})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{l},\boldsymbol\Sigma_{l})} \\
&=\sum_{j=1}^m \frac{\alpha_{i}\cdot \frac{\partial}{\partial\boldsymbol\Sigma_{i}}\frac{1}{(2\pi)^{\frac{n}{2}}\left| \boldsymbol\Sigma_{i}\right |^{\frac{1}{2}}}e^{-\frac{1}{2}(\boldsymbol{x_{j}}-\boldsymbol\mu_{i})^T\boldsymbol\Sigma_{i}^{-1}(\boldsymbol{x_{j}-\mu_{i}})}}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{l},\boldsymbol\Sigma_{l})} \\
&=\sum_{j=1}^m \frac{\alpha_{i}\cdot \frac{\partial}{\partial\boldsymbol\Sigma_{i}}e^{\ln\left(\frac{1}{(2\pi)^{\frac{n}{2}}\left| \boldsymbol\Sigma_{i}\right |^{\frac{1}{2}}}e^{-\frac{1}{2}(\boldsymbol{x_{j}}-\boldsymbol\mu_{i})^T\boldsymbol\Sigma_{i}^{-1}(\boldsymbol{x_{j}-\mu_{i}})}\right)}}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{l},\boldsymbol\Sigma_{l})} \\
&=\sum_{j=1}^m \frac{\alpha_{i}\cdot \frac{\partial}{\partial\boldsymbol\Sigma_{i}}e^{-\frac{n}{2}\ln\left(2\pi\right)-\frac{1}{2}\ln\left(|\boldsymbol\Sigma_{i}|\right)-\frac{1}{2}\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)}}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{l},\boldsymbol\Sigma_{l})} \\
&=\sum_{j=1}^m \frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{i},\boldsymbol\Sigma_{i})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{l},\boldsymbol\Sigma_{l})}\frac{\partial}{\partial\boldsymbol\Sigma_{i}}\left(-\frac{n}{2}\ln\left(2\pi\right)-\frac{1}{2}\ln\left(|\boldsymbol\Sigma_{i}|\right)-\frac{1}{2}\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\right) \\
&=\sum_{j=1}^m \frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{i},\boldsymbol\Sigma_{i})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{l},\boldsymbol\Sigma_{l})}\left(-\frac{1}{2}\left(\boldsymbol\Sigma_{i}^{-1}\right)^T-\frac{1}{2}\frac{\partial}{\partial\boldsymbol\Sigma_{i}}\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\right)
\end{aligned}$$
为求得 $$ \frac{\partial}{\partial\boldsymbol\Sigma_{i}}\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right) $$
首先分析对$\boldsymbol \Sigma_{i}$中单一元素的求导,用$r$代表矩阵$\boldsymbol\Sigma_{i}$的行索引,$c$代表矩阵$\boldsymbol\Sigma_{i}$的列索引,则 $$\begin{aligned}
\frac{\partial}{\partial\Sigma_{i_{rc}}}\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)&=\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\frac{\partial\boldsymbol\Sigma_{i}^{-1}}{\partial\Sigma_{i_{rc}}}\left(\boldsymbol{x_{j}-\mu_{i}}\right) \\
&=-\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\frac{\partial\boldsymbol\Sigma_{i}}{\partial\Sigma_{i_{rc}}}\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)
\end{aligned}$$ 设$B=\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)$,则 $$\begin{aligned}
B^T&=\left(\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\right)^T \\
&=\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\left(\boldsymbol\Sigma_{i}^{-1}\right)^T \\
&=\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}
\end{aligned}$$ 所以 $$\begin{aligned} \frac{\partial}{\partial\Sigma_{i_{rc}}}\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)=-B^T\frac{\partial\boldsymbol\Sigma_{i}}{\partial\Sigma_{i_{rc}}}B\end{aligned}$$ 其中$B$为$n\times1$阶矩阵,$\frac{\partial\boldsymbol\Sigma_{i}}{\partial\Sigma_{i_{rc}}}$为$n$阶方阵,且$\frac{\partial\boldsymbol\Sigma_{i}}{\partial\Sigma_{i_{rc}}}$仅在$\left(r,c\right)$位置处的元素值为$1$,其它位置处的元素值均为$0$,所以 $$\begin{aligned} \frac{\partial}{\partial\Sigma_{i_{rc}}}\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)=-B^T\frac{\partial\boldsymbol\Sigma_{i}}{\partial\Sigma_{i_{rc}}}B=-B_{r}\cdot B_{c}=-\left(B\cdot B^T\right)_{rc}=\left(-B\cdot B^T\right)_{rc}\end{aligned}$$ 即对$\boldsymbol\Sigma_{i}$中特定位置的元素的求导结果对应于$\left(-B\cdot B^T\right)$中相同位置的元素值,所以 $$\begin{aligned}
\frac{\partial}{\partial\boldsymbol\Sigma_{i}}\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)&=-B\cdot B^T \\
&=-\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\left(\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\right)^T \\
&=-\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}
\end{aligned}$$
因此最终结果为 $$ \frac {\partial LL(D)}{\partial \boldsymbol\Sigma_{i}}=\sum_{j=1}^m \frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{i},\boldsymbol\Sigma_{i})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol \mu_{l},\boldsymbol\Sigma_{l})}\left( -\frac{1}{2}\left(\boldsymbol\Sigma_{i}^{-1}-\boldsymbol\Sigma_{i}^{-1}\left(\boldsymbol{x_{j}-\mu_{i}}\right)\left(\boldsymbol{x_{j}-\mu_{i}}\right)^T\boldsymbol\Sigma_{i}^{-1}\right)\right)=0 $$
整理可得 $$ \boldsymbol\Sigma_{i}=\frac{\sum_{j=1}^m\gamma_{ji}(\boldsymbol{x_{j}-\mu_{i}})(\boldsymbol{x_{j}-\mu_{i}})^T}{\sum_{j=1}^m\gamma_{ji}} $$
$$ \alpha_{i}=\frac{1}{m}\sum_{j=1}^m\gamma_{ji} $$
[推导]:基于公式(9.37)进行恒等变形: $$ \sum_{j=1}^m\frac{p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}+\lambda=0 $$
$$ \Rightarrow\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}+\alpha_{i}\lambda=0 $$
对所有混合成分进行求和: $$ \Rightarrow\sum_{i=1}^k\left(\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}+\alpha_{i}\lambda\right)=0 $$
$$ \Rightarrow\sum_{i=1}^k\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}+\sum_{i=1}^k\alpha_{i}\lambda=0 $$
$$ \Rightarrow\lambda=-\sum_{i=1}^k\sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}=-m $$ 其中用到了$\sum_{i=1}^k\alpha_{i}=1$,以及对每个样本$\boldsymbol x_{j}$都有$\sum_{i=1}^k\gamma_{ji}=1$。
又 $$ \sum_{j=1}^m\frac{\alpha_{i}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{i},\Sigma_{i}})}{\sum_{l=1}^k\alpha_{l}\cdot p(\boldsymbol x_{j}|\boldsymbol{\mu_{l},\Sigma_{l}})}+\alpha_{i}\lambda=0 $$
$$ \Rightarrow\sum_{j=1}^m\gamma_{ji}+\alpha_{i}\lambda=0 $$
$$ \Rightarrow\alpha_{i}=-\frac{\sum_{j=1}^m\gamma_{ji}}{\lambda}=\frac{1}{m}\sum_{j=1}^m\gamma_{ji} $$
参考公式
$$
\frac{\partial\boldsymbol x^TB\boldsymbol x}{\partial\boldsymbol x}=\left(B+B^T\right)\boldsymbol x
$$
$$
\frac{\partial}{\partial A}\ln|A|=\left(A^{-1}\right)^T
$$
$$
\frac{\partial}{\partial x}\left(A^{-1}\right)=-A^{-1}\frac{\partial A}{\partial x}A^{-1}
$$
参考资料
Petersen, K. B., & Pedersen, M. S. The Matrix Cookbook.
Bishop, C. M. (2006). Pattern Recognition and Machine Learning. Springer.