1 år sedan · 7c20e2a305
--- a/docs/chapter7/chapter7.md
+++ b/docs/chapter7/chapter7.md
@@ -10,9 +10,7 @@
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 R(c_i|\boldsymbol x)=1\times P(c_1|\boldsymbol x)+\cdots+1\times P(c_{i-1}|\boldsymbol x)+0\times P(c_i|\boldsymbol x)+1\times P(c_{i+1}|\boldsymbol x)+\cdots+1\times P(c_N|\boldsymbol x)
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -20,9 +18,7 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 R(c_i|\boldsymbol x)=1-P(c_i|\boldsymbol x)
			
 
				-
			
 
				 $$
			
 
				 
			
 
				  此即式(7.5)。
			
@@ -50,13 +46,11 @@ $$
 
				 根据式(7.11)和式(7.10)可知参数求解式为 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned}
			
 
				 \hat{\boldsymbol{\theta}}_{c}&=\underset{\boldsymbol{\theta}_{c}}{\arg \max } LL\left(\boldsymbol{\theta}_{c}\right) \\
			
 
				 &=\underset{\boldsymbol{\theta}_{c}}{\arg \min } -LL\left(\boldsymbol{\theta}_{c}\right) \\
			
 
				 &= \underset{\boldsymbol{\theta}_{c}}{\arg \min }-\sum_{\boldsymbol{x} \in D_{c}} \log P\left(\boldsymbol{x} | \boldsymbol{\theta}_{c}\right)
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -64,9 +58,7 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 P\left(\boldsymbol{x} | \boldsymbol{\theta}_{c}\right)=P\left(\boldsymbol{x} | \boldsymbol{\mu}_{c}, \boldsymbol{\sigma}_{c}^{2}\right)=\frac{1}{\sqrt{(2 \pi)^{d}|\boldsymbol{\Sigma}_c|}} \exp \left(-\frac{1}{2}(\boldsymbol{x}-\boldsymbol{\mu}_c)^{\mathrm{T}} \boldsymbol{\Sigma}_c^{-1}(\boldsymbol{x}-\boldsymbol{\mu}_c)\right)
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -74,14 +66,12 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned}
			
 
				 (\hat{\boldsymbol{\mu}}_{c}, \hat{\boldsymbol{\Sigma}}_{c})&= \underset{(\boldsymbol{\mu}_{c},\boldsymbol{\Sigma}_c)}{\arg \min }-\sum_{\boldsymbol{x} \in D_{c}} \log\left[\frac{1}{\sqrt{(2 \pi)^{d}|\boldsymbol{\Sigma}_c|}} \exp \left(-\frac{1}{2}(\boldsymbol{x}-\boldsymbol{\mu}_c)^{\mathrm{T}} \boldsymbol{\Sigma}_c^{-1}(\boldsymbol{x}-\boldsymbol{\mu}_c)\right)\right] \\
			
 
				 &= \underset{(\boldsymbol{\mu}_{c},\boldsymbol{\Sigma}_c)}{\arg \min }-\sum_{\boldsymbol{x} \in D_{c}} \left[-\frac{d}{2}\log(2 \pi)-\frac{1}{2}\log|\boldsymbol{\Sigma}_c|-\frac{1}{2}(\boldsymbol{x}-\boldsymbol{\mu}_c)^{\mathrm{T}} \boldsymbol{\Sigma}_c^{-1}(\boldsymbol{x}-\boldsymbol{\mu}_c)\right] \\
			
 
				 &= \underset{(\boldsymbol{\mu}_{c},\boldsymbol{\Sigma}_c)}{\arg \min }\sum_{\boldsymbol{x} \in D_{c}} \left[\frac{d}{2}\log(2 \pi)+\frac{1}{2}\log|\boldsymbol{\Sigma}_c|+\frac{1}{2}(\boldsymbol{x}-\boldsymbol{\mu}_c)^{\mathrm{T}} \boldsymbol{\Sigma}_c^{-1}(\boldsymbol{x}-\boldsymbol{\mu}_c)\right] \\
			
 
				 &= \underset{(\boldsymbol{\mu}_{c},\boldsymbol{\Sigma}_c)}{\arg \min }\sum_{\boldsymbol{x} \in D_{c}} \left[\frac{1}{2}\log|\boldsymbol{\Sigma}_c|+\frac{1}{2}(\boldsymbol{x}-\boldsymbol{\mu}_c)^{\mathrm{T}} \boldsymbol{\Sigma}_c^{-1}(\boldsymbol{x}-\boldsymbol{\mu}_c)\right] \\
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -89,12 +79,10 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned}
			
 
				 (\hat{\boldsymbol{\mu}}_{c}, \hat{\boldsymbol{\Sigma}}_{c})&=\underset{(\boldsymbol{\mu}_{c},\boldsymbol{\Sigma}_c)}{\arg \min }\sum_{i=1}^{n} \left[\frac{1}{2}\log|\boldsymbol{\Sigma}_c|+\frac{1}{2}(\boldsymbol{x}_{i}-\boldsymbol{\mu}_c)^{\mathrm{T}} \boldsymbol{\Sigma}_c^{-1}(\boldsymbol{x}_{i}-\boldsymbol{\mu}_c)\right]\\
			
 
				 &=\underset{(\boldsymbol{\mu}_{c},\boldsymbol{\Sigma}_c)}{\arg \min }\frac{n}{2}\log|\boldsymbol{\Sigma}_c|+\sum_{i=1}^{n}\frac{1}{2}(\boldsymbol{x}_i-\boldsymbol{\mu}_c)^{\mathrm{T}} \boldsymbol{\Sigma}_c^{-1}(\boldsymbol{x}_i-\boldsymbol{\mu}_c)\\
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -102,7 +90,6 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned}
			
 
				 &\sum_{i=1}^{n}\frac{1}{2}(\boldsymbol{x}_i-\boldsymbol{\mu}_c)^{\mathrm{T}} \boldsymbol{\Sigma}_c^{-1}(\boldsymbol{x}_i-\boldsymbol{\mu}_c)\\
			
 
				 =&\frac{1}{2}\operatorname{tr}\left[\boldsymbol{\Sigma}_c^{-1}\sum_{i=1}^{n}(\boldsymbol{x}_i-\boldsymbol{\mu}_c)(\boldsymbol{x}_i-\boldsymbol{\mu}_c)^{\mathrm{T}}\right]\\
			
@@ -116,16 +103,13 @@ $$
 
				 =&\frac{1}{2}\operatorname{tr}\left[\boldsymbol{\Sigma}_c^{-1}\sum_{i=1}^{n}(\boldsymbol{x}_i-\bar{\boldsymbol{x}})(\boldsymbol{x}_i-\bar{\boldsymbol{x}})^{\mathrm{T}}\right]+\frac{n}{2}\operatorname{tr}\left[\boldsymbol{\Sigma}_c^{-1}(\boldsymbol{\mu}_c-\bar{\boldsymbol{x}})(\boldsymbol{\mu}_c-\bar{\boldsymbol{x}})^{\mathrm{T}}\right]\\
			
 
				 =&\frac{1}{2}\operatorname{tr}\left[\boldsymbol{\Sigma}_c^{-1}\sum_{i=1}^{n}(\boldsymbol{x}_i-\bar{\boldsymbol{x}})(\boldsymbol{x}_i-\bar{\boldsymbol{x}})^{\mathrm{T}}\right]+\frac{n}{2}(\boldsymbol{\mu}_c-\bar{\boldsymbol{x}})^{\mathrm{T}} \boldsymbol{\Sigma}_c^{-1}(\boldsymbol{\mu}_c-\bar{\boldsymbol{x}})
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				  所以
			
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 (\hat{\boldsymbol{\mu}}_{c}, \hat{\boldsymbol{\Sigma}}_{c})=\underset{(\boldsymbol{\mu}_{c},\boldsymbol{\Sigma}_c)}{\arg \min }\frac{n}{2}\log|\boldsymbol{\Sigma}_c|+\frac{1}{2}\operatorname{tr}\left[\boldsymbol{\Sigma}_{c}^{-1}\sum_{i=1}^{n}(\boldsymbol{x}_i-\bar{\boldsymbol{x}})(\boldsymbol{x}_i-\bar{\boldsymbol{x}})^{\mathrm{T}}\right]+\frac{n}{2}(\boldsymbol{\mu}_c-\bar{\boldsymbol{x}})^{\mathrm{T}} \boldsymbol{\Sigma}_c^{-1}(\boldsymbol{\mu}_c-\bar{\boldsymbol{x}})
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -133,9 +117,7 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \hat{\boldsymbol{\mu}}_{c}=\bar{\boldsymbol{x}}=\frac{1}{n}\sum_{i=1}^{n}\boldsymbol{x}_i
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -143,9 +125,7 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \hat{\boldsymbol{\Sigma}}_{c}=\underset{\boldsymbol{\Sigma}_c}{\arg \min }\frac{n}{2}\log|\boldsymbol{\Sigma}_c|+\frac{1}{2}\operatorname{tr}\left[\boldsymbol{\Sigma}_{c}^{-1}\sum_{i=1}^{n}(\boldsymbol{x}_i-\bar{\boldsymbol{x}})(\boldsymbol{x}_i-\bar{\boldsymbol{x}})^{\mathrm{T}}\right]
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -155,9 +135,7 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \frac{n}{2}\log|\boldsymbol{\Sigma}|+\frac{1}{2}\operatorname{tr}\left[\boldsymbol{\Sigma}^{-1}\mathbf{B}\right]\geq\frac{n}{2}\log|\mathbf{B}|+\frac{pn}{2}(1-\log n)
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -176,21 +154,17 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned}
			
 
				 L(\theta)&=\theta\cdot\theta\cdot(1-\theta)\cdot\theta\cdot(1-\theta)\\
			
 
				 &=\theta^{3}(1-\theta)^2
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				  对数似然为
			
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 LL(\theta)=\ln L(\theta)=3\ln\theta+2\ln (1-\theta)
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -198,13 +172,11 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned}
			
 
				     \frac{\partial LL(\theta)}{\partial\theta}&=\frac{\partial\left(3\ln\theta+2\ln (1-\theta)\right)}{\partial\theta}\\
			
 
				     &=\frac{3}{\theta}-\frac{2}{1-\theta}\\
			
 
				     &=\frac{3-5\theta}{\theta(1-\theta)}
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -226,10 +198,8 @@ $D=\{x_1,x_2,\cdots,x_n\}$，则根据贝叶斯式可得，在给定样本集$D$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 P(\theta|D)=\frac{P(D|\theta)P(\theta)}{P(D)}=\frac{P(D|\theta)P(\theta)}
			
 
				 {\sum_{\theta}P(D|\theta)P(\theta)}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -237,11 +207,9 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 P(\theta|D)=\frac{P(D|\theta)P(\theta)}
			
 
				 {\sum_{\theta}P(D|\theta)P(\theta)}=\frac{\prod_{i=1}^{n}P(x_i|\theta)
			
 
				 P(\theta)}{\sum_{\theta}\prod_{i=1}^{n}P(x_i|\theta)P(\theta)}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -259,9 +227,7 @@ Categorical分布又称为广义伯努利分布，是将伯努利分布中的随
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 P(X=x_i)=p(x_i)=\theta_i
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -276,11 +242,9 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 p(\boldsymbol{x};\boldsymbol{\alpha})=\frac{\Gamma \left(\sum _{i=1}^{k}\alpha _{i}\right)}
			
 
				 {\prod _{i=1}^{k}\Gamma (\alpha _{i})}\prod
			
 
				 _{i=1}^{k}x_{i}^{\alpha _{i}-1}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				  其中$\Gamma (z)=\int
			
@@ -295,9 +259,7 @@ d}x$为Gamma函数，当$\boldsymbol{\alpha}=(1,1,\cdots,1)$时，Dirichlet分
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 P(C=c_i)=P(c_i)=\theta_i
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -308,16 +270,13 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 P(D|\boldsymbol{\theta})=\theta_1^{y_1}...\theta_k^{y_k}=\prod_{i=1}^{k}\theta_i^{y_i}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
 
				 则有后验概率 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned}
			
 
				 P(\boldsymbol{\theta}|D)&=\frac{P(D|\boldsymbol{\theta})P(\boldsymbol{\theta})}{P(D)}\\
			
 
				 &=\frac{P(D|\boldsymbol{\theta})P(\boldsymbol{\theta})}{\sum_{\boldsymbol{\theta}}
			
@@ -326,7 +285,6 @@ P(D|\boldsymbol{\theta})P(\boldsymbol{\theta})}\\
 
				 P(\boldsymbol{\theta})}{\sum_{\boldsymbol{\theta}}\left[\prod_{i=1}^{k}\theta_i^{y_i}\cdot
			
 
				 P(\boldsymbol{\theta})\right]}
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -334,16 +292,13 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 P(\boldsymbol{\boldsymbol{\theta}};\boldsymbol{\alpha})=\frac{\Gamma \left(\sum_{i=1}^{k}\alpha_{i}\right)}{\prod_{i=1}^{k}\Gamma (\alpha_{i})}\prod_{i=1}^{k}\theta_{i}^{\alpha_{i}-1}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
 
				 将其代入$P(\boldsymbol{\theta}|D)$可得 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned}
			
 
				 P(\boldsymbol{\theta}|D)&=\dfrac{\prod_{i=1}^{k}\theta_i^{y_i}
			
 
				 \cdot P(\boldsymbol{\theta})}{\sum_{\boldsymbol{\theta}}
			
@@ -371,7 +326,6 @@ _{i=1}^{k}\theta_{i}^{\alpha _{i}-1}}
 
				 &=\dfrac{\prod_{i=1}^{k}\theta_i^{\alpha_{i}+y_i-1}}{\sum_{\boldsymbol{\theta}}
			
 
				 \left[\prod_{i=1}^{k}\theta_i^{\alpha_{i}+y_i-1}\right]}
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -379,7 +333,6 @@ $$
 
				 \mathbb{R}^{k}$，则根据Dirichlet分布的定义可知 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned}
			
 
				 P(\boldsymbol{\theta};\boldsymbol{\alpha}+\boldsymbol{y})&=
			
 
				 \dfrac{\Gamma \left(\sum _{i=1}^{k}(\alpha_{i}+y_i)\right)}{\prod _{i=1}^{k}\Gamma (\alpha_{i}+y_i)}\prod _{i=1}^{k}\theta_{i}^{\alpha_{i}+y_i-1} \\
			
@@ -397,14 +350,12 @@ _{i=1}^{k}(\alpha_{i}+y_i)\right)}{\prod _{i=1}^{k}\Gamma
 
				 _{i=1}^{k}\theta_{i}^{\alpha_{i}+y_i-1}\right] \\
			
 
				 \frac{1}{\sum_{\boldsymbol{\theta}}\left[\prod _{i=1}^{k}\theta_{i}^{\alpha_{i}+y_i-1}\right]}&=\frac{\Gamma \left(\sum _{i=1}^{k}(\alpha_{i}+y_i)\right)}{\prod _{i=1}^{k}\Gamma (\alpha_{i}+y_i)} \\
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				  将此结论代入$P(\boldsymbol{\theta}|D)$可得
			
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned}
			
 
				 P(\boldsymbol{\theta}|D)&=\frac{\prod_{i=1}^{k}\theta_i^{\alpha_{i}+y_i-1}}{\sum_{\boldsymbol{\theta}}\left[\prod_{i=1}^{k}\theta_i^{\alpha_{i}+y_i-1}\right]}\\
			
 
				 &=\frac{\Gamma \left(\sum _{i=1}^{k}(\alpha_{i}+y_i)\right)}{\prod
			
@@ -412,7 +363,6 @@ _{i=1}^{k}\Gamma
 
				 (\alpha_{i}+y_i)}\prod _{i=1}^{k}\theta_{i}^{\alpha _{i}+y_i-1} \\
			
 
				 &=P(\boldsymbol{\theta};\boldsymbol{\alpha}+\boldsymbol{y})
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -420,7 +370,6 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned}
			
 
				 \theta_i&=\mathbb E_{P(\boldsymbol{\theta}|D)}[\theta_i]\\
			
 
				 &=\mathbb E_{P(\boldsymbol{\theta};\boldsymbol{\alpha}+\boldsymbol{y})}[\theta_i]\\
			
@@ -428,7 +377,6 @@ $$
 
				 &=\frac{\alpha_i+y_i}{\sum_{j=1}^k\alpha_j+\sum_{j=1}^ky_j}\\
			
 
				 &=\frac{\alpha_i+y_i}{\sum_{j=1}^k\alpha_j+m}\\
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -446,9 +394,7 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 I(x_i,x_j|y)=\sum_{n=1}^{N}P(x_i,x_j|c_n)\log\frac{P(x_i,x_j|c_n)}{P(x_i|c_n)P(x_j|c_n)}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -460,13 +406,11 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned}
			
 
				 P(\boldsymbol{x}, c) & =P\left(x_1, x_2, \ldots, x_d, c\right) \\
			
 
				 & =P\left(x_1, x_2, \ldots, x_d \mid c\right) P(c) \\
			
 
				 & =P\left(x_1, \ldots, x_{i-1}, x_{i+1}, \ldots, x_d \mid c, x_i\right) P\left(c, x_i\right)
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -475,9 +419,7 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 P(x_1,...,x_{i-1},x_{i+1},...,x_d|c,x_i)=\prod_{\substack{j=1\\j\neq i}}^{d}P(x_j|c,x_i)
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -485,23 +427,19 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 P(x_1,...,x_{i-1},x_{i+1},...,x_d|c,x_i)=\prod_{j=1}^{d}P(x_j|c,x_i)
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
 
				 综上可得： 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned}
			
 
				 P(c|\boldsymbol{x})&=\frac{P(\boldsymbol{x},c)}{P(\boldsymbol{x})}\\ 
			
 
				 &=\frac{P\left(c, x_i\right)P\left(x_1, \ldots, x_{i-1}, x_{i+1}, \ldots, x_d \mid c, x_i\right)}{P(\boldsymbol{x})}\\
			
 
				 &\propto P\left(c, x_i\right)P\left(x_1, \ldots, x_{i-1}, x_{i+1}, \ldots, x_d \mid c, x_i\right) \\
			
 
				 &=P\left(c, x_i\right)\prod_{j=1}^{d}P(x_j|c,x_i)
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -520,27 +458,23 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned} 
			
 
				 P(x_3,x_4|x_1)&=\frac{P(x_1,x_3,x_4)}{P(x_1)} \\
			
 
				 &=\frac{P(x_1)P(x_3|x_1)P(x_4|x_1)}{P(x_1)} \\
			
 
				 &=P(x_3|x_1)P(x_4|x_1) \\
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				  顺序结构：在给定节点$x$的条件下$y,z$独立
			
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned} 
			
 
				 P(y,z|x)&=\frac{P(x,y,z)}{P(x)} \\
			
 
				 &=\frac{P(z)P(x|z)P(y|x)}{P(x)} \\
			
 
				 &=\frac{P(z,x)P(y|x)}{P(x)} \\
			
 
				 &=P(z|x)P(y|x) \\
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -555,9 +489,7 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 f\left(t x_1 + (1-t)x_2\right)\leqslant tf(x_1)+(1-t)f(x_2)
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -565,9 +497,7 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 f(t_1 x_1 + t_2x_2+...+t_nx_n)\leqslant t_1f(x_1)+t_2f(x_2)+...+t_nf(x_n)
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -575,9 +505,7 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \varphi(\mathbb{E}[X])\leqslant \mathbb{E}[\varphi(X)]
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -589,12 +517,10 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned} 
			
 
				 LL(\theta) &=\sum_{i=1}^{m} \ln p(x_i; \theta) \\ 
			
 
				 &=\sum_{i=1}^{m} \ln \sum_{z_i} p(x_i, z_i; \theta) 
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -606,13 +532,11 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned} 
			
 
				 LL(\theta)&=\ln P(X\vert \theta)\\
			
 
				 &=\ln \sum_Z P(X,Z\vert\theta)\\
			
 
				 &=\ln \left(\sum_Z P(X\vert Z,\theta)P(Z\vert \theta)\right)
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -620,18 +544,15 @@ EM算法采用的是通过迭代逐步近似极大化$L(\theta)$：假设第$t$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned}
			
 
				 LL(\theta)-LL(\theta^{(t)})&=\ln \left(\sum_Z P(X\vert Z,\theta)P(Z\vert \theta)\right)-\ln P(X\vert\theta^{(t)}) \\
			
 
				 &=\ln \left(\sum_Z P(Z\vert X,\theta^{(t)}) \cfrac{P(X\vert Z,\theta)P(Z\vert \theta)}{P(Z\vert X,\theta^{(t)})}\right)-\ln P(X\vert\theta^{(t)})
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				  由上述Jensen不等式可得 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned}
			
 
				 LL(\theta)-LL(\theta^{(t)})
			
 
				 &\geqslant \sum_Z P(Z\vert X,\theta^{(t)})\ln \cfrac{P(X\vert Z,\theta)P(Z\vert \theta)}{P(Z\vert X,\theta^{(t)})}-\ln P(X\vert\theta^{(t)}) \\
			
@@ -640,25 +561,20 @@ LL(\theta)-LL(\theta^{(t)})
 
				 &=\sum_Z P(Z\vert X,\theta^{(t)}) \left( \ln \cfrac{P(X\vert Z,\theta)P(Z\vert \theta)}{P(Z\vert X,\theta^{(t)})} - \ln P(X\vert\theta^{(t)}) \right)\\
			
 
				 &= \sum_Z P(Z\vert X,\theta^{(t)})\ln \cfrac{P(X\vert Z,\theta)P(Z\vert \theta)}{P(Z\vert X,\theta^{(t)})P(X\vert\theta^{(t)})}
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				  令
			
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 B(\theta,\theta^{(t)})=LL(\theta^{(t)})+\sum_Z P(Z\vert X,\theta^{(t)})\ln \cfrac{P(X\vert Z,\theta)P(Z\vert \theta)}{P(Z\vert X,\theta^{(t)})P(X\vert\theta^{(t)})}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
 
				 则 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 LL(\theta)\geqslant B(\theta,\theta^{(t)})
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -666,9 +582,7 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 B(\theta^{(t+1)},\theta^{(t)}) \geqslant B(\theta,\theta^{(t)})
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -676,18 +590,14 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 LL(\theta^{(t+1)})\geqslant B(\theta^{(t+1)},\theta^{(t)})\geqslant B(\theta^{(t)},\theta^{(t)})=LL(\theta^{(t)})
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 LL(\theta^{(t+1)})\geqslant LL(\theta^{(t)})
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -695,24 +605,20 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned}
			
 
				 \theta^{(t+1)}&=\mathop{\arg\max}_{\theta}B(\theta,\theta^{(t)}) \\
			
 
				 &=\mathop{\arg\max}_{\theta}\left( LL(\theta^{(t)})+\sum_Z P(Z\vert X,\theta^{(t)})\ln \cfrac{P(X\vert Z,\theta)P(Z\vert \theta)}{P(Z\vert X,\theta^{(t)})P(X\vert\theta^{(t)})}\right)
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				  略去对$\theta$极大化而言是常数的项 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned}
			
 
				 \theta^{(t+1)}&=\mathop{\arg\max}_{\theta}\left(\sum_Z P(Z\vert X,\theta^{(t)})\ln\left( P(X\vert Z,\theta)P(Z\vert \theta)\right)\right) \\
			
 
				 &=\mathop{\arg\max}_{\theta}\left(\sum_Z P(Z\vert X,\theta^{(t)})\ln P(X,Z\vert \theta)\right) \\
			
 
				 &=\mathop{\arg\max}_{\theta}Q(\theta,\theta^{(t)})
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -722,9 +628,7 @@ E步：计算完全数据的对数似然函数$\ln P(X,Z\vert \theta)$关于在
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 Q(\theta,\theta^{(t)})=\mathbb{E}_Z[\ln P(X,Z\vert \theta)\vert X,\theta^{(t)}]=\sum_Z P(Z\vert X,\theta^{(t)})\ln P(X,Z\vert \theta)
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -735,13 +639,11 @@ M步：求使得$Q(\theta,\theta^{(t)})$达到极大的$\theta^{(t+1)}$。
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned} 
			
 
				 LL(\theta) &=\sum_{i=1}^{m} \ln p(x_i; \theta) \\ 
			
 
				 &=\sum_{i=1}^{m} \ln \sum_{z_i} p(x_i, z_i; \theta) \\
			
 
				 &=\sum_{i=1}^{m} \ln \sum_{z_i} Q_i(z_i)\cfrac{p(x_i, z_i; \theta)}{Q_i(z_i)} \\
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -749,9 +651,7 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \sum_{z_i} Q_i(z_i)\cfrac{p(x_i, z_i; \theta)}{Q_i(z_i)}=\mathbb{E}_{z_i}\left[\cfrac{p(x_i, z_i; \theta)}{Q_i(z_i)}\right]
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -759,29 +659,23 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \ln\left(\mathbb{E}_{z_i}\left[\cfrac{p(x_i, z_i; \theta)}{Q_i(z_i)}\right]\right)\geqslant \mathbb{E}_{z_i}\left[\ln\left(\cfrac{p(x_i, z_i; \theta)}{Q_i(z_i)}\right)\right]
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \ln\sum_{z_i} Q_i(z_i)\cfrac{p(x_i, z_i; \theta)}{Q_i(z_i)}\geqslant \sum_{z_i} Q_i(z_i)\ln\cfrac{p(x_i, z_i; \theta)}{Q_i(z_i)}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
 
				 将此式代入$LL(\theta)$可得 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned} 
			
 
				 LL(\theta) &=\sum_{i=1}^{m} \ln \sum_{z_i} Q_i(z_i)\cfrac{p(x_i, z_i; \theta)}{Q_i(z_i)}\geqslant \sum_{i=1}^{m}\sum_{z_i} Q_i(z_i)\ln\cfrac{p(x_i, z_i; \theta)}{Q_i(z_i)} \quad \textcircled{1}
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -789,54 +683,42 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \cfrac{p(x_i, z_i; \theta)}{Q_i(z_i)}=c
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 p(x_i, z_i; \theta)=c\cdot Q_i(z_i)
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \sum_{z_i}p(x_i, z_i; \theta)=c\cdot \sum_{z_i}Q_i(z_i)
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \sum_{z_i}p(x_i, z_i; \theta)=c
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \cfrac{p(x_i, z_i; \theta)}{Q_i(z_i)}=\sum_{z_i}p(x_i, z_i; \theta)
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 Q_i(z_i)=\cfrac{p(x_i, z_i; \theta)}{\sum\limits_{z_i}p(x_i, z_i; \theta)}=\cfrac{p(x_i, z_i; \theta)}{p(x_i; \theta)}=p(z_i|x_i; \theta)
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -844,14 +726,12 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned} 
			
 
				 LL(\theta) &=\sum_{i=1}^{m} \ln \sum_{z_i} Q_i(z_i)\cfrac{p(x_i, z_i; \theta)}{Q_i(z_i)} & \quad \textcircled{2}\\
			
 
				 &=\sum_{i=1}^{m} \ln \sum_{z_i}p(z_i|x_i; \theta)\cfrac{p(x_i, z_i; \theta)}{p(z_i|x_i; \theta)} & \quad \textcircled{3}\\
			
 
				 &=\sum_{i=1}^{m}\sum_{z_i} p(z_i|x_i; \theta)\ln\cfrac{p(x_i, z_i; \theta)}{p(z_i|x_i; \theta)} & \quad \textcircled{4}\\
			
 
				 &=\max\{B(\theta)\} & \quad \textcircled{5} \\
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -859,20 +739,17 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned} 
			
 
				 \theta^{(t+1)}&=\arg\max_{\theta}\max\{B(\theta)\}  & \quad \textcircled{6}\\
			
 
				 &=\arg\max_{\theta}\sum_{i=1}^{m}\sum_{z_i} p(z_i|x_i;\theta^{(t)})\ln\cfrac{p(x_i, z_i; \theta)}{p(z_i|x_i; \theta^{(t)})}  & \quad \textcircled{7}\\
			
 
				 &=\arg\max_{\theta}\sum_{i=1}^{m}\sum_{z_i} p(z_i|x_i;\theta^{(t)})\ln p(x_i, z_i; \theta) & \quad \textcircled{8}
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				  此时将$\theta^{(t+1)}$代入$LL(\theta)$可推得
			
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned} 
			
 
				 LL(\theta^{(t+1)}) &=\max\{B(\theta^{(t+1)})\} &\quad\textcircled{9} \\
			
 
				 &=\sum_{i=1}^{m}\sum_{z_i} p(z_i|x_i; \theta^{(t+1)})\ln\cfrac{p(x_i, z_i; \theta^{(t+1)})}{p(z_i|x_i; \theta^{(t+1)})} &\quad\textcircled{10}\\
			
@@ -881,7 +758,6 @@ LL(\theta^{(t+1)}) &=\max\{B(\theta^{(t+1)})\} &\quad\textcircled{9} \\
 
				 &=\max\{B(\theta^{(t)})\} &\quad\textcircled{13} \\
			
 
				 &=LL(\theta^{(t)})&\quad\textcircled{14}
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -889,9 +765,7 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 Q(\theta,\theta^{(t)})=\sum_{i=1}^{m}\sum_{z_i} p(z_i|x_i; \theta^{(t)})\ln p(x_i, z_i; \theta)
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -905,14 +779,12 @@ M步：求使得$Q(\theta,\theta^{(t)})$到达极大的$\theta^{(t+1)}$。
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned} Q(\theta|\theta^{(t)})&=\sum_Z P(Z|X,\theta^{(t)})\ln P(X,Z|\theta) \\
			
 
				 &=\sum_{z_1,z_2,...,z_m}\left\{\prod_{i=1}^mP(z_i|x_i,\theta^{(t)})\ln\left[ \prod_{i=1}^m P(x_i,z_i|\theta) \right] \right\} \\
			
 
				 &=\sum_{z_1,z_2,...,z_m}\left\{\prod_{i=1}^mP(z_i|x_i,\theta^{(t)})\left[ \sum_{i=1}^m\ln P(x_i,z_i|\theta) \right] \right\} \\
			
 
				 &=\sum_{z_1,z_2,...,z_m}\left\{\prod_{i=1}^mP(z_i|x_i,\theta^{(t)})\left[\ln P(x_1,z_1|\theta) + \ln P(x_2,z_2|\theta) +...+ \ln P(x_m,z_m|\theta)\right] \right\} \\
			
 
				 &=\sum_{z_1,z_2,...,z_m}\left[\prod_{i=1}^mP(z_i|x_i,\theta^{(t)})\cdot\ln P(x_1,z_1|\theta) \right]+...+\sum_{z_1,z_2,...,z_m}\left[\prod_{i=1}^mP(z_i|x_i,\theta^{(t)})\cdot\ln P(x_m,z_m|\theta) \right]  \\
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
@@ -920,7 +792,6 @@ $$
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned} 
			
 
				 &\sum\limits_{z_1,z_2,...,z_m}\left[\prod\limits_{i=1}^mP(z_i|x_i,\theta^{(t)})\cdot\ln P(x_1,z_1|\theta) \right] \\
			
 
				 =&\sum\limits_{z_1,z_2,...,z_m}\left[\prod_{i=2}^mP(z_i|x_i,\theta^{(t)})\cdot P(z_1|x_1,\theta^{(t)})\cdot\ln P(x_1,z_1|\theta) \right] \\
			
@@ -933,42 +804,35 @@ $$
 
				 =&\sum_{z_1}P(z_1|x_1,\theta^{(t)})\ln P(x_1,z_1|\theta)\times \left\{1\times1\times...\times1\right\} \\
			
 
				 =&\sum_{z_1}P(z_1|x_1,\theta^{(t)})\ln P(x_1,z_1|\theta)  \\
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				  所以
			
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \sum\limits_{z_1,z_2,...,z_m}\left[\prod_{i=1}^mP(z_i|x_i,\theta^{(t)})\cdot\ln P(x_1,z_1|\theta) \right]=\sum_{z_1}P(z_1|x_1,\theta^{(t)})\ln P(x_1,z_1|\theta)
			
 
				-
			
 
				 $$
			
 
				 
			
 
				 
			
 
				 同理可得 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned} 
			
 
				 \sum\limits_{z_1,z_2,...,z_m}\left[\prod_{i=1}^mP(z_i|x_i,\theta^{(t)})\cdot\ln P(x_2,z_2|\theta) \right] &=\sum_{z_2}P(z_2|x_2,\theta^{(t)})\ln P(x_2,z_2|\theta) \\
			
 
				 &\vdots\\
			
 
				 \sum\limits_{z_1,z_2,...,z_m}\left[\prod_{i=1}^mP(z_i|x_i,\theta^{(t)})\cdot\ln P(x_m,z_m|\theta) \right] &=\sum_{z_m}P(z_m|x_m,\theta^{(t)})\ln P(x_m,z_m|\theta)
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$
			
 
				 
			
 
				  将上式代入$Q(\theta|\theta^{(t)})$可得
			
 
				 
			
 
				 
			
 
				 $$
			
 
				-
			
 
				 \begin{aligned} 
			
 
				 Q(\theta|\theta^{(t)})&=\sum_{z_1,z_2,...,z_m}\left[\prod_{i=1}^mP(z_i|x_i,\theta^{(t)})\cdot\ln P(x_1,z_1|\theta) \right]+...+\sum_{z_1,z_2,...,z_m}\left[\prod_{i=1}^mP(z_i|x_i,\theta^{(t)})\cdot\ln P(x_m,z_m|\theta) \right]  \\
			
 
				 &=\sum_{z_1}P(z_1|x_1,\theta^{(t)})\ln P(x_1,z_1|\theta) +...+\sum_{z_m}P(z_m|x_m,\theta^{(t)})\ln P(x_m,z_m|\theta) \\
			
 
				 &=\sum_{i=1}^m\sum_{z_i}P(z_i|x_i,\theta^{(t)})\ln P(x_i,z_i|\theta)\\
			
 
				 \end{aligned}
			
 
				-
			
 
				 $$