diff --git a/lectures/05-Metropolis-Hastings.qmd b/lectures/05-Metropolis-Hastings.qmd
index 6183a8b..503fc29 100644
--- a/lectures/05-Metropolis-Hastings.qmd
+++ b/lectures/05-Metropolis-Hastings.qmd
@@ -11,27 +11,6 @@
 * Markov chains are defined by stochastic matrices; they have at least one stationary distribution
 
-```{python}
-import numpy as np
-import matplotlib.pylab as plt
-
-plt.rc('font', size=20)
-
-# transition matrix as defined in last lecture
-def transition_matrix(alpha, beta):
-    return np.array([[1-alpha, beta],
-                     [alpha, 1-beta]])
-
-# sampling method as defined in last lecture
-def sample_chain(S, alpha=0.5, beta=0.5, x0=0):
-    X = [x0]
-    P = transition_matrix(alpha, beta)
-    while len(X) < S:
-        p = P[:,X[-1]]
-        X.append(np.random.multinomial(1, p).argmax())
-    return np.array(X)
-```
-
 * Reducible and periodic chains will not converge to a unique stationary distribution
 
 Only if the Markov chain is both *irreducible* and *aperiodic* do we have a unique stationary distribution. This is detailed in the following theorems.
 
@@ -58,31 +37,71 @@ $$
 
 That is, if $P$ is irreducible and aperiodic, then matrix powers of $P$ converge to a rank-1 matrix.
 
-The fundamental theorem for Markov chains follows from the [Perron-Frobenius theorem](https://en.wikipedia.org/wiki/Perron%E2%80%93Frobenius_theorem) for non-negative matrices and the irreducibility of $P$.
-
-Why does theorem (@eq-fundamental) have implications for sampling? As we saw in the previous lectures, it might be difficult to sample a probabilistic model directly. Sometimes variable transformations allow us to sample a model directly, but this is only rarely the case for complex models. When resorting to rejection or importance sampling, it is generally difficult to find a good proposal distribution. On the other hand, simulation of a Markov chain is simple to implement (see algorithm above): we just have to move from $x$ to $y$ according to $P(y, x)$. No matter where we start in sample space, the states that we produce by simulating a Markov chain will eventually follow the stationary distribution.
+#### Convergence of two-state example
 
-But there is still something missing in order to use the simulation of a Markov chain for probabilistic inference. In our setting, we are given a probabilistic model $p$ (our target distribution) rather than a transition matrix $P$. So we are still facing the challenge of designing a suitable Markov chain that has the desired target as its stationary distribution. This problem has been solved in a very ingenious fashion by Metropolis et al., as we will see soon.
+To demonstrate this with a simple example, we revisit the two-state model from last lecture.
 
 ```{python}
+# Here we demonstrate that matrix powers of the transition matrix
+# converge to a rank-1 matrix, using our two-state model as an example
+
+import numpy as np
+import matplotlib.pylab as plt
+
+plt.rc('font', size=20)
+
+# transition matrix as defined in last lecture
+def transition_matrix(alpha, beta):
+    """
+    return the transition matrix of the two-state model
+    given the model parameters `alpha` and `beta`
+    """
+    return np.array([[1-alpha, beta],
+                     [alpha, 1-beta]])
+
+# sampling method as defined in last lecture
+def sample_chain(S, alpha=0.5, beta=0.5, x0=0):
+    """
+    Sample `S` values from the two-state Markov chain
+    given the model parameters `alpha` and `beta` and
+    an initial state `x0` ∈ {0, 1}
+    """
+    X = [x0]
+    P = transition_matrix(alpha, beta)
+    while len(X) < S:
+        p = P[:,X[-1]]  # transition probabilities given the last state
+        x = np.random.choice([0, 1], p=p)  # sample new state ∈ {0, 1}
+        X.append(x)
+    return np.array(X)
 
-# matrix power converges to rank-1 matrix
 def compute_pi(alpha, beta):
+    """
+    compute the stationary distribution of the two-state model
+    given the model parameters `alpha` and `beta`
+    """
     return np.array([beta, alpha]) / (alpha + beta)
 
-S = 50
-chains = [(0.1, 0.1), (0.1, 0.), (1e-2, 0.), (0.9, 0.9), (0.99, 0.99), (0.1, 0.7)]
+S = 50  # number of steps
+# list of model parameters ((alpha, beta) pairs):
+params = [
+    (0.10, 0.10),
+    (0.10, 0.00),  # reducible (can't reach x1 from x2)
+    (0.01, 0.00),  # reducible
+    (0.90, 0.90),
+    (0.99, 0.99),  # close to periodic
+    (0.10, 0.70),
+]
 
 kw = dict(ylim=[-0.1, 2.1], ylabel=r'$|P^s-\pi\mathbb{1}^T|$', xlabel='$s$')
 
 fig, ax = plt.subplots(2, 3, figsize=(12, 6), sharex='all', sharey='all',
                        subplot_kw=kw)
 ax = list(ax.flat)
 
-for i, (alpha, beta) in enumerate(chains):
+for i, (alpha, beta) in enumerate(params):
 
     P = transition_matrix(alpha, beta)
-    pi = np.array([beta, alpha]) / (alpha + beta)
-    P_inf = np.multiply.outer(pi, np.ones(2))
-    P_s = P.copy()
+    pi = compute_pi(alpha, beta)  # expected stationary distribution
+    P_inf = np.multiply.outer(pi, np.ones(2))  # expected P_s for s -> ∞, a rank-1 matrix
+    P_s = P.copy()  # initialize P_s with P_1
 
     d = []
     while len(d) < S:
@@ -96,8 +115,16 @@ fig.tight_layout()
 ```
 
 ```{python}
-# convergence in distribution
+# Here we demonstrate that we eventually produce samples from the
+# stationary distribution *independent* of which initial state we
+# start from.
+
 def estimate_pi0(X):
+    """
+    Given a list of samples `X` from the two-state Markov chain,
+    return a **running** list of estimates of the probability of
+    state x1 (encoded as 0) under the stationary distribution.
+    """
     return 1-np.add.accumulate(X) / np.add.accumulate(np.ones(len(X)))
 
 S = 1000
@@ -106,7 +133,7 @@ kw = dict(ylim=[-0.1, 1.1], ylabel=r'$p^{(s)}(x=x_1)$', xlabel='$s$')
 fig, ax = plt.subplots(2, 3, figsize=(12, 6), sharex='all', sharey='all',
                        subplot_kw=kw)
 ax = list(ax.flat)
-for i, (alpha, beta) in enumerate(chains):
+for i, (alpha, beta) in enumerate(params):
     X = sample_chain(S, alpha, beta, x0=0)
     pi = compute_pi(alpha, beta)
     pi_est = estimate_pi0(X)
@@ -117,6 +144,13 @@ for i, (alpha, beta) in enumerate(chains):
 fig.tight_layout()
 ```
 
+The fundamental theorem for Markov chains follows from the [Perron-Frobenius theorem](https://en.wikipedia.org/wiki/Perron%E2%80%93Frobenius_theorem) for non-negative matrices and the irreducibility of $P$.
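+
+This can be verified numerically for the two-state model: by the theorem, the stationary distribution is the eigenvector of $P$ for the eigenvalue $1$, normalized to sum to one. A minimal sketch, with an arbitrary choice of `alpha` and `beta` and reusing the functions defined above:
+
+```{python}
+# numerical check: the stationary distribution equals the eigenvector
+# of P for the eigenvalue 1, normalized to a probability vector
+alpha, beta = 0.1, 0.7  # arbitrary example parameters
+P = transition_matrix(alpha, beta)
+eigvals, eigvecs = np.linalg.eig(P)
+v = eigvecs[:, np.argmin(np.abs(eigvals - 1.))].real
+print(v / v.sum())              # normalized eigenvector
+print(compute_pi(alpha, beta))  # closed-form stationary distribution
+```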
+
+Why does theorem (@eq-fundamental) have implications for sampling? As we saw in the previous lectures, it might be difficult to sample a probabilistic model directly. Sometimes variable transformations make direct sampling possible, but this is rarely the case for complex models. When resorting to rejection or importance sampling, it is generally difficult to find a good proposal distribution. On the other hand, simulating a Markov chain is simple to implement (see algorithm above): we just have to move from $x$ to $y$ according to $P(y, x)$. No matter where we start in sample space, the states that we produce by simulating a Markov chain will eventually follow the stationary distribution.
+
+But there is still something missing in order to use the simulation of a Markov chain for probabilistic inference. In our setting, we are given a probabilistic model $p$ (our target distribution) rather than a transition matrix $P$. So we are still facing the challenge of designing a suitable Markov chain that has the desired target as its stationary distribution. This problem has been solved in a very ingenious fashion by Metropolis et al., as we will see soon.
+
+
 ### Strong law of large numbers (LLN) for Markov chains
 
 Before we explain the Metropolis algorithm, let us briefly state the convergence result for irreducible and aperiodic Markov chains in a fashion that is closer to the *Monte Carlo approximation* introduced in the first lecture.
@@ -349,7 +383,7 @@ X = np.arange(m)
 p = X + 1.
 p *= 2 / m / (m+1)
 
-# uniform proposal
+# uniform proposal (accepts the current state as argument, but does not use it)
 Q = lambda x=None: np.random.choice(X)
 
 x = Q()
@@ -360,7 +394,7 @@ while len(samples) < 1e5:
 
     # proposal step
    y = Q(x)
 
-    # acceptance probability
+    # acceptance ratio
    A = p[y] / p[x]
 
    # accept / reject?
@@ -374,8 +408,12 @@ counts = counts / float(counts.sum())
 
 fig, ax = plt.subplots(1, 2, figsize=(10, 4))
 ax[0].bar(bins, counts, color='k', alpha=0.2)
+ax[0].set_xlabel("x")
+ax[0].set_ylabel("p(x)")
 ax[0].step(np.append(-1,X) + 0.5, np.append(0,p), color='r')
 ax[1].plot(samples[-500:], color='k', alpha=0.75)
+ax[1].set_xlabel("sample")
+ax[1].set_ylabel("x")
 fig.tight_layout()
 ```
 
@@ -423,7 +461,7 @@ The acceptance ratio $R(y, x)$ (Eq. @eq-acceptance-ratio) assesses how unbalance
 To better understand how the mapping from some irreducible proposal chain $Q$ to a $p$-reversible Metropolis chain $P$ works, let me try to explain a very nice paper by [Billera & Diaconis: A Geometric Interpretation of the Metropolis-Hastings Algorithm](https://projecteuclid.org/euclid.ss/1015346318). This paper sets out to provide a global view on why the MH algorithm in some sense provides the optimal way of turning some arbitrary Markov chain $Q$ into a Markov chain with the desired stationary distribution.
 
-First let us think of the space of all possible Markov chains indexed by states from the finite sample space $\mathcal X$. This space is formed by left stochastic square matrices of size $|\mathcal X|$ and will be called $\mathcal S(\mathcal X)$. $\mathcal S(\mathcal X)$ is convex, because the convex combination of two Markov matrices is again a stochastic matrix. The dimension of $\mathcal S(\mathcal X)$ is $|\mathcal X|(|\mathcal X| -1 )$: there are $|X|^2$ non-negative entries in total from which we need to subtract $|X|$ diagonal entries that are fixed by column stochasticity (Eq. @eq-leftstochastic).
+First let us think of the space of all possible Markov chains indexed by states from the finite sample space $\mathcal X$. This space is formed by left stochastic square matrices of size $|\mathcal X|$ and will be called $\mathcal S(\mathcal X)$. $\mathcal S(\mathcal X)$ is convex, because a convex combination of two stochastic matrices is again a stochastic matrix. The dimension of $\mathcal S(\mathcal X)$ is $|\mathcal X|(|\mathcal X| - 1)$: there are $|\mathcal X|^2$ non-negative entries in total from which we need to subtract $|\mathcal X|$ diagonal entries that are fixed by column stochasticity (Eq. @eq-leftstochastic).
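+
+To make the convexity claim concrete, here is a quick numerical check (a sketch with arbitrarily chosen parameters, reusing `transition_matrix` from above):
+
+```{python}
+# a convex combination of two stochastic matrices is again stochastic:
+# its columns still sum to one
+P1 = transition_matrix(0.1, 0.7)
+P2 = transition_matrix(0.9, 0.2)
+lam = 0.3  # arbitrary mixing weight in [0, 1]
+print((lam * P1 + (1 - lam) * P2).sum(axis=0))  # columns sum to one
+```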
 
 For a fixed target distribution $p$, the subset $\mathcal R(p)$ of all Markov matrices that are $p$-reversible
 
 $$
@@ -448,6 +486,7 @@ a straight line segment through the origin with slope $p(x_1)/p(x_2)$.
 
 The following figure shows $\mathcal{R}(p)$ for $p(x_1) = 0.4$, $p(x_2) = 1-p(x_1) = 0.6$.
 
 ```{python}
+#| echo: false
 # 2D visualization
 
 def make_plot(p0=0.4, limits=(-0.05, 1.05), ax=None):
@@ -493,7 +532,7 @@ $$
 \begin{pmatrix}
 \alpha \\ \beta
 \end{pmatrix}
 \to
-\min\bigl\{\alpha\, p(x_1), \beta\, (1-p(x_2)) \bigr\}
+\min\bigl\{\alpha\, p(x_1), \beta\, p(x_2) \bigr\}
 \begin{pmatrix}
 1 / p(x_1) \\ 1 / p(x_2)
 \end{pmatrix}
@@ -502,6 +541,7 @@ $$
 
 Examples for the Metropolis map are shown in the following figure:
 
 ```{python}
+#| echo: false
+# the Metropolis map for the two-state model: maps the off-diagonal
+# entries (alpha, beta) of a proposal chain to those of the
+# corresponding Metropolis chain
 def M(p0, alpha, beta):
     alpha_new = min(alpha, (1-p0) * beta / p0)
     beta_new = min(beta, p0 * alpha / (1-p0))
@@ -526,20 +566,21 @@ In the above figure, $Q$ is either shifted to the left along the $\alpha$ axis,
 
 A suitable metric on $\mathcal S(\mathcal{X})$ is
 
 $$
-d(P, P') = \sum_{x\in\mathcal{X}} \sum_{y\not= x} p(x)\, \left|P(y, x) - P'(y, x)\right|
+d(Q, Q') = \sum_{x\in\mathcal{X}} \sum_{y\not= x} p(x)\, \left|Q(y, x) - Q'(y, x)\right|
 $$ {#eq-metric}
 
-which is only zero, if $P'=P$. The following figure shows "circles" around some $Q\in\mathcal S(\mathcal X)$ which are of course not actual circles because $d(P,P')$ is a weighted L1 norm, so $d$-circles are diamonds.
+which is zero if and only if $Q'=Q$. The following figure shows "circles" around some $Q\in\mathcal S(\mathcal X)$, which are of course not actual circles: $d(Q,Q')$ is a weighted L1 distance, so $d$-circles are diamonds.
 
 For the 2-state system, we have
 
 $$
-d(P, P') = p(x_1)\, |\alpha - \alpha'| + p(x_2)\, |\beta - \beta'|
+d(Q, Q') = p(x_1)\, |\alpha - \alpha'| + p(x_2)\, |\beta - \beta'|
 $$
 
-where $\alpha, \alpha'$ etc. are the off-diagonal entries of the transition matrices $P, P'$.
+where $\alpha, \alpha'$ etc. are the off-diagonal entries of the transition matrices $Q, Q'$.
 
 ```{python}
+#| echo: false
+# the metric of Eq. @eq-metric: a p-weighted L1 distance over the
+# off-diagonal entries (the mask M zeroes out the diagonal)
 def distance(p, P, Q):
     M = 1 - np.eye(len(p))
     return np.sum(np.fabs(P-Q)*M*p)