import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('seaborn-v0_8-darkgrid')Markov Chain Monte Carlo
Grabbing our libraries
Writing a function to select random values from any probability distribution:
def sample_from_prob_dist(pdf, lower, upper, num):
samples = []
while len(samples) < num:
x = np.random.uniform(lower, upper)
p = pdf(x)
if np.random.uniform(0,1) <= p:
samples.append(x)
return np.array(samples)Creating a convenience function to plot and compute areas:
def compute_area(func, b_low, b_up, prob, n_samples):
func = np.vectorize(func)
prob = np.vectorize(prob)
samples = sample_from_prob_dist(prob, b_low, b_up, n_samples)
f = func(samples)
g = prob(samples)
out = np.mean(f/g)
xs = np.linspace(b_low, b_up, 1000)
fig, ax = plt.subplots()
ax.plot(xs, func(xs))
ax.fill_between(xs, 0, func(xs), alpha=0.5, hatch='/')
ax.plot(samples, func(samples), 'x', alpha=0.5)
ax.plot(xs, prob(xs), '--', lw=2)
ax.set_title(f"Area = {out:0.3f}")
plt.show()
return outTesting out with a simple function
def f(x):
return -4/5 * x + 4
compute_area(f, 0, 5, lambda x: 1 / 5, 1000)9.756275336303013
And so everything seems to work fine here.
Trying a new function, and one with a non-uniform probability distribution:
def g(x):
if x < 4:
return 0.001
elif x < 5:
return 5*x - 20
elif x < 6:
return -5*x + 30
else:
return 0.001
def p(x):
if x < 3:
return 0.0
elif x < 5:
return 0.25*x - 0.75
elif x < 7:
return -0.25*x + 1.75
else:
return 0.0
compute_area(g, 0, 10, lambda x: 1 / 10, 1000)
compute_area(g, 0, 10, p, 1000)5.103788070059598
Oh thank heavens, we finally got it working. Note to self: always make sure your pdf’s actually have area 1…
Chain it?
I’ll begin by trying to define a function to do the random walk:
def chain_sampler(func, low, high, start, steps):
chain = [start]
for i in range(0, steps+1):
current = chain[i]
prop = np.random.normal(loc=chain[-1], scale=1)
#print(current, prop, func(current)/func(prop))
if func(prop)/func(current) > np.random.uniform(0,1) and low <= prop <= high:
chain.append(prop)
else:
chain.append(current)
#chain.append(prop)
return np.array(chain)Testing this out?
samples = chain_sampler(g, 0, 10, 3, 100_000)
values, bins, _ = plt.hist(samples, bins=500, density=True)
plt.show()Then in theory we could sample from this for our integration?
def p(x):
bin_idx = np.argmax(x <= bins)
if bin_idx >= len(values):
return 0.0
return float(values[bin_idx])
compute_area(g, 0, 10, p, 1000)5.021989172865116
Seeing if we can work in log space
def chain_ln_sampler(func, low, high, start, steps):
chain = [start]
for i in range(0, steps+1):
current = chain[i]
prop = np.random.normal(loc=chain[-1], scale=1)
if np.log(func(prop)) - np.log(func(current)) > np.log(np.random.uniform(0,1)) and low <= prop <= high:
chain.append(prop)
else:
chain.append(current)
return np.array(chain)And then testing it out:
samples = chain_ln_sampler(g, 0, 10, 3, 100_000)
plt.hist(samples, bins=500, density=True)
plt.show()Problems from Hogg & Foreman
Problem 2
from scipy import stats
gauss = stats.norm(2,np.sqrt(2))
samples = chain_sampler(gauss.pdf, -5, 10, 0, 10000)
plt.hist(samples, bins=75, density=True, alpha=0.5)
xs=np.linspace(-5,10,1000)
plt.plot(xs, gauss.pdf(xs), lw=2)
plt.show()Problem 3
def pdf(x):
if 3 < x < 7:
return 0.25
else:
return 0.0
samples = chain_sampler(pdf, 0, 10, 4, 10000)
plt.hist(samples, bins=50, density=True, alpha=0.5)
xs=np.linspace(0,10,1000)
plt.plot(xs, np.vectorize(pdf)(xs), lw=2)
plt.show()Problem 4
Part (a)
from scipy import stats
gauss2d = stats.multivariate_normal([0,0], [[2, 1.2],[1.2,2]])
def chain_sampler2d(func, low, high, start, steps):
chain = [start]
for i in range(0, steps+1):
current = chain[i]
prop = np.random.multivariate_normal(mean=current, cov=np.identity(current.size))
#print(current, prop, func(current)/func(prop))
if func(prop)/func(current) > np.random.uniform(0,1):
chain.append(prop)
else:
chain.append(current)
#chain.append(prop)
return np.array(chain)
samples = chain_sampler2d(gauss2d.pdf, -10, 10, np.array([1,1]), 10000)
plt.figure(figsize=(8,8))
plt.subplot(221)
plt.hist(samples[:,0], bins=50, density=True, alpha=0.5)
plt.xlabel('X')
plt.subplot(224)
plt.hist(samples[:,1], bins=50, density=True, alpha=0.5)
plt.xlabel('Y')
plt.subplot(223)
plt.hist2d(samples[:,0], samples[:,1], bins=50, density=True, cmap='Blues')
plt.ylabel('Y')
plt.xlabel('X')
#xs=np.linspace(0,10,1000)
#plt.plot(xs, (xs), lw=2)
plt.show()Part (b)
def pdf(pt):
x,y = pt
if (3 < x < 7) and (1 < y < 9):
return 1/24
else:
return 0.0
samples = chain_sampler2d(pdf, 0, 10, np.array([4,2]), 10000)
plt.figure(figsize=(8,8))
plt.subplot(221)
plt.hist(samples[:,0], bins=50, density=True, alpha=0.5)
plt.xlabel('X')
plt.xlim(0,10)
plt.subplot(224)
plt.hist(samples[:,1], bins=50, density=True, alpha=0.5)
plt.xlabel('Y')
plt.xlim(0,10)
plt.subplot(223)
plt.hist2d(samples[:,0], samples[:,1], bins=50, density=True, cmap='Blues')
plt.ylabel('Y')
plt.xlabel('X')
plt.xlim([0,10])
plt.ylim([0,10])
#xs=np.linspace(0,10,1000)
#plt.plot(xs, (xs), lw=2)
plt.show()Model Fitting
Generating some data:
m_true = 4
b_true = -1
xs = np.linspace(0,10,100)
ys = m_true * xs + b_true + xs*np.random.normal(0,4,size=xs.size)Checking how it looks:
Getting normal polyfit information
(m, b), cov = np.polyfit(xs, ys, deg=1, cov=True)
print(m,b)
print(cov)3.930604342944226 -0.1384781645103587
[[ 0.55799405 -2.78997024]
[-2.78997024 18.69374002]]
Attempting to set up the necessary pieces of our model:
def ln_prior(params):
m,b = params
if 0 < m < 20 and -20 < b < 20:
return 0.0
return -np.inf
def ln_likelihood(X, Y, params):
m,b = params
y_model = m * X + b
return -0.5 * np.sum((Y-y_model)**2)
def ln_pdf(params, X, Y):
x = ln_prior(params)
if x == -np.inf:
return -np.inf
return x + ln_likelihood(X,Y,params)
def chain_sampler(lnfunc, start, data, steps):
X,Y = data
chain = [start]
for i in range(0, steps+1):
current = chain[i]
prop = np.random.multivariate_normal(mean=current, cov=0.5*np.identity(current.size))
if lnfunc(prop,X,Y) - lnfunc(current,X,Y) > np.log(np.random.uniform(0,1)):
chain.append(prop)
else:
chain.append(current)
return np.array(chain)And then running this?
samples = chain_sampler(ln_pdf, np.array([1,1]), [xs,ys], 50000)Removing burn-in:
samples = samples[200:]Can I compare?
m_gauss = stats.norm(m, np.sqrt(cov[0,0]))
ms = np.linspace(-5,10,500)
plt.hist(samples[(samples[:,0] > -5) & (samples[:,0] < 10),0], bins=20, density=True, alpha=0.5)
plt.plot(ms, m_gauss.pdf(ms), lw=2)
plt.xlim(3,5)
plt.show()Random sampling of 100 lines:
plt.plot(xs, ys, '.')
plt.plot(xs, m*xs + b, '--', lw=2, color='C3')
for i in range(100):
mn,bn = samples[np.random.randint(0, samples.shape[0])]
plt.plot(xs, mn*xs + bn, '-', color='gray', alpha=0.1, lw=2)
plt.show()Using Emcee
Adding the package
import emceeTrying with my same ln_pdf function from earlier:
starting_poss = np.array([4,-1]) + 1E-4 * np.random.randn(32,2)
nwalkers, ndim = starting_poss.shape
sampler = emcee.EnsembleSampler(
nwalkers, ndim, ln_pdf, args=(xs, ys)
)
sampler.run_mcmc(starting_poss, 5000)
flat_samples = sampler.get_chain(discard=100, thin=15, flat=True)
plt.plot(sampler.get_chain()[:,:,0], 'k', alpha=0.3)
plt.show()
plt.plot(xs, ys, '.')
plt.plot(xs, m*xs + b, '--', lw=2, color='C3')
for i in range(100):
mn,bn = flat_samples[np.random.randint(len(flat_samples))]
plt.plot(xs, mn*xs + bn, '-', color='gray', alpha=0.5)
plt.xlim(0,10)
plt.ylim(-10,50)
plt.show()