Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated CR to allow any number of disconnected regions and any number of percentiles #109

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 16 additions & 6 deletions MCcubed/mc/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ def mcmc(data=None, uncert=None, func=None, indparams=None,
plots=None, ioff=None, showbp=None,
savefile=None, savemodel=None, resume=None,
rms=None, log=None, pnames=None, texnames=None,
full_output=None, chireturn=None,
cfile=None, parname=None):
full_output=None, chireturn=None, percentile=None,
cfile=None, parname=None):
"""
MCMC driver routine to execute a Markov-chain Monte Carlo run.

Expand Down Expand Up @@ -143,8 +143,6 @@ def mcmc(data=None, uncert=None, func=None, indparams=None,
If True, calculate the RMS of data-bestmodel.
log: String or Log object.
Filename to store screen outputs.
cfile: String
Configuration file name.
pnames: 1D string ndarray
List of parameter names (including fixed and shared parameters)
to display on output screen and figures. See also texnames.
Expand All @@ -153,11 +151,16 @@ def mcmc(data=None, uncert=None, func=None, indparams=None,
texnames: 1D string iterable
Parameter names for figures, which may use latex syntax.
If not defined, default to pnames.
parname: 1D string ndarray
Deprecated. Use pnames instead.
full_output: Bool
If True, return the full posterior sample, including the burned-in
iterations.
chireturn: FINDME
percentile: list, floats
Percentile(s) to report credible region(s).
cfile: String
Configuration file name.
parname: 1D string ndarray
Deprecated. Use pnames instead.

Returns
-------
Expand Down Expand Up @@ -289,6 +292,10 @@ def mcmc(data=None, uncert=None, func=None, indparams=None,
args["indparams"] = mu.isfile(args["indparams"], 'indparams', log, 'bin',
unpack=False)

# Make sure percentile is of the proper data type
if type(args["percentile"]) == float:
args["percentile"] = [percentile]

# Call MCMC:
outputs = mc.mcmc(**args)

Expand Down Expand Up @@ -433,6 +440,9 @@ def parse():
help="If True, return chi-squared, red. chi-squared,"
"the chi-squared rescaling factor, and the BIC"
" [default: %(default)s]")
group.add_argument("--percentile", dest="percentile", action="store",
type=mu.parray, default=[0.6827, 0.9545],
help="Percentile(s) to report credible region(s).")
group.add_argument("--parname", dest="parname", action="store",
type=mu.parray, default=None,
help="Deprecated, see pnames.")
Expand Down
77 changes: 48 additions & 29 deletions MCcubed/mc/mcmc.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def mcmc(data, uncert=None, func=None, indparams=[],
plots=False, ioff=False, showbp=True,
savefile=None, savemodel=None, resume=False,
rms=False, log=None, pnames=None, texnames=None,
full_output=False, chireturn=False,
full_output=False, chireturn=False, percentile=[0.6827, 0.9545],
parname=None):
"""
This beautiful piece of code runs a Markov-chain Monte Carlo algorithm.
Expand Down Expand Up @@ -155,19 +155,22 @@ def mcmc(data, uncert=None, func=None, indparams=[],
iterations.
chireturn: Bool
If True, include chi-squared statistics in the return.
percentile: list, floats
Percentile(s) to report credible region(s).
parname: 1D string ndarray
Deprecated, use pnames.

Returns
-------
bestp: 1D ndarray
Array of the best-fitting parameters (including fixed and shared).
CRlo: 1D ndarray
The lower boundary of the marginal 68%-highest posterior density
(the credible region) for each parameter, with respect to bestp.
CRhi: 1D ndarray
The upper boundary of the marginal 68%-highest posterior density
(the credible region) for each parameter, with respect to bestp.
creg: list of strings
The posterior credible regions corresponding to the percentiles specified
by `percentile`. Format is e.g., '(lo_1, hi_1) U (lo_2, hi_2)' for a CR
composed of two disconnected regions.
creg[i] gives the i-th parameter's CRs.
creg[i][j] gives the i-th parameter's j-th CR, corresponding to the j-th
`percentile` value.
stdp: 1D ndarray
Array of the best-fitting parameter uncertainties, calculated as the
standard deviation of the marginalized, thinned, burned-in posterior.
Expand Down Expand Up @@ -612,19 +615,18 @@ def mcmc(data, uncert=None, func=None, indparams=[],
format(numaccept.value*100.0/nsample), indent=2)

# Compute the credible region for each parameter:
CRlo = np.zeros(nparams)
CRhi = np.zeros(nparams)
CR = [] # Holds boundaries of all regions making up the CRs
creg = [] # Holds a string representation, '[lo_1, hi_1] U [lo_2, hi_2] U ...'
CRlo = []
CRhi = []
pdf = []
xpdf = []
for i in range(nfree):
PDF, Xpdf, HPDmin = mu.credregion(posterior[:,i])
pdf.append(PDF)
PDF, Xpdf, crlo, crhi = mu.credregion(posterior[:,i], percentile)
pdf .append(PDF)
xpdf.append(Xpdf)
CRlo[ifree[i]] = np.amin(Xpdf[PDF>HPDmin])
CRhi[ifree[i]] = np.amax(Xpdf[PDF>HPDmin])
# CR relative to the best-fitting value:
CRlo[ifree] -= bestp[ifree]
CRhi[ifree] -= bestp[ifree]
CRlo.append(crlo)
CRhi.append(crhi)

# Get the mean and standard deviation from the posterior:
meanp = np.zeros(nparams, np.double) # Parameters mean
Expand All @@ -635,27 +637,44 @@ def mcmc(data, uncert=None, func=None, indparams=[],
bestp[s] = bestp[-int(stepsize[s])-1]
meanp[s] = meanp[-int(stepsize[s])-1]
stdp [s] = stdp [-int(stepsize[s])-1]
CRlo [s] = CRlo [-int(stepsize[s])-1]
CRhi [s] = CRhi [-int(stepsize[s])-1]

log.msg("\nParam name Best fit Lo HPD CR Hi HPD CR Mean Std dev S/N"
"\n----------- ----------------------------------- ---------------------- ---------", width=80)
log.msg("\nParam name Best fit Mean Std dev S/N"
"\n----------- ------------ ---------------------- ---------",
width=80)
for i in range(nparams):
snr = "{:.1f}". format(np.abs(bestp[i])/stdp[i])
mean = "{: 11.4e}".format(meanp[i])
lo = "{: 11.4e}".format(CRlo[i])
hi = "{: 11.4e}".format(CRhi[i])
if i in ifree: # Free-fitting value
pass
icr = np.where(ifree == i)[0][0]
# Format each CR as '(lo1, hi1) U (lo2, hi2) U ... U (lon, hin)'
creg.append([' U '.join(['({:10.4e}, {:10.4e})'.format(CRlo[icr][j][k],
CRhi[icr][j][k])
for k in range(len(CRlo[icr][j]))])
for j in range(len(CRlo[icr]))])
elif i in ishare: # Shared value
snr = "[share{:02d}]".format(-int(stepsize[i]))
creg.append('Shared') # No CRs for shared values
else: # Fixed value
snr = "[fixed]"
mean = "{: 11.4e}".format(bestp[i])
log.msg("{:<11s} {:11.4e} {:>11s} {:>11s} {:>11s} {:10.4e} {:>9s}".
format(pnames[i][0:11], bestp[i], lo, hi, mean, stdp[i], snr),
creg.append('Fixed') # No CRs for fixed values
# Print all info except CRs
log.msg("{:<11s} {:11.4e} {:>11s} {:10.4e} {:>9s}".
format(pnames[i][0:11], bestp[i], mean, stdp[i], snr),
width=160)

# Print CRs
log.msg("\nParam name Credible Region"
"\n----------- "
"------------------------------------------------------------------",
width=80)
for i in ifree:
for p in range(len(percentile)):
if p == 0:
log.msg("{:<11s} {:>6s}%: {:<56s}".
format(pnames[i][0:11], str(100*percentile[p]), creg[i][p]))
else:
log.msg("{:<11s} {:>6s}%: {:<56s}".
format("", str(100*percentile[p]), creg[i][p]))
if leastsq and bestchisq.value-fitchisq < -3e-8:
np.set_printoptions(precision=8)
log.warning("MCMC found a better fit than the minimizer:\n"
Expand Down Expand Up @@ -713,11 +732,11 @@ def mcmc(data, uncert=None, func=None, indparams=[],
savefile=fname+"_pairwise.png")
# Histograms:
mp.histogram(posterior, pnames=texnames[ifree], bestp=bestfreepars,
savefile=fname+"_posterior.png",
percentile=0.683, pdf=pdf, xpdf=xpdf)
savefile=fname+"_posterior.png", percentile=percentile,
pdf=pdf, xpdf=xpdf)
# RMS vs bin size:
if rms:
mp.RMS(bs, RMS, stderr, RMSlo, RMShi, binstep=len(bs)//500+1,
mp.RMS(bs, RMS, stderr, RMSlo, RMShi, binstep=len(bs)//500 + 1,
savefile=fname+"_RMS.png")
# Sort of guessing that indparams[0] is the X array for data as in y=y(x):
if (indparams != [] and
Expand Down
54 changes: 35 additions & 19 deletions MCcubed/plots/mcplots.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,9 +295,9 @@ def pairwise(posterior, pnames=None, thinning=1, fignum=200,
return axes, cb


def histogram(posterior, pnames=None, thinning=1, fignum=300,
savefile=None, bestp=None, percentile=None, pdf=None,
xpdf=None, ranges=None, axes=None, lw=2.0, fs=11):
def histogram(posterior, pnames=None, thinning=1, fignum=300,
savefile=None, bestp=None, percentile=None, pdf=None,
xpdf=None, ranges=None, axes=None, lw=2.0, fs=11):
"""
Plot parameter marginal posterior distributions

Expand All @@ -317,10 +317,11 @@ def histogram(posterior, pnames=None, thinning=1, fignum=300,
bestp: 1D float ndarray
If not None, plot the best-fitting values for each parameter
given by bestp.
percentile: Float
percentile: list of floats
If not None, plot the percentile- highest posterior density region
of the distribution. Note that this should actually be the
fractional part, i.e. set percentile=0.68 for a 68% HPD.
fractional part, i.e. set percentile=0.68 for a 68% HPD, or
percentile=[0.68, 0.95] for both 68% and 95% HPD.
pdf: 1D float ndarray or list of ndarrays
A smoothed PDF of the distribution for each parameter.
xpdf: 1D float ndarray or list of ndarrays
Expand Down Expand Up @@ -409,21 +410,36 @@ def histogram(posterior, pnames=None, thinning=1, fignum=300,
ax.set_xlabel(pnames[i], size=fs)
vals, bins, h = ax.hist(posterior[0::thinning, i], bins=25,
range=ranges[i], normed=False, zorder=0, **hkw)
# Plot HPD region:
if percentile is not None:
PDF, Xpdf, HPDmin = mu.credregion(posterior[:,i], percentile,
pdf[i], xpdf[i])
# Plot HPD region(s):
if percentile is not None:
PDF = []
xpdf = []
CRlo = []
CRhi = []
for p in range(len(percentile)):
Pdf, Xpdf, crlo, crhi = mu.credregion(posterior[:,i], percentile,
pdf[i], xpdf[i])
PDF .append(Pdf)
xpdf.append(Xpdf)
CRlo.append(crlo)
CRhi.append(crhi)
# Setup to interpolate xpdf into the histogram
vals = np.r_[0, vals, 0]
bins = np.r_[bins[0] - (bins[1]-bins[0]), bins]
# interpolate xpdf into the histogram:
f = si.interp1d(bins+0.5*(bins[1]-bins[0]), vals, kind='nearest')
# Plot the HPD region as shaded areas:
if ranges[i] is not None:
xran = np.argwhere((Xpdf>ranges[i][0]) & (Xpdf<ranges[i][1]))
Xpdf = Xpdf[np.amin(xran):np.amax(xran)]
PDF = PDF [np.amin(xran):np.amax(xran)]
ax.fill_between(Xpdf, 0, f(Xpdf), where=PDF>=HPDmin,
facecolor='0.75', edgecolor='none', interpolate=False, zorder=-2)
bins = np.r_[ bins[0] - (bins[1]-bins[0]), bins]
f = si.interp1d(bins+0.5 * (bins[1]-bins[0]), vals, kind='nearest')
# Plot the credible region(s) as shaded areas:
# Note reverse ordering is to allow overplotting of lower percentiles
for k in range(len(CRlo[i])-1, 0-1, -1):
if ranges[i] is not None:
xran = np.argwhere((xpdf>ranges[i][0]) & (xpdf<ranges[i][1]))
xpdf[i] = xpdf[i][np.amin(xran):np.amax(xran)]
PDF [i] = PDF [i][np.amin(xran):np.amax(xran)]
for r in range(len(CRlo[i][k])):
ax.fill_between(xpdf[i], 0, f(xpdf[i]),
where=(xpdf[i]>=CRlo[i][k][r]) * \
(xpdf[i]<=CRhi[i][k][r]),
facecolor=str(0.5 + 0.25*k/(len(CRlo[i])-1)),
edgecolor='none', interpolate=False, zorder=-2)
if bestp is not None:
ax.axvline(bestp[i], dashes=(7,4), lw=1.0, **bkw)
maxylim = np.amax((maxylim, ax.get_ylim()[1]))
Expand Down
78 changes: 58 additions & 20 deletions MCcubed/utils/mcutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,9 @@ def isfile(input, iname, log, dtype, unpack=True, notnone=False):
return load(ifile)


def credregion(posterior=None, percentile=0.6827, pdf=None, xpdf=None):
def credregion(posterior= None, percentile= [0.6827, 0.9545],
pdf = None, xpdf = None,
lims = (None,None), numpts = 100):
"""
Compute a smoothed posterior density distribution and the minimum
density for a given percentile of the highest posterior density.
Expand All @@ -264,22 +266,31 @@ def credregion(posterior=None, percentile=0.6827, pdf=None, xpdf=None):
----------
posterior: 1D float ndarray
A posterior distribution.
percentile: Float
percentile: 1D float ndarray, list, or float.
The percentile (actually the fraction) of the credible region.
A value in the range: (0, 1).
pdf: 1D float ndarray
A smoothed-interpolated PDF of the posterior distribution.
xpdf: 1D float ndarray
The X location of the pdf values.
lims: tuple, floats
Minimum and maximum allowed values for posterior. Should only be used if
there are physically-imposed limits.
numpts: int.
Number of points to use when calculating the PDF.

Returns
-------
pdf: 1D float ndarray
A smoothed-interpolated PDF of the posterior distribution.
xpdf: 1D float ndarray
The X location of the pdf values.
HPDmin: Float
The minimum density in the percentile-HPD region.
regions: list of 2D float ndarrays
The values of the credible regions specified by `percentile`.
regions[0] corresponds to percentile[0], etc.
regions[0][0] gives the start and stop values of the first region of the CR
regions[0][1] gives the second CR start/stop values, if the CR is composed
of disconnected regions

Example
-------
Expand All @@ -294,34 +305,61 @@ def credregion(posterior=None, percentile=0.6827, pdf=None, xpdf=None):
>>> pdf, xpdf, HPDmin = credregion(pdf=pdf, xpdf=xpdf, percentile=0.9545)
>>> print(np.amin(xpdf[pdf>HPDmin]), np.amax(xpdf[pdf>HPDmin]))
"""
# Make sure `percentile` is a list or array
if type(percentile) == float:
percentile = np.array([percentile])
if pdf is None and xpdf is None:
# Thin if posterior has too many samples (> 120k):
thinning = np.amax([1, int(np.size(posterior)/120000)])
# Compute the posterior's PDF:
kernel = stats.gaussian_kde(posterior[::thinning])
# Remove outliers:
mean = np.mean(posterior)
std = np.std(posterior)
k = 6
lo = np.amax([mean-k*std, np.amin(posterior)])
hi = np.amin([mean+k*std, np.amax(posterior)])
kernel = stats.gaussian_kde(posterior)
# Use a Gaussian kernel density estimate to trace the PDF:
x = np.linspace(lo, hi, 100)
# Interpolate-resample over finer grid (because kernel.evaluate
# is expensive):
if lims[0] is not None:
lo = min(np.amin(posterior), lims[0])
else:
lo = np.amin(posterior)
if lims[1] is not None:
hi = max(np.amax(posterior), lims[1])
else:
hi = np.amax(posterior)
x = np.linspace(lo, hi, numpts)
f = si.interp1d(x, kernel.evaluate(x))
xpdf = np.linspace(lo, hi, 3000)
xpdf = np.linspace(lo, hi, 100*numpts)
pdf = f(xpdf)

# Sort the PDF in descending order:
ip = np.argsort(pdf)[::-1]
# Sorted CDF:
cdf = np.cumsum(pdf[ip])
# Indices of the highest posterior density:
iHPD = np.where(cdf >= percentile*cdf[-1])[0][0]
# Minimum density in the HPD region:
HPDmin = np.amin(pdf[ip][0:iHPD])
return pdf, xpdf, HPDmin

# List to hold boundaries of CRs
# List is used because a given CR may be multiple disconnected regions
CRlo = []
CRhi = []
# Find boundary for each specified percentile
for i in range(len(percentile)):
# Indices of the highest posterior density:
iHPD = np.where(cdf >= percentile[i]*cdf[-1])[0][0]
# Minimum density in the HPD region:
HPDmin = np.amin(pdf[ip][0:iHPD])
# Find the contiguous areas of the PDF greater than or equal to HPDmin
HPDbool = pdf >= HPDmin
idiff = np.diff(HPDbool) # True where HPDbool changes T to F or F to T
iregion, = idiff.nonzero() # Indexes of Trues. Note , because returns tuple
# Check boundaries
if HPDbool[0]:
iregion = np.insert(iregion, 0, -1) # This -1 is changed to 0 below when
if HPDbool[-1]: # correcting start index for regions
iregion = np.append(iregion, len(HPDbool)-1)
# Reshape into 2 columns of start/end indices
iregion.shape = (-1, 2)
# Add 1 to start of each region due to np.diff() functionality
iregion[:,0] += 1
# Store the min and max of each (possibly disconnected) region
CRlo.append(xpdf[iregion[:,0]])
CRhi.append(xpdf[iregion[:,1]])

return pdf, xpdf, CRlo, CRhi


def default_parnames(npars):
Expand Down