diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/graham_nz_err.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/graham_nz_err.pdf index b2594bf6..d748b0e3 100644 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/graham_nz_err.pdf and b/docs/desc-0000-qp-photo-z_approximation/figures/graham_nz_err.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/graham_nz_kld.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/graham_nz_kld.pdf index 1450d10a..002985b3 100644 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/graham_nz_kld.pdf and b/docs/desc-0000-qp-photo-z_approximation/figures/graham_nz_kld.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/graham_pz_err.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/graham_pz_err.pdf index 2b68c138..b2e559bf 100644 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/graham_pz_err.pdf and b/docs/desc-0000-qp-photo-z_approximation/figures/graham_pz_err.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/graham_pz_kld.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/graham_pz_kld.pdf index ed91efce..995121e1 100644 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/graham_pz_kld.pdf and b/docs/desc-0000-qp-photo-z_approximation/figures/graham_pz_kld.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_nz_err.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_nz_err.pdf index 516df86f..4efb8c91 100644 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_nz_err.pdf and b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_nz_err.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_nz_kld.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_nz_kld.pdf index 17f42967..a48c3efe 100644 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_nz_kld.pdf and b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_nz_kld.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_pz_err.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_pz_err.pdf index dcd9345a..d1f89b9e 100644 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_pz_err.pdf and b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_pz_err.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_pz_kld.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_pz_kld.pdf index 3d4f0cd1..760026d9 100644 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_pz_kld.pdf and b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_pz_kld.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/main.bib b/docs/desc-0000-qp-photo-z_approximation/main.bib index 1a7acfa1..e8a559a5 100644 --- a/docs/desc-0000-qp-photo-z_approximation/main.bib +++ b/docs/desc-0000-qp-photo-z_approximation/main.bib @@ -407,7 +407,6 @@ @article{radovich_searching_2017 @article{de_jong_third_2017, title = {The third data release of the {Kilo}-{Degree} {Survey} and associated data products}, volume = {604}, - url = {https://ui.adsabs.harvard.edu/#abs/2017A&A...604A.134D/abstract}, doi = {10.1051/0004-6361/201730747}, journal = {Astronomy and Astrophysics}, author = {de Jong, Jelte T. A. and Verdoes Kleijn, Gijs A. and Erben, Thomas and Hildebrandt, Hendrik and Kuijken, Konrad and Sikkema, Gert and Brescia, Massimo and Bilicki, Maciej and Napolitano, Nicola R.
and Amaro, Valeria and Begeman, Kor G. and Boxhoorn, Danny R. and Buddelmeijer, Hugo and Cavuoti, Stefano and Getman, Fedor and Grado, Aniello and Helmich, Ewout and Huang, Zhuoyi and Irisarri, Nancy and La Barbera, Francesco and Longo, Giuseppe and McFarland, John P. and Nakajima, Reiko and Paolillo, Maurizio and Puddu, Emanuella and Radovich, Mario and Rifatto, Agatino and Tortora, Crescenzo and Valentijn, Edwin A. and Vellucci, Civita and Vriend, Willem-Jan and Amon, Alexandra and Blake, Chris and Choi, Ami and Conti, Ian Fenech and Gwyn, Stephen D. J. and Herbonnet, Ricardo and Heymans, Catherine and Hoekstra, Henk and Klaes, Dominik and Merten, Julian and Miller, Lance and Schneider, Peter and Viola, Massimo}, diff --git a/docs/desc-0000-qp-photo-z_approximation/main.tex b/docs/desc-0000-qp-photo-z_approximation/main.tex index 5d45c258..cc3b7bea 100644 --- a/docs/desc-0000-qp-photo-z_approximation/main.tex +++ b/docs/desc-0000-qp-photo-z_approximation/main.tex @@ -465,7 +465,7 @@ \subsection{Comparison Metrics} of a PDF. We note that $M_{0}[P]=1$ for all properly normalized probability distributions, $M_{1}[P]=\bar{z}$ is the \textit{mean}, $M_{2}[P]$ is the -\textit{variance}, and $M_{3}[P]$ is the \textit{kurtosis}. +\textit{variance}, and $M_{3}[P]$ is the \textit{skewness}. Though the first few moments are not in general sufficient to characterize a highly structured probability distribution, they are included in this analysis because they can prove useful in setting ballpark estimates of the influence of @@ -705,7 +705,7 @@ \subsection{Individual \pz s} \includegraphics[width=\columnwidth]{graham_pz_kld.pdf} \includegraphics[width=\columnwidth]{schmidt_pz_kld.pdf} \caption{ - The means of the mean ($\pentagon$), variance ($\bigstar$), and kurtosis + The means of the mean ($\pentagon$), variance ($\bigstar$), and skewness ($\APLstar$) of the log-KLD distributions for each dataset as a function of the number $N_{f}$ of stored parameters for the quantile (dashed purple line), samples (dash-dotted green line), and histogram (dotted orange line) formats @@ -762,7 +762,7 @@ \subsection{Individual \pz s} \includegraphics[width=\columnwidth]{schmidt_pz_err.pdf} \caption{ The median $\log_{10}$-percent errors on the mean ($\pentagon$), variance -($\bigstar$), and kurtosis ($\APLstar$) of the \pz s for each dataset as a +($\bigstar$), and skewness ($\APLstar$) of the \pz s for each dataset as a function of the number $N_{f}$ of stored parameters per \pz\ for the quantile (dashed purple line), samples (dash-dotted green line), and histogram (dotted orange line) formats with interquartile range error bars based on 10 @@ -952,7 +952,7 @@ \subsection{Stacked $\hat{n}(z)$ estimator} \includegraphics[width=\columnwidth]{schmidt_nz_err.pdf} \caption{ The percent error on the mean ($\pentagon$), variance ($\bigstar$), and -kurtosis ($\APLstar$) of the stacked estimator $hat{n}(z)$ of the redshift +skewness ($\APLstar$) of the stacked estimator $\hat{n}(z)$ of the redshift distribution for each dataset as a function of the number $N_{f}$ of stored parameters for the quantile (dashed purple line), samples (dash-dotted green line), and histogram (dotted orange line) formats with interquartile range @@ -1194,6 +1194,7 @@ \subsection*{Acknowledgments} \input{acknowledgments} + \bibliography{lsstdesc,main} \end{document} diff --git a/docs/notebooks/demo.ipynb b/docs/notebooks/demo.ipynb index 049b685d..0009129c 100644 --- a/docs/notebooks/demo.ipynb +++ b/docs/notebooks/demo.ipynb @@ -18,9 +18,7 @@ {
"cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -48,13 +46,12 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "# ! cat qp/pdf.py\n", - "P = qp.PDF()" + "P = qp.PDF(vb=True)" ] }, { @@ -69,15 +66,13 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "dist = sps.norm(loc=0, scale=1)\n", "print(type(dist))\n", "demo_limits = (-5., 5.)\n", - "P = qp.PDF(truth=dist, limits=demo_limits)\n", + "P = qp.PDF(funcform=dist, limits=demo_limits)\n", "P.plot()" ] }, @@ -93,14 +88,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", "\n", - "samples = P.sample(1000, using='truth', vb=False)\n", + "samples = P.sample(1000, using='mix_mod', vb=False)\n", "S = qp.PDF(samples=samples, limits=demo_limits)\n", "S.plot()" ] @@ -118,7 +111,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -140,9 +132,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "histogram = P.histogramize(N=10, binrange=demo_limits)\n", @@ -165,14 +155,21 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "print P.approximate(np.array([0.314]), using='quantiles')" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "P.mix_mod" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -183,9 +180,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "print P.integrate([0., 1.], using='quantiles')" @@ -202,7 +197,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -221,9 +215,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "print P.scheme\n", @@ -243,13 +235,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "grid = np.linspace(-3., 3., 20)\n", - "gridded = P.evaluate(grid, using='truth', vb=False)\n", + "gridded = P.evaluate(grid, using='mix_mod', vb=False)\n", "\n", "G = qp.PDF(gridded=gridded, limits=demo_limits)\n", "G.sample(100, vb=False)\n", @@ -267,7 +257,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -278,9 +267,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "print G.last,'approximation, ', G.scheme, 'interpolation'" @@ -289,9 +276,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# 10-point grid for a coarse approximation:\n", @@ -312,12 +297,10 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ - "MM = qp.PDF(truth=dist, limits=demo_limits)\n", + "MM = 
qp.PDF(funcform=dist, limits=demo_limits)\n", "MM.sample(1000, vb=False)\n", "MM.mix_mod_fit(n_components=5)\n", "MM.plot()" @@ -345,7 +328,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], @@ -353,38 +335,43 @@ "P.plot()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Quantitative Comparisons" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "symm_lims = np.array([-1., 1.])\n", + "all_lims = [symm_lims, 2.*symm_lims, 3.*symm_lims]" + ] + }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ - "### Quantitative Comparisons\n", - "\n", "Next, let's compare the different parametrizations to the truth using the Kullback-Leibler Divergence (KLD). The KLD is a measure of how close two probability distributions are to one another -- a smaller value indicates closer agreement. It is measured in units of bits of information, the information lost in going from the second distribution to the first distribution. The KLD calculator here takes in a shared grid upon which to evaluate the true distribution and the interpolated approximation of that distribution and returns the KLD of the approximation relative to the truth, which is not in general the same as the KLD of the truth relative to the approximation. Below, we'll calculate the KLD of the approximation relative to the truth over different ranges, showing that it increases as it includes areas where the true distribution and interpolated distributions diverge." ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false, - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ - "qD1 = qp.utils.calculate_kl_divergence(P, Q, limits=(-1.,1.), vb=True)\n", - "qD2 = qp.utils.calculate_kl_divergence(P, Q, limits=(-2.,2.), vb=True)\n", - "qD3 = qp.utils.calculate_kl_divergence(P, Q, limits=(-3.,3.), vb=True)\n", - "print 'Quantile approximation: KLD over 1,2,3 sigma ranges = ', qD1, qD2, qD3\n", - "hD1 = qp.utils.calculate_kl_divergence(P, H, limits=(-1.,1.), vb=True)\n", - "hD2 = qp.utils.calculate_kl_divergence(P, H, limits=(-2.,2.), vb=True)\n", - "hD3 = qp.utils.calculate_kl_divergence(P, H, limits=(-3.,3.), vb=True)\n", - "print 'Histogram approximation: KLD over 1,2,3 sigma ranges = ', hD1, hD2, hD3\n", - "sD1 = qp.utils.calculate_kl_divergence(P, S, limits=(-1.,1.), vb=True)\n", - "sD2 = qp.utils.calculate_kl_divergence(P, S, limits=(-2.,2.), vb=True)\n", - "sD3 = qp.utils.calculate_kl_divergence(P, S, limits=(-3.,3.), vb=True)\n", - "print 'Sampled approximation: KLD over 1,2,3 sigma ranges = ', sD1, sD2, sD3" + "for PDF in [Q, H, S]:\n", + " D = []\n", + " for lims in all_lims:\n", + " D.append(qp.metrics.calculate_kld(P, PDF, limits=lims, vb=False))\n", + " print(PDF.truth+' approximation: KLD over 1, 2, 3, sigma ranges = '+str(D))" ] }, { @@ -407,26 +394,15 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], "source": [ - "qRMSE1 = qp.utils.calculate_rmse(P, Q, limits=(-1.,1.), vb=False)\n", - "qRMSE2 = qp.utils.calculate_rmse(P, Q, limits=(-2.,2.), vb=False)\n", - "qRMSE3 = qp.utils.calculate_rmse(P, Q, limits=(-3.,3.), vb=False)\n", - "\n", - "hRMSE1 = qp.utils.calculate_rmse(P, H, limits=(-1.,1.), vb=False)\n", - "hRMSE2 = qp.utils.calculate_rmse(P, H, limits=(-2.,2.), vb=False)\n", - "hRMSE3 = qp.utils.calculate_rmse(P, H, limits=(-3.,3.), 
vb=False)\n", - "\n", - "sRMSE1 = qp.utils.calculate_rmse(P, S, limits=(-1.,1.), vb=False)\n", - "sRMSE2 = qp.utils.calculate_rmse(P, S, limits=(-2.,2.), vb=False)\n", - "sRMSE3 = qp.utils.calculate_rmse(P, S, limits=(-3.,3.), vb=False)\n", - "\n", - "print 'Quantile approximation: RMSE over 1,2,3 sigma ranges = ', qRMSE1, qRMSE2, qRMSE3\n", - "print 'Histogram approximation: RMSE over 1,2,3 sigma ranges = ', hRMSE1, hRMSE2, hRMSE3\n", - "print 'Sampled approximation: RMSE over 1,2,3 sigma ranges = ', sRMSE1, sRMSE2, sRMSE3" + "for PDF in [Q, H, S]:\n", + " D = []\n", + " for lims in all_lims:\n", + " D.append(qp.metrics.calculate_rmse(P, PDF, limits=lims, vb=False))\n", + " print(PDF.truth+' approximation: RMSE over 1, 2, 3, sigma ranges = '+str(D))" ] }, { @@ -446,9 +422,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "pdfs = [P, Q, H, S]\n", @@ -457,7 +431,7 @@ "for pdf in pdfs:\n", " moments = []\n", " for n in which_moments:\n", - " moments.append(qp.utils.calculate_moment(pdf, n))\n", + " moments.append(qp.metrics.calculate_moment(pdf, n))\n", " all_moments.append(moments)\n", " \n", "print('moments: '+str(which_moments))\n", @@ -493,9 +467,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "component_1 = {}\n", @@ -509,7 +481,7 @@ "composite_lims = (-5., 5.)\n", "\n", "C_dist = qp.composite(dist_info)\n", - "C = qp.PDF(truth=C_dist, limits=composite_lims)\n", + "C = qp.PDF(funcform=C_dist, limits=composite_lims)\n", "C.plot()" ] }, @@ -523,12 +495,10 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ - "Cq = qp.PDF(truth=C_dist, limits = composite_lims)\n", + "Cq = qp.PDF(funcform=C_dist, limits = composite_lims)\n", "Cq.quantize(N=20, limits=composite_lims, vb=False)\n", "Cq.plot()" ] @@ -543,12 +513,10 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ - "Ch = qp.PDF(truth=C_dist, limits = composite_lims)\n", + "Ch = qp.PDF(funcform=C_dist, limits = composite_lims)\n", "Ch.histogramize(N=20, binrange=composite_lims, vb=True)\n", "Ch.plot()" ] @@ -557,72 +525,163 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Finally, samples from this distribution may also be taken, and a PDF may be reconstructed from them. Note: this uses `scipy.stats.gaussian_kde`, which determines its bandwidth/kernel size using [Scott's Rule](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gaussian_kde.html), which clearly leaves something to be desired! " + "Finally, samples from this distribution may also be taken, and a PDF may be reconstructed from them. Note: this uses [`scipy.stats.gaussian_kde`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gaussian_kde.html), which determines its bandwidth/kernel size using Scott's Rule, Silverman's Rule, a fixed bandwidth, or a callable function that returns a bandwidth." 
] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ - "Cs = qp.PDF(truth=C_dist, limits = composite_lims)\n", - "Cs.sample(N=20, using='truth', vb=False)\n", + "Cs = qp.PDF(funcform=C_dist, limits = composite_lims)\n", + "Cs.sample(N=20, using='mix_mod', vb=False)\n", "Cs.plot()" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ - "qD = qp.utils.calculate_kl_divergence(C, Cq, limits=composite_lims, dx=0.001, vb=True)\n", - "hD = qp.utils.calculate_kl_divergence(C, Ch, limits=composite_lims, dx=0.001, vb=True)\n", - "sD = qp.utils.calculate_kl_divergence(C, Cs, limits=composite_lims, dx=0.001, vb=True)\n", + "qD = qp.metrics.calculate_kld(C, Cq, limits=composite_lims, dx=0.001, vb=True)\n", + "hD = qp.metrics.calculate_kld(C, Ch, limits=composite_lims, dx=0.001, vb=True)\n", + "sD = qp.metrics.calculate_kld(C, Cs, limits=composite_lims, dx=0.001, vb=True)\n", "print(qD, hD, sD)" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": { "collapsed": true }, + "source": [ + "### PDF Ensembles\n", + "\n", + "`qp` also includes infrastructure for handling ensembles of `PDF` objects with shared metaparameters, such as histogram bin ends, but unique per-object parameters, such as histogram bin heights. A `qp.Ensemble` object takes as input the number of items in the ensemble and, optionally, a list whose contents correspond to one of the built-in formats.\n", + "\n", + "Let's demonstrate on PDFs with a functional form, meaning the input list contains one `scipy.stats.rv_continuous` or `qp.composite` object per member of the ensemble." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], - "source": [] + "source": [ + "N = 10\n", + "in_dists = []\n", + "for i in range(N):\n", + " dist = sps.norm(loc=sps.uniform.rvs(), scale=sps.uniform.rvs())\n", + " in_dists.append(dist)\n", + " \n", + "E = qp.Ensemble(N, funcform=in_dists, vb=True) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As with individual `qp.PDF` objects, we can evaluate the PDFs at given points, convert to other formats, and integrate." + ] + }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, + "outputs": [], + "source": [ + "eval_range = np.linspace(-5., 5., 100)\n", + "E.evaluate(eval_range, using='mix_mod', vb=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], - "source": [] + "source": [ + "E.quantize(N=10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "E.integrate(demo_limits, using='mix_mod')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Previous versions of `qp` included a built-in function for \"stacking\" the member PDFs of a `qp.Ensemble` object. This functionality has been removed to discourage use of this procedure in science applications. However, we provide a simple function one may use should this functionality be desired."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def stack(ensemble, loc, using, vb=True):\n", + " \"\"\"\n", + " Produces an average of the PDFs in the ensemble\n", + "\n", + " Parameters\n", + " ----------\n", + " ensemble: qp.Ensemble\n", + " the ensemble of PDFs to stack\n", + " loc: ndarray, float or float\n", + " location(s) at which to evaluate the PDFs\n", + " using: string\n", + " which parametrization to use for the approximation\n", + " vb: boolean\n", + " report on progress\n", + "\n", + " Returns\n", + " -------\n", + " stacked: tuple, ndarray, float\n", + " pair of arrays for locations where approximations were evaluated\n", + " and the values of the stacked PDFs at those points\n", + " \"\"\"\n", + " evaluated = ensemble.evaluate(loc, using=using, norm=True, vb=vb)\n", + " stack = np.mean(evaluated[1], axis=0)\n", + " stacked = (evaluated[0], stack)\n", + " return stacked" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stacked = stack(E, eval_range, using='quantiles')\n", + "plt.plot(stacked[0], stacked[-1])" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "no really, Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.3" + "pygments_lexer": "ipython2", + "version": "2.7.14" } }, "nbformat": 4, diff --git a/docs/notebooks/kld.ipynb b/docs/notebooks/kld.ipynb index f29cda6e..f67c1bbb 100644 --- a/docs/notebooks/kld.ipynb +++ b/docs/notebooks/kld.ipynb @@ -47,11 +47,22 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/aimalz/.local/lib/python2.7/site-packages/matplotlib/__init__.py:1401: UserWarning: This call to matplotlib.use() has no effect\n", + "because the backend has already been chosen;\n", + "matplotlib.use() must be called *before* pylab, matplotlib.pyplot,\n", + "or matplotlib.backends is imported for the first time.\n", + "\n", + " warnings.warn(_use_error_msg)\n" + ] + } + ], "source": [ "import matplotlib\n", "matplotlib.use('TkAgg')\n", @@ -66,37 +77,55 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "interpolator set to [None, None]\n" + ] + } + ], "source": [ - "P = qp.PDF(truth=sps.norm(loc=0.0, scale=1.0))" + "P = qp.PDF(funcform=sps.norm(loc=0.0, scale=1.0))" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "interpolator set to [None, None]\n" + ] + } + ], "source": [ "x, sigma = 2.0, 1.0\n", - "Q = qp.PDF(truth=sps.norm(loc=x, scale=sigma))" + "Q = qp.PDF(funcform=sps.norm(loc=x, scale=sigma))" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "1.99999999989\n" + ] + } + ], "source": [ "infinity = 100.0\n", - "D = qp.utils.calculate_kl_divergence(P, Q, limits=(-infinity,infinity), vb=False)\n", + "D = qp.metrics.calculate_kld(P, Q, limits=(-infinity,infinity), vb=False)\n", "print D" ] }, @@ -119,14 +148,21 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "interpolator set to [None, None]\n", + "1.00094526906\n" + ] + } + ], "source": [ "x, sigma = 0.0, 4.37\n", - "Q = qp.PDF(truth=sps.norm(loc=x, scale=sigma))\n", - "D = qp.utils.calculate_kl_divergence(P, Q, limits=(-infinity,infinity), vb=False)\n", + "Q = qp.PDF(funcform=sps.norm(loc=x, scale=sigma))\n", + "D = qp.metrics.calculate_kld(P, Q, limits=(-infinity,infinity), vb=False)\n", "print D" ] }, @@ -213,10 +249,25 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "interpolator set to [None, None]\n", + "interpolator set to [None, None]\n", + "interpolator set to [None, None]\n", + "interpolator set to [None, None]\n", + "interpolator set to [None, None]\n", + "interpolator set to [None, None]\n", + "interpolator set to [None, None]\n", + "interpolator set to [None, None]\n", + "interpolator set to [None, None]\n", + "interpolator set to [None, None]\n" + ] + } + ], "source": [ "widths = np.logspace(-2.0,2.0,13)\n", "D = np.empty_like(widths)\n", @@ -226,18 +277,16 @@ "infinity = 1000.0\n", "\n", "for k,sigma in enumerate(widths):\n", - " Q = qp.PDF(truth=sps.norm(loc=x, scale=sigma))\n", - " D[k] = qp.utils.calculate_kl_divergence(P, Q, limits=(-infinity,infinity), vb=False)\n", - " E[k] = qp.utils.calculate_rmse(P, Q, limits=(-infinity,infinity), vb=False)\n", + " Q = qp.PDF(funcform=sps.norm(loc=x, scale=sigma))\n", + " D[k] = qp.metrics.calculate_kld(P, Q, limits=(-infinity,infinity), vb=False)\n", + " E[k] = qp.metrics.calculate_rmse(P, Q, limits=(-infinity,infinity), vb=False)\n", "print zip(widths, D)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "x = widths\n", @@ -297,9 +346,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "separations = np.linspace(0.0,15.0,16)\n", @@ -310,18 +357,16 @@ "infinity = 100.0\n", "\n", "for k,x0 in enumerate(separations):\n", - " Q = qp.PDF(truth=sps.norm(loc=x0, scale=sigma))\n", - " D[k] = qp.utils.calculate_kl_divergence(P, Q, limits=(-infinity,infinity), vb=False)\n", - " E[k] = qp.utils.calculate_rmse(P, Q, limits=(-infinity,infinity), vb=False)\n", + " Q = qp.PDF(funcform=sps.norm(loc=x0, scale=sigma))\n", + " D[k] = qp.metrics.calculate_kld(P, Q, limits=(-infinity,infinity), vb=False)\n", + " E[k] = qp.metrics.calculate_rmse(P, Q, limits=(-infinity,infinity), vb=False)\n", "print zip(separations, D)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "fig, ax = plt.subplots()\n", @@ -349,9 +394,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import sys\n", @@ -377,9 +420,7 @@ { "cell_type": "code", 
"execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "infinity = 100.0\n", @@ -393,9 +434,9 @@ "for j,sigma in enumerate(widths):\n", " \n", " for k,x0 in enumerate(separations):\n", - " Q = qp.PDF(truth=sps.norm(loc=x0, scale=sigma))\n", - " D[j,k] = qp.utils.calculate_kl_divergence(P, Q, limits=(-infinity,infinity), vb=False)\n", - " E[j,k] = qp.utils.calculate_rmse(P, Q, limits=(-infinity,infinity), vb=False)\n", + " Q = qp.PDF(funcform=sps.norm(loc=x0, scale=sigma))\n", + " D[j,k] = qp.metrics.calculate_kld(P, Q, limits=(-infinity,infinity), vb=False)\n", + " E[j,k] = qp.metrics.calculate_rmse(P, Q, limits=(-infinity,infinity), vb=False)\n", " tensions[j,k] = x0 / np.sqrt(sigma*sigma + 1.0)\n", " " ] @@ -403,9 +444,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "x = tensions[0,:]\n", @@ -461,16 +500,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "no really, Python 2", "language": "python", "name": "python2" }, @@ -484,9 +521,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.13" + "version": "2.7.14" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/qp/__init__.py b/qp/__init__.py index 9a01534f..60cb3a38 100644 --- a/qp/__init__.py +++ b/qp/__init__.py @@ -1,4 +1,6 @@ -from pdf import * from composite import * from ensemble import * +from metrics import * +# from parametrization import * +from pdf import * from utils import * diff --git a/qp/ensemble.py b/qp/ensemble.py index 2846c5a6..fcdede02 100644 --- a/qp/ensemble.py +++ b/qp/ensemble.py @@ -17,7 +17,7 @@ class Ensemble(object): - def __init__(self, N, truth=None, quantiles=None, histogram=None, gridded=None, samples=None, limits=None, scheme='linear', vb=True, procs=None):# where='ensemble.db', procs=None):# + def __init__(self, N, funcform=None, quantiles=None, histogram=None, gridded=None, samples=None, limits=None, scheme='linear', vb=True, procs=None):# where='ensemble.db', procs=None):# """ Creates an object comprised of many qp.PDF objects to efficiently perform operations on all of them @@ -26,8 +26,7 @@ def __init__(self, N, truth=None, quantiles=None, histogram=None, gridded=None, ---------- N: int number of pdfs in the ensemble - truth: list of scipy.stats.rv_continuous objects or qp.composite objects - , optional + funcform: list of scipy.stats.rv_continuous objects or qp.composite objects, optional List of length (npdfs) containing the continuous, parametric forms of the PDFs quantiles: tuple of ndarrays, optional Pair of arrays of lengths (nquants) and (npdfs, nquants) containing @@ -75,7 +74,7 @@ def __init__(self, N, truth=None, quantiles=None, histogram=None, gridded=None, self.n_pdfs = N self.pdf_range = range(N) - if truth is None and quantiles is None and histogram is None and gridded is None and samples is None: + if funcform is None and quantiles is None and histogram is None and gridded is None and samples is None: print 'Warning: initializing an Ensemble object without inputs' return @@ -84,10 +83,10 @@ def __init__(self, N, truth=None, quantiles=None, histogram=None, gridded=None, else: self.limits = limits - if truth is None: - self.truth = [None] * N + if funcform is None: + self.mix_mod = [None] * N 
else: - self.truth = truth + self.mix_mod = funcform if samples is None: self.samples = [None] * N else: @@ -104,14 +103,14 @@ def __init__(self, N, truth=None, quantiles=None, histogram=None, gridded=None, self.gridded = (None, [None] * N) else: self.gridded = (None, [(gridded[0], gridded[1][i]) for i in self.pdf_range]) - self.mix_mod = None + # self.mix_mod = None self.evaluated = None self.scheme = scheme self.make_pdfs() - self.stacked = {} + # self.stacked = {} self.klds = {} def make_pdfs(self): @@ -121,7 +120,8 @@ def make_pdfs(self): def make_pdfs_helper(i): # with open(self.logfilename, 'wb') as logfile: # logfile.write('making pdf '+str(i)+'\n') - return qp.PDF(truth=self.truth[i], quantiles=self.quantiles[i], + return qp.PDF( funcform=self.mix_mod[i], + quantiles=self.quantiles[i], histogram=self.histogram[i], gridded=self.gridded[-1][i], samples=self.samples[i], limits=self.limits, scheme=self.scheme, vb=False) @@ -275,7 +275,7 @@ def mixmod_helper(i): return self.mix_mod - def evaluate(self, loc, using=None, norm=False, vb=True): + def evaluate(self, loc, using=None, norm=False, vb=False): """ Evaluates all PDFs @@ -327,6 +327,8 @@ def integrate(self, limits, using, dx=0.001): integral: numpy.ndarray, float value of the integral """ + if len(np.shape(limits)) == 1: + limits = [limits] * self.n_pdfs def integrate_helper(i): try: return self.pdfs[i].integrate(limits[i], using=using, dx=dx, vb=False) @@ -521,39 +523,39 @@ def rmse_helper(i): return rmses - def stack(self, loc, using, vb=True): - """ - Produces an average of the PDFs in the ensemble - - Parameters - ---------- - loc: ndarray, float or float - location(s) at which to evaluate the PDFs - using: string - which parametrization to use for the approximation - vb: boolean - report on progress - - Returns - ------- - self.stacked: tuple, ndarray, float - pair of arrays for locations where approximations were evaluated - and the values of the stacked PDFs at those points - - Notes - ----- - Stacking refers to taking the sum of PDFs evaluated on a shared grid and normalizing it such that it integrates to unity. This is equivalent to calculating an average probability (based on the PDFs in the ensemble) over the grid. This probably should be done in a script and not by qp! The right way to do it would be to call qp.Ensemble.evaluate() and sum those outputs appropriately. - TO DO: make this do something more efficient for mixmod, grid, histogram, samples - TO DO: enable stacking on irregular grid - """ - loc_range = max(loc) - min(loc) - delta = loc_range / len(loc) - evaluated = self.evaluate(loc, using=using, norm=True, vb=vb) - stack = np.mean(evaluated[1], axis=0) - stack /= np.sum(stack) * delta - assert(np.isclose(np.sum(stack) * delta, 1.)) - self.stacked[using] = (evaluated[0], stack) - return self.stacked + # def stack(self, loc, using, vb=True): + # """ + # Produces an average of the PDFs in the ensemble + # + # Parameters + # ---------- + # loc: ndarray, float or float + # location(s) at which to evaluate the PDFs + # using: string + # which parametrization to use for the approximation + # vb: boolean + # report on progress + # + # Returns + # ------- + # self.stacked: tuple, ndarray, float + # pair of arrays for locations where approximations were evaluated + # and the values of the stacked PDFs at those points + # + # Notes + # ----- + # Stacking refers to taking the sum of PDFs evaluated on a shared grid and normalizing it such that it integrates to unity. 
This is equivalent to calculating an average probability (based on the PDFs in the ensemble) over the grid. This probably should be done in a script and not by qp! The right way to do it would be to call qp.Ensemble.evaluate() and sum those outputs appropriately. + # TO DO: make this do something more efficient for mixmod, grid, histogram, samples + # TO DO: enable stacking on irregular grid + # """ + # loc_range = max(loc) - min(loc) + # delta = loc_range / len(loc) + # evaluated = self.evaluate(loc, using=using, norm=True, vb=vb) + # stack = np.mean(evaluated[1], axis=0) + # stack /= np.sum(stack) * delta + # assert(np.isclose(np.sum(stack) * delta, 1.)) + # self.stacked[using] = (evaluated[0], stack) + # return self.stacked # Note: A copious quantity of commented code has been removed in this commit! # For future reference, it can still be found here: diff --git a/qp/metrics.py b/qp/metrics.py new file mode 100644 index 00000000..5a293f3b --- /dev/null +++ b/qp/metrics.py @@ -0,0 +1,206 @@ +import numpy as np + +import qp + +def calculate_moment(p, N, using=None, limits=None, dx=0.01, vb=False): + """ + Calculates a moment of a qp.PDF object + + Parameters + ---------- + p: qp.PDF object + the PDF whose moment will be calculated + N: int + order of the moment to be calculated + using: string, optional + which parametrization to evaluate; defaults to the PDF's first parametrization + limits: tuple of floats + endpoints of integration interval over which to calculate moments + dx: float + resolution of integration grid + vb: Boolean + print progress to stdout? + + Returns + ------- + M: float + value of the moment + """ + if limits is None: + limits = p.limits + if using is None: + using = p.first + # Make a grid from the limits and resolution + d = int((limits[-1] - limits[0]) / dx) + grid = np.linspace(limits[0], limits[1], d) + dx = (limits[-1] - limits[0]) / (d - 1) + # Evaluate the functions on the grid + pe = p.evaluate(grid, using=using, vb=vb)[1] + # pe = normalize_gridded(pe)[1] + # calculate the moment + grid_to_N = grid ** N + M = quick_moment(pe, grid_to_N, dx) + return M + +def quick_moment(p_eval, grid_to_N, dx): + """ + Calculates a moment of an evaluated PDF + + Parameters + ---------- + p_eval: numpy.ndarray, float + the values of a probability distribution + grid_to_N: numpy.ndarray, float + the grid upon which p_eval was evaluated, raised to the power N of the desired moment + dx: float + the difference between regular grid points + + Returns + ------- + M: float + value of the moment + """ + M = np.dot(grid_to_N, p_eval) * dx + return M + +def calculate_kld(p, q, limits=qp.utils.lims, dx=0.01, vb=False): + """ + Calculates the Kullback-Leibler Divergence between two qp.PDF objects. + + Parameters + ---------- + p: PDF object + probability distribution whose distance _from_ `q` will be calculated. + q: PDF object + probability distribution whose distance _to_ `p` will be calculated. + limits: tuple of floats + endpoints of integration interval in which to calculate KLD + dx: float + resolution of integration grid + vb: boolean + report on progress to stdout? + + Returns + ------- + Dpq: float + the value of the Kullback-Leibler Divergence from `q` to `p` + + Notes + ----- + TO DO: have this take number of points not dx!
+ """ + # Make a grid from the limits and resolution + N = int((limits[-1] - limits[0]) / dx) + grid = np.linspace(limits[0], limits[1], N) + dx = (limits[-1] - limits[0]) / (N - 1) + # Evaluate the functions on the grid and normalize + pe = p.evaluate(grid, vb=vb, norm=True) + pn = pe[1] + qe = q.evaluate(grid, vb=vb, norm=True) + qn = qe[1] + # Normalize the evaluations, so that the integrals can be done + # (very approximately!) by simple summation: + # pn = pe / np.sum(pe) + #denominator = max(np.sum(qe), epsilon) + # qn = qe / np.sum(qe)#denominator + # Compute the log of the normalized PDFs + # logquotient = safelog(pn / qn) + # logp = safelog(pn) + # logq = safelog(qn) + # Calculate the KLD from q to p + Dpq = quick_kld(pn, qn, dx=dx)# np.dot(pn * logquotient, np.ones(len(grid)) * dx) + if Dpq < 0.: + print('broken KLD: '+str((Dpq, pn, qn, dx))) + Dpq = qp.utils.epsilon + return Dpq + +def quick_kld(p_eval, q_eval, dx=0.01): + """ + Calculates the Kullback-Leibler Divergence between two evaluations of PDFs. + + Parameters + ---------- + p_eval: numpy.ndarray, float + evaluations of probability distribution whose distance _from_ `q` will be calculated + q_eval: numpy.ndarray, float + evaluations of probability distribution whose distance _to_ `p` will be calculated. + dx: float + resolution of integration grid + + Returns + ------- + Dpq: float + the value of the Kullback-Leibler Divergence from `q` to `p` + + Notes + ----- + TO DO: change this to quick_kld + """ + logquotient = qp.utils.safelog(p_eval) - qp.utils.safelog(q_eval) + # logp = safelog(pn) + # logq = safelog(qn) + # Calculate the KLD from q to p + Dpq = dx * np.sum(p_eval * logquotient) + return Dpq + +def calculate_rmse(p, q, limits=qp.utils.lims, dx=0.01, vb=False): + """ + Calculates the Root Mean Square Error between two qp.PDF objects. + + Parameters + ---------- + p: PDF object + probability distribution function whose distance between its truth and the approximation of `q` will be calculated. + q: PDF object + probability distribution function whose distance between its approximation and the truth of `p` will be calculated. + limits: tuple of floats + endpoints of integration interval in which to calculate RMS + dx: float + resolution of integration grid + vb: boolean + report on progress to stdout? + + Returns + ------- + rms: float + the value of the RMS error between `q` and `p` + + Notes + ----- + TO DO: change dx to N + """ + # Make a grid from the limits and resolution + N = int((limits[-1] - limits[0]) / dx) + grid = np.linspace(limits[0], limits[1], N) + dx = (limits[-1] - limits[0]) / (N - 1) + # Evaluate the functions on the grid + pe = p.evaluate(grid, vb=vb)[1] + qe = q.evaluate(grid, vb=vb)[1] + # Calculate the RMS between p and q + rms = quick_rmse(pe, qe, N)# np.sqrt(dx * np.sum((pe - qe) ** 2)) + return rms + +def quick_rmse(p_eval, q_eval, N): + """ + Calculates the Root Mean Square Error between two evaluations of PDFs. + + Parameters + ---------- + p_eval: numpy.ndarray, float + evaluation of probability distribution function whose distance between + its truth and the approximation of `q` will be calculated. + q_eval: numpy.ndarray, float + evaluation of probability distribution function whose distance between + its approximation and the truth of `p` will be calculated. 
+ N: int + number of points at which PDFs were evaluated + + Returns + ------- + rms: float + the value of the RMS error between `q` and `p` + """ + # Calculate the RMS between p and q + rms = np.sqrt(np.sum((p_eval - q_eval) ** 2) / N) + return rms diff --git a/qp/pdf.py b/qp/pdf.py index 263dea38..63dfda68 100644 --- a/qp/pdf.py +++ b/qp/pdf.py @@ -25,7 +25,8 @@ class PDF(object): - def __init__(self, truth=None, quantiles=None, histogram=None, + def __init__(self, funcform=None, + quantiles=None, histogram=None, gridded=None, samples=None, limits=default_lims, scheme='linear', vb=True): """ @@ -34,8 +35,8 @@ def __init__(self, truth=None, quantiles=None, histogram=None, Parameters ---------- - truth: scipy.stats.rv_continuous object or qp.composite object, optional - Continuous, parametric form of the PDF + # truth: scipy.stats.rv_continuous object or qp.composite object, optional + # Continuous, parametric form of the PDF quantiles: tuple of ndarrays, optional Pair of arrays of lengths (nquants, nquants) containing CDF values and quantiles @@ -61,25 +62,28 @@ def __init__(self, truth=None, quantiles=None, histogram=None, TO DO: change dx --> dz (or delta) TO DO: consider changing quantiles input to just be the z-values since interpolation gives the p(z) values anyway """ - self.truth = truth + # self.truth = truth self.quantiles = quantiles self.histogram = qp.utils.normalize_histogram(histogram, vb=vb) self.samples = samples - self.gridded = qp.utils.normalize_integral(qp.utils.normalize_gridded(gridded, vb=vb)) - self.mix_mod = None + self.gridded = qp.utils.normalize_integral(qp.utils.normalize_gridded(gridded)) + self.mix_mod = funcform + self.mixmod = self.mix_mod self.limits = limits - self.scheme = scheme + self.interpolator = [None, None] + # if vb: print('interpolator set to '+str(self.interpolator)) - if vb and self.truth is None and self.quantiles is None and self.histogram is None and self.gridded is None and self.samples is None: - print 'Warning: initializing a PDF object without inputs' - return + # if vb and self.quantiles is None and self.histogram is None and self.gridded is None and self.samples is None:# and self.truth is None: + # print 'Warning: initializing a PDF object without inputs' + # return # Record how the PDF object was initialized: - if self.truth is not None: - self.initialized = self.truth - self.first = 'truth' + if self.mix_mod is not None: + self.initialized = self.mix_mod + self.first = 'mix_mod' elif self.gridded is not None: + # if self.gridded is not None: self.initialized = self.gridded self.first = 'gridded' self.limits = (min(self.limits[0], np.min(self.gridded[0])), max(self.limits[-1], np.max(self.gridded[0]))) @@ -95,12 +99,15 @@ def __init__(self, truth=None, quantiles=None, histogram=None, self.initialized = self.quantiles self.first = 'quantiles' self.limits = (min(self.limits[0], np.min(self.quantiles[-1])), max(self.limits[-1], np.max(self.quantiles[-1]))) + else: + print 'Warning: initializing a PDF object without inputs' + return # The most recent parametrization used is, at this point, the # first one: + self.truth = self.first self.last = self.first - self.interpolator = [None, None] - self.klds = {} + # self.klds = {} return @@ -127,16 +134,16 @@ def evaluate(self, loc, using=None, norm=False, vb=True): the input locations and the value of the PDF (or its approximation) at the requested location(s) """ if using is None: - using = self.first + using = self.last - if using == 'truth': - if self.truth is not None: - if vb: print 
'Evaluating the true distribution.' - val = self.truth.pdf(loc) - self.evaluated = (loc, val) - else: - raise ValueError('true PDF is not set, use an approximation instead (the most recent one was '+self.last+')') - elif using == 'mix_mod': + # if using == 'truth': + # if self.truth is not None: + # if vb: print 'Evaluating the true distribution.' + # val = self.truth.pdf(loc) + # self.evaluated = (loc, val) + # else: + # raise ValueError('true PDF is not set, use an approximation instead (the most recent one was '+self.last+')') + if using == 'mix_mod': if self.mix_mod is None: self.mix_mod = self.mix_mod_fit(vb=vb) if vb: print 'Evaluating the fitted mixture model distribution.' @@ -147,7 +154,7 @@ def evaluate(self, loc, using=None, norm=False, vb=True): evaluated = self.approximate(loc, using=using, vb=vb) val = evaluated[1] - gridded = qp.utils.normalize_gridded((loc, val), vb=vb) + gridded = qp.utils.normalize_gridded((loc, val)) if norm: gridded = qp.utils.normalize_integral(gridded, vb=vb) @@ -230,8 +237,8 @@ def quantize(self, quants=None, N=9, limits=None, vb=True): if limits is None: limits = self.limits - if self.truth is not None: - if isinstance(self.truth, qp.composite): + if self.mixmod is not None: + if isinstance(self.mixmod, qp.composite): if type(self.scheme) != int: order = 5 else: @@ -241,13 +248,13 @@ def quantize(self, quants=None, N=9, limits=None, vb=True): min_delta = np.min(extrapoints[1:] - extrapoints[:-1]) grid = np.linspace(limits[0], limits[-1], N + 1) - icdf = self.truth.cdf(grid) + icdf = self.mixmod.cdf(grid) unit_ext = 1. / (order + 1.) low_extended = 0 while icdf[0] >= quantpoints[0]: low_extended += 1 subgrid = np.linspace(limits[0] - 1., limits[0] - unit_ext, order) - subcdf = self.truth.cdf(subgrid) + subcdf = self.mixmod.cdf(subgrid) grid = np.concatenate((subgrid, grid)) icdf = np.concatenate((subcdf, icdf)) limits = (limits[0] - 1., limits[-1]) @@ -257,7 +264,7 @@ def quantize(self, quants=None, N=9, limits=None, vb=True): while icdf[-1] <= quantpoints[-1]: high_extended += 1 subgrid = np.linspace(limits[-1] + unit_ext, limits[-1] + 1., order) - subcdf = self.truth.cdf(subgrid) + subcdf = self.mixmod.cdf(subgrid) grid = np.concatenate((grid, subgrid)) icdf = np.concatenate((icdf, subcdf)) limits = (limits[0], limits[-1] + 1.) @@ -273,7 +280,7 @@ def quantize(self, quants=None, N=9, limits=None, vb=True): delta_i = new_deltas[i] / (order + 1.) 
subgrid = np.linspace(grid[i] + delta_i, grid[i+1] - delta_i, order) grid = np.sort(np.insert(grid, i, subgrid)) - subcdf = self.truth.cdf(subgrid) + subcdf = self.mixmod.cdf(subgrid) icdf = np.sort(np.insert(icdf, i, subcdf)) new_deltas = icdf[1:] - icdf[:-1] if vb: @@ -295,19 +302,20 @@ def quantize(self, quants=None, N=9, limits=None, vb=True): try: while (order>0) and (not np.array_equal(quantiles, np.sort(quantiles))): if vb: print('order is '+str(order)) - b = spi.InterpolatedUnivariateSpline(icdf, grid, k=order, ext=1) + b = spi.InterpolatedUnivariateSpline(icdf, grid, k=order, ext=1, check_finite=True) quantiles = b(quantpoints) order -= 1 assert(not np.any(np.isnan(quantiles))) - assert(type(quantiles) is not dfitpack.error) except AssertionError: print('ERROR: splines failed because '+str(AssertionError)+', defaulting to optimization for '+str((icdf, grid))) locs = np.array([bisect.bisect_right(icdf[:-1], quantpoints[n]) for n in range(N)]) - quantiles = self.truth.ppf(quantpoints, ivals=grid[locs]) + quantiles = self.mixmod.ppf(quantpoints, ivals=grid[locs]) assert(not np.any(np.isnan(quantiles))) + except Exception, e: + print('ERROR in `scipy.interpolate.InterpolatedUnivariateSpline`') if vb: print('output quantiles = '+str(quantiles)) else: - quantiles = self.truth.ppf(quantpoints) + quantiles = self.mixmod.ppf(quantpoints) else: print('New quantiles can only be computed from a truth distribution in this version.') return @@ -362,14 +370,14 @@ def histogramize(self, binends=None, N=10, binrange=None, vb=True): histogram = np.zeros(N) if vb: print 'Calculating histogram: ', binends - if self.truth is not None: - cdf = self.truth.cdf(binends) + if self.mixmod is not None: + cdf = self.mixmod.cdf(binends) heights = cdf[1:] - cdf[:-1] histogram = qp.utils.normalize_histogram((binends, heights), vb=vb) # for b in range(N): # histogram[b] = (cdf[b+1]-cdf[b])/(binends[b+1]-binends[b]) else: - print 'New histograms can only be computed from a truth distribution in this version.' + print 'New histograms can only be computed from a mixmod format in this version.' return if vb: print 'Result: ', histogram @@ -490,9 +498,9 @@ def sample(self, N=1000, infty=default_infty, using=None, vb=True): if vb: print 'Sampling from '+using+' parametrization.' - if using == 'truth': - samples = self.truth.rvs(size=N) - elif using == 'mix_mod': + # if using == 'truth': + # samples = self.truth.rvs(size=N) + if using == 'mix_mod': samples = self.mix_mod.rvs(size=N) elif using == 'gridded': @@ -513,8 +521,8 @@ def sample(self, N=1000, infty=default_infty, using=None, vb=True): if self.quantiles is None: self.quantiles = self.quantize(vb=vb) - (x, y) = qp.utils.evaluate_quantiles(self.quantiles, vb=vb) - (endpoints, weights) = qp.utils.normalize_quantiles(self.quantiles, (x, y), vb=vb) + # (x, y) = qp.utils.evaluate_quantiles(self.quantiles, vb=vb) + (endpoints, weights) = qp.utils.normalize_quantiles(self.quantiles, vb=vb) # endpoints = np.insert(self.quantiles[1], [0, -1], self.limits) # weights = qp.utils.evaluate_quantiles(self.quantiles)[1]# self.evaluate((endpoints[1:]+endpoints[:-1])/2.) # interpolator = self.interpolate(using='quantiles', vb=False) @@ -572,8 +580,9 @@ def interpolate(self, using=None, vb=True): using = self.last if using == 'truth' or using == 'mix_mod': + interpolator = self.mix_mod.pdf print 'A functional form needs no interpolation. Try converting to an approximate parametrization first.' 
- return + # return if using == 'quantiles': # First find the quantiles if none exist: @@ -586,14 +595,12 @@ def interpolate(self, using=None, vb=True): order = self.scheme if vb: print('input quantiles are '+str(self.quantiles[1])) - (x, y) = qp.utils.evaluate_quantiles(self.quantiles, vb=vb) - if vb: print('evaluated quantile PDF: '+str((x, y))) + # (x, y) = qp.utils.evaluate_quantiles(self.quantiles, vb=vb) + # if vb: print('evaluated quantile PDF: '+str((x, y))) # [x_crit_lo, x_crit_hi] = [x[0], x[-1]] # [y_crit_lo, y_crit_hi] = [y[0], y[-1]] - (x, y) = qp.utils.normalize_quantiles(self.quantiles, (x, y), vb=vb) + (x, y) = qp.utils.normalize_quantiles(self.quantiles, vb=vb) if vb: print('complete evaluated quantile PDF: '+str((x, y))) - alternate = spi.interp1d(x, y, kind='linear', bounds_error=False, fill_value=default_eps) - backup = qp.utils.make_kludge_interpolator((x, y), outside=default_eps) z = np.insert(self.quantiles[1], 0, min(x)) z = np.append(z, max(x)) @@ -652,12 +659,14 @@ def quantile_interpolator(xf): except AssertionError: print('ERROR: spline interpolation failed with '+str((xf[in_inds], yf[in_inds]))) try: + alternate = spi.interp1d(x, y, kind='linear', bounds_error=False, fill_value=default_eps) yf[in_inds] = alternate(xf[in_inds]) assert(np.all(yf >= default_eps)) if vb: print 'Created a linear interpolator for the '+using+' parametrization.' except AssertionError: print 'ERROR: linear interpolation failed for the '+using+' parametrization with '+str((xf[in_inds], yf[in_inds])) + backup = qp.utils.make_kludge_interpolator((x, y), threshold=default_eps) yf[in_inds] = backup(xf[in_inds]) if vb: print 'Doing linear interpolation by hand for the '+using+' parametrization.' @@ -802,7 +811,7 @@ def approximate(self, points, using=None, scheme=None, vb=True): interpolated = interpolator(points) # except: # print('error in '+using+' interpolation of '+str(points)) - interpolated = qp.utils.normalize_gridded((points, interpolated), vb=vb) + interpolated = qp.utils.normalize_gridded((points, interpolated)) # interpolated[interpolated<0.] = 0. return interpolated#(points, interpolated) @@ -847,12 +856,12 @@ def plot(self, limits=None, loc='plot.pdf', vb=True): styles['samples'] = '-.'#(0,(1,2)) x = np.linspace(self.limits[0], self.limits[-1], 100) - if self.truth is not None: - [min_x, max_x] = [self.truth.ppf(np.array([0.001])), self.truth.ppf(np.array([0.999]))] + if self.mixmod is not None: + [min_x, max_x] = [self.mixmod.ppf(np.array([0.001])), self.mixmod.ppf(np.array([0.999]))] extrema = [min(extrema[0], min_x), max(extrema[1], max_x)] [min_x, max_x] = extrema x = np.linspace(min_x, max_x, 100) - y = self.truth.pdf(x) + y = self.mixmod.pdf(x) plt.plot(x, y, color=colors['truth'], linestyle=styles['truth'], lw=5.0, alpha=0.25, label='True PDF') if vb: print 'Plotted truth.' @@ -868,10 +877,10 @@ def plot(self, limits=None, loc='plot.pdf', vb=True): print 'Plotted mixture model.' 
if self.quantiles is not None: - (z, p) = self.evaluate(self.quantiles[1], using='quantiles', vb=vb) - print('first: '+str((z,p))) - (x, y) = qp.utils.normalize_quantiles(self.quantiles, (z, p)) - print('second: '+str((x,y))) + # (z, p) = self.evaluate(self.quantiles[1], using='quantiles', vb=vb) + # print('first: '+str((z,p))) + (x, y) = qp.utils.normalize_quantiles(self.quantiles) + print('second: '+str((x, y))) [min_x, max_x] = [min(x), max(x)] extrema = [min(extrema[0], min_x), max(extrema[1], max_x)] [min_x, max_x] = extrema @@ -934,74 +943,76 @@ def plot(self, limits=None, loc='plot.pdf', vb=True): return - def kld(self, using=None, limits=None, dx=0.01): - """ - Calculates Kullback-Leibler divergence of quantile approximation from truth. - - Parameters - ---------- - using: string - parametrization to use - limits: tuple of floats, optional - endpoints of integration interval in which to calculate KLD - dx: float - resolution of integration grid - - Returns - ------- - KL: float - value of Kullback-Leibler divergence from approximation to truth - if truth is available; otherwise nothing. - - Notes - ----- - Example:: - d = p.kld(limits=(-1., 1.), dx=1./100)) - """ - # print('This function is deprecated; use `qp.utils.calculate_kl_divergence`.') - # return - if self.truth is None: - print('Truth not available for comparison.') - return - else: - if using is None: - using = self.last - if limits is None: - limits = self.limits - D = int((limits[-1] - limits[0]) / dx) - grid = np.linspace(limits[0], limits[1], D) - KL = qp.utils.quick_kl_divergence(self.evaluate(grid, using='truth'), self.evaluate(grid, using=using), dx=dx) - self.klds[using] = KL - return(KL) - - def rms(self, limits=(0., 1.), dx=0.01): - """ - Calculates root mean square difference between quantile approximation - and truth. - - Parameters - ---------- - limits: tuple of floats - endpoints of integration interval in which to calculate KLD - dx: float - resolution of integration grid - - Returns - ------- - RMS: float - value of root mean square difference between approximation of truth - if truth is available; otherwise nothing. - - Notes - ----- - Example:: - d = p.rms(limits=(-1., 1.), dx=1./100)) - """ - print('This function is deprecated; use `qp.utils.calculate_rmse`.') - return - # if self.truth is None: - # print('Truth not available for comparison.') - # return - # else: - # RMS = qp.utils.calculate_rms(self, self, limits=limits, dx=dx) - # return(RMS) + # def kld(self, using=None, limits=None, dx=0.01): + # """ + # Calculates Kullback-Leibler divergence of quantile approximation from truth. + # + # Parameters + # ---------- + # using: string + # parametrization to use + # limits: tuple of floats, optional + # endpoints of integration interval in which to calculate KLD + # dx: float + # resolution of integration grid + # + # Returns + # ------- + # KL: float + # value of Kullback-Leibler divergence from approximation to truth + # if truth is available; otherwise nothing. 
+ # + # Notes + # ----- + # Example:: + # d = p.kld(limits=(-1., 1.), dx=1./100)) + # """ + # print('This function is deprecated; use `qp.utils.calculate_kld`.') + # return + # # print('This function is deprecated; use `qp.utils.calculate_kl_divergence`.') + # # return + # # if self.truth is None: + # # print('Truth not available for comparison.') + # # return + # # else: + # # if using is None: + # # using = self.last + # # if limits is None: + # # limits = self.limits + # # D = int((limits[-1] - limits[0]) / dx) + # # grid = np.linspace(limits[0], limits[1], D) + # # KL = qp.utils.quick_kl_divergence(self.evaluate(grid, using='truth'), self.evaluate(grid, using=using), dx=dx) + # # self.klds[using] = KL + # # return(KL) + # + # def rmse(self, limits=(0., 1.), dx=0.01): + # """ + # Calculates root mean square difference between quantile approximation + # and truth. + # + # Parameters + # ---------- + # limits: tuple of floats + # endpoints of integration interval in which to calculate KLD + # dx: float + # resolution of integration grid + # + # Returns + # ------- + # RMS: float + # value of root mean square difference between approximation of truth + # if truth is available; otherwise nothing. + # + # Notes + # ----- + # Example:: + # d = p.rms(limits=(-1., 1.), dx=1./100)) + # """ + # print('This function is deprecated; use `qp.utils.calculate_rmse`.') + # return + # # if self.truth is None: + # # print('Truth not available for comparison.') + # # return + # # else: + # # RMS = qp.utils.calculate_rms(self, self, limits=limits, dx=dx) + # # return(RMS) diff --git a/qp/utils.py b/qp/utils.py index c9af2bf2..527d65a5 100644 --- a/qp/utils.py +++ b/qp/utils.py @@ -1,14 +1,7 @@ -""" -Notes ------ -TO DO: change dx --> dz (or delta) -""" - import numpy as np import scipy as sp from scipy import stats as sps import sys -# import bisect global epsilon epsilon = sys.float_info.epsilon @@ -17,20 +10,50 @@ global lims lims = (epsilon, 1.) +def sandwich(in_arr, ends): + """ + Adds given values to the ends of a 1D array + + Parameters + ---------- + in_arr: numpy.ndarray, float + original array + ends: numpy.ndarray or tuple or list, float or numpy.ndarray, float + values to be added to the beginning and end + + Returns + ------- + out_arr: numpy.ndarray, float + array with front and back concatenations + """ + if type(ends[0]) == np.ndarray: + prepend = len(ends[0]) + else: + prepend = 1 + if type(ends[-1]) == np.ndarray: + append = -1 * len(ends[-1]) + else: + append = -1 + out_arr = np.zeros(prepend + len(in_arr) - append) + out_arr[:prepend] = ends[0] + out_arr[prepend:append] = in_arr + out_arr[append:] = ends[-1] + return out_arr + def safelog(arr, threshold=epsilon): """ - Takes the natural logarithm of an array that might contain zeroes. 
+ Takes the natural logarithm of an array of potentially non-positive numbers Parameters ---------- - arr: ndarray + arr: numpy.ndarray, float values to be logged threshold: float small, positive value to replace zeros and negative numbers Returns ------- - logged: ndarray + logged: numpy.ndarray logarithms, with approximation in place of zeros and negative numbers """ shape = np.shape(arr) @@ -40,433 +63,268 @@ def safelog(arr, threshold=epsilon): def normalize_integral(in_data, vb=False): """ - Normalizes integrals over full range from grid + Normalizes integrals of PDF evaluations on a grid Parameters ---------- - in_data: None or tuple, ndarray, float - tuple of points at which function is evaluated and the PDF at those points - vb: boolean - print progress to stdout? + in_data: None or tuple, numpy.ndarray, float + tuple of points x at which function is evaluated and the PDF y at those + points + vb: boolean, optional + be careful and print progress to stdout? Returns ------- - (x, y): tuple, ndarray, float - tuple of input x and normalized y + out_data: tuple, numpy.ndarray, float + tuple of ordered input x and normalized y """ if in_data is None: return in_data (x, y) = in_data - # a = x.argsort() - # x.sort() - # ys = y[a] + # if vb: + a = x.argsort() + # try: + # assert np.array_equal(x[a], x.sort()) + # except AssertionError: + x.sort() + y = y[a] dx = x[1:] - x[:-1] - dy = (y[1:] + y[:-1]) / 2. - norm = np.dot(dy, dx) + my = (y[1:] + y[:-1]) / 2. + norm = np.dot(my, dx) y = y / norm if vb: - # print('almost normalized integrals') - dy = (y[1:] + y[:-1]) / 2. - if not np.isclose(np.dot(dy, dx), 1.): - print('broken integral = '+str(np.dot(dy, dx))) + try: + my = (y[1:] + y[:-1]) / 2. + assert np.isclose(np.dot(my, dx), 1.) + except AssertionError: + print('`qp.utils.normalize_integral`: broken integral = '+str((my, dx))) assert False - return(x, y) - -def normalize_gridded(in_data, vb=True): - """ - Normalizes gridded parametrizations assuming evenly spaced grid - - Parameters - ---------- - in_data: None or tuple, ndarray, float - tuple of points at which function is evaluated and the PDF at those points - vb: boolean - print progress to stdout? + out_data = (x, y) + return out_data - Returns - ------- - (x, y): tuple, ndarray, float - tuple of input x and normalized y +def evaluate_samples(in_data, bw_method=None, vb=False): """ - if in_data is None: - return in_data - (x, y) = in_data - y[y < epsilon] = epsilon - y[y > infty] = infty - return (x, y) - -def normalize_histogram(in_data, vb=True): - """ - Normalizes histogram parametrizations - - Parameters - ---------- - in_data: None or tuple, ndarray, float - tuple of (n+1) bin endpoints and (n) CDF between endpoints - vb: boolean - print progress to stdout? - - Returns - ------- - (x, y): tuple, ndarray, float - tuple of input x and normalized y - """ - if in_data is None: - return in_data - (x, y) = in_data - delta = x[1:] - x[:-1] - # if vb: print(np.sum(y * delta)) - y[y < epsilon] = epsilon - y /= np.dot(y, delta) - # if vb: print(np.sum(y * delta)) - return (x, y) - -def normalize_quantiles((q, z), (x, y), vb=True): - """ - Adds valid endpoints to quantile parametrization - - Parameters - ---------- - q: numpy.ndarray, float - CDF values corresponding to quantiles - z: numpy.ndarray, float - original quantiles - x: numpy.ndarray, float - averaged quantile values - y: numpy.ndarray, float - probability evaluated at averaged quantiles - vb: boolean - print progress to stdout? 
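
# A toy run of the two helpers added above, with made-up inputs (only numpy
# is assumed):
import numpy as np

x = np.array([0., 0.5, 1.0, 1.5])
y = np.array([1., 3., 3., 1.])
# sandwich pads values onto both ends: sandwich(y, (0., 0.)) gives
# [0., 1., 3., 3., 1., 0.]; array-valued ends prepend/append longer pads.
# normalize_integral divides out a trapezoid-rule integral:
dx = x[1:] - x[:-1]
my = (y[1:] + y[:-1]) / 2.     # midpoint height of each trapezoid
y = y / np.dot(my, dx)         # after this the trapezoid-rule integral is 1.
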
- - Returns - ------- - (x, y): tuple, ndarray, float - tuple of input x and normalized y - - Notes - ----- - Finds actual endpoints via linear interpolation from evaluation - """ - # nq = np.insert(q, [0, -1], (0., 1.)) - q = np.insert(q, 0, 0.) - q = np.append(q, 1.) - # nq = (q[1:] + q[:-1]) / 2. - dq = q[1:] - q[:-1] - # if vb: - # if not np.all(nq>0.)... - xmin = z[0] - 2 * (dq[0] + dq[1] / 2. - y[0] * (x[0] - z[0])) / y[0] - xmax = z[-1] + 2 * (dq[-1] + dq[-2]/2. - y[-1] * (z[-1] - x[-1])) / y[-1] - if vb: print('x before: '+str(x)) - x = np.insert(x, 0, xmin) - x = np.append(x, xmax) - if vb: print('x after: '+str(x)) - y = np.insert(y, 0, epsilon) - y = np.append(y, epsilon) - return(x, y) - -def evaluate_quantiles((qs, xs), vb=True): - """ - Produces PDF values given quantile information + Produces PDF values given samples Parameters ---------- - qs: ndarray, float - CDF values - xs: ndarray, float - quantile values - vb: Boolean - print progress to stdout? + in_data: numpy.ndarray, float + samples x from the PDF + bw_method: string or scalar or callable function, optional + `scipy.stats.gaussian_kde` bandwidth methods: 'scott', 'silverman' + vb: boolean, optional + be careful and print progress to stdout? Returns ------- - (x, y): tuple, float - quantile values and corresponding PDF - - Notes - ----- - TO DO: make this use linear interpolation instead of piecewise constant + out_data: tuple, float + sorted samples x and corresponding PDF values y """ - # q = np.append(q, np.array([1.])) - # qs = np.append(np.array([0.]), q) - # norm = max(qs) - min(qs) - dq = qs[1:] - qs[:-1] - # xs = np.append(x, np.array([infty])) - # xs = np.append(np.array([-1. * infty]), x) - dx = xs[1:] - xs[:-1] + x = in_data + x.sort() + kde = sps.gaussian_kde(x, bw_method) if vb: - if not np.all(dx>0.): - print('broken delta quantile values: '+str(xs)) - assert(np.all(dx>0.)) - mx = (xs[1:] + xs[:-1]) / 2. - y = dq / dx - # print(np.dot(y, dx)) - # y *= norm - return ((mx, y)) - -def evaluate_histogram((xp, y)): - """ - Produces PDF values given histogram information - - Parameters - ---------- - xp: ndarray, float - bin endpoints - y: ndarray, float - CDFs over bins - - Returns - ------- - (x, y): tuple, float - bin midpoints and CDFs over bins + print('`qp.utils.evaluate_samples` made a KDE with bandwidth = '+str(kde.factor)) + y = kde(x) + out_data = (x, y) + return out_data - Notes - ----- - This shouldn't be necessary at all, see qp.PDF.interpolate notes - """ - x = (xp[1:] + xp[:-1]) / 2. 
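
# Note on the signatures being removed above: forms like
# `def normalize_quantiles((q, z), (x, y), vb=True)` rely on Python 2 tuple
# parameter unpacking, which Python 3 removed (PEP 3113).  The rewritten
# versions later in this diff instead take a single `in_data` tuple and
# unpack it explicitly:
def example_py3_style(in_data, vb=False):  # hypothetical name, for contrast
    (q, z) = in_data  # explicit unpacking replaces the Python 2 signature
    return q, z
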
- return((x, y)) - -def evaluate_samples(x): +def evaluate_histogram(in_data, threshold=epsilon, vb=False): """ Produces PDF values given samples Parameters ---------- - x: ndarray, float - samples from the PDF + in_data: None or tuple, numpy.ndarray, float + tuple of (n+1) bin endpoints x and (n) CDF y between endpoints + threshold: float, optional - Returns - ------- - (sx, y): tuple, float - sorted sample values and corresponding PDF values - """ - sx = np.sort(x) - # bandwidth = np.mean(sx[1:]-sx[:-1]) - kde = sps.gaussian_kde(x)# , bw_method=bandwidth) - y = kde(sx) - return ((sx, y)) - -def calculate_moment(p, N, using=None, limits=None, dx=0.01, vb=False): - """ - Calculates a moment of a qp.PDF object - - Parameters - ---------- - p: qp.PDF object - the PDF whose moment will be calculated - N: int - order of the moment to be calculated - limits: tuple of floats - endpoints of integration interval over which to calculate moments - dx: float - resolution of integration grid - vb: Boolean - print progress to stdout? + vb: boolean, optional + be careful and print progress to stdout? Returns ------- - M: float - value of the moment + out_data: tuple, float + sorted samples x and corresponding PDF values y """ - if limits is None: - limits = p.limits - if using is None: - using = p.first - # Make a grid from the limits and resolution - d = int((limits[-1] - limits[0]) / dx) - grid = np.linspace(limits[0], limits[1], d) - dx = (limits[-1] - limits[0]) / (d - 1) - # Evaluate the functions on the grid - pe = p.evaluate(grid, using=using, vb=vb)[1] - # pe = normalize_gridded(pe)[1] - # calculate the moment - grid_to_N = grid ** N - M = quick_moment(pe, grid_to_N, dx) - return M - -def quick_moment(p_eval, grid_to_N, dx): + (x, y) = in_data + dx = threshold + xs = np.zeros(2 * len(y)) + ys = xs + xs[::2] = x[:-1] + dx + xs[1::2] = x[1:] - dx + ys = np.repeat(y, 2) + xs = sandwich(xs, (x[0] - dx, x[-1] + dx)) + ys = sandwich(ys, (threshold, threshold)) + if vb: + try: + assert np.all(ys >= threshold) + except AssertionError: + print('broken self-evaluations in `qp.utils.evaluate_histogram`: '+str((xs, ys))) + assert False + out_data = (xs, ys) + return out_data + +def normalize_histogram(in_data, threshold=epsilon, vb=False): """ - Calculates a moment of an evaluated PDF + Normalizes histogram parametrizations Parameters ---------- - p_eval: numpy.ndarray, float - the values of a probability distribution - grid: numpy.ndarray, float - the grid upon which p_eval was evaluated - dx: float - the difference between regular grid points - N: int - order of the moment to be calculated + in_data: None or tuple, numpy.ndarray, float + tuple of (n+1) bin endpoints x and (n) CDF y between endpoints + threshold: float, optional + optional minimum threshold + vb: boolean, optional + be careful and print progress to stdout? Returns ------- - M: float - value of the moment + out_data: tuple, numpy.ndarray, float + tuple of input x and normalized y """ - M = np.dot(grid_to_N, p_eval) * dx - return M + if in_data is None: + return in_data + (x, y) = in_data + dx = x[1:] - x[:-1] + y[y < threshold] = threshold + y /= np.dot(y, dx) + if vb: + try: + assert np.isclose(np.dot(y, dx), 1.) 
+ except AssertionError: + print('`qp.utils.normalize_histogram`: broken integral = '+str(np.dot(y, dx))) + assert False + out_data = (x, y) + return out_data -def calculate_kl_divergence(p, q, limits=lims, dx=0.01, vb=False): +def normalize_gridded(in_data, thresholds=(epsilon, infty)): """ - Calculates the Kullback-Leibler Divergence between two qp.PDF objects. + Removes extreme values from gridded parametrizations Parameters ---------- - p: PDF object - probability distribution whose distance _from_ `q` will be calculated. - q: PDF object - probability distribution whose distance _to_ `p` will be calculated. - limits: tuple of floats - endpoints of integration interval in which to calculate KLD - dx: float - resolution of integration grid - vb: boolean - report on progress to stdout? + in_data: None or tuple, numpy.ndarray, float + tuple of points x at which function is evaluated and the PDF y at those + points + thresholds: tuple, float, optional + optional min/max thresholds for normalization Returns ------- - Dpq: float - the value of the Kullback-Leibler Divergence from `q` to `p` - - Notes - ----- - TO DO: change this to calculate_kld - TO DO: have this take number of points not dx! - """ - # Make a grid from the limits and resolution - N = int((limits[-1] - limits[0]) / dx) - grid = np.linspace(limits[0], limits[1], N) - dx = (limits[-1] - limits[0]) / (N - 1) - # Evaluate the functions on the grid and normalize - pe = p.evaluate(grid, vb=vb, norm=True) - pn = pe[1] - qe = q.evaluate(grid, vb=vb, norm=True) - qn = qe[1] - # Normalize the evaluations, so that the integrals can be done - # (very approximately!) by simple summation: - # pn = pe / np.sum(pe) - #denominator = max(np.sum(qe), epsilon) - # qn = qe / np.sum(qe)#denominator - # Compute the log of the normalized PDFs - # logquotient = safelog(pn / qn) - # logp = safelog(pn) - # logq = safelog(qn) - # Calculate the KLD from q to p - Dpq = quick_kl_divergence(pn, qn, dx=dx)# np.dot(pn * logquotient, np.ones(len(grid)) * dx) - if Dpq < 0.: - print('broken KLD: '+str((Dpq, pn, qn, dx))) - Dpq = epsilon - return Dpq - -def quick_kl_divergence(p_eval, q_eval, dx=0.01): + out_data: tuple, numpy.ndarray, float + tuple of input x and normalized y """ - Calculates the Kullback-Leibler Divergence between two evaluations of PDFs. - - Parameters - ---------- - p_eval: numpy.ndarray, float - evaluations of probability distribution whose distance _from_ `q` will be calculated - q_eval: numpy.ndarray, float - evaluations of probability distribution whose distance _to_ `p` will be calculated. - dx: float - resolution of integration grid - - Returns - ------- - Dpq: float - the value of the Kullback-Leibler Divergence from `q` to `p` + if in_data is None: + return in_data + (x, y) = in_data + y[y < thresholds[0]] = thresholds[0] + y[y > thresholds[-1]] = thresholds[-1] + out_data = (x, y) + return out_data - Notes - ----- - TO DO: change this to quick_kld +def evaluate_quantiles(in_data, threshold=epsilon, vb=False): """ - logquotient = safelog(p_eval) - safelog(q_eval) - # logp = safelog(pn) - # logq = safelog(qn) - # Calculate the KLD from q to p - Dpq = dx * np.sum(p_eval * logquotient) - return Dpq - -def calculate_rmse(p, q, limits=lims, dx=0.01, vb=False): - """ - Calculates the Root Mean Square Error between two qp.PDF objects. + Estimates PDF values given quantile information Parameters ---------- - p: PDF object - probability distribution function whose distance between its truth and the approximation of `q` will be calculated. 
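
# The rewritten evaluators in one toy example (a sketch assuming only
# numpy/scipy; all inputs are made up):
import numpy as np
from scipy import stats as sps

# evaluate_samples: a Gaussian KDE through the sorted samples
samples = np.sort(np.random.normal(size=100))
kde = sps.gaussian_kde(samples)    # bw_method=None falls back to Scott's rule
pdf_at_samples = kde(samples)

# normalize_histogram: floor the heights, then divide by sum(height * width)
endpoints = np.array([0., 0.5, 1.0, 2.0])   # n+1 bin endpoints
heights = np.array([0.2, 1.0, 0.4])         # n unnormalized heights
heights = np.clip(heights, np.finfo(float).eps, None)
heights = heights / np.dot(heights, endpoints[1:] - endpoints[:-1])
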
- q: PDF object - probability distribution function whose distance between its approximation and the truth of `p` will be calculated. - limits: tuple of floats - endpoints of integration interval in which to calculate RMS - dx: float - resolution of integration grid - vb: boolean - report on progress to stdout? + in_data: tuple, numpy.ndarray, float + tuple of CDF values iy and values x at which those CDFs are achieved + threshold: float, optional + optional minimum threshold for CDF difference + vb: boolean, optional + be careful and print progress to stdout? Returns ------- - rms: float - the value of the RMS error between `q` and `p` - - Notes - ----- - TO DO: change dx to N + out_data: tuple, numpy.ndarray, float + values xs and corresponding PDF values ys """ - # Make a grid from the limits and resolution - N = int((limits[-1] - limits[0]) / dx) - grid = np.linspace(limits[0], limits[1], N) - dx = (limits[-1] - limits[0]) / (N - 1) - # Evaluate the functions on the grid - pe = p.evaluate(grid, vb=vb)[1] - qe = q.evaluate(grid, vb=vb)[1] - # Calculate the RMS between p and q - rms = quick_rmse(pe, qe, N)# np.sqrt(dx * np.sum((pe - qe) ** 2)) - return rms - -def quick_rmse(p_eval, q_eval, N): + (iy, x) = in_data + dx = x[1:] - x[:-1] + if vb: + try: + assert np.all(dx > threshold) + except AssertionError: + print('broken quantile locations in `qp.utils.evaluate_quantiles`: '+str(x)) + assert False + diy = iy[1:] - iy[:-1] + if vb: + try: + assert np.all(diy > threshold) + except AssertionError: + print('broken CDF values in `qp.utils.evaluate_quantiles`: '+str(iy)) + assert False + y = diy / dx + (xs, ys) = evaluate_histogram((x, y), threshold=threshold, vb=vb) + if vb: print('input shape: '+str((len(x), len(y)))+', output shape: '+str((len(xs), len(ys)))) + # try: + # assert (np.all(xs > threshold) and np.all(ys > threshold)) + # except AssertionError: + # print('broken quantile self-evaluations in `qp.utils.evaluate_quantiles`: '+str((xs, ys))) + # assert False + # ys = ys[1:-1] + # xs = xs[1:-1] + # ys = sandwich(ys, (threshold, threshold)) + # x_min = xs[0] - 2 * iy[0] / y[0] + # x_max = xs[-1] + 2 * iy[-1] / y[-1] + # xs = sandwich(xs, (x_min, x_max)) + out_data = (xs[1:-1], ys[1:-1]) + return out_data + +def normalize_quantiles(in_data, threshold=epsilon, vb=False): """ - Calculates the Root Mean Square Error between two evaluations of PDFs. + Evaluates PDF from quantiles including endpoints from linear extrapolation Parameters ---------- - p_eval: numpy.ndarray, float - evaluation of probability distribution function whose distance between its truth and the approximation of `q` will be calculated. - q_eval: numpy.ndarray, float - evaluation of probability distribution function whose distance between its approximation and the truth of `p` will be calculated. - N: int - number of points at which PDFs were evaluated + in_data: tuple, numpy.ndarray, float + tuple of CDF values iy corresponding to quantiles and the points x at + which those CDF values are achieved + threshold: float, optional + optional minimum threshold for PDF + vb: boolean, optional + be careful and print progress to stdout? 
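
# How the quantile machinery above fits together, on made-up numbers:
# evaluate_quantiles differences the CDF, and normalize_quantiles pins down
# the endpoints (only numpy is assumed).
import numpy as np

iy = np.array([0.1, 0.25, 0.5, 0.75, 0.9])    # CDF values at the quantiles
x = np.array([-1.28, -0.67, 0., 0.67, 1.28])  # locations achieving them
y = (iy[1:] - iy[:-1]) / (x[1:] - x[:-1])     # PDF ~ d(CDF)/dx per interval
# The endpoint formulas x_min = xs[0] - 2 * iy[0] / ys[0] and
# x_max = xs[-1] + 2 * (1. - iy[-1]) / ys[-1] place the leftover CDF mass
# (iy[0] on the left, 1 - iy[-1] on the right) under a triangle ramping from
# 0 up to the outermost PDF value: area = base * height / 2 = iy[0] gives
# base = 2 * iy[0] / ys[0].
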
     Returns
     -------
-    rms: float
-        the value of the RMS error between `q` and `p`
+    out_data: tuple, ndarray, float
+        tuple of values x at which CDF is achieved, including extrema, and
+        normalized PDF values y at x
     """
-    # Calculate the RMS between p and q
-    rms = np.sqrt(np.sum((p_eval - q_eval) ** 2) / N)
-    return rms
-
-def make_kludge_interpolator((x, y), outside=epsilon):
+    (iy, x) = in_data
+    (xs, ys) = evaluate_quantiles((iy, x), vb=vb)
+    x_min = xs[0] - 2 * iy[0] / ys[0]
+    x_max = xs[-1] + 2 * (1. - iy[-1]) / ys[-1]
+    xs = sandwich(xs, (x_min, x_max))
+    ys = sandwich(ys, (threshold, threshold))
+    out_data = (xs, ys)
+    return out_data
+
+def make_kludge_interpolator(in_data, threshold=epsilon):
     """
     Linear interpolation by hand for debugging

     Parameters
     ----------
-    (x, y): tuple, numpy.ndarray, float
-        where interpolator is fit
-    outside: float
-        value to use outside interpolation range
+    in_data: tuple, numpy.ndarray, float
+        values x and PDF values y at which interpolator is fit
+    threshold: float, optional
+        minimum value to use outside interpolation range

     Returns
     -------
     kludge_interpolator: function
         evaluates linear interpolant based on input points
     """
+    (x, y) = in_data
     dx = x[1:] - x[:-1]
     dy = y[1:] - y[:-1]
     def kludge_interpolator(xf):
-        yf = np.ones(np.shape(xf)) * epsilon
-        for i in range(len(x)):
+        yf = np.ones(np.shape(xf)) * threshold
+        # loop over the len(x) - 1 intervals; x[i+1] and dx[i] index the
+        # interval ends, so range(len(x)) would run one past both arrays
+        for i in range(len(x) - 1):
             inside = ((xf >= x[i]) & (xf <= x[i+1])).nonzero()[0]
             yf[inside] = y[i] + (y[i+1] - y[i]) * (xf[inside] - x[i]) / dx[i]
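
# A usage sketch for the interpolator above, with hypothetical knots; per its
# docstring, make_kludge_interpolator returns the inner closure:
import numpy as np

x_knots = np.array([0., 1., 2.])
y_knots = np.array([0.5, 1.0, 0.5])
interp = make_kludge_interpolator((x_knots, y_knots))
vals = interp(np.linspace(-0.5, 2.5, 7))  # queries outside [0., 2.] stay at the threshold floor
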