From 1f733a82b3750e44d06f85f08eb3f7a4751740c4 Mon Sep 17 00:00:00 2001 From: IftachSadeh Date: Tue, 3 Apr 2018 12:01:05 +0200 Subject: [PATCH] v2.3.0 ### For users: - Changed the optimization method for generating regression PDFs. The new default method (denoted in the output as `PDF_0`) is now generated based on a simple random walk alg. The previous versions of the PDF are now denoted as `PDF_1` and `PDF_2`. While currently available, the deprecated PDFs are not guaranteed to be supported in the future. In order to derive the deprecated PDFs, set: ```python glob.annz["nPDFs"] = 3 glob.annz["addOldStylePDFs"] = True ``` - **(1)** Two new job options corresponding to `PDF_0` have been added: `max_optimObj_PDF` and `nOptimLoops` (see `README.md` and `scripts/annz_rndReg_advanced.py` for details). **(2)** The default value of `excludeRangePdfModelFit` has been changed from `0.1` to `0`. **(3)** Added several job options for plotting, to control the extent of underflow and overflow regions in the regression target: `underflowZ`, `overflowZ`, `underflowZwidth`, `overflowZwidth`, `nUnderflowBins`, `nOverflowBins`. (See `src/myANNZ.cpp` for details.) **(4)** Added a variable, `nZclosBins`, to control the number of bins used for optimization-metric calculations in regression. (See `src/myANNZ.cpp` for details.) **(5)** ROOT scripts are no longer stored by default for each plot. Set `savePlotScripts` to choose otherwise. - Added a wrapper class, which allows calling the evaluation phase for regression/classification directly from python. This can be used to integrate ANNZ directly within pipelines. The python interface is defined in `py/ANNZ.py`, with a full example given in `scripts/annz_evalWrapper.py`. (See README.md for details.) - Bug fix in a few python scripts, where the example for the `weightInp_wgtKNN` option had previously been set to numerically insignificant values. - Changed the interface to turn off colour output (see `README.md`). ### For developers: - Major revamp of the `Makefile`, including adding a step of precompilation of the shared `include/commonInclude.hpp` header. - Reorganization of shared namespaces. - Created a new `Manager` class as part of `include/myANNZ.hpp`, `src/myANNZ.cpp`. - The new random walk alg for generating regression PDFs is implemented in `ANNZ::getRndMethodBestPDF()`, which has been completely revamped. The old version of this function has been renamed to `ANNZ::getOldStyleRndMethodBestPDF()`. It is now used in order to derive `PDF_1` and `PDF_2`. - Added a wrapper class for e.g., python integration, implemented in `include/Wrapper.hpp`, `src/Wrapper.cpp` and `py/ANNZ.py`. - Completely rewrote `ANNZ::doEvalReg()` to comply with pipeline integration. Added new interfaces for regression evaluation, as implemented in `src/ANNZ_regEval.cpp`. --- CHANGELOG.md | 15 +- README.md | 35 +- examples/scripts/annz_binCls_advanced.py | 2 +- examples/scripts/annz_binCls_quick.py | 2 +- examples/scripts/annz_evalWrapper.py | 17 +- examples/scripts/annz_rndReg_advanced.py | 34 +- examples/scripts/annz_rndReg_quick.py | 6 +- include/ANNZ.hpp | 43 +- include/commonInclude.hpp | 79 +- src/ANNZ.cpp | 12 +- src/ANNZ_TMVA.cpp | 28 +- src/ANNZ_clsEval.cpp | 12 +- src/ANNZ_err.cpp | 34 +- src/ANNZ_loopCls.cpp | 6 +- src/ANNZ_loopReg.cpp | 941 +++++++++++++++++++++-- src/ANNZ_loopRegCls.cpp | 25 +- src/ANNZ_onlyKnnErr.cpp | 6 +- src/ANNZ_regEval.cpp | 59 +- src/ANNZ_train.cpp | 14 +- src/ANNZ_utils.cpp | 248 ++++-- src/OutMngr_utils.cpp | 25 +- src/myANNZ.cpp | 60 +- 22 files changed, 1360 insertions(+), 343 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14c4159..65d1af8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,20 @@ # Changelog -## Master version (20/02/2018) +## ANNZ v2.3.0 (03/04/2018) ### For users: +- Changed the optimization method for generating regression PDFs. The new default method (denoted in the output as `PDF_0`) is now generated based on a simple random walk alg. The previous versions of the PDF are now denoted as `PDF_1` and `PDF_2`. While currently available, the deprecated PDFs are not guaranteed to be supported in the future. In order to derive the deprecated PDFs, set: + ```python + glob.annz["nPDFs"] = 3 + glob.annz["addOldStylePDFs"] = True + ``` + +- **(1)** Two new job options corresponding to `PDF_0` have been added: `max_optimObj_PDF` and `nOptimLoops` (see `README.md` and `scripts/annz_rndReg_advanced.py` for details). **(2)** The default value of `excludeRangePdfModelFit` has been changed from `0.1` to `0`. **(3)** Added several job options for plotting, to control the extent of underflow and overflow regions in the regression target: `underflowZ`, `overflowZ`, `underflowZwidth`, `overflowZwidth`, `nUnderflowBins`, `nOverflowBins`. (See `src/myANNZ.cpp` for details.) **(4)** Added a variable, `nZclosBins`, to control the number of bins used for optimization-metric calculations in regression. (See `src/myANNZ.cpp` for details.) **(5)** ROOT scripts are no longer stored by default for each plot. Set `savePlotScripts` to choose otherwise. + + - Added a wrapper class, which allows calling the evaluation phase for regression/classification directly from python. This can be used to integrate ANNZ directly within pipelines. The python interface is defined in `py/ANNZ.py`, with a full example given in `scripts/annz_evalWrapper.py`. (See README.md for details.) -- Bug fix in a few `python` scripts, where the example for the `weightInp_wgtKNN` option had previously been set to numerically insignificant values. +- Bug fix in a few python scripts, where the example for the `weightInp_wgtKNN` option had previously been set to numerically insignificant values. - Changed the interface to turn off colour output (see `README.md`). @@ -16,6 +25,8 @@ - Created a new `Manager` class as part of `include/myANNZ.hpp`, `src/myANNZ.cpp`. +- The new random walk alg for generating regression PDFs is implemented in `ANNZ::getRndMethodBestPDF()`, which has been completely revamped. The old version of this function has been renamed to `ANNZ::getOldStyleRndMethodBestPDF()`. It is now used in order to derive `PDF_1` and `PDF_2`. + - Added a wrapper class for e.g., python integration, implemented in `include/Wrapper.hpp`, `src/Wrapper.cpp` and `py/ANNZ.py`. - Completely rewrote `ANNZ::doEvalReg()` to comply with pipeline integration. Added new interfaces for regression evaluation, as implemented in `src/ANNZ_regEval.cpp`. diff --git a/README.md b/README.md index 1a7aa6b..b549278 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ -# ANNZ v2.2.2 +# ANNZ v2.3.0 ## Introduction ANNZ uses both regression and classification techniques for estimation of single-value photo-z (or any regression problem) solutions and PDFs. In addition it is suitable for classification problems, such as star/galaxy classification. -ANZZ uses the [TMVA package](http://tmva.sourceforge.net/) which is based on [ROOT](https://root.cern.ch/). The current version is a completely new implementation of the original ANNZ package. +ANNZ uses the [TMVA package](http://tmva.sourceforge.net/) which is based on [ROOT](https://root.cern.ch/). The current version is a completely new implementation of the original ANNZ package. The different configurations for regression problems (such as photometric redshift estimation) are referred to as *single regression*, *randomized regression* and *binned classification*. In addition, it is possible to run ANNZ in *single classification* and *randomized classification* modes, used for general classification problems. @@ -25,7 +25,7 @@ While any of the MLMs available through TMVA may be used, ANN/BDT generally achi #### Randomized regression An ensemble of regression methods is automatically generated. The randomized MLMs differ from each other in several ways. This includes setting unique random seed initializations, as well as changing the configuration parameters of a given algorithm (e.g., number of hidden layers in an ANN), or the set of input parameters used for the training. -Once training is complete, optimization takes place. In this stage, a distribution of photo-z solutions for each galaxy is derived. A selection procedure is then applied to the ensemble of answers, choosing the subset of methods which achieve optimal performance. The selected MLMs are then folded with their respective uncertainty estimates, which are derived using a KNN-uncertainty estimator (see [Oyaizu et al, 2007](http://arxiv.org/abs/0711.0962)). A set of PDF candidates is generated, where each candidate is constructed by a different set of relative weights associated with the various MLM components. Two selection schemes which optimize the performance of the PDF candidates are currently implemented in ANNZ. In this way, the PDFs which best describe the target of the regression are chosen, resulting in two alternative PDF solutions. +Once training is complete, optimization takes place. In this stage, a distribution of photo-z solutions for each galaxy is derived. A selection procedure is then applied to the ensemble of answers, choosing the subset of methods which achieve optimal performance. The selected MLMs are then folded with their respective uncertainty estimates, which are derived using a KNN-uncertainty estimator (see [Oyaizu et al, 2007](http://arxiv.org/abs/0711.0962)). A set of PDF candidates is generated, where each candidate is constructed by a different set of relative weights associated with the various MLM components. The final products are the *best* solution out of all the randomized MLMs, the full binned PDF(s) and the weighted and un-weighted average of the PDF(s), each also having a corresponding uncertainty estimator. (More details below.) @@ -144,7 +144,7 @@ python scripts/annz_singleReg_quick.py --singleRegression --evaluate python scripts/annz_rndReg_quick.py --randomRegression --train ``` - 3. **optimize**: Using all trained MLMs, rank the different solutions by their performance metrics (bias, scatter and outlier-fractions); generate up to two types of PDF solutions using the distribution of MLM solutions, convoluted with the corresponding error estimates. Performance plots are also created. + 3. **optimize**: Using all trained MLMs, rank the different solutions by their performance metrics (bias, scatter and outlier-fractions); generate PDF solutions using the distribution of MLM solutions, convoluted with the corresponding error estimates. Performance plots are also created. ```bash python scripts/annz_rndReg_quick.py --randomRegression --optimize ``` @@ -429,16 +429,20 @@ It is advisable to run ANNZ on a batch farm, especially during the training phas ### Python pipeline integration -- Nominally, the input data to ANNZ is ingested from a source file (e.g., an ascii or ROOT file). For evaluation, there is also the option to call ANNZ on an object-by-object basis, directly from python. This can be done using a wrapper class defined in `py/ANNZ.py`. -Schematically, the steps to use the wrapper are as follows: - 1. Setup the wrapper class with some user options (the same as would be used during nominal evaluation), with a dedicated list of input parameters, defined in `inVars` (same syntax as for the `inAsciiVars` parameter): +- Nominally, the input data to ANNZ are ingested from a source file (an ascii or ROOT file). For evaluation, there is also the option to call ANNZ on an object-by-object basis, directly from python. This can be done using a wrapper class defined in `py/ANNZ.py`. + +- A full example is given in `scripts/annz_evalWrapper.py`. Schematically, the steps to use the wrapper are as follows: + + 1. Setup the wrapper class with some user options (the same as would be used during nominal evaluation). Add a dedicated list of input parameters, defined in `inVars` (using the same syntax as for the `inAsciiVars` parameter): ```python + from py.ANNZ import ANNZ opts = dict() opts['doRegression'] = True opts["inVars"] = "F:MAG_U;F:MAGERR_U;F:MAG_G;F:MAGERR_G;F:MAG_R;F:MAGERR_R;F:MAG_I;F:MAGERR_I;F:MAG_Z;F:MAGERR_Z" ... annz = ANNZ(opts) ``` + 2. Call the evaluation function of the wrapper, providing values for the predefined set of inputs: ```python input = { @@ -447,21 +451,20 @@ Schematically, the steps to use the wrapper are as follows: output = annz.eval(input) ``` where `output` is a dict containing the evaluation results. + 3. Call the cleanup function of the wrapper once done (for a graceful release of resources): ```python annz.cleanup() ``` - A full example is given in `scripts/annz_evalWrapper.py`. - -- Step 1. may take a bit of time, as MLM estimators and ROOT trees are being loaded on the `C++` side; it should be done once at startup. Step 2. is quick and may be called with little overhead. It can e.g., be integrated as part of a python loop. The wrapper object should remain valid throughout the life cycle of the pipeline, in order to keep the `C++` resources booked. -- `py/ANNZ.py` is implemented with a thread lock, which allows multiple instances to be run concurrently (e.g., for different estimators with different inputs). +- Step 1. (initialize) may take a bit of time, as MLM estimators and ROOT trees are being loaded on the `C++` side; it should be done once at startup. Step 2. (evaluate) is quick and may be called with little overhead. It can e.g., be integrated as part of a python loop. The wrapper object should remain valid throughout the life cycle of the pipeline, in order to keep the `C++` resources booked. +- `py/ANNZ.py` is implemented with a thread lock, which allows multiple instances to be run concurrently (e.g., for different types of estimators or for different inputs). ## The outputs of ANNZ ### Randomized regression -Two PDFs may be generated, derived by choosing a weighting for the trained MLMs, which is *most compatible* with the target value of the regression, `zTrg`. This is determined in two ways (generating two alternative PDFs), using cumulative PDF distributions; for the first PDF (`PDF_0` in the output), the cumulative distribution is based on the *truth* (`zTrg`). For the second PDF (`PDF_1` in the output), the cumulative distribution is based on the *best* MLM. For the former, a set of templates, derived from `zTrg` is used to fit the dataset. For the later, a flat distribution of the cumulator serves as the baseline. (The PDF derivation procedure is described in the paper in greater detail.) +A PDF may be generated, derived by choosing a weighting for the trained MLMs, which is *most compatible* with the target value of the regression, `zTrg`. This is determined using cumulative PDF distributions. Nominally, the cumulative distribution is based on the *truth* (`zTrg`); it is derived using a simple random walk alg, and is denoted as `PDF_0` in the output. Prior to `ANNZ v2.2.2`, two other derivations of PDFs were used, the first also based on `zTrg`, and the second on the *best* MLM. The two deprecated PDFs are not guaranteed to be supported in the future; they may currently still be generated, by setting `glob.annz["addOldStylePDFs"] = True` and `glob.annz["nPDFs"] = 2` or `3`. They are respectively denoted as `PDF_1` and `PDF_2` in the output. (The metric for deriving the various PDFs is described in the paper in greater detail.) We define the following name-tags: @@ -586,7 +589,7 @@ A few notes: - For instance, lets assume that `userWeights_train`, `userWeights_valid`, `userCuts_train` and `userCuts_valid` are not set, while `useWgtKNN = True`. In this case, the weight variables (i.e., `ANNZ_best_wgt`) would correspond exactly to the reference dataset correction factors, which are stored e.g., in `output/test_randReg_advanced/rootIn/ANNZ_KNN_wANNZ_tree_valid_0000.csv`. This naturally only holds for the training/validation dataset. For a general evaluation sample, weight variables such as `ANNZ_best_wgt` would be derived by `userWeights_train`, `userWeights_valid`, `userCuts_train` and `userCuts_valid` alone. - - The estimator weights have nothing to do with the PDF-weights discussed above. The object weights represent per-object numbers which are derived from the overall properties of an object - they can even depend on variables which are not part of the training. For instance, in the examples, one may limit the impact on training, of objects with high uncertainty on the I-band magnitude: + - The estimator weights have nothing to do with the PDF-weights discussed above. The object weights represent per-object numbers which are derived from the overall properties of an object - they can even depend on variables which are not part of the training. For instance, in the examples, one may limit the impact on training of objects with high uncertainty on the I-band magnitude: ```python glob.annz["userWeights_train"] = "(MAGERR_I > 1)/MAGERR_I + (MAGERR_I =< 1)*1" ``` @@ -674,7 +677,11 @@ A few notes: ```python glob.annz["minPdfWeight"] = 0.05 ``` - will insure that each MLM will have at least 5% relative significance in the PDF. That is, in this case, no more than 20 MLMs will be used for the PDF. + will insure that each MLM will have at least 5% relative significance in the PDF. That is, in this case, no more than 20 MLMs will be used for the PDF. (This option is only used for the two deprecated PDFs, and my be removed in the future.) + + - **`max_optimObj_PDF` -** may be used to limit the number of objects from the training sample to use as part of the random walk alg for deriving `PDF_0`. This should only be set if the optimization stage takes too long to run. + + - **`nOptimLoops` -** may be used to change the maximal number of steps taken by the random walk alg deriving `PDF_0`. Note that the random walk alg will likely end before `nOptimLoops` steps in any case; this will happen after a pre-set number of steps, during which the solution does not improve. - **`doMultiCls`:** Using the *MultiClass* option of binned classification, multiple background samples can be trained simultaneously against the signal. This means that each classification bin acts as an independent sample during the training. The MultiClass option is only compatible with four MLM algorithms: `BDT`, `ANN`, `FDA` and `PDEFoam`. For `BDT`, only the gradient boosted decision trees are available. That is, one may set `:BoostType=Grad`, but not `:BoostType=Bagging` or `:BoostType=AdaBoost`, as part of the `userMLMopts` option. diff --git a/examples/scripts/annz_binCls_advanced.py b/examples/scripts/annz_binCls_advanced.py index ffc040b..dc8085d 100644 --- a/examples/scripts/annz_binCls_advanced.py +++ b/examples/scripts/annz_binCls_advanced.py @@ -425,7 +425,7 @@ # nPDFbins is ignored if [userPdfBins != ""]. # - userPdfBins: define a specific set of bins instead of having nPDFbins bins of euqal width. # -------------------------------------------------------------------------------------------------- - glob.annz["nPDFbins"] = 40 + glob.annz["nPDFbins"] = 100 # glob.annz["userPdfBins"] = "0.05;0.3;0.5;0.6;0.8" # -------------------------------------------------------------------------------------------------- diff --git a/examples/scripts/annz_binCls_quick.py b/examples/scripts/annz_binCls_quick.py index a429df9..1f892c4 100644 --- a/examples/scripts/annz_binCls_quick.py +++ b/examples/scripts/annz_binCls_quick.py @@ -115,7 +115,7 @@ # nPDFbins - number of PDF bins, with equal width bins between minValZ and maxValZ. (see advanced example # for setting other bin configurations) this is not directly tied to binCls_nBins -> the results of # the classification bins are cast into whatever final PDF bin scheme is defined by nPDFbins. - glob.annz["nPDFbins"] = 30 + glob.annz["nPDFbins"] = 120 # -------------------------------------------------------------------------------------------------- # optimization diff --git a/examples/scripts/annz_evalWrapper.py b/examples/scripts/annz_evalWrapper.py index 303a730..6cf5909 100644 --- a/examples/scripts/annz_evalWrapper.py +++ b/examples/scripts/annz_evalWrapper.py @@ -1,23 +1,22 @@ -# ================================================================================================== +# -------------------------------------------------------------------------------------------------- # simple example, illustrating how the evaluation python ANNZ should be setup. # instead of providing an input ascii catalogue, it can be used for # pipeline integration of ANNZ. # the generation/training/optimization(verification) stages must be # run first, as for the nominal evaluation examples (eg scripts/annz_singleReg_quick.py). -# ================================================================================================== +# -------------------------------------------------------------------------------------------------- # - the run this script, execute the following: # python annz_evalWrapper.py -# ================================================================================================== -# - an example is given for several modes of operation (uncomment one...): # -------------------------------------------------------------------------------------------------- +# - an example is given for several modes of operation (uncomment one...): +# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - testType = 'doSingleReg' # testType = 'doRandomReg' # testType = 'doBinnedCls' # testType = 'doRandomCls' # similarly for singleCls -# ================================================================================================== +# -------------------------------------------------------------------------------------------------- from py.ANNZ import ANNZ -import threading # logging module useLog = True @@ -130,7 +129,7 @@ def init(testType, opts): # - nPDFs must be lower/equal to the number used for optimization (can be zero to speed things up...) # - nPDFbins can be anything (not necessarily the same as for optimization) opts["nPDFs"] = 1 - opts["nPDFbins"] = 50 + opts["nPDFbins"] = 120 # -------------------------------------------------------------------------------------------------- # can work with sbinned-classification, eg: @@ -145,7 +144,7 @@ def init(testType, opts): # for setting other bin configurations) this is not directly tied to binCls_nBins -> the results of # the classification bins are cast into whatever final PDF bin scheme is defined by nPDFbins. # (doesnt have to be the same as for training/validation) - opts["nPDFbins"] = 30 + opts["nPDFbins"] = 160 if testType == 'doRandomCls': opts['doClassification'] = True @@ -285,5 +284,5 @@ def loadObjV(opts): # run the main function on execution of this script # -------------------------------------------------------------------------------------------------- if __name__ == '__main__': - main() + main() # -------------------------------------------------------------------------------------------------- diff --git a/examples/scripts/annz_rndReg_advanced.py b/examples/scripts/annz_rndReg_advanced.py index 77778f6..8aa14d6 100644 --- a/examples/scripts/annz_rndReg_advanced.py +++ b/examples/scripts/annz_rndReg_advanced.py @@ -298,7 +298,7 @@ # it is recomended to always use some kind of normalization, so best to leave this at the default setting. # -------------------------------------------------------------------------------------------------- if nMLMnow == 0: glob.annz["userMLMopts"] = "ANNZ_MLM=BDT:VarTransform=N:NTrees=110:NormMode=NumEvents:BoostType=AdaBoost:" - elif nMLMnow == 1: glob.annz["userMLMopts"] = "ANNZ_MLM=SVM:MaxIter=700" + elif nMLMnow == 1: glob.annz["userMLMopts"] = "ANNZ_MLM=KNN" elif nMLMnow == 2: glob.annz["userMLMopts"] = "ANNZ_MLM=BDT:NTrees=190:VarTransform=N,P" elif nMLMnow == 3: glob.annz["userMLMopts"] = genRndOpts(nMLMnow) else: glob.annz["userMLMopts"] = "" @@ -377,15 +377,35 @@ if glob.annz["doOptim"] or glob.annz["doEval"]: # -------------------------------------------------------------------------------------------------- - # nPDFs - number of PDFs (up to two PDF types are implemented for randomized regression): + # nPDFs - number of PDFs (up to three PDF types are implemented for randomized regression): # - PDFs are selected by choosing the weighting scheme which is "most compatible" with the true value. - # This is determined in two ways (generating two alternative PDFs), using cumulative distributions; for the first - # PDF (PDF_0 in the output), the cumulative distribution is based on the "truth" (the target variable, zTrg). For the second - # PDF (PDF_1 in the output), the cumulative distribution is based on the "best" MLM. + # This is determined in several ways (generating alternative PDFs), using cumulative distributions; for the first/second + # PDFs (PDF_0 and PDF_1 in the output), the cumulative distribution is based on the "truth" (the target variable, zTrg). + # For the third PDF (PDF_2 in the output), the cumulative distribution is based on the "best" MLM. # For the former, a set of templates, derived from zTrg is used to fit the dataset. For the later, # a flat distribution of the cumulator serves as the baseline. + # - nominally, only PDF_0 should be used. PDF_1/PDF_2 are depracated, and are not guaranteed to + # be supported in the future. They may currently still be generated by setting addOldStylePDFs to True # -------------------------------------------------------------------------------------------------- - glob.annz["nPDFs"] = 2 + glob.annz["nPDFs"] = 1 + + speedUpMCMC = False + if speedUpMCMC: + # max_optimObj_PDF - can be set to some number (smaller than the size of the training sample). the + # result will be to limit the number of objects used in each step of the MCMC used to derive the PDF + glob.annz["max_optimObj_PDF"] = 2500 + # nOptimLoops - set the maximal number of steps taken by the MCMC + # note that the MCMC will likely end before `nOptimLoops` steps in any case. this will happen + # after a pre-set number of steps, during which the solution does not improve. + glob.annz["nOptimLoops"] = 5000 + + # add the depricated PDFs to the output + getOldStylePdfs = False + if getOldStylePdfs: + # addOldStylePDFs - set to get the old-style PDFs (which were the default up to v2.2.2) as PDF_1 and PDF_2 + glob.annz["addOldStylePDFs"] = True + # set to 2 or 3, respectively to add [PDF_1] or [PDF_1 and PDF_2] to the output + glob.annz["nPDFs"] = 3 # -------------------------------------------------------------------------------------------------- # definition of the PDF bins - @@ -404,7 +424,7 @@ glob.annz["userPdfBins"] = "0.0;0.1;0.2;0.24;0.3;0.52;0.6;0.7;0.8" elif pdfBinsType == 1: # nPDFbins - number of PDF bins (equal width bins between minValZ and maxValZ- automatically derive pdfBinWidth) - glob.annz["nPDFbins"] = 90 + glob.annz["nPDFbins"] = 120 elif pdfBinsType == 2: # pdfBinWidth - width of each PDF bin (equal width bins between minValZ and maxValZ - automatically derive nPDFbins) glob.annz["pdfBinWidth"] = 0.01 diff --git a/examples/scripts/annz_rndReg_quick.py b/examples/scripts/annz_rndReg_quick.py index 49f3dba..da93d70 100644 --- a/examples/scripts/annz_rndReg_quick.py +++ b/examples/scripts/annz_rndReg_quick.py @@ -87,10 +87,10 @@ # -------------------------------------------------------------------------------------------------- if glob.annz["doOptim"] or glob.annz["doEval"]: - # nPDFs - number of PDFs (up to two PDF types are implemented) (see advanced example for a general description of the two PDFs) - glob.annz["nPDFs"] = 2 + # nPDFs - number of PDFs (see advanced example for a general description of PDFs) + glob.annz["nPDFs"] = 1 # nPDFbins - number of PDF bins, with equal width bins between minValZ and maxValZ. (see advanced example for setting other bin configurations) - glob.annz["nPDFbins"] = 90 + glob.annz["nPDFbins"] = 80 # -------------------------------------------------------------------------------------------------- # optimization diff --git a/include/ANNZ.hpp b/include/ANNZ.hpp index cbc8f03..3fe8a2e 100644 --- a/include/ANNZ.hpp +++ b/include/ANNZ.hpp @@ -29,7 +29,7 @@ class RegEval; */ // =========================================================================================================== class ANNZ : public BaseClass { -// ============================ +// =========================================================================================================== public: ANNZ(TString aName = "ANNZ", Utils * aUtils = NULL, OptMaps * aMaps = NULL, OutMngr * anOutMngr = NULL); ~ANNZ(); @@ -168,14 +168,19 @@ class ANNZ : public BaseClass { void getBestANNZ(map < int,vector > & zRegQnt_nANNZ, map < int,vector > & zRegQnt_bias, map < int,vector > & zRegQnt_sigma68, map < int,vector > & zRegQnt_fracSig68, vector < int > & bestMLMsV, bool onlyInclusiveBin = true); - void getRndMethodBestPDF(TTree * aChain, int bestANNZindex, vector & zRegQnt_nANNZ, vector & zRegQnt_bias, - vector & zRegQnt_sigma68, vector & zRegQnt_fracSig68, - vector < vector > & bestWeightsV, vector & hisPdfBiasCorV); void setBinClsPdfBinWeights(vector < vector < pair > > & pdfBinWgt, vector & nClsBinsIn); void getBinClsBiasCorPDF(TChain * aChain, vector & hisPdfBiasCorV); void doEvalReg(TChain * inChain = NULL, TString outDirName = "", vector * selctVarV = NULL); void doMetricPlots(TChain * inChain = NULL, vector * addPlotVarV = NULL, TString addOutputVarsIn = ""); + void getRndMethodBestPDF(TTree * aChain, int bestANNZindex, vector & zRegQnt_nANNZ, vector & zRegQnt_bias, + vector & zRegQnt_sigma68, vector & zRegQnt_fracSig68, + vector < vector > & bestWeightsV, vector & hisPdfBiasCorV); + + void getOldStyleRndMethodBestPDF(TTree * aChain, int bestANNZindex, vector & zRegQnt_nANNZ, vector & zRegQnt_bias, + vector & zRegQnt_sigma68, vector & zRegQnt_fracSig68, + vector < vector > & bestWeightsV, vector & hisPdfBiasCorV); + // ----------------------------------------------------------------------------------------------------------- // ANNZ_loopRegCls.cpp : // ----------------------------------------------------------------------------------------------------------- @@ -185,27 +190,29 @@ class ANNZ : public BaseClass { // =========================================================================================================== // private variables // =========================================================================================================== - vector < Double_t > zClos_binE, zClos_binC, zPlot_binE, zPlot_binC, zPDF_binE, zPDF_binC, zBinCls_binE, zBinCls_binC; - vector < TString > mlmTagName, mlmTagWeight, mlmTagBias, mlmTagClsVal, mlmTagIndex, mlmTagErrKNN, inputVariableV; - vector < map > mlmTagErr; - vector < vector > pdfBinNames, inErrTag; - vector < map < TString,TString> > pdfAvgNames; - map < TString,bool > mlmSkip; - vector < vector > inNamesVar, inNamesErr; vector < bool > hasBiasCorMLMinp; - vector < vector > inVarsScaleFunc; - map < TString,TCut > userCutsM; - map < TString,TString > userWgtsM, bestMLMname, mlmBaseTag; vector < time_t > trainTimeM; - // vector < pair > readerInptV; - vector < vector > readerInptIndexV; + vector < TH1* > hisClsPrbV; vector < Float_t > readerBiasInptV; + vector < Double_t > zClos_binE, zClos_binC, zPlot_binE, zPlot_binC, zPDF_binE, + zPDF_binC, zBinCls_binE, zBinCls_binC, zTrgPlot_binE, zTrgPlot_binC; + vector < TString > mlmTagName, mlmTagWeight, mlmTagBias, mlmTagClsVal, mlmTagIndex, + mlmTagErrKNN, inputVariableV; + + map < TString,bool > mlmSkip; + map < TString,TCut > userCutsM; + map < TString,TString > userWgtsM, bestMLMname, mlmBaseTag; + + vector < vector > readerInptIndexV; + vector < vector > inVarsScaleFunc; + vector < vector > pdfBinNames, inErrTag, inNamesVar, inNamesErr; + vector < map > mlmTagErr, pdfAvgNames; + vector < TMVA::Reader* > regReaders, biasReaders; vector < TMVA::Types::EAnalysisType > anlysTypes; vector < TMVA::Types::EMVA > typeMLM, allANNZtypes; map < TMVA::Types::EMVA,TString > typeToNameMLM; map < TString,TMVA::Types::EMVA > nameToTypeMLM; - vector < TH1* > hisClsPrbV; }; #endif // #define ANNZ_h @@ -216,7 +223,7 @@ class ANNZ : public BaseClass { */ // =========================================================================================================== class RegEval : public BaseClass { -// =============================== +// =========================================================================================================== public: RegEval(TString aName = "RegEval", Utils * aUtils = NULL, OptMaps * aMaps = NULL, OutMngr * anOutMngr = NULL); ~RegEval(); diff --git a/include/commonInclude.hpp b/include/commonInclude.hpp index ec324fb..ababbff 100644 --- a/include/commonInclude.hpp +++ b/include/commonInclude.hpp @@ -129,27 +129,25 @@ namespace DefOpts { // =========================================================================================================== // namespace for sorting logic functions // =========================================================================================================== -namespace sortFunctors { - - bool double_descend(double a, double b) { return (a < b); } - bool double_ascend (double a, double b) { return (a > b); } - - bool pairIntInt_descendSecond(pair a, pair b) { return (a.second < b.second); } - bool pairIntInt_ascendSecond (pair a, pair b) { return (a.second > b.second); } - bool pairIntInt_descendFirst (pair a, pair b) { return (a.first < b.first ); } - bool pairIntInt_ascendFirst (pair a, pair b) { return (a.first > b.first ); } - - bool pairIntDouble_descendSecond(pair a, pair b) { return (a.second < b.second); } - bool pairIntDouble_ascendSecond (pair a, pair b) { return (a.second > b.second); } - bool pairIntDouble_descendFirst (pair a, pair b) { return (a.first < b.first ); } - bool pairIntDouble_ascendFirst (pair a, pair b) { return (a.first > b.first ); } - - struct pairIntDouble_equalFirst { - int b; - pairIntDouble_equalFirst(int input) : b(input) { } - bool operator () (pair const& a) { return (a.first == b); } - }; - +namespace sortFunc { + bool highToLowI(int a, int b) { return (a > b); } + bool lowToHighI(int a, int b) { return (a < b); } + + bool highToLowD(double a, double b) { return (a > b); } + bool lowToHighD(double a, double b) { return (a < b); } + + namespace pairID { + // bool highToLowBy0 (pair a, pair b) { return (a.first > b.first ); } + // bool lowToHighBy0 (pair a, pair b) { return (a.first < b.first ); } + bool highToLowBy1(pair a, pair b) { return (a.second > b.second); } + bool lowToHighBy1(pair a, pair b) { return (a.second < b.second); } + + struct equalToFirst { + int b; + equalToFirst(int input) : b(input) { } + bool operator () (pair const& a) { return (a.first == b); } + }; + } } @@ -212,27 +210,26 @@ namespace Log { }; class MyLog { - public: - MyLog(); - virtual ~MyLog(); - std::ostringstream & Get(LOGtypes level = INFO); - std::ostringstream & Clean(std::string str = ""); - public: - static LOGtypes & ReportingLevel(); - static std::string ToString(LOGtypes level); - static LOGtypes FromString(const std::string& level); - protected: - std::ostringstream os; - private: - MyLog(const MyLog&); - MyLog & operator = (const MyLog &); + public: + MyLog(); + virtual ~MyLog(); + std::ostringstream & Get(LOGtypes level = INFO); + std::ostringstream & Clean(std::string str = ""); + + static LOGtypes & ReportingLevel(); + static std::string ToString(LOGtypes level); + static LOGtypes FromString(const std::string& level); + protected: + std::ostringstream os; + private: + MyLog(const MyLog&); + MyLog & operator = (const MyLog &); }; inline MyLog::MyLog(){}; inline std::ostringstream& MyLog::Get(LOGtypes level) { os << "[" << NowTime() << " " << ToString(level) << "] "; - //os << std::string(level > DEBUG ? level - DEBUG : 0, '\t'); return os; } @@ -242,7 +239,6 @@ namespace Log { } inline MyLog::~MyLog() { - //os << std::endl; fprintf(stderr, "%s", os.str().c_str()); fflush(stderr); } @@ -252,11 +248,10 @@ namespace Log { return reportingLevel; } - inline std::string MyLog::ToString(LOGtypes level) { - static const char* const buffer[] = {" ERROR", "WARNING", " INFO", " DEBUG", "DEBUG_1", "DEBUG_2", "DEBUG_3", "DEBUG_4"}; - //static const char* const buffer[] = {" ERR", "WARN", "INFO", " DBG", "DBG1", "DBG2", "DBG3", "DBG4"}; - return buffer[level]; - } + static const char* const levelV[] = { + " ERROR", "WARNING", " INFO", " DEBUG", "DEBUG_1", "DEBUG_2", "DEBUG_3", "DEBUG_4" + }; + inline std::string MyLog::ToString(LOGtypes level) { return levelV[level]; } inline LOGtypes MyLog::FromString(const std::string & level) { if (level == "DEBUG_4") return DEBUG_4; diff --git a/src/ANNZ.cpp b/src/ANNZ.cpp index 23eaf01..75f0c5e 100644 --- a/src/ANNZ.cpp +++ b/src/ANNZ.cpp @@ -31,12 +31,14 @@ // =========================================================================================================== ANNZ::ANNZ(TString aName, Utils * aUtils, OptMaps * aMaps, OutMngr * anOutMngr) :BaseClass( aName, aUtils, aMaps, anOutMngr) { -// ========================================================================== +// =========================================================================================================== Init(); return; } + +// =========================================================================================================== ANNZ::~ANNZ() { -// ============ +// =========================================================================================================== aLOG(Log::DEBUG) <(configIn))); #if ROOT_TMVA_V0 @@ -82,7 +82,7 @@ void ANNZ::prepFactory(int nMLMnow, TMVA::Configurable * configIn, bool isBiasML */ // =========================================================================================================== void ANNZ::doFactoryTrain(TMVA::Factory * factory) { -// ================================================= +// =========================================================================================================== aLOG(Log::INFO) <(factory))); @@ -90,12 +90,12 @@ void ANNZ::doFactoryTrain(TMVA::Factory * factory) { // Train MVAs using the set of training events factory->TrainAllMethods(); - // if(glob->GetOptB("testAndEvalTrainMethods")) { - // // Evaluate all MVAs using the set of test events - // cout <TestAllMethods() ... " <TestAllMethods(); - // // Evaluate and compare performance of all configured MVAs - // cout <EvaluateAllMethods() ... "<EvaluateAllMethods(); - // } + if(glob->GetOptB("testAndEvalTrainMethods")) { + // Evaluate all MVAs using the set of test events + cout <TestAllMethods() ... " <TestAllMethods(); + // Evaluate and compare performance of all configured MVAs + cout <EvaluateAllMethods() ... "<EvaluateAllMethods(); + } return; } @@ -107,7 +107,7 @@ void ANNZ::doFactoryTrain(TMVA::Factory * factory) { */ // =========================================================================================================== void ANNZ::clearReaders(Log::LOGtypes logLevel) { -// ============================================== +// =========================================================================================================== aLOG(logLevel) < & mlmSkipNow, bool needMcPRB) { -// ====================================================================== +// =========================================================================================================== aLOG(Log::INFO) <GetOptI("nMLMs"); @@ -318,7 +318,7 @@ void ANNZ::loadReaders(map & mlmSkipNow, bool needMcPRB) { */ // =========================================================================================================== double ANNZ::getReader(VarMaps * var, ANNZ_readType readType, bool forceUpdate, int nMLMnow) { -// =========================================================================================== +// =========================================================================================================== VERIFY(LOCATION,(TString)"Memory leak ?! ",(dynamic_cast(var))); VERIFY(LOCATION,(TString)"Memory leak for regReaders[nMLMnow = "+utils->intToStr(nMLMnow)+"] ?! ",(dynamic_cast(regReaders[nMLMnow]))); VERIFY(LOCATION,(TString)"unknown readType (\""+utils->intToStr((int)readType)+"\") ...",(nMLMnow < glob->GetOptI("nMLMs"))); @@ -367,7 +367,7 @@ double ANNZ::getReader(VarMaps * var, ANNZ_readType readType, bool forceUpdate, */ // =========================================================================================================== void ANNZ::setupTypesTMVA() { -// =========================== +// =========================================================================================================== TString nameNow(""); TMVA::Types::EMVA typeNow(TMVA::Types::kVariable); @@ -411,7 +411,7 @@ void ANNZ::setupTypesTMVA() { */ // =========================================================================================================== TMVA::Types::EMVA ANNZ::getTypeMLMbyName(TString typeName) { -// ========================================================== +// =========================================================================================================== bool isRealType = (nameToTypeMLM.find(typeName) != nameToTypeMLM .end()); if(isRealType) return nameToTypeMLM[typeName]; @@ -432,7 +432,7 @@ TMVA::Types::EMVA ANNZ::getTypeMLMbyName(TString typeName) { */ // =========================================================================================================== bool ANNZ::verifyXML(TString outXmlFileName) { -// =========================================== +// =========================================================================================================== bool isGoodXML = false; std::ifstream * testFile = new std::ifstream(outXmlFileName); diff --git a/src/ANNZ_clsEval.cpp b/src/ANNZ_clsEval.cpp index 01b657b..65b1dc8 100644 --- a/src/ANNZ_clsEval.cpp +++ b/src/ANNZ_clsEval.cpp @@ -30,7 +30,7 @@ */ // =========================================================================================================== void ANNZ::doEvalCls() { -// ====================== +// =========================================================================================================== aLOG(Log::INFO) <GetOptI("nMLMs"); @@ -144,7 +144,7 @@ void ANNZ::evalClsSetup() { */ // =========================================================================================================== void ANNZ::evalClsLoop() { -// ======================== +// =========================================================================================================== aLOG(Log::DEBUG_1) <GetOptI("nMLMs"); @@ -345,7 +345,7 @@ void ANNZ::evalClsLoop() { */ // =========================================================================================================== void ANNZ::evalClsWrapperSetup() { -// =============================== +// =========================================================================================================== aLOG(Log::INFO) < & trgIndexV, int nMLMnow, TCut cutsAll, TString wgtAll) { -// ============================================================================================= +void ANNZ::setupKdTreeKNN( + TChain * aChainKnn, TFile *& knnErrOutFile, TMVA::Factory *& knnErrFactory, + TMVA::Configurable *& knnErrDataLdr, TMVA::kNN::ModulekNN *& knnErrModule, + vector & trgIndexV, int nMLMnow, TCut cutsAll, TString wgtAll +) { +// =========================================================================================================== bool debug = inLOG(Log::DEBUG_2) || glob->OptOrNullB("debugErrANNZ"); if(debug) aCustomLOG("") <GetName(); @@ -456,9 +460,11 @@ void ANNZ::cleanupKdTreeKNN(TFile *& knnErrOutFile, TMVA::Factory *& knnErrFacto * @param zErrV - vector to hold negative/average/positive error estimates for each MLM. */ // =========================================================================================================== -void ANNZ::getRegClsErrKNN(VarMaps * var, TMVA::kNN::ModulekNN * knnErrModule, vector & trgIndexV, - vector & nMLMv, bool isREG, vector < vector > & zErrV) { -// =============================================================================================== +void ANNZ::getRegClsErrKNN( + VarMaps * var, TMVA::kNN::ModulekNN * knnErrModule, vector & trgIndexV, + vector & nMLMv, bool isREG, vector < vector > & zErrV +) { +// =========================================================================================================== aLOG(Log::DEBUG_3) < * zErrV) { -// ============================================================================================================ +double ANNZ::getRegClsErrINP( + VarMaps * var, bool isREG, int nMLMnow, UInt_t * seedP, vector * zErrV +) { +// =========================================================================================================== VERIFY(LOCATION,(TString)"Memory leak ?! ",(dynamic_cast(var))); aLOG(Log::DEBUG_3) < >::iterator Itr; - Itr = find_if(sbSepFracIndexM[typeName].begin(), sbSepFracIndexM[typeName].end(), sortFunctors::pairIntDouble_equalFirst(nMLMnow)); + Itr = find_if(sbSepFracIndexM[typeName].begin(), sbSepFracIndexM[typeName].end(), sortFunc::pairID::equalToFirst(nMLMnow)); int nSbSepIndexNow(-1); double nSbSepValNow(-1); if(Itr != sbSepFracIndexM[typeName].end()) { // just a sanity check, really diff --git a/src/ANNZ_loopReg.cpp b/src/ANNZ_loopReg.cpp index fc8ce07..723ec78 100644 --- a/src/ANNZ_loopReg.cpp +++ b/src/ANNZ_loopReg.cpp @@ -179,7 +179,8 @@ void ANNZ::optimReg() { , ((int)bestWeightsV.size() == nPDFs)); for(int nPDFnow=0; nPDFnowintToStr(nMLMs) +"] ... Something is horribly wrong !!!",((int)bestWeightsV[nPDFnow].size() == nMLMs)); } @@ -307,22 +308,30 @@ void ANNZ::optimReg() { * @param aChain - Input chain, for which the metrics are calculated. */ // =========================================================================================================== -void ANNZ::fillColosureV( map < int,vector > & zRegQnt_nANNZ, map < int,vector > & zRegQnt_bias, - map < int,vector > & zRegQnt_sigma68, map < int,vector > & zRegQnt_fracSig68, TChain * aChain) { -// ========================================================================================================================================== +void ANNZ::fillColosureV( + map < int,vector > & zRegQnt_nANNZ, map < int,vector > & zRegQnt_bias, + map < int,vector > & zRegQnt_sigma68, map < int,vector > & zRegQnt_fracSig68, + TChain * aChain +) { +// =========================================================================================================== VERIFY(LOCATION,(TString)"Memory leak ?! ",(dynamic_cast(aChain))); aLOG(Log::INFO) <GetOptI("nMLMs"); int maxNobj = glob->GetOptI("maxNobj"); + double minValZ = glob->GetOptF("minValZ"); + double maxValZ = glob->GetOptF("maxValZ"); int nObjectsToWrite = glob->GetOptI("nObjectsToWrite"); TString zTrgName = glob->GetOptC("zTrg"); TString aChainName = (TString)aChain->GetName(); + int nZclosBins = glob->GetOptI("nZclosBins"); + if(nZclosBins <= 0) nZclosBins = 10; + bool optimWithMAD = glob->GetOptB("optimWithMAD"); TString quantScatterName = optimWithMAD ? (TString)"quant_MAD" : (TString)"quant_sigma_68"; TString sig68Title = optimWithMAD ? (TString)"MAD" : (TString)"sig68"; @@ -330,9 +339,59 @@ void ANNZ::fillColosureV( map < int,vector > & zRegQnt_nANNZ, map < i bool optimWithSclBias = glob->GetOptB("optimWithScaledBias"); TString biasTitle = optimWithSclBias ? (TString)"bias/(1+"+glob->GetOptC("zTrgTitle")+")" : (TString)"bias"; - int nBinsZ = (int)zClos_binC.size(); - VERIFY(LOCATION,(TString)"zClos_binC not properly initialized ?!? ",(nBinsZ > 0)); //sanity check + // ----------------------------------------------------------------------------------------------------------- + // if not defined, derive the closure bins from the quantile distribution of zTrg + // ----------------------------------------------------------------------------------------------------------- + int nBinsZ = (int)zClos_binC.size(); + if(nBinsZ == 0) { + nBinsZ = nZclosBins; + + vector fracV(nBinsZ-1), quantV(nBinsZ-1,-1); + for(int nBinNow=0; nBinNowGetName()+"_hisQuantTMP"; + drawExprs = (TString)zTrgName+">>"+hisName; + wgtCut = (TString)zTrgName+">="+utils->floatToStr(minValZ)+"&&"+zTrgName+"<="+utils->floatToStr(maxValZ); + + utils->drawTree(aChain,drawExprs,wgtCut); + + TH1 * hisQuantTMP = (TH1F*)gDirectory->Get(hisName); + hisQuantTMP->SetDirectory(0); hisQuantTMP->BufferEmpty(); + + utils->param->clearAll(); + bool gotQuants = utils->getQuantileV(fracV,quantV,hisQuantTMP); + VERIFY(LOCATION,(TString)"Could not derive quantiles from "+aChain->GetName(),gotQuants); + + zClos_binC.resize(nBinsZ,0); zClos_binE.resize(nBinsZ+1,0); + for(int nBinNow=1; nBinNowfloatToStr(zClos_binE[nBinNow]) + TString((nBinNow < nBinsZ) ? "," : ""); + } + aLOG(Log::DEBUG) < sumWeightsBin(nBinsZ,0); vector < vector > closH(nMLMs); @@ -486,10 +545,12 @@ void ANNZ::fillColosureV( map < int,vector > & zRegQnt_nANNZ, map < i * or alternatively, to use the average metrics (over all zTrg bins). */ // =========================================================================================================== -void ANNZ::getBestANNZ( map < int,vector > & zRegQnt_nANNZ, map < int,vector > & zRegQnt_bias, - map < int,vector > & zRegQnt_sigma68, map < int,vector > & zRegQnt_fracSig68, - vector < int > & bestMLMsV, bool onlyInclusiveBin) { -// =================================================================================================================== +void ANNZ::getBestANNZ( + map < int,vector > & zRegQnt_nANNZ, map < int,vector > & zRegQnt_bias, + map < int,vector > & zRegQnt_sigma68, map < int,vector > & zRegQnt_fracSig68, + vector < int > & bestMLMsV, bool onlyInclusiveBin +) { +// =========================================================================================================== aLOG(Log::INFO) <GetOptC("optimCondReg")< > & zRegQnt_nANNZ, map < int } if((int)nMLMmetricPairV.size() < minAcptANNZs) { // sort so that the smallest element is first - sort(nMLMmetricPairV.begin(),nMLMmetricPairV.end(),sortFunctors::pairIntDouble_descendSecond); + sort(nMLMmetricPairV.begin(),nMLMmetricPairV.end(),sortFunc::pairID::lowToHighBy1); for(int nAcptANNZnow=0; nAcptANNZnow<(int)nMLMmetricPairV.size(); nAcptANNZnow++) { bestMLMsV.push_back(nMLMmetricPairV[nAcptANNZnow].first); @@ -696,7 +757,7 @@ void ANNZ::getBestANNZ( map < int,vector > & zRegQnt_nANNZ, map < int if(nAcptANNZs == 0) continue; // sort so that the smallest element is first - sort(bestMLMsPairs[nBinNow][metricName].begin(),bestMLMsPairs[nBinNow][metricName].end(),sortFunctors::pairIntDouble_descendSecond); + sort(bestMLMsPairs[nBinNow][metricName].begin(),bestMLMsPairs[nBinNow][metricName].end(),sortFunc::pairID::lowToHighBy1); for(int nAcptANNZnow=0; nAcptANNZnow > & zRegQnt_nANNZ, map < int // =========================================================================================================== /** - * @brief - Generate PDF weighting schemes for randomized regression. + * @brief - Generate a PDF weighting scheme for randomized regression using a simple random walk alg. * * @details - Candidate PDFs are created by randomely generating weighting schemes, which represent different * supression powers af MLM, based on metric ranking: MLMs with better (smaller values of) metrics, have @@ -755,13 +816,785 @@ void ANNZ::getBestANNZ( map < int,vector > & zRegQnt_nANNZ, map < int * @param hisPdfBiasCorV - Vector for bias correction histograms for pdfs. */ // =========================================================================================================== -void ANNZ::getRndMethodBestPDF(TTree * aChain, int bestANNZindex, vector & zRegQnt_nANNZ, - vector & zRegQnt_bias, vector & zRegQnt_sigma68, vector & zRegQnt_fracSig68, - vector < vector > & bestWeightsV, vector & hisPdfBiasCorV) { -// ========================================================================================================= +void ANNZ::getRndMethodBestPDF( + TTree * aChain, int bestANNZindex, + vector & zRegQnt_nANNZ, vector & zRegQnt_bias, + vector & zRegQnt_sigma68, vector & zRegQnt_fracSig68, + vector < vector > & bestWeightsV, vector & hisPdfBiasCorV +) { +// =========================================================================================================== aLOG(Log::INFO) <(aChain))); + TString zTrgName = glob->GetOptC("zTrg"); + int maxNobj = glob->GetOptI("maxNobj"); + int nSmearsRnd = glob->GetOptI("nSmearsRnd"); + int nMLMs = glob->GetOptI("nMLMs"); + int nPDFbins = glob->GetOptI("nPDFbins"); + TString _typeANNZ = glob->GetOptC("_typeANNZ"); + UInt_t seed = glob->GetOptI("initSeedRnd"); if(seed > 0) seed += 98187; + int minAcptMLMsForPDFs = glob->GetOptI("minAcptMLMsForPDFs"); + int nPDFs = glob->GetOptI("nPDFs"); + double max_sigma68_PDF = glob->GetOptF("max_sigma68_PDF"); + double max_bias_PDF = glob->GetOptF("max_bias_PDF"); + double max_frac68_PDF = glob->GetOptF("max_frac68_PDF"); + TString zTrgTitle = glob->GetOptC("zTrgTitle"); + bool doBiasCorPDF = glob->GetOptB("doBiasCorPDF"); + TString intgrZtreeName = (TString)glob->GetOptC("treeName")+"_intgrZmlm"; + + // parameters for the random walk alg + double max_optimObj_PDF = (double)glob->GetOptI("max_optimObj_PDF"); + int nOptimLoops = max(100, glob->GetOptI("nOptimLoops")); + int nIntgrZsmears = 15; // sould be odd number >= 3 + int maxOptimTries = glob->GetOptI("max_staticOptimTries_PDF"); + int nSameWeightsMax = int(floor(0.5 * maxOptimTries)); + int nNoUpdateMax = 50; + double fracWeightUpdate = 0.15; + bool saveProfileHis = inLOG(Log::DEBUG_1); + + vector < double > excRange(2); + excRange[0] = glob->GetOptF("excludeRangePdfModelFit"); + excRange[1] = 1 - excRange[0]; + + int nBinsZ = (int)zClos_binC.size(); + TString MLMnameBest = getTagName(bestANNZindex); + TString MLMnameBest_e = getTagError(bestANNZindex); + + if(nPDFs == 0) { + aLOG(Log::INFO) < 0)); //sanity check + + // ----------------------------------------------------------------------------------------------------------- + // define bias-correction histograms for the pdf + // ----------------------------------------------------------------------------------------------------------- + TH2F * hisPdfTryBiasCorV(NULL); + if(doBiasCorPDF) { + TString hisName = (TString)"pdfBias"+_typeANNZ+"_nominal"; + hisPdfTryBiasCorV = new TH2F(hisName,hisName,nPDFbins,&(zPDF_binE[0]),nPDFbins,&(zPDF_binE[0])); + } + + TRandom * rnd(NULL); + vector < int > sigmToBiasV(nMLMs,0); + vector < double > weightV(nMLMs,0); + vector < pair > sigm68V, biasV; + + // ----------------------------------------------------------------------------------------------------------- + // sort all accepted MLMs by their sigm68 and bias metrics. + // ----------------------------------------------------------------------------------------------------------- + int nAcptMLMs = (int)zRegQnt_nANNZ.size(); + for(int nAcptMLMnow=0; nAcptMLMnow 0 && zRegQnt_sigma68[nAcptMLMnow] > max_sigma68_PDF) { + aLOG(Log::INFO) < it will be rejected from the PDF ..." < 0 && zRegQnt_bias[nAcptMLMnow] > max_bias_PDF) { + aLOG(Log::INFO) < it will be rejected from the PDF ..." < 0 && zRegQnt_fracSig68[nAcptMLMnow] > max_frac68_PDF) { + aLOG(Log::INFO) < it will be rejected from the PDF ..." <(zRegQnt_nANNZ[nAcptMLMnow],zRegQnt_bias [nAcptMLMnow])); + sigm68V.push_back(pair(zRegQnt_nANNZ[nAcptMLMnow],zRegQnt_sigma68[nAcptMLMnow])); + } + nAcptMLMs = (int)sigm68V.size(); + + VERIFY(LOCATION,(TString)" - found only "+utils->intToStr(nAcptMLMs)+" accepted MLMs, but requested minAcptMLMsForPDFs = " + +utils->intToStr(minAcptMLMsForPDFs)+" ...",(nAcptMLMs >= minAcptMLMsForPDFs)); + + // sort so that the smallest element is first + sort(biasV .begin(),biasV .end(),sortFunc::pairID::lowToHighBy1); + sort(sigm68V.begin(),sigm68V.end(),sortFunc::pairID::lowToHighBy1); + + // get the index conversion between the two vectors after their independent sorts + for(int nAcptMLMnow=0; nAcptMLMnow 0) { seed++; } rnd = new TRandom3(seed); + + int nNonZeroWeights(0); + bool foundGoodSetup(false); + map acceptV_sigm, acceptV_both; + + for(int nRelaxReject=0; nRelaxReject<100; nRelaxReject++) { + int minRejConst_sigm(1), maxRejConst_sigm(6), minRejConst_bias(0), maxRejConst_bias(3); + int maxAcptMLMs(0); + double rndNow(0), rejectionFrac(0); + + if(nRelaxReject > 0) { + maxRejConst_sigm -= 0.5 * nRelaxReject; + maxRejConst_bias -= 0.5 * nRelaxReject; + if(maxRejConst_sigm <= minRejConst_sigm) break; + if (maxRejConst_bias <= minRejConst_bias && maxRejConst_sigm <= minRejConst_sigm) break; + else if(maxRejConst_bias <= minRejConst_bias ) maxRejConst_bias = minRejConst_bias + 1; + + aLOG(Log::DEBUG_2) <Rndm(); + rejectionFrac = 0.1 * (minRejConst_sigm + int(ceil(rndNow * (maxRejConst_sigm-minRejConst_sigm)))); + maxAcptMLMs = static_cast(floor(nAcptMLMs * (1-rejectionFrac))); + + // set the flag for accepted/rejected methods + nNonZeroWeights = 0; + for(int nAcptMLMnow=0; nAcptMLMnow maxAcptMLMs) { + acceptV_sigm[nMLMnow] = false; + aLOG(Log::DEBUG_3) <= minAcptMLMsForPDFs) { + foundGoodSetup = true; break; + } + } + + // check the accepted methods by minimal bias (requiring to already also be accepted by sigma_68) + // ----------------------------------------------------------------------------------------------------------- + if(foundGoodSetup) { + foundGoodSetup = false; + + for(int nRjctFarcNow=0; nRjctFarcNow<100; nRjctFarcNow++) { + rndNow = rnd->Rndm(); + rejectionFrac = 0.1 * (minRejConst_bias + int(ceil(rndNow * (maxRejConst_bias-minRejConst_bias)))); + maxAcptMLMs = static_cast(floor(nAcptMLMs * (1-rejectionFrac))); + + // set the flag for accepted/rejected methods + nNonZeroWeights = 0; + for(int nAcptMLMnow=0; nAcptMLMnow maxAcptMLMs) { + acceptV_both[nMLMnow] = false; + aLOG(Log::DEBUG_3) <= minAcptMLMsForPDFs) { + foundGoodSetup = true; + break; + } + } + } + + if(foundGoodSetup) break; + + // if didn't break, than could not find a combination of MLMs with good metrics under the current restrictions + aLOG(Log::DEBUG_1) <intToStr(minAcptMLMsForPDFs) + +" accepted methods ... Something is horribly wrong !!! Try setting a lower value" + +" of minAcptMLMsForPDFs or increasing the number of trained MLMs ...",foundGoodSetup); + + + map < TString,bool > mlmSkipPdf; + for(int nMLMnow=0; nMLMnow > tmpWeightV; + for(int nAcptMLMnow=0; nAcptMLMnow(nMLMnow,weightNow)); + } + int nAcptWgts = (int)tmpWeightV.size(); + + // sort so that the largest element is first + sort(tmpWeightV.begin(),tmpWeightV.end(),sortFunc::pairID::highToLowBy1); + + // now that we have the right ranking of MLM weiights, set the values to by fixed scale + sumWeights = 0; + weightV.resize(nMLMs,0); + for(int nAcptMLMnow=0; nAcptMLMnow EPS) { + acceptedMLMs += (TString)coutPurple+getTagName(nMLMnow)+coutGreen+", "; + } + else { + rejectedMLMs += (TString)coutYellow+getTagName(nMLMnow)+coutGreen+", "; + } + } + aLOG(Log::DEBUG) < EPS) tmpWeightV.push_back(pair(nMLMnow,weightV[nMLMnow])); + } + + // sort so that the largest element is first + sort(tmpWeightV.begin(),tmpWeightV.end(),sortFunc::pairID::highToLowBy1); + + TString weightMsg(""); + for(int nAcptMLMnow=0; nAcptMLMnow<(int)tmpWeightV.size(); nAcptMLMnow++) { + TString MLMname = getTagName(tmpWeightV[nAcptMLMnow].first); + weightMsg += (TString)coutGreen+MLMname+":"+coutYellow+utils->floatToStr(tmpWeightV[nAcptMLMnow].second,"%1.3f")+" "; + } + aLOG(Log::INFO) < 0) { seed++; } rnd = new TRandom3(seed); + + // define a VarMaps to loop over the chain + TString aChainName = aChain->GetName(); + VarMaps * var_0 = new VarMaps(glob,utils,(TString)"inputTreeVars_"+aChainName); + var_0->connectTreeBranches(aChain); + + double pdfObjFrac(-1); + if(max_optimObj_PDF > 0) { + double nEntriesChain = aChain->GetEntries(); + pdfObjFrac = min((max_optimObj_PDF/nEntriesChain), 1.); + } + + for(int nMLMnow=0; nMLMnowSetBranchStatus(getTagIndex(nMLMnow),0); + } + + VarMaps * var_1 = new VarMaps(glob,utils,(TString)"vars_"+intgrZtreeName); + for(int nMLMnow=0; nMLMnowNewVarF(MLMname); + } + var_1->NewVarF("eventWeight"); + var_1->NewVarF(zTrgName); + + TTree * outTree = new TTree(intgrZtreeName,intgrZtreeName); + outTree->SetDirectory(0); outputs->TreeMap[intgrZtreeName] = outTree; + + var_1->createTreeBranches(outTree); + + // ----------------------------------------------------------------------------------------------------------- + // loop on the tree + // ----------------------------------------------------------------------------------------------------------- + bool breakLoop(false); + int nObjectsToWrite(glob->GetOptI("nObjectsToWrite")); + var_0->clearCntr(); + for(Long64_t loopEntry=0; true; loopEntry++) { + if(!var_0->getTreeEntry(loopEntry)) breakLoop = true; + + if((var_0->GetCntr("nObj")+1 % nObjectsToWrite == 0) || breakLoop) { var_0->printCntr(aChainName,Log::DEBUG_1); } + if(breakLoop) break; + + if(pdfObjFrac > 0) { + if(rnd->Rndm() > pdfObjFrac) continue; + } + var_0->IncCntr("nObj"); if(var_0->GetCntr("nObj") == maxNobj) breakLoop = true; + + double zTrg = var_0->GetVarF(zTrgName); + + // ----------------------------------------------------------------------------------------------------------- + // loop over all MLMs + // ----------------------------------------------------------------------------------------------------------- + double eventWeight(0); + for(int nMLMnow=0; nMLMnowGetVarF(MLMname); + double errN = var_0->GetVarF(MLMname_eN); if(errN < EPS) continue; + double errP = var_0->GetVarF(MLMname_eP); if(errP < EPS) continue; + double weightNow = var_0->GetVarF(MLMname_w); if(weightNow < EPS) continue; + + eventWeight += weightNow; + + double intgrZmlm(0); + for(int nSmearNow=0; nSmearNow 0) { + int signNow = (nSmearNow % 2 == 0) ? 1 : -1; + double errNow = (nSmearNow % 2 == 0) ? errP : errN; + sfNow = signNow * fabs(rnd->Gaus(0,errNow)); + } + double zReg = regNow + sfNow; + if(zReg < zTrg) intgrZmlm += 1; + } + intgrZmlm /= double(nIntgrZsmears); + var_1->SetVarF(MLMname, intgrZmlm); + } + eventWeight /= double(nMLMs); + var_1->SetVarF("eventWeight",eventWeight); + var_1->SetVarF(zTrgName,zTrg); + + var_1->fillTree(); + } + if(!breakLoop) { var_0->printCntr(aChainName,Log::DEBUG_1); } + + DELNULL(rnd); + + outputs->WriteOutObjects(false,true); outputs->ResetObjects(); + + DELNULL(var_1); + DELNULL(outTree); outputs->TreeMap.erase(intgrZtreeName); + + + // ----------------------------------------------------------------------------------------------------------- + // use the integral-metric with various MLM combinations to compare pdf candidates + // ----------------------------------------------------------------------------------------------------------- + TString intgrZfileName = (TString)glob->GetOptC("outDirNameFull")+intgrZtreeName+"*.root"; + TChain * intgrZchain = new TChain(intgrZtreeName,intgrZtreeName); intgrZchain->SetDirectory(0); intgrZchain->Add(intgrZfileName); + aLOG(Log::DEBUG) <GetEntries()<<")" + <<" from "<connectTreeBranches(intgrZchain); + + TFile * rootSaveFile(NULL); + TProfile * hisIntgrZmlm(NULL); + if(saveProfileHis) { + TString hisName("his_intgrZmlmOptim"); + hisIntgrZmlm = new TProfile(hisName,hisName,nPDFbins,&(zPDF_binE[0])); + hisIntgrZmlm->GetXaxis()->SetTitle((TString)"C("+zTrgTitle+")"); + hisIntgrZmlm->GetYaxis()->SetTitle((TString)"1/N dN/dC("+zTrgTitle+")"); + + // make sure the name does not fall under the patter for intgrZfileName (sice it could be deleted...) + TString rootFileName = (TString)glob->GetOptC("outDirNameFull")+"intgrZmlmOptimProfileHis.root"; + rootSaveFile = new TFile(rootFileName,"RECREATE"); + } + + if(seed > 0) { seed++; } rnd = new TRandom3(seed); + + // ----------------------------------------------------------------------------------------------------------- + // perform the random walk alg + // ----------------------------------------------------------------------------------------------------------- + double avgIntgrZ(0.5); + int nNoUpdate(0), nSameWeights(0); + double varIntgrBest(std::numeric_limits::max()), varIntgrPrev(varIntgrBest); + vector < double > weightsNow(weightV), weightsPrev(weightV), weightsBest, intgrZ_valV, intgrZmlm_nEvtV; + + vector < int > updateIndexV; + for(int nMLMnow=0; nMLMnow::iterator indexItr = updateIndexV.begin(); + int nWgtsIn = updateIndexV.size(); + int nRnds0(0); + if (nWgtsIn < 10) nRnds0 = 1; + else if(nWgtsIn < 20) nRnds0 = 5; + else if(nWgtsIn < 40) nRnds0 = 10; + else nRnds0 = floor(nWgtsIn * 0.2); + + for(int nLoops=0; nLoopsReset(); + + // ----------------------------------------------------------------------------------------------------------- + // calculate the optimization metric for this set of PDF weights from the entire dataset + // ----------------------------------------------------------------------------------------------------------- + for(Long64_t loopEntry=0; true; loopEntry++) { + if(!var_1->getTreeEntry(loopEntry)) break; + + double zTrg = var_1->GetVarF(zTrgName); + double eventWeight = var_1->GetVarF("eventWeight"); + int nPdfBinNow = getBinZ(zTrg, zPDF_binE); if(nPdfBinNow < 0) continue; + + double intgrZ(0); + for(int nMLMnow=0; nMLMnowGetVarF(MLMname); + intgrZ += intgrZmlm * weightsNow[nMLMnow]; + } + + if(intgrZ >= excRange[0] && intgrZ <= excRange[1]) { + if(canPrint && saveProfileHis) hisIntgrZmlm->Fill(zTrg,intgrZ,eventWeight); + + intgrZ_valV [nPdfBinNow] += intgrZ * eventWeight; + intgrZmlm_nEvtV[nPdfBinNow] += eventWeight; + } + } + + // ----------------------------------------------------------------------------------------------------------- + // derive the final metric as the RMS around avgIntgrZ + // ----------------------------------------------------------------------------------------------------------- + double varIntgrNow(0), nBinsIn(0); + for(int nPdfBinNow=0; nPdfBinNow EPS) ? sqrt(varIntgrNow/nBinsIn) : 0; + + // check if the current solution is better then the previous one (or the best so far) + bool isBest = varIntgrNow < varIntgrBest; + bool isBetter = varIntgrNow < varIntgrPrev; + + bool canPrintNew(false); + if(isBest) { + if(varIntgrBest < EPS) canPrintNew = true; + else canPrintNew = ((varIntgrBest - varIntgrNow)/varIntgrBest > 0.01); + } + + // ----------------------------------------------------------------------------------------------------------- + // output for the user to validate that things are going well + // ----------------------------------------------------------------------------------------------------------- + if(canPrint || canPrintNew) { + weightMsg = ""; + for(int nMLMnow=0; nMLMnowfloatToStr(weightsNow[nMLMnow],"%1.3f")+" "; + } + + TString msgHead = (TString)(isBest ? " - NEW: " : " - nTry: "); + TString msgCol = (TString)(isBest ? coutGreen : coutYellow); + aLOG(Log::INFO) <floatToStr(varIntgrBest,"%1.5e") + <floatToStr(varIntgrPrev,"%1.5e") + <floatToStr(varIntgrNow,"%1.5e") + <SetTitle((TString)(TString)"RMS[C("+zTrgTitle+")] = "+utils->floatToStr(varIntgrNow,"%1.5e")); + hisIntgrZmlm->Write((TString)hisIntgrZmlm->GetName()+"_"+utils->intToStr(nLoops)); + } + } + + // ----------------------------------------------------------------------------------------------------------- + // update the best weights if needed + // ----------------------------------------------------------------------------------------------------------- + if(isBest) { + varIntgrBest = varIntgrNow; + weightsBest = weightsNow; + nSameWeights = 0; + } + else nSameWeights++; + + // ----------------------------------------------------------------------------------------------------------- + // go back to the best solution if there has been no improvement for a while + // finish if there has been no improvement for a long time + // ----------------------------------------------------------------------------------------------------------- + if(nSameWeights % nSameWeightsMax == 0) weightsPrev = weightsBest; + if(nSameWeights >= maxOptimTries) break; + + // ----------------------------------------------------------------------------------------------------------- + // continue with this set of weights if the result is better, or if forceUpdate + // ----------------------------------------------------------------------------------------------------------- + bool forceUpdate = (nNoUpdate >= nNoUpdateMax) || (rnd->Rndm() < fracWeightUpdate); + if(isBetter || forceUpdate) { + nNoUpdate = 0; + weightsPrev = weightsNow; + varIntgrPrev = varIntgrNow; + } + else { + nNoUpdate++; + weightsNow = weightsPrev; + } + + // ----------------------------------------------------------------------------------------------------------- + // change one or more of the weights for the next iteration + // ----------------------------------------------------------------------------------------------------------- + int nRnds(nRnds0); + if(rnd->Rndm() < 0.5) nRnds = max(nWgtsIn, nRnds + int(ceil(0.2 * rnd->Rndm() * nWgtsIn))); + for(int nRndNow=0; nRndNowRndm(); + if(weightsNow[*indexItr] >= 0 && weightsNow[*indexItr] <= 1) break; + } + + indexItr++; + if(indexItr == updateIndexV.end()) { indexItr = updateIndexV.begin(); } + } + + // normalize the new weights + double sumWeights(0); + for(int nMLMnow=0; nMLMnow EPS) tmpWeightV.push_back(pair(nMLMnow,weightsBest[nMLMnow])); + } + + // sort so that the largest element is first + sort(tmpWeightV.begin(),tmpWeightV.end(),sortFunc::pairID::highToLowBy1); + + weightMsg = ""; + for(int nAcptMLMnow=0; nAcptMLMnow<(int)tmpWeightV.size(); nAcptMLMnow++) { + TString MLMname = getTagName(tmpWeightV[nAcptMLMnow].first); + weightMsg += (TString)coutGreen+MLMname+":"+coutYellow+utils->floatToStr(tmpWeightV[nAcptMLMnow].second,"%1.3f")+" "; + } + + aLOG(Log::INFO) <floatToStr(varIntgrBest,"%1.5e")< 0) { seed++; } rnd = new TRandom3(seed); + + breakLoop = false; var_0->clearCntr(); + for(Long64_t loopEntry=0; true; loopEntry++) { + if(!var_0->getTreeEntry(loopEntry)) breakLoop = true; + + if((var_0->GetCntr("nObj")+1 % nObjectsToWrite == 0) || breakLoop) { var_0->printCntr(aChainName,Log::DEBUG_1); } + if(breakLoop) break; + + var_0->IncCntr("nObj"); if(var_0->GetCntr("nObj") == maxNobj) breakLoop = true; + + // get the target value and smear the reg-value of the best MLM + double zTrg = var_0->GetVarF(zTrgName); + + // loop over all MLMs + for(int nMLMnow=0; nMLMnowGetVarF(MLMname); + double errN = var_0->GetVarF(MLMname_eN); if(errN < EPS) continue; + double errP = var_0->GetVarF(MLMname_eP); if(errP < EPS) continue; + double weightNow = var_0->GetVarF(MLMname_w); if(weightNow < EPS) continue; + + // generate random smearing factors for this MLM + for(int nSmearRndNow=0; nSmearRndNowGaus(0,errNow)); + double zReg = regNow + sfNow; + double weightFull = weightsBest[nMLMnow] * weightNow; + + hisPdfTryBiasCorV->Fill(zReg,zTrg,weightFull); + } + } + } + if(!breakLoop) { var_0->printCntr(aChainName,Log::DEBUG_1); } + + DELNULL(rnd); + } + DELNULL(var_0); + + // ----------------------------------------------------------------------------------------------------------- + // store the results in the vector accosible by the outside world + // ----------------------------------------------------------------------------------------------------------- + bestWeightsV.clear(); hisPdfBiasCorV.clear(); + bestWeightsV.push_back(weightsBest); hisPdfBiasCorV.push_back(hisPdfTryBiasCorV); + + // ----------------------------------------------------------------------------------------------------------- + // cleanup + // ----------------------------------------------------------------------------------------------------------- + if(saveProfileHis) { + aLOG(Log::INFO) <GetName()<Close(); DELNULL(rootSaveFile); + } + + // cleanup the transient trees we created during optimization + if(!glob->GetOptB("keepOptimTrees_randReg")) { + utils->safeRM(intgrZfileName,inLOG(Log::DEBUG)); + } + + sigm68V.clear(); biasV.clear(); sigmToBiasV.clear(); + weightV.clear(); mlmSkipPdf.clear(); updateIndexV.clear(); + weightsNow.clear(); weightsPrev.clear(); weightsBest.clear(); + excRange.clear(); intgrZ_valV.clear(); intgrZmlm_nEvtV.clear(); + tmpWeightV.clear(); + + // ----------------------------------------------------------------------------------------------------------- + // call the old-style pdf function is needed + // ----------------------------------------------------------------------------------------------------------- + if(nPDFs > 1) { + getOldStyleRndMethodBestPDF( + aChain, bestANNZindex, zRegQnt_nANNZ, zRegQnt_bias, + zRegQnt_sigma68, zRegQnt_fracSig68, bestWeightsV, hisPdfBiasCorV + ); + } + + return; +} + + +// =========================================================================================================== +/** + * @brief - Generate PDF weighting schemes for randomized regression. + * + * @details - Candidate PDFs are created by randomely generating weighting schemes, which represent different + * supression powers af MLM, based on metric ranking: MLMs with better (smaller values of) metrics, have + * larger relative weights. + * - PDFs are selected by choosing the weighting scheme which is "most compatible" with the true value. + * This is determined in two ways (generating two alternative PDFs), using cumulative distributions; for one + * PDF, the cumulative distribution is based on the "truth" (the target variable, zTrg). For the other + * PDF, the cumulative distribution is based on the "best" MLM. + * For the former, a set of templates, derived from zTrg is used to fit the dataset. For the later, + * a flat distribution of the cumulator serves as the baseline. + * + * @param aChain - Input chain, for which to do the PDF derivation. + * @param bestANNZindex - Index of the "best" MLM. + * @param zRegQnt_nANNZ - Map of vector, which is filled with the index of MLMs, corresponding to the order of the + * entries of the metrics in the other vectors (zRegQnt_sigma68 and zRegQnt_bias). + * @param zRegQnt_bias - Map of vector, which is filled with the claculated bias metric. + * @param zRegQnt_sigma68 - Map of vector, which is filled with the claculated sigma68 metric. + * @param zRegQnt_fracSig68 - Map of vector, which is filled with the claculated combined outlier-fraction metric. + * @param bestWeightsV - Vector into which the results of the PDF derivation are stored (as weights for + * each MLM). + * @param hisPdfBiasCorV - Vector for bias correction histograms for pdfs. + */ +// =========================================================================================================== +void ANNZ::getOldStyleRndMethodBestPDF( + TTree * aChain, int bestANNZindex, + vector & zRegQnt_nANNZ, vector & zRegQnt_bias, + vector & zRegQnt_sigma68, vector & zRegQnt_fracSig68, + vector < vector > & bestWeightsV, vector & hisPdfBiasCorV +) { +// =========================================================================================================== + aLOG(Log::INFO) <(aChain))); + double minValZ = glob->GetOptF("minValZ"); double maxValZ = glob->GetOptF("maxValZ"); TString zTrgName = glob->GetOptC("zTrg"); @@ -905,8 +1738,9 @@ void ANNZ::getRndMethodBestPDF(TTree * aChain, int VERIFY(LOCATION,(TString)" - found only "+utils->intToStr(nAcptMLMs)+" accepted MLMs, but requested minAcptMLMsForPDFs = " +utils->intToStr(minAcptMLMsForPDFs)+" ...",(nAcptMLMs >= minAcptMLMsForPDFs)); - sort(sigm68V.begin(),sigm68V.end(),sortFunctors::pairIntDouble_descendSecond); // sort so that the smallest element is first - sort(biasV.begin(), biasV.end(), sortFunctors::pairIntDouble_descendSecond); // sort so that the smallest element is first + // sort so that the smallest element is first + sort(sigm68V.begin(),sigm68V.end(),sortFunc::pairID::lowToHighBy1); + sort(biasV.begin(), biasV.end(), sortFunc::pairID::lowToHighBy1); // ----------------------------------------------------------------------------------------------------------- @@ -1086,7 +1920,6 @@ void ANNZ::getRndMethodBestPDF(TTree * aChain, int } if(weightSum > 0) { - // ----------------------------------------------------------------------------------------------------------- // remove weights which are smaller than minPdfWeight after normalization // ----------------------------------------------------------------------------------------------------------- @@ -1119,7 +1952,8 @@ void ANNZ::getRndMethodBestPDF(TTree * aChain, int } // now remove the smallest weight before trying again... if((int)tmpV.size() > 0) { - sort(tmpV.begin(),tmpV.end(),sortFunctors::pairIntDouble_descendSecond); // sort so that the smallest element is first + // sort so that the smallest element is first + sort(tmpV.begin(),tmpV.end(),sortFunc::pairID::lowToHighBy1); int nANNZ_min = tmpV[0].first; double weight_min = tmpV[0].second; @@ -1151,7 +1985,7 @@ void ANNZ::getRndMethodBestPDF(TTree * aChain, int // sort the vector used for plotting, and create a graph with the current weights // ----------------------------------------------------------------------------------------------------------- - sort(weightsV.begin(),weightsV.end(),sortFunctors::double_ascend); // sort so that the largest element is first + sort(weightsV.begin(),weightsV.end(),sortFunc::highToLowD); vector graph_X, graph_Y, graph_Xerr, graph_Yerr; @@ -1459,7 +2293,8 @@ void ANNZ::getRndMethodBestPDF(TTree * aChain, int fitParMap.clear(); } - sort(fitMetric_nPDF.begin(),fitMetric_nPDF.end(),sortFunctors::pairIntDouble_descendSecond); // sort so that the smallest element is first + // sort so that the smallest element is first + sort(fitMetric_nPDF.begin(),fitMetric_nPDF.end(),sortFunc::pairID::lowToHighBy1); vector < TH1* > hisIntgrZregFirstFewV, hisVtmp; hisVtmp = hisIntgrZregV; hisIntgrZregV.clear(); @@ -1534,7 +2369,8 @@ void ANNZ::getRndMethodBestPDF(TTree * aChain, int DELNULL(fitOpts); } - sort(fitMetric_nPDF.begin(),fitMetric_nPDF.end(),sortFunctors::pairIntDouble_descendSecond); // sort so that the smallest element is first + // sort so that the smallest element is first + sort(fitMetric_nPDF.begin(),fitMetric_nPDF.end(),sortFunc::pairID::lowToHighBy1); hisVtmp.clear(); hisVtmp = hisIntgrZtrgV; hisIntgrZtrgV.clear(); @@ -1731,9 +2567,14 @@ void ANNZ::getRndMethodBestPDF(TTree * aChain, int // ----------------------------------------------------------------------------------------------------------- // store the results in the vector accosible by the outside world // ----------------------------------------------------------------------------------------------------------- - bestWeightsV.clear(); hisPdfBiasCorV.clear(); - if(nPDFs > 0) { bestWeightsV.push_back(bestWeightsV_Ztrg); hisPdfBiasCorV.push_back(hisPdfBiasV_Ztrg); } // combination of models -> PDF_0 - if(nPDFs > 1) { bestWeightsV.push_back(bestWeightsV_Zreg); hisPdfBiasCorV.push_back(hisPdfBiasV_Zreg); } // constant fit -> PDF_1 + VERIFY(LOCATION,(TString)" - mismatch in ordering of PDFs ... Something is horribly wrong ?!?! ",(bestWeightsV.size() == 1 && hisPdfBiasCorV.size() == 1)); + + // combination of models -> PDF_0 + bestWeightsV.push_back(bestWeightsV_Ztrg); hisPdfBiasCorV.push_back(hisPdfBiasV_Ztrg); + // constant fit -> PDF_1 + if(nPDFs > 2) { + bestWeightsV.push_back(bestWeightsV_Zreg); hisPdfBiasCorV.push_back(hisPdfBiasV_Zreg); + } // ----------------------------------------------------------------------------------------------------------- // cleanup @@ -1760,7 +2601,6 @@ void ANNZ::getRndMethodBestPDF(TTree * aChain, int } hisPdfTryBiasCorV.clear(); - // aLOG(Log::INFO) < > > & pdfBinWgt, vector & nClsBinsIn) { -// =============================================================================================================== +void ANNZ::setBinClsPdfBinWeights( + vector < vector < pair > > & pdfBinWgt, vector & nClsBinsIn +) { +// =========================================================================================================== int nPDFbins = glob->GetOptI("nPDFbins"); int nClsBins = (int)zBinCls_binC.size(); @@ -1835,7 +2677,7 @@ void ANNZ::setBinClsPdfBinWeights(vector < vector < pair > > & pdfBi */ // =========================================================================================================== void ANNZ::getBinClsBiasCorPDF(TChain * aChain, vector & hisPdfBiasCorV) { -// ================================================================================ +// =========================================================================================================== aLOG(Log::INFO) <GetOptI("maxNobj"); @@ -1927,16 +2769,16 @@ void ANNZ::getBinClsBiasCorPDF(TChain * aChain, vector & hisPdfBiasCorV // =========================================================================================================== /** - * @brief - Create performance plots for regression. + * @brief - Create performance plots for regression. * - * @param aChain - Input chain, the result of doEvalReg(). - * @param addPlotVarV - Possible vector of MLM or variable names, which will be added to the list of solutions - * for which plots are generated. - * @param addOutputVarsIn - additional variables to be plotted. + * @param aChain - Input chain, the result of doEvalReg(). + * @param addPlotVarV - Possible vector of MLM or variable names, which will be added to the list of solutions + * for which plots are generated. + * @param addOutputVarsIn - additional variables that would get plots */ // =========================================================================================================== void ANNZ::doMetricPlots(TChain * aChain, vector * addPlotVarV, TString addOutputVarsIn) { -// ================================================================================================== +// =========================================================================================================== if(!glob->GetOptB("doPlots")) { aLOG(Log::DEBUG) < * addPlotVarV, TStri bool addMaxPDF = glob->GetOptB("addMaxPDF"); bool doStorePdfBins = glob->GetOptB("doStorePdfBins"); bool doKnnErrPlots = glob->GetOptB("doKnnErrPlots"); + int closHisN = glob->GetOptI("closHisN"); + int hisBufSize = glob->GetOptI("hisBufSize"); bool plotWithSclBias = glob->GetOptB("plotWithScaledBias"); TString biasTitle = plotWithSclBias ? (TString)"#delta/(1+"+glob->GetOptC("zTrgTitle")+")" : (TString)"#delta"; + int errTrgHisN = 40; if(!doStorePdfBins) { aLOG(Log::INFO) < Will skip ANNZ::doMetricPlots()"< * addPlotVarV, TStri TString userWgtFrm("userWgtFrm"); // some internal unique name TString hisName(""); - int nBinsZ((int)zPlot_binC.size()), maxSigmaRelErrToPlot(10); - int closHisN(glob->GetOptI("closHisN")), nDrawBins_zTrg(glob->GetOptI("nDrawBins_zTrg")); - int hisBufSize(glob->GetOptI("hisBufSize")), errTrgHisN(40); + int nBinsZ((int)zPlot_binC.size()), nBinsZtrg((int)zTrgPlot_binC.size()), maxSigmaRelErrToPlot(10); // single-estimator // ----------------------------------------------------------------------------------------------------------- @@ -2100,6 +2943,10 @@ void ANNZ::doMetricPlots(TChain * aChain, vector * addPlotVarV, TStri } } } + // sort so that the smallest element is first + sort(nMLMsChain.begin(),nMLMsChain.end(),sortFunc::lowToHighI); + sort(nPDFsChain.begin(),nPDFsChain.end(),sortFunc::lowToHighI); + // ----------------------------------------------------------------------------------------------------------- // store the list of all MLMs, average PDF solutions and PDF bins, together with corresponding errors, @@ -2298,7 +3145,7 @@ void ANNZ::doMetricPlots(TChain * aChain, vector * addPlotVarV, TStri TString drawExprs = (TString)plotVars[nTypeBinNow]+">>"+hisName; bool useQuantileBins = (find(noQuantileBinsV.begin(),noQuantileBinsV.end(), plotVars[nTypeBinNow]) == noQuantileBinsV.end()); - int nEvtPass = utils->drawTree(aChain,drawExprs,treeCuts); + int nEvtPass = utils->drawTree(aChain,drawExprs,treeCuts); double minVal(1), maxVal(-1); if(nEvtPass > 0) { @@ -2483,13 +3330,13 @@ void ANNZ::doMetricPlots(TChain * aChain, vector * addPlotVarV, TStri if(nTypeBinNow < 2) { hisName = (TString)"regTrgZ_"+typeName+typeBinZ; - his_regTrgZ[typeName][nTypeBinNow] = new TH1F(hisName,hisName,nDrawBins_zTrg,minValZ,maxValZ); + his_regTrgZ[typeName][nTypeBinNow] = new TH1F(hisName,hisName,nBinsZtrg,&(zTrgPlot_binE[0])); his_regTrgZ[typeName][nTypeBinNow]->SetTitle(hisTitle); } } hisName = (TString)"corRegTrgZ_"+typeName; - his_corRegTrgZ[typeName] = new TH2F(hisName,hisName,nDrawBins_zTrg,minValZ,maxValZ,nDrawBins_zTrg,minValZ,maxValZ); + his_corRegTrgZ[typeName] = new TH2F(hisName,hisName,nBinsZtrg,&(zTrgPlot_binE[0]),nBinsZtrg,&(zTrgPlot_binE[0])); his_corRegTrgZ[typeName]->SetTitle(hisTitle); his_corRegTrgZ[typeName]->GetXaxis()->SetTitle(zTrgTitle); his_corRegTrgZ[typeName]->GetYaxis()->SetTitle(zRegTitle); @@ -2523,8 +3370,7 @@ void ANNZ::doMetricPlots(TChain * aChain, vector * addPlotVarV, TStri double zRegE = var->GetVarF(nameV_MLM_e[nMLMinNow]); double zRegW = var->GetVarF(nameV_MLM_w[nMLMinNow]) * usrWgt; if(zRegW < EPS) continue; - // if(zRegV < minValZ || zRegV > maxValZ) continue; - zRegV = min(max(zRegV,minValZ),maxValZ); + // zRegV = min(max(zRegV,minValZ),maxValZ); his_regTrgZ [typeName][0]->Fill(zTrg, zRegW); his_regTrgZ [typeName][1]->Fill(zRegV, zRegW); @@ -3086,8 +3932,7 @@ void ANNZ::doMetricPlots(TChain * aChain, vector * addPlotVarV, TStri } TH1 * his1(NULL); - double binW = (maxValZ-minValZ)/double(nDrawBins_zTrg); - TString yAxisTitle = TString::Format("Entries/%1.2g",binW); + TString yAxisTitle("Entries/bin"); if((int)hisRegTrgV.size() == 0) { his1 = (TH1F*)his_regTrgZ[typeName][0]->Clone((TString)his_regTrgZ[typeName][0]->GetName()+"_cln_"+nDrawNowStr); diff --git a/src/ANNZ_loopRegCls.cpp b/src/ANNZ_loopRegCls.cpp index 32b26d4..3834dc7 100644 --- a/src/ANNZ_loopRegCls.cpp +++ b/src/ANNZ_loopRegCls.cpp @@ -22,7 +22,7 @@ */ // =========================================================================================================== void ANNZ::Eval() { -// ================ +// =========================================================================================================== aLOG(Log::INFO) < optNames; @@ -488,7 +488,7 @@ void ANNZ::makeTreeRegClsAllMLM() { */ // =========================================================================================================== void ANNZ::makeTreeRegClsOneMLM(int nMLMnow) { -// ============================================ +// =========================================================================================================== aLOG(Log::INFO) <(hisSig) && dynamic_cast(hisBck))); int width(15); @@ -933,7 +933,7 @@ double ANNZ::getSeparation(TH1 * hisSig, TH1 * hisBck) { */ // =========================================================================================================== void ANNZ::deriveHisClsPrb(int nMLMnow) { -// ====================================== +// =========================================================================================================== aLOG(Log::DEBUG) <prb histogram ... "< * chainFriendFileNameV, - vector * acceptV, vector * rejectV, TCut aCut, - vector< pair > * addFormV) { -// ========================================================================== +TChain * ANNZ::mergeTreeFriends( + TChain * aChain, TChain * aChainFriend, + vector * chainFriendFileNameV, vector * acceptV, + vector * rejectV, TCut aCut, + vector< pair > * addFormV +) { +// =========================================================================================================== aLOG(Log::INFO) <(aChain))); @@ -1270,7 +1273,7 @@ TChain * ANNZ::mergeTreeFriends(TChain * aChain, TChain * aChainFriend, vector(aChain))); VERIFY(LOCATION,(TString)"Memory leak ?! ",(dynamic_cast(aChain->GetFile()))); aLOG(Log::INFO) <GetName()<GetOptI("nMLMnow"); @@ -122,7 +122,7 @@ void ANNZ::onlyKnnErr_createTreeErrKNN() { */ // =========================================================================================================== void ANNZ::onlyKnnErr_eval() { -// =========================== +// =========================================================================================================== aLOG(Log::INFO) <GetOptI("nMLMs"); diff --git a/src/ANNZ_regEval.cpp b/src/ANNZ_regEval.cpp index 3807f9e..789ccad 100644 --- a/src/ANNZ_regEval.cpp +++ b/src/ANNZ_regEval.cpp @@ -31,7 +31,7 @@ */ // =========================================================================================================== void ANNZ::doEvalReg(TChain * inChain, TString outDirName, vector * selctVarV) { -// ======================================================================================= +// =========================================================================================================== aLOG(Log::INFO) < * se */ // =========================================================================================================== void ANNZ::evalRegSetup() { -// ======================== +// =========================================================================================================== aLOG(Log::DEBUG_1) <GetOptC("MLMsToStore"); @@ -359,7 +359,7 @@ void ANNZ::evalRegSetup() { */ // =========================================================================================================== void ANNZ::evalRegLoop() { -// ======================= +// =========================================================================================================== aLOG(Log::DEBUG_1) <GetOptC("outDirNameFull"); @@ -374,8 +374,6 @@ void ANNZ::evalRegLoop() { int nPDFs = glob->GetOptI("nPDFs"); int nPDFbins = glob->GetOptI("nPDFbins"); bool doStoreToAscii = glob->GetOptB("doStoreToAscii"); - double minValZ = glob->GetOptF("minValZ"); - double maxValZ = glob->GetOptF("maxValZ"); int nSmearsRnd = glob->GetOptI("nSmearsRnd"); double nSmearUnf = glob->GetOptI("nSmearUnf"); // and cast to double, since we divide by this later TString _typeANNZ = glob->GetOptC("_typeANNZ"); @@ -699,10 +697,10 @@ void ANNZ::evalRegLoop() { if(nLoopTypeNow == 1) { for(int nPDFnow=0; nPDFnowhisPDF_w [nPDFnow]->Reset(); aRegEval->mlmAvg_val[nPDFnow].resize(nMLMs,0); aRegEval->mlmAvg_err[nPDFnow].resize(nMLMs,0); aRegEval->mlmAvg_wgt[nPDFnow].resize(nMLMs,0); - aRegEval->hisPDF_w[nPDFnow]->Reset(); aRegEval->pdfWgtValV[nPDFnow][0] = aRegEval->pdfWgtValV[nPDFnow][1] = 0; aRegEval->pdfWgtNumV[nPDFnow][0] = aRegEval->pdfWgtNumV[nPDFnow][1] = 0; @@ -864,9 +862,12 @@ void ANNZ::evalRegLoop() { aRegEval->pdfWgtValV[nPDFnow][1] += pdfWgt; aRegEval->pdfWgtNumV[nPDFnow][1] += aRegEval->pdfWeightV[nPDFnow][nMLMnow]; - aRegEval->mlmAvg_val[nPDFnow][nMLMnow] = regVal; aRegEval->mlmAvg_err[nPDFnow][nMLMnow] = regErr; aRegEval->mlmAvg_wgt[nPDFnow][nMLMnow] = regWgt; + // input original value into the pdf before smearing + aRegEval->hisPDF_w[nPDFnow]->Fill(regVal,pdfWgt); - aRegEval->hisPDF_w[nPDFnow]->Fill(regVal,pdfWgt); // input original value into the pdf before smearing + aRegEval->mlmAvg_val[nPDFnow][nMLMnow] = regVal; + aRegEval->mlmAvg_err[nPDFnow][nMLMnow] = regErr; + aRegEval->mlmAvg_wgt[nPDFnow][nMLMnow] = regWgt; // generate random smearing factors for this MLM for(int nSmearRndNow=0; nSmearRndNowGaus(0,errNow)); double regSmr = regVal + sfNow; - if(regSmr < minValZ || regSmr > maxValZ) continue; - aRegEval->hisPDF_w[nPDFnow]->Fill(regSmr,pdfWgt); } } @@ -898,20 +897,19 @@ void ANNZ::evalRegLoop() { // apply the bias-correction to the pdf // ----------------------------------------------------------------------------------------------------------- if(doBiasCorPDF) { - TH1 * hisPDF_w_TMP = (TH1*)aRegEval->hisPDF_w[nPDFnow]->Clone((TString)aRegEval->hisPDF_w[nPDFnow]->GetName()+"_TMP"); + TString clnName = (TString)aRegEval->hisPDF_w[nPDFnow]->GetName()+"_TMP"; + TH1 * hisPDF_w_TMP = (TH1*)aRegEval->hisPDF_w[nPDFnow]->Clone(clnName); for(int nBinXnow=1; nBinXnowGetBinContent(nBinXnow); - if(val < aRegEval->minWeight) continue; - if(!aRegEval->hisBiasCorV[nPDFnow][nBinXnow-1]) continue; - if(aRegEval->hisBiasCorV[nPDFnow][nBinXnow-1]->Integral() < EPS) continue; + if(val < aRegEval->minWeight) continue; + if(!aRegEval->hisBiasCorV[nPDFnow][nBinXnow-1]) continue; + if( aRegEval->hisBiasCorV[nPDFnow][nBinXnow-1]->Integral() < EPS) continue; val /= nSmearUnf; for(int nSmearUnfNow=0; nSmearUnfNowhisBiasCorV[nPDFnow][nBinXnow-1]->GetRandom(); - // rndVal = min(max(rndVal,minValZ+EPS),maxValZ-EPS); - aRegEval->hisPDF_w[nPDFnow]->Fill(rndVal,val); } } @@ -1129,7 +1127,7 @@ void ANNZ::evalRegLoop() { */ // =========================================================================================================== void ANNZ::evalRegErrSetup() { -// =========================== +// =========================================================================================================== VERIFY(LOCATION,(TString)"Calling evalRegErrSetup() with no aRegEval defined ..." +" something is horribly wrong ?!?",(dynamic_cast(aRegEval))); @@ -1247,7 +1245,7 @@ void ANNZ::evalRegErrSetup() { */ // =========================================================================================================== void ANNZ::evalRegErrCleanup() { -// ============================= +// =========================================================================================================== if(!aRegEval) return; aLOG(Log::DEBUG_1) <varWrapper->printVars(); @@ -1332,8 +1330,6 @@ TString ANNZ::evalRegWrapperLoop() { int nMLMs = glob->GetOptI("nMLMs"); int nPDFs = glob->GetOptI("nPDFs"); int nPDFbins = glob->GetOptI("nPDFbins"); - double minValZ = glob->GetOptF("minValZ"); - double maxValZ = glob->GetOptF("maxValZ"); TString baseTag_v = glob->GetOptC("baseTag_v"); TString baseTag_e = glob->GetOptC("baseTag_e"); TString baseTag_w = glob->GetOptC("baseTag_w"); @@ -1494,12 +1490,13 @@ TString ANNZ::evalRegWrapperLoop() { aRegEval->pdfWgtNumV[nPDFnow][0] += 1; aRegEval->pdfWgtNumV[nPDFnow][1] += aRegEval->pdfWeightV[nPDFnow][nMLMnow]; + // input original value into the pdf before smearing + aRegEval->hisPDF_w[nPDFnow]->Fill(regVal,pdfWgt); + aRegEval->mlmAvg_val[nPDFnow][nMLMnow] = regVal; aRegEval->mlmAvg_err[nPDFnow][nMLMnow] = regErr; aRegEval->mlmAvg_wgt[nPDFnow][nMLMnow] = regWgt; - aRegEval->hisPDF_w[nPDFnow]->Fill(regVal,pdfWgt); // input original value into the pdf before smearing - // generate random smearing factors for this MLM for(int nSmearRndNow=0; nSmearRndNowGaus(0,errNow)); double regSmr = regVal + sfNow; - if(regSmr < minValZ || regSmr > maxValZ) continue; - aRegEval->hisPDF_w[nPDFnow]->Fill(regSmr,pdfWgt); } } @@ -1529,19 +1524,19 @@ TString ANNZ::evalRegWrapperLoop() { // apply the bias-correction to the pdf // ----------------------------------------------------------------------------------------------------------- if(doBiasCorPDF) { - TH1 * hisPDF_w_TMP = (TH1*)aRegEval->hisPDF_w[nPDFnow]->Clone((TString)aRegEval->hisPDF_w[nPDFnow]->GetName()+"_TMP"); + TString clnName = (TString)aRegEval->hisPDF_w[nPDFnow]->GetName()+"_TMP"; + TH1 * hisPDF_w_TMP = (TH1*)aRegEval->hisPDF_w[nPDFnow]->Clone(clnName); for(int nBinXnow=1; nBinXnowGetBinContent(nBinXnow); - if(val < aRegEval->minWeight) continue; - if(!aRegEval->hisBiasCorV[nPDFnow][nBinXnow-1]) continue; - if(aRegEval->hisBiasCorV[nPDFnow][nBinXnow-1]->Integral() < EPS) continue; + if(val < aRegEval->minWeight) continue; + if(!aRegEval->hisBiasCorV[nPDFnow][nBinXnow-1]) continue; + if( aRegEval->hisBiasCorV[nPDFnow][nBinXnow-1]->Integral() < EPS) continue; val /= nSmearUnf; for(int nSmearUnfNow=0; nSmearUnfNowhisBiasCorV[nPDFnow][nBinXnow-1]->GetRandom(); - // rndVal = min(max(rndVal,minValZ+EPS),maxValZ-EPS); aRegEval->hisPDF_w[nPDFnow]->Fill(rndVal,val); } } @@ -1652,7 +1647,7 @@ TString ANNZ::evalRegWrapperLoop() { */ // =========================================================================================================== void ANNZ::evalRegWrapperCleanup() { -// ================================= +// =========================================================================================================== aLOG(Log::DEBUG_1) <GetOptB("trainingNeeded")) return; // choose the training type @@ -43,7 +43,7 @@ void ANNZ::Train() { */ // =========================================================================================================== void ANNZ::Train_singleCls() { -// =========================== +// =========================================================================================================== aLOG(Log::INFO) <GetOptI("minObjTrainTest"); @@ -248,7 +248,7 @@ void ANNZ::Train_singleCls() { */ // =========================================================================================================== void ANNZ::Train_singleReg() { -// =========================== +// =========================================================================================================== aLOG(Log::INFO) <GetOptB("doBiasCorMLM"); @@ -550,7 +550,7 @@ void ANNZ::Train_singleReg() { */ // =========================================================================================================== void ANNZ::Train_singleRegBiasCor() { -// ================================== +// =========================================================================================================== aLOG(Log::INFO) <GetOptI("nMLMnow"); @@ -912,7 +912,7 @@ void ANNZ::Train_singleRegBiasCor() { */ // =========================================================================================================== void ANNZ::Train_binnedCls() { -// =========================== +// =========================================================================================================== aLOG(Log::INFO) <GetOptI("binCls_nTries"); @@ -1472,7 +1472,7 @@ void ANNZ::Train_binnedCls() { */ // =========================================================================================================== void ANNZ::generateOptsMLM(OptMaps * optMap, TString userMLMopts) { -// ================================================================ +// =========================================================================================================== TString type(""), opt(""); int seed = glob->GetOptI("initSeedRnd"); @@ -1642,7 +1642,7 @@ void ANNZ::generateOptsMLM(OptMaps * optMap, TString userMLMopts) { */ // =========================================================================================================== void ANNZ::verifTarget(TTree * aTree) { -// ==================================== +// =========================================================================================================== VERIFY(LOCATION,(TString)"Memory leak ?! ",(dynamic_cast(aTree))); TString zTrgName = glob->GetOptC("zTrg"); diff --git a/src/ANNZ_utils.cpp b/src/ANNZ_utils.cpp index 9b836ed..ce18b35 100644 --- a/src/ANNZ_utils.cpp +++ b/src/ANNZ_utils.cpp @@ -27,7 +27,7 @@ */ // =========================================================================================================== void ANNZ::Init() { -// ================ +// =========================================================================================================== aLOG(Log::INFO)<0)" ,(glob->GetOptI("nMLMs") > 0)); VERIFY(LOCATION,(TString)"Must set (nMLMnowGetOptI("nMLMs") > glob->GetOptI("nMLMnow"))); + double minValZ(glob->GetOptF("minValZ")), maxValZ(glob->GetOptF("maxValZ")), diffValZ(maxValZ - minValZ); + if(glob->GetOptB("doRegression") && !glob->GetOptB("doOnlyKnnErr")) { - VERIFY(LOCATION,(TString)"Must set (minValZ < maxValZ)",(glob->GetOptF("minValZ") < glob->GetOptF("maxValZ"))); + VERIFY(LOCATION,(TString)"Must set (minValZ < maxValZ)",(minValZ < maxValZ)); } + // underflow/overflow options for plotting + if(glob->GetOptI("nUnderflowBins") < 0) glob->SetOptI("nUnderflowBins", 0); + if(glob->GetOptI("nOverflowBins") < 0) glob->SetOptI("nOverflowBins", 0); + if(glob->GetOptF("underflowZwidth") < EPS) glob->SetOptF("underflowZwidth", (glob->GetOptI("nUnderflowBins") > 0) ? diffValZ * 0.1 : 0); + if(glob->GetOptF("overflowZwidth") < EPS) glob->SetOptF("overflowZwidth", (glob->GetOptI("nOverflowBins") > 0) ? diffValZ * 0.1 : 0); + if(glob->GetOptF("underflowZ") < EPS) glob->SetOptF("underflowZ", glob->GetOptF("minValZ") - glob->GetOptF("underflowZwidth")); + if(glob->GetOptF("overflowZ") < EPS) glob->SetOptF("overflowZ", glob->GetOptF("maxValZ") + glob->GetOptF("overflowZwidth")); + // ----------------------------------------------------------------------------------------------------------- // check the definition of PDF bins, and compute nPDFbins from pdfBinWidth if needed // ----------------------------------------------------------------------------------------------------------- if(!glob->GetOptB("doTrain")) { // for single-regression there is no pdf, so just set a sensible default value just in case - if (glob->GetOptB("doSingleReg")) { + if(glob->GetOptB("doSingleReg")) { glob->SetOptI("nPDFbins",1); } // for randomized regression there is the option for the user to define a bin-width instead of the number of bins else if(glob->GetOptB("doRandomReg")) { + if(glob->GetOptI("nPDFs") <= 0) glob->SetOptI("nPDFbins",1); // check if a specialized binning definition has been given by the user if(glob->GetOptC("userPdfBins") != "") { @@ -265,9 +276,8 @@ void ANNZ::Init() { else { VERIFY(LOCATION,(TString)"Configuration problem... must set either \"nPDFbins\" or \"binWidth\" ...",(binWidth > 0)); - double zDiff = glob->GetOptF("maxValZ") - glob->GetOptF("minValZ"); - int nPDFbins = static_cast(floor(0.00001+zDiff/binWidth)); - double finalBinW = zDiff/double(nPDFbins); + int nPDFbins = static_cast(floor(0.00001+diffValZ/binWidth)); + double finalBinW = diffValZ/double(nPDFbins); VERIFY(LOCATION,(TString)"Configuration problem... \"binWidth\" inconsistent with max/minValZ range ...",(nPDFbins >= 1)); @@ -316,12 +326,38 @@ void ANNZ::Init() { // number of PDF types - either generate no PDF, or choose up to two types // ----------------------------------------------------------------------------------------------------------- // currently implemented in getRndMethodBestPDF() - if (glob->GetOptB("doSingleReg")) { glob->SetOptI("nPDFs", 0 ); } - else if(glob->GetOptB("doRandomReg")) { glob->SetOptI("nPDFs", min(max(glob->GetOptI("nPDFs"),0),2) ); } + if (glob->GetOptB("doSingleReg")) { + if(glob->GetOptI("nPDFs") > 0) { + aLOG(Log::WARNING) <SetOptI("nPDFs", 0); + } + else if(glob->GetOptB("doRandomReg")) { + if(glob->GetOptI("nPDFs") == 1 && glob->GetOptB("addOldStylePDFs")) { + aLOG(Log::WARNING) < 1"<GetOptI("nPDFs") > 1 && !glob->GetOptB("addOldStylePDFs")) { + aLOG(Log::WARNING) <SetOptI("nPDFs", 1); + } + else if(glob->GetOptI("nPDFs") > 3) { + aLOG(Log::WARNING) <SetOptI("nPDFs", min(max(glob->GetOptI("nPDFs"),0),3)); + } // either the baseline PDF, or that as well as another, which includes uncertainty-smearing, see doEvalReg() - else if(glob->GetOptB("doBinnedCls")) { glob->SetOptI("nPDFs", min(max(glob->GetOptI("nPDFs"),1),2) ); } + else if(glob->GetOptB("doBinnedCls")) { + if(glob->GetOptI("nPDFs") > 2) { + aLOG(Log::WARNING) <SetOptI("nPDFs", min(max(glob->GetOptI("nPDFs"),1),2)); + } // no PDFs currently defined for classification - else { glob->SetOptI("nPDFs", 0 ); } + else { + glob->SetOptI("nPDFs", 0); + } if(glob->GetOptB("doRegression") && !glob->GetOptB("doTrain") && !glob->GetOptB("doOnlyKnnErr")) { aLOG(Log::INFO) <GetOptI("nPDFs")<GetOptC("zTrgTitle") == "") glob->SetOptC("zTrgTitle",glob->GetOptC("zTrg")); if(glob->GetOptC("zRegTitle") == "") glob->SetOptC("zRegTitle","regression value"); - if(glob->GetOptF("zClosBinWidth") < EPS) { - glob->SetOptF("zClosBinWidth", (glob->GetOptF("maxValZ")-glob->GetOptF("minValZ"))/10.); - } + // if(glob->GetOptF("zClosBinWidth") < EPS) { + // double norm = 50; + // if(glob->GetOptI("nPDFs") > 0 && glob->GetOptI("nPDFbins") > norm) norm = glob->GetOptI("nPDFbins"); + + // glob->SetOptF("zClosBinWidth", (maxValZ - minValZ)/norm); + // } if(glob->GetOptF("zPlotBinWidth") < EPS) { - glob->SetOptF("zPlotBinWidth", glob->GetOptF("zClosBinWidth")); + glob->SetOptF("zPlotBinWidth", (maxValZ - minValZ)/10.); } if(glob->GetOptI("nDrawBins_zTrg") < 2) { - glob->SetOptI("nDrawBins_zTrg", static_cast(floor(0.00001+(glob->GetOptF("maxValZ")-glob->GetOptF("minValZ"))/0.05))); + glob->SetOptI("nDrawBins_zTrg", static_cast(floor(0.00001+(maxValZ - minValZ)/0.05))); } - double zDiff = glob->GetOptF("maxValZ")-glob->GetOptF("minValZ"); - bool hasGoodBinWidthZ = ( (glob->GetOptF("zClosBinWidth") > 0 && glob->GetOptF("zClosBinWidth") <= zDiff/5.) - || (glob->GetOptF("zClosBinWidth") < 0 ) ); + bool hasGoodBinWidthZ = ( (glob->GetOptF("zClosBinWidth") > 0 && glob->GetOptF("zClosBinWidth") <= diffValZ/5.) + || (glob->GetOptF("zClosBinWidth") < 0 ) ); VERIFY(LOCATION,(TString)"Must set ((zClosBinWidth > 0) and (zClosBinWidth <= (maxValZ-minValZ)/5)) or (zClosBinWidth < 0) ...",hasGoodBinWidthZ); setInfoBinsZ(); + double excludeRange = glob->GetOptF("excludeRangePdfModelFit"); + if(excludeRange < 0 || excludeRange > 0.4) { + aLOG(Log::WARNING) <SetOptF("excludeRangePdfModelFit", 0); + } + // validate correct setting for the primary condition for optimization, and set the corresponding title if(glob->GetOptC("optimCondReg") == "fracSig68") { aLOG(Log::WARNING) <GetOptF("closHisL") - glob->GetOptF("closHisH")) < 1e-10) { - double minValZ(glob->GetOptF("minValZ")), maxValZ(glob->GetOptF("maxValZ")); - glob->SetOptF("closHisL",( (maxValZ-minValZ)/2. - 4.5 * (maxValZ-minValZ) )); - glob->SetOptF("closHisH",( (maxValZ-minValZ)/2. + 4.5 * (maxValZ-minValZ) )); + glob->SetOptF("closHisL",( (maxValZ - minValZ)/2. - 4.5 * (maxValZ - minValZ) )); + glob->SetOptF("closHisH",( (maxValZ - minValZ)/2. + 4.5 * (maxValZ - minValZ) )); } // round up closHisN, so may be divided by some factor to finally equal nDrawBins_zTrg @@ -580,7 +625,7 @@ void ANNZ::Init() { */ // =========================================================================================================== void ANNZ::setTags() { -// =================== +// =========================================================================================================== int nMLMs = glob->GetOptI("nMLMs"); int nPDFs = glob->GetOptI("nPDFs"); int nPDFbins = glob->GetOptI("nPDFbins"); @@ -650,50 +695,50 @@ void ANNZ::setTags() { // =========================================================================================================== TString ANNZ::getBaseTagName(TString MLMname) { -// ============================================ +// =========================================================================================================== VERIFY(LOCATION,(TString)"(MLMname = \""+MLMname+"\") has unsupported format",(mlmBaseTag.find(MLMname) != mlmBaseTag.end())); return mlmBaseTag[MLMname]; } // =========================================================================================================== TString ANNZ::getTagName(int nMLMnow) { -// ==================================== +// =========================================================================================================== VERIFY(LOCATION,(TString)"(nMLMnow = "+utils->intToStr(nMLMnow)+") out of range ?!?",(nMLMnow >= 0 && nMLMnow < glob->GetOptI("nMLMs"))); return mlmTagName[nMLMnow]; } // =========================================================================================================== TString ANNZ::getTagError(int nMLMnow, TString errType) { -// ====================================================== +// =========================================================================================================== VERIFY(LOCATION,(TString)"(nMLMnow = "+utils->intToStr(nMLMnow)+") out of range ?!?",(nMLMnow >= 0 && nMLMnow < glob->GetOptI("nMLMs"))); VERIFY(LOCATION,(TString)"(Unknown error-type requested (\"errType\" = "+errType+")",(mlmTagErr[nMLMnow].find(errType) != mlmTagErr[nMLMnow].end())); return mlmTagErr[nMLMnow][errType]; } // =========================================================================================================== TString ANNZ::getTagWeight(int nMLMnow) { -// ====================================== +// =========================================================================================================== VERIFY(LOCATION,(TString)"(nMLMnow = "+utils->intToStr(nMLMnow)+") out of range ?!?",(nMLMnow >= 0 && nMLMnow < glob->GetOptI("nMLMs"))); return mlmTagWeight[nMLMnow]; } // =========================================================================================================== TString ANNZ::getTagBias(int nMLMnow) { -// ==================================== +// =========================================================================================================== VERIFY(LOCATION,(TString)"(nMLMnow = "+utils->intToStr(nMLMnow)+") out of range ?!?",(nMLMnow >= 0 && nMLMnow < glob->GetOptI("nMLMs"))); return mlmTagBias[nMLMnow]; } // =========================================================================================================== TString ANNZ::getTagClsVal(int nMLMnow) { -// ====================================== +// =========================================================================================================== VERIFY(LOCATION,(TString)"(nMLMnow = "+utils->intToStr(nMLMnow)+") out of range ?!?",(nMLMnow >= 0 && nMLMnow < glob->GetOptI("nMLMs"))); return mlmTagClsVal[nMLMnow]; } // =========================================================================================================== TString ANNZ::getTagIndex(int nMLMnow) { -// ===================================== +// =========================================================================================================== VERIFY(LOCATION,(TString)"(nMLMnow = "+utils->intToStr(nMLMnow)+") out of range ?!?",(nMLMnow >= 0 && nMLMnow < glob->GetOptI("nMLMs"))); return mlmTagIndex[nMLMnow]; } // =========================================================================================================== TString ANNZ::getTagInVarErr(int nMLMnow, int nInErrNow) { -// ======================================================= +// =========================================================================================================== VERIFY(LOCATION,(TString)"(nMLMnow = " +utils->intToStr(nMLMnow) +") out of range ?!?",(nMLMnow >= 0 && nMLMnow < glob->GetOptI("nMLMs"))); VERIFY(LOCATION,(TString)"(nInErrNow = "+utils->intToStr(nInErrNow)+") out of range ?!?",(nInErrNow >= 0 && nInErrNow < (int)inErrTag[nMLMnow].size())); @@ -701,7 +746,7 @@ TString ANNZ::getTagInVarErr(int nMLMnow, int nInErrNow) { } // =========================================================================================================== TString ANNZ::getTagPdfBinName(int nPdfNow, int nBinNow) { -// ======================================================= +// =========================================================================================================== VERIFY(LOCATION,(TString)"(nPdfNow = "+utils->intToStr(nPdfNow)+") out of range ?!?",(nPdfNow >= 0 && nPdfNow < glob->GetOptI("nPDFs"))); VERIFY(LOCATION,(TString)"(nBinNow = "+utils->intToStr(nBinNow)+") out of range ?!?",(nBinNow >= 0 && nBinNow < glob->GetOptI("nPDFbins"))); @@ -709,7 +754,7 @@ TString ANNZ::getTagPdfBinName(int nPdfNow, int nBinNow) { } // =========================================================================================================== TString ANNZ::getTagPdfAvgName(int nPdfNow, TString type) { -// ======================================================== +// =========================================================================================================== VERIFY(LOCATION,(TString)"(nPdfNow = "+utils->intToStr(nPdfNow)+") out of range ?!?",(nPdfNow >= 0 && nPdfNow < glob->GetOptI("nPDFs"))); VERIFY(LOCATION,(TString)"(type = \""+type+"\") has unsupported format",(pdfAvgNames[nPdfNow].find(type) != pdfAvgNames[nPdfNow].end())); @@ -717,14 +762,14 @@ TString ANNZ::getTagPdfAvgName(int nPdfNow, TString type) { } // =========================================================================================================== TString ANNZ::getTagBestMLMname(TString type) { -// ============================================ +// =========================================================================================================== VERIFY(LOCATION,(TString)"(type = \""+type+"\") has unsupported format",(bestMLMname.find(type) != bestMLMname.end())); return bestMLMname[type]; } // =========================================================================================================== int ANNZ::getTagNow(TString MLMname) { -// =================================== +// =========================================================================================================== TString MLMnamePost(MLMname); MLMnamePost.ReplaceAll(glob->GetOptC("basePrefix"),""); VERIFY(LOCATION,(TString)"(MLMname = \""+MLMname+"\") has unsupported format",(MLMnamePost.IsDigit())); @@ -735,13 +780,13 @@ int ANNZ::getTagNow(TString MLMname) { } // =========================================================================================================== TString ANNZ::getErrKNNname(int nMLMnow) { -// ======================================= +// =========================================================================================================== VERIFY(LOCATION,(TString)"(nMLMnow = "+utils->intToStr(nMLMnow)+") out of range ?!?",(nMLMnow >= 0 && nMLMnow < glob->GetOptI("nMLMs"))); return mlmTagErrKNN[nMLMnow]; } // =========================================================================================================== int ANNZ::getErrKNNtagNow(TString errKNNname) { -// ============================================ +// =========================================================================================================== TString errKNNnamePost(errKNNname); errKNNnamePost.ReplaceAll(glob->GetOptC("baseTag_errKNN"),""); return getTagNow(errKNNnamePost); @@ -759,7 +804,7 @@ int ANNZ::getErrKNNtagNow(TString errKNNname) { */ // =========================================================================================================== TString ANNZ::getKeyWord(TString MLMname, TString sequence, TString key) { -// ======================================================================= +// =========================================================================================================== // ----------------------------------------------------------------------------------------------------------- if(sequence == "trainXML") { @@ -862,6 +907,7 @@ TString ANNZ::getKeyWord(TString MLMname, TString sequence, TString key) { */ // =========================================================================================================== TString ANNZ::getRegularStrForm(TString strIn, VarMaps * var, TChain * aChain) { +// =========================================================================================================== if(strIn == "") return strIn; bool hasVar = (dynamic_cast(var) != NULL); @@ -894,7 +940,7 @@ TString ANNZ::getRegularStrForm(TString strIn, VarMaps * var, TChain * aChain) { */ // =========================================================================================================== void ANNZ::loadOptsMLM() { -// ======================= +// =========================================================================================================== aLOG(Log::DEBUG_1) <GetOptI("nMLMs"); @@ -1234,7 +1280,7 @@ void ANNZ::loadOptsMLM() { */ // =========================================================================================================== void ANNZ::setNominalParams(int nMLMnow, TString inputVariables, TString inputVarErrors) { -// ======================================================================================= +// =========================================================================================================== TString MLMname = getTagName(nMLMnow); TString basePrefix = glob->GetOptC("basePrefix"); TString baseName_inVarErr = glob->GetOptC("baseName_inVarErr"); @@ -1284,7 +1330,7 @@ void ANNZ::setNominalParams(int nMLMnow, TString inputVariables, TString inputVa */ // =========================================================================================================== void ANNZ::setMethodCuts(VarMaps * var, int nMLMnow, bool verbose) { -// ================================================================= +// =========================================================================================================== TString MLMname = getTagName(nMLMnow); VERIFY(LOCATION,(TString)"Memory leak ?! ",(dynamic_cast(var))); @@ -1319,8 +1365,10 @@ void ANNZ::setMethodCuts(VarMaps * var, int nMLMnow, bool verbose) { * @param aChain - The associated chain */ // =========================================================================================================== -TCut ANNZ::getTrainTestCuts(TString cutType, int nMLMnow, int split0, int split1, VarMaps * var, TChain * aChain) { -// ================================================================================================================ +TCut ANNZ::getTrainTestCuts( + TString cutType, int nMLMnow, int split0, int split1, VarMaps * var, TChain * aChain +) { +// =========================================================================================================== vector cutTypeV = utils->splitStringByChar(cutType,';'); int nCuts = (int)cutTypeV.size(); TString MLMname = getTagName(nMLMnow); @@ -1385,7 +1433,7 @@ TCut ANNZ::getTrainTestCuts(TString cutType, int nMLMnow, int split0, int split1 */ // =========================================================================================================== void ANNZ::selectUserMLMlist(vector & optimMLMv, map & mlmSkipNow) { -// =========================================================================================== +// =========================================================================================================== aLOG(Log::INFO) < & optimMLMv, map & */ // =========================================================================================================== void ANNZ::setInfoBinsZ() { -// ======================== - TString userPdfBins = glob->GetOptC("userPdfBins"); - double minValZ = glob->GetOptF("minValZ"); - double maxValZ = glob->GetOptF("maxValZ"); - double zClosBinWidth = glob->GetOptF("zClosBinWidth"); - double zPlotBinWidth = glob->GetOptF("zPlotBinWidth"); - bool doTrain = glob->GetOptB("doTrain"); - bool doBinnedCls = glob->GetOptB("doBinnedCls"); - int nPDFs = glob->GetOptI("nPDFs"); - int nPDFbins = glob->GetOptI("nPDFbins"); - +// =========================================================================================================== + TString userPdfBins = glob->GetOptC("userPdfBins"); + double minValZ = glob->GetOptF("minValZ"); + double maxValZ = glob->GetOptF("maxValZ"); + double underflowZ = glob->GetOptF("underflowZ"); + double overflowZ = glob->GetOptF("overflowZ"); + int nUnderflowBins = glob->GetOptI("nUnderflowBins"); + int nOverflowBins = glob->GetOptI("nOverflowBins"); + int nDrawBins_zTrg = glob->GetOptI("nDrawBins_zTrg"); + double zClosBinWidth = glob->GetOptF("zClosBinWidth"); + double zPlotBinWidth = glob->GetOptF("zPlotBinWidth"); + bool doTrain = glob->GetOptB("doTrain"); + bool doBinnedCls = glob->GetOptB("doBinnedCls"); + int nPDFs = glob->GetOptI("nPDFs"); + int nPDFbins = glob->GetOptI("nPDFbins"); + TString zTrgName = glob->GetOptC("zTrg"); + double underflowW = (nUnderflowBins > 0 ) ? glob->GetOptF("underflowZwidth") / double(nUnderflowBins) : 0; + double overflowW = (nOverflowBins > 0 ) ? glob->GetOptF("overflowZwidth") / double(nOverflowBins) : 0; + TString name(""); int nBinsZ(0); - double binW(0); + double minZ(0), maxZ(0), binW(0); bool hasUserPdfBins(userPdfBins != ""); for(int nBinTypwNow=0; nBinTypwNow<100; nBinTypwNow++) { if(nBinTypwNow == 0) { + if(zClosBinWidth < EPS) continue; + + minZ = minValZ; + maxZ = maxValZ; binW = zClosBinWidth; nBinsZ = static_cast(floor(0.1+(maxValZ - minValZ)/binW)); name = "closure"; } else if(nBinTypwNow == 1) { + minZ = underflowZ; + maxZ = overflowZ; binW = zPlotBinWidth; - nBinsZ = static_cast(floor(0.1+(maxValZ - minValZ)/binW)); + nBinsZ = static_cast(floor(0.1+(maxValZ - minValZ)/binW)) + nUnderflowBins+nOverflowBins; name = "plotting"; } else if(nBinTypwNow == 2) { if(hasUserPdfBins) continue; + minZ = minValZ; + maxZ = maxValZ; nBinsZ = max(nPDFbins,1); binW = (maxValZ - minValZ) / double(nBinsZ); name = "PDF"; } + else if(nBinTypwNow == 3) { + minZ = underflowZ; + maxZ = overflowZ; + binW = (maxValZ - minValZ)/double(nDrawBins_zTrg); + nBinsZ = nDrawBins_zTrg + nUnderflowBins+nOverflowBins; + name = (TString)"plotting "+zTrgName; + } else break; aLOG(Log::DEBUG) < bins_E(nBinsZ+1,0), bins_C(nBinsZ,0); for(int nBinZnow=0; nBinZnow= nBinsZ - nOverflowBins) { + binEdgeL = maxValZ + overflowW * (nBinZnow - nBinsZ + nOverflowBins); + binCenter = binEdgeL + overflowW * 0.5; + } + else { + binEdgeL = minValZ + binW * (nBinZnow - nUnderflowBins); + binCenter = binEdgeL + binW * 0.5; + } + } + else { + binEdgeL = minValZ + binW * nBinZnow; + binCenter = binEdgeL + binW * 0.5; + } bins_E[nBinZnow] = binEdgeL; bins_C[nBinZnow] = binCenter; } - bins_E[nBinsZ] = maxValZ; + bins_E[nBinsZ] = maxZ; - if (nBinTypwNow == 0) { zClos_binE = bins_E; zClos_binC = bins_C; } - else if(nBinTypwNow == 1) { zPlot_binE = bins_E; zPlot_binC = bins_C; } - else { zPDF_binE = bins_E; zPDF_binC = bins_C; } + if (nBinTypwNow == 0) { zClos_binE = bins_E; zClos_binC = bins_C; } + else if(nBinTypwNow == 1) { zPlot_binE = bins_E; zPlot_binC = bins_C; } + else if(nBinTypwNow == 2) { zPDF_binE = bins_E; zPDF_binC = bins_C; } + else if(nBinTypwNow == 3) { zTrgPlot_binE = bins_E; zTrgPlot_binC = bins_C; } } + + // for(int nBinZnow=0; nBinZnow pdfBinV = utils->splitStringByChar(userPdfBins,';'); @@ -1538,12 +1629,14 @@ void ANNZ::setInfoBinsZ() { glob->SetOptI("nPDFbins",nBinsZ); aLOG(Log::DEBUG) <strToDouble(pdfBinV[nPdfBinNow]); + zPDF_binE [nPdfBinNow] = utils->strToDouble(pdfBinV[nPdfBinNow]); + // zClos_binE[nPdfBinNow] = zPDF_binE[nPdfBinNow]; if(nPdfBinNow > 0) { VERIFY(LOCATION,(TString)"PDF bins must be given in ascending order (\"userPdfBins\" = "+userPdfBins+")" @@ -1551,9 +1644,12 @@ void ANNZ::setInfoBinsZ() { } } for(int nPdfBinNow=0; nPdfBinNow= minValZ)); @@ -1579,7 +1675,7 @@ void ANNZ::setInfoBinsZ() { */ // =========================================================================================================== int ANNZ::getBinZ(double valZ, vector & binEdgesV, bool forceCheck) { -// =========================================================================== +// =========================================================================================================== int nBinsZ = (int)binEdgesV.size() - 1; VERIFY(LOCATION,(TString)"Trying to getBinZ() a vector of size "+utils->intToStr(nBinsZ),(nBinsZ > 0)); @@ -1610,7 +1706,7 @@ int ANNZ::getBinZ(double valZ, vector & binEdgesV, bool forceCheck) { */ // =========================================================================================================== void ANNZ::binClsStrToV(TString clsBins) { -// ======================================= +// =========================================================================================================== if(clsBins == "") return; double minValZ = glob->GetOptF("minValZ"); @@ -1676,7 +1772,7 @@ void ANNZ::binClsStrToV(TString clsBins) { */ // =========================================================================================================== TString ANNZ::deriveBinClsBins(map < TString,TChain* > & chainM, map < TString,TCut > & cutM) { -// ============================================================================================ +// =========================================================================================================== aLOG(Log::DEBUG) <GetOptI("closHisN"); @@ -1948,8 +2044,10 @@ TString ANNZ::deriveBinClsBins(map < TString,TChain* > & chainM, map < TString,T * @param optMap - Used to store the final number of objects in the split trees. */ // =========================================================================================================== -void ANNZ::createCutTrainTrees(map < TString,TChain* > & chainM, map < TString,TCut > & cutM, OptMaps * optMap) { -// ============================================================================================================== +void ANNZ::createCutTrainTrees( + map < TString,TChain* > & chainM, map < TString,TCut > & cutM, OptMaps * optMap +) { +// =========================================================================================================== aLOG(Log::DEBUG) <GetOptI("maxNobj"); @@ -2039,8 +2137,10 @@ void ANNZ::createCutTrainTrees(map < TString,TChain* > & chainM, map < TString,T * @param optMap - Used to store the final number of objects in the split trees. */ // =========================================================================================================== -void ANNZ::splitToSigBckTrees(map < TString,TChain* > & chainM, map < TString,TCut > & cutM, OptMaps * optMap) { -// ============================================================================================================= +void ANNZ::splitToSigBckTrees( + map < TString,TChain* > & chainM, map < TString,TCut > & cutM, OptMaps * optMap +) { +// =========================================================================================================== aLOG(Log::DEBUG) <GetOptI("maxNobj"); diff --git a/src/OutMngr_utils.cpp b/src/OutMngr_utils.cpp index a1bb71e..9bb48bf 100644 --- a/src/OutMngr_utils.cpp +++ b/src/OutMngr_utils.cpp @@ -99,6 +99,7 @@ void OutMngr::WriteOutObjects(bool writePdfScripts, bool dontWriteHis) { // outputRootFileName = (TString)outPlotDirName+outFileName+"_plots.root"; // OutputRootFile = new TFile(outputRootFileName,"RECREATE"); + bool saveScript(glob->GetOptB("savePlotScripts")); TString printPlotExtension(glob->GetOptC("printPlotExtension")); vector eraseKeys; @@ -110,17 +111,21 @@ void OutMngr::WriteOutObjects(bool writePdfScripts, bool dontWriteHis) { if(dynamic_cast(CanvasMap[hisName])) { // CanvasMap[hisName]->Write(); - TString printName = outPlotDirName+hisName; - TString printFile = (TString)printName+".C"; - CanvasMap[hisName]->Print(printFile); - - // fix a bug in root, where the given function name in the macro includes the full path - TString tmpFile = (TString)printFile+"_tmp"; - TString fixCmnd = printName; fixCmnd.ReplaceAll("/","\\/"); - fixCmnd = (TString)"sed -e 's/"+fixCmnd+"/"+hisName+"/g' < "+printFile+" > "+tmpFile+" ; mv "+tmpFile+" "+printFile; - utils->exeShellCmndOutput(fixCmnd); + if(saveScript) { + TString printName = outPlotDirName+hisName; + TString printFile = (TString)printName+".C"; + CanvasMap[hisName]->Print(printFile); + + // fix a bug in root, where the given function name in the macro includes the full path + TString tmpFile = (TString)printFile+"_tmp"; + TString fixCmnd = printName; fixCmnd.ReplaceAll("/","\\/"); + fixCmnd = (TString)"sed -e 's/"+fixCmnd+"/"+hisName+"/g' < "+printFile+" > "+tmpFile+" ; mv "+tmpFile+" "+printFile; + utils->exeShellCmndOutput(fixCmnd); + } - if(printPlotExtension != "") CanvasMap[hisName]->SaveAs((TString)outPlotDirName+hisName+"."+printPlotExtension); + if(printPlotExtension != "") { + CanvasMap[hisName]->SaveAs((TString)outPlotDirName+hisName+"."+printPlotExtension); + } eraseKeys.push_back(hisName); } diff --git a/src/myANNZ.cpp b/src/myANNZ.cpp index d133744..4af86fc 100644 --- a/src/myANNZ.cpp +++ b/src/myANNZ.cpp @@ -38,7 +38,7 @@ */ // =========================================================================================================== int main(int argc, char ** argv){ -// =============================== +// =========================================================================================================== // create the main object. It comes with built-in glob->() options which may be modified by user inputs in the following // ----------------------------------------------------------------------------------------------------------- @@ -75,7 +75,7 @@ int main(int argc, char ** argv){ */ // =========================================================================================================== Manager::Manager() { -// ================= +// =========================================================================================================== // ----------------------------------------------------------------------------------------------------------- // initial setup - all of these glob->() options may may be overwritten by user inputs @@ -202,7 +202,7 @@ Manager::Manager() { glob->NewOptF("sampleFracInp_inTrain" ,1); // fraction of the input sample to use for the kd-tree (positive number, smaller or equal to 1) glob->NewOptF("sampleFracRef_inTrain" ,1); // fraction of the input sample to use for the kd-tree (positive number, smaller or equal to 1) glob->NewOptB("doWidthRescale_inTrain" ,true); // transform the input parameters used for the kd-tree to the range [-1,1] - + glob->NewOptB("testAndEvalTrainMethods" ,false); // perform a TMVA test of the results of the training // ----------------------------------------------------------------------------------------------------------- // general options (regression, binned-classification and classification) @@ -212,9 +212,15 @@ Manager::Manager() { // ----------------------------------------------------------------------------------------------------------- // general options (regression, binned-classification) // ----------------------------------------------------------------------------------------------------------- - glob->NewOptF("minValZ",1); // minimal value of regression target (the "truth" value) - glob->NewOptF("maxValZ",-1); // maximal value of regression target (the "truth" value) - glob->NewOptC("zTrg" ,""); // the name of regression target - it must correspond to one of the input variables + glob->NewOptC("zTrg" ,""); // the name of regression target - it must correspond to one of the input variables + glob->NewOptF("minValZ" , 1); // minimal value of regression target (the "truth" value) + glob->NewOptF("maxValZ" ,-1); // maximal value of regression target + glob->NewOptF("underflowZ" ,-1); // underflow value of regression target + glob->NewOptF("overflowZ" ,-1); // overflow value of regression target + glob->NewOptI("nUnderflowBins" , 3); // number of underflow bins + glob->NewOptI("nOverflowBins" , 3); // number of overflow bins + glob->NewOptF("underflowZwidth",-1); // total width of all underflow bins + glob->NewOptF("overflowZwidth" ,-1); // total width of all overflow bins // ----------------------------------------------------------------------------------------------------------- // general options (binned-classification) @@ -339,7 +345,8 @@ Manager::Manager() { // ----------------------------------------------------------------------------------------------------------- // optimization (randomized regression) // ----------------------------------------------------------------------------------------------------------- - glob->NewOptF("excludeRangePdfModelFit",0.1); // exclude margin for fitting cumulative dist as part of PDF optimization + // exclude margin for fitting cumulative dist as part of PDF optimization (eg within [0,0.1]) + glob->NewOptF("excludeRangePdfModelFit",0); // optimCondReg - // ["sig68" or "bias"] - used for deciding how to rank MLM performance. the named criteria represents @@ -408,9 +415,14 @@ Manager::Manager() { // if max_sigma68_PDF,max_bias_PDF are positive, they put thresholds on the maximal value of the // scatter/bias/outlier-fraction of an MLM which may be included in the PDF created in randomized regression - glob->NewOptF("max_sigma68_PDF" ,-1); // maximal value of the scatter of an MLM included in the PDF - glob->NewOptF("max_bias_PDF" ,-1); // maximal value of the bias of an MLM included in the PDF - glob->NewOptF("max_frac68_PDF" ,-1); // maximal value of the outlier-fraction of an MLM included in the PDF + glob->NewOptF("max_sigma68_PDF" ,-1); // maximal value of the scatter of an MLM included in the PDF + glob->NewOptF("max_bias_PDF" ,-1); // maximal value of the bias of an MLM included in the PDF + glob->NewOptF("max_frac68_PDF" ,-1); // maximal value of the outlier-fraction of an MLM included in the PDF + + glob->NewOptI("max_optimObj_PDF" ,1e4); // maximal number of objects to use in order to optimize PDFs + glob->NewOptI("nOptimLoops" ,1e4); // maximal number of tries to improve the PDF weights + glob->NewOptB("addOldStylePDFs" ,false); // whether to use the old-style PDFs (defined since v2.2.3) + glob->NewOptI("max_staticOptimTries_PDF",250); // maximal number of steps for the optimization random walk // bias-correction procedure on MLMs and/or PDFs // doBiasCorPDF - whether or not to perform the correction for PDFs (during optimization) @@ -453,8 +465,6 @@ Manager::Manager() { glob->NewOptC("zRegTitle" ,"Z_{reg}"); // title of regression variable (for plots) glob->NewOptF("zPlotBinWidth" ,-1); // width of bins to perform the plotting glob->NewOptI("nDrawBins_zTrg" ,-1); // number of bins in zTrg for plotting - // typical width in the regression variable for plotting, e.g., for 10 plotting bins, set zClosBinWidth = (maxValZ-minValZ)/10. - glob->NewOptF("zClosBinWidth" ,-1); // possible list of variables which will always be plotted (that is, no safety checks on variable type will be performed) glob->NewOptC("alwaysPlotVars" ,""); // possible list of variables for which we do not use quantile bins for plotting - e.g., set as "inTrainFlag;MAG_U" @@ -462,6 +472,12 @@ Manager::Manager() { // add plots with the distribution of the knn error estimator glob->NewOptB("doKnnErrPlots" ,false); + // zClosBinWidth - typical width in the regression variable for plotting, e.g., for 10 plotting + // bins, set zClosBinWidth = (maxValZ-minValZ)/10. - if not set, a quantile division will be used of the zTrg range + // nZclosBins - alternatively, dynamically derive nZclosBins bins from the quantile distribution of zTrg + glob->NewOptF("zClosBinWidth" ,-1); + glob->NewOptI("nZclosBins" ,-1); + // format for plotting (in addition to generated root scripts, which may be run with [root -l script.C] // availabe formats (leave empty [glob->NewOptC("printPlotExtension","")] to prevent plotting): // "ps" "eps" "pdf" "svg" "tex" "gif" "xpm" "png" "jpg" "tiff" "xml" @@ -469,8 +485,11 @@ Manager::Manager() { // ----------------------------------------------------------------------------------------------------------- glob->NewOptC("printPlotExtension","pdf"); + // flag to store root scripts corresponding to generated plots + glob->NewOptB("savePlotScripts",false); + // use scaled bias (delta/(1+zTrg)) instead of delta for plotting in ANNZ::doMetricPlots() - glob->NewOptB("plotWithScaledBias" ,false); + glob->NewOptB("plotWithScaledBias",false); // ----------------------------------------------------------------------------------------------------------- // uncertainty estimators - either KNN (K-near-neighbours) entimation (used by default), or input-error propagation. @@ -555,11 +574,11 @@ Manager::Manager() { */ // =========================================================================================================== void Manager::Init(int argc, char ** argv) { -// ========================================= +// =========================================================================================================== // ----------------------------------------------------------------------------------------------------------- // current version-tag for the code // ----------------------------------------------------------------------------------------------------------- - TString basePrefix("ANNZ_"), versionTag("2.2.2"); + TString basePrefix("ANNZ_"), versionTag("2.3.0"); // sanity check on allowed root versions with corresponding supported TMVA versions VERIFY(LOCATION,(TString)" - Using unsupported ROOT version ... ?!?!?",(ROOT_TMVA_V0 || ROOT_TMVA_V1)); // ----------------------------------------------------------------------------------------------------------- @@ -910,7 +929,7 @@ void Manager::Init(int argc, char ** argv) { */ // =========================================================================================================== void Manager::GenerateInputTrees() { -// ================================= +// =========================================================================================================== // initialize the outputs with the rootIn directory and reset it outputs->InitializeDir(glob->GetOptC("outDirNameFull"),glob->GetOptC("baseName")); @@ -944,7 +963,7 @@ void Manager::GenerateInputTrees() { */ // =========================================================================================================== void Manager::doOnlyKnnErr() { -// =========================== +// =========================================================================================================== VERIFY(LOCATION,(TString)"Can not define both \"doGenInputTrees\" and \"doEval\" at the same time ... run \"doGenInputTrees\" " +"separately first, then \"doEval\"", !(glob->GetOptB("doGenInputTrees") && glob->GetOptB("doEval"))); @@ -984,7 +1003,7 @@ void Manager::doOnlyKnnErr() { */ // =========================================================================================================== void Manager::doInTrainFlag() { -// ============================ +// =========================================================================================================== // initialize the inTrainFlag directory outputs->InitializeDir(glob->GetOptC("outDirNameFull"),glob->GetOptC("baseName")); @@ -1004,8 +1023,7 @@ void Manager::doInTrainFlag() { */ // =========================================================================================================== void Manager::DoANNZ() { -// ===================== - +// =========================================================================================================== ANNZ * aANNZ = new ANNZ("aANNZ",utils,glob,outputs); if (glob->GetOptB("doTrain")) aANNZ->Train(); // training @@ -1044,7 +1062,7 @@ void Manager::DoANNZ() { */ // =========================================================================================================== Manager::~Manager() { -// ================== +// =========================================================================================================== aLOG(Log::DEBUG)<