From a02fe911e4a867953c4536d2d1c27a49eabf6688 Mon Sep 17 00:00:00 2001 From: IftachSadeh Date: Sun, 20 Nov 2016 17:40:19 +0100 Subject: [PATCH] support for input ROOT files with different Trees --- CHANGELOG.md | 4 ++- README.md | 14 ++++++++ examples/scripts/annz_rndReg_advanced.py | 46 +++++++++++++++++++----- src/ANNZ_loopReg.cpp | 5 ++- src/CatFormat_asciiToTree.cpp | 27 +++++++++----- src/myANNZ.cpp | 9 +++-- 6 files changed, 85 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 869d599..c5b1a8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## Master version (19/11/2016) +## Master version (20/11/2016) - Added the option to to *not* store the full value of pdfs in the output of optimization/evaluation, by setting ```python @@ -15,6 +15,8 @@ In this case, only the average metrics of a pdf are included in the output. glob.annz["doKnnErrPlots"] = True ``` +- Added support for input ROOT files with different Tree names. + - Updated `README.md`. - Other minor modifications and bug fixes. diff --git a/README.md b/README.md index da9e73a..be16abc 100644 --- a/README.md +++ b/README.md @@ -323,6 +323,20 @@ See the advanced scripts for additional details. ``` Set a list of input files for training in `splitTypeTrain`, and a list of input files for testing in `splitTypeTest`. +- Most of the examples show the use of ascii input files. However, one may also use ROOT files as input (see `scripts/annz_rndReg_advanced.py`). For instance, one may define separate input files with corresponding ROOT Tree names, as +```python +glob.annz["inTreeNameTrain"] = "tree0" +glob.annz["splitTypeTrain"] = "file_0.root;file_1.root" +glob.annz["inTreeNameTest"] = "tree1" +glob.annz["splitTypeTest"] = "file_2.root" +``` +Notice that in this case, all files defined in `splitTypeTrain` must contain a ROOT Tree as defined in `inTreeNameTrain`, and the same goes for `splitTypeTest` and `inTreeNameTest`. 
Since the Tree names for training and testing can be different, one may e.g., use two separate Trees from a single input file for training and testing: +```python +glob.annz["splitTypeTrain"] = "file_0.root" +glob.annz["splitTypeTest"] = "file_0.root" +glob.annz["inTreeNameTrain"] = "tree0" +glob.annz["inTreeNameTest"] = "tree1" +``` #### Definition of signal and background objects in single/randomized classification diff --git a/examples/scripts/annz_rndReg_advanced.py b/examples/scripts/annz_rndReg_advanced.py index 14876cd..a5a7666 100644 --- a/examples/scripts/annz_rndReg_advanced.py +++ b/examples/scripts/annz_rndReg_advanced.py @@ -83,18 +83,48 @@ inFileOpt = 1 # splitTypeTrain - list of files for training. splitTypeTest - list of files for testing if inFileOpt == 0: - glob.annz["splitTypeTrain"] = "boss_dr10_0.csv" - glob.annz["splitTypeTest"] = "boss_dr10_1.csv;boss_dr10_2.csv" + glob.annz["splitTypeTrain"] = "boss_dr10_0.csv" + glob.annz["splitTypeTest"] = "boss_dr10_1.csv;boss_dr10_2.csv" # inAsciiFiles - one list of input files for training and testing, where the the objects are assigned to a given # category based on the selection criteria defined by splitType elif inFileOpt == 1: - glob.annz["splitType"] = "serial" # "serial", "blocks" or "random" - glob.annz["inAsciiFiles"] = "boss_dr10_0.csv;boss_dr10_1.csv;boss_dr10_2.csv;boss_dr10_3.csv" - # example ofr using a root tree input file, instead of an ascii input + glob.annz["splitType"] = "serial" # "serial", "blocks" or "random" + glob.annz["inAsciiFiles"] = "boss_dr10_0.csv;boss_dr10_1.csv;boss_dr10_2.csv;boss_dr10_3.csv" + # example for using a root tree input file, instead of an ascii input (objects split automatically for + # training and testing) elif inFileOpt == 2: - glob.annz["splitType"] = "serial" # "serial", "blocks" or "random" - glob.annz["inTreeName"] = "ANNZ_tree_full" - glob.annz["inAsciiFiles"] = "ANNZ_tree_full_00000.root" + glob.annz["splitType"] = "serial" # "serial", "blocks" or 
"random" + glob.annz["inTreeName"] = "ANNZ_tree_full" + glob.annz["inAsciiFiles"] = "ANNZ_tree_full_00000.root" + # -------------------------------------------------------------------------------------------------- + # examples for using a root tree input file, instead of an ascii input. in this case, we specify the + # names of the files for training and testing separately, where each has the corresponding name of + # the input tree. (in the following, the names of input files/trees are just for illustration - + # they are not included in the example directory...) + # -------------------------------------------------------------------------------------------------- + # two input files which each contain a root tree of the same name. + elif False: + # name of input root file and corresponding tree-name for training + glob.annz["inTreeNameTrain"] = "tree0" + glob.annz["splitTypeTrain"] = "file0.root" + # name of input root file and corresponding tree-name for testing + glob.annz["inTreeNameTest"] = "tree0" + glob.annz["splitTypeTest"] = "file1.root" + # two input files with two distinct corresponding tree names + elif False: + # name of input root file and corresponding tree-name for training + glob.annz["inTreeNameTrain"] = "tree0" + glob.annz["splitTypeTrain"] = "file0.root" + # name of input root file and corresponding tree-name for testing + glob.annz["inTreeNameTest"] = "tree1" + glob.annz["splitTypeTest"] = "file1.root;file2.root" + # a single input file list, where each file contains two distinct trees, one for training and one for testing + elif False: + glob.annz["inAsciiFiles"] = "file0.root;file1.root" + glob.annz["inTreeNameTrain"] = "tree0" + glob.annz["inTreeNameTest"] = "tree1" + + else: inFileOpt("Unsupported...",False) diff --git a/src/ANNZ_loopReg.cpp b/src/ANNZ_loopReg.cpp index 9126c72..307a0a8 100644 --- a/src/ANNZ_loopReg.cpp +++ b/src/ANNZ_loopReg.cpp @@ -3446,7 +3446,7 @@ void ANNZ::doMetricPlots(TChain * aChain, vector * addPlotVarV, TStri 
// ----------------------------------------------------------------------------------------------------------- - // derive the binning scheme for the his_err* histograms + // derive the binning for the his_err* histograms // ----------------------------------------------------------------------------------------------------------- double minErrTrgReg(1), maxErrTrgReg(-1); if(doKnnErrPlots) { @@ -3489,6 +3489,9 @@ void ANNZ::doMetricPlots(TChain * aChain, vector * addPlotVarV, TStri } } + // ----------------------------------------------------------------------------------------------------------- + // initialize some histograms + // ----------------------------------------------------------------------------------------------------------- vector < TString > typeTitleV; map < TString,vector > his_regTrgZ; map < TString,TH2* > his_corRegTrgZ; diff --git a/src/CatFormat_asciiToTree.cpp b/src/CatFormat_asciiToTree.cpp index f95a49f..1dfb47a 100644 --- a/src/CatFormat_asciiToTree.cpp +++ b/src/CatFormat_asciiToTree.cpp @@ -281,7 +281,6 @@ void CatFormat::inputToFullTree(TString inAsciiFiles, TString inAsciiVars, TStri /** * @brief - Convert ascii file into a root tree (optional splitting for train/test/valid subsamples). * - * * @details - For training and testing/validation the input is divided into two (test,train) or into three (test,train,valid) * sub-samples. 
* - The user needs to define the number of sub-samples (e.g., nSplit = 1 or 2) and the way to divide the @@ -318,6 +317,8 @@ void CatFormat::inputToSplitTree(TString inAsciiFiles, TString inAsciiVars) { TString plotExt = glob->GetOptC("printPlotExtension"); TString outDirNameFull = glob->GetOptC("outDirNameFull"); TString inTreeName = glob->GetOptC("inTreeName"); + TString inTreeNameTrain = glob->GetOptC("inTreeNameTrain"); + TString inTreeNameTest = glob->GetOptC("inTreeNameTest"); TString sigBckInpName = glob->GetOptC("sigBckInpName"); TString inpFiles_sig = glob->GetOptC("inpFiles_sig"); @@ -388,7 +389,9 @@ void CatFormat::inputToSplitTree(TString inAsciiFiles, TString inAsciiVars) { isRootInput = inFileNameV[0].EndsWith(".root"); // add the path to the file names - for(int nInFileNow=0; nInFileNowGetOptC("inDirName")+inFileNameV[nInFileNow]; + for(int nInFileNow=0; nInFileNowGetOptC("inDirName")+inFileNameV[nInFileNow]; + } } // make sure the input files are consistently of the same type @@ -396,7 +399,8 @@ void CatFormat::inputToSplitTree(TString inAsciiFiles, TString inAsciiVars) { if(isRootInput) { for(int nInFileNow=0; nInFileNowNewCntr("nLineFile",0); if(isRootInput) { - TChain * inChain = new TChain(inTreeName,inTreeName); inChain->SetDirectory(0); inChain->Add(inFileNameNow); - aLOG(Log::DEBUG) <GetEntries()<<")"<<" from "<SetDirectory(0); inChain->Add(inFileNameNow); + aLOG(Log::DEBUG) <GetEntries()<<")"<<" from "<connectTreeBranches(inChain); // get the full list of variables common to both var and var_0 @@ -686,7 +698,6 @@ void CatFormat::inputToSplitTree(TString inAsciiFiles, TString inAsciiVars) { } } - // cleanup DELNULL(var); DELNULL(rnd); diff --git a/src/myANNZ.cpp b/src/myANNZ.cpp index f37bd64..612076f 100644 --- a/src/myANNZ.cpp +++ b/src/myANNZ.cpp @@ -211,8 +211,13 @@ myANNZ::myANNZ() { glob->NewOptB("trainTestTogether_wgtKNN",true); // input files (given by splitTypeTrain, splitTypeTest and inAsciiFiles) may also be root files, containing 
- // root trees, instead of ascii files. In this case, the name of the tree in the input files is defined in inTreeName - glob->NewOptC("inTreeName" ,""); + // root trees, instead of ascii files. In this case, the name of the tree in the input files is + // defined in inTreeName. one may alternatively define separate tree names for training and + // testing, using inTreeNameTrain and inTreeNameTest. + glob->NewOptC("inTreeName" ,""); + glob->NewOptC("inTreeNameTrain" ,""); + glob->NewOptC("inTreeNameTest" ,""); + // if root input is given in inAsciiFiles_wgtKNN, the corresponding tree name is defined in treeName_wgtKNN glob->NewOptC("inTreeName_wgtKNN" ,"");