support for input ROOT files with different Trees

IftachSadeh · Nov 20, 2016 · a02fe91 · a02fe91
1 parent 26d4336
commit a02fe91
Show file tree

Hide file tree

Showing 6 changed files with 85 additions and 20 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,6 @@
 # Changelog
 
-## Master version (19/11/2016)
+## Master version (20/11/2016)
 
 - Added the option to *not* store the full value of pdfs in the output of optimization/evaluation, by setting
 ```python
@@ -15,6 +15,8 @@ In this case, only the average metrics of a pdf are included in the output.
 glob.annz["doKnnErrPlots"] = True
 ```
 
+- Added support for input ROOT files with different Tree names.
+
 - Updated `README.md`.
 
 - Other minor modifications and bug fixes.

diff --git a/README.md b/README.md
@@ -323,6 +323,20 @@ See the advanced scripts for additional details.
     ```
   Set a list of input files for training in `splitTypeTrain`, and a list of input files for testing in `splitTypeTest`. 
 
+- Most of the examples show the use of ascii input files. However, one may also use ROOT files as input (see `scripts/annz_rndReg_advanced.py`). For instance, one may define separate input files with corresponding ROOT Tree names, as
+```python
+glob.annz["inTreeNameTrain"]  = "tree0"  
+glob.annz["splitTypeTrain"]   = "file_0.root;file_1.root"
+glob.annz["inTreeNameTest"]   = "tree1"  
+glob.annz["splitTypeTest"]    = "file_2.root"
+```
+Notice that in this case, all files defined in `splitTypeTrain` must contain a ROOT Tree as defined in `inTreeNameTrain`, and the same goes for `splitTypeTest` and `inTreeNameTest`. Since the Tree names for training and testing can be different, one may, e.g., use two separate Trees from a single input file for training and testing:
+```python
+glob.annz["splitTypeTrain"]   = "file_0.root"
+glob.annz["splitTypeTest"]    = "file_0.root"
+glob.annz["inTreeNameTrain"]  = "tree0"  
+glob.annz["inTreeNameTest"]   = "tree1"  
+```
 
 #### Definition of signal and background objects in single/randomized classification
 

diff --git a/examples/scripts/annz_rndReg_advanced.py b/examples/scripts/annz_rndReg_advanced.py
@@ -83,18 +83,48 @@
   inFileOpt = 1
   # splitTypeTrain - list of files for training. splitTypeTest - list of files for testing
   if   inFileOpt == 0:
-    glob.annz["splitTypeTrain"] = "boss_dr10_0.csv"
-    glob.annz["splitTypeTest"]  = "boss_dr10_1.csv;boss_dr10_2.csv"
+    glob.annz["splitTypeTrain"]  = "boss_dr10_0.csv"
+    glob.annz["splitTypeTest"]   = "boss_dr10_1.csv;boss_dr10_2.csv"
   # inAsciiFiles - one list of input files for training and testing, where the objects are assigned to a given
   # category based on the selection criteria defined by splitType
   elif inFileOpt == 1:
-    glob.annz["splitType"]      = "serial" # "serial", "blocks" or "random"
-    glob.annz["inAsciiFiles"]   = "boss_dr10_0.csv;boss_dr10_1.csv;boss_dr10_2.csv;boss_dr10_3.csv"
-  # example ofr using a root tree input file, instead of an ascii input
+    glob.annz["splitType"]       = "serial" # "serial", "blocks" or "random"
+    glob.annz["inAsciiFiles"]    = "boss_dr10_0.csv;boss_dr10_1.csv;boss_dr10_2.csv;boss_dr10_3.csv"
+  # example for using a root tree input file, instead of an ascii input (objects split automatically for
+  # training and testing)
   elif inFileOpt == 2:
-    glob.annz["splitType"]      = "serial" # "serial", "blocks" or "random"
-    glob.annz["inTreeName"]     = "ANNZ_tree_full"  
-    glob.annz["inAsciiFiles"]   = "ANNZ_tree_full_00000.root"
+    glob.annz["splitType"]       = "serial" # "serial", "blocks" or "random"
+    glob.annz["inTreeName"]      = "ANNZ_tree_full"  
+    glob.annz["inAsciiFiles"]    = "ANNZ_tree_full_00000.root"
+  # --------------------------------------------------------------------------------------------------
+  # examples for using a root tree input file, instead of an ascii input. in this case, we specify the
+  # names of the files for training and testing separately, where each has the corresponding name of
+  # the input tree. (in the following, the names of input files/trees are just for illustration -
+  # they are not included in the example directory...)
+  # --------------------------------------------------------------------------------------------------
+  # two input files which each contain a root tree of the same name.
+  elif False:
+    # name of input root file and corresponding tree-name for training
+    glob.annz["inTreeNameTrain"] = "tree0"
+    glob.annz["splitTypeTrain"]  = "file0.root"
+    # name of input root file and corresponding tree-name for testing
+    glob.annz["inTreeNameTest"]  = "tree0"
+    glob.annz["splitTypeTest"]   = "file1.root"
+  # two input files with two distinct corresponding tree names
+  elif False:
+    # name of input root file and corresponding tree-name for training
+    glob.annz["inTreeNameTrain"] = "tree0"
+    glob.annz["splitTypeTrain"]  = "file0.root"
+    # name of input root file and corresponding tree-name for testing
+    glob.annz["inTreeNameTest"]  = "tree1"
+    glob.annz["splitTypeTest"]   = "file1.root;file2.root"
+  # a single input file list, where each file contains two distinct trees, one for training and one for testing
+  elif False:
+    glob.annz["inAsciiFiles"]    = "file0.root;file1.root"
+    glob.annz["inTreeNameTrain"] = "tree0"
+    glob.annz["inTreeNameTest"]  = "tree1"
+
+
   else:
     inFileOpt("Unsupported...",False)
 

diff --git a/src/ANNZ_loopReg.cpp b/src/ANNZ_loopReg.cpp
@@ -3446,7 +3446,7 @@ void  ANNZ::doMetricPlots(TChain * aChain, vector <TString> * addPlotVarV, TStri
 
 
   // -----------------------------------------------------------------------------------------------------------
-  // derive the binning scheme for the his_err* histograms
+  // derive the binning for the his_err* histograms
   // -----------------------------------------------------------------------------------------------------------
   double minErrTrgReg(1), maxErrTrgReg(-1);
   if(doKnnErrPlots) {
@@ -3489,6 +3489,9 @@ void  ANNZ::doMetricPlots(TChain * aChain, vector <TString> * addPlotVarV, TStri
     }
   }
 
+  // -----------------------------------------------------------------------------------------------------------
+  // initialize some histograms
+  // -----------------------------------------------------------------------------------------------------------
   vector < TString >                      typeTitleV;
   map < TString,vector <TH1*> >           his_regTrgZ;
   map < TString,TH2* >                    his_corRegTrgZ;

diff --git a/src/CatFormat_asciiToTree.cpp b/src/CatFormat_asciiToTree.cpp
@@ -281,7 +281,6 @@ void CatFormat::inputToFullTree(TString inAsciiFiles, TString inAsciiVars, TStri
 /**
  * @brief                - Convert ascii file into a root tree (optional splitting for train/test/valid subsamples).
  * 
- * 
  * @details              - For training and testing/validation the input is divided into two (test,train) or into three (test,train,valid)
  *                         sub-samples.
  *                       - The user needs to define the number of sub-samples (e.g., nSplit = 1 or 2) and the way to divide the
@@ -318,6 +317,8 @@ void CatFormat::inputToSplitTree(TString inAsciiFiles, TString inAsciiVars) {
   TString plotExt           = glob->GetOptC("printPlotExtension");
   TString outDirNameFull    = glob->GetOptC("outDirNameFull");
   TString inTreeName        = glob->GetOptC("inTreeName");
+  TString inTreeNameTrain   = glob->GetOptC("inTreeNameTrain");
+  TString inTreeNameTest    = glob->GetOptC("inTreeNameTest");
 
   TString sigBckInpName     = glob->GetOptC("sigBckInpName");
   TString inpFiles_sig      = glob->GetOptC("inpFiles_sig");
@@ -388,15 +389,18 @@ void CatFormat::inputToSplitTree(TString inAsciiFiles, TString inAsciiVars) {
     isRootInput = inFileNameV[0].EndsWith(".root");
 
     // add the path to the file names
-    for(int nInFileNow=0; nInFileNow<nInFiles; nInFileNow++) inFileNameV[nInFileNow] = glob->GetOptC("inDirName")+inFileNameV[nInFileNow];
+    for(int nInFileNow=0; nInFileNow<nInFiles; nInFileNow++) {
+      inFileNameV[nInFileNow] = glob->GetOptC("inDirName")+inFileNameV[nInFileNow];
+    }
   }
 
   // make sure the input files are consistently of the same type
   // -----------------------------------------------------------------------------------------------------------
   if(isRootInput) {
     for(int nInFileNow=0; nInFileNow<nInFiles; nInFileNow++) {
       TString inFileNameNow    = inFileNameV[nInFileNow];
-      bool    isExpectedFormat = (isRootInput && inFileNameV[nInFileNow].EndsWith(".root")) || (!isRootInput && !inFileNameV[nInFileNow].EndsWith(".root"));
+      bool    isExpectedFormat = ( ( isRootInput &&  inFileNameV[nInFileNow].EndsWith(".root")) ||
+                                   (!isRootInput && !inFileNameV[nInFileNow].EndsWith(".root"))    );
 
       VERIFY(LOCATION,(TString)"Found some files ending with \".root\" and some without... must give one type of input!",isExpectedFormat);
     }
@@ -509,8 +513,16 @@ void CatFormat::inputToSplitTree(TString inAsciiFiles, TString inAsciiVars) {
     }
 
     // write out trees and initialize counters if moving from one type to another (e.g., from train to test)
+    // switch to the correct inTreeNameNow if separate tree names are specified for train/test
+    TString inTreeNameNow(inTreeName);
     if(nSplitType == 3) {
-      intMap["inFileSplitIndex"] = inFileTypeV[nInFileNow];
+      int nSplitNow = inFileTypeV[nInFileNow];
+
+      if     (nSplitNow == 0 && inTreeNameTrain != "") inTreeNameNow = inTreeNameTrain;
+      else if(nSplitNow == 1 && inTreeNameTest  != "") inTreeNameNow = inTreeNameTest;
+      else                                             inTreeNameNow = inTreeName;
+
+      intMap["inFileSplitIndex"] = nSplitNow;
 
       if(inFileTypeChange != inFileTypeV[nInFileNow]) {
         inFileTypeChange = inFileTypeV[nInFileNow];
@@ -530,10 +542,10 @@ void CatFormat::inputToSplitTree(TString inAsciiFiles, TString inAsciiVars) {
     var->NewCntr("nLineFile",0);
 
     if(isRootInput) {
-      TChain  * inChain = new TChain(inTreeName,inTreeName); inChain->SetDirectory(0); inChain->Add(inFileNameNow);
-      aLOG(Log::DEBUG) <<coutRed<<" - added chain "<<coutGreen<<inTreeName<<"("<<inChain->GetEntries()<<")"<<" from "<<coutBlue<<inFileNameNow<<coutDef<<endl;
+      TChain  * inChain = new TChain(inTreeNameNow,inTreeNameNow); inChain->SetDirectory(0); inChain->Add(inFileNameNow);
+      aLOG(Log::DEBUG) <<coutRed<<" - added chain "<<coutGreen<<inTreeNameNow<<"("<<inChain->GetEntries()<<")"<<" from "<<coutBlue<<inFileNameNow<<coutDef<<endl;
 
-      VarMaps * var_0   = new VarMaps(glob,utils,(TString)"inputTree_"+inTreeName);
+      VarMaps * var_0   = new VarMaps(glob,utils,(TString)"inputTree_"+inTreeNameNow);
       var_0->connectTreeBranches(inChain);
 
       // get the full list of variables common to both var and var_0
@@ -686,7 +698,6 @@ void CatFormat::inputToSplitTree(TString inAsciiFiles, TString inAsciiVars) {
     }
   }
 
-
   // cleanup
   DELNULL(var); DELNULL(rnd);
 

diff --git a/src/myANNZ.cpp b/src/myANNZ.cpp
@@ -211,8 +211,13 @@ myANNZ::myANNZ() {
   glob->NewOptB("trainTestTogether_wgtKNN",true);
 
   // input files (given by splitTypeTrain, splitTypeTest and inAsciiFiles) may also be root files, containing
-  // root trees, instead of ascii files. In this case, the name of the tree in the input files is defined in inTreeName
-  glob->NewOptC("inTreeName"     ,"");
+  // root trees, instead of ascii files. In this case, the name of the tree in the input files is
+  // defined in inTreeName. one may alternatively define separate tree names for training and
+  // testing, using inTreeNameTrain and inTreeNameTest.
+  glob->NewOptC("inTreeName"      ,"");
+  glob->NewOptC("inTreeNameTrain" ,"");
+  glob->NewOptC("inTreeNameTest"  ,"");
+
   // if root input is given in inAsciiFiles_wgtKNN, the corresponding tree name is defined in inTreeName_wgtKNN
   glob->NewOptC("inTreeName_wgtKNN"     ,"");