Skip to content

Commit

Permalink
v2.3.1
Browse files Browse the repository at this point in the history
- Updated `py/ANNZ.py` and `scripts/annz_evalWrapper.py` for `python-3.6` compatibility.

- Fixed bug in the `Makefile`; now ROOT shared libraries are linked *after* the local objects.

- Added `isReadOnlySys` option, usable for evaluation only. One may set `isReadOnlySys = True` while using the python wrapper, in order to avoid writing anything to disk during evaluation.

- Fixed issue of unnecessary excess memory consumption following validation of XML files.

- Added `minPdfWeight` functionality to the new version of PDF generation using the random walk algorithm.
  • Loading branch information
IftachSadeh committed Dec 12, 2018
1 parent bfd6614 commit 88e2759
Show file tree
Hide file tree
Showing 7 changed files with 187 additions and 29 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
# Changelog

## Master version (16/11/2018)
## ANNZ v2.3.1 (12/12/2018)

- Updated `py/ANNZ.py` and `scripts/annz_evalWrapper.py` for `python-3.6` compatibility.

- Fixed bug in the `Makefile`; now ROOT shared libraries are linked *after* the local objects.

- Added `isReadOnlySys` option, usable for evaluation only. One may set `isReadOnlySys = True` while using the python wrapper, in order to avoid writing anything to disk during evaluation.

- Fixed issue of unnecessary excess memory consumption following validation of XML files.

- Added `minPdfWeight` functionality to the new version of PDF generation using the random walk algorithm.

## ANNZ v2.3.0 (03/04/2018)

### For users:
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# ANNZ v2.3.0
# ANNZ v2.3.1

## Introduction
ANNZ uses both regression and classification techniques for estimation of single-value photo-z (or any regression problem) solutions and PDFs. In addition it is suitable for classification problems, such as star/galaxy classification.
Expand Down
2 changes: 2 additions & 0 deletions include/ANNZ.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@ class ANNZ : public BaseClass {
vector<double> & zRegQnt_sigma68, vector<double> & zRegQnt_fracSig68,
vector < vector<double> > & bestWeightsV, vector <TH2*> & hisPdfBiasCorV);

vector < double > clipWeightsPDF(vector < double > & weightsIn, Log::LOGtypes logLevel = Log::INFO);

// -----------------------------------------------------------------------------------------------------------
// ANNZ_loopRegCls.cpp :
// -----------------------------------------------------------------------------------------------------------
Expand Down
1 change: 1 addition & 0 deletions include/commonInclude.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
#include <TGraphAsymmErrors.h>
#include <TLine.h>
#include <TKey.h>
#include <TXMLEngine.h>

#include "TMVA/Tools.h"
#include "TMVA/Config.h"
Expand Down
10 changes: 7 additions & 3 deletions src/ANNZ_TMVA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -442,11 +442,15 @@ bool ANNZ::verifyXML(TString outXmlFileName) {
if(isGoodXML) {
// minimal verification that the XML is good and has a definition of a "Method"
// see: http://root.cern.ch/root/html/TMVA__Reader.html#TMVA__Reader:GetMethodTypeFromFile
void * doc = TMVA::gTools().xmlengine().ParseFile(outXmlFileName,TMVA::gTools().xmlenginebuffersize());
void * rootnode = TMVA::gTools().xmlengine().DocGetRootElement(doc);
TXMLEngine * xmlengine = new TXMLEngine();
void * doc = xmlengine->ParseFile(outXmlFileName,TMVA::gTools().xmlenginebuffersize());
void * rootnode = xmlengine->DocGetRootElement(doc);

isGoodXML = TMVA::gTools().HasAttr(rootnode, "Method");

if(!isGoodXML) aLOG(Log::DEBUG_1)<<coutRed<<" ... Found bad XML file - "<<coutCyan<<outXmlFileName<<coutDef<<endl;

xmlengine->FreeDoc(doc);
DELNULL(xmlengine);
}
else aLOG(Log::DEBUG_1)<<coutRed<<" ... Did not find the XML file - "<<coutCyan<<outXmlFileName<<coutDef<<endl;

Expand Down
176 changes: 153 additions & 23 deletions src/ANNZ_loopReg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1143,7 +1143,8 @@ void ANNZ::getRndMethodBestPDF(

tmpWeightV.clear();
for(int nMLMnow=0; nMLMnow<nMLMs; nMLMnow++) {
if(weightV[nMLMnow] > EPS) tmpWeightV.push_back(pair<int,double>(nMLMnow,weightV[nMLMnow]));
if(weightV[nMLMnow] < EPS) continue;
tmpWeightV.push_back(pair<int,double>(nMLMnow,weightV[nMLMnow]));
}

// sort so that the largest element is first
Expand Down Expand Up @@ -1292,7 +1293,8 @@ void ANNZ::getRndMethodBestPDF(
double avgIntgrZ(0.5);
int nNoUpdate(0), nSameWeights(0);
double varIntgrBest(std::numeric_limits<double>::max()), varIntgrPrev(varIntgrBest);
vector < double > weightsNow(weightV), weightsPrev(weightV), weightsBest, intgrZ_valV, intgrZmlm_nEvtV;
vector < double > weightsNow(weightV), weightsPrev(weightV),
weightsClipped, weightsBest, intgrZ_valV, intgrZmlm_nEvtV;

vector < int > updateIndexV;
for(int nMLMnow=0; nMLMnow<nMLMs; nMLMnow++) {
Expand All @@ -1307,18 +1309,23 @@ void ANNZ::getRndMethodBestPDF(
else if(nWgtsIn < 40) nRnds0 = 10;
else nRnds0 = floor(nWgtsIn * 0.2);

for(int nLoops=0; nLoops<nOptimLoops; nLoops++) {
for(int nLoopNow=0; nLoopNow<nOptimLoops; nLoopNow++) {
bool canPrint(false);
if (nLoops < 10) canPrint = true;
else if(nLoops < 100 && nLoops%10 == 0) canPrint = true;
else if(nLoops < 500 && nLoops%20 == 0) canPrint = true;
else if(nLoops < 1000 && nLoops%50 == 0) canPrint = true;
else if( nLoops%100 == 0) canPrint = true;
if (nLoopNow < 10) canPrint = true;
else if(nLoopNow < 100 && nLoopNow%10 == 0) canPrint = true;
else if(nLoopNow < 500 && nLoopNow%20 == 0) canPrint = true;
else if(nLoopNow < 1000 && nLoopNow%50 == 0) canPrint = true;
else if( nLoopNow%100 == 0) canPrint = true;

intgrZ_valV.resize(nPDFbins,0); intgrZmlm_nEvtV.resize(nPDFbins,0);

if(canPrint && saveProfileHis) hisIntgrZmlm->Reset();

// -----------------------------------------------------------------------------------------------------------
// clip any weights smaller than minPdfWeight, so long as at least minAcptMLMsForPDFs MLMs remain
// -----------------------------------------------------------------------------------------------------------
weightsClipped = clipWeightsPDF(weightsNow, Log::DEBUG_2);

// -----------------------------------------------------------------------------------------------------------
// calculate the optimization metric for this set of PDF weights from the entire dataset
// -----------------------------------------------------------------------------------------------------------
Expand All @@ -1334,7 +1341,7 @@ void ANNZ::getRndMethodBestPDF(
TString MLMname = getTagName(nMLMnow); if(mlmSkipPdf[MLMname]) continue;

double intgrZmlm = var_1->GetVarF(MLMname);
intgrZ += intgrZmlm * weightsNow[nMLMnow];
intgrZ += intgrZmlm * weightsClipped[nMLMnow];
}

if(intgrZ >= excRange[0] && intgrZ <= excRange[1]) {
Expand Down Expand Up @@ -1376,12 +1383,12 @@ void ANNZ::getRndMethodBestPDF(
weightMsg = "";
for(int nMLMnow=0; nMLMnow<nMLMs; nMLMnow++) {
TString MLMname = getTagName(nMLMnow); if(mlmSkipPdf[MLMname]) continue;
weightMsg += coutBlue+MLMname+":"+coutPurple+utils->floatToStr(weightsNow[nMLMnow],"%1.3f")+" ";
weightMsg += coutBlue+MLMname+":"+coutPurple+utils->floatToStr(weightsClipped[nMLMnow],"%1.3f")+" ";
}

TString msgHead = (TString)(isBest ? " - NEW: " : " - nTry: ");
TString msgCol = (TString)(isBest ? coutGreen : coutYellow);
aLOG(Log::INFO) <<msgCol<<msgHead<<coutPurple<<nLoops
aLOG(Log::INFO) <<msgCol<<msgHead<<coutPurple<<nLoopNow
<<msgCol<<" - min-param best/prev/now: "<<coutBlue<< utils->floatToStr(varIntgrBest,"%1.5e")
<<msgCol<<" / "<<coutRed<< utils->floatToStr(varIntgrPrev,"%1.5e")
<<msgCol<<" / "<<coutBlue<< utils->floatToStr(varIntgrNow,"%1.5e")
Expand All @@ -1390,7 +1397,7 @@ void ANNZ::getRndMethodBestPDF(

if(canPrint && saveProfileHis) {
hisIntgrZmlm->SetTitle((TString)(TString)"RMS[C("+zTrgTitle+")] = "+utils->floatToStr(varIntgrNow,"%1.5e"));
hisIntgrZmlm->Write((TString)hisIntgrZmlm->GetName()+"_"+utils->intToStr(nLoops));
hisIntgrZmlm->Write((TString)hisIntgrZmlm->GetName()+"_"+utils->intToStr(nLoopNow));
}
}

Expand Down Expand Up @@ -1429,11 +1436,13 @@ void ANNZ::getRndMethodBestPDF(
// change one or more of the weights for the next iteration
// -----------------------------------------------------------------------------------------------------------
int nRnds(nRnds0);
if(rnd->Rndm() < 0.5) nRnds = max(nWgtsIn, nRnds + int(ceil(0.2 * rnd->Rndm() * nWgtsIn)));
if(rnd->Rndm() < 0.5) {
nRnds = max(nWgtsIn, nRnds + int(ceil(0.2 * rnd->Rndm() * nWgtsIn)));
}
for(int nRndNow=0; nRndNow<nRnds; nRndNow++) {
while(true) {
weightsNow[*indexItr] += 0.1 * rnd->Rndm();
if(weightsNow[*indexItr] >= 0 && weightsNow[*indexItr] <= 1) break;
weightsNow[*indexItr] += 0.05 * (2*rnd->Rndm() - 1);
if(weightsNow[*indexItr] >= 0) break;
}

indexItr++;
Expand All @@ -1446,14 +1455,21 @@ void ANNZ::getRndMethodBestPDF(
for(int nMLMnow=0; nMLMnow<nMLMs; nMLMnow++) { weightsNow[nMLMnow] /= sumWeights; }
}


// -----------------------------------------------------------------------------------------------------------
// clip any weights smaller than minPdfWeight, so long as at least minAcptMLMsForPDFs MLMs remain
// -----------------------------------------------------------------------------------------------------------
weightsBest = clipWeightsPDF(weightsBest, Log::DEBUG_1);

// -----------------------------------------------------------------------------------------------------------
// final message with the results
// -----------------------------------------------------------------------------------------------------------
tmpWeightV.clear();
for(int nMLMnow=0; nMLMnow<nMLMs; nMLMnow++) {
if(weightsBest[nMLMnow] > EPS) tmpWeightV.push_back(pair<int,double>(nMLMnow,weightsBest[nMLMnow]));
if(weightsBest[nMLMnow] < EPS) continue;
tmpWeightV.push_back(pair<int,double>(nMLMnow,weightsBest[nMLMnow]));
}

// sort so that the largest element is first
sort(tmpWeightV.begin(),tmpWeightV.end(),sortFunc::pairID::highToLowBy1);

Expand All @@ -1462,10 +1478,12 @@ void ANNZ::getRndMethodBestPDF(
TString MLMname = getTagName(tmpWeightV[nAcptMLMnow].first);
weightMsg += (TString)coutGreen+MLMname+":"+coutYellow+utils->floatToStr(tmpWeightV[nAcptMLMnow].second,"%1.3f")+" ";
}

aLOG(Log::INFO) <<coutPurple<<" - finished PDF optimization! - final minimization parameter: " <<coutRed
<<utils->floatToStr(varIntgrBest,"%1.5e")<<coutPurple<<" , PDF weights: "<<weightMsg<<coutDef<<endl;
aLOG(Log::DEBUG) <<coutGreen<<" ----------------------------------------------------------------------------- "<<coutDef<<endl;

aLOG(Log::INFO) <<coutPurple<<" - finished PDF optimization! --> final minimization parameter: "
<<coutRed<<utils->floatToStr(varIntgrBest,"%1.5e")<<coutDef<<endl;
aLOG(Log::INFO) <<coutPurple<<" final PDF weights: "<<weightMsg<<coutDef<<endl;
aLOG(Log::DEBUG) <<coutGreen<<" ----------------------------------------------------------------------------- "
<<coutDef<<endl;

// cleanup
DELNULL(var_1); DELNULL(intgrZchain);
Expand Down Expand Up @@ -1543,10 +1561,10 @@ void ANNZ::getRndMethodBestPDF(
weightV.clear(); mlmSkipPdf.clear(); updateIndexV.clear();
weightsNow.clear(); weightsPrev.clear(); weightsBest.clear();
excRange.clear(); intgrZ_valV.clear(); intgrZmlm_nEvtV.clear();
tmpWeightV.clear();
tmpWeightV.clear(); weightsClipped.clear();

// -----------------------------------------------------------------------------------------------------------
// call the old-style pdf function is needed
// call the old-style pdf function if needed
// -----------------------------------------------------------------------------------------------------------
if(nPDFs > 1) {
getOldStyleRndMethodBestPDF(
Expand All @@ -1559,6 +1577,118 @@ void ANNZ::getRndMethodBestPDF(
}


// ===========================================================================================================
/**
 * @brief            - clip a vector of weights.
 *
 * @details          - Returns a copy of the input weights in which the smallest entries are set to zero,
 *                     one at a time, for as long as some remaining weight is below minPdfWeight and more
 *                     than minAcptMLMsForPDFs weights (of at least EPS) remain. The surviving weights are
 *                     renormalized to a unit sum after every removal. If minPdfWeight is outside the
 *                     range [EPS,1), or if the input weights sum to less than EPS, the copy is returned
 *                     without any clipping or normalization.
 *
 * @param weightsIn  - vector of input weights (is not modified within the function)
 * @param logLevel   - log level used for the messages which report the clipping progress
 *
 * @return           - new vector of output weights, after clipping
 */
// ===========================================================================================================
vector < double > ANNZ::clipWeightsPDF(vector < double > & weightsIn, Log::LOGtypes logLevel) {
// ===========================================================================================================
  int    nMLMs              = glob->GetOptI("nMLMs");
  double minPdfWeight       = glob->GetOptF("minPdfWeight");
  int    minAcptMLMsForPDFs = glob->GetOptI("minAcptMLMsForPDFs");
  bool   printFullWeightV   = false; // debugging flag

  vector < pair<int,double> > tmpWeightV;
  vector < double >           weightsClipped(weightsIn);

  // never index beyond the end of the input, in case it holds fewer than nMLMs entries (e.g., is empty)
  int nWgts = std::min(nMLMs, static_cast<int>(weightsIn.size()));

  // -----------------------------------------------------------------------------------------------------------
  // only proceed if the minPdfWeight threshold is defined
  // -----------------------------------------------------------------------------------------------------------
  if(minPdfWeight < EPS || minPdfWeight >= 1) return weightsClipped;

  // -----------------------------------------------------------------------------------------------------------
  // just in case, normalize the original weights + do a sanity check, before getting started
  // -----------------------------------------------------------------------------------------------------------
  double sumWeights(0);
  for(int nMLMnow=0; nMLMnow<nWgts; nMLMnow++) { sumWeights += weightsClipped[nMLMnow]; }
  if(sumWeights < EPS) {
    aLOG(Log::WARNING) <<coutRed<<"Clipping attempted with sumWeights=0 !!! Something is horribly "
                       <<"wrong !!! - ANNZ::clipWeightsPDF() aborted ..."<<coutDef<<endl;

    return weightsClipped;
  }
  for(int nMLMnow=0; nMLMnow<nWgts; nMLMnow++) { weightsClipped[nMLMnow] /= sumWeights; }

  // collect the (index,weight) pairs of all non-negligible weights
  for(int nMLMnow=0; nMLMnow<nWgts; nMLMnow++) {
    if(weightsClipped[nMLMnow] < EPS) continue;
    tmpWeightV.push_back(pair<int,double>(nMLMnow,weightsClipped[nMLMnow]));
  }

  // sort so that the largest element is first
  sort(tmpWeightV.begin(),tmpWeightV.end(),sortFunc::pairID::highToLowBy1);

  // -----------------------------------------------------------------------------------------------------------
  // consecutively clip weights smaller than minPdfWeight, so long as at least minAcptMLMsForPDFs MLMs remain
  // -----------------------------------------------------------------------------------------------------------
  while(true) {
    // stop once none of the remaining weights is below the threshold
    bool canClip(false);
    for(int nAcptMLMnow=0; nAcptMLMnow<(int)tmpWeightV.size(); nAcptMLMnow++) {
      if(tmpWeightV[nAcptMLMnow].second < minPdfWeight) {
        canClip = true;
        break;
      }
    }
    if(!canClip) break;

    // stop if removing another element would leave too few MLMs
    if((int)tmpWeightV.size() <= minAcptMLMsForPDFs) {
      aLOG(logLevel) <<coutRed<<" - can not clip the next least-significant MLMs from "
                     <<"PDF ... need at least "<<coutYellow<<minAcptMLMsForPDFs<<coutRed
                     <<" to survive ..."<<coutDef<<endl;
      break;
    }

    // remove the last element of the sorted vector
    int nMLMminWgt = tmpWeightV.back().first;
    aLOG(logLevel) <<coutYellow<<" - clipping least-significant MLMs from PDF: " <<coutGreen
                   <<getTagName(nMLMminWgt)<<coutYellow<<" with weight "<<coutRed
                   <<utils->floatToStr(tmpWeightV.back().second,"%1.5e")<<coutYellow
                   <<" has been removed ... "<<coutDef<<endl;

    weightsClipped[nMLMminWgt] = 0;
    tmpWeightV.pop_back();

    // renormalize the surviving weights to a unit sum
    sumWeights = 0;
    for(int nAcptMLMnow=0; nAcptMLMnow<(int)tmpWeightV.size(); nAcptMLMnow++) {
      sumWeights += tmpWeightV[nAcptMLMnow].second;
    }
    if(sumWeights < EPS) {
      // nothing meaningful is left - fall back to the original input weights
      weightsClipped = weightsIn;

      aLOG(Log::WARNING) <<coutRed<<"Clipping resulted in sumWeights=0 !!! try adjusting the values "
                         <<"of \"minPdfWeight\" and/or \"minAcptMLMsForPDFs\". "
                         <<"ANNZ::clipWeightsPDF() aborted ..."<<coutDef<<endl;
      break;
    }

    for(int nAcptMLMnow=0; nAcptMLMnow<(int)tmpWeightV.size(); nAcptMLMnow++) {
      tmpWeightV[nAcptMLMnow].second /= sumWeights;
      weightsClipped[tmpWeightV[nAcptMLMnow].first] = tmpWeightV[nAcptMLMnow].second;
    }

    if(inLOG(logLevel) && printFullWeightV) {
      for(int nMLMnow=0; nMLMnow<nWgts; nMLMnow++) {
        aLOG(logLevel) <<coutGreen<<" - clipWeightsPDF: "<<coutBlue
                       <<nMLMnow<<CT<<coutGreen<<TString::Format("%1.4f",weightsIn[nMLMnow])<<" -> "
                       <<coutYellow<<TString::Format("%1.4f",weightsClipped[nMLMnow])
                       <<coutRed<<(TString)(nMLMnow == nMLMminWgt ? " CLIPPED" : "")<<coutDef<<endl;
      }
    }
  }
  if(printFullWeightV) aLOG(logLevel) << endl;

  tmpWeightV.clear();

  return weightsClipped;
}


// ===========================================================================================================
/**
* @brief - Generate PDF weighting schemes for randomized regression.
Expand Down
19 changes: 18 additions & 1 deletion src/ANNZ_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,17 @@ void ANNZ::Init() {
if(glob->GetOptC("MLMsToStore") == "") glob->SetOptC("MLMsToStore","BEST");
}

// check for errors in addOutputVars
vector<TString> addOutputVarsV = utils->splitStringByChar(glob->GetOptC("addOutputVars"),';');
for(int nOutputVarNow=0; nOutputVarNow<(int)addOutputVarsV.size(); nOutputVarNow++) {
TString outputVar(addOutputVarsV[nOutputVarNow]);

VERIFY(LOCATION,(TString)"Can not include "+outputVar+" in \"addOutputVars\", as "
+"it begins with \""+glob->GetOptC("basePrefix")+"\" ... "
+"did you mean to add it to \"MLMsToStore\" ???",
!(outputVar.BeginsWith(glob->GetOptC("basePrefix"))));
}

if(!glob->GetOptB("doRegression")) glob->SetOptB("doBiasCorPDF",false);

// number of PDF types - either generate no PDF, or choose up to two types
Expand Down Expand Up @@ -408,11 +419,17 @@ void ANNZ::Init() {

glob->SetOptC("optimCondReg","sig68");
}

if (glob->GetOptC("optimCondReg") == "sig68") glob->NewOptC("optimCondRegtitle", "#sigma_{68}");
else if(glob->GetOptC("optimCondReg") == "bias") glob->NewOptC("optimCondRegtitle", "Bias");
else if(glob->GetOptC("optimCondReg") == "fracSig68") glob->NewOptC("optimCondRegtitle", "f(2,3#sigma_{68})");
else VERIFY(LOCATION,(TString)"Configuration problem... \"optimCondReg\" should have one of the "+
"following values: \"sig68\", \"bias\" or \"fracSig68\" options ...",false);

if(glob->GetOptF("minPdfWeight") > 1 || glob->GetOptF("minPdfWeight") < EPS) {
aLOG(Log::WARNING) <<coutRed<<" - found minPdfWeight > 1 ... minPdfWeight will be ignored"<<coutDef<<endl;
glob->SetOptF("minPdfWeight", -1);
}
}

// set flag for generating uncertainty estimators for classification MLMs (needed for the second PDF in binned classification)
Expand All @@ -429,7 +446,7 @@ void ANNZ::Init() {

// a lower acceptance bound to check if too few MLMs are trained or if something went wrong with the optimization procedure
// (e.g., not enough trained MLMs have 'good' combinations of scatter, bias and outlier-fraction metrics).
int minAcptMLMsForPDFs(5);
int minAcptMLMsForPDFs(3);
if(glob->GetOptI("minAcptMLMsForPDFs") < minAcptMLMsForPDFs) {
aLOG(Log::WARNING) <<coutRed <<" - Found minAcptMLMsForPDFs = "
<<coutBlue<<glob->GetOptI("minAcptMLMsForPDFs")
Expand Down

0 comments on commit 88e2759

Please sign in to comment.