From e88bd2ca46c420d321c59992a467ece1c2b117dd Mon Sep 17 00:00:00 2001 From: pl0xz0rz Date: Wed, 5 Apr 2023 14:02:39 +0200 Subject: [PATCH 1/8] Added matrix transpose to predictRFStat --- RootInteractive/MLpipeline/MIForestErrPDF.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/RootInteractive/MLpipeline/MIForestErrPDF.py b/RootInteractive/MLpipeline/MIForestErrPDF.py index 57edcb8b..a1a37a44 100644 --- a/RootInteractive/MLpipeline/MIForestErrPDF.py +++ b/RootInteractive/MLpipeline/MIForestErrPDF.py @@ -50,22 +50,23 @@ def predictRFStat(rf, X, statDictionary,n_jobs): allRF = np.zeros((len(rf.estimators_), X.shape[0])) lock = threading.Lock() statOut={} - Parallel(n_jobs=n_jobs, verbose=rf.verbose,**_joblib_parallel_args(require="sharedmem"),)( + Parallel(n_jobs=n_jobs, verbose=rf.verbose,require="sharedmem")( delayed(_accumulate_prediction)(e.predict, X, allRF, col,lock) for col,e in enumerate(rf.estimators_) ) # - if "median" in statDictionary: statOut["median"]=np.median(allRF, 0) - if "mean" in statDictionary: statOut["mean"]=np.mean(allRF, 0) - if "std" in statDictionary: statOut["std"]=np.std(allRF, 0) + allRFTranspose = allRF.T + if "median" in statDictionary: statOut["median"]=np.median(allRFTranspose, 1) + if "mean" in statDictionary: statOut["mean"]=np.mean(allRFTranspose, 1) + if "std" in statDictionary: statOut["std"]=np.std(allRFTranspose, 1) if "quantile" in statDictionary: - statOut["quantiles"]={} - for quant in statDictionary["quantile"]: - statOut["quantiles"][quant]=np.quantile(allRF,quant,axis=0) + statOut["quantile"]={} + for quant in statDictionary["quantil"]: + statOut["quantile"][quant]=np.quantile(allRF,quant,axis=1) if "trim_mean" in statDictionary: statOut["trim_mean"]={} for quant in statDictionary["trim_mean"]: - statOut["trim_mean"][quant]=stats.trim_mean(allRF,quant,axis=0) + statOut["trim_mean"][quant]=stats.trim_mean(allRF,quant,axis=1) return statOut def predictRFStatNew(rf, X, 
statDictionary,n_jobs): """ @@ -371,4 +372,4 @@ def getImportance(self): impTree = np.zeros((len(self.trees[0]), len(self.trees[0][0]))) for row,tree in enumerate(self.trees[0]): impTree[row]=tree.feature_importances_ - return impTree.mean(axis=0) \ No newline at end of file + return impTree.mean(axis=0) From e61446b7150087034a1c2c647c7cea110a6b504b Mon Sep 17 00:00:00 2001 From: pl0xz0rz Date: Wed, 5 Apr 2023 14:47:58 +0200 Subject: [PATCH 2/8] Added ascontiguous --- RootInteractive/MLpipeline/MIForestErrPDF.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RootInteractive/MLpipeline/MIForestErrPDF.py b/RootInteractive/MLpipeline/MIForestErrPDF.py index a1a37a44..9a1e72eb 100644 --- a/RootInteractive/MLpipeline/MIForestErrPDF.py +++ b/RootInteractive/MLpipeline/MIForestErrPDF.py @@ -55,7 +55,7 @@ def predictRFStat(rf, X, statDictionary,n_jobs): for col,e in enumerate(rf.estimators_) ) # - allRFTranspose = allRF.T + allRFTranspose = allRF.T.ascontiguousarray() if "median" in statDictionary: statOut["median"]=np.median(allRFTranspose, 1) if "mean" in statDictionary: statOut["mean"]=np.mean(allRFTranspose, 1) if "std" in statDictionary: statOut["std"]=np.std(allRFTranspose, 1) From 2d45a2d779935a131a135561cecee1ddcb4eb0bd Mon Sep 17 00:00:00 2001 From: pl0xz0rz Date: Wed, 5 Apr 2023 14:54:12 +0200 Subject: [PATCH 3/8] bugfix --- RootInteractive/MLpipeline/MIForestErrPDF.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/RootInteractive/MLpipeline/MIForestErrPDF.py b/RootInteractive/MLpipeline/MIForestErrPDF.py index 9a1e72eb..de76c236 100644 --- a/RootInteractive/MLpipeline/MIForestErrPDF.py +++ b/RootInteractive/MLpipeline/MIForestErrPDF.py @@ -55,13 +55,13 @@ def predictRFStat(rf, X, statDictionary,n_jobs): for col,e in enumerate(rf.estimators_) ) # - allRFTranspose = allRF.T.ascontiguousarray() + allRFTranspose = allRF.T.copy(order='C') if "median" in statDictionary: statOut["median"]=np.median(allRFTranspose, 1) if "mean" 
in statDictionary: statOut["mean"]=np.mean(allRFTranspose, 1) if "std" in statDictionary: statOut["std"]=np.std(allRFTranspose, 1) if "quantile" in statDictionary: statOut["quantile"]={} - for quant in statDictionary["quantil"]: + for quant in statDictionary["quantile"]: statOut["quantile"][quant]=np.quantile(allRF,quant,axis=1) if "trim_mean" in statDictionary: statOut["trim_mean"]={} From e7f31a3fea9b41243241b6f40ebc202e70c467c8 Mon Sep 17 00:00:00 2001 From: pl0xz0rz Date: Thu, 6 Apr 2023 09:59:29 +0200 Subject: [PATCH 4/8] Replaced median algorithm with simpler one, should only help when other quantiles are used by reusing sorting --- RootInteractive/MLpipeline/MIForestErrPDF.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/RootInteractive/MLpipeline/MIForestErrPDF.py b/RootInteractive/MLpipeline/MIForestErrPDF.py index de76c236..1021320c 100644 --- a/RootInteractive/MLpipeline/MIForestErrPDF.py +++ b/RootInteractive/MLpipeline/MIForestErrPDF.py @@ -47,7 +47,8 @@ def predictRFStat(rf, X, statDictionary,n_jobs): :param n_jobs: number of parallel jobs for prediction :return: dictionary with requested output statistics """ - allRF = np.zeros((len(rf.estimators_), X.shape[0])) + nEstimators = len(rf.estimators_) + allRF = np.zeros((nEstimators, X.shape[0])) lock = threading.Lock() statOut={} Parallel(n_jobs=n_jobs, verbose=rf.verbose,require="sharedmem")( @@ -56,9 +57,11 @@ def predictRFStat(rf, X, statDictionary,n_jobs): ) # allRFTranspose = allRF.T.copy(order='C') - if "median" in statDictionary: statOut["median"]=np.median(allRFTranspose, 1) - if "mean" in statDictionary: statOut["mean"]=np.mean(allRFTranspose, 1) - if "std" in statDictionary: statOut["std"]=np.std(allRFTranspose, 1) + if "median" in statDictionary: + allRFTranspose = allRFTranspose.partition(nEstimators//2, -1) + statOut["median"]= allRFTranspose[:,nEstimators//2] + if "mean" in statDictionary: statOut["mean"]=np.mean(allRFTranspose, -1) + if "std" in 
statDictionary: statOut["std"]=np.std(allRFTranspose, -1) if "quantile" in statDictionary: statOut["quantile"]={} for quant in statDictionary["quantile"]: From f86c1f19a5d69aa05159a4bc0a33a1a4e9bcaacf Mon Sep 17 00:00:00 2001 From: pl0xz0rz Date: Thu, 6 Apr 2023 10:10:36 +0200 Subject: [PATCH 5/8] bugfix --- RootInteractive/MLpipeline/MIForestErrPDF.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RootInteractive/MLpipeline/MIForestErrPDF.py b/RootInteractive/MLpipeline/MIForestErrPDF.py index 1021320c..b2bfc608 100644 --- a/RootInteractive/MLpipeline/MIForestErrPDF.py +++ b/RootInteractive/MLpipeline/MIForestErrPDF.py @@ -58,7 +58,7 @@ def predictRFStat(rf, X, statDictionary,n_jobs): # allRFTranspose = allRF.T.copy(order='C') if "median" in statDictionary: - allRFTranspose = allRFTranspose.partition(nEstimators//2, -1) + allRFTranspose.partition(nEstimators//2, -1) statOut["median"]= allRFTranspose[:,nEstimators//2] if "mean" in statDictionary: statOut["mean"]=np.mean(allRFTranspose, -1) if "std" in statDictionary: statOut["std"]=np.std(allRFTranspose, -1) From 6a7a219f51b31696f2aedb2915e4f81b74847236 Mon Sep 17 00:00:00 2001 From: pl0xz0rz Date: Thu, 6 Apr 2023 10:51:29 +0200 Subject: [PATCH 6/8] Using joblib.Parallel on median --- RootInteractive/MLpipeline/MIForestErrPDF.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/RootInteractive/MLpipeline/MIForestErrPDF.py b/RootInteractive/MLpipeline/MIForestErrPDF.py index b2bfc608..1d92c5a8 100644 --- a/RootInteractive/MLpipeline/MIForestErrPDF.py +++ b/RootInteractive/MLpipeline/MIForestErrPDF.py @@ -37,6 +37,9 @@ def _accumulate_predictionNL(predict, X, out,col): prediction = predict(X, check_input=False) out[col] += prediction +def partitionBlock(allRF, k, begin, end): + allRF[begin:end].partition(k) + def predictRFStat(rf, X, statDictionary,n_jobs): """ inspired by https://github.com/scikit-learn/scikit-learn/blob/37ac6788c/sklearn/ensemble/_forest.py#L1410 @@ 
-58,7 +61,14 @@ def predictRFStat(rf, X, statDictionary,n_jobs): # allRFTranspose = allRF.T.copy(order='C') if "median" in statDictionary: - allRFTranspose.partition(nEstimators//2, -1) + blockSize = X.shape[0] // n_jobs + 1 + block_begin = arange(0, X.shape[0], blockSize) + block_end = block_begin[1:] + block_end.append(X.shape[0]) + Parallel(n_jobs=n_jobs, verbose=rf.verbose, require="sharedmem")( + delayed(partitionBlock)(allRFTranspose, nEstimators // 2, first, last) + for first, last in zip(block_begin, block_end) + statOut["median"]= allRFTranspose[:,nEstimators//2] if "mean" in statDictionary: statOut["mean"]=np.mean(allRFTranspose, -1) if "std" in statDictionary: statOut["std"]=np.std(allRFTranspose, -1) From 2a9fe041833a46975be162fcebcc32349cc39ca9 Mon Sep 17 00:00:00 2001 From: pl0xz0rz Date: Thu, 6 Apr 2023 10:53:09 +0200 Subject: [PATCH 7/8] Fixed missing parenthesis --- RootInteractive/MLpipeline/MIForestErrPDF.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RootInteractive/MLpipeline/MIForestErrPDF.py b/RootInteractive/MLpipeline/MIForestErrPDF.py index 1d92c5a8..71047373 100644 --- a/RootInteractive/MLpipeline/MIForestErrPDF.py +++ b/RootInteractive/MLpipeline/MIForestErrPDF.py @@ -68,7 +68,7 @@ def predictRFStat(rf, X, statDictionary,n_jobs): Parallel(n_jobs=n_jobs, verbose=rf.verbose, require="sharedmem")( delayed(partitionBlock)(allRFTranspose, nEstimators // 2, first, last) for first, last in zip(block_begin, block_end) - + ) statOut["median"]= allRFTranspose[:,nEstimators//2] if "mean" in statDictionary: statOut["mean"]=np.mean(allRFTranspose, -1) if "std" in statDictionary: statOut["std"]=np.std(allRFTranspose, -1) From 51230967d64139d36462c191786cf3e762eef798 Mon Sep 17 00:00:00 2001 From: pl0xz0rz Date: Thu, 6 Apr 2023 11:02:10 +0200 Subject: [PATCH 8/8] fixed typo in range --- RootInteractive/MLpipeline/MIForestErrPDF.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git
a/RootInteractive/MLpipeline/MIForestErrPDF.py b/RootInteractive/MLpipeline/MIForestErrPDF.py index 71047373..615a586e 100644 --- a/RootInteractive/MLpipeline/MIForestErrPDF.py +++ b/RootInteractive/MLpipeline/MIForestErrPDF.py @@ -62,7 +62,7 @@ def predictRFStat(rf, X, statDictionary,n_jobs): allRFTranspose = allRF.T.copy(order='C') if "median" in statDictionary: blockSize = X.shape[0] // n_jobs + 1 - block_begin = arange(0, X.shape[0], blockSize) + block_begin = list(range(0, X.shape[0], blockSize)) block_end = block_begin[1:] block_end.append(X.shape[0]) Parallel(n_jobs=n_jobs, verbose=rf.verbose, require="sharedmem")(