From b779b3992b2d0d1283a97de947bfc2131ed468bd Mon Sep 17 00:00:00 2001 From: pl0xz0rz Date: Mon, 6 Nov 2023 14:30:04 +0100 Subject: [PATCH 1/5] Added first implementation fo rolling quantile using insertion sort --- .../Tools/RDataFrame/RDataFrame_Array.py | 5 +- .../Tools/RDataFrame/RollingQuantile.cpp | 64 +++++++++++++++++++ .../Tools/RDataFrame/test_RDataFrame_Array.py | 2 + 3 files changed, 70 insertions(+), 1 deletion(-) diff --git a/RootInteractive/Tools/RDataFrame/RDataFrame_Array.py b/RootInteractive/Tools/RDataFrame/RDataFrame_Array.py index db1d7ee7..8847e21a 100644 --- a/RootInteractive/Tools/RDataFrame/RDataFrame_Array.py +++ b/RootInteractive/Tools/RDataFrame/RDataFrame_Array.py @@ -257,7 +257,8 @@ def visit_Rolling(self, node: ast.Call): "rollingSum": "rolling_sum", "rollingMean": "rolling_mean", "rollingStd": "rolling_std", - "rollingMedian": "rolling_median" + "rollingMedian": "rolling_median", + "rollingQuantile": "rolling_quantile" } time_arr_name="" rolling_statistic_name = rollingStatistics[node.func.id] @@ -271,6 +272,8 @@ def visit_Rolling(self, node: ast.Call): if scalar_type(unpackScalarType(scalar_type_str(time_arr["type"]), 1))[0] == 'o': raise TypeError("Weights array for rolling sum must be of numeric data type") time_arr_name = f", {time_arr['implementation']}.begin()" + if rolling_statistic_name == "rolling_quantile": + init = f'self.visit(keywords["quantile"])["value"]' new_helper_id = self.helpervar_idx self.helpervar_idx += 1 self.helpervar_stmt.append((0, f""" diff --git a/RootInteractive/Tools/RDataFrame/RollingQuantile.cpp b/RootInteractive/Tools/RDataFrame/RollingQuantile.cpp index 18b20c61..d2dc7f12 100644 --- a/RootInteractive/Tools/RDataFrame/RollingQuantile.cpp +++ b/RootInteractive/Tools/RDataFrame/RollingQuantile.cpp @@ -102,6 +102,70 @@ OutputIt rolling_median_weighted(InputIt first, InputIt last, TimeIt t_first, Ou return d_first; } +// For small windows, insertion sort has less overhead than the heap or Las Vegas algorithms +template +OutputIt rolling_quantile(InputIt first, InputIt last, OutputIt d_first, size_t window, double quantile, bool center){ + std::vector sorted; + size_t rolling_pos = 0; + unsigned long count = 0; + unsigned long idx_insert; + size_t n_skip = window >> 1; + InputIt window_end = first; + for(;window_end < last && window_end < first+window; ++window_end){ + ++count; + sorted.push_back(window_end); + for(auto insert_pos = sorted.end()-1; insert_pos > sorted.begin(); --insert_pos){ + if(*(insert_pos-1) <= *insert_pos){ + break; + } + InputIt tmp = *insert_pos; + *insert_pos = insert_pos[-1]; + insert_pos[-1] = tmp; + } + if(count > n_skip || !center){ + *d_first = *sorted[(size_t)(count*quantile)]; + ++d_first; + } + ++rolling_pos; + } + while(window_end < last){ + size_t found; + for(found=0; found0 && *sorted[found-1] > *sorted[found]){ + InputIt tmp = sorted[found]; + sorted[found] = sorted[found-1]; + sorted[found-1] = tmp; + --found; + } + ++first; + ++window_end; + *d_first = *sorted[(size_t)(count*quantile)]; + ++d_first; + } + if(center){ + for(--count;count>n_skip;count--){ + sorted.erase(std::remove(sorted.begin(), sorted.end(), first)); + ++first; + *d_first = *sorted[(size_t)(count*quantile)]; + ++d_first; + } + } + return d_first; + +} + + } #endif diff --git a/RootInteractive/Tools/RDataFrame/test_RDataFrame_Array.py b/RootInteractive/Tools/RDataFrame/test_RDataFrame_Array.py index 51f9b265..442eb2db 100644 --- a/RootInteractive/Tools/RDataFrame/test_RDataFrame_Array.py +++ b/RootInteractive/Tools/RDataFrame/test_RDataFrame_Array.py @@ -237,6 +237,8 @@ def test_define2(rdf): print(rdf.Sum("arrayRollingMedian1D0").GetValue()) rdf = makeDefine("arrayRollingMedian1D1", "abs(rollingMedian(array1D0, 7, center=True)[3:-3] - array1D0[3:-3])", rdf, None, 3) print(rdf.Sum("arrayRollingMedian1D1").GetValue()) + rdf = makeDefine("arrayRollingMedian1D2", "abs(rollingQuantile(array1D0, 7, center=True, quantile=.5)[3:-3] - array1D0[3:-3])", rdf, None, 3) + print(rdf.Sum("arrayRollingMedian1D2").GetValue()) return rdf def test_exception(rdf): From 0e1efa3472c905cf9e3f5f8e9e80837606f021de Mon Sep 17 00:00:00 2001 From: pl0xz0rz Date: Mon, 6 Nov 2023 14:59:57 +0100 Subject: [PATCH 2/5] Fixed a buffer overflow --- RootInteractive/Tools/RDataFrame/RDataFrame_Array.py | 2 +- RootInteractive/Tools/RDataFrame/RollingQuantile.cpp | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/RootInteractive/Tools/RDataFrame/RDataFrame_Array.py b/RootInteractive/Tools/RDataFrame/RDataFrame_Array.py index 8847e21a..8025554a 100644 --- a/RootInteractive/Tools/RDataFrame/RDataFrame_Array.py +++ b/RootInteractive/Tools/RDataFrame/RDataFrame_Array.py @@ -305,7 +305,7 @@ def visit_Call(self, node: ast.Call): return {"type":('u',64), "implementation":f"({bsearch_names[node.func.id]}({searched_arr['implementation']}.begin(), {searched_arr['implementation']}.end(), {query['implementation']}, {cmp['implementation']})-{searched_arr['implementation']}.begin())"} else: raise TypeError(f"Expected 2 or 3 arguments, got {len(node.args)}") - elif isinstance(node.func, ast.Name) and node.func.id in ["rollingSum", "rollingMean", "rollingStd", "rollingMedian"]: + elif isinstance(node.func, ast.Name) and node.func.id in ["rollingSum", "rollingMean", "rollingStd", "rollingMedian", "rollingQuantile"]: return self.visit_Rolling(node) args = [self.visit(iArg) for iArg in node.args] left = self.visit_func(node.func, args) diff --git a/RootInteractive/Tools/RDataFrame/RollingQuantile.cpp b/RootInteractive/Tools/RDataFrame/RollingQuantile.cpp index d2dc7f12..1c707396 100644 --- a/RootInteractive/Tools/RDataFrame/RollingQuantile.cpp +++ b/RootInteractive/Tools/RDataFrame/RollingQuantile.cpp @@ -110,6 +110,13 @@ OutputIt rolling_quantile(InputIt first, InputIt last, OutputIt d_first, size_t unsigned long count = 0; unsigned long idx_insert; size_t n_skip = window >> 1; + // Sanity checks on quantile to prevent buffer overflow + if(quantile >= 1.){ + quantile = 1.; + } + if(quantile <= 0. || quantile != quantile){ + quantile = 0.; + } InputIt window_end = first; for(;window_end < last && window_end < first+window; ++window_end){ ++count; From 0573e9ae366e2ae133825694a8149b4d964bdfe8 Mon Sep 17 00:00:00 2001 From: pl0xz0rz Date: Mon, 6 Nov 2023 15:03:49 +0100 Subject: [PATCH 3/5] Wrapped a python template in curly braces --- RootInteractive/Tools/RDataFrame/RDataFrame_Array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RootInteractive/Tools/RDataFrame/RDataFrame_Array.py b/RootInteractive/Tools/RDataFrame/RDataFrame_Array.py index 8025554a..61524ce3 100644 --- a/RootInteractive/Tools/RDataFrame/RDataFrame_Array.py +++ b/RootInteractive/Tools/RDataFrame/RDataFrame_Array.py @@ -273,7 +273,7 @@ def visit_Rolling(self, node: ast.Call): raise TypeError("Weights array for rolling sum must be of numeric data type") time_arr_name = f", {time_arr['implementation']}.begin()" if rolling_statistic_name == "rolling_quantile": - init = f'self.visit(keywords["quantile"])["value"]' + init = f', {self.visit(keywords["quantile"])["value"]}' new_helper_id = self.helpervar_idx self.helpervar_idx += 1 self.helpervar_stmt.append((0, f""" From 1d37097fafbab25f875463e82df8fd6472eeffb0 Mon Sep 17 00:00:00 2001 From: pl0xz0rz Date: Mon, 6 Nov 2023 15:13:42 +0100 Subject: [PATCH 4/5] Added rolling quantile for time index --- .../Tools/RDataFrame/RollingQuantile.cpp | 37 +++++++++++++++++++ .../Tools/RDataFrame/test_RDataFrame_Array.py | 2 + 2 files changed, 39 insertions(+) diff --git a/RootInteractive/Tools/RDataFrame/RollingQuantile.cpp b/RootInteractive/Tools/RDataFrame/RollingQuantile.cpp index 1c707396..c702069f 100644 --- a/RootInteractive/Tools/RDataFrame/RollingQuantile.cpp +++ b/RootInteractive/Tools/RDataFrame/RollingQuantile.cpp @@ -172,6 +172,43 @@ OutputIt rolling_quantile(InputIt first, InputIt last, OutputIt d_first, size_t } +// TODO: If performance is bad, add a better algorithm than brute force +template +OutputIt rolling_quantile_weighted(InputIt first, InputIt last, TimeIt t_first, OutputIt d_first, DistT window, double quantile, bool center){ + std::vector window_contents; + InputIt low = first; + InputIt high = first; + TimeIt t_low = t_first; + TimeIt t_high = t_first; + DistT off_low = center ? -window/2 : 0; + DistT off_high = center ? window/2 : window; + // Sanity checks on quantile to prevent buffer overflow + if(quantile >= 1.){ + quantile = 1.; + } + if(quantile <= 0. || quantile != quantile){ + quantile = 0.; + } + for(;first *t_high){ + ++high; + ++t_high; + } + while(*t_first + off_low > *t_low){ + ++low; + ++t_low; + } + window_contents.clear(); + for(InputIt j = low; j Date: Tue, 7 Nov 2023 16:26:11 +0100 Subject: [PATCH 5/5] Simplified unbinned histogram mean/std algorithm, changed to more numerically stable Welford variance --- .../bokeh/HistoNdProfile.ts | 44 +++++++------------ 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/RootInteractive/InteractiveDrawing/bokeh/HistoNdProfile.ts b/RootInteractive/InteractiveDrawing/bokeh/HistoNdProfile.ts index dbcae1a3..dbd54225 100644 --- a/RootInteractive/InteractiveDrawing/bokeh/HistoNdProfile.ts +++ b/RootInteractive/InteractiveDrawing/bokeh/HistoNdProfile.ts @@ -167,19 +167,30 @@ export class HistoNdProfile extends ColumnarDataSource { entries_total += entries if(entries > 0){ let mean = 0 + let std = 0 + let entries_ = 0 if(unbinned){ + let old_mean = 0 for (let y = 0; y < stride_high; y+= stride_low) { const index_low = x+y+z ? global_cumulative_histogram![x+y+z-1] : 0 const index_high = global_cumulative_histogram![x+y+z] if(sorted_weights == null){ for(let i=index_low; i < index_high; ++i){ - if (isFinite(sorted_entries![i])) - mean += sorted_entries![i] + if (isFinite(sorted_entries![i])){ + entries_ ++ + old_mean = mean + mean += (sorted_entries![i] - mean) / entries_ + std += (sorted_entries![i] - mean) * (sorted_entries![i] - old_mean) + } } } else { for(let i=index_low; i < index_high; ++i){ - if (isFinite(sorted_entries![i])) - mean += sorted_entries![i] * sorted_weights[i] + if (isFinite(sorted_entries![i])){ + entries_ += sorted_weights[i] + old_mean = mean + mean += (sorted_entries![i] - mean) * sorted_weights[i] / entries_ + std += (sorted_entries![i] - mean) * (sorted_entries![i] - old_mean) * sorted_weights[i] + } } } } @@ -188,29 +199,8 @@ export class HistoNdProfile extends ColumnarDataSource { for (let y = 0; y < stride_high; y+= stride_low) { mean += bin_count[x+y+z] * bin_centers[x+y+z] } - } - let std = 0 - mean /= entries - if(unbinned){ - for (let y = 0; y < stride_high; y+= stride_low) { - const index_low = x+y+z ? global_cumulative_histogram![x+y+z-1] : 0 - const index_high = global_cumulative_histogram![x+y+z] - if(sorted_weights == null){ - for(let i=index_low; i < index_high; ++i){ - if (isFinite(sorted_entries![i])) - std += sorted_entries![i] * sorted_entries![i] - } - } else { - for(let i=index_low; i < index_high; ++i){ - if (isFinite(sorted_entries![i])) - std += sorted_entries![i] * sorted_entries![i] * sorted_weights[i] - } - } - } - std -= mean*mean*entries - } else - { - for (let y = 0; y < stride_high; y+= stride_low) { + mean /= entries + for (let y = 0; y < stride_high; y+= stride_low) { std += (bin_centers[x+y+z] - mean) * (bin_centers[x+y+z] - mean) * bin_count[x+y+z] } }