Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework iteration to avoid overflow #68

Merged
merged 8 commits into from
Oct 24, 2017
8 changes: 4 additions & 4 deletions examples/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,24 +121,24 @@ fn quantiles<R: BufRead, W: Write>(mut reader: R, mut writer: W, quantile_precis
let mut sum = 0;
for v in hist.iter_quantiles(ticks_per_half) {
sum += v.count_since_last_iteration();
if v.quantile() < 1.0 {
if v.quantile_iterated_to() < 1.0 {
writer.write_all(
format!(
"{:12} {:1.*} {:1.*} {:10} {:14.2}\n",
v.value(),
v.value_iterated_to(),
quantile_precision,
v.quantile(),
quantile_precision,
v.quantile_iterated_to(),
sum,
1_f64 / (1_f64 - v.quantile())
1_f64 / (1_f64 - v.quantile_iterated_to())
).as_ref(),
)?;
} else {
writer.write_all(
format!(
"{:12} {:1.*} {:1.*} {:10} {:>14}\n",
v.value(),
v.value_iterated_to(),
quantile_precision,
v.quantile(),
quantile_precision,
Expand Down
12 changes: 4 additions & 8 deletions src/iterators/all.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use Counter;
use Histogram;
use iterators::{HistogramIterator, PickyIterator};
use iterators::{HistogramIterator, PickyIterator, PickMetadata};

/// An iterator that will yield every bin.
pub struct Iter { visited: Option<usize> }
Expand All @@ -13,21 +13,17 @@ impl Iter {
}

impl<T: Counter> PickyIterator<T> for Iter {
fn pick(&mut self, index: usize, _: u64) -> bool {
fn pick(&mut self, index: usize, _: u64, _: T) -> Option<PickMetadata> {
if self.visited.map(|i| i != index).unwrap_or(true) {
// haven't visited this index yet
self.visited = Some(index);
true
Some(PickMetadata::new(None, None))
} else {
false
None
}
}

fn more(&mut self, _: usize) -> bool {
true
}

fn quantile_iterated_to(&self) -> Option<f64> {
None
}
}
26 changes: 13 additions & 13 deletions src/iterators/linear.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use Counter;
use Histogram;
use iterators::{HistogramIterator, PickyIterator};
use iterators::{HistogramIterator, PickyIterator, PickMetadata};

/// An iterator that will yield at fixed-size steps through the histogram's value range.
pub struct Iter<'a, T: 'a + Counter> {
Expand Down Expand Up @@ -31,28 +31,28 @@ impl<'a, T: 'a + Counter> Iter<'a, T> {
}

impl<'a, T: 'a + Counter> PickyIterator<T> for Iter<'a, T> {
fn pick(&mut self, index: usize, _: u64) -> bool {
fn pick(&mut self, index: usize, _: u64, _: T) -> Option<PickMetadata> {
let val = self.hist.value_for(index);
if val >= self.current_step_lowest_value_reporting_level || index == self.hist.last_index() {
let metadata = PickMetadata::new(None, Some(self.current_step_highest_value_reporting_level));
self.current_step_highest_value_reporting_level += self.value_units_per_bucket;
self.current_step_lowest_value_reporting_level = self.hist
.lowest_equivalent(self.current_step_highest_value_reporting_level);
true
.lowest_equivalent(self.current_step_highest_value_reporting_level);
Some(metadata)
} else {
false
None
}
}

fn more(&mut self, index: usize) -> bool {
fn more(&mut self, index_to_pick: usize) -> bool {
// If the next iterate will not move to the next sub bucket index (which is empty if
// if we reached this point), then we are not yet done iterating (we want to iterate
// until we are no longer on a value that has a count, rather than util we first reach
// until we are no longer on a value that has a count, rather than until we first reach
// the last value that has a count. The difference is subtle but important)...
// TODO index + 1 could overflow 16-bit usize
self.current_step_highest_value_reporting_level + 1 < self.hist.value_for(index + 1)
}

fn quantile_iterated_to(&self) -> Option<f64> {
None
// When this is called, we're about to begin the "next" iteration, so
// current_step_highest_value_reporting_level has already been incremented,
// and we use it without incrementing its value.
let next_index = index_to_pick.checked_add(1).expect("usize overflow");
self.current_step_highest_value_reporting_level < self.hist.value_for(next_index)
}
}
19 changes: 8 additions & 11 deletions src/iterators/log.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use Counter;
use Histogram;
use iterators::{HistogramIterator, PickyIterator};
use iterators::{HistogramIterator, PickyIterator, PickMetadata};

/// An iterator that will yield at log-size steps through the histogram's value range.
pub struct Iter<'a, T: 'a + Counter> {
Expand Down Expand Up @@ -37,31 +37,28 @@ impl<'a, T: 'a + Counter> Iter<'a, T> {
}

impl<'a, T: 'a + Counter> PickyIterator<T> for Iter<'a, T> {
fn pick(&mut self, index: usize, _: u64) -> bool {
fn pick(&mut self, index: usize, _: u64, _: T) -> Option<PickMetadata> {
let val = self.hist.value_for(index);
if val >= self.current_step_lowest_value_reporting_level || index == self.hist.last_index() {
let metadata = PickMetadata::new(None, Some(self.current_step_highest_value_reporting_level));
// implies log_base must be > 1.0
self.next_value_reporting_level *= self.log_base;
// won't underflow since next_value_reporting_level starts > 0 and only grows
self.current_step_highest_value_reporting_level = self.next_value_reporting_level as u64 - 1;
self.current_step_lowest_value_reporting_level = self.hist
.lowest_equivalent(self.current_step_highest_value_reporting_level);
true
.lowest_equivalent(self.current_step_highest_value_reporting_level);
Some(metadata)
} else {
false
None
}
}

fn more(&mut self, next_index: usize) -> bool {
fn more(&mut self, index_to_pick: usize) -> bool {
// If the next iterate will not move to the next sub bucket index (which is empty if if we
// reached this point), then we are not yet done iterating (we want to iterate until we are
// no longer on a value that has a count, rather than util we first reach the last value
// that has a count. The difference is subtle but important)...
self.hist.lowest_equivalent(self.next_value_reporting_level as u64) <
self.hist.value_for(next_index)
}

fn quantile_iterated_to(&self) -> Option<f64> {
None
self.hist.value_for(index_to_pick)
}
}
150 changes: 84 additions & 66 deletions src/iterators/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,44 @@ pub mod recorded;
/// An iterator that iterates over histogram values.
pub mod all;

/// Extra information about the picked point in the histogram provided by the picker.
pub struct PickMetadata {
/// Supply the quantile iterated to in the last `pick()`, if available. If `None` is provided,
/// the quantile of the current value will be used instead. Probably only useful for the
/// quantile iterator.
quantile_iterated_to: Option<f64>,

/// Supply the value iterated to in the last `pick()`, if the picker can supply a more useful
/// value than the largest value represented by the bucket.
value_iterated_to: Option<u64>
}

impl PickMetadata {
fn new(quantile_iterated_to: Option<f64>, value_iterated_to: Option<u64>) -> PickMetadata {
PickMetadata {
quantile_iterated_to,
value_iterated_to
}
}
}

/// A trait for designing an subset iterator over values in a `Histogram`.
pub trait PickyIterator<T: Counter> {
/// should an item be yielded for the given index?
/// Return `Some` if an `IterationValue` should be emitted at this point.
///
/// `index` is a valid index in the relevant histogram.
fn pick(&mut self, index: usize, total_count_to_index: u64) -> bool;
/// should we keep iterating even though all future indices are zeros?
fn more(&mut self, index: usize) -> bool;

/// Supply the quantile iterated to in the last `pick()`, if available. If `None` is returned,
/// the quantile of the current value will be used instead. Probably only useful for the
/// quantile iterator.
fn quantile_iterated_to(&self) -> Option<f64>;
///
/// This will be called with the same index until it returns `None`. This enables modes of
/// iteration that pick different values represented by the same bucket, for instance.
fn pick(&mut self, index: usize, total_count_to_index: u64, count_at_index: T) -> Option<PickMetadata>;

/// Should we keep iterating even though the last index with non-zero count has already been
/// picked at least once?
///
/// This will be called on every iteration once the last index with non-zero count has been
/// picked, even if the index was not advanced in the last iteration (because `pick()` returned
/// `Some`).
fn more(&mut self, index_to_pick: usize) -> bool;
}

/// `HistogramIterator` provides a base iterator for a `Histogram`.
Expand All @@ -37,17 +63,14 @@ pub trait PickyIterator<T: Counter> {
/// sophisticated iterators, a *picker* is also provided, which is allowed to only select some bins
/// that should be yielded. The picker may also extend the iteration to include a suffix of empty
/// bins.
///
/// One peculiarity of this iterator is that, if the picker does choose to yield a particular bin,
/// that bin *is re-visited* before moving on to later bins. It is not clear why this is, but it is
/// how the iterators were implemented in the original HdrHistogram, so we preserve the behavior
/// here. This is the reason why iterators such as all and recorded need to keep track of which
/// indices they have already visited.
pub struct HistogramIterator<'a, T: 'a + Counter, P: PickyIterator<T>> {
hist: &'a Histogram<T>,
total_count_to_index: u64,
prev_total_count: u64,
count_since_last_iteration: u64,
count_at_index: T,
current_index: usize,
last_picked_index: usize,
max_value_index: usize,
fresh: bool,
ended: bool,
picker: P,
Expand All @@ -56,7 +79,7 @@ pub struct HistogramIterator<'a, T: 'a + Counter, P: PickyIterator<T>> {
/// The value emitted at each step when iterating over a `Histogram`.
#[derive(Debug, PartialEq)]
pub struct IterationValue<T: Counter> {
value: u64,
value_iterated_to: u64,
quantile: f64,
quantile_iterated_to: f64,
count_at_value: T,
Expand All @@ -65,34 +88,35 @@ pub struct IterationValue<T: Counter> {

impl<T: Counter> IterationValue<T> {
/// Create a new IterationValue.
pub fn new(value: u64, quantile: f64, quantile_iterated_to: f64, count_at_value: T,
pub fn new(value_iterated_to: u64, quantile: f64, quantile_iterated_to: f64, count_at_value: T,
count_since_last_iteration: u64) -> IterationValue<T> {
IterationValue {
value,
value_iterated_to,
quantile,
quantile_iterated_to,
count_at_value,
count_since_last_iteration
}
}

/// The lowest value stored in the current histogram bin
pub fn value(&self) -> u64 {
self.value
/// The value iterated to. Some iterators provide a specific value inside the bucket, while
/// others just use the highest value in the bucket.
pub fn value_iterated_to(&self) -> u64 {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe this rename-for-clarity is not worth the compatibility concern?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's fine.

self.value_iterated_to
}

/// Percent of recorded values that are equivalent to or below `value`.
/// Percent of recorded values that are at or below the current bucket.
/// This is simply the quantile multiplied by 100.0, so if you care about maintaining the best
/// floating-point precision, use `quantile()` instead.
pub fn percentile(&self) -> f64 {
self.quantile * 100.0
}

/// Quantile of recorded values that are equivalent to or below `value`
/// Quantile of recorded values that are at or below the current bucket.
pub fn quantile(&self) -> f64 { self.quantile }

/// Quantile iterated to, which in the case of quantile iteration may be different from
/// `quantile` because slightly different quantiles can still map to the same bucket.
/// Quantile iterated to, which may be different than `quantile()` when an iterator provides
/// information about the specific quantile it's iterating to.
pub fn quantile_iterated_to(&self) -> f64 { self.quantile_iterated_to }

/// Recorded count for values equivalent to `value`
Expand All @@ -111,25 +135,16 @@ impl<'a, T: Counter, P: PickyIterator<T>> HistogramIterator<'a, T, P> {
HistogramIterator {
hist: h,
total_count_to_index: 0,
prev_total_count: 0,
count_since_last_iteration: 0,
count_at_index: T::zero(),
current_index: 0,
last_picked_index: 0,
max_value_index: h.index_for(h.max()).expect("Either 0 or an existing index"),
picker,
fresh: true,
ended: false,
}
}

fn current(&self) -> IterationValue<T> {
let quantile = self.total_count_to_index as f64 / self.hist.count() as f64;
IterationValue {
value: self.hist.highest_equivalent(self.hist.value_for(self.current_index)),
quantile,
quantile_iterated_to: self.picker.quantile_iterated_to().unwrap_or(quantile),
count_at_value: self.hist.count_at_index(self.current_index)
.expect("current index cannot exceed counts length"),
count_since_last_iteration: self.total_count_to_index - self.prev_total_count
}
}
}

impl<'a, T: 'a, P> Iterator for HistogramIterator<'a, T, P>
Expand All @@ -155,50 +170,53 @@ impl<'a, T: 'a, P> Iterator for HistogramIterator<'a, T, P>
return None;
}

// TODO should check if we've reached max, not count, to avoid early termination
// on histograms with very large counts whose total would exceed u64::max_value()
// Have we yielded all non-zeros in the histogram?
let total = self.hist.count();
if self.prev_total_count == total {
// Have we already picked the index with the last non-zero count in the histogram?
if self.last_picked_index >= self.max_value_index {
// is the picker done?
if !self.picker.more(self.current_index) {
self.ended = true;
return None;
}

// nope -- alright, let's keep iterating
} else {
// nope -- alright, let's keep iterating
assert!(self.current_index < self.hist.len());
assert!(self.prev_total_count < total);

if self.fresh {
let count = self.hist.count_at_index(self.current_index)
.expect("Already checked that current_index is < counts len");

// if we've seen all counts, no other counts should be non-zero
if self.total_count_to_index == total {
// TODO this can fail when total count overflows
assert_eq!(count, T::zero());
}
// at a new index, and not past the max, so there's nonzero counts to add
self.count_at_index = self.hist.count_at_index(self.current_index)
.expect("Already checked that current_index is < counts len");

// TODO overflow
self.total_count_to_index = self.total_count_to_index + count.as_u64();
self.total_count_to_index =
self.total_count_to_index.saturating_add(self.count_at_index.as_u64());
self.count_since_last_iteration =
self.count_since_last_iteration.saturating_add(self.count_at_index.as_u64());

// make sure we don't add this index again
self.fresh = false;
}
}

// figure out if picker thinks we should yield this value
if self.picker.pick(self.current_index, self.total_count_to_index) {
let val = self.current();
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I felt it was easier to reason at-a-glance about when the iterator's fields were used with this function inlined, since it matters that you read things like count_since_last_iteration before resetting a few lines below.


// note that we *don't* increment self.current_index here. the picker will be
// exposed to the same value again after yielding. not sure why this is the
// behavior we want, but it's what the original Java implementation dictates.

// TODO count starting at 0 each time we emit a value to be less prone to overflow
self.prev_total_count = self.total_count_to_index;
if let Some(metadata) = self.picker.pick(self.current_index, self.total_count_to_index, self.count_at_index) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think rustfmt might complain about this line.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. I haven't been rustfmt-ing but I'm happy to adopt it. I don't even really care if its formatting is suboptimal for some aesthetic; consistency is good. I'll format the iterator files.

let quantile = self.total_count_to_index as f64 / self.hist.count() as f64;
let val = IterationValue {
value_iterated_to: metadata.value_iterated_to
.unwrap_or(self.hist.highest_equivalent(self.hist.value_for(self.current_index))),
quantile,
quantile_iterated_to: metadata.quantile_iterated_to.unwrap_or(quantile),
count_at_value: self.hist.count_at_index(self.current_index)
.expect("current index cannot exceed counts length"),
count_since_last_iteration: self.count_since_last_iteration
};

// Note that we *don't* increment self.current_index here. The picker will be
// exposed to the same value again after yielding. This is to allow a picker to
// pick multiple times at the same index. An example of this is how the linear
// picker may be using a step size smaller than the bucket size, so it should
// step multiple times without advancing the index.

self.count_since_last_iteration = 0;
self.last_picked_index = self.current_index;
return Some(val);
}

Expand Down
Loading