Skip to content

Commit

Permalink
Track HTTP request latencies in nanoseconds (#6470)
Browse files Browse the repository at this point in the history
- Change the `http_service:request_latency_histogram` to express
latencies in nanoseconds, as a histogram of u64s.
- Update the latency tracking types in `oximeter_instruments::http` to
use new bin type, and clean up some of the documentation.
- Expunged old timeseries schema
- Closes #6443
  • Loading branch information
bnaecker authored Aug 29, 2024
1 parent b7ab537 commit 29358f1
Show file tree
Hide file tree
Showing 8 changed files with 106 additions and 41 deletions.
12 changes: 7 additions & 5 deletions gateway/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,18 @@ impl ServerContext {
OnceLock::new()
};

const START_LATENCY_DECADE: i16 = -6;
const END_LATENCY_DECADE: i16 = 3;
// Track from 1 microsecond == 1e3 nanoseconds
const LATENCY_START_POWER: u16 = 3;
// To 1000s == 1e9 * 1e3 == 1e12 nanoseconds
const LATENCY_END_POWER: u16 = 12;
let latencies =
oximeter_instruments::http::LatencyTracker::with_latency_decades(
oximeter_instruments::http::LatencyTracker::with_log_linear_bins(
oximeter_instruments::http::HttpService {
name: "management-gateway-service".into(),
id,
},
START_LATENCY_DECADE,
END_LATENCY_DECADE,
LATENCY_START_POWER,
LATENCY_END_POWER,
)
.expect("start and end decades are hardcoded and should be valid");

Expand Down
12 changes: 7 additions & 5 deletions nexus/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,12 +146,14 @@ impl ServerContext {
name: name.to_string().into(),
id: config.deployment.id,
};
const START_LATENCY_DECADE: i16 = -6;
const END_LATENCY_DECADE: i16 = 3;
LatencyTracker::with_latency_decades(
// Start at 1 microsecond == 1e3 nanoseconds.
const LATENCY_START_POWER: u16 = 3;
// End at 1000s == (1e9 * 1e3) == 1e12 nanoseconds.
const LATENCY_END_POWER: u16 = 12;
LatencyTracker::with_log_linear_bins(
target,
START_LATENCY_DECADE,
END_LATENCY_DECADE,
LATENCY_START_POWER,
LATENCY_END_POWER,
)
.unwrap()
};
Expand Down
1 change: 1 addition & 0 deletions oximeter/db/schema/replicated/12/timeseries-to-delete.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http_service:request_latency_histogram
1 change: 1 addition & 0 deletions oximeter/db/schema/single-node/12/timeseries-to-delete.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http_service:request_latency_histogram
2 changes: 1 addition & 1 deletion oximeter/db/src/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ use uuid::Uuid;
/// - [`crate::Client::initialize_db_with_version`]
/// - [`crate::Client::ensure_schema`]
/// - The `clickhouse-schema-updater` binary in this crate
pub const OXIMETER_VERSION: u64 = 11;
pub const OXIMETER_VERSION: u64 = 12;

// Wrapper type to represent a boolean in the database.
//
Expand Down
59 changes: 31 additions & 28 deletions oximeter/instruments/src/http.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ pub use http_service::RequestLatencyHistogram;
impl RequestLatencyHistogram {
/// Build a new `RequestLatencyHistogram` with a specified histogram.
///
/// Latencies are expressed in seconds.
/// Latencies are expressed in nanoseconds.
pub fn new(
operation_id: &str,
status_code: StatusCode,
histogram: Histogram<f64>,
histogram: Histogram<u64>,
) -> Self {
Self {
operation_id: operation_id.to_string().into(),
Expand All @@ -37,24 +37,26 @@ impl RequestLatencyHistogram {
}
}

/// Build a `RequestLatencyHistogram` with a histogram whose bins span the given decades.
/// Build a histogram whose bins span the given powers of ten.
///
/// `start_decade` and `end_decade` specify the lower and upper limits of the histogram's
/// range, as a power of 10. For example, passing `-3` and `2` results in a histogram with bins
/// spanning `[10 ** -3, 10 ** 2)`. There are 10 bins in each decade. See the
/// [`Histogram::span_decades`] method for more details.
/// `start_power` and `end_power` specify the lower and upper limits of
/// the histogram's range, as powers of 10. For example, passing 2 and 4
/// results in a histogram with bins from `[10 ** 2, 10 ** 4)`. There are 10
/// bins in each power of 10.
///
/// Latencies are expressed as seconds.
pub fn with_latency_decades(
/// See the [`Histogram::span_decades`] method for more details.
///
/// Latencies are expressed in nanoseconds.
pub fn with_log_linear_bins(
operation_id: &str,
status_code: StatusCode,
start_decade: i16,
end_decade: i16,
start_power: u16,
end_power: u16,
) -> Result<Self, MetricsError> {
Ok(Self::new(
operation_id,
status_code,
Histogram::span_decades(start_decade, end_decade)?,
Histogram::span_decades(start_power, end_power)?,
))
}

Expand All @@ -71,7 +73,7 @@ impl RequestLatencyHistogram {
}

/// The `LatencyTracker` is an [`oximeter::Producer`] that tracks the latencies of requests for an
/// HTTP service, in seconds.
/// HTTP service, in nanoseconds.
///
/// Consumers should construct one `LatencyTracker` for each HTTP service they wish to instrument.
/// As requests are received, the [`LatencyTracker::update`] method can be called with the
Expand All @@ -94,33 +96,34 @@ pub struct LatencyTracker {
/// The histogram used to track each request.
///
/// We store it here to clone as we see new requests.
histogram: Histogram<f64>,
histogram: Histogram<u64>,
}

impl LatencyTracker {
/// Build a new tracker for the given `service`, using `histogram` to track latencies.
///
/// Note that the same histogram is used for each tracked timeseries.
pub fn new(service: HttpService, histogram: Histogram<f64>) -> Self {
pub fn new(service: HttpService, histogram: Histogram<u64>) -> Self {
Self {
service,
latencies: Arc::new(Mutex::new(HashMap::new())),
histogram,
}
}

/// Build a new tracker for the given `service`, with a histogram that spans the given decades
/// (powers of 10). See [`RequestLatencyHistogram::with_latency_decades`] for details on the
/// Build a new tracker with log-linear bins.
///
/// This creates a tracker for the `service`, using 10 bins per power of 10,
/// from `[10 ** start_power, 10 ** end_power)`.
///
/// [`RequestLatencyHistogram::with_log_linear_bins`] for details on the
/// arguments.
pub fn with_latency_decades(
pub fn with_log_linear_bins(
service: HttpService,
start_decade: i16,
end_decade: i16,
start_power: u16,
end_power: u16,
) -> Result<Self, MetricsError> {
Ok(Self::new(
service,
Histogram::span_decades(start_decade, end_decade)?,
))
Ok(Self::new(service, Histogram::span_decades(start_power, end_power)?))
}

/// Update (or create) a timeseries in response to a new request.
Expand All @@ -142,7 +145,7 @@ impl LatencyTracker {
self.histogram.clone(),
)
});
entry.datum.sample(latency.as_secs_f64()).map_err(MetricsError::from)
entry.datum.sample(latency.as_nanos() as _).map_err(MetricsError::from)
}

/// Instrument the given Dropshot endpoint handler function.
Expand Down Expand Up @@ -218,16 +221,16 @@ mod tests {
fn test_latency_tracker() {
let service =
HttpService { name: "my-service".into(), id: ID.parse().unwrap() };
let hist = Histogram::new(&[0.0, 1.0]).unwrap();
let hist = Histogram::new(&[100, 1000]).unwrap();
let tracker = LatencyTracker::new(service, hist);
let status_code0 = StatusCode::OK;
let status_code1 = StatusCode::NOT_FOUND;
let operation_id = "some_operation_id";
tracker
.update(operation_id, status_code0, Duration::from_secs_f64(0.5))
.update(operation_id, status_code0, Duration::from_nanos(200))
.unwrap();
tracker
.update(operation_id, status_code1, Duration::from_secs_f64(0.5))
.update(operation_id, status_code1, Duration::from_nanos(200))
.unwrap();
let key0 = RequestLatencyHistogram::key_for(operation_id, status_code0);
let key1 = RequestLatencyHistogram::key_for(operation_id, status_code1);
Expand Down
4 changes: 2 additions & 2 deletions oximeter/oximeter/schema/http-service.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ versions = [
[[metrics]]
name = "request_latency_histogram"
description = "Duration for the server to handle a request"
units = "seconds"
datum_type = "histogram_f64"
units = "nanoseconds"
datum_type = "histogram_u64"
versions = [
{ added_in = 1, fields = [ "operation_id", "status_code" ] }
]
Expand Down
56 changes: 56 additions & 0 deletions oximeter/types/src/histogram.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1191,6 +1191,47 @@ where
}
}

pub trait Bits: Integer {
const BITS: u32;
fn next_power(self) -> Option<Self>;
}

macro_rules! impl_bits {
($type_:ty) => {
impl Bits for $type_ {
const BITS: u32 = Self::BITS;

fn next_power(self) -> Option<Self> {
self.checked_mul(2)
}
}
};
}

impl_bits!(u8);
impl_bits!(u16);
impl_bits!(u32);
impl_bits!(u64);

impl<T> Histogram<T>
where
T: Bits + HistogramSupport,
{
/// Create a histogram with logarithmically spaced bins at each power of 2.
///
/// This is only available for unsigned integer support types.
pub fn power_of_two() -> Self {
let mut bins = Vec::with_capacity(T::BITS as _);
let mut x = T::one();
bins.push(x);
while let Some(next) = x.next_power() {
bins.push(next);
x = next;
}
Self::new(&bins).expect("Bits is statically known")
}
}

// Helper to ensure all values are comparable, i.e., not NaN.
fn ensure_finite<T>(value: T) -> Result<(), HistogramError>
where
Expand Down Expand Up @@ -1797,4 +1838,19 @@ mod tests {
"expected to overflow a u8, since support type is not wide enough",
);
}

#[test]
fn test_log_bins_u8() {
let (bins, _) = Histogram::<u8>::power_of_two().bins_and_counts();
assert_eq!(bins, [0, 1, 2, 4, 8, 16, 32, 64, 128],);
}

#[test]
fn test_log_bins_u64() {
let (bins, _) = Histogram::<u64>::power_of_two().bins_and_counts();
assert_eq!(bins[0], 0);
for (i, bin) in bins.iter().skip(1).enumerate() {
assert_eq!(*bin, 1u64 << i);
}
}
}

0 comments on commit 29358f1

Please sign in to comment.