Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add histogram metric type #386

Merged
merged 10 commits into from
Aug 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 75 additions & 3 deletions include/triton/core/tritonserver.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ struct TRITONSERVER_Server;
struct TRITONSERVER_ServerOptions;
struct TRITONSERVER_Metric;
struct TRITONSERVER_MetricFamily;
struct TRITONSERVER_MetricArgs;

///
/// TRITONSERVER API Version
Expand Down Expand Up @@ -91,7 +92,7 @@ struct TRITONSERVER_MetricFamily;
/// }
///
#define TRITONSERVER_API_VERSION_MAJOR 1
#define TRITONSERVER_API_VERSION_MINOR 33
#define TRITONSERVER_API_VERSION_MINOR 34

/// Get the TRITONBACKEND API version supported by the Triton shared
/// library. This value can be compared against the
Expand Down Expand Up @@ -2615,7 +2616,8 @@ TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_ServerInferAsync(
///
typedef enum TRITONSERVER_metrickind_enum {
TRITONSERVER_METRIC_KIND_COUNTER,
TRITONSERVER_METRIC_KIND_GAUGE
TRITONSERVER_METRIC_KIND_GAUGE,
TRITONSERVER_METRIC_KIND_HISTOGRAM
} TRITONSERVER_MetricKind;

/// Create a new metric family object. The caller takes ownership of the
Expand Down Expand Up @@ -2644,6 +2646,44 @@ TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_MetricFamilyNew(
TRITONSERVER_DECLSPEC struct TRITONSERVER_Error*
TRITONSERVER_MetricFamilyDelete(struct TRITONSERVER_MetricFamily* family);

/// Get the TRITONSERVER_MetricKind of the metric family.
///
/// \param family The metric family object to query.
/// \param kind Returns the TRITONSERVER_MetricKind of metric.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC struct TRITONSERVER_Error*
TRITONSERVER_GetMetricFamilyKind(
Tabrizian marked this conversation as resolved.
Show resolved Hide resolved
struct TRITONSERVER_MetricFamily* family, TRITONSERVER_MetricKind* kind);

/// Create a new metric args object. The caller takes ownership of the
/// TRITONSERVER_MetricArgs object and must call TRITONSERVER_MetricArgsDelete
/// to release the object.
///
/// \param args Returns the new metric args object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_MetricArgsNew(
struct TRITONSERVER_MetricArgs** args);

/// Set metric args with histogram metric parameters.
///
/// \param args The metric args object to set.
/// \param buckets The array of bucket boundaries for the expected range of
/// observed values.
/// \param buckets_count The number of bucket boundaries.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC struct TRITONSERVER_Error*
TRITONSERVER_MetricArgsSetHistogram(
struct TRITONSERVER_MetricArgs* args, const double* buckets,
const uint64_t buckets_count);

/// Delete a metric args object.
///
/// \param args The metric args object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_MetricArgsDelete(
struct TRITONSERVER_MetricArgs* args);

/// Create a new metric object. The caller takes ownership of the
/// TRITONSERVER_Metric object and must call
/// TRITONSERVER_MetricDelete to release the object. The caller is also
Expand All @@ -2661,6 +2701,28 @@ TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_MetricNew(
struct TRITONSERVER_MetricFamily* family,
const struct TRITONSERVER_Parameter** labels, const uint64_t label_count);

/// Create a new metric object. The caller takes ownership of the
/// TRITONSERVER_Metric object and must call
/// TRITONSERVER_MetricDelete to release the object. The caller is also
/// responsible for ownership of the labels passed in.
/// Each label can be deleted immediately after creating the metric with
/// TRITONSERVER_ParameterDelete if not re-using the labels.
/// Metric args can be deleted immediately after creating the metric with
/// TRITONSERVER_MetricArgsDelete if not re-using the metric args.
///
/// \param metric Returns the new metric object.
/// \param family The metric family to add this new metric to.
/// \param labels The array of labels to associate with this new metric.
/// \param label_count The number of labels.
/// \param args Metric args that store additional arguments to construct
/// particular metric types, e.g. histogram.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_MetricNewWithArgs(
struct TRITONSERVER_Metric** metric,
struct TRITONSERVER_MetricFamily* family,
const struct TRITONSERVER_Parameter** labels, const uint64_t label_count,
const struct TRITONSERVER_MetricArgs* args);

/// Delete a metric object.
/// All TRITONSERVER_Metric* objects should be deleted BEFORE their
/// corresponding TRITONSERVER_MetricFamily* objects have been deleted.
Expand Down Expand Up @@ -2705,7 +2767,17 @@ TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_MetricIncrement(
TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_MetricSet(
struct TRITONSERVER_Metric* metric, double value);

/// Get the TRITONSERVER_MetricKind of metric and its corresponding family.
/// Sample an observation and count it in the appropriate bucket of a metric.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_HISTOGRAM and returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind.
///
/// \param metric The metric object to update.
/// \param value The observed value to record in the metric.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_MetricObserve(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reuse TRITONSERVER_MetricSet?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rmccorm4 Any thoughts? Basically we need to merge the two functions below into one Metric::Set(double value). It works but may add confusion.

core/src/metric_family.cc

Lines 338 to 404 in fd5c44b

// Overwrite the current value of a gauge metric.
//
// Returns TRITONSERVER_ERROR_INTERNAL when the underlying metric pointer is
// null, TRITONSERVER_ERROR_UNSUPPORTED for every kind other than
// TRITONSERVER_METRIC_KIND_GAUGE, and nullptr on success.
TRITONSERVER_Error*
Metric::Set(double value)
{
  // metric_ is null once this metric has been invalidated.
  if (metric_ == nullptr) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INTERNAL,
        "Could not set metric value. Metric has been invalidated.");
  }

  // Only gauges can be set; handle that case first and return.
  if (kind_ == TRITONSERVER_METRIC_KIND_GAUGE) {
    reinterpret_cast<prometheus::Gauge*>(metric_)->Set(value);
    return nullptr;  // Success
  }

  // Every remaining kind is rejected; only the message differs.
  const char* reason = "Unsupported TRITONSERVER_MetricKind";
  if (kind_ == TRITONSERVER_METRIC_KIND_COUNTER) {
    reason = "TRITONSERVER_METRIC_KIND_COUNTER does not support Set";
  } else if (kind_ == TRITONSERVER_METRIC_KIND_HISTOGRAM) {
    reason = "TRITONSERVER_METRIC_KIND_HISTOGRAM does not support Set";
  }
  return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED, reason);
}
// Record one observation in a histogram metric.
//
// Returns TRITONSERVER_ERROR_INTERNAL when the underlying metric pointer is
// null, TRITONSERVER_ERROR_UNSUPPORTED for every kind other than
// TRITONSERVER_METRIC_KIND_HISTOGRAM, and nullptr on success.
TRITONSERVER_Error*
Metric::Observe(double value)
{
  // metric_ is null once this metric has been invalidated. The message names
  // the failing operation (observe, not set) so callers are not misled.
  if (metric_ == nullptr) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INTERNAL,
        "Could not observe metric value. Metric has been invalidated.");
  }
  switch (kind_) {
    case TRITONSERVER_METRIC_KIND_COUNTER: {
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_UNSUPPORTED,
          "TRITONSERVER_METRIC_KIND_COUNTER does not support Observe");
    }
    case TRITONSERVER_METRIC_KIND_GAUGE: {
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_UNSUPPORTED,
          "TRITONSERVER_METRIC_KIND_GAUGE does not support Observe");
    }
    case TRITONSERVER_METRIC_KIND_HISTOGRAM: {
      auto histogram_ptr = reinterpret_cast<prometheus::Histogram*>(metric_);
      histogram_ptr->Observe(value);
      break;
    }
    default:
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_UNSUPPORTED,
          "Unsupported TRITONSERVER_MetricKind");
  }
  return nullptr;  // Success
}

Copy link
Contributor

@rmccorm4 rmccorm4 Aug 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would need to take a closer look, but my gut reaction is that Guan is probably right and we can probably just reuse MetricValue and MetricSet which will call Collect and Observe internally when kind == KIND_HISTOGRAM if functionally equivalent

Copy link
Contributor

@rmccorm4 rmccorm4 Aug 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

MetricValue may not work if Collect returns multiple values (one per bucket?), but again will need to take a closer look. Let me know if you already know more details on this from your research.

But similar to the new C API for MetricV2, keep in mind how this would work if we added support for Summary metric and wanted to get the values for each quantile, which is basically same as values for each bucket. Ideally the same API would work for both or all types.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

MetricValue cannot be reused for histogram.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated

Copy link
Contributor Author

@yinggeh yinggeh Aug 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would like to revisit this change. The consensus is to keep C API and python_backend API 1:1 matched. I am inclined to add a new C API TRITONSERVER_MetricObserve for histogram instead of reusing TRITONSERVER_MetricSet for three reasons.

  1. Both Histogram and Summary types call Observe to record new value. We can reuse observe for Summary type if we add it in the future.
  2. Histogram also has ObserveMultiple API which may be added in the future. I don't like the idea that Histogram.Set and Histogram.ObserveMultiple coexist.
  3. Setting histogram to a value aka Histogram.set(val) is semantically wrong. It is confusing to users familiar with Prometheus APIs. The description of TRITONSERVER_MetricSet can be verbose as well in order to describe different behaviors for counter/gauge and histogram/summary.

cc @Tabrizian @rmccorm4 @GuanLuo

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well.. Triton metrics API is not supposed to be mirroring "Prometheus API", Prometheus is one of the "forms" we can exhibit the metrics as. So we should design the API to be using generic terms for statistics, the meaning of gauge/counter/histogram (and summary?) is not affected by the fact that Prometheus or other metrics libraries are used.

Thinking from this mindset, my question is if observe is the generic verb for recording the statistics of histogram. If that is the case, then I am fine to add XXXObserve, otherwise, we should use the proper verb

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think either sample or observe. Voting for Observe for simplicity.

A histogram samples observations (usually things like request durations or response sizes) and counts them in configurable buckets. It also provides a sum of all observed values.

Similar to a histogram, a summary samples observations (usually things like request durations and response sizes). While it also provides a total count of observations and a sum of all observed values, it calculates configurable quantiles over a sliding time window.

struct TRITONSERVER_Metric* metric, double value);

/// Get the TRITONSERVER_MetricKind of the metric, i.e. of its corresponding family.
///
/// \param metric The metric object to query.
/// \param kind Returns the TRITONSERVER_MetricKind of metric.
Expand Down
98 changes: 94 additions & 4 deletions src/metric_family.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights
// reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -54,6 +55,12 @@ MetricFamily::MetricFamily(
.Help(description)
.Register(*registry));
break;
case TRITONSERVER_METRIC_KIND_HISTOGRAM:
family_ = reinterpret_cast<void*>(&prometheus::BuildHistogram()
.Name(name)
.Help(description)
.Register(*registry));
break;
default:
throw std::invalid_argument(
"Unsupported kind passed to MetricFamily constructor.");
Expand All @@ -63,24 +70,49 @@ MetricFamily::MetricFamily(
}

void*
MetricFamily::Add(std::map<std::string, std::string> label_map, Metric* metric)
MetricFamily::Add(
std::map<std::string, std::string> label_map, Metric* metric,
const TritonServerMetricArgs* args)
{
void* prom_metric = nullptr;
switch (kind_) {
case TRITONSERVER_METRIC_KIND_COUNTER: {
if (args != nullptr) {
throw std::invalid_argument(
"Unexpected args found in counter Metric constructor.");
}
auto counter_family_ptr =
reinterpret_cast<prometheus::Family<prometheus::Counter>*>(family_);
auto counter_ptr = &counter_family_ptr->Add(label_map);
prom_metric = reinterpret_cast<void*>(counter_ptr);
break;
}
case TRITONSERVER_METRIC_KIND_GAUGE: {
if (args != nullptr) {
throw std::invalid_argument(
"Unexpected args found in gauge Metric constructor.");
}
auto gauge_family_ptr =
reinterpret_cast<prometheus::Family<prometheus::Gauge>*>(family_);
auto gauge_ptr = &gauge_family_ptr->Add(label_map);
prom_metric = reinterpret_cast<void*>(gauge_ptr);
break;
}
case TRITONSERVER_METRIC_KIND_HISTOGRAM: {
if (args == nullptr) {
throw std::invalid_argument(
"Bucket boundaries not found in Metric args.");
}
if (args->kind() != TRITONSERVER_METRIC_KIND_HISTOGRAM) {
throw std::invalid_argument("Metric args not set to histogram kind.");
}
auto histogram_family_ptr =
reinterpret_cast<prometheus::Family<prometheus::Histogram>*>(family_);
auto histogram_ptr =
&histogram_family_ptr->Add(label_map, args->buckets());
prom_metric = reinterpret_cast<void*>(histogram_ptr);
break;
}
default:
throw std::invalid_argument(
"Unsupported family kind passed to Metric constructor.");
Expand Down Expand Up @@ -134,6 +166,14 @@ MetricFamily::Remove(void* prom_metric, Metric* metric)
gauge_family_ptr->Remove(gauge_ptr);
break;
}
case TRITONSERVER_METRIC_KIND_HISTOGRAM: {
auto histogram_family_ptr =
reinterpret_cast<prometheus::Family<prometheus::Histogram>*>(family_);
auto histogram_ptr =
reinterpret_cast<prometheus::Histogram*>(prom_metric);
histogram_family_ptr->Remove(histogram_ptr);
break;
}
default:
// Invalid kind should be caught in constructor
LOG_ERROR << "Unsupported kind in Metric destructor.";
Expand Down Expand Up @@ -169,7 +209,8 @@ MetricFamily::~MetricFamily()
//
Metric::Metric(
TRITONSERVER_MetricFamily* family,
std::vector<const InferenceParameter*> labels)
std::vector<const InferenceParameter*> labels,
const TritonServerMetricArgs* args)
{
family_ = reinterpret_cast<MetricFamily*>(family);
kind_ = family_->Kind();
Expand All @@ -188,7 +229,7 @@ Metric::Metric(
std::string(reinterpret_cast<const char*>(param->ValuePointer()));
}

metric_ = family_->Add(label_map, this);
metric_ = family_->Add(label_map, this, args);
}

Metric::~Metric()
Expand Down Expand Up @@ -235,6 +276,11 @@ Metric::Value(double* value)
*value = gauge_ptr->Value();
break;
}
case TRITONSERVER_METRIC_KIND_HISTOGRAM: {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"TRITONSERVER_METRIC_KIND_HISTOGRAM does not support Value");
}
default:
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
Expand Down Expand Up @@ -279,6 +325,11 @@ Metric::Increment(double value)
}
break;
}
case TRITONSERVER_METRIC_KIND_HISTOGRAM: {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"TRITONSERVER_METRIC_KIND_HISTOGRAM does not support Increment");
}
default:
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
Expand Down Expand Up @@ -308,6 +359,45 @@ Metric::Set(double value)
gauge_ptr->Set(value);
break;
}
case TRITONSERVER_METRIC_KIND_HISTOGRAM: {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"TRITONSERVER_METRIC_KIND_HISTOGRAM does not support Set");
}
default:
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"Unsupported TRITONSERVER_MetricKind");
}

return nullptr; // Success
}

TRITONSERVER_Error*
Metric::Observe(double value)
{
if (metric_ == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"Could not set metric value. Metric has been invalidated.");
}

switch (kind_) {
case TRITONSERVER_METRIC_KIND_COUNTER: {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"TRITONSERVER_METRIC_KIND_COUNTER does not support Observe");
}
case TRITONSERVER_METRIC_KIND_GAUGE: {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"TRITONSERVER_METRIC_KIND_GAUGE does not support Observe");
}
case TRITONSERVER_METRIC_KIND_HISTOGRAM: {
auto histogram_ptr = reinterpret_cast<prometheus::Histogram*>(metric_);
histogram_ptr->Observe(value);
break;
}
default:
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
Expand Down
35 changes: 32 additions & 3 deletions src/metric_family.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights
// reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -27,6 +28,7 @@

#ifdef TRITON_ENABLE_METRICS

#include <cstring>
#include <mutex>
#include <set>
#include <unordered_map>
Expand All @@ -37,6 +39,29 @@

namespace triton { namespace core {

//
// TritonServerMetricArgs
//
// Implementation for TRITONSERVER_MetricArgs. Holds the kind-specific
// arguments used when constructing a metric; the only arguments stored here
// are histogram bucket boundaries.
//
class TritonServerMetricArgs {
 public:
  TritonServerMetricArgs() = default;

  // Mark these args as histogram args and copy 'bucket_count' bucket
  // boundaries out of 'buckets'. The values are copied, so the caller's array
  // is not referenced after this call returns. A null 'buckets' pointer is
  // treated as an empty boundary list rather than being used to form an
  // iterator range. Always returns nullptr.
  void* SetHistogramArgs(const double* buckets, uint64_t bucket_count)
  {
    kind_ = TRITONSERVER_METRIC_KIND_HISTOGRAM;
    if (buckets == nullptr) {
      buckets_.clear();
    } else {
      buckets_.assign(buckets, buckets + bucket_count);
    }
    return nullptr;
  }

  // Kind these args were configured for.
  TRITONSERVER_MetricKind kind() const { return kind_; }
  // Histogram bucket boundaries; empty unless SetHistogramArgs stored some.
  const std::vector<double>& buckets() const { return buckets_; }

 private:
  // Given a definite initial value so kind() never reads an indeterminate
  // enum when no setter has been called yet. MetricFamily::Add compares it
  // against TRITONSERVER_METRIC_KIND_HISTOGRAM, so unset args are rejected
  // deterministically for histogram families.
  TRITONSERVER_MetricKind kind_{TRITONSERVER_METRIC_KIND_COUNTER};
  std::vector<double> buckets_;
};

//
// Implementation for TRITONSERVER_MetricFamily.
//
Expand All @@ -50,7 +75,9 @@ class MetricFamily {
void* Family() const { return family_; }
TRITONSERVER_MetricKind Kind() const { return kind_; }

void* Add(std::map<std::string, std::string> label_map, Metric* metric);
void* Add(
std::map<std::string, std::string> label_map, Metric* metric,
const TritonServerMetricArgs* args);
void Remove(void* prom_metric, Metric* metric);

int NumMetrics()
Expand Down Expand Up @@ -86,7 +113,8 @@ class Metric {
public:
Metric(
TRITONSERVER_MetricFamily* family,
std::vector<const InferenceParameter*> labels);
std::vector<const InferenceParameter*> labels,
const TritonServerMetricArgs* args);
~Metric();

MetricFamily* Family() const { return family_; }
Expand All @@ -95,6 +123,7 @@ class Metric {
TRITONSERVER_Error* Value(double* value);
TRITONSERVER_Error* Increment(double value);
TRITONSERVER_Error* Set(double value);
TRITONSERVER_Error* Observe(double value);

// If a MetricFamily is deleted before its dependent Metric, we want to
// invalidate the references so we don't access invalid memory.
Expand Down
3 changes: 2 additions & 1 deletion src/metrics.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -35,6 +35,7 @@

#include "prometheus/counter.h"
#include "prometheus/gauge.h"
#include "prometheus/histogram.h"
#include "prometheus/registry.h"
#include "prometheus/serializer.h"
#include "prometheus/summary.h"
Expand Down
Loading
Loading