Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Metrics doc #685

Merged
merged 6 commits into from
May 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ For more information about the Xline client SDK, or the Xline client command lin

## Quick Start

To get started, check out the document [QUICK_START.md](doc/quick-start/README.md) for in-depth information and step-by-step instructions.
To get started, check out the document [QUICK_START.md](doc/QUICK_START.md) for in-depth information and step-by-step instructions.

## Contribute Guide

Expand Down
2 changes: 1 addition & 1 deletion USAGE.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ retry_timeout = '50ms' # the rpc retry interval, of which the default i
2. Use the following command to start cluster:

```bash
# Run in 3 terminals. If you want more logs, add `RUST_LOG=debug` before the command.
# Run in 3 terminals. If you want more logs, add `RUST_LOG=curp=debug,xline=debug` before the command.

./xline --name node1 --members node1=127.0.0.1:2379,node2=127.0.0.1:2380,node3=127.0.0.1:2381 --is-leader

Expand Down
4 changes: 2 additions & 2 deletions crates/curp/src/rpc/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ use utils::define_metrics;
define_metrics! {
"curp_p2p",
peer_sent_bytes_total: Counter<u64> = meter()
.u64_counter("peer_sent_bytes_total")
.u64_counter("peer_sent_bytes")
.with_description("The total number of bytes send to peers.")
.init(),
peer_sent_failures_total: Counter<u64> = meter()
.u64_counter("peer_sent_failures_total")
.u64_counter("peer_sent_failures")
.with_description("The total number of send failures to peers.")
.init(),
peer_round_trip_time_seconds: Histogram<u64> = meter()
Expand Down
4 changes: 2 additions & 2 deletions crates/curp/src/server/curp_node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ impl<C: Command, RC: RoleChange> CurpNode<C, RC> {
&self,
req_stream: impl Stream<Item = Result<InstallSnapshotRequest, E>>,
) -> Result<InstallSnapshotResponse, CurpError> {
metrics::get().apply_snapshot_in_progress.observe(1, &[]);
metrics::get().apply_snapshot_in_progress.add(1, &[]);
let start = Instant::now();
pin_mut!(req_stream);
let mut snapshot = self
Expand Down Expand Up @@ -315,7 +315,7 @@ impl<C: Command, RC: RoleChange> CurpNode<C, RC> {
"failed to reset the command executor by snapshot, {err}"
))
})?;
metrics::get().apply_snapshot_in_progress.observe(0, &[]);
metrics::get().apply_snapshot_in_progress.add(-1, &[]);
metrics::get()
.snapshot_install_total_duration_seconds
.record(start.elapsed().as_secs(), &[]);
Expand Down
54 changes: 34 additions & 20 deletions crates/curp/src/server/metrics.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use std::sync::Arc;

use clippy_utilities::NumericCast;
use clippy_utilities::{NumericCast, OverflowArithmetic};
use curp_external_api::{cmd::Command, role_change::RoleChange};
use opentelemetry::metrics::{Counter, Histogram, MetricsError, ObservableGauge};
use opentelemetry::metrics::{Counter, Histogram, MetricsError, UpDownCounter};
use utils::define_metrics;

use super::raw_curp::RawCurp;
Expand All @@ -25,26 +25,14 @@ define_metrics! {
.u64_counter("heartbeat_send_failures")
.with_description("The total number of leader heartbeat send failures (likely overloaded from slow disk).")
.init(),
apply_snapshot_in_progress: ObservableGauge<u64> = meter()
.u64_observable_gauge("apply_snapshot_in_progress")
apply_snapshot_in_progress: UpDownCounter<i64> = meter()
.i64_up_down_counter("apply_snapshot_in_progress")
.with_description("1 if the server is applying the incoming snapshot. 0 if none.")
.init(),
proposals_committed: ObservableGauge<u64> = meter()
.u64_observable_gauge("proposals_committed")
.with_description("The total number of consensus proposals committed.")
.init(),
proposals_failed: Counter<u64> = meter()
.u64_counter("proposals_failed")
.with_description("The total number of failed proposals seen.")
.init(),
proposals_applied: ObservableGauge<u64> = meter()
.u64_observable_gauge("proposals_applied")
.with_description("The total number of consensus proposals applied.")
.init(),
proposals_pending: ObservableGauge<u64> = meter()
.u64_observable_gauge("proposals_pending")
.with_description("The current number of pending proposals to commit.")
.init(),
snapshot_install_total_duration_seconds: Histogram<u64> = meter()
.u64_histogram("snapshot_install_total_duration_seconds")
.with_description("The total latency distributions of save called by install_snapshot.")
Expand All @@ -66,8 +54,11 @@ impl Metrics {
is_leader,
is_learner,
server_id,
sp_total,
sp_cnt,
online_clients,
proposals_committed,
proposals_applied,
proposals_pending,
) = (
meter
.u64_observable_gauge("has_leader")
Expand All @@ -86,13 +77,25 @@ impl Metrics {
.with_description("Server or member ID in hexadecimal format. 1 for 'server_id' label with current ID.")
.init(),
meter
.u64_observable_gauge("sp_total")
.u64_observable_gauge("sp_cnt")
.with_description("The speculative pool size of this server")
.init(),
meter
.u64_observable_gauge("online_clients")
.with_description("The online client ids count of this server if it is the leader")
.init(),
meter
.u64_observable_gauge("proposals_committed")
.with_description("The total number of consensus proposals committed.")
.init(),
meter
.u64_observable_gauge("proposals_applied")
.with_description("The total number of consensus proposals applied.")
.init(),
meter
.u64_observable_gauge("proposals_pending")
.with_description("The current number of pending proposals to commit.")
.init(),
);

_ = meter.register_callback(
Expand All @@ -101,7 +104,7 @@ impl Metrics {
is_leader.as_any(),
is_learner.as_any(),
server_id.as_any(),
sp_total.as_any(),
sp_cnt.as_any(),
online_clients.as_any(),
],
move |observer| {
Expand All @@ -115,10 +118,21 @@ impl Metrics {
observer.observe_u64(&server_id, id, &[]);

let sp_size = curp.spec_pool().lock().len();
observer.observe_u64(&sp_total, sp_size.numeric_cast(), &[]);
observer.observe_u64(&sp_cnt, sp_size.numeric_cast(), &[]);

let client_ids = curp.lease_manager().read().expiry_queue.len();
observer.observe_u64(&online_clients, client_ids.numeric_cast(), &[]);

let commit_index = curp.commit_index();
let last_log_index = curp.last_log_index();

observer.observe_u64(&proposals_committed, commit_index, &[]);
observer.observe_u64(&proposals_applied, curp.last_applied(), &[]);
observer.observe_u64(
&proposals_pending,
last_log_index.overflow_sub(commit_index),
&[],
);
},
)?;

Expand Down
16 changes: 5 additions & 11 deletions crates/curp/src/server/raw_curp/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -768,12 +768,6 @@ impl<C: Command, RC: RoleChange> RawCurp<C, RC> {
if last_sent_index > log_w.commit_index {
log_w.commit_to(last_sent_index);
debug!("{} updates commit index to {last_sent_index}", self.id());
metrics::get()
.proposals_committed
.observe(last_sent_index, &[]);
metrics::get()
.proposals_pending
.observe(log_w.last_log_index().overflow_sub(last_sent_index), &[]);
self.apply(&mut *log_w);
}
}
Expand Down Expand Up @@ -1503,6 +1497,11 @@ impl<C: Command, RC: RoleChange> RawCurp<C, RC> {
self.log.read().last_log_index()
}

/// Get last applied index
pub(super) fn last_applied(&self) -> u64 {
self.log.read().last_as
}

/// Pick a node that has the same log as the current node
pub(super) fn pick_new_leader(&self) -> Option<ServerId> {
let last_idx = self.log.read().last_log_index();
Expand Down Expand Up @@ -1766,7 +1765,6 @@ impl<C: Command, RC: RoleChange> RawCurp<C, RC> {
/// Apply new logs
fn apply(&self, log: &mut Log<C>) {
for i in (log.last_as + 1)..=log.commit_index {
metrics::get().proposals_applied.observe(i, &[]);
let entry = log.get(i).unwrap_or_else(|| {
unreachable!(
"system corrupted, apply log[{i}] when we only have {} log entries",
Expand Down Expand Up @@ -1898,10 +1896,6 @@ impl<C: Command, RC: RoleChange> RawCurp<C, RC> {
// check if commit_index needs to be updated
if self.can_update_commit_index_to(log_w, index, term) && index > log_w.commit_index {
log_w.commit_to(index);
metrics::get().proposals_committed.observe(index, &[]);
metrics::get()
.proposals_pending
.observe(log_w.last_log_index().overflow_sub(index), &[]);
debug!("{} updates commit index to {index}", self.id());
self.apply(&mut *log_w);
}
Expand Down
140 changes: 140 additions & 0 deletions doc/QUICK_START.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# Quick Start

## Single node cluster

### Using docker

```bash
# Assume that docker engine environment is installed.

$ docker run -it --rm --name=xline -e RUST_LOG=xline=debug -p 2379:2379 ghcr.io/xline-kv/xline \
xline \
--name xline \
--storage-engine rocksdb \
--members xline=127.0.0.1:2379 \
--data-dir /usr/local/xline/data-dir \
--client-listen-urls http://0.0.0.0:2379 \
--peer-listen-urls http://0.0.0.0:2380 \
--client-advertise-urls http://127.0.0.1:2379 \
--peer-advertise-urls http://127.0.0.1:2380
```

```bash
# Try with etcdctl

$ ETCDCTL_API=3 etcdctl put A 1
OK
$ ETCDCTL_API=3 etcdctl get A
A
1
```

### Build from source

1. Install dependencies

```bash
# Ubuntu/Debian

$ sudo apt-get install -y autoconf autogen libtool

# Requires protobuf-compiler >= 3.15
$ git clone --branch v3.21.12 --recurse-submodules https://github.com/protocolbuffers/protobuf
$ cd protobuf
$ ./autogen.sh
$ ./configure
$ make -j$(nproc)
$ sudo make install
```

```bash
# macOS

# Assume that brew is installed, or you could install brew by:
# /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"

$ brew install protobuf
```

2. Build xline

```bash
# Assume that rust compile environment installed, such as cargo, etc.

# clone source code
$ git clone --recurse-submodules https://github.com/xline-kv/Xline

# compile Xline
$ cd Xline
$ cargo build --release
```

3. Run xline

```bash
$ ./target/release/xline --name xline \
--storage-engine rocksdb \
--members xline=127.0.0.1:2379 \
--data-dir <path-to-data-dir> \
--client-listen-urls http://0.0.0.0:2379 \
--peer-listen-urls http://0.0.0.0:2380 \
--client-advertise-urls http://127.0.0.1:2379 \
--peer-advertise-urls http://127.0.0.1:2380
```

## Standard xline cluster

1. Start the cluster

```bash
# Pull the latest image from ghcr.io
$ docker pull ghcr.io/xline-kv/xline:latest
# Copy some fixtures which are required by quick_start.sh
$ cp fixtures/{private,public}.pem scripts/
# Using the quick start scripts
$ ./scripts/quick_start.sh
```

2. Basic requests

```bash
# Set Key A's value to 1
$ docker exec client /bin/sh -c "/usr/local/bin/etcdctl --endpoints=\"http://node1:2379\" put A 1"
OK

# Get Key A's value
$ docker exec client /bin/sh -c "/usr/local/bin/etcdctl --endpoints=\"http://node1:2379\" get A"
A
1
```

3. Inspect metrics

After finished `Start the cluster`, you can goto http://127.0.0.1:9090/graph.
You should be able to see a web ui of Prometheus.

For example:

This means the `node1` is the leader.

![](./img/prom_demo.png)

For more metrics, please goto [metrics.md](./metrics.md)

4. Benchmark

```bash
$ ./scripts/quick_start.sh stop
$ ./scripts/benchmark.sh xline
```

## Directory Structure

| directory name | description |
|----------------|---------------------------------------------------------|
| benchmark | a customized benchmark using CURP protocol based client |
| curp | the CURP protocol |
| xline | xline services |
| engine | persistent storage |
| utils | some utilities, like lock, config, etc. |
| scripts | the shell scripts for env deployment or benchmarking |
Binary file added doc/img/prom_demo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Loading