diff --git a/README.md b/README.md index 06d7b11f3..45a76e5ca 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ For more information about the Xline client SDK, or the Xline client command lin ## Quick Start -To get started, check out the document [QUICK_START.md](doc/quick-start/README.md) for in-depth information and step-by-step instructions. +To get started, check out the document [QUICK_START.md](doc/QUICK_START.md) for in-depth information and step-by-step instructions. ## Contribute Guide diff --git a/USAGE.md b/USAGE.md index c7369bf02..e2f76fcc0 100644 --- a/USAGE.md +++ b/USAGE.md @@ -65,7 +65,7 @@ retry_timeout = '50ms' # the rpc retry interval, of which the default i 2. Use the following command to start cluster: ```bash - # Run in 3 terminals. If you want more logs, add `RUST_LOG=debug` before the command. + # Run in 3 terminals. If you want more logs, add `RUST_LOG=curp=debug,xline=debug` before the command. ./xline --name node1 --members node1=127.0.0.1:2379,node2=127.0.0.1:2380,node3=127.0.0.1:2381 --is-leader diff --git a/crates/curp/src/rpc/metrics.rs b/crates/curp/src/rpc/metrics.rs index 7e7281070..66ff6c29b 100644 --- a/crates/curp/src/rpc/metrics.rs +++ b/crates/curp/src/rpc/metrics.rs @@ -4,11 +4,11 @@ use utils::define_metrics; define_metrics! 
{ "curp_p2p", peer_sent_bytes_total: Counter = meter() - .u64_counter("peer_sent_bytes_total") + .u64_counter("peer_sent_bytes") .with_description("The total number of bytes send to peers.") .init(), peer_sent_failures_total: Counter = meter() - .u64_counter("peer_sent_failures_total") + .u64_counter("peer_sent_failures") .with_description("The total number of send failures to peers.") .init(), peer_round_trip_time_seconds: Histogram = meter() diff --git a/crates/curp/src/server/curp_node.rs b/crates/curp/src/server/curp_node.rs index ea0f0806b..51866774f 100644 --- a/crates/curp/src/server/curp_node.rs +++ b/crates/curp/src/server/curp_node.rs @@ -265,7 +265,7 @@ impl CurpNode { &self, req_stream: impl Stream>, ) -> Result { - metrics::get().apply_snapshot_in_progress.observe(1, &[]); + metrics::get().apply_snapshot_in_progress.add(1, &[]); let start = Instant::now(); pin_mut!(req_stream); let mut snapshot = self @@ -315,7 +315,7 @@ impl CurpNode { "failed to reset the command executor by snapshot, {err}" )) })?; - metrics::get().apply_snapshot_in_progress.observe(0, &[]); + metrics::get().apply_snapshot_in_progress.add(-1, &[]); metrics::get() .snapshot_install_total_duration_seconds .record(start.elapsed().as_secs(), &[]); diff --git a/crates/curp/src/server/metrics.rs b/crates/curp/src/server/metrics.rs index 208c4e29a..bcc9ba658 100644 --- a/crates/curp/src/server/metrics.rs +++ b/crates/curp/src/server/metrics.rs @@ -1,8 +1,8 @@ use std::sync::Arc; -use clippy_utilities::NumericCast; +use clippy_utilities::{NumericCast, OverflowArithmetic}; use curp_external_api::{cmd::Command, role_change::RoleChange}; -use opentelemetry::metrics::{Counter, Histogram, MetricsError, ObservableGauge}; +use opentelemetry::metrics::{Counter, Histogram, MetricsError, UpDownCounter}; use utils::define_metrics; use super::raw_curp::RawCurp; @@ -25,26 +25,14 @@ define_metrics! 
{ .u64_counter("heartbeat_send_failures") .with_description("The total number of leader heartbeat send failures (likely overloaded from slow disk).") .init(), - apply_snapshot_in_progress: ObservableGauge = meter() - .u64_observable_gauge("apply_snapshot_in_progress") + apply_snapshot_in_progress: UpDownCounter = meter() + .i64_up_down_counter("apply_snapshot_in_progress") .with_description("1 if the server is applying the incoming snapshot. 0 if none.") .init(), - proposals_committed: ObservableGauge = meter() - .u64_observable_gauge("proposals_committed") - .with_description("The total number of consensus proposals committed.") - .init(), proposals_failed: Counter = meter() .u64_counter("proposals_failed") .with_description("The total number of failed proposals seen.") .init(), - proposals_applied: ObservableGauge = meter() - .u64_observable_gauge("proposals_applied") - .with_description("The total number of consensus proposals applied.") - .init(), - proposals_pending: ObservableGauge = meter() - .u64_observable_gauge("proposals_pending") - .with_description("The current number of pending proposals to commit.") - .init(), snapshot_install_total_duration_seconds: Histogram = meter() .u64_histogram("snapshot_install_total_duration_seconds") .with_description("The total latency distributions of save called by install_snapshot.") @@ -66,8 +54,11 @@ impl Metrics { is_leader, is_learner, server_id, - sp_total, + sp_cnt, online_clients, + proposals_committed, + proposals_applied, + proposals_pending, ) = ( meter .u64_observable_gauge("has_leader") @@ -86,13 +77,25 @@ impl Metrics { .with_description("Server or member ID in hexadecimal format. 
1 for 'server_id' label with current ID.") .init(), meter - .u64_observable_gauge("sp_total") + .u64_observable_gauge("sp_cnt") .with_description("The speculative pool size of this server") .init(), meter .u64_observable_gauge("online_clients") .with_description("The online client ids count of this server if it is the leader") .init(), + meter + .u64_observable_gauge("proposals_committed") + .with_description("The total number of consensus proposals committed.") + .init(), + meter + .u64_observable_gauge("proposals_applied") + .with_description("The total number of consensus proposals applied.") + .init(), + meter + .u64_observable_gauge("proposals_pending") + .with_description("The current number of pending proposals to commit.") + .init(), ); _ = meter.register_callback( @@ -101,7 +104,7 @@ impl Metrics { is_leader.as_any(), is_learner.as_any(), server_id.as_any(), - sp_total.as_any(), + sp_cnt.as_any(), online_clients.as_any(), ], move |observer| { @@ -115,10 +118,21 @@ impl Metrics { observer.observe_u64(&server_id, id, &[]); let sp_size = curp.spec_pool().lock().len(); - observer.observe_u64(&sp_total, sp_size.numeric_cast(), &[]); + observer.observe_u64(&sp_cnt, sp_size.numeric_cast(), &[]); let client_ids = curp.lease_manager().read().expiry_queue.len(); observer.observe_u64(&online_clients, client_ids.numeric_cast(), &[]); + + let commit_index = curp.commit_index(); + let last_log_index = curp.last_log_index(); + + observer.observe_u64(&proposals_committed, commit_index, &[]); + observer.observe_u64(&proposals_applied, curp.last_applied(), &[]); + observer.observe_u64( + &proposals_pending, + last_log_index.overflow_sub(commit_index), + &[], + ); }, )?; diff --git a/crates/curp/src/server/raw_curp/mod.rs b/crates/curp/src/server/raw_curp/mod.rs index 86c62eace..a129a9c71 100644 --- a/crates/curp/src/server/raw_curp/mod.rs +++ b/crates/curp/src/server/raw_curp/mod.rs @@ -768,12 +768,6 @@ impl RawCurp { if last_sent_index > log_w.commit_index { 
log_w.commit_to(last_sent_index); debug!("{} updates commit index to {last_sent_index}", self.id()); - metrics::get() - .proposals_committed - .observe(last_sent_index, &[]); - metrics::get() - .proposals_pending - .observe(log_w.last_log_index().overflow_sub(last_sent_index), &[]); self.apply(&mut *log_w); } } @@ -1503,6 +1497,11 @@ impl RawCurp { self.log.read().last_log_index() } + /// Get last applied index + pub(super) fn last_applied(&self) -> u64 { + self.log.read().last_as + } + /// Pick a node that has the same log as the current node pub(super) fn pick_new_leader(&self) -> Option { let last_idx = self.log.read().last_log_index(); @@ -1766,7 +1765,6 @@ impl RawCurp { /// Apply new logs fn apply(&self, log: &mut Log) { for i in (log.last_as + 1)..=log.commit_index { - metrics::get().proposals_applied.observe(i, &[]); let entry = log.get(i).unwrap_or_else(|| { unreachable!( "system corrupted, apply log[{i}] when we only have {} log entries", @@ -1898,10 +1896,6 @@ impl RawCurp { // check if commit_index needs to be updated if self.can_update_commit_index_to(log_w, index, term) && index > log_w.commit_index { log_w.commit_to(index); - metrics::get().proposals_committed.observe(index, &[]); - metrics::get() - .proposals_pending - .observe(log_w.last_log_index().overflow_sub(index), &[]); debug!("{} updates commit index to {index}", self.id()); self.apply(&mut *log_w); } diff --git a/doc/QUICK_START.md b/doc/QUICK_START.md new file mode 100644 index 000000000..bf0a8cdfb --- /dev/null +++ b/doc/QUICK_START.md @@ -0,0 +1,140 @@ +# Quick Start + +## Single node cluster + +### Using docker + +```bash +# Assume that docker engine environment is installed. 
+ +$ docker run -it --rm --name=xline -e RUST_LOG=xline=debug -p 2379:2379 ghcr.io/xline-kv/xline \ + xline \ + --name xline \ + --storage-engine rocksdb \ + --members xline=127.0.0.1:2379 \ + --data-dir /usr/local/xline/data-dir \ + --client-listen-urls http://0.0.0.0:2379 \ + --peer-listen-urls http://0.0.0.0:2380 \ + --client-advertise-urls http://127.0.0.1:2379 \ + --peer-advertise-urls http://127.0.0.1:2380 +``` + +```bash +# Try with etcdctl + +$ ETCDCTL_API=3 etcdctl put A 1 +OK +$ ETCDCTL_API=3 etcdctl get A +A +1 +``` + +### Build from source + +1. Install dependencies + +```bash +# Ubuntu/Debian + +$ sudo apt-get install -y autoconf autogen libtool + +# Requires protobuf-compiler >= 3.15 +$ git clone --branch v3.21.12 --recurse-submodules https://github.com/protocolbuffers/protobuf +$ cd protobuf +$ ./autogen.sh +$ ./configure +$ make -j$(nproc) +$ sudo make install +``` + +```bash +# macOS + +# Assume that brew is installed, or you could install brew by: +# /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" + +$ brew install protobuf +``` + +2. Build xline + +```bash +# Assume that rust compile environment installed, such as cargo, etc. + +# clone source code +$ git clone --recurse-submodules https://github.com/xline-kv/Xline + +# compile Xline +$ cd Xline +$ cargo build --release +``` + +3. Run xline + +```bash +$ ./target/release/xline --name xline \ + --storage-engine rocksdb \ + --members xline=127.0.0.1:2379 \ + --data-dir \ + --client-listen-urls http://0.0.0.0:2379 \ + --peer-listen-urls http://0.0.0.0:2380 \ + --client-advertise-urls http://127.0.0.1:2379 \ + --peer-advertise-urls http://127.0.0.1:2380 +``` + +## Standard xline cluster + +1. 
Start the cluster + +```bash +# Pull the latest image from ghcr.io +$ docker pull ghcr.io/xline-kv/xline:latest +# Copy some fixtures which are required by quick_start.sh +$ cp fixtures/{private,public}.pem scripts/ +# Using the quick start scripts +$ ./scripts/quick_start.sh +``` + +2. Basic requests + +```bash +# Set Key A's value to 1 +$ docker exec client /bin/sh -c "/usr/local/bin/etcdctl --endpoints=\"http://node1:2379\" put A 1" +OK + +# Get Key A's value +$ docker exec client /bin/sh -c "/usr/local/bin/etcdctl --endpoints=\"http://node1:2379\" get A" +A +1 +``` + +3. Inspect metrics + +After finishing `Start the cluster`, you can go to http://127.0.0.1:9090/graph. +You should be able to see the Prometheus web UI. + +For example: + +This means that `node1` is the leader. + +![](./img/prom_demo.png) + +For more metrics, please go to [metrics.md](./metrics.md). + +4. Benchmark + +```bash +$ ./scripts/quick_start.sh stop +$ ./scripts/benchmark.sh xline +``` + +## Directory Structure + +| directory name | description | +|----------------|---------------------------------------------------------| +| benchmark | a customized benchmark using CURP protocol based client | +| curp | the CURP protocol | +| xline | xline services | +| engine | persistent storage | +| utils | some utilities, like lock, config, etc. | +| scripts | the shell scripts for env deployment or benchmarking | diff --git a/doc/img/prom_demo.png b/doc/img/prom_demo.png new file mode 100644 index 000000000..4d253f02e Binary files /dev/null and b/doc/img/prom_demo.png differ diff --git a/doc/metrics.md b/doc/metrics.md new file mode 100644 index 000000000..0e8134295 --- /dev/null +++ b/doc/metrics.md @@ -0,0 +1,113 @@ +## Xline metrics + +Many metrics are similar to those in [etcd](https://etcd.io/docs/v3.5/metrics/). + +### CURP Server + +1. `leader_changes`: Counter +The number of leader changes seen. + +2. 
`learner_promote_failed`: Counter +The total number of failed learner promotions (likely learner not ready) while this member is leader. + +3. `learner_promote_succeed`: Counter +The total number of successful learner promotions while this member is leader. + +4. `heartbeat_send_failures`: Counter +The total number of leader heartbeat send failures (likely overloaded from slow disk). + +5. `apply_snapshot_in_progress`: UpDownCounter +Non-zero if the server is applying an incoming snapshot. 0 if none. + +6. `proposals_committed`: ObservableGauge +The total number of consensus proposals committed. + +7. `proposals_failed`: Counter +The total number of failed proposals seen. + +8. `proposals_applied`: ObservableGauge +The total number of consensus proposals applied. + +9. `proposals_pending`: ObservableGauge +The current number of pending proposals to commit. + +10. `snapshot_install_total_duration_seconds`: Histogram +The total latency distributions of save called by install_snapshot. + +11. `client_id_revokes`: Counter +The total number of client ID revocations. + +12. `has_leader`: ObservableGauge +Whether or not a leader exists. 1 if a leader exists, 0 otherwise. + +13. `is_leader`: ObservableGauge +Whether or not this member is a leader. 1 if it is, 0 otherwise. + +14. `is_learner`: ObservableGauge +Whether or not this member is a learner. 1 if it is, 0 otherwise. + +15. `server_id`: ObservableGauge +Server or member ID in hexadecimal format. 1 for 'server_id' label with the current ID. + +16. `sp_cnt`: ObservableGauge +The speculative pool size of this server. + +17. `online_clients`: ObservableGauge +The online client IDs count of this server if it is the leader. + +### CURP Client + +1. `client_retry_count`: Counter +The total number of retries when the client proposes to the cluster. + +2. `client_fast_path_count`: Counter +The total number of fast paths taken when the client proposes to the cluster. + +3. 
`client_slow_path_count`: Counter +The total number of slow paths taken when the client proposes to the cluster. + +4. `client_fast_path_fallback_slow_path_count`: Counter +The total number of fast path fallbacks into slow path when the client proposes to the cluster. + +### Xline + +1. `slow_read_indexes`: Counter +The total number of pending read indexes not in sync with the leader's, or timed-out read index requests. + +2. `read_indexes_failed`: Counter +The total number of failed read indexes seen. + +3. `lease_expired`: Counter +The total number of expired leases. + +4. `fd_used`: ObservableGauge +The number of used file descriptors. + +5. `fd_limit`: ObservableGauge +The file descriptor limit. + +6. `current_version`: ObservableGauge +Which version is running. 1 for 'server_version' label with the current version. + +7. `current_rust_version`: ObservableGauge +Which Rust version the server is running with. 1 for 'server_rust_version' label with the current version. + + +### Engine + +1. `engine_apply_snapshot_duration_seconds`: Histogram +The backend engine apply snapshot duration in seconds. + +2. `engine_write_batch_duration_seconds`: Histogram +The backend engine write batch duration in seconds; `batch_size` refers to the batch size and `sync` indicates whether the sync option is on. + +### Network + +1. `peer_sent_bytes`: Counter +The total number of bytes sent to peers. + +2. `peer_sent_failures`: Counter +The total number of send failures to peers. + +3. `peer_round_trip_time_seconds`: Histogram +The round-trip-time histogram between peers. diff --git a/doc/quick-start/Dockerfile b/doc/quick-start/Dockerfile deleted file mode 100644 index e82b1c669..000000000 --- a/doc/quick-start/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -FROM rust:1.67.1-slim-bullseye as builder - -WORKDIR /build - -COPY . . 
- -RUN set -eux && \ - sed -i s@/deb.debian.org/@/mirrors.aliyun.com/@g /etc/apt/sources.list && \ - apt-get update && apt-get install -y git build-essential autoconf autogen libtool clang && \ - git clone --branch v3.21.12 --recurse-submodules https://github.com/protocolbuffers/protobuf && \ - cd protobuf && ./autogen.sh && ./configure && make -j4 && make install && cd .. && ldconfig && \ - cargo build --release - -FROM debian:bullseye-slim - -RUN set -eux && \ - apt-get update && \ - apt-get install -y iproute2 iputils-ping procps && \ - rm -rf /var/lib/apt/lists/* - -COPY --from=builder /build/target/release/xline /usr/local/bin -COPY --from=builder /build/target/release/benchmark /usr/local/bin -COPY --from=builder /build/target/release/validation_lock_client /usr/local/bin - -CMD ["/usr/local/bin/xline"] diff --git a/doc/quick-start/README.md b/doc/quick-start/README.md deleted file mode 100644 index 6ddf13de9..000000000 --- a/doc/quick-start/README.md +++ /dev/null @@ -1,181 +0,0 @@ -# Quick Start - -## Run Xline from a pre-built image - -```bash -# Assume that docker engine environment is installed. - -docker run -it --name=xline ghcr.io/xline-kv/xline \ - xline \ - --name xline \ - --storage-engine rocksdb \ - --members xline=127.0.0.1:2379 \ - --data-dir /usr/local/xline/data-dir -``` - -## Run Xline from source code - -### Install dependencies - -#### Ubuntu/Debian - -```bash -sudo apt-get install -y autoconf autogen libtool - -# requires protobuf-compiler >= 3.15 -git clone --branch v3.21.12 --recurse-submodules https://github.com/protocolbuffers/protobuf -cd protobuf -./autogen.sh -./configure -make -j -sudo make install -``` - -#### macOS - -```bash -# Assume that brew is installed, or you could install brew by: -# /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" - -brew install protobuf -``` - -### Build Xline from source - -```bash -# Assume that rust compile environment installed, such as cargo, etc. 
- -# clone source code -git clone --recurse-submodules https://github.com/datenlord/Xline - -# compile Xline -cd Xline -cargo build --release -``` - -### Run Xline - -```bash -./target/release/xline --name xline \ - --storage-engine rocksdb \ - --members xline=127.0.0.1:2379 \ - --data-dir -``` - -## Test Xline cluster - -### Pull or Build image for validation - -#### Pull the latest image from ghcr.io -```bash -# Assume that docker engine environment is installed. - docker pull ghcr.io/xline-kv/xline:latest - ``` - -#### Build image -```bash -# Assume that docker engine environment is installed. - -# clone source code -git clone --recurse-submodules https://github.com/datenlord/Xline -cd Xline - -# build docker image -# you may need to add sudo before the command to make it work -docker build . -t ghcr.io/xline-kv/xline -f doc/quick-start/Dockerfile -``` - -### Start Xline servers - -```bash -cp ./fixtures/{private,public}.pem ./scripts - -./scripts/quick_start.sh -``` - -### Test basic etcd requests - -```bash -# Set Key A's value to 1 -docker exec client /bin/sh -c "/usr/local/bin/etcdctl --endpoints=\"http://172.20.0.3:2379\" put A 1" - -# Get Key A's value -docker exec client /bin/sh -c "/usr/local/bin/etcdctl --endpoints=\"http://172.20.0.3:2379\" get A" -``` - -### Membership Change -```bash -# Before member add -$ docker exec client /bin/sh -c "/usr/local/bin/etcdctl --endpoints=\"http://172.20.0.3:2379\" member list -w table" -+------------------+---------+-------+---------------------------------+---------------------------------+------------+ -| ID | STATUS | NAME | PEER ADDRS | CLIENT ADDRS | IS LEARNER | -+------------------+---------+-------+---------------------------------+---------------------------------+------------+ -| c446f1764cf82129 | started | node3 | 172.20.0.5:2379,172.20.0.5:2380 | 172.20.0.5:2379,172.20.0.5:2380 | false | -| 536070dcd739623d | started | node1 | 172.20.0.3:2379,172.20.0.3:2380 | 172.20.0.3:2379,172.20.0.3:2380 | false | 
-| c58a7f879100c944 | started | node2 | 172.20.0.4:2379,172.20.0.4:2380 | 172.20.0.4:2379,172.20.0.4:2380 | false | -+------------------+---------+-------+---------------------------------+---------------------------------+------------+ - -# do the member add -$ docker exec client /bin/sh -c "/usr/local/bin/etcdctl --endpoints=\"http://172.20.0.3:2379\" member add node4 --peer-urls=http://172.20.0.6:2379,http://172.20.0.6:2380" -Member 7bcbb7db4adc6890 added to cluster 73917cf4cbc75001 - -ETCD_NAME="node4" -ETCD_INITIAL_CLUSTER="node4=http://172.20.0.6:2379,node4=http://172.20.0.6:2380,node2=172.20.0.4:2379,node2=172.20.0.4:2380,node3=172.20.0.5:2379,node3=172.20.0.5:2380,node1=172.20.0.3:2379,node1=172.20.0.3:2380" -ETCD_INITIAL_ADVERTISE_PEER_URLS="http://172.20.0.6:2379,http://172.20.0.6:2380" -ETCD_INITIAL_CLUSTER_STATE="existing" - -# boot up a new node -$ docker run -d -it --rm --name=node4 --net=xline_net --ip=172.20.0.6 --cap-add=NET_ADMIN --cpu-shares=1024 -m=512M -v /home/jiawei/Xline/scripts:/mnt ghcr.io/xline-kv/xline:latest bash - -$ docker exec -e RUST_LOG=debug -d node4 "/usr/local/bin/xline --name node4 --members node1=172.20.0.3:2379,172.20.0.3:2380,node2=172.20.0.4:2379,172.20.0.4:2380,node3=172.20.0.5:2379,172.20.0.5:2380,node4=172.20.0.6:2379,172.20.0.6:2380 --storage-engine rocksdb --data-dir /usr/local/xline/data-dir --auth-public-key /mnt/public.pem --auth-private-key /mnt/private.pem --initial-cluster-state=existing" - -# check whether the new member adding success or not -$ docker exec client /bin/sh -c "/usr/local/bin/etcdctl --endpoints=\"http://172.20.0.3:2379\" member list -w table" -+------------------+---------+-------+-----------------------------------------------+-----------------------------------------------+------------+ -| ID | STATUS | NAME | PEER ADDRS | CLIENT ADDRS | IS LEARNER | 
-+------------------+---------+-------+-----------------------------------------------+-----------------------------------------------+------------+ -| 7bcbb7db4adc6890 | started | node4 | http://172.20.0.6:2379,http://172.20.0.6:2380 | http://172.20.0.6:2379,http://172.20.0.6:2380 | false | -| c58a7f879100c944 | started | node2 | 172.20.0.4:2379,172.20.0.4:2380 | 172.20.0.4:2379,172.20.0.4:2380 | false | -| c446f1764cf82129 | started | node3 | 172.20.0.5:2379,172.20.0.5:2380 | 172.20.0.5:2379,172.20.0.5:2380 | false | -| 536070dcd739623d | started | node1 | 172.20.0.3:2379,172.20.0.3:2380 | 172.20.0.3:2379,172.20.0.3:2380 | false | -+------------------+---------+-------+-----------------------------------------------+-----------------------------------------------+------------+ - -# do the member remove -$ docker exec client /bin/sh -c "/usr/local/bin/etcdctl --endpoints=\"http://172.20.0.3:2379\" member remove 7bcbb7db4adc6890" -Member 7bcbb7db4adc6890 removed from cluster 73917cf4cbc75001 - -# check whether the target member removed success or not -$ docker exec client /bin/sh -c "/usr/local/bin/etcdctl --endpoints=\"http://172.20.0.3:2379\" member list -w table" -+------------------+---------+-------+---------------------------------+---------------------------------+------------+ -| ID | STATUS | NAME | PEER ADDRS | CLIENT ADDRS | IS LEARNER | -+------------------+---------+-------+---------------------------------+---------------------------------+------------+ -| c58a7f879100c944 | started | node2 | 172.20.0.4:2379,172.20.0.4:2380 | 172.20.0.4:2379,172.20.0.4:2380 | false | -| c446f1764cf82129 | started | node3 | 172.20.0.5:2379,172.20.0.5:2380 | 172.20.0.5:2379,172.20.0.5:2380 | false | -| 536070dcd739623d | started | node1 | 172.20.0.3:2379,172.20.0.3:2380 | 172.20.0.3:2379,172.20.0.3:2380 | false | -+------------------+---------+-------+---------------------------------+---------------------------------+------------+ -``` - -### Validation test - -```bash 
-docker cp node1:/usr/local/bin/lock_client ./scripts - -./scripts/validation_test.sh -``` - -### Benchmark - -```bash -./scripts/benchmark.sh -``` - -# Directory Structure - -| directory name | description | -|----------------|---------------------------------------------------------| -| benchmark | a customized benchmark using CURP protocol based client | -| curp | the CURP protocol | -| xline | xline services | -| engine | persistent storage | -| utils | some utilities, like lock, config, etc. | -| scripts | the shell scripts for env deployment or benchmarking | diff --git a/scripts/benchmark.sh b/scripts/benchmark.sh index 8bd96a300..fc1c16866 100755 --- a/scripts/benchmark.sh +++ b/scripts/benchmark.sh @@ -2,7 +2,7 @@ WORKDIR=$(pwd) OUTPUT_DIR="${WORKDIR}/out" SERVERS=("172.20.0.2" "172.20.0.3" "172.20.0.4" "172.20.0.5") -MEMBERS="node1=${SERVERS[1]}:2379,node2=${SERVERS[2]}:2379,node3=${SERVERS[3]}:2379" +MEMBERS="node1=${SERVERS[1]}:2380,node2=${SERVERS[2]}:2380,node3=${SERVERS[3]}:2380" # container use_curp endpoints # XLINE_TESTCASE[0] VS ETCD_TESTCASE[0]: In the best performance case contrast, xline uses the curp-client while the @@ -55,14 +55,18 @@ run_xline() { --client-wait-synced-timeout 10s \ --client-propose-timeout 5s \ --batch-timeout 1ms \ - --cmd-workers 16" + --cmd-workers 16 \ + --client-listen-urls=http://${SERVERS[$1]}:2379 \ + --peer-listen-urls=http://${SERVERS[$1]}:2380 \ + --client-advertise-urls=http://${SERVERS[$1]}:2379 \ + --peer-advertise-urls=http://${SERVERS[$1]}:2380" if [ ${1} -eq 1 ]; then cmd="${cmd} --is-leader" fi - docker exec -e RUST_LOG=curp,xline. 
-d node${1} ${cmd} - echo "docker exec -e RUST_LOG=curp,xline -d node${1} ${cmd}" + docker exec -e RUST_LOG=curp=debug,xline=debug -d node${1} ${cmd} + echo "docker exec -e RUST_LOG=curp=debug,xline=debug -d node${1} ${cmd}" } # run etcd node by index @@ -127,7 +131,7 @@ stop_all() { for name in "node1" "node2" "node3" "client"; do docker_id=$(docker ps -qf "name=${name}") if [ -n "$docker_id" ]; then - docker stop $docker_id + docker stop $docker_id -t 1 fi done sleep 1 @@ -216,7 +220,7 @@ rm -r ${OUTPUT_DIR} >/dev/null 2>&1 mkdir ${OUTPUT_DIR} mkdir ${OUTPUT_DIR}/logs -for server in "xline" "etcd"; do +for server in $@; do count=0 logs_dir=${OUTPUT_DIR}/logs/${server}_logs mkdir -p ${logs_dir} @@ -229,11 +233,14 @@ for server in "xline" "etcd"; do etcd) TESTCASE=("${ETCD_TESTCASE[@]}") ;; + *) + echo "unknown server, only support xline/etcd" + exit 1 + ;; esac run_container 3 ${server} for testcase in "${TESTCASE[@]}"; do - tmp=(${testcase}) container_name=${tmp[0]} case ${tmp[1]} in diff --git a/scripts/log.sh b/scripts/log.sh index 9afc534cb..b9f9a728d 100644 --- a/scripts/log.sh +++ b/scripts/log.sh @@ -1,4 +1,4 @@ -${__E2E_COMMON_LOG__:=false} && return 0 || __E2E_COMMON_LOG__=true +${__LOG__:=false} && return 0 || __LOG__=true function log::debug() { echo -e "\033[00;34m" "[DEBUG]" "$@" "\033[0m" diff --git a/scripts/prometheus.yml b/scripts/prometheus.yml index c3c5bd9c8..05b96c8a2 100644 --- a/scripts/prometheus.yml +++ b/scripts/prometheus.yml @@ -6,10 +6,9 @@ scrape_configs: static_configs: - targets: [ - "172.20.0.2:2379", - "172.20.0.3:2379", - "172.20.0.4:2379", - "172.20.0.5:2379", + "node1:9100", + "node2:9100", + "node3:9100", ] metrics_path: /metrics scheme: http diff --git a/scripts/quick_start.sh b/scripts/quick_start.sh index 1185bdbf2..3a0652ad0 100755 --- a/scripts/quick_start.sh +++ b/scripts/quick_start.sh @@ -1,10 +1,12 @@ #!/bin/bash + DIR=$( cd "$(dirname "$0")" pwd ) SERVERS=("172.20.0.2" "172.20.0.3" "172.20.0.4" "172.20.0.5") 
MEMBERS="node1=${SERVERS[1]}:2380,${SERVERS[1]}:2381,node2=${SERVERS[2]}:2380,${SERVERS[2]}:2381,node3=${SERVERS[3]}:2380,${SERVERS[3]}:2381" +LOG_PATH=${LOG_PATH:-"/mnt"} # default log to /mnt (i.e. the scripts directory) source $DIR/log.sh @@ -14,11 +16,12 @@ stop_all() { for name in "node1" "node2" "node3" "client"; do docker_id=$(docker ps -qf "name=${name}") if [ -n "$docker_id" ]; then - docker stop $docker_id + docker exec $docker_id rm -rf $LOG_PATH/$name + docker stop $docker_id -t 1 fi done docker network rm xline_net >/dev/null 2>&1 - docker stop "prometheus" + docker stop "prometheus" > /dev/null 2>&1 sleep 1 log::info stopped } @@ -37,18 +40,16 @@ run_xline() { --client-listen-urls=http://${SERVERS[$1]}:2379 \ --peer-listen-urls=http://${SERVERS[$1]}:2380,http://${SERVERS[$1]}:2381 \ --client-advertise-urls=http://${SERVERS[$1]}:2379 \ - --peer-advertise-urls=http://${SERVERS[$1]}:2380,http://${SERVERS[$1]}:2381" - - if [ -n "$LOG_LEVEL" ]; then - cmd="${cmd} --log-level ${LOG_LEVEL}" - fi + --peer-advertise-urls=http://${SERVERS[$1]}:2380,http://${SERVERS[$1]}:2381 \ + --log-file ${LOG_PATH}/node${1} --log-level debug" if [ ${1} -eq 1 ]; then cmd="${cmd} --is-leader" fi - docker exec -e RUST_LOG=debug -d node${1} ${cmd} - log::info "command is: docker exec -e RUST_LOG=debug -d node${1} ${cmd}" + exec="docker exec -e RUST_LOG=curp=debug,xline=debug -d node${1} ${cmd}" + eval $exec + log::info "start node${1} with command: ${exec}" } # run cluster of xline/etcd in container @@ -80,7 +81,7 @@ run_container() { done docker run -d -it --rm --name=client \ --net=xline_net --ip=${SERVERS[0]} --cap-add=NET_ADMIN \ - --cpu-shares=1024 -m=512M -v ${DIR}:/mnt ghcr.io/xline-kv/etcdctl:v3.5.9 bash & + --cpu-shares=1024 -m=512M -v ${DIR}:/mnt ghcr.io/xline-kv/etcdctl:v3.5.9 tail -F /dev/null wait log::info container started } @@ -99,7 +100,7 @@ if [ -z "$1" ]; then run_container 3 run_cluster run_prometheus "172.20.0.6" - echo "Prometheus starts on 
http://172.20.0.6:9090/graph and http://127.0.0.1:9090/graph" + echo "Prometheus starts on http://172.20.0.6:9090/graph and http://127.0.0.1:9090/graph (if you are using Docker Desktop)." exit 0 elif [ "$1" == "stop" ]; then stop_all