diff --git a/.github/workflows/slow_test.yml b/.github/workflows/slow_test.yml index de45fa258d..6c6ba46ac0 100644 --- a/.github/workflows/slow_test.yml +++ b/.github/workflows/slow_test.yml @@ -163,9 +163,9 @@ jobs: env: old_value: ${{ steps.start_tests_debug.outputs.old_mmmap_rnd_bits }} run: | - pids=$(sudo docker exec ${BUILDER_CONTAINER} pgrep -f cmake-build-debug/src/infinity | xargs echo) sudo chmod +x scripts/timeout_kill.sh - sudo docker exec ${BUILDER_CONTAINER} bash -c "/infinity/scripts/timeout_kill.sh 10 ${pids}" + pids=$(sudo docker exec ${BUILDER_CONTAINER} pgrep -f cmake-build-debug/src/infinity | xargs echo) + sudo docker exec ${BUILDER_CONTAINER} bash -c "/infinity/scripts/timeout_kill.sh 15 ${pids}" if [ $? -ne 0 ]; then echo "Failed to kill infinity debug version" exit 1 @@ -216,9 +216,9 @@ jobs: # && !contains(github.event.pull_request.labels.*.name, 'invalid') id: stop_py_tests run: | - pids=$(sudo docker exec ${BUILDER_CONTAINER} pgrep -f cmake-build-release/src/infinity | xargs echo) sudo chmod +x scripts/timeout_kill.sh - sudo docker exec ${BUILDER_CONTAINER} bash -c "/infinity/scripts/timeout_kill.sh 10 ${pids}" + pids=$(sudo docker exec ${BUILDER_CONTAINER} pgrep -f cmake-build-release/src/infinity | xargs echo) + sudo docker exec ${BUILDER_CONTAINER} bash -c "/infinity/scripts/timeout_kill.sh 15 ${pids}" if [ $? -ne 0 ]; then echo "Failed to kill infinity release version" exit 1 diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ebdade27b6..4452fa224e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -140,9 +140,9 @@ jobs: if: ${{ !cancelled() }} id: stop_tests_debug_minio run: | - pids=$(sudo docker exec ${BUILDER_CONTAINER} pgrep -f cmake-build-debug/src/infinity | xargs echo) sudo chmod +x scripts/timeout_kill.sh - sudo docker exec ${BUILDER_CONTAINER} bash -c "/infinity/scripts/timeout_kill.sh 10 ${pids}" + pids=$(sudo docker exec ${BUILDER_CONTAINER} pgrep -f cmake-build-debug/src/infinity | xargs echo) + sudo docker exec ${BUILDER_CONTAINER} bash -c "/infinity/scripts/timeout_kill.sh 15 ${pids}" if [ $? -ne 0 ]; then echo "Failed to kill infinity debug version" exit 1 @@ -170,9 +170,9 @@ jobs: if: ${{ !cancelled() }} id: stop_tests_debug run: | - pids=$(sudo docker exec ${BUILDER_CONTAINER} pgrep -f cmake-build-debug/src/infinity | xargs echo) sudo chmod +x scripts/timeout_kill.sh - sudo docker exec ${BUILDER_CONTAINER} bash -c "/infinity/scripts/timeout_kill.sh 10 ${pids}" + pids=$(sudo docker exec ${BUILDER_CONTAINER} pgrep -f cmake-build-debug/src/infinity | xargs echo) + sudo docker exec ${BUILDER_CONTAINER} bash -c "/infinity/scripts/timeout_kill.sh 15 ${pids}" if [ $? -ne 0 ]; then echo "Failed to kill infinity debug version" exit 1 @@ -307,9 +307,9 @@ jobs: if: ${{ !cancelled() }} id: stop_tests_release_minio run: | - pids=$(sudo docker exec ${BUILDER_CONTAINER} pgrep -f cmake-build-release/src/infinity | xargs echo) sudo chmod +x scripts/timeout_kill.sh - sudo docker exec ${BUILDER_CONTAINER} bash -c "/infinity/scripts/timeout_kill.sh 10 ${pids}" + pids=$(sudo docker exec ${BUILDER_CONTAINER} pgrep -f cmake-build-release/src/infinity | xargs echo) + sudo docker exec ${BUILDER_CONTAINER} bash -c "/infinity/scripts/timeout_kill.sh 15 ${pids}" if [ $? -ne 0 ]; then echo "Failed to kill infinity debug version" exit 1 @@ -337,9 +337,9 @@ jobs: if: ${{ !cancelled() }} id: stop_tests_release run: | - pids=$(sudo docker exec ${BUILDER_CONTAINER} pgrep -f cmake-build-release/src/infinity | xargs echo) sudo chmod +x scripts/timeout_kill.sh - sudo docker exec ${BUILDER_CONTAINER} bash -c "/infinity/scripts/timeout_kill.sh 10 ${pids}" + pids=$(sudo docker exec ${BUILDER_CONTAINER} pgrep -f cmake-build-release/src/infinity | xargs echo) + sudo docker exec ${BUILDER_CONTAINER} bash -c "/infinity/scripts/timeout_kill.sh 15 ${pids}" if [ $? -ne 0 ]; then echo "Failed to kill infinity debug version" exit 1 diff --git a/scripts/timeout_kill.sh b/scripts/timeout_kill.sh index 81c1baee53..c191f21f52 100644 --- a/scripts/timeout_kill.sh +++ b/scripts/timeout_kill.sh @@ -15,6 +15,7 @@ fi # kill all infinity process for pid in "${@:2}"; do # Send SIGTERM + echo "Terminate pid: $pid" kill -15 $pid done @@ -27,6 +28,7 @@ while true; do # Check if all processes are still running all_dead=true for pid in "${@:2}"; do + echo "Check pid: $pid status" if ps -p $pid > /dev/null; then all_dead=false break @@ -43,7 +45,10 @@ while true; do if [ $current_time -ge $end_time ]; then echo "Some processes did not terminate in time. Sending SIGKILL." for pid in "${@:2}"; do - kill -9 $pid + if ps -p $pid > /dev/null; then + echo "Pid: $pid didn't terminate" + kill -9 $pid + fi done exit 2 # Return a different value fi diff --git a/src/main/cluster_manager_leader.cpp b/src/main/cluster_manager_leader.cpp index 748e29962e..abe48729c3 100644 --- a/src/main/cluster_manager_leader.cpp +++ b/src/main/cluster_manager_leader.cpp @@ -114,6 +114,11 @@ Status ClusterManager::AddNodeInfo(const SharedPtr &other_node) { return Status::DuplicateNode(other_node_name); } + if (other_node->node_ip() == this_node_->node_ip() and other_node->node_port() == this_node_->node_port()) { + return Status::InvalidServerAddress( + fmt::format("Follower or learner peer server address {}: {} are same as leader", this_node_->node_ip(), this_node_->node_port())); + } + // Add by register auto iter = other_node_map_.find(other_node_name); if (iter != other_node_map_.end()) { diff --git a/src/main/cluster_manager_reader.cpp b/src/main/cluster_manager_reader.cpp index 5e3555c9c2..fd405b5f60 100644 --- a/src/main/cluster_manager_reader.cpp +++ b/src/main/cluster_manager_reader.cpp @@ -125,7 +125,7 @@ Status ClusterManager::RegisterToLeaderNoLock() { Status status = Status::OK(); if (register_peer_task->error_code_ != 0) { status.code_ = static_cast(register_peer_task->error_code_); - status.msg_ = MakeUnique(register_peer_task->error_message_); + status.msg_ = MakeUnique(fmt::format("From leader: {}", register_peer_task->error_message_)); return status; } auto now = std::chrono::system_clock::now(); diff --git a/src/network/peer_server_thrift_service.cpp b/src/network/peer_server_thrift_service.cpp index 7efa35180b..3469d41887 100644 --- a/src/network/peer_server_thrift_service.cpp +++ b/src/network/peer_server_thrift_service.cpp @@ -69,6 +69,7 @@ void PeerServerThriftService::Register(infinity_peer_server::RegisterResponse &r response.leader_term = leader_node->leader_term(); response.heart_beat_interval = leader_node->heartbeat_interval(); } else { + LOG_ERROR(fmt::format("Node: {} fail to register with leader, error: {}", request.node_name, status.message())); response.error_code = static_cast(status.code()); response.error_message = status.message(); } diff --git a/src/network/peer_thrift_client.cpp b/src/network/peer_thrift_client.cpp index 8af4eacdc9..dfb631a3dd 100644 --- a/src/network/peer_thrift_client.cpp +++ b/src/network/peer_thrift_client.cpp @@ -458,17 +458,17 @@ void PeerClient::SyncLogs(SyncLogTask *peer_task) { LOG_ERROR(fmt::format("Sync log to node: {}, error: {}", peer_task->node_name_, peer_task->error_message_)); } } catch (apache::thrift::transport::TTransportException &thrift_exception) { - peer_task->error_message_ = thrift_exception.what(); + peer_task->error_message_ = fmt::format("Sync log to node, transport error: {}, error: {}", peer_task->node_name_, thrift_exception.what()); peer_task->error_code_ = static_cast(ErrorCode::kCantConnectServer); - LOG_ERROR(fmt::format("Sync log to node, transport error: {}, error: {}", peer_task->node_name_, peer_task->error_message_)); + LOG_ERROR(peer_task->error_message_); Status status = InfinityContext::instance().cluster_manager()->UpdateNodeByLeader(peer_task->node_name_, UpdateNodeOp::kLostConnection); if (!status.ok()) { LOG_ERROR(status.message()); } } catch (apache::thrift::TApplicationException &application_exception) { - peer_task->error_message_ = application_exception.what(); + peer_task->error_message_ = fmt::format("Sync log to node, application: {}, error: {}", peer_task->node_name_, application_exception.what()); peer_task->error_code_ = static_cast(ErrorCode::kCantConnectServer); - LOG_ERROR(fmt::format("Sync log to node, application: {}, error: {}", peer_task->node_name_, peer_task->error_message_)); + LOG_ERROR(peer_task->error_message_); Status status = InfinityContext::instance().cluster_manager()->UpdateNodeByLeader(peer_task->node_name_, UpdateNodeOp::kLostConnection); if (!status.ok()) { LOG_ERROR(status.message()); @@ -500,18 +500,20 @@ void PeerClient::ChangeRole(ChangeRoleTask *change_role_task) { LOG_ERROR(fmt::format("Sync log to node: {}, error: {}", change_role_task->node_name_, change_role_task->error_message_)); } } catch (apache::thrift::transport::TTransportException &thrift_exception) { - change_role_task->error_message_ = thrift_exception.what(); + change_role_task->error_message_ = + fmt::format("Sync log to node, transport error: {}, error: {}", change_role_task->node_name_, thrift_exception.what()); change_role_task->error_code_ = static_cast(ErrorCode::kCantConnectServer); - LOG_ERROR(fmt::format("Sync log to node, transport error: {}, error: {}", change_role_task->node_name_, change_role_task->error_message_)); + LOG_ERROR(change_role_task->error_message_); Status status = InfinityContext::instance().cluster_manager()->UpdateNodeByLeader(change_role_task->node_name_, UpdateNodeOp::kLostConnection); if (!status.ok()) { LOG_ERROR(status.message()); } } catch (apache::thrift::TApplicationException &application_exception) { - change_role_task->error_message_ = application_exception.what(); + change_role_task->error_message_ = + fmt::format("Sync log to node, application: {}, error: {}", change_role_task->node_name_, application_exception.what()); change_role_task->error_code_ = static_cast(ErrorCode::kCantConnectServer); - LOG_ERROR(fmt::format("Sync log to node, application: {}, error: {}", change_role_task->node_name_, change_role_task->error_message_)); + LOG_ERROR(change_role_task->error_message_); Status status = InfinityContext::instance().cluster_manager()->UpdateNodeByLeader(change_role_task->node_name_, UpdateNodeOp::kLostConnection); if (!status.ok()) {