Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Enhancement] Do checkpoint on follower node to relieve memory pressure on leader #52103

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions fe/fe-core/src/main/java/com/starrocks/common/Config.java
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,9 @@ public class Config extends ConfigBase {
@ConfField
public static long bdbje_reserved_disk_size = 512L * 1024 * 1024;

@ConfField
public static long checkpoint_timeout_s = 24 * 3600;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
public static long checkpoint_timeout_s = 24 * 3600;
public static long checkpoint_timeout_seconds = 24 * 3600;


/**
* the max txn number which bdbje can roll back when trying to rejoin the group
*/
Expand Down
11 changes: 10 additions & 1 deletion fe/fe-core/src/main/java/com/starrocks/ha/LeaderInfo.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,21 @@ public class LeaderInfo implements Writable {
private int httpPort;
@SerializedName("rp")
private int rpcPort;
@SerializedName("ep")
private long epoch;

/**
 * Builds an empty descriptor: blank ip, both ports set to 0 and epoch set to 0.
 */
public LeaderInfo() {
    // Same defaults as assigning each field by hand, routed through the full constructor.
    this("", 0, 0, 0);
}

public LeaderInfo(String ip, int httpPort, int rpcPort) {
/**
 * Builds a descriptor from explicit values.
 *
 * @param ip       value stored in the {@code ip} field
 * @param httpPort value stored in the {@code httpPort} field
 * @param rpcPort  value stored in the {@code rpcPort} field
 * @param epoch    value stored in the {@code epoch} field
 */
public LeaderInfo(String ip, int httpPort, int rpcPort, long epoch) {
    // The four assignments are independent of each other.
    this.epoch = epoch;
    this.rpcPort = rpcPort;
    this.httpPort = httpPort;
    this.ip = ip;
}

public String getIp() {
Expand All @@ -70,8 +74,13 @@ public void setRpcPort(int rpcPort) {
this.rpcPort = rpcPort;
}

/**
 * @return the epoch value held by this descriptor
 */
public long getEpoch() {
    return this.epoch;
}

/**
 * Renders this object to JSON with the shared Gson instance and writes the resulting
 * string to {@code out}.
 *
 * @param out destination the JSON string is written to
 * @throws IOException if writing the string fails
 */
@Override
public void write(DataOutput out) throws IOException {
    final String json = GsonUtils.GSON.toJson(this);
    Text.writeString(out, json);
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@
public class MetaService {
private static final Logger LOG = LogManager.getLogger(MetaService.class);

private static final int TIMEOUT_SECOND = 10;
public static final int DOWNLOAD_TIMEOUT_SECOND = 10;

public static class ImageAction extends MetaBaseAction {
private static final String VERSION = "version";
Expand Down Expand Up @@ -283,7 +283,7 @@ public void executeGet(BaseRequest request, BaseResponse response) {
if (Files.exists(Path.of(realDir + "/" + filename))) {
LOG.info("image file : {} version: {} already exists, ignore", filename, imageFormatVersion);
} else {
MetaHelper.downloadImageFile(url, TIMEOUT_SECOND * 1000, versionStr, dir);
MetaHelper.downloadImageFile(url, DOWNLOAD_TIMEOUT_SECOND * 1000, versionStr, dir);
}
writeResponse(request, response);
} catch (FileNotFoundException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,15 @@

import com.google.common.base.Strings;
import com.google.gson.Gson;
import com.google.gson.annotations.SerializedName;
import com.starrocks.common.Config;
import com.starrocks.common.DdlException;
import com.starrocks.common.Version;
import com.starrocks.http.ActionController;
import com.starrocks.http.BaseRequest;
import com.starrocks.http.BaseResponse;
import com.starrocks.http.IllegalArgException;
import com.starrocks.monitor.jvm.JvmStats;
import com.starrocks.server.GlobalStateMgr;
import io.netty.handler.codec.http.HttpMethod;
import org.apache.logging.log4j.LogManager;
Expand All @@ -57,15 +59,8 @@
public class BootstrapFinishAction extends RestBaseAction {
private static final Logger LOG = LogManager.getLogger(BootstrapFinishAction.class);

private static final String CLUSTER_ID = "cluster_id";
private static final String TOKEN = "token";

public static final String REPLAYED_JOURNAL_ID = "replayedJournalId";
public static final String QUERY_PORT = "queryPort";
public static final String RPC_PORT = "rpcPort";
public static final String FE_START_TIME = "feStartTime";
public static final String FE_VERSION = "feVersion";

/**
 * Creates the action; the controller is handed straight to the superclass constructor.
 *
 * @param controller controller passed through to {@code RestBaseAction}
 */
public BootstrapFinishAction(ActionController controller) {
super(controller);
}
Expand Down Expand Up @@ -96,11 +91,12 @@ public void execute(BaseRequest request, BaseResponse response) throws DdlExcept
// cluster id and token are valid, return replayed journal id
long replayedJournalId = GlobalStateMgr.getCurrentState().getReplayedJournalId();
long feStartTime = GlobalStateMgr.getCurrentState().getFeStartTime();
result.setMaxReplayedJournal(replayedJournalId);
result.setReplayedJournal(replayedJournalId);
result.setQueryPort(Config.query_port);
result.setRpcPort(Config.rpc_port);
result.setFeStartTime(feStartTime);
result.setFeVersion(Version.STARROCKS_VERSION + "-" + Version.STARROCKS_COMMIT_HASH);
result.setHeapUsedPercent(JvmStats.getJvmHeapUsedPercent());
}
}
} else {
Expand All @@ -114,11 +110,18 @@ public void execute(BaseRequest request, BaseResponse response) throws DdlExcept
}

public static class BootstrapResult extends RestBaseResult {
// JSON property names of the bootstrap response. Gson uses the annotation value verbatim,
// so the key below must be exactly "replayedJournalId": a trailing space inside the quotes
// would produce a different property name and silently leave this field at 0 for any peer
// that reads or writes the unpadded key (presumably older FEs relying on the field name).
@SerializedName("replayedJournalId")
private long replayedJournalId = 0;
// Port values reported by the responding FE (filled from Config.query_port / Config.rpc_port).
@SerializedName("queryPort")
private int queryPort = 0;
@SerializedName("rpcPort")
private int rpcPort = 0;
// Start time of the responding FE as returned by GlobalStateMgr.getFeStartTime().
@SerializedName("feStartTime")
private long feStartTime = 0;
// Version string built as "<version>-<commit hash>".
@SerializedName("feVersion")
private String feVersion;
// JVM heap usage as reported by JvmStats.getJvmHeapUsedPercent().
@SerializedName("heapUsedPercent")
private float heapUsedPercent;

public BootstrapResult() {
super();
Expand All @@ -128,11 +131,11 @@ public BootstrapResult(String msg) {
super(msg);
}

public void setMaxReplayedJournal(long replayedJournalId) {
/**
 * Stores the replayed journal id carried by this result.
 *
 * @param replayedJournalId new value of the {@code replayedJournalId} field
 */
public void setReplayedJournal(final long replayedJournalId) {
    this.replayedJournalId = replayedJournalId;
}

public long getMaxReplayedJournal() {
/**
 * @return the replayed journal id carried by this result
 */
public long getReplayedJournal() {
    return this.replayedJournalId;
}

Expand Down Expand Up @@ -168,10 +171,23 @@ public void setFeVersion(String feVersion) {
this.feVersion = feVersion;
}

/**
 * @return the heap-used percentage carried by this result
 */
public float getHeapUsedPercent() {
    return this.heapUsedPercent;
}

/**
 * Stores the heap-used percentage carried by this result.
 *
 * @param heapUsedPercent new value of the {@code heapUsedPercent} field
 */
public void setHeapUsedPercent(final float heapUsedPercent) {
    this.heapUsedPercent = heapUsedPercent;
}

/**
 * Serializes this result to a JSON string using a freshly created Gson instance.
 *
 * @return JSON representation of this object
 */
@Override
public String toJson() {
    return new Gson().toJson(this);
}

/**
 * Parses a JSON string into a {@code BootstrapResult} using a freshly created Gson instance.
 *
 * @param jsonStr JSON text to parse
 * @return the object Gson builds from {@code jsonStr}
 */
public static BootstrapResult fromJson(String jsonStr) {
    return new Gson().fromJson(jsonStr, BootstrapResult.class);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -100,4 +100,8 @@ public String getCode() {
/**
 * @return the message held by this result
 */
public String getMessage() {
    return this.message;
}

/**
 * @return the status held by this result
 */
public ActionStatus getStatus() {
    return this.status;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ public Map<String, String> getHaInfo() {
try {
master = haProtocol.getLeader();
} catch (Exception e) {
// this may happen when majority of FOLLOWERS are down and no MASTER right now.
// this may happen when the majority of FOLLOWERS are down and no MASTER right now.
LOG.warn("failed to get leader: {}", e.getMessage());
}
if (master != null) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.starrocks.journal;

/**
 * Checked exception used to report that a checkpoint request cannot be accepted or completed.
 */
public class CheckpointException extends Exception {
    /**
     * @param msg human-readable description of the failure
     */
    public CheckpointException(String msg) {
        super(msg);
    }

    /**
     * Variant that keeps the underlying failure attached so its stack trace is not lost.
     *
     * @param msg   human-readable description of the failure
     * @param cause the exception that triggered this one; may be {@code null}
     */
    public CheckpointException(String msg, Throwable cause) {
        super(msg, cause);
    }
}
159 changes: 159 additions & 0 deletions fe/fe-core/src/main/java/com/starrocks/journal/CheckpointWorker.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.starrocks.journal;

import com.starrocks.common.Config;
import com.starrocks.common.FeConstants;
import com.starrocks.common.util.FrontendDaemon;
import com.starrocks.leader.CheckpointController;
import com.starrocks.persist.ImageLoader;
import com.starrocks.rpc.ThriftConnectionPool;
import com.starrocks.rpc.ThriftRPCRequestExecutor;
import com.starrocks.server.GlobalStateMgr;
import com.starrocks.thrift.TFinishCheckpointRequest;
import com.starrocks.thrift.TFinishCheckpointResponse;
import com.starrocks.thrift.TStatus;
import com.starrocks.thrift.TStatusCode;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.thrift.TException;

import java.io.IOException;

/**
 * Base class for daemons that build an image up to a requested journal id and then report
 * the outcome, either directly to the local {@link CheckpointController} (when this node is
 * the leader) or to the leader through the {@code finishCheckpoint} thrift call.
 *
 * <p>A request is registered with {@link #setNextCheckpoint(long, long)} and picked up by the
 * next {@link #runAfterCatalogReady()} cycle. The pending request is kept in a
 * {@code volatile} field because {@code setNextCheckpoint} is public and is presumably invoked
 * from a thread other than the one running the daemon cycle (TODO confirm against callers).
 */
public abstract class CheckpointWorker extends FrontendDaemon {
    public static final Logger LOG = LogManager.getLogger(CheckpointWorker.class);

    protected final String imageDir;
    protected final Journal journal;
    protected final GlobalStateMgr servingGlobalState;

    // Latest requested checkpoint; null until the first request arrives. Volatile so that a
    // request published by setNextCheckpoint() is visible to the cycle that consumes it.
    private volatile NextPoint nextPoint;

    /**
     * @param name    daemon name passed to {@link FrontendDaemon}
     * @param journal journal used to validate requested journal ids
     * @param subDir  suffix appended to the serving state's image directory
     */
    public CheckpointWorker(String name, Journal journal, String subDir) {
        super(name, FeConstants.checkpoint_interval_second * 1000L);
        this.servingGlobalState = GlobalStateMgr.getServingState();
        this.imageDir = servingGlobalState.getImageDir() + subDir;
        this.journal = journal;
    }

    /** Builds the image for {@code journalId}; any exception is reported as a failed checkpoint. */
    abstract void doCheckpoint(long epoch, long journalId) throws Exception;

    /** Controller notified directly when this node is the leader. */
    abstract CheckpointController getCheckpointController();

    /** Value sent as {@code is_global_state_mgr} in the finish-checkpoint request. */
    abstract boolean isBelongToGlobalStateMgr();

    /**
     * Registers the checkpoint to be created by the next daemon cycle, replacing any
     * previously registered one.
     *
     * @param epoch     epoch of the request; must equal the serving state's current epoch
     * @param journalId journal id to checkpoint up to; must not exceed the journal's max id
     * @throws CheckpointException if the epoch differs or the journal id is not available yet
     */
    public void setNextCheckpoint(long epoch, long journalId) throws CheckpointException {
        if (epoch != servingGlobalState.getEpoch()) {
            throw new CheckpointException(String.format("epoch: %d is not equal to current epoch: %d",
                    epoch, servingGlobalState.getEpoch()));
        }
        if (journalId > journal.getMaxJournalId()) {
            throw new CheckpointException(String.format("can not find journal id: %d , current max journal id is: %d",
                    journalId, journal.getMaxJournalId()));
        }

        nextPoint = new NextPoint(epoch, journalId);
    }

    /**
     * One daemon cycle: creates an image for the pending request unless there is none, the
     * existing image already covers it, or its epoch no longer matches the serving state.
     */
    @Override
    protected void runAfterCatalogReady() {
        // Read the shared reference exactly once so epoch and journalId always belong to the
        // same request even if setNextCheckpoint() swaps in a new one during this cycle.
        final NextPoint point = nextPoint;
        if (point == null) {
            return;
        }
        if (point.journalId <= getImageJournalId()) {
            return;
        }
        if (point.epoch != servingGlobalState.getEpoch()) {
            return;
        }

        createImage(point.epoch, point.journalId);
    }

    /** Runs {@link #doCheckpoint(long, long)} and reports success or failure. */
    private void createImage(long epoch, long journalId) {
        try {
            doCheckpoint(epoch, journalId);
        } catch (Exception e) {
            LOG.warn("create image failed", e);
            // getMessage() is null for many exceptions; fall back to toString() so the
            // reported failure reason is never null.
            String reason = e.getMessage() != null ? e.getMessage() : e.toString();
            finishCheckpoint(epoch, journalId, false, reason);
            return;
        }

        finishCheckpoint(epoch, journalId, true, "success");
    }

    /**
     * Reports the checkpoint outcome. Nothing is reported when the epoch changed while the
     * image was being built. Reporting failures are logged and not rethrown.
     */
    private void finishCheckpoint(long epoch, long journalId, boolean isSuccess, String message) {
        if (epoch != servingGlobalState.getEpoch()) {
            LOG.warn("epoch outdated, do not finish checkpoint");
            return;
        }

        String nodeName = servingGlobalState.getNodeMgr().getNodeName();
        if (servingGlobalState.isLeader()) {
            // Leader: talk to the controller in-process.
            CheckpointController controller = getCheckpointController();
            if (isSuccess) {
                try {
                    controller.finishCheckpoint(journalId, nodeName);
                } catch (CheckpointException e) {
                    LOG.warn("finish checkpoint failed", e);
                }
            } else {
                controller.cancelCheckpoint(nodeName, message);
            }
        } else {
            // Non-leader: send the result to the leader's RPC endpoint.
            TFinishCheckpointRequest request = new TFinishCheckpointRequest();
            request.setJournal_id(journalId);
            request.setNode_name(nodeName);
            request.setIs_success(isSuccess);
            request.setMessage(message);
            request.setIs_global_state_mgr(isBelongToGlobalStateMgr());

            try {
                TFinishCheckpointResponse response = ThriftRPCRequestExecutor.call(
                        ThriftConnectionPool.frontendPool,
                        servingGlobalState.getNodeMgr().getLeaderRpcEndpoint(),
                        Config.thrift_rpc_timeout_ms,
                        client -> client.finishCheckpoint(request));
                TStatus status = response.getStatus();
                if (status.getStatus_code() != TStatusCode.OK) {
                    String errMessage = "";
                    if (status.getError_msgs() != null && !status.getError_msgs().isEmpty()) {
                        errMessage = String.join(",", status.getError_msgs());
                    }
                    LOG.warn("call finishCheckpoint failed, error message: {}", errMessage);
                }
            } catch (TException e) {
                LOG.warn("call finishCheckpoint failed", e);
            }
        }
    }

    /**
     * @return the journal id of the image found in {@code imageDir}, or 0 when it cannot be
     *         read (so any positive requested journal id is treated as newer)
     */
    private long getImageJournalId() {
        try {
            ImageLoader imageLoader = new ImageLoader(imageDir);
            return imageLoader.getImageJournalId();
        } catch (IOException e) {
            LOG.warn("get image journal id failed", e);
            return 0;
        }
    }

    /** Immutable (epoch, journalId) pair describing one requested checkpoint. */
    static class NextPoint {
        private final long epoch;
        private final long journalId;

        public NextPoint(long epoch, long journalId) {
            this.epoch = epoch;
            this.journalId = journalId;
        }
    }
}
Loading
Loading