Skip to content

Commit

Permalink
raft: add fsync option to disable fsync for performance
Browse files Browse the repository at this point in the history
  • Loading branch information
erikgrinaker committed Jul 23, 2024
1 parent e47aef6 commit c73d0f1
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 5 deletions.
6 changes: 6 additions & 0 deletions config/toydb.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ data_dir: data
storage_raft: bitcask
storage_sql: bitcask

# Whether to fsync writes to disk. Disabling this yields much better write
# performance, but may lose data on host crashes and violate Raft guarantees. It
# only affects Raft log writes (the SQL state machine is never fsynced since it
# can be reconstructed from the Raft log).
fsync: true

# The minimum garbage fraction and bytes to trigger Bitcask log compaction on
# node startup.
compact_threshold: 0.2
Expand Down
8 changes: 7 additions & 1 deletion src/bin/toydb.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ struct Config {
storage_raft: String,
/// The SQL storage engine: bitcask or memory.
storage_sql: String,
/// If false, don't fsync Raft log writes to disk. Disabling this
/// will yield much better write performance, but may lose data on
/// host crashes which compromises Raft safety guarantees.
fsync: bool,
/// The garbage fraction threshold at which to trigger compaction.
compact_threshold: f64,
/// The minimum bytes of garbage before triggering compaction.
Expand All @@ -63,6 +67,7 @@ impl Config {
.set_default("data_dir", "data")?
.set_default("storage_raft", "bitcask")?
.set_default("storage_sql", "bitcask")?
.set_default("fsync", true)?
.set_default("compact_threshold", 0.2)?
.set_default("compact_min_bytes", 1_000_000)?
.add_source(config::File::with_name(file))
Expand Down Expand Up @@ -97,7 +102,7 @@ impl Command {

// Initialize the Raft log storage engine.
let datadir = std::path::Path::new(&cfg.data_dir);
let raft_log = match cfg.storage_raft.as_str() {
let mut raft_log = match cfg.storage_raft.as_str() {
"bitcask" | "" => {
let engine = storage::BitCask::new_compact(
datadir.join("raft"),
Expand All @@ -109,6 +114,7 @@ impl Command {
"memory" => raft::Log::new(Box::new(storage::Memory::new()))?,
name => return errinput!("invalid Raft storage engine {name}"),
};
raft_log.enable_fsync(cfg.fsync);

// Initialize the SQL storage engine.
let raft_state: Box<dyn raft::State> = match cfg.storage_sql.as_str() {
Expand Down
27 changes: 24 additions & 3 deletions src/raft/log.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,13 @@ pub struct Log {
commit_index: Index,
/// The term of the last committed entry.
commit_term: Term,
/// If true, fsync entries to disk when appended. This is mandated by Raft,
/// but comes with a hefty performance penalty (especially since we don't
/// optimize for it by batching entries before fsyncing). Disabling it will
/// yield much better write performance, but may lose data on host crashes,
/// which in some scenarios can cause log entries to become "uncommitted"
/// and state machines diverging.
fsync: bool,
}

impl Log {
Expand All @@ -125,7 +132,14 @@ impl Log {
.map(|v| bincode::deserialize(&v))
.transpose()?
.unwrap_or((0, 0));
Ok(Self { engine, term, vote, last_index, last_term, commit_index, commit_term })
let fsync = true; // fsync by default (NB: BitCask::flush() is a noop in tests)
Ok(Self { engine, term, vote, last_index, last_term, commit_index, commit_term, fsync })
}

/// Controls whether to fsync writes. Disabling this may violate Raft
/// guarantees, see comment on fsync attribute.
pub fn enable_fsync(&mut self, fsync: bool) {
self.fsync = fsync
}

/// Returns the commit index and term.
Expand Down Expand Up @@ -154,6 +168,9 @@ impl Log {
return Ok(());
}
self.engine.set(&Key::TermVote.encode(), bincode::serialize(&(term, vote)))?;
// Always fsync, even with Log.fsync = false. Term changes are rare, so
// this doesn't materially affect performance, and double voting could
// lead to multiple leaders and split brain which is really bad.
self.engine.flush()?;
self.term = term;
self.vote = vote;
Expand All @@ -169,7 +186,9 @@ impl Log {
// in the key, but we keep it simple.
let entry = Entry { index: self.last_index + 1, term: self.term, command };
self.engine.set(&Key::Entry(entry.index).encode(), entry.encode())?;
self.engine.flush()?;
if self.fsync {
self.engine.flush()?;
}
self.last_index = entry.index;
self.last_term = entry.term;
Ok(entry.index)
Expand Down Expand Up @@ -306,7 +325,9 @@ impl Log {
for index in last.index + 1..=self.last_index {
self.engine.delete(&Key::Entry(index).encode())?;
}
self.engine.flush()?;
if self.fsync {
self.engine.flush()?;
}

self.last_index = last.index;
self.last_term = last.term;
Expand Down
4 changes: 3 additions & 1 deletion src/storage/bitcask.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,9 @@ impl Engine for BitCask {
}

fn flush(&mut self) -> Result<()> {
// Don't fsync in tests, to speed them up.
// Don't fsync in tests, to speed them up. We disable this here, instead
// of setting raft::Log::fsync = false in tests, because we want to
// assert that the Raft log flushes to disk even if the flush is a noop.
#[cfg(not(test))]
self.log.file.sync_all()?;
Ok(())
Expand Down

0 comments on commit c73d0f1

Please sign in to comment.