From 8cc7b4bb3c03290a3c7a2f36ad7a1d976051483f Mon Sep 17 00:00:00 2001 From: Drew Willcoxon Date: Mon, 18 Nov 2024 00:04:48 -0800 Subject: [PATCH] Bug 1931812 - Don't reingest the DB on each query benchmark This addresses the bug and also makes one other change: I added a new method to `BenchmarkWithInput` called `global_input()` that returns some data that's then passed by reference to the benchmark's iterations. That's a little better than creating global data in benchmark constructors because it means only the global data for the current benchmark is in memory instead of the global data for all benchmarks in the group. In the case of query benchmarks, the global data is the store, so we avoid having N stores in memory at once, N = the number of benchmarks in the group. (I'm adding quite a few weather benchmarks.) It also made sense to rename `generate_input()` to `iteration_input()` to emphasize that that method is called on each iteration. I'm open to reverting this other change though if you disagree. The speedup between this and the main branch when running `time cargo suggest-bench -- query/query` isn't dramatic but it's a little better: ``` main branch: ~83s this branch: ~68s (~18% improvement) ``` --- components/suggest/benches/benchmark_all.rs | 5 +- components/suggest/src/benchmarks/ingest.rs | 9 ++-- components/suggest/src/benchmarks/mod.rs | 56 ++++++++++++++++++--- components/suggest/src/benchmarks/query.rs | 39 ++++++-------- components/suggest/src/store.rs | 16 ++++++ 5 files changed, 87 insertions(+), 38 deletions(-) diff --git a/components/suggest/benches/benchmark_all.rs b/components/suggest/benches/benchmark_all.rs index 26e401013f..03b09ff619 100644 --- a/components/suggest/benches/benchmark_all.rs +++ b/components/suggest/benches/benchmark_all.rs @@ -24,10 +24,11 @@ fn run_benchmarks( benchmarks: Vec<(&'static str, B)>, ) { for (name, benchmark) in benchmarks { + let g_input = benchmark.global_input(); group.bench_function(name.to_string(), |b| { b.iter_batched( - || benchmark.generate_input(), - |input| benchmark.benchmarked_code(input), + || benchmark.iteration_input(), + |i_input| benchmark.benchmarked_code(&g_input, i_input), // See https://docs.rs/criterion/latest/criterion/enum.BatchSize.html#variants for // a discussion of this. PerIteration is chosen for these benchmarks because the // input holds a database file handle diff --git a/components/suggest/src/benchmarks/ingest.rs b/components/suggest/src/benchmarks/ingest.rs index 9a04c2dc2f..99ad6cac69 100644 --- a/components/suggest/src/benchmarks/ingest.rs +++ b/components/suggest/src/benchmarks/ingest.rs @@ -48,9 +48,12 @@ impl IngestBenchmark { pub struct InputType(SuggestStoreInner); impl BenchmarkWithInput for IngestBenchmark { - type Input = InputType; + type GlobalInput = (); + type IterationInput = InputType; - fn generate_input(&self) -> Self::Input { + fn global_input(&self) -> Self::GlobalInput {} + + fn iteration_input(&self) -> Self::IterationInput { let data_path = self.temp_dir.path().join(unique_db_filename()); let store = SuggestStoreInner::new(data_path, vec![], self.client.clone()); store.ensure_db_initialized(); @@ -61,7 +64,7 @@ impl BenchmarkWithInput for IngestBenchmark { InputType(store) } - fn benchmarked_code(&self, input: Self::Input) { + fn benchmarked_code(&self, _: &Self::GlobalInput, input: Self::IterationInput) { let InputType(store) = input; store.ingest_records_by_type(self.record_type); } diff --git a/components/suggest/src/benchmarks/mod.rs b/components/suggest/src/benchmarks/mod.rs index 52efa276cd..1a7245e28c 100644 --- a/components/suggest/src/benchmarks/mod.rs +++ b/components/suggest/src/benchmarks/mod.rs @@ -10,7 +10,16 @@ //! //! All benchmarks are defined as structs that implement either the [Benchmark] or [BenchmarkWithInput] -use std::sync::atomic::{AtomicU32, Ordering}; +use std::{ + path::PathBuf, + sync::{ + atomic::{AtomicU32, Ordering}, + OnceLock, + }, +}; +use tempfile::TempDir; + +use crate::{SuggestIngestionConstraints, SuggestStore}; pub mod client; pub mod ingest; @@ -28,21 +37,52 @@ pub trait Benchmark { /// Trait for benchmarks that require input /// /// This will run using Criterion's `iter_batched` function. Criterion will create a batch of -/// inputs, then pass each one to benchmark. +/// inputs, then pass each one to the benchmark's iterations. /// -/// This supports simple benchmarks that don't require any input. Note: global setup can be done -/// in the `new()` method for the struct. +/// This supports simple benchmarks that don't require any input. pub trait BenchmarkWithInput { - type Input; + /// Input that will be created once and then passed by reference to each + /// of the benchmark's iterations. + type GlobalInput; + + /// Input that will be created for each of the benchmark's iterations. + type IterationInput; - /// Generate the input (this is not included in the benchmark time) - fn generate_input(&self) -> Self::Input; + /// Generate the global input (not included in the benchmark time) + fn global_input(&self) -> Self::GlobalInput; + + /// Generate the per-iteration input (not included in the benchmark time) + fn iteration_input(&self) -> Self::IterationInput; /// Perform the operations that we're benchmarking. - fn benchmarked_code(&self, input: Self::Input); + fn benchmarked_code(&self, g_input: &Self::GlobalInput, i_input: Self::IterationInput); } fn unique_db_filename() -> String { static COUNTER: AtomicU32 = AtomicU32::new(0); format!("db{}.sqlite", COUNTER.fetch_add(1, Ordering::Relaxed)) } + +/// Creates a new store that will contain all provider data currently in remote +/// settings. +fn new_store() -> SuggestStore { + // Create a "starter" store that will do an initial ingest, and then + // initialize every returned store with a copy of its DB so that each one + // doesn't need to reingest. + static STARTER: OnceLock<(TempDir, PathBuf)> = OnceLock::new(); + let (starter_dir, starter_db_path) = STARTER.get_or_init(|| { + let temp_dir = tempfile::tempdir().unwrap(); + let db_path = temp_dir.path().join(unique_db_filename()); + let store = + SuggestStore::new(&db_path.to_string_lossy(), None).expect("Error building store"); + store + .ingest(SuggestIngestionConstraints::all_providers()) + .expect("Error during ingestion"); + store.checkpoint(); + (temp_dir, db_path) + }); + + let db_path = starter_dir.path().join(unique_db_filename()); + std::fs::copy(starter_db_path, &db_path).expect("Error copying starter DB file"); + SuggestStore::new(&db_path.to_string_lossy(), None).expect("Error building store") +} diff --git a/components/suggest/src/benchmarks/query.rs b/components/suggest/src/benchmarks/query.rs index 30e41dc677..fc6b32d285 100644 --- a/components/suggest/src/benchmarks/query.rs +++ b/components/suggest/src/benchmarks/query.rs @@ -3,50 +3,39 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ use crate::{ - benchmarks::{unique_db_filename, BenchmarkWithInput}, - SuggestIngestionConstraints, SuggestStore, SuggestionProvider, SuggestionQuery, + benchmarks::{new_store, BenchmarkWithInput}, + SuggestStore, SuggestionProvider, SuggestionQuery, }; pub struct QueryBenchmark { - store: SuggestStore, provider: SuggestionProvider, query: &'static str, } impl QueryBenchmark { pub fn new(provider: SuggestionProvider, query: &'static str) -> Self { - let temp_dir = tempfile::tempdir().unwrap(); - let data_path = temp_dir.path().join(unique_db_filename()); - let store = - SuggestStore::new(&data_path.to_string_lossy(), None).expect("Error building store"); - store - .ingest(SuggestIngestionConstraints::all_providers()) - .expect("Error during ingestion"); - Self { - store, - provider, - query, - } + Self { provider, query } } } -// The input for each benchmark a query to pass to the store -pub struct InputType(SuggestionQuery); - impl BenchmarkWithInput for QueryBenchmark { - type Input = InputType; + type GlobalInput = SuggestStore; + type IterationInput = SuggestionQuery; - fn generate_input(&self) -> Self::Input { - InputType(SuggestionQuery { + fn global_input(&self) -> Self::GlobalInput { + new_store() + } + + fn iteration_input(&self) -> Self::IterationInput { + SuggestionQuery { providers: vec![self.provider], keyword: self.query.to_string(), ..SuggestionQuery::default() - }) + } } - fn benchmarked_code(&self, input: Self::Input) { - let InputType(query) = input; - self.store + fn benchmarked_code(&self, store: &Self::GlobalInput, query: Self::IterationInput) { + store .query(query) .unwrap_or_else(|e| panic!("Error querying store: {e}")); } diff --git a/components/suggest/src/store.rs b/components/suggest/src/store.rs index 6a2620cf7e..262a3407e8 100644 --- a/components/suggest/src/store.rs +++ b/components/suggest/src/store.rs @@ -266,6 +266,16 @@ impl SuggestStore { } } +#[cfg(feature = "benchmark_api")] +impl SuggestStore { + /// Creates a WAL checkpoint. This will cause changes in the write-ahead log + /// to be written to the DB. See: + /// https://sqlite.org/pragma.html#pragma_wal_checkpoint + pub fn checkpoint(&self) { + self.inner.checkpoint(); + } +} + /// Constraints limit which suggestions to ingest from Remote Settings. #[derive(Clone, Default, Debug, uniffi::Record)] pub struct SuggestIngestionConstraints { @@ -751,6 +761,12 @@ where self.dbs().unwrap(); } + fn checkpoint(&self) { + let conn = self.dbs().unwrap().writer.conn.lock(); + conn.pragma_update(None, "wal_checkpoint", "TRUNCATE") + .expect("Error performing checkpoint"); + } + pub fn ingest_records_by_type(&self, ingest_record_type: SuggestRecordType) { let writer = &self.dbs().unwrap().writer; let mut context = MetricsContext::default();