Skip to content

Commit

Permalink
accounts db/sort remove dup optimization (anza-xyz#1866)
Browse files Browse the repository at this point in the history
* optimize sort and remove dups for shrinking

* add bench

* pr changes

* more pr changes

* add bench for no dups

* report num dups in shrinking

* update tests

* update fn comments

---------

Co-authored-by: HaoranYi <[email protected]>
Co-authored-by: HaoranYi <[email protected]>
  • Loading branch information
3 people authored Jun 28, 2024
1 parent a4eb7fe commit ad9bed8
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 15 deletions.
51 changes: 49 additions & 2 deletions accounts-db/benches/accounts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@ use {
rand::Rng,
rayon::iter::{IntoParallelRefIterator, ParallelIterator},
solana_accounts_db::{
account_info::{AccountInfo, StorageLocation},
accounts::{AccountAddressFilter, Accounts},
accounts_db::{
test_utils::create_test_accounts, AccountShrinkThreshold, AccountsDb,
VerifyAccountsHashAndLamportsConfig, ACCOUNTS_DB_CONFIG_FOR_BENCHMARKS,
test_utils::create_test_accounts, AccountFromStorage, AccountShrinkThreshold,
AccountsDb, VerifyAccountsHashAndLamportsConfig, ACCOUNTS_DB_CONFIG_FOR_BENCHMARKS,
},
accounts_index::{AccountSecondaryIndexes, ScanConfig},
ancestors::Ancestors,
Expand Down Expand Up @@ -344,3 +345,49 @@ fn bench_load_largest_accounts(b: &mut Bencher) {
)
});
}

/// Benchmarks `AccountsDb::sort_and_remove_dups` on 1000 entries whose pubkeys are derived
/// from a random `u8`, so the input contains many repeated pubkeys.
#[bench]
fn bench_sort_and_remove_dups(b: &mut Bencher) {
    use rand::prelude::*;

    /// Builds a sample entry in which every field is derived from `seed`.
    fn sample_from_seed(seed: u8) -> AccountFromStorage {
        // Multiplying by the size of a u64 keeps the offset 8-byte aligned, as required.
        let aligned_offset = std::mem::size_of::<u64>() * usize::from(seed);
        let location = StorageLocation::AppendVec(u32::from(seed), aligned_offset);
        AccountFromStorage {
            index_info: AccountInfo::new(location, u64::from(seed)),
            data_len: u64::from(seed),
            pubkey: Pubkey::new_from_array([seed; 32]),
        }
    }

    // Fixed seed so every run benchmarks the same input.
    let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(1234);
    let accounts: Vec<_> = (0..1000)
        .map(|_| sample_from_seed(rng.gen::<u8>()))
        .collect();

    // The function mutates its argument, so each iteration works on a fresh copy.
    b.iter(|| {
        let mut scratch = accounts.clone();
        AccountsDb::sort_and_remove_dups(&mut scratch)
    });
}

/// Benchmarks `AccountsDb::sort_and_remove_dups` on 1000 entries that each get their own
/// `Pubkey::new_unique()`, i.e. the case where there is nothing to remove.
#[bench]
fn bench_sort_and_remove_dups_no_dups(b: &mut Bencher) {
    use rand::prelude::*;

    /// Builds a sample entry whose storage fields come from `seed` and whose pubkey is fresh.
    fn sample_with_unique_pubkey(seed: u8) -> AccountFromStorage {
        // Multiplying by the size of a u64 keeps the offset 8-byte aligned, as required.
        let aligned_offset = std::mem::size_of::<u64>() * usize::from(seed);
        let location = StorageLocation::AppendVec(u32::from(seed), aligned_offset);
        AccountFromStorage {
            index_info: AccountInfo::new(location, u64::from(seed)),
            data_len: u64::from(seed),
            pubkey: Pubkey::new_unique(),
        }
    }

    // Fixed seed so every run benchmarks the same input.
    let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(1234);
    let mut accounts: Vec<_> = (0..1000)
        .map(|_| sample_with_unique_pubkey(rng.gen::<u8>()))
        .collect();

    // Randomize the input order before benchmarking.
    accounts.shuffle(&mut rng);

    // The function mutates its argument, so each iteration works on a fresh copy.
    b.iter(|| {
        let mut scratch = accounts.clone();
        AccountsDb::sort_and_remove_dups(&mut scratch)
    });
}
54 changes: 41 additions & 13 deletions accounts-db/src/accounts_db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3929,26 +3929,29 @@ impl AccountsDb {
}
}

// NOTE(review): this is a diff hunk rendered without +/- markers, so the pre-commit and
// post-commit lines of this function are interleaved and the text is not compilable as shown.
// The [old]/[new] attribution in the comments below is inferred from the hunk header
// (26 old lines -> 29 new lines) -- confirm against the repository before relying on it.
//
// [old] doc comment, replaced by this commit:
/// sort `accounts` by pubkey.
/// Remove earlier entries with the same pubkey as later entries.
// [new] doc comment, added by this commit:
/// Sort `accounts` by pubkey and removes all but the *last* of consecutive
/// accounts in the vector with the same pubkey.
///
/// Return the number of duplicated elements in the vector.
#[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))]
fn sort_and_remove_dups(accounts: &mut Vec<AccountFromStorage>) -> usize {
// stable sort because we want the most recent only
// (a stable sort keeps entries with equal pubkeys in their input order, so the last entry of
// each run of equal pubkeys is the one that appeared last in the input)
accounts.sort_by(|a, b| a.pubkey().cmp(b.pubkey()));
// Length before deduplication; used at the end to compute how many entries were dropped.
let len0 = accounts.len();
// With 0 or 1 entries there is nothing to deduplicate.
if accounts.len() > 1 {
// [old] Pairwise scan: whenever two neighbours share a pubkey, delete the earlier one with
// `Vec::remove`, which shifts every following element left on each removal.
let mut i = 0;
// iterate 0..1 less than end
while i < accounts.len() - 1 {
let current = accounts[i];
let next = accounts[i + 1];
if current.pubkey() == next.pubkey() {
// remove the first duplicate
accounts.remove(i);
// do not advance i, we just removed an element at i
continue;
// [new] Single-pass in-place compaction: `last` is the index of the final slot of the
// deduplicated prefix, `curr` walks over the remaining entries.
let mut last = 0;
let mut curr = 1;

while curr < accounts.len() {
if accounts[curr].pubkey() == accounts[last].pubkey() {
// Same pubkey as the slot being kept: overwrite it so the later entry wins.
accounts[last] = accounts[curr];
} else {
// New pubkey: grow the kept prefix by one slot and copy the entry into it.
// When no duplicate has been seen so far, `last == curr` and this copies onto itself.
last += 1;
accounts[last] = accounts[curr];
}
// [old] advance of the pairwise scan index
i += 1;
// [new] advance of the compaction scan index
curr += 1;
}
// [new] Drop the stale tail left behind after the kept prefix.
accounts.truncate(last + 1);
}
// Number of entries removed as duplicates.
len0 - accounts.len()
}
Expand Down Expand Up @@ -10049,6 +10052,31 @@ pub mod tests {
assert_eq!(test1, expected);
}

/// Checks `sort_and_remove_dups` against a `BTreeMap`-based reference on 1000 random samples.
#[test]
fn test_sort_and_remove_dups_random() {
    use rand::prelude::*;

    // Fixed seed keeps the test deterministic.
    let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(1234);
    let accounts: Vec<_> = (0..1000)
        .map(|_| generate_sample_account_from_storage(rng.gen::<u8>()))
        .collect();

    // Function under test, run on a copy so the original input order stays available.
    let mut deduped = accounts.clone();
    let reported_dups = AccountsDb::sort_and_remove_dups(&mut deduped);

    // Reference result: a BTreeMap keyed by pubkey iterates in sorted key order, and each
    // insert of an already-present key replaces the stored value, so the last entry wins.
    let mut latest_by_pubkey = std::collections::BTreeMap::new();
    let mut expected_dups = 0;
    for account in &accounts {
        let replaced = latest_by_pubkey.insert(*account.pubkey(), *account);
        if replaced.is_some() {
            expected_dups += 1;
        }
    }
    let expected: Vec<_> = latest_by_pubkey.into_values().collect();

    assert_eq!(deduped, expected);
    assert_eq!(reported_dups, expected_dups);
}

/// Reserve ancient storage size is not supported for TieredStorage
#[test]
fn test_create_ancient_accounts_file() {
Expand Down

0 comments on commit ad9bed8

Please sign in to comment.