diff --git a/Cargo.lock b/Cargo.lock index 9fb5ea40687..dbc3d933482 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -957,6 +957,19 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "flume" +version = "0.10.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1657b4441c3403d9f7b3409e47575237dac27b1b5726df654a6ecbf92f0f7577" +dependencies = [ + "futures-core", + "futures-sink", + "nanorand", + "pin-project", + "spin", +] + [[package]] name = "form_urlencoded" version = "1.1.0" @@ -1096,8 +1109,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi", + "wasm-bindgen", ] [[package]] @@ -1849,6 +1864,7 @@ dependencies = [ "blocking", "bytesize", "document-features", + "flume", "fs-err", "futures-io", "futures-lite", @@ -1859,6 +1875,7 @@ dependencies = [ "git-url", "itertools", "jwalk", + "num_cpus", "serde", "serde_json", "tempfile", @@ -2224,6 +2241,15 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "nanorand" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3" +dependencies = [ + "getrandom", +] + [[package]] name = "nix" version = "0.21.0" @@ -2427,6 +2453,26 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" +[[package]] +name = "pin-project" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad29a609b6bcd67fee905812e544992d216af9d755757c05ed2d0e15a74c6ecc" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "069bdb1e05adc7a8990dce9cc75370895fbe4e3d58b9b73bf1aee56359344a55" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "pin-project-lite" version = "0.2.9" @@ -2534,9 +2580,9 @@ dependencies = [ [[package]] name = "prodash" -version = "20.1.0" +version = "20.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "477ce81c3e71b6005714157c54797ff4d84b5aa21d21e160fb9f1eeed936c931" +checksum = "762467059887e40727b7cea07161956c9bd0fb6df0ca1225538effdb9f77c80a" dependencies = [ "async-io", "atty", @@ -2944,6 +2990,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "spin" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09" +dependencies = [ + "lock_api", +] + [[package]] name = "static_assertions" version = "1.1.0" diff --git a/Cargo.toml b/Cargo.toml index df224c94507..24d70220467 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -86,7 +86,7 @@ git-repository = { version = "^0.24.0", path = "git-repository", default-feature git-transport-for-configuration-only = { package = "git-transport", optional = true, version = "^0.20.0", path = "git-transport" } clap = { version = "3.2.5", features = ["derive", "cargo"] } -prodash = { version = "20.1.0", optional = true, default-features = false } +prodash = { version = "20.1.1", optional = true, default-features = false } atty = { version = "0.2.14", optional = true, default-features = false } env_logger = { version = "0.9.0", default-features = false } crosstermion = { version = "0.10.1", optional = true, default-features = false } diff --git a/git-features/Cargo.toml b/git-features/Cargo.toml index d221cd3d815..f0374f453b1 100644 --- a/git-features/Cargo.toml +++ b/git-features/Cargo.toml @@ -115,7 +115,7 @@ crc32fast = { version = "1.2.1", optional = true } sha1 = { version = "0.10.0", optional = true } # progress -prodash = { version = "20.1.0", optional = true, default-features = false, features = ["unit-bytes", "unit-human"] } +prodash = { version = "20.1.1", optional = true, default-features = false, features = ["unit-bytes", "unit-human"] } # pipe bytes = { version = "1.0.0", optional = true } diff --git a/gitoxide-core/Cargo.toml b/gitoxide-core/Cargo.toml index 61819fc8aef..60dc7cb8c01 100644 --- a/gitoxide-core/Cargo.toml +++ b/gitoxide-core/Cargo.toml @@ -18,7 +18,7 @@ default = [] ## Discover all git repositories within a directory. Particularly useful with [skim](https://github.com/lotabout/skim). organize = ["git-url", "jwalk"] ## Derive the amount of time invested into a git repository akin to [git-hours](https://github.com/kimmobrunfeldt/git-hours). -estimate-hours = ["itertools", "fs-err"] +estimate-hours = ["itertools", "fs-err", "num_cpus", "flume"] #! ### Mutually Exclusive Networking #! If both are set, _blocking-client_ will take precedence, allowing `--all-features` to be used. @@ -59,8 +59,11 @@ blocking = { version = "1.0.2", optional = true } git-url = { version = "^0.8.0", path = "../git-url", optional = true } jwalk = { version = "0.6.0", optional = true } +# for 'hours' itertools = { version = "0.10.1", optional = true } fs-err = { version = "2.6.0", optional = true } +num_cpus = { version = "1.13.1", optional = true } +flume = { version = "0.10.14", optional = true } document-features = { version = "0.2.0", optional = true } diff --git a/gitoxide-core/src/hours.rs b/gitoxide-core/src/hours.rs index 35035ffbfd7..d466f7f97bb 100644 --- a/gitoxide-core/src/hours.rs +++ b/gitoxide-core/src/hours.rs @@ -1,4 +1,6 @@ use std::collections::BTreeSet; +use std::convert::Infallible; +use std::sync::atomic::Ordering; use std::{ collections::{hash_map::Entry, HashMap}, io, @@ -9,7 +11,7 @@ use std::{ use anyhow::{anyhow, bail}; use git_repository as git; use git_repository::bstr::BStr; -use git_repository::{actor, bstr::ByteSlice, interrupt, objs, prelude::*, progress, Progress}; +use git_repository::{actor, bstr::ByteSlice, interrupt, prelude::*, progress, Progress}; use itertools::Itertools; /// Additional configuration for the hours estimation functionality. @@ -40,7 +42,7 @@ pub fn estimate( Context { show_pii, ignore_bots, - stats: _, + stats, omit_unify_identities, mut out, }: Context, @@ -53,18 +55,25 @@ where let commit_id = repo.rev_parse_single(rev_spec)?.detach(); let mut string_heap = BTreeSet::<&'static [u8]>::new(); - let (all_commits, is_shallow) = { - let mut progress = progress.add_child("Traverse commit graph"); + let (commit_authors, is_shallow) = { + let stat_progress = stats.then(|| progress.add_child("extract stats")).map(|mut p| { + p.init(None, progress::count("commits")); + p + }); + let stat_counter = stat_progress.as_ref().and_then(|p| p.counter()); + + let mut progress = progress.add_child("traverse commit graph"); + progress.init(None, progress::count("commits")); + std::thread::scope(|scope| -> anyhow::Result<(Vec>, bool)> { let start = Instant::now(); - progress.init(None, progress::count("commits")); let (tx, rx) = std::sync::mpsc::channel::>(); let mailmap = repo.open_mailmap(); - let handle = scope.spawn(move || -> anyhow::Result>> { + let commit_thread = scope.spawn(move || -> anyhow::Result>> { let mut out = Vec::new(); for commit_data in rx { - if let Some(author) = objs::CommitRefIter::from_bytes(&commit_data) + if let Some(author) = git::objs::CommitRefIter::from_bytes(&commit_data) .author() .map(|author| mailmap.resolve_cow(author.trim())) .ok() @@ -101,12 +110,89 @@ where Ok(out) }); + let (tx_tree_id, stat_threads) = stats + .then(|| { + let num_threads = num_cpus::get().saturating_sub(1 /*main thread*/).max(1); + let (tx, rx) = flume::unbounded::<(u32, Option, git::hash::ObjectId)>(); + let stat_workers = (0..num_threads) + .map(|_| { + scope.spawn({ + let counter = stat_counter.clone(); + let mut repo = repo.clone(); + repo.object_cache_size_if_unset(4 * 1024 * 1024); + let rx = rx.clone(); + move || -> Result<_, git::object::tree::diff::Error> { + let mut out = Vec::new(); + for (commit_idx, parent_commit, commit) in rx { + if let Some(c) = counter.as_ref() { + c.fetch_add(1, Ordering::SeqCst); + } + let mut stat = Stats::default(); + let from = match parent_commit { + Some(id) => { + match repo.find_object(id).ok().and_then(|c| c.peel_to_tree().ok()) { + Some(tree) => tree, + None => continue, + } + } + None => repo + .find_object(git::hash::ObjectId::empty_tree(repo.object_hash())) + .expect("always present") + .into_tree(), + }; + let to = match repo.find_object(commit).ok().and_then(|c| c.peel_to_tree().ok()) + { + Some(c) => c, + None => continue, + }; + from.changes().for_each_to_obtain_tree(&to, |change| { + use git::object::tree::diff::change::Event::*; + match change.event { + Addition { entry_mode, .. } => { + if entry_mode.is_no_tree() { + stat.added += 1 + } + } + Deletion { entry_mode, .. } => { + if entry_mode.is_no_tree() { + stat.removed += 1 + } + } + Modification { entry_mode, .. } => { + if entry_mode.is_no_tree() { + stat.modified += 1; + } + } + } + Ok::<_, Infallible>(Default::default()) + })?; + out.push((commit_idx, stat)); + } + Ok(out) + } + }) + }) + .collect::>(); + (Some(tx), stat_workers) + }) + .unwrap_or_else(Default::default); + + let mut commit_idx = 0_u32; let commit_iter = interrupt::Iter::new( commit_id.ancestors(|oid, buf| { progress.inc(); repo.objects.find(oid, buf).map(|o| { tx.send(o.data.to_owned()).ok(); - objs::CommitRefIter::from_bytes(o.data) + if let Some((tx_tree, first_parent, commit)) = tx_tree_id.as_ref().and_then(|tx| { + git::objs::CommitRefIter::from_bytes(o.data) + .parent_ids() + .next() + .map(|first_parent| (tx, Some(first_parent), oid.to_owned())) + }) { + tx_tree.send((commit_idx, first_parent, commit)).ok(); + } + commit_idx += 1; + git::objs::CommitRefIter::from_bytes(o.data) }) }), || anyhow!("Cancelled by user"), @@ -123,23 +209,38 @@ where }; } drop(tx); + drop(tx_tree_id); progress.show_throughput(start); - Ok((handle.join().expect("no panic")?, is_shallow)) + + let _stats_by_commit_idx = match stat_progress { + Some(mut progress) => { + progress.init(Some(commit_idx as usize), progress::count("commits")); + let mut stats = Vec::new(); + for handle in stat_threads { + stats.extend(handle.join().expect("no panic")?); + } + progress.show_throughput(start); + stats + } + None => Vec::new(), + }; + + Ok((commit_thread.join().expect("no panic")?, is_shallow)) })? }; - if all_commits.is_empty() { + if commit_authors.is_empty() { bail!("No commits to process"); } let start = Instant::now(); - let mut current_email = &all_commits[0].email; + let mut current_email = &commit_authors[0].email; let mut slice_start = 0; let mut results_by_hours = Vec::new(); let mut ignored_bot_commits = 0_u32; - for (idx, elm) in all_commits.iter().enumerate() { + for (idx, elm) in commit_authors.iter().enumerate() { if elm.email != *current_email { - let estimate = estimate_hours(&all_commits[slice_start..idx]); + let estimate = estimate_hours(&commit_authors[slice_start..idx]); slice_start = idx; current_email = &elm.email; if ignore_bots && estimate.name.contains_str(b"[bot]") { @@ -149,7 +250,7 @@ where results_by_hours.push(estimate); } } - if let Some(commits) = all_commits.get(slice_start..) { + if let Some(commits) = commit_authors.get(slice_start..) { results_by_hours.push(estimate_hours(commits)); } @@ -167,9 +268,9 @@ where let elapsed = start.elapsed(); progress.done(format!( "Extracted and organized data from {} commits in {:?} ({:0.0} commits/s)", - all_commits.len(), + commit_authors.len(), elapsed, - all_commits.len() as f32 / elapsed.as_secs_f32() + commit_authors.len() as f32 / elapsed.as_secs_f32() )); let num_unique_authors = results_by_hours.len(); @@ -207,7 +308,7 @@ where } assert_eq!( total_commits, - all_commits.len() as u32 - ignored_bot_commits, + commit_authors.len() as u32 - ignored_bot_commits, "need to get all commits" ); Ok(()) @@ -328,3 +429,14 @@ struct WorkByEmail { hours: f32, num_commits: u32, } + +/// Statistics for a particular commit. +#[derive(Debug, Default)] +struct Stats { + /// amount of added files + added: usize, + /// amount of removed files + removed: usize, + /// amount of modified files + modified: usize, +}