diff --git a/halo2_proofs/src/arithmetic.rs b/halo2_proofs/src/arithmetic.rs index a53b541b..d8c7fee6 100644 --- a/halo2_proofs/src/arithmetic.rs +++ b/halo2_proofs/src/arithmetic.rs @@ -381,23 +381,54 @@ where q } -/// This simple utility function will parallelize an operation that is to be +/// This utility function will parallelize an operation that is to be /// performed over a mutable slice. pub fn parallelize(v: &mut [T], f: F) { - let n = v.len(); + // Algorithm rationale: + // + // Using the stdlib `chunks_mut` will lead to severe load imbalance. + // From https://github.com/rust-lang/rust/blob/e94bda3/library/core/src/slice/iter.rs#L1607-L1637 + // if the division is not exact, the last chunk will be the remainder. + // + // Dividing 40 items on 12 threads will lead to a chunk size of 40/12 = 3. + // There will be 13 chunks of size 3 and 1 chunk of size 1 distributed across 12 threads. + // This leads to 1 thread working on 6 iterations, 1 on 4 iterations and 10 on 3 iterations, + // a load imbalance of 2x. + // + // Instead, we can divide the work into chunks of size + // 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3 = 4*4 + 3*8 = 40 + // + // This would lead to a 6/4 = 1.5x speedup compared to naive `chunks_mut`. + // + // See also OpenMP spec (page 60) + // http://www.openmp.org/mp-documents/openmp-4.5.pdf + // "When no chunk_size is specified, the iteration space is divided into chunks + // that are approximately equal in size, and at most one chunk is distributed to + // each thread. The size of the chunks is unspecified in this case." 
+ // This implies that chunk sizes differ by at most 1. + + let f = &f; + let total_iters = v.len(); let num_threads = multicore::current_num_threads(); - let mut chunk = (n as usize) / num_threads; - if chunk < num_threads { - chunk = 1; - } + let base_chunk_size = total_iters / num_threads; + let cutoff_chunk_id = total_iters % num_threads; + let split_pos = cutoff_chunk_id * (base_chunk_size + 1); + let (v_hi, v_lo) = v.split_at_mut(split_pos); multicore::scope(|scope| { - for (chunk_num, v) in v.chunks_mut(chunk).enumerate() { - let f = f.clone(); - scope.spawn(move |_| { - let start = chunk_num * chunk; - f(v, start); - }); + // Skip special-case: the number of iterations is cleanly divided by the number of threads. + if cutoff_chunk_id != 0 { + for (chunk_id, chunk) in v_hi.chunks_exact_mut(base_chunk_size + 1).enumerate() { + let offset = chunk_id * (base_chunk_size + 1); + scope.spawn(move |_| f(chunk, offset)); + } + } + // Skip special-case: fewer iterations than the number of threads. + if base_chunk_size != 0 { + for (chunk_id, chunk) in v_lo.chunks_exact_mut(base_chunk_size).enumerate() { + let offset = split_pos + (chunk_id * base_chunk_size); + scope.spawn(move |_| f(chunk, offset)); + } } }); }