apacheGH-44084: [C++] Improve merge step in chunked sorting

pitrou · Sep 25, 2024 · 132b89a · 132b89a
1 parent c557fe5
commit 132b89a
Show file tree

Hide file tree

Showing 8 changed files with 491 additions and 195 deletions.
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
@@ -731,6 +731,7 @@ set(ARROW_COMPUTE_SRCS
     compute/light_array_internal.cc
     compute/ordering.cc
     compute/registry.cc
+    compute/kernels/chunked_internal.cc
     compute/kernels/codegen_internal.cc
     compute/kernels/ree_util_internal.cc
     compute/kernels/scalar_cast_boolean.cc

diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc
@@ -28,6 +28,8 @@
 
 namespace arrow::internal {
 
+using ::arrow::util::span;
+
 namespace {
 template <typename T>
 int64_t GetLength(const T& array) {
@@ -42,7 +44,7 @@ int64_t GetLength<std::shared_ptr<RecordBatch>>(
 }
 
 template <typename T>
-inline std::vector<int64_t> MakeChunksOffsets(const std::vector<T>& chunks) {
+inline std::vector<int64_t> MakeChunksOffsets(span<T> chunks) {
   std::vector<int64_t> offsets(chunks.size() + 1);
   int64_t offset = 0;
   std::transform(chunks.begin(), chunks.end(), offsets.begin(),
@@ -112,13 +114,13 @@ void ResolveManyInline(uint32_t num_offsets, const int64_t* signed_offsets,
 }  // namespace
 
 ChunkResolver::ChunkResolver(const ArrayVector& chunks) noexcept
-    : offsets_(MakeChunksOffsets(chunks)), cached_chunk_(0) {}
+    : offsets_(MakeChunksOffsets(span(chunks))), cached_chunk_(0) {}
 
-ChunkResolver::ChunkResolver(const std::vector<const Array*>& chunks) noexcept
+ChunkResolver::ChunkResolver(span<const Array* const> chunks) noexcept
     : offsets_(MakeChunksOffsets(chunks)), cached_chunk_(0) {}
 
 ChunkResolver::ChunkResolver(const RecordBatchVector& batches) noexcept
-    : offsets_(MakeChunksOffsets(batches)), cached_chunk_(0) {}
+    : offsets_(MakeChunksOffsets(span(batches))), cached_chunk_(0) {}
 
 ChunkResolver::ChunkResolver(ChunkResolver&& other) noexcept
     : offsets_(std::move(other.offsets_)),

diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h
@@ -26,6 +26,7 @@
 
 #include "arrow/type_fwd.h"
 #include "arrow/util/macros.h"
+#include "arrow/util/span.h"
 
 namespace arrow::internal {
 
@@ -76,7 +77,7 @@ struct ARROW_EXPORT ChunkResolver {
 
  public:
   explicit ChunkResolver(const ArrayVector& chunks) noexcept;
-  explicit ChunkResolver(const std::vector<const Array*>& chunks) noexcept;
+  explicit ChunkResolver(::arrow::util::span<const Array* const> chunks) noexcept;
   explicit ChunkResolver(const RecordBatchVector& batches) noexcept;
 
   /// \brief Construct a ChunkResolver from a vector of chunks.size() + 1 offsets.

diff --git a/cpp/src/arrow/compute/kernels/chunked_internal.cc b/cpp/src/arrow/compute/kernels/chunked_internal.cc
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/kernels/chunked_internal.h"
+
+#include <algorithm>
+
+#include "arrow/record_batch.h"
+#include "arrow/util/logging.h"
+
+namespace arrow::compute::internal {
+
+using ::arrow::internal::TypedChunkLocation;
+
+// Collect the raw (non-owning) Array pointers held by `arrays`, in order.
+std::vector<const Array*> GetArrayPointers(const ArrayVector& arrays) {
+  std::vector<const Array*> raw_chunks;
+  raw_chunks.reserve(arrays.size());
+  for (const auto& chunk : arrays) {
+    raw_chunks.push_back(chunk.get());
+  }
+  return raw_chunks;
+}
+
+// Return the length of every array in `chunks`, in the same order.
+std::vector<int64_t> ChunkedIndexMapper::GetChunkLengths(
+    util::span<const Array* const> chunks) {
+  std::vector<int64_t> lengths;
+  lengths.reserve(chunks.size());
+  for (const Array* chunk : chunks) {
+    lengths.push_back(chunk->length());
+  }
+  return lengths;
+}
+
+// Return the number of rows of every record batch in `chunks`, in the same order.
+std::vector<int64_t> ChunkedIndexMapper::GetChunkLengths(
+    const RecordBatchVector& chunks) {
+  std::vector<int64_t> row_counts(chunks.size());
+  std::transform(chunks.begin(), chunks.end(), row_counts.begin(),
+                 [](const auto& batch) { return batch->num_rows(); });
+  return row_counts;
+}
+
+// Rewrite the logical indices in [indices_begin_, indices_end_) as physical
+// (chunk_index, index_in_chunk) pairs, reusing the same memory.
+//
+// Returns the [begin, end) range of the rewritten ResolvedChunkIndex values, or
+// NotImplemented if the number of chunks or the length of a chunk is too large
+// for the ResolvedChunkIndex encoding.
+Result<std::pair<ResolvedChunkIndex*, ResolvedChunkIndex*>>
+ChunkedIndexMapper::LogicalToPhysical() {
+  // Both components of a ResolvedChunkIndex have a limited range: bail out
+  // early if a chunk index or an index in chunk might not be representable.
+  constexpr uint64_t kChunkCountLimit = ResolvedChunkIndex::kMaxChunkIndex + 1;
+  constexpr uint64_t kChunkLengthLimit = ResolvedChunkIndex::kMaxIndexInChunk + 1;
+  if (ARROW_PREDICT_FALSE(chunk_lengths_.size() > kChunkCountLimit)) {
+    return Status::NotImplemented("Chunked array has more than ", kChunkCountLimit,
+                                  " chunks");
+  }
+  for (const int64_t length : chunk_lengths_) {
+    if (ARROW_PREDICT_FALSE(static_cast<uint64_t>(length) > kChunkLengthLimit)) {
+      return Status::NotImplemented("Individual chunk in chunked array has more than ",
+                                    kChunkLengthLimit, " elements");
+    }
+  }
+
+  const int64_t total = static_cast<int64_t>(indices_end_ - indices_begin_);
+  // The physical indices are written over the logical ones.
+  ResolvedChunkIndex* physical = reinterpret_cast<ResolvedChunkIndex*>(indices_begin_);
+  DCHECK_EQ(physical + total, reinterpret_cast<ResolvedChunkIndex*>(indices_end_));
+
+  // Work in fixed-size batches: a batch of logical indices is first resolved
+  // into a scratch buffer, and only then are the input slots overwritten.
+  constexpr int64_t kMaxBatchSize = 512;
+  std::array<TypedChunkLocation<uint64_t>, kMaxBatchSize> scratch;
+
+  for (int64_t start = 0; start < total; start += kMaxBatchSize) {
+    const int64_t count = std::min(kMaxBatchSize, total - start);
+    [[maybe_unused]] const bool ok =
+        resolver_.ResolveMany(count, indices_begin_ + start, scratch.data());
+    DCHECK(ok) << "ResolveMany unexpectedly failed (invalid logical index?)";
+    ResolvedChunkIndex* out = physical + start;
+    for (int64_t j = 0; j < count; ++j) {
+      const auto& loc = scratch[j];
+      out[j] = ResolvedChunkIndex{loc.chunk_index, loc.index_in_chunk};
+    }
+  }
+
+  return std::pair{physical, physical + total};
+}
+
+// Convert the physical indices stored in [indices_begin_, indices_end_) back
+// into logical indices, in place.
+Status ChunkedIndexMapper::PhysicalToLogical() {
+  // Logical offset of the first element of each chunk (running sum of lengths).
+  std::vector<int64_t> chunk_offsets;
+  chunk_offsets.reserve(chunk_lengths_.size());
+  int64_t next_offset = 0;
+  for (const int64_t length : chunk_lengths_) {
+    chunk_offsets.push_back(next_offset);
+    next_offset += length;
+  }
+
+  for (uint64_t* slot = indices_begin_; slot < indices_end_; ++slot) {
+    // Read the packed physical index before overwriting its slot.
+    const ResolvedChunkIndex loc = *reinterpret_cast<ResolvedChunkIndex*>(slot);
+    const uint64_t chunk = loc.chunk_index();
+    DCHECK_LT(chunk, chunk_offsets.size());
+    DCHECK_LT(loc.index_in_chunk(), static_cast<uint64_t>(chunk_lengths_[chunk]));
+    *slot = chunk_offsets[chunk] + static_cast<int64_t>(loc.index_in_chunk());
+  }
+
+  return Status::OK();
+}
+
+}  // namespace arrow::compute::internal
diff --git a/cpp/src/arrow/compute/kernels/chunked_internal.h b/cpp/src/arrow/compute/kernels/chunked_internal.h
@@ -20,26 +20,35 @@
 #include <algorithm>
 #include <cstdint>
 #include <memory>
+#include <utility>
 #include <vector>
 
 #include "arrow/array.h"
 #include "arrow/chunk_resolver.h"
 #include "arrow/compute/kernels/codegen_internal.h"
+#include "arrow/util/span.h"
 
-namespace arrow {
-namespace compute {
-namespace internal {
+namespace arrow::compute::internal {
+
+using ::arrow::internal::ChunkResolver;
+using ::arrow::internal::TypedChunkLocation;
 
 // The target chunk in a chunked array.
 struct ResolvedChunk {
   // The target array in chunked array.
   const Array* array;
   // The index in the target array.
-  const int64_t index;
+  int64_t index;
 
   ResolvedChunk(const Array* array, int64_t index) : array(array), index(index) {}
 
- public:
+  friend bool operator==(const ResolvedChunk& left, const ResolvedChunk& right) {
+    return left.array == right.array && left.index == right.index;
+  }
+  friend bool operator!=(const ResolvedChunk& left, const ResolvedChunk& right) {
+    return left.array != right.array || left.index != right.index;
+  }
+
   bool IsNull() const { return array->IsNull(index); }
 
   template <typename ArrowType, typename ViewType = GetViewType<ArrowType>>
@@ -50,34 +59,108 @@ struct ResolvedChunk {
   }
 };
 
+// A (chunk_index, index_in_chunk) pair packed into a single 64-bit word.
+//
+// Packing the pair into 64 bits allows logical uint64_t indices to be replaced
+// in place with physical indices (see ChunkedIndexMapper).
+//
+// Layout: the low kChunkIndexBits bits hold the chunk index, the remaining
+// high bits hold the index inside that chunk.
+struct ResolvedChunkIndex {
+  static constexpr int kChunkIndexBits = 24;
+  static constexpr int KIndexInChunkBits = 64 - kChunkIndexBits;
+
+  // Largest representable value of each component.
+  static constexpr uint64_t kMaxChunkIndex = (1ULL << kChunkIndexBits) - 1;
+  static constexpr uint64_t kMaxIndexInChunk = (1ULL << KIndexInChunkBits) - 1;
+
+  // Default-initialization leaves the packed value indeterminate.
+  ResolvedChunkIndex() = default;
+
+  constexpr uint64_t chunk_index() const { return data_ & kMaxChunkIndex; }
+  constexpr uint64_t index_in_chunk() const { return data_ >> kChunkIndexBits; }
+
+  // Pack the two components. No masking is done here: the caller must ensure
+  // chunk_index <= kMaxChunkIndex and index_in_chunk <= kMaxIndexInChunk.
+  explicit constexpr ResolvedChunkIndex(uint64_t chunk_index, uint64_t index_in_chunk)
+      : data_((index_in_chunk << kChunkIndexBits) | chunk_index) {}
+
+  // Unpack into a TypedChunkLocation, casting both components to IndexType.
+  // Declared const as it only reads the packed value, so that it can also be
+  // used on const objects.
+  template <typename IndexType>
+  explicit operator TypedChunkLocation<IndexType>() const {
+    return {static_cast<IndexType>(chunk_index()),
+            static_cast<IndexType>(index_in_chunk())};
+  }
+
+ private:
+  uint64_t data_;
+};
+
+static_assert(sizeof(uint64_t) == sizeof(ResolvedChunkIndex));
+
 class ChunkedArrayResolver {  // Resolves a logical index into a (chunk array, index) pair
  private:
-  ::arrow::internal::ChunkResolver resolver_;
-  std::vector<const Array*> chunks_;
+  ChunkResolver resolver_;  // maps a logical index to (chunk_index, index_in_chunk)
+  util::span<const Array* const> chunks_;  // chunks being indexed; may view owned_chunks_
+  std::vector<const Array*> owned_chunks_;  // backing storage when the pointers are owned
 
  public:
-  explicit ChunkedArrayResolver(const std::vector<const Array*>& chunks)
+  explicit ChunkedArrayResolver(std::vector<const Array*>&& chunks)  // owning variant
+      : resolver_(chunks), chunks_(chunks), owned_chunks_(std::move(chunks)) {}
+  explicit ChunkedArrayResolver(util::span<const Array* const> chunks)  // non-owning view
       : resolver_(chunks), chunks_(chunks) {}
 
-  ChunkedArrayResolver(ChunkedArrayResolver&& other) = default;
-  ChunkedArrayResolver& operator=(ChunkedArrayResolver&& other) = default;
+  ARROW_DEFAULT_MOVE_AND_ASSIGN(ChunkedArrayResolver);
 
-  ChunkedArrayResolver(const ChunkedArrayResolver& other) = default;
-  ChunkedArrayResolver& operator=(const ChunkedArrayResolver& other) = default;
+  ChunkedArrayResolver(const ChunkedArrayResolver& other)
+      : resolver_(other.resolver_), owned_chunks_(other.owned_chunks_) {
+    // If the chunks are owned, view this object's copy rather than other's storage
+    chunks_ = owned_chunks_.empty() ? other.chunks_ : owned_chunks_;
+  }
+  ChunkedArrayResolver& operator=(const ChunkedArrayResolver& other) {
+    resolver_ = other.resolver_;
+    owned_chunks_ = other.owned_chunks_;
+    chunks_ = owned_chunks_.empty() ? other.chunks_ : owned_chunks_;  // rebind as in copy ctor
+    return *this;
+  }
 
   ResolvedChunk Resolve(int64_t index) const {  // locate the chunk holding logical `index`
     const auto loc = resolver_.Resolve(index);
     return {chunks_[loc.chunk_index], loc.index_in_chunk};
   }
 };
 
-inline std::vector<const Array*> GetArrayPointers(const ArrayVector& arrays) {
-  std::vector<const Array*> pointers(arrays.size());
-  std::transform(arrays.begin(), arrays.end(), pointers.begin(),
-                 [&](const std::shared_ptr<Array>& array) { return array.get(); });
-  return pointers;
-}
+std::vector<const Array*> GetArrayPointers(const ArrayVector& arrays);
+
+// Converts, in place, a caller-provided range of uint64_t indices between
+// their logical (linear) form and their physical (chunked) form.
+class ChunkedIndexMapper {
+ public:
+  // Map indices over a list of arrays given as a span.
+  ChunkedIndexMapper(util::span<const Array* const> chunks, uint64_t* indices_begin,
+                     uint64_t* indices_end)
+      : resolver_(chunks),
+        chunk_lengths_(GetChunkLengths(chunks)),
+        indices_begin_(indices_begin),
+        indices_end_(indices_end) {}
+
+  // Map indices over a list of arrays given as a vector (forwards to the
+  // span-based constructor).
+  ChunkedIndexMapper(const std::vector<const Array*>& chunks, uint64_t* indices_begin,
+                     uint64_t* indices_end)
+      : ChunkedIndexMapper(util::span(chunks), indices_begin, indices_end) {}
+
+  // Map indices over a list of record batches.
+  ChunkedIndexMapper(const RecordBatchVector& chunks, uint64_t* indices_begin,
+                     uint64_t* indices_end)
+      : resolver_(chunks),
+        chunk_lengths_(GetChunkLengths(chunks)),
+        indices_begin_(indices_begin),
+        indices_end_(indices_end) {}
+
+  // Replace the logical uint64_t indices with physical indices. The same
+  // memory area is reused, so the logical indices are unavailable until
+  // PhysicalToLogical() is called.
+  Result<std::pair<ResolvedChunkIndex*, ResolvedChunkIndex*>> LogicalToPhysical();
+
+  // Replace the physical indices with logical ones again, so that the
+  // uint64_t indices can be used as such.
+  Status PhysicalToLogical();
+
+ private:
+  // Per-chunk lengths (array length or record batch row count).
+  static std::vector<int64_t> GetChunkLengths(util::span<const Array* const> chunks);
+  static std::vector<int64_t> GetChunkLengths(const RecordBatchVector& chunks);
+
+  ChunkResolver resolver_;
+  std::vector<int64_t> chunk_lengths_;
+  // Bounds of the index range that is converted in place.
+  uint64_t* indices_begin_;
+  uint64_t* indices_end_;
+};
 
-}  // namespace internal
-}  // namespace compute
-}  // namespace arrow
+}  // namespace arrow::compute::internal
diff --git a/cpp/src/arrow/compute/kernels/vector_rank.cc b/cpp/src/arrow/compute/kernels/vector_rank.cc
@@ -21,6 +21,8 @@
 
 namespace arrow::compute::internal {
 
+using ::arrow::util::span;
+
 namespace {
 
 // ----------------------------------------------------------------------
@@ -237,7 +239,7 @@ class Ranker<ChunkedArray> : public RankerMixin<ChunkedArray, Ranker<ChunkedArra
                          physical_chunks_, order_, null_placement_));
 
     const auto arrays = GetArrayPointers(physical_chunks_);
-    auto value_selector = [resolver = ChunkedArrayResolver(arrays)](int64_t index) {
+    auto value_selector = [resolver = ChunkedArrayResolver(span(arrays))](int64_t index) {
       return resolver.Resolve(index).Value<InType>();
     };
     ARROW_ASSIGN_OR_RAISE(*output_, CreateRankings(ctx_, sorted, null_placement_,