Fix fragment corruption in the GlobalOrderWriter #4383

Merged · 2 commits · Sep 27, 2023
84 changes: 84 additions & 0 deletions test/src/unit-cppapi-max-fragment-size.cc
@@ -32,6 +32,7 @@

#include <test/support/tdb_catch.h>
#include "test/support/src/helpers.h"
#include "tiledb/common/scoped_executor.h"
#include "tiledb/common/stdx_string.h"
#include "tiledb/sm/c_api/tiledb_struct_def.h"
#include "tiledb/sm/cpp_api/tiledb"
@@ -417,3 +418,86 @@ TEST_CASE_METHOD(
// Validate the fragment domains are now disjoint.
validate_disjoint_domains();
}

// This test exists to show the lack of a bug in the GlobalOrderWriter when
// using the maximum fragment size setting. Previously, we could get into a
// situation where a write starts and the currently active fragment can't fit
// any more tiles. Before the changes in this PR we ended up in a convoluted
// code path that eventually led us to writing the wrong last_tile_cell_num
// value in the FragmentMetadata stored on disk. This issue is then not
// detected until a read against the last tile of the fragment detects a
// mismatch in the expected size when Tile::load_chunk_data is called.
//
// The underlying bug had two specific contributing factors. First, calling
// std::vector::operator[] with an out-of-bounds index is undefined behavior.
// In our case, we ended up calling dim_tiles[-1].cell_num() which
// returned a non-obvious garbage value rather than segfaulting or returning
// some other obviously wrong value.
//
// The second part of this bug is how we get there. The GlobalOrderWriter has a
// mode for writing fragments up to a certain size. When we resumed a write
// with a partially filled fragment on disk, we did not check whether the first
// tile would fit in the existing fragment. This failure to check led us to
// attempt to write zero tiles to the existing fragment, which is how the
// bad call to dim_tiles[-1] happened. The fix for this part of the bug is
// simply to call GlobalOrderWriter::start_new_fragment so the current fragment
// is flushed and committed and the write can continue as normal.
//
// If you're looking at this thinking this should be in a regression test, you
// would be correct, except for the fact that the regression tests are only
// linked against the shared library and not TILEDB_CORE_OBJECTS. The issue
// here is that `Query::set_fragment_size` is not wrapped by the C API, so we
// have to link against TILEDB_CORE_OBJECTS.
TEST_CASE(
"Global Order Writer Resume Writes Bug is Fixed",
"[global-order-writer][bug][sc34072]") {
std::string array_name = "cpp_max_fragment_size_bug";
Context ctx;

auto cleanup = [&]() {
auto obj = Object::object(ctx, array_name);
if (obj.type() == Object::Type::Array) {
Object::remove(ctx, array_name);
}
};

// Remove any existing arrays.
cleanup();

// Remove the array at the end of this test.
ScopedExecutor deferred(cleanup);

// Create a sparse array (dense arrays are unaffected)
auto dim = Dimension::create<uint64_t>(ctx, "dim", {{0, UINT64_MAX - 1}});
Domain domain(ctx);
domain.add_dimension(dim);

ArraySchema schema(ctx, TILEDB_SPARSE);
schema.set_order({{TILEDB_ROW_MAJOR, TILEDB_ROW_MAJOR}});
schema.set_domain(domain);
schema.set_capacity(1024 * 1024);

Array::create(array_name, schema);

std::vector<uint64_t> data(1024 * 1024);

Array array(ctx, array_name, TILEDB_WRITE);
Query query(ctx, array);

// Set our max fragment size to force fragment writes
query.ptr()->query_->set_fragment_size(1080000);

query.set_layout(TILEDB_GLOBAL_ORDER).set_data_buffer("dim", data);

std::iota(data.begin(), data.end(), 0);
REQUIRE(query.submit() == Query::Status::COMPLETE);

std::iota(data.begin(), data.end(), 1024 * 1024);
REQUIRE(query.submit() == Query::Status::COMPLETE);

// Consolidate without a max fragment size, showing that we can read the
// entire array.
REQUIRE_NOTHROW(Array::consolidate(ctx, array_name));

array.close();
}
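
For context on the first contributing factor described in the test comment above, the following is a minimal standalone sketch (not TileDB code; the vector contents and names are made up) of how an index of -1 silently becomes a huge unsigned subscript: operator[] does no bounds check and is undefined behavior, while at() fails loudly.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

int main() {
  std::vector<uint64_t> dim_tiles{10, 20, 30};

  // end_tile_idx == 0 models the "zero tiles to write" case from the bug.
  std::size_t end_tile_idx = 0;

  // end_tile_idx - 1 wraps around to SIZE_MAX. operator[] performs no bounds
  // check, so this is undefined behavior; in the bug it happened to return a
  // plausible-looking garbage value instead of crashing.
  // uint64_t garbage = dim_tiles[end_tile_idx - 1];  // UB, do not do this

  // at() performs a bounds check and fails loudly instead.
  try {
    uint64_t value = dim_tiles.at(end_tile_idx - 1);
    std::cout << "unexpected: " << value << "\n";
  } catch (const std::out_of_range& e) {
    std::cout << "out_of_range caught: " << e.what() << "\n";
  }
  return 0;
}
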
8 changes: 8 additions & 0 deletions tiledb/sm/query/writers/global_order_writer.cc
@@ -801,6 +801,14 @@ Status GlobalOrderWriter::global_write() {
// Compute the number of tiles that will fit in this fragment.
auto num = num_tiles_to_write(idx, tile_num, tiles);

// If we're resuming a fragment write and the first tile doesn't fit into
// the previous fragment, we need to start a new fragment and recalculate
// the number of tiles to write.
if (current_fragment_size_ > 0 && num == 0) {
RETURN_CANCEL_OR_ERROR(start_new_fragment());
num = num_tiles_to_write(idx, tile_num, tiles);
}

// Set new number of tiles in the fragment metadata
auto new_num_tiles = frag_meta->tile_index_base() + num;
throw_if_not_ok(frag_meta->set_num_tiles(new_num_tiles));
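
As a rough illustration of the control flow added in the hunk above, here is a minimal sketch under stated assumptions: fragment_bytes_used, tile_bytes, num_tiles_that_fit, and flush_current_fragment are hypothetical stand-ins for the writer's real state and helpers (num_tiles_to_write and start_new_fragment in the actual code), not the TileDB internals themselves.

#include <cstdint>
#include <vector>

// Hypothetical stand-ins for the writer's real state and helpers.
struct ResumeSketch {
  uint64_t max_fragment_size;    // configured via set_fragment_size()
  uint64_t fragment_bytes_used;  // bytes already written to the open fragment

  // How many of the pending tiles still fit in the open fragment.
  uint64_t num_tiles_that_fit(const std::vector<uint64_t>& tile_bytes) const {
    uint64_t budget = max_fragment_size - fragment_bytes_used;
    uint64_t num = 0;
    for (uint64_t size : tile_bytes) {
      if (size > budget) {
        break;
      }
      budget -= size;
      ++num;
    }
    return num;
  }

  void flush_current_fragment() {
    // Stand-in for start_new_fragment(): commit the open fragment and
    // reset the byte counter for a fresh one.
    fragment_bytes_used = 0;
  }

  // Mirrors the fix: when resuming into a partially filled fragment and not
  // even the first tile fits, roll over to a new fragment first and then
  // recompute how many tiles to write.
  uint64_t tiles_to_write(const std::vector<uint64_t>& tile_bytes) {
    uint64_t num = num_tiles_that_fit(tile_bytes);
    if (fragment_bytes_used > 0 && num == 0) {
      flush_current_fragment();
      num = num_tiles_that_fit(tile_bytes);
    }
    return num;
  }
};
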
2 changes: 1 addition & 1 deletion tiledb/sm/query/writers/writer_base.cc
@@ -714,7 +714,7 @@ void WriterBase::set_coords_metadata(
// Set last tile cell number
auto dim_0{array_schema_.dimension_ptr(0)};
const auto& dim_tiles = tiles.find(dim_0->name())->second;
auto cell_num = dim_tiles[end_tile_idx - 1].cell_num();
auto cell_num = dim_tiles.at(end_tile_idx - 1).cell_num();
meta->set_last_tile_cell_num(cell_num);
}
