diff --git a/test/src/unit-cppapi-vfs.cc b/test/src/unit-cppapi-vfs.cc index e4f4d40437f..52bf25ac41f 100644 --- a/test/src/unit-cppapi-vfs.cc +++ b/test/src/unit-cppapi-vfs.cc @@ -32,7 +32,9 @@ #include #include +#include "test/support/src/vfs_helpers.h" #include "tiledb/sm/cpp_api/tiledb" +#include "tiledb/sm/cpp_api/vfs_experimental.h" #ifdef _WIN32 #include "tiledb/sm/filesystem/path_win.h" @@ -500,3 +502,187 @@ TEST_CASE( } } } + +TEST_CASE("CPP API: VFS ls_recursive filter", "[cppapi][vfs][ls-recursive]") { + using namespace tiledb::test; + S3Test s3_test({10, 100, 0}); + if (!s3_test.is_supported()) { + return; + } + auto expected_results = s3_test.expected_results(); + + vfs_config cfg; + tiledb::Context ctx(tiledb::Config(&cfg.config)); + tiledb::VFS vfs(ctx); + + tiledb::VFSExperimental::LsObjects ls_objects; + // Predicate filter to apply to ls_recursive. + tiledb::VFSExperimental::LsInclude include; + // Callback to populate ls_objects vector using a filter. + tiledb::VFSExperimental::LsCallback cb = [&](std::string_view path, + uint64_t size) { + if (include(path, size)) { + ls_objects.emplace_back(path, size); + } + return true; + }; + + SECTION("Default filter (include all)") { + include = [](std::string_view, uint64_t) { return true; }; + } + SECTION("Custom filter (include none)") { + include = [](std::string_view, uint64_t) { return false; }; + } + + bool include_result = true; + SECTION("Custom filter (include half)") { + include = [&include_result](std::string_view, uint64_t) { + include_result = !include_result; + return include_result; + }; + } + + SECTION("Custom filter (search for test_file_50)") { + include = [](std::string_view object_name, uint64_t) { + return object_name.find("test_file_50") != std::string::npos; + }; + } + SECTION("Custom filter (search for test_file_1*)") { + include = [](std::string_view object_name, uint64_t) { + return object_name.find("test_file_1") != std::string::npos; + }; + } + SECTION("Custom filter (reject 
files over 50 bytes)") { + include = [](std::string_view, uint64_t size) { return size <= 50; }; + } + + // Test collecting results with LsInclude predicate. + auto results = tiledb::VFSExperimental::ls_recursive_filter( + ctx, vfs, s3_test.temp_dir_.to_string(), include); + std::erase_if(expected_results, [&include](const auto& object) { + return !include(object.first, object.second); + }); + CHECK(results.size() == expected_results.size()); + CHECK(expected_results == results); + + // Test collecting results with LsCallback, writing data into ls_objects. + tiledb::VFSExperimental::ls_recursive( + ctx, vfs, s3_test.temp_dir_.to_string(), cb); + CHECK(ls_objects.size() == expected_results.size()); + CHECK(expected_results == ls_objects); +} + +TEST_CASE("CPP API: Callback stops traversal", "[cppapi][vfs][ls-recursive]") { + using namespace tiledb::test; + S3Test s3_test({10, 50, 15}); + if (!s3_test.is_supported()) { + return; + } + auto expected_results = s3_test.expected_results(); + + vfs_config cfg; + tiledb::Context ctx(tiledb::Config(&cfg.config)); + tiledb::VFS vfs(ctx); + + tiledb::VFSExperimental::LsObjects ls_objects; + size_t cb_count = GENERATE(1, 10, 11, 50); + auto cb = [&](std::string_view path, uint64_t size) { + // Always emplace to check the callback is not invoked more than `cb_count`. + ls_objects.emplace_back(path, size); + // Signal to stop traversal when we have seen `cb_count` objects. 
+ if (ls_objects.size() == cb_count) { + return false; + } + return true; + }; + tiledb::VFSExperimental::ls_recursive( + ctx, vfs, s3_test.temp_dir_.to_string(), cb); + expected_results.resize(cb_count); + CHECK(ls_objects.size() == cb_count); + CHECK(ls_objects == expected_results); +} + +TEST_CASE("CPP API: Throwing filter", "[cppapi][vfs][ls-recursive]") { + using namespace tiledb::test; + S3Test s3_test({0}); + if (!s3_test.is_supported()) { + return; + } + + vfs_config cfg; + tiledb::Context ctx(tiledb::Config(&cfg.config)); + tiledb::VFS vfs(ctx); + + tiledb::VFSExperimental::LsInclude filter = [](std::string_view, + uint64_t) -> bool { + throw std::runtime_error("Throwing filter"); + }; + auto path = s3_test.temp_dir_.to_string(); + + // If the test directory is empty the filter should not throw. + SECTION("Throwing filter with 0 objects should not throw") { + CHECK_NOTHROW( + tiledb::VFSExperimental::ls_recursive_filter(ctx, vfs, path, filter)); + CHECK_NOTHROW( + tiledb::VFSExperimental::ls_recursive(ctx, vfs, path, filter)); + } + SECTION("Throwing filter with N objects should throw") { + vfs.touch(s3_test.temp_dir_.join_path("test_file").to_string()); + CHECK_THROWS_AS( + tiledb::VFSExperimental::ls_recursive_filter(ctx, vfs, path, filter), + std::runtime_error); + CHECK_THROWS_WITH( + tiledb::VFSExperimental::ls_recursive_filter(ctx, vfs, path, filter), + Catch::Matchers::ContainsSubstring("Throwing filter")); + CHECK_THROWS_AS( + tiledb::VFSExperimental::ls_recursive(ctx, vfs, path, filter), + std::runtime_error); + CHECK_THROWS_WITH( + tiledb::VFSExperimental::ls_recursive(ctx, vfs, path, filter), + Catch::Matchers::ContainsSubstring("Throwing filter")); + } +} + +TEST_CASE( + "CPP API: CallbackWrapperCPP construction validation", + "[ls-recursive][callback][wrapper]") { + using tiledb::sm::CallbackWrapperCPP; + tiledb::VFSExperimental::LsObjects data; + auto cb = [&](std::string_view, uint64_t) -> bool { return true; }; + SECTION("Null callback") { 
+ CHECK_THROWS(CallbackWrapperCPP(nullptr)); + } + SECTION("Valid callback") { + CHECK_NOTHROW(CallbackWrapperCPP(cb)); + } +} + +TEST_CASE( + "CPP API: CallbackWrapperCPP operator() validation", + "[ls-recursive][callback][wrapper]") { + tiledb::VFSExperimental::LsObjects data; + auto cb = [&](std::string_view path, uint64_t object_size) -> bool { + if (object_size > 100) { + // Throw if object size is greater than 100 bytes. + throw std::runtime_error("Throwing callback"); + } else if (!path.ends_with(".txt")) { + // Reject non-txt files. + return false; + } + data.emplace_back(path, object_size); + return true; + }; + tiledb::sm::CallbackWrapperCPP wrapper(cb); + + SECTION("Callback return true accepts object") { + CHECK(wrapper("file.txt", 10) == true); + CHECK(data.size() == 1); + } + SECTION("Callback return false rejects object") { + CHECK(wrapper("some/dir/", 0) == false); + CHECK(data.empty()); + } + SECTION("Callback exception is propagated") { + CHECK_THROWS_WITH(wrapper("path", 101) == 0, "Throwing callback"); + } +} diff --git a/tiledb/api/c_api/vfs/test/unit_capi_vfs.cc b/tiledb/api/c_api/vfs/test/unit_capi_vfs.cc index 985406fb919..1a2984ebda5 100644 --- a/tiledb/api/c_api/vfs/test/unit_capi_vfs.cc +++ b/tiledb/api/c_api/vfs/test/unit_capi_vfs.cc @@ -730,7 +730,7 @@ TEST_CASE( } TEST_CASE( - "C API: CallbackWrapper operator() validation", + "C API: CallbackWrapperCAPI operator() validation", "[ls-recursive][callback][wrapper]") { tiledb::sm::LsObjects data; auto cb = [](const char* path, @@ -748,7 +748,7 @@ TEST_CASE( ls_data->push_back({{path, path_len}, object_size}); return 1; }; - tiledb::sm::CallbackWrapper wrapper(cb, &data); + tiledb::sm::CallbackWrapperCAPI wrapper(cb, &data); SECTION("Callback return 1 signals to continue traversal") { CHECK(wrapper("file.txt", 10) == 1); @@ -763,21 +763,21 @@ TEST_CASE( } TEST_CASE( - "C API: CallbackWrapper construction validation", + "C API: CallbackWrapperCAPI construction validation", 
"[ls-recursive][callback][wrapper]") { - using tiledb::sm::CallbackWrapper; + using tiledb::sm::CallbackWrapperCAPI; tiledb::sm::LsObjects data; auto cb = [](const char*, size_t, uint64_t, void*) -> int32_t { return 1; }; SECTION("Null callback") { - CHECK_THROWS(CallbackWrapper(nullptr, &data)); + CHECK_THROWS(CallbackWrapperCAPI(nullptr, &data)); } SECTION("Null data") { - CHECK_THROWS(CallbackWrapper(cb, nullptr)); + CHECK_THROWS(CallbackWrapperCAPI(cb, nullptr)); } SECTION("Null callback and data") { - CHECK_THROWS(CallbackWrapper(nullptr, nullptr)); + CHECK_THROWS(CallbackWrapperCAPI(nullptr, nullptr)); } SECTION("Valid callback and data") { - CHECK_NOTHROW(CallbackWrapper(cb, &data)); + CHECK_NOTHROW(CallbackWrapperCAPI(cb, &data)); } } diff --git a/tiledb/api/c_api/vfs/vfs_api_internal.h b/tiledb/api/c_api/vfs/vfs_api_internal.h index fb8dd98f651..b590d755ec8 100644 --- a/tiledb/api/c_api/vfs/vfs_api_internal.h +++ b/tiledb/api/c_api/vfs/vfs_api_internal.h @@ -151,7 +151,7 @@ struct tiledb_vfs_handle_t const tiledb::sm::URI& parent, tiledb_ls_callback_t cb, void* data) const { - tiledb::sm::CallbackWrapper wrapper(cb, data); + tiledb::sm::CallbackWrapperCAPI wrapper(cb, data); vfs_.ls_recursive(parent, wrapper); } }; diff --git a/tiledb/sm/cpp_api/vfs_experimental.h b/tiledb/sm/cpp_api/vfs_experimental.h new file mode 100644 index 00000000000..bb51b6e93a4 --- /dev/null +++ b/tiledb/sm/cpp_api/vfs_experimental.h @@ -0,0 +1,205 @@ +/** + * @file vfs_experimental.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2023 TileDB, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file declares the experimental C++ API for VFS. + */ + +#ifndef TILEDB_VFS_EXPERIMENTAL_H +#define TILEDB_VFS_EXPERIMENTAL_H + +#include +#include +#include +#include "context.h" +#include "tiledb_experimental.h" +#include "vfs.h" + +namespace tiledb { +class VFSExperimental { + public: + /* ********************************* */ + /* TYPE DEFINITIONS */ + /* ********************************* */ + + /** + * Typedef for ls callback function used to collect results from ls_recursive. + * + * If the callback returns True, the walk will continue. If False, the walk + * will stop. If an error is thrown, the walk will stop and the error will be + * propagated to the caller using std::throw_with_nested. + * + * @param path The path of a visited object for the relative filesystem. + * @param object_size The size of the object at the current path. 
+ * @return True if the walk should continue, else false. + */ + using LsCallback = std::function; + + /** + * Typedef for ls inclusion predicate function used to check if a single + * result should be included in the final results returned from ls_recursive. + * + * If the predicate returns True, the result will be included. If False, the + * result will not be included. If an error is thrown, the walk will stop and + * the error will be propagated. + * + * @param path The path of a visited object for the relative filesystem. + * @param object_size The size of the object at the current path. + * @return True if the result should be included, else false. + */ + using LsInclude = std::function; + + /** + * Default typedef for objects collected by recursive ls, storing a vector of + * pairs for each object path and size. This can be overridden by the client + * to store results into a custom data structure using a custom callback. + * @sa LsCallback + */ + using LsObjects = std::vector>; + + /* ********************************* */ + /* PUBLIC STATIC METHODS */ + /* ********************************* */ + + /** + * Recursively lists objects at the input URI, invoking the provided callback + * on each entry gathered. The callback receives the path and size of each + * object and is responsible for storing the results it needs, e.g. in + * a captured vector. If the callback returns True, the walk will continue. + * If False, the walk will stop. If an error is thrown, the walk will stop and + * the error will be propagated to the caller using std::throw_with_nested. + * + * Currently only S3 is supported, and the `uri` must be a valid S3 URI. + * + * @code{.cpp} + * VFSExperimental::LsObjects ls_objects; + * VFSExperimental::LsCallback cb = [&](std::string_view path, + * uint64_t size) { + * ls_objects.emplace_back(path, size); + * return true; // Continue traversal to next entry. 
+ * }; + * + * VFSExperimental::ls_recursive(ctx, vfs, "s3://bucket/foo", cb); + * @endcode + * + * @param ctx The TileDB context. + * @param vfs The VFS instance to use. + * @param uri The base URI to list results recursively. + * @param cb The callback to invoke on each entry. + */ + static void ls_recursive( + const Context& ctx, + const VFS& vfs, + const std::string& uri, + LsCallback cb) { + tiledb::sm::CallbackWrapperCPP wrapper(cb); + ctx.handle_error(tiledb_vfs_ls_recursive( + ctx.ptr().get(), + vfs.ptr().get(), + uri.c_str(), + ls_callback_wrapper, + &wrapper)); + } + + /** + * Recursively lists objects at the input URI, invoking the provided predicate + * on each entry gathered. The predicate returns true if the entry should + * be included in the results and false otherwise. If no inclusion predicate + * is provided, all results are returned. + * + * Currently only S3 is supported, and the `uri` must be a valid S3 URI. + * + * @code{.cpp} + * VFSExperimental::LsInclude predicate = [](std::string_view path, + * uint64_t object_size) { + * return path.find(".txt") != std::string::npos; + * }; + * // Include only paths containing '.txt' using a custom predicate. + * auto ret = VFSExperimental::ls_recursive_filter( + * ctx, vfs, "s3://bucket/foo", predicate); + * + * // Optionally omit the predicate to include all paths collected. + * auto all_paths = VFSExperimental::ls_recursive_filter( + * ctx, vfs, "s3://bucket/foo"); + * @endcode + * + * @param ctx The TileDB context. + * @param vfs The VFS instance to use. + * @param uri The base URI to list results recursively. + * @param include Predicate function to check if a result should be included. + * @return Vector of pairs for each object path and size. 
+ */ + static LsObjects ls_recursive_filter( + const Context& ctx, + const VFS& vfs, + const std::string& uri, + std::optional include = std::nullopt) { + LsObjects ls_objects; + if (include.has_value()) { + auto include_cb = include.value(); + ls_recursive(ctx, vfs, uri, [&](std::string_view path, uint64_t size) { + if (include_cb(path, size)) { + ls_objects.emplace_back(path, size); + } + return true; + }); + } else { + ls_recursive(ctx, vfs, uri, [&](std::string_view path, uint64_t size) { + ls_objects.emplace_back(path, size); + return true; + }); + } + return ls_objects; + } + + private: + /* ********************************* */ + /* PRIVATE STATIC METHODS */ + /* ********************************* */ + + /** + * Callback function for invoking the C++ ls_recursive callback via C API. + * + * @param path The path of a visited object for the relative filesystem. + * @param path_len The length of the path. + * @param object_size The size of the object at the current path. + * @param data Pointer to the CallbackWrapperCPP wrapping the C++ callback. + * @return 1 if the callback should continue to the next object, or 0 to stop + * traversal. + * @sa tiledb_ls_callback_t + */ + static int32_t ls_callback_wrapper( + const char* path, size_t path_len, uint64_t object_size, void* data) { + tiledb::sm::CallbackWrapperCPP* cb = + static_cast(data); + return (*cb)({path, path_len}, object_size); + } +}; +} // namespace tiledb + +#endif // TILEDB_VFS_EXPERIMENTAL_H \ No newline at end of file diff --git a/tiledb/sm/filesystem/ls_scanner.h b/tiledb/sm/filesystem/ls_scanner.h index e357e76d7c6..4f925c55aa0 100644 --- a/tiledb/sm/filesystem/ls_scanner.h +++ b/tiledb/sm/filesystem/ls_scanner.h @@ -90,7 +90,7 @@ using DirectoryFilter = std::function; * @return 1 if the callback should continue to the next object, 0 to stop * traversal, or -1 if an error occurred. 
*/ -typedef std::function LsCallback; +using LsCallback = std::function; /** Type defintion for objects returned from ls_recursive. */ using LsObjects = std::vector>; @@ -267,13 +267,13 @@ class LsScanner { /** * Wrapper for the C API callback function to be passed to the C++ API. */ -class CallbackWrapper { +class CallbackWrapperCAPI { public: /** Default constructor is deleted */ - CallbackWrapper() = delete; + CallbackWrapperCAPI() = delete; /** Constructor */ - CallbackWrapper(LsCallback cb, void* data) + CallbackWrapperCAPI(LsCallback cb, void* data) : cb_(cb) , data_(data) { if (cb_ == nullptr) { @@ -311,6 +311,45 @@ class CallbackWrapper { void* data_; }; +/** Class to wrap the C++ API ls callback for passing to the C API. */ +class CallbackWrapperCPP { + public: + /** + * Typedef for ls callback function invoked on each object visited by + * ls_recursive; its return value controls whether traversal continues. + * + * @param path The path of a visited object for the relative filesystem. + * @param size The size of the current object in bytes. + * @return True if the walk should continue, else false. + */ + using LsCallback = std::function; + + /** Default constructor is deleted */ + CallbackWrapperCPP() = delete; + + /** Constructor */ + CallbackWrapperCPP(LsCallback cb) + : cb_(cb) { + if (cb_ == nullptr) { + throw LsScanException("ls_recursive callback function cannot be null"); + } + } + + /** + * Operator to invoke the wrapped callback used in the C++ API. + * + * @param path The path of the object. + * @param size The size of the object in bytes. + * @return True if traversal should continue, False to stop. + */ + bool operator()(std::string_view path, uint64_t size) { + return cb_(path, size); + } + + private: + LsCallback cb_; +}; + } // namespace tiledb::sm #endif // TILEDB_LS_SCANNER_H