Skip to content

Commit

Permalink
Support VFS ls_recursive API for posix filesystem. (#4778)
Browse files Browse the repository at this point in the history
@shaunrd0 previously added `ls_recursive` support for the S3 file
system. This pull request implements it for Posix, in a
hopefully-portable way (`std::filesystem::recursive_directory_iterator`)
which can be recycled for Windows.

Some notes about input/output:
- while `LsObjects` has a `std::string` component, the string is a URI.
- for posix, we include empty directories in the result set.

The recursion is implemented so that if a directory does not pass the
directory filter, then we do not descend into that directory. I have
added a unit test to check this behavior. Something like this could be
useful for S3 as well if the "list objects" request is done in
hierarchical mode.

---
TYPE: FEATURE
DESC: Support VFS `ls_recursive` API for posix filesystem.

---------

Co-authored-by: Luc Rancourt <[email protected]>
  • Loading branch information
rroelke and KiterLuc authored Mar 11, 2024
1 parent c0be5c8 commit a477ca6
Show file tree
Hide file tree
Showing 21 changed files with 835 additions and 119 deletions.
37 changes: 21 additions & 16 deletions test/src/unit-cppapi-vfs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ TEST_CASE("C++ API: Test VFS ls", "[cppapi][cppapi-vfs][cppapi-vfs-ls]") {
std::string file2 = dir + "/file2";
std::string subdir = dir + "/subdir";
std::string subdir2 = dir + "/subdir2";
std::string subdir_empty = dir + "/subdir_empty";
std::string subdir_file = subdir + "/file";
std::string subdir_file2 = subdir2 + "/file2";

Expand All @@ -71,6 +72,7 @@ TEST_CASE("C++ API: Test VFS ls", "[cppapi][cppapi-vfs][cppapi-vfs-ls]") {
vfs.create_dir(dir);
vfs.create_dir(subdir);
vfs.create_dir(subdir2);
vfs.create_dir(subdir_empty);
vfs.touch(file);
vfs.touch(file2);
vfs.touch(subdir_file);
Expand All @@ -85,15 +87,17 @@ TEST_CASE("C++ API: Test VFS ls", "[cppapi][cppapi-vfs][cppapi-vfs-ls]") {
file2 = tiledb::sm::path_win::uri_from_path(file2);
subdir = tiledb::sm::path_win::uri_from_path(subdir);
subdir2 = tiledb::sm::path_win::uri_from_path(subdir2);
subdir_empty = tiledb::sm::path_win::uri_from_path(subdir_empty);
#endif

// Check results
std::sort(children.begin(), children.end());
REQUIRE(children.size() == 4);
REQUIRE(children.size() == 5);
CHECK(children[0] == file);
CHECK(children[1] == file2);
CHECK(children[2] == subdir);
CHECK(children[3] == subdir2);
CHECK(children[4] == subdir_empty);

// Clean up
vfs.remove_dir(path);
Expand Down Expand Up @@ -503,13 +507,18 @@ TEST_CASE(
}
}

TEST_CASE("CPP API: VFS ls_recursive filter", "[cppapi][vfs][ls-recursive]") {
using ls_recursive_test_types =
std::tuple<tiledb::test::LocalFsTest, tiledb::test::S3Test>;
TEMPLATE_LIST_TEST_CASE(
"CPP API: VFS ls_recursive filter",
"[cppapi][vfs][ls-recursive]",
ls_recursive_test_types) {
using namespace tiledb::test;
S3Test s3_test({10, 100, 0});
if (!s3_test.is_supported()) {
TestType test({10, 100, 0});
if (!test.is_supported()) {
return;
}
auto expected_results = s3_test.expected_results();
auto expected_results = test.expected_results();

vfs_config cfg;
tiledb::Context ctx(tiledb::Config(&cfg.config));
Expand All @@ -534,14 +543,6 @@ TEST_CASE("CPP API: VFS ls_recursive filter", "[cppapi][vfs][ls-recursive]") {
include = [](std::string_view, uint64_t) { return false; };
}

bool include_result = true;
SECTION("Custom filter (include half)") {
include = [&include_result](std::string_view, uint64_t) {
include_result = !include_result;
return include_result;
};
}

SECTION("Custom filter (search for test_file_50)") {
include = [](std::string_view object_name, uint64_t) {
return object_name.find("test_file_50") != std::string::npos;
Expand All @@ -553,21 +554,25 @@ TEST_CASE("CPP API: VFS ls_recursive filter", "[cppapi][vfs][ls-recursive]") {
};
}
SECTION("Custom filter (reject files over 50 bytes)") {
include = [](std::string_view, uint64_t size) { return size <= 50; };
include = []([[maybe_unused]] std::string_view entry, uint64_t size) {
return size <= 50;
};
}

// Test collecting results with LsInclude predicate.
auto results = tiledb::VFSExperimental::ls_recursive_filter(
ctx, vfs, s3_test.temp_dir_.to_string(), include);
ctx, vfs, test.temp_dir_.to_string(), include);
std::erase_if(expected_results, [&include](const auto& object) {
return !include(object.first, object.second);
});
std::sort(results.begin(), results.end());
CHECK(results.size() == expected_results.size());
CHECK(expected_results == results);

// Test collecting results with LsCallback, writing data into ls_objects.
tiledb::VFSExperimental::ls_recursive(
ctx, vfs, s3_test.temp_dir_.to_string(), cb);
ctx, vfs, test.temp_dir_.to_string(), cb);
std::sort(ls_objects.begin(), ls_objects.end());
CHECK(ls_objects.size() == expected_results.size());
CHECK(expected_results == ls_objects);
}
Expand Down
4 changes: 2 additions & 2 deletions test/src/unit-tile-metadata-generator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ TEST_CASE(
bool empty_tile = test == "empty tile";

uint64_t max_string_size = 100;
uint64_t num_strings = 2000;
int num_strings = 2000;

// Generate the array schema.
uint64_t num_cells = empty_tile ? 0 : 20;
Expand All @@ -362,7 +362,7 @@ TEST_CASE(
// Generate random, sorted strings for the string ascii type.
std::vector<std::string> strings;
strings.reserve(num_strings);
for (uint64_t i = 0; i < num_strings; i++) {
for (int i = 0; i < num_strings; i++) {
strings.emplace_back(tiledb::test::random_string(rand() % max_string_size));
}
std::sort(strings.begin(), strings.end());
Expand Down
4 changes: 2 additions & 2 deletions test/src/unit-tile-metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -748,12 +748,12 @@ struct CPPVarTileMetadataFx {
std::default_random_engine random_engine;

uint64_t max_string_size = 100;
uint64_t num_strings = 2000;
int num_strings = 2000;

if (f == 0) {
// Generate random, sorted strings for the string ascii type.
strings_.reserve(num_strings);
for (uint64_t i = 0; i < num_strings; i++) {
for (int i = 0; i < num_strings; i++) {
strings_.emplace_back(
tiledb::test::random_string(rand() % max_string_size));
}
Expand Down
18 changes: 18 additions & 0 deletions test/support/src/vfs_helpers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,24 @@ LocalFsTest::LocalFsTest(const std::vector<size_t>& test_tree)
temp_dir_ =
tiledb::test::test_dir(prefix_ + tiledb::sm::Posix::current_dir() + "/");
#endif

vfs_.create_dir(temp_dir_).ok();
// TODO: We could refactor to remove duplication with S3Test()
for (size_t i = 1; i <= test_tree_.size(); i++) {
sm::URI path = temp_dir_.join_path("subdir_" + std::to_string(i));
vfs_.create_dir(path).ok();
expected_results().emplace_back(path.to_string(), 0);
for (size_t j = 1; j <= test_tree_[i - 1]; j++) {
auto object_uri = path.join_path("test_file_" + std::to_string(j));
vfs_.touch(object_uri).ok();
std::string data(j * 10, 'a');
vfs_.open_file(object_uri, sm::VFSMode::VFS_WRITE).ok();
vfs_.write(object_uri, data.data(), data.size()).ok();
vfs_.close_file(object_uri).ok();
expected_results().emplace_back(object_uri.to_string(), data.size());
}
}
std::sort(expected_results().begin(), expected_results().end());
}

} // namespace tiledb::test
1 change: 0 additions & 1 deletion tiledb/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,6 @@ set(TILEDB_CORE_SOURCES
${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/filesystem/uri.cc
${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/filesystem/vfs.cc
${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/filesystem/vfs_file_handle.cc
${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/filesystem/ls_scanner.cc
${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/filesystem/win.cc
${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/filesystem/filesystem_base.cc
${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/filter/bit_width_reduction_filter.cc
Expand Down
3 changes: 2 additions & 1 deletion tiledb/api/c_api/vfs/vfs_api_experimental.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ typedef int32_t (*tiledb_ls_callback_t)(
* on error. The callback is responsible for writing gathered entries into the
* `data` buffer, for example using a pointer to a user-defined struct.
*
* Currently only S3 is supported, and the `path` must be a valid S3 URI.
* Currently only Posix and S3 are supported, and the `path` must be a valid URI
* for one of those filesystems.
*
* **Example:**
*
Expand Down
7 changes: 3 additions & 4 deletions tiledb/common/stdx_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,21 +31,20 @@

namespace tiledb::stdx::string {

bool starts_with(const std::string& value, const std::string& prefix) {
bool starts_with(std::string_view value, std::string_view prefix) {
if (prefix.size() > value.size())
return false;
return std::equal(prefix.begin(), prefix.end(), value.begin());
}

bool ends_with(const std::string& value, const std::string& suffix) {
bool ends_with(std::string_view value, std::string_view suffix) {
if (suffix.size() > value.size())
return false;
return value.compare(value.size() - suffix.size(), suffix.size(), suffix) ==
0;
}

size_t common_prefix_size(
const std::string_view& a, const std::string_view& b) {
size_t common_prefix_size(std::string_view a, std::string_view b) {
size_t size = std::min(a.size(), b.size());
for (size_t i = 0; i < size; ++i) {
if (a[i] != b[i])
Expand Down
10 changes: 5 additions & 5 deletions tiledb/common/stdx_string.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ namespace tiledb::stdx::string {
* @param prefix The prefix string to be tested.
* @return `true` if `value` starts with `prefix`; `false` otherwise.
*/
bool starts_with(const std::string& value, const std::string& prefix);
bool starts_with(std::string_view value, std::string_view prefix);

/**
* Checks if a string ends with a certain suffix.
Expand All @@ -55,7 +55,7 @@ bool starts_with(const std::string& value, const std::string& prefix);
* @param suffix The suffix to be tested.
* @return `true` if `value` ends with `suffix`; `false` otherwise.
*/
bool ends_with(const std::string& value, const std::string& suffix);
bool ends_with(std::string_view value, std::string_view suffix);

/**
* Returns the size of the common prefix between `a` and `b`.
Expand All @@ -64,18 +64,18 @@ bool ends_with(const std::string& value, const std::string& suffix);
* a[0..n) == b[0..n) (These ranges are empty if n == 0.)
* n == length of a or n == length of b or a[n] != b[n]
*/
size_t common_prefix_size(const std::string_view& a, const std::string_view& b);
size_t common_prefix_size(std::string_view a, std::string_view b);

} // namespace tiledb::stdx::string

/*
* Functions forwarded into the legacy namespace.
*/
namespace tiledb::sm::utils::parse {
inline bool starts_with(const std::string& value, const std::string& prefix) {
inline bool starts_with(std::string_view value, std::string_view prefix) {
return tiledb::stdx::string::starts_with(value, prefix);
}
inline bool ends_with(const std::string& value, const std::string& suffix) {
inline bool ends_with(std::string_view value, std::string_view suffix) {
return tiledb::stdx::string::ends_with(value, suffix);
}
} // namespace tiledb::sm::utils::parse
Expand Down
1 change: 0 additions & 1 deletion tiledb/sm/filesystem/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ commence(object_library vfs)
uri.cc
vfs.cc
vfs_file_handle.cc
ls_scanner.cc
win.cc
filesystem_base.cc
../curl/curl_init.cc
Expand Down
33 changes: 0 additions & 33 deletions tiledb/sm/filesystem/ls_scanner.cc

This file was deleted.

57 changes: 57 additions & 0 deletions tiledb/sm/filesystem/ls_scanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include "tiledb/sm/filesystem/uri.h"

#include <cstdint>
#include <filesystem>
#include <functional>
#include <stdexcept>

Expand Down Expand Up @@ -73,6 +74,10 @@ class LsStopTraversal : public LsScanException {
};

using FileFilter = std::function<bool(const std::string_view&, uint64_t)>;
[[maybe_unused]] static bool accept_all_files(
const std::string_view&, uint64_t) {
return true;
}

using DirectoryFilter = std::function<bool(const std::string_view&)>;
/** Static DirectoryFilter used as default argument. */
Expand Down Expand Up @@ -350,6 +355,58 @@ class CallbackWrapperCPP {
LsCallback cb_;
};

/**
* Implements the `ls_filtered` function for `std::filesystem` which can be used
* for Posix and Win32
*/
template <FilePredicate F, DirectoryPredicate D>
LsObjects std_filesystem_ls_filtered(
const URI& parent, F file_filter, D directory_filter, bool recursive) {
/*
* The input URI was useful to the top-level VFS to identify this is a
* regular filesystem path, but we don't need the "file://" qualifier
* anymore and can reason with unqualified strings for the rest of the
* function.
*/
const auto parentstr = parent.to_path();

LsObjects qualifyingPaths;

// awkward way of iterating, avoids bug in OSX
auto begin = std::filesystem::recursive_directory_iterator(parentstr);
auto end = std::filesystem::recursive_directory_iterator();

for (auto iter = begin; iter != end; ++iter) {
auto& entry = *iter;
const auto abspath = entry.path().string();
const auto absuri = URI(abspath);
if (entry.is_directory()) {
if (file_filter(absuri, 0) || directory_filter(absuri)) {
qualifyingPaths.push_back(
std::make_pair(tiledb::sm::URI(abspath).to_string(), 0));
if (!recursive) {
iter.disable_recursion_pending();
}
} else {
/* do not descend into directories which don't qualify */
iter.disable_recursion_pending();
}
} else {
/*
* A leaf of the filesystem
* (or symbolic link - split to a separate case if we want to descend into
* them)
*/
if (file_filter(absuri, entry.file_size())) {
qualifyingPaths.push_back(
std::make_pair(absuri.to_string(), entry.file_size()));
}
}
}

return qualifyingPaths;
}

} // namespace tiledb::sm

#endif // TILEDB_LS_SCANNER_H
6 changes: 4 additions & 2 deletions tiledb/sm/filesystem/path_win.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,13 @@ std::string uri_from_path(const std::string& path) {
return str_uri;
}

std::string path_from_uri(const std::string& uri) {
if (uri.length() == 0) {
std::string path_from_uri(std::string_view uri_view) {
if (uri_view.length() == 0) {
return "";
}

std::string uri(uri_view);

std::string uri_with_scheme =
(stdx::string::starts_with(uri, "file://") ||
// also accept 'file:/x...'
Expand Down
2 changes: 1 addition & 1 deletion tiledb/sm/filesystem/path_win.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ std::string uri_from_path(const std::string& path);
* @param path The URI to convert.
* @status A Windows path.
*/
std::string path_from_uri(const std::string& uri);
std::string path_from_uri(std::string_view uri);

/**
* Converts any '/' to '\\' (single-backslash) and returns the
Expand Down
Loading

0 comments on commit a477ca6

Please sign in to comment.