diff --git a/r/NEWS.md b/r/NEWS.md index 0e6e4634a0af8..1b5be1b544d4b 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -18,6 +18,7 @@ --> # arrow 17.0.0.9000 +* Expose an option `check_directory_existence_before_creation` in `S3FileSystem` which defaults to `FALSE`. If it's set to `FALSE`, when creating a directory the code will not check whether it already exists. It's an optimization to try directory creation and catch the error, rather than issue two dependent I/O calls. If set to `TRUE`, when creating a directory the code will only create the directory when necessary at the cost of extra I/O calls. This can be used for key/value cloud storage which has a hard rate limit on the number of object mutation operations or scenarios such as the directories already exist and you do not have creation access. # arrow 17.0.0 diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 62e2182ffcd52..901898e5b29a1 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1372,8 +1372,8 @@ fs___CopyFiles <- function(source_fs, source_sel, destination_fs, destination_ba invisible(.Call(`_arrow_fs___CopyFiles`, source_fs, source_sel, destination_fs, destination_base_dir, chunk_size, use_threads)) } -fs___S3FileSystem__create <- function(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_bucket_creation, allow_bucket_deletion, connect_timeout, request_timeout) { - .Call(`_arrow_fs___S3FileSystem__create`, anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_bucket_creation, allow_bucket_deletion, connect_timeout, request_timeout) +fs___S3FileSystem__create <- function(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, 
allow_bucket_creation, allow_bucket_deletion, check_directory_existence_before_creation, connect_timeout, request_timeout) { + .Call(`_arrow_fs___S3FileSystem__create`, anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_bucket_creation, allow_bucket_deletion, check_directory_existence_before_creation, connect_timeout, request_timeout) } fs___S3FileSystem__region <- function(fs) { diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 0176cdf846da7..8c8939c38f7a6 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -156,6 +156,13 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' buckets if `$CreateDir()` is called on the bucket level (default `FALSE`). #' - `allow_bucket_deletion`: logical, if TRUE, the filesystem will delete #' buckets if`$DeleteDir()` is called on the bucket level (default `FALSE`). +#' - `check_directory_existence_before_creation`: logical, if `FALSE`, when creating a directory the code will +#' not check whether it already exists. It's an optimization to try directory creation and catch the error, +#' rather than issue two dependent I/O calls. +#' If `TRUE`, when creating a directory the code will only create the directory when necessary +#' at the cost of extra I/O calls. This can be used for key/value cloud storage which has +#' a hard rate limit on the number of object mutation operations or scenarios such as +#' the directories already exist and you do not have creation access (default `FALSE`). #' - `request_timeout`: Socket read time on Windows and macOS in seconds. If #' negative, the AWS SDK default (typically 3 seconds). #' - `connect_timeout`: Socket connection timeout in seconds. If negative, AWS @@ -411,7 +418,8 @@ S3FileSystem$create <- function(anonymous = FALSE, ...) 
{ invalid_args <- intersect( c( "access_key", "secret_key", "session_token", "role_arn", "session_name", - "external_id", "load_frequency", "allow_bucket_creation", "allow_bucket_deletion" + "external_id", "load_frequency", "allow_bucket_creation", "allow_bucket_deletion", + "check_directory_existence_before_creation" ), names(args) ) @@ -459,6 +467,7 @@ default_s3_options <- list( background_writes = TRUE, allow_bucket_creation = FALSE, allow_bucket_deletion = FALSE, + check_directory_existence_before_creation = FALSE, connect_timeout = -1, request_timeout = -1 ) diff --git a/r/man/FileSystem.Rd b/r/man/FileSystem.Rd index dbf89ef1387ac..7b581922f5cb7 100644 --- a/r/man/FileSystem.Rd +++ b/r/man/FileSystem.Rd @@ -57,6 +57,9 @@ in the background, without blocking (default \code{TRUE}) buckets if \verb{$CreateDir()} is called on the bucket level (default \code{FALSE}). \item \code{allow_bucket_deletion}: logical, if TRUE, the filesystem will delete buckets if\verb{$DeleteDir()} is called on the bucket level (default \code{FALSE}). +\item \code{check_directory_existence_before_creation}: logical, if TRUE, the filesystem +will create new directories only when necessary at the cost of extra I/O calls. if FALSE, +the filesystem will create new directories without checking the existence (default \code{FALSE}). \item \code{request_timeout}: Socket read time on Windows and macOS in seconds. If negative, the AWS SDK default (typically 3 seconds). \item \code{connect_timeout}: Socket connection timeout in seconds. 
If negative, AWS diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index d5aec50219e0b..8a4a22c5b237a 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3521,8 +3521,8 @@ END_CPP11 } // filesystem.cpp #if defined(ARROW_R_WITH_S3) -std::shared_ptr fs___S3FileSystem__create(bool anonymous, std::string access_key, std::string secret_key, std::string session_token, std::string role_arn, std::string session_name, std::string external_id, int load_frequency, std::string region, std::string endpoint_override, std::string scheme, std::string proxy_options, bool background_writes, bool allow_bucket_creation, bool allow_bucket_deletion, double connect_timeout, double request_timeout); -extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp, SEXP allow_bucket_creation_sexp, SEXP allow_bucket_deletion_sexp, SEXP connect_timeout_sexp, SEXP request_timeout_sexp){ +std::shared_ptr fs___S3FileSystem__create(bool anonymous, std::string access_key, std::string secret_key, std::string session_token, std::string role_arn, std::string session_name, std::string external_id, int load_frequency, std::string region, std::string endpoint_override, std::string scheme, std::string proxy_options, bool background_writes, bool allow_bucket_creation, bool allow_bucket_deletion, bool check_directory_existence_before_creation, double connect_timeout, double request_timeout); +extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP 
proxy_options_sexp, SEXP background_writes_sexp, SEXP allow_bucket_creation_sexp, SEXP allow_bucket_deletion_sexp, SEXP check_directory_existence_before_creation_sexp, SEXP connect_timeout_sexp, SEXP request_timeout_sexp){ BEGIN_CPP11 arrow::r::Input::type anonymous(anonymous_sexp); arrow::r::Input::type access_key(access_key_sexp); @@ -3539,13 +3539,14 @@ BEGIN_CPP11 arrow::r::Input::type background_writes(background_writes_sexp); arrow::r::Input::type allow_bucket_creation(allow_bucket_creation_sexp); arrow::r::Input::type allow_bucket_deletion(allow_bucket_deletion_sexp); + arrow::r::Input::type check_directory_existence_before_creation(check_directory_existence_before_creation_sexp); arrow::r::Input::type connect_timeout(connect_timeout_sexp); arrow::r::Input::type request_timeout(request_timeout_sexp); - return cpp11::as_sexp(fs___S3FileSystem__create(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_bucket_creation, allow_bucket_deletion, connect_timeout, request_timeout)); + return cpp11::as_sexp(fs___S3FileSystem__create(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_bucket_creation, allow_bucket_deletion, check_directory_existence_before_creation, connect_timeout, request_timeout)); END_CPP11 } #else -extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp, SEXP allow_bucket_creation_sexp, SEXP allow_bucket_deletion_sexp, SEXP connect_timeout_sexp, SEXP request_timeout_sexp){ +extern "C" SEXP 
_arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp, SEXP allow_bucket_creation_sexp, SEXP allow_bucket_deletion_sexp, SEXP check_directory_existence_before_creation_sexp, SEXP connect_timeout_sexp, SEXP request_timeout_sexp){ Rf_error("Cannot call fs___S3FileSystem__create(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); } #endif diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index 2274a3d7ff7a2..20c392ce7df01 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -289,7 +289,8 @@ std::shared_ptr fs___S3FileSystem__create( std::string region = "", std::string endpoint_override = "", std::string scheme = "", std::string proxy_options = "", bool background_writes = true, bool allow_bucket_creation = false, bool allow_bucket_deletion = false, - double connect_timeout = -1, double request_timeout = -1) { + bool check_directory_existence_before_creation = false, double connect_timeout = -1, + double request_timeout = -1) { // We need to ensure that S3 is initialized before we start messing with the // options StopIfNotOk(fs::EnsureS3Initialized()); @@ -330,6 +331,8 @@ std::shared_ptr fs___S3FileSystem__create( s3_opts.allow_bucket_creation = allow_bucket_creation; s3_opts.allow_bucket_deletion = allow_bucket_deletion; + s3_opts.check_directory_existence_before_creation = + check_directory_existence_before_creation; s3_opts.request_timeout = request_timeout; s3_opts.connect_timeout = connect_timeout; diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R index 8dfac63471612..908c8252eaa4f 100644 --- a/r/tests/testthat/test-s3-minio.R +++ b/r/tests/testthat/test-s3-minio.R @@ -46,7 +46,8 @@ fs 
<- S3FileSystem$create( scheme = "http", endpoint_override = paste0("localhost:", minio_port), allow_bucket_creation = TRUE, - allow_bucket_deletion = TRUE + allow_bucket_deletion = TRUE, + check_directory_existence_before_creation = TRUE, ) limited_fs <- S3FileSystem$create( access_key = minio_key, @@ -54,7 +55,8 @@ limited_fs <- S3FileSystem$create( scheme = "http", endpoint_override = paste0("localhost:", minio_port), allow_bucket_creation = FALSE, - allow_bucket_deletion = FALSE + allow_bucket_deletion = FALSE, + check_directory_existence_before_creation = FALSE, ) now <- as.character(as.numeric(Sys.time())) fs$CreateDir(now) diff --git a/r/vignettes/fs.Rmd b/r/vignettes/fs.Rmd index 50278af25bd1b..07476877c5b8d 100644 --- a/r/vignettes/fs.Rmd +++ b/r/vignettes/fs.Rmd @@ -190,7 +190,7 @@ Also note that parameters in the URI need to be For S3, only the following options can be included in the URI as query parameters are `region`, `scheme`, `endpoint_override`, `access_key`, `secret_key`, `allow_bucket_creation`, -and `allow_bucket_deletion`. For GCS, the supported parameters are `scheme`, `endpoint_override`, +`allow_bucket_deletion` and `check_directory_existence_before_creation`. For GCS, the supported parameters are `scheme`, `endpoint_override`, and `retry_limit_seconds`. In GCS, a useful option is `retry_limit_seconds`, which sets the number of seconds