From cf83ca369446370467c5285d588d620c138def6c Mon Sep 17 00:00:00 2001 From: sfc-gh-ext-simba-hx Date: Tue, 24 Oct 2023 09:11:13 -0700 Subject: [PATCH 1/3] Add support for unicode characters in file path in PUT/GET command --- cpp/FileMetadataInitializer.cpp | 30 ++++--- cpp/FileMetadataInitializer.hpp | 7 +- cpp/FileTransferAgent.cpp | 16 ++-- include/snowflake/IStatementPutGet.hpp | 12 +++ tests/test_simple_put.cpp | 116 +++++++++++++++++++++++-- 5 files changed, 151 insertions(+), 30 deletions(-) diff --git a/cpp/FileMetadataInitializer.cpp b/cpp/FileMetadataInitializer.cpp index 519a95e9d2..208304699c 100755 --- a/cpp/FileMetadataInitializer.cpp +++ b/cpp/FileMetadataInitializer.cpp @@ -24,10 +24,12 @@ Snowflake::Client::FileMetadataInitializer::FileMetadataInitializer( std::vector &smallFileMetadata, - std::vector &largeFileMetadata) : + std::vector &largeFileMetadata, + IStatementPutGet *stmtPutGet) : m_smallFileMetadata(smallFileMetadata), m_largeFileMetadata(largeFileMetadata), - m_autoCompress(true) + m_autoCompress(true), + m_stmtPutGet(stmtPutGet) { } @@ -39,9 +41,9 @@ Snowflake::Client::FileMetadataInitializer::initUploadFileMetadata(const std::st fileNameFull += fileName; FileMetadata fileMetadata; - fileMetadata.srcFileName = fileNameFull; + fileMetadata.srcFileName = m_stmtPutGet->platformStringToUTF8(fileNameFull); fileMetadata.srcFileSize = fileSize; - fileMetadata.destFileName = std::string(fileName); + fileMetadata.destFileName = m_stmtPutGet->platformStringToUTF8(std::string(fileName)); // process compression type initCompressionMetadata(fileMetadata); @@ -56,9 +58,11 @@ void Snowflake::Client::FileMetadataInitializer::populateSrcLocUploadMetadata(st size_t putThreshold) { // looking for files on disk. 
+ std::string srcLocationPlatform = m_stmtPutGet->UTF8ToPlatformString(sourceLocation); + #ifdef _WIN32 WIN32_FIND_DATA fdd; - HANDLE hFind = FindFirstFile(sourceLocation.c_str(), &fdd); + HANDLE hFind = FindFirstFile(srcLocationPlatform.c_str(), &fdd); if (hFind == INVALID_HANDLE_VALUE) { DWORD dwError = GetLastError(); @@ -73,7 +77,7 @@ void Snowflake::Client::FileMetadataInitializer::populateSrcLocUploadMetadata(st { CXX_LOG_ERROR("Failed on FindFirstFile. Error: %d", dwError); throw SnowflakeTransferException(TransferError::DIR_OPEN_ERROR, - sourceLocation.c_str(), dwError); + srcLocationPlatform.c_str(), dwError); } } @@ -81,14 +85,14 @@ void Snowflake::Client::FileMetadataInitializer::populateSrcLocUploadMetadata(st if (!(fdd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) ) { std::string fileFullPath = std::string(fdd.cFileName); - size_t dirSep = sourceLocation.find_last_of(PATH_SEP); + size_t dirSep = srcLocationPlatform.find_last_of(PATH_SEP); if (dirSep == std::string::npos) { dirSep = sourceLocation.find_last_of(ALTER_PATH_SEP); } if (dirSep != std::string::npos) { - std::string dirPath = sourceLocation.substr(0, dirSep + 1); + std::string dirPath = srcLocationPlatform.substr(0, dirSep + 1); LARGE_INTEGER fileSize; fileSize.LowPart = fdd.nFileSizeLow; fileSize.HighPart = fdd.nFileSizeHigh; @@ -102,14 +106,14 @@ void Snowflake::Client::FileMetadataInitializer::populateSrcLocUploadMetadata(st { CXX_LOG_ERROR("Failed on FindNextFile. 
Error: %d", dwError); throw SnowflakeTransferException(TransferError::DIR_OPEN_ERROR, - sourceLocation.c_str(), dwError); + srcLocationPlatform.c_str(), dwError); } FindClose(hFind); #else - unsigned long dirSep = sourceLocation.find_last_of(PATH_SEP); - std::string dirPath = sourceLocation.substr(0, dirSep + 1); - std::string filePattern = sourceLocation.substr(dirSep + 1); + unsigned long dirSep = srcLocationPlatform.find_last_of(PATH_SEP); + std::string dirPath = srcLocationPlatform.substr(0, dirSep + 1); + std::string filePattern = srcLocationPlatform.substr(dirSep + 1); DIR * dir = nullptr; struct dirent * dir_entry; @@ -133,7 +137,7 @@ void Snowflake::Client::FileMetadataInitializer::populateSrcLocUploadMetadata(st { CXX_LOG_ERROR("Cannot read path struct"); throw SnowflakeTransferException(TransferError::DIR_OPEN_ERROR, - sourceLocation.c_str(), ret); + srcLocationPlatform.c_str(), ret); } } } diff --git a/cpp/FileMetadataInitializer.hpp b/cpp/FileMetadataInitializer.hpp index 31015a6751..f60239d1f8 100755 --- a/cpp/FileMetadataInitializer.hpp +++ b/cpp/FileMetadataInitializer.hpp @@ -9,6 +9,7 @@ #include #include "FileMetadata.hpp" #include "IStorageClient.hpp" +#include "snowflake/IStatementPutGet.hpp" // used to decide whether to upload in sequence or in parallel #define DEFAULT_UPLOAD_DATA_SIZE_THRESHOLD 209715200 //200Mb @@ -25,7 +26,8 @@ class FileMetadataInitializer { public: FileMetadataInitializer(std::vector &smallFileMetadata, - std::vector &largeFileMetadata); + std::vector &largeFileMetadata, + IStatementPutGet *stmtPutGet); /** * Given a source locations, find all files that match the location pattern, @@ -101,6 +103,9 @@ class FileMetadataInitializer /// Random device for crytpo random num generator. 
Crypto::CryptoRandomDevice m_randDevice; + + // statement which provides encoding conversion functionality + IStatementPutGet *m_stmtPutGet; }; } } diff --git a/cpp/FileTransferAgent.cpp b/cpp/FileTransferAgent.cpp index 69fd783953..e85752fce1 100755 --- a/cpp/FileTransferAgent.cpp +++ b/cpp/FileTransferAgent.cpp @@ -63,7 +63,7 @@ Snowflake::Client::FileTransferAgent::FileTransferAgent( IStatementPutGet *statement, TransferConfig *transferConfig) : m_stmtPutGet(statement), - m_FileMetadataInitializer(m_smallFilesMeta, m_largeFilesMeta), + m_FileMetadataInitializer(m_smallFilesMeta, m_largeFilesMeta, statement), m_executionResults(nullptr), m_storageClient(nullptr), m_lastRefreshTokenSec(0), @@ -488,7 +488,7 @@ RemoteStorageRequestOutcome Snowflake::Client::FileTransferAgent::uploadSingleFi srcFileStream = m_uploadStream; } else { try { - fs = ::std::fstream(fileMetadata->srcFileToUpload.c_str(), + fs = ::std::fstream(m_stmtPutGet->UTF8ToPlatformString(fileMetadata->srcFileToUpload).c_str(), ::std::ios_base::in | ::std::ios_base::binary); } @@ -613,12 +613,13 @@ void Snowflake::Client::FileTransferAgent::compressSourceFile( } std::string stagingFile(tempDir); - stagingFile += fileMetadata->destFileName; + stagingFile += m_stmtPutGet->UTF8ToPlatformString(fileMetadata->destFileName); + std::string srcFileNamePlatform = m_stmtPutGet->UTF8ToPlatformString(fileMetadata->srcFileName); - FILE *sourceFile = fopen(fileMetadata->srcFileName.c_str(), "r"); + FILE *sourceFile = fopen(srcFileNamePlatform.c_str(), "r"); if( !sourceFile ){ CXX_LOG_ERROR("Failed to open srcFileName %s. 
Errno: %d", fileMetadata->srcFileName.c_str(), errno); - throw SnowflakeTransferException(TransferError::FILE_OPEN_ERROR, fileMetadata->srcFileName.c_str(), -1); + throw SnowflakeTransferException(TransferError::FILE_OPEN_ERROR, srcFileNamePlatform.c_str(), -1); } FILE *destFile = fopen(stagingFile.c_str(), "w"); if ( !destFile) { @@ -626,7 +627,7 @@ void Snowflake::Client::FileTransferAgent::compressSourceFile( throw SnowflakeTransferException(TransferError::FILE_OPEN_ERROR, stagingFile.c_str(), -1); } // set srcFileToUpload after open file successfully to prevent command injection. - fileMetadata->srcFileToUpload = stagingFile; + fileMetadata->srcFileToUpload = m_stmtPutGet->platformStringToUTF8(stagingFile); int ret = Util::CompressionUtil::compressWithGzip(sourceFile, destFile, fileMetadata->srcFileToUploadSize, level); @@ -829,6 +830,7 @@ RemoteStorageRequestOutcome Snowflake::Client::FileTransferAgent::downloadSingle { fileMetadata->destPath = std::string(response.localLocation) + PATH_SEP + fileMetadata->destFileName; + std::string destPathPlatform = m_stmtPutGet->UTF8ToPlatformString(fileMetadata->destPath); RemoteStorageRequestOutcome outcome = RemoteStorageRequestOutcome::FAILED; RetryContext getRetryCtx(fileMetadata->srcFileName, m_maxGetRetries); @@ -839,7 +841,7 @@ RemoteStorageRequestOutcome Snowflake::Client::FileTransferAgent::downloadSingle std::basic_fstream dstFile; try { - dstFile = std::basic_fstream(fileMetadata->destPath.c_str(), + dstFile = std::basic_fstream(destPathPlatform.c_str(), std::ios_base::out | std::ios_base::binary); } catch (...) { diff --git a/include/snowflake/IStatementPutGet.hpp b/include/snowflake/IStatementPutGet.hpp index 15fb0e4431..3a70d82a43 100644 --- a/include/snowflake/IStatementPutGet.hpp +++ b/include/snowflake/IStatementPutGet.hpp @@ -77,6 +77,18 @@ class IStatementPutGet return NULL; } + // Utility functions to convert encoding between UTF-8 and the encoding + // from system locale. No conversion by default. 
+ virtual std::string UTF8ToPlatformString(const std::string& utf8_str) + { + return utf8_str; + } + + virtual std::string platformStringToUTF8(const std::string& platform_str) + { + return platform_str; + } + virtual ~IStatementPutGet() { diff --git a/tests/test_simple_put.cpp b/tests/test_simple_put.cpp index fe72763b5a..456eb3ff9d 100755 --- a/tests/test_simple_put.cpp +++ b/tests/test_simple_put.cpp @@ -13,6 +13,7 @@ #include "snowflake/IStatementPutGet.hpp" #include "StatementPutGet.hpp" #include "FileTransferAgent.hpp" +#include "boost/filesystem.hpp" #define COLUMN_STATUS "STATUS" #define COLUMN_SOURCE "SOURCE" @@ -27,6 +28,50 @@ #define MAX_BUF_SIZE 4096 using namespace ::Snowflake::Client; +using namespace boost::filesystem; + +static std::string PLATFORM_STR = "\xe9"; +static std::string UTF8_STR = "\xc3\xa9"; + +bool replaceInPlace( std::string& str, std::string const& replaceThis, std::string const& withThis ) { + bool replaced = false; + std::size_t i = str.find( replaceThis ); + while( i != std::string::npos ) { + replaced = true; + str = str.substr( 0, i ) + withThis + str.substr( i+replaceThis.size() ); + if( i < str.size()-withThis.size() ) + i = str.find( replaceThis, i+withThis.size() ); + else + i = std::string::npos; + } + return replaced; +} + +namespace Snowflake +{ +namespace Client +{ +class StatementPutGetUnicode : public Snowflake::Client::StatementPutGet +{ +public: + StatementPutGetUnicode(SF_STMT *stmt) : StatementPutGet(stmt) {} + virtual std::string UTF8ToPlatformString(const std::string& utf8_str) + { + std::string result = utf8_str; + replaceInPlace(result, UTF8_STR, PLATFORM_STR); + return result; + } + + virtual std::string platformStringToUTF8(const std::string& platform_str) + { + std::string result = platform_str; + replaceInPlace(result, PLATFORM_STR, UTF8_STR); + return result; + } +}; + +} +} //File list to be made available to re-upload. 
static std::vector fileList; @@ -64,11 +109,13 @@ void test_simple_put_core(const char * fileName, bool useS3regionalUrl = false, int compressLevel = -1, bool overwrite = false, - SF_CONNECT * connection = nullptr) + SF_CONNECT * connection = nullptr, + bool testUnicode = false) { /* init */ SF_STATUS status; SF_CONNECT *sf; + if (!connection) { sf = setup_snowflake_connection(); status = snowflake_connect(sf); @@ -103,14 +150,15 @@ void test_simple_put_core(const char * fileName, std::string dataDir = TestSetup::getDataDir(); std::string file = dataDir + fileName; - std::string putCommand = "put file://" + file + " @%test_small_put"; + replaceInPlace(file, "\\", "\\\\"); + std::string putCommand = "put 'file://" + file + "' @%test_small_put"; if(createDupTable) { - putCommand = "put file://" + std::string(fileName) + " @%test_small_put_dup"; + putCommand = "put 'file://" + std::string(fileName) + "' @%test_small_put_dup"; } else if (createSubfolder) { - putCommand = "put file://" + file + " @%test_small_put/subfolder"; + putCommand = "put 'file://" + file + "' @%test_small_put/subfolder"; } if (!autoCompress) @@ -132,8 +180,17 @@ void test_simple_put_core(const char * fileName, { putCommand += " overwrite=true"; } - std::unique_ptr stmtPutGet = std::unique_ptr - (new Snowflake::Client::StatementPutGet(sfstmt)); + std::unique_ptr stmtPutGet; + if (testUnicode) + { + stmtPutGet = std::unique_ptr + (new Snowflake::Client::StatementPutGetUnicode(sfstmt)); + } + else + { + stmtPutGet = std::unique_ptr + (new Snowflake::Client::StatementPutGet(sfstmt)); + } TransferConfig transConfig; TransferConfig * transConfigPtr = nullptr; @@ -282,7 +339,7 @@ static int teardown(void **unused) } void test_simple_get_data(const char *getCommand, const char *size, - long getThreshold = 0) + long getThreshold = 0, bool testUnicode = false) { /* init */ SF_STATUS status; @@ -296,8 +353,17 @@ void test_simple_get_data(const char *getCommand, const char *size, /* query */ sfstmt = 
snowflake_stmt(sf); - std::unique_ptr stmtPutGet = std::unique_ptr - (new Snowflake::Client::StatementPutGet(sfstmt)); + std::unique_ptr stmtPutGet; + if (testUnicode) + { + stmtPutGet = std::unique_ptr + (new Snowflake::Client::StatementPutGetUnicode(sfstmt)); + } + else + { + stmtPutGet = std::unique_ptr + (new Snowflake::Client::StatementPutGet(sfstmt)); + } TransferConfig transConfig; TransferConfig * transConfigPtr = nullptr; @@ -1502,6 +1568,37 @@ void test_upload_file_to_stage_using_stream(void **unused) snowflake_term(sf); } +void test_put_get_with_unicode(void **unused) +{ + std::string dataDir = TestSetup::getDataDir(); + std::string filename=PLATFORM_STR + ".csv"; + copy_file(dataDir + "small_file.csv", dataDir + filename, copy_option::overwrite_if_exists); + filename = UTF8_STR + ".csv"; + test_simple_put_core( + filename.c_str(), // filename + "auto", //source compression + true, // auto compress + true, // copyUploadFile + true, // verifyCopyUploadFile + false, // copyTableToStaging + false, // createDupTable + false, // setCustomThreshold + 64 * 1024 * 1024, // customThreshold + false, // useDevUrand + false, // createSubfolder + nullptr, // tmpDir + false, // useS3regionalUrl + -1, // compressLevel + false, // overwrite + nullptr, // connection + true // testUnicode + ); + + std::string getcmd = std::string("get '@%test_small_put/") + UTF8_STR +".csv.gz'" + " file://" + TestSetup::getDataDir(); + test_simple_get_data(getcmd.c_str(), "48", 0, true); +} + int main(void) { #ifdef __APPLE__ @@ -1533,6 +1630,7 @@ int main(void) { } const struct CMUnitTest tests[] = { + cmocka_unit_test_teardown(test_put_get_with_unicode, teardown), cmocka_unit_test_teardown(test_simple_put_auto_compress, teardown), cmocka_unit_test_teardown(test_simple_put_config_temp_dir, teardown), cmocka_unit_test_teardown(test_simple_put_auto_detect_gzip, teardown), From 88b94462ba9386c2060ab6459f86cee12fad57cd Mon Sep 17 00:00:00 2001 From: sfc-gh-ext-simba-hx Date: Tue, 24 Oct 2023 
11:26:12 -0700 Subject: [PATCH 2/3] fix build issue --- tests/test_unit_file_metadata_init.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/test_unit_file_metadata_init.cpp b/tests/test_unit_file_metadata_init.cpp index df4ff50234..592098da62 100755 --- a/tests/test_unit_file_metadata_init.cpp +++ b/tests/test_unit_file_metadata_init.cpp @@ -12,6 +12,10 @@ #include #include #include +#include +#include +#include "snowflake/IStatementPutGet.hpp" +#include "StatementPutGet.hpp" #define FILES_IN_DIR "file1.csv", "file2.csv", "file3.csv", "file4.csv", "file1.gz" @@ -70,13 +74,18 @@ std::vector getListOfTestFileMatchDir() void test_file_pattern_match_core(std::vector *expectedFiles, const char *filePattern) { + SF_CONNECT *sf = snowflake_init(); + SF_STMT *sfstmt = snowflake_stmt(sf); + std::unique_ptr stmtPutGet = std::unique_ptr + (new Snowflake::Client::StatementPutGet(sfstmt)); + std::vector listTestDir = getListOfTestFileMatchDir(); for (auto testDir : listTestDir) { std::vector smallFileMetadata; std::vector largeFileMetadata; - FileMetadataInitializer initializer(smallFileMetadata, largeFileMetadata); + FileMetadataInitializer initializer(smallFileMetadata, largeFileMetadata, stmtPutGet.get()); initializer.setSourceCompression((char *)"none"); std::string fullFilePattern = testDir + filePattern; From 64c8069428b1ec58091a44c2a168a7ba615a003f Mon Sep 17 00:00:00 2001 From: sfc-gh-ext-simba-hx Date: Tue, 24 Oct 2023 19:24:22 -0700 Subject: [PATCH 3/3] fix test failure --- tests/test_simple_put.cpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/test_simple_put.cpp b/tests/test_simple_put.cpp index 456eb3ff9d..189d7c802e 100755 --- a/tests/test_simple_put.cpp +++ b/tests/test_simple_put.cpp @@ -30,7 +30,11 @@ using namespace ::Snowflake::Client; using namespace boost::filesystem; +#ifdef _WIN32 static std::string PLATFORM_STR = "\xe9"; +#else +static std::string PLATFORM_STR = "é"; 
+#endif static std::string UTF8_STR = "\xc3\xa9"; bool replaceInPlace( std::string& str, std::string const& replaceThis, std::string const& withThis ) { @@ -150,15 +154,20 @@ void test_simple_put_core(const char * fileName, std::string dataDir = TestSetup::getDataDir(); std::string file = dataDir + fileName; - replaceInPlace(file, "\\", "\\\\"); - std::string putCommand = "put 'file://" + file + "' @%test_small_put"; + std::string putCommand = "put file://" + file + " @%test_small_put"; + if (testUnicode) + { + replaceInPlace(file, "\\", "\\\\"); + putCommand = "put 'file://" + file + "' @%test_small_put"; + } + if(createDupTable) { - putCommand = "put 'file://" + std::string(fileName) + "' @%test_small_put_dup"; + putCommand = "put file://" + std::string(fileName) + " @%test_small_put_dup"; } else if (createSubfolder) { - putCommand = "put 'file://" + file + "' @%test_small_put/subfolder"; + putCommand = "put file://" + file + " @%test_small_put/subfolder"; } if (!autoCompress) @@ -1630,7 +1639,6 @@ int main(void) { } const struct CMUnitTest tests[] = { - cmocka_unit_test_teardown(test_put_get_with_unicode, teardown), cmocka_unit_test_teardown(test_simple_put_auto_compress, teardown), cmocka_unit_test_teardown(test_simple_put_config_temp_dir, teardown), cmocka_unit_test_teardown(test_simple_put_auto_detect_gzip, teardown), @@ -1661,6 +1669,7 @@ int main(void) { cmocka_unit_test_teardown(test_simple_put_with_proxy_fromenv, teardown), cmocka_unit_test_teardown(test_simple_put_with_noproxy_fromenv, teardown), cmocka_unit_test_teardown(test_upload_file_to_stage_using_stream, donothing), + cmocka_unit_test_teardown(test_put_get_with_unicode, teardown), }; int ret = cmocka_run_group_tests(tests, gr_setup, gr_teardown); return ret;