From e8bd812d9ab097d52048c3c7473e98aff7ff2fe7 Mon Sep 17 00:00:00 2001 From: Oz Date: Sun, 7 Jul 2024 09:43:40 +0200 Subject: [PATCH] Rewrite directory indexing algorithm to not use recursion Fixes stack overflow issues when indexing deeply nested directory structures (close issue #223) --- src/internal/fsindexer.cpp | 45 +++++---- src/internal/fsindexer.hpp | 4 +- tests/src/test_bititemsvector.cpp | 154 ++++++++++++++++++++++++++++++ 3 files changed, 177 insertions(+), 26 deletions(-) diff --git a/src/internal/fsindexer.cpp b/src/internal/fsindexer.cpp index d34d743b..717d3ac9 100644 --- a/src/internal/fsindexer.cpp +++ b/src/internal/fsindexer.cpp @@ -18,6 +18,7 @@ #include "internal/fsitem.hpp" #include "internal/fsutil.hpp" #include "internal/genericinputitem.hpp" +#include "internal/stringutil.hpp" #include #include @@ -43,44 +44,42 @@ FilesystemIndexer::FilesystemIndexer( FilesystemItem directory, } // NOTE: It indexes all the items whose metadata are needed in the archive to be created! -// NOLINTNEXTLINE(misc-no-recursion) void FilesystemIndexer::listDirectoryItems( std::vector< std::unique_ptr< GenericInputItem > >& result, - bool recursive, - const fs::path& prefix ) { - fs::path path = mDirItem.filesystemPath(); - if ( !prefix.empty() ) { - path = path / prefix; - } + bool recursive ) { const bool includeRootPath = mFilter.empty() || !mDirItem.filesystemPath().has_parent_path() || mDirItem.inArchivePath().filename() != mDirItem.filesystemName(); const bool shouldIncludeMatchedItems = mPolicy == FilterPolicy::Include; + + fs::path basePath = mDirItem.filesystemPath(); std::error_code error; - for ( const auto& currentEntry : fs::directory_iterator( path, error ) ) { - auto searchPath = includeRootPath ? mDirItem.inArchivePath() : fs::path{}; - if ( !prefix.empty() ) { - searchPath = searchPath.empty() ? prefix : searchPath / prefix; - } + for ( auto iterator = fs::recursive_directory_iterator{ basePath, error }; + iterator != fs::recursive_directory_iterator{}; + ++iterator ) { + const auto& currentEntry = *iterator; + const auto& itemPath = currentEntry.path(); + + const auto prefix = fs::relative( itemPath, basePath, error ).remove_filename(); + const auto searchPath = includeRootPath ? mDirItem.inArchivePath() / prefix : prefix; + + const auto itemIsDir = currentEntry.is_directory( error ); + const auto itemName = path_to_tstring( itemPath.filename() ); - const FilesystemItem currentItem{ currentEntry, searchPath, mSymlinkPolicy }; /* An item matches if: * - Its name matches the wildcard pattern, and * - Either is a file, or we are interested also to include folders in the index. * * Note: The boolean expression uses short-circuiting to optimize the evaluation. */ - const bool itemMatches = ( !mOnlyFiles || !currentItem.isDir() ) && - fsutil::wildcard_match( mFilter, currentItem.name() ); + const bool itemMatches = ( !mOnlyFiles || !itemIsDir ) && fsutil::wildcard_match( mFilter, itemName ); if ( itemMatches == shouldIncludeMatchedItems ) { - result.emplace_back( std::make_unique< FilesystemItem >( currentItem ) ); + result.emplace_back( std::make_unique< FilesystemItem >( currentEntry, searchPath, mSymlinkPolicy ) ); } - if ( currentItem.isDir() && ( recursive || ( itemMatches == shouldIncludeMatchedItems ) ) ) { - //currentItem is a directory, and we must list it only if: - // > indexing is done recursively - // > indexing is not recursive, but the directory name matched the filter. - const fs::path nextDir = prefix.empty() ? - currentItem.filesystemName() : prefix / currentItem.filesystemName(); - listDirectoryItems( result, true, nextDir ); + /* We don't need to recurse inside the current item if: + * - it is not a directory; or + * - we are not indexing recursively, and the directory's name doesn't match the wildcard filter. */ + if ( !itemIsDir || ( !recursive && ( itemMatches != shouldIncludeMatchedItems ) ) ) { + iterator.disable_recursion_pending(); } } } diff --git a/src/internal/fsindexer.hpp b/src/internal/fsindexer.hpp index 678d17e3..c193c572 100644 --- a/src/internal/fsindexer.hpp +++ b/src/internal/fsindexer.hpp @@ -30,9 +30,7 @@ class FilesystemIndexer final { SymlinkPolicy symlinkPolicy = SymlinkPolicy::Follow, bool onlyFiles = false ); - void listDirectoryItems( std::vector< std::unique_ptr< GenericInputItem > >& result, - bool recursive, - const fs::path& prefix = fs::path{} ); + void listDirectoryItems( std::vector< std::unique_ptr< GenericInputItem > >& result, bool recursive ); private: FilesystemItem mDirItem; diff --git a/tests/src/test_bititemsvector.cpp b/tests/src/test_bititemsvector.cpp index 150e1329..c053304b 100644 --- a/tests/src/test_bititemsvector.cpp +++ b/tests/src/test_bititemsvector.cpp @@ -224,6 +224,10 @@ TEST_CASE( "BitItemsVector: Indexing a valid directory (only files)", "[bititems TEST_CASE( "BitItemsVector: Indexing a valid directory (retaining folder structure)", "[bititemsvector]" ) { static const TestDirectory testDir{ test_filesystem_dir }; + /* NOTE: BitItemsVector uses the retainFolderStructure option only to decide if it must include + * the root folder in the in-archive paths; the actual folder structure is still kept here, + * and will be discarded by the extract callback (e.g., see FileExtractCallback). */ + // TODO: Rationalize the handling of retainFolderStructure. IndexingOptions options{}; options.retainFolderStructure = true; @@ -1289,6 +1293,32 @@ TEST_CASE( "BitItemsVector: Indexing a valid directory (relative path)", "[bitit "test_filesystem/folder/subfolder2/frequency.xlsx" } }, + TestInputPath{ + "folder/..", + { + "italy.svg", + "Lorem Ipsum.pdf", + "noext", + BIT7Z_NATIVE_STRING( "σαράντα δύο.txt" ), + "dot.folder", + "dot.folder/hello.json", + "empty", + "folder", + "folder/clouds.jpg", + "folder/subfolder", + "folder/subfolder2", + "folder/subfolder2/homework.doc", + "folder/subfolder2/The quick brown fox.pdf", + "folder/subfolder2/frequency.xlsx" + } + }, + TestInputPath{ + "folder/../dot.folder", + { + "dot.folder", + "dot.folder/hello.json" + } + }, TestInputPath{ "../test_filesystem/empty", { "empty" } }, TestInputPath{ "../test_filesystem/folder", @@ -1302,6 +1332,18 @@ TEST_CASE( "BitItemsVector: Indexing a valid directory (relative path)", "[bitit "folder/subfolder2/frequency.xlsx" } }, + TestInputPath{ + "folder/subfolder2/../", + { + "folder", + "folder/clouds.jpg", + "folder/subfolder", + "folder/subfolder2", + "folder/subfolder2/homework.doc", + "folder/subfolder2/The quick brown fox.pdf", + "folder/subfolder2/frequency.xlsx" + } + }, TestInputPath{ "../test_filesystem/folder/subfolder2", { @@ -1350,6 +1392,32 @@ TEST_CASE( "BitItemsVector: Indexing a valid directory (relative path, non-recur "test_filesystem/folder/subfolder2/frequency.xlsx" } }, + TestInputPath{ + "folder/..", + { + "italy.svg", + "Lorem Ipsum.pdf", + "noext", + BIT7Z_NATIVE_STRING( "σαράντα δύο.txt" ), + "dot.folder", + "dot.folder/hello.json", + "empty", + "folder", + "folder/clouds.jpg", + "folder/subfolder", + "folder/subfolder2", + "folder/subfolder2/homework.doc", + "folder/subfolder2/The quick brown fox.pdf", + "folder/subfolder2/frequency.xlsx" + } + }, + TestInputPath{ + "folder/../dot.folder", + { + "dot.folder", + "dot.folder/hello.json" + } + }, TestInputPath{ "../test_filesystem/empty", { "empty" } }, TestInputPath{ "../test_filesystem/folder", @@ -1363,6 +1431,18 @@ TEST_CASE( "BitItemsVector: Indexing a valid directory (relative path, non-recur "folder/subfolder2/frequency.xlsx" } }, + TestInputPath{ + "folder/subfolder2/../", + { + "folder", + "folder/clouds.jpg", + "folder/subfolder", + "folder/subfolder2", + "folder/subfolder2/homework.doc", + "folder/subfolder2/The quick brown fox.pdf", + "folder/subfolder2/frequency.xlsx" + } + }, TestInputPath{ "../test_filesystem/folder/subfolder2", { @@ -1409,6 +1489,46 @@ TEST_CASE( "BitItemsVector: Indexing a valid directory (custom path mapping)", " "custom_folder/folder/subfolder2/frequency.xlsx" } }, + TestInputPath{ + "folder/..", + { + "custom_folder", + "custom_folder/dot.folder", + "custom_folder/dot.folder/hello.json", + "custom_folder/italy.svg", + "custom_folder/Lorem Ipsum.pdf", + "custom_folder/noext", + BIT7Z_NATIVE_STRING( "custom_folder/σαράντα δύο.txt" ), + "custom_folder/empty", + "custom_folder/folder", + "custom_folder/folder/clouds.jpg", + "custom_folder/folder/subfolder", + "custom_folder/folder/subfolder2", + "custom_folder/folder/subfolder2/homework.doc", + "custom_folder/folder/subfolder2/The quick brown fox.pdf", + "custom_folder/folder/subfolder2/frequency.xlsx" + } + }, + TestInputPath{ + "folder/../", + { + "custom_folder", + "custom_folder/dot.folder", + "custom_folder/dot.folder/hello.json", + "custom_folder/italy.svg", + "custom_folder/Lorem Ipsum.pdf", + "custom_folder/noext", + BIT7Z_NATIVE_STRING( "custom_folder/σαράντα δύο.txt" ), + "custom_folder/empty", + "custom_folder/folder", + "custom_folder/folder/clouds.jpg", + "custom_folder/folder/subfolder", + "custom_folder/folder/subfolder2", + "custom_folder/folder/subfolder2/homework.doc", + "custom_folder/folder/subfolder2/The quick brown fox.pdf", + "custom_folder/folder/subfolder2/frequency.xlsx" + } + }, TestInputPath{ "empty", { "custom_folder" } }, TestInputPath{ "./empty", { "custom_folder" } }, TestInputPath{ @@ -1435,6 +1555,30 @@ TEST_CASE( "BitItemsVector: Indexing a valid directory (custom path mapping)", " "custom_folder/subfolder2/frequency.xlsx" } }, + TestInputPath{ + "./folder/subfolder2/..", + { + "custom_folder", + "custom_folder/clouds.jpg", + "custom_folder/subfolder", + "custom_folder/subfolder2", + "custom_folder/subfolder2/homework.doc", + "custom_folder/subfolder2/The quick brown fox.pdf", + "custom_folder/subfolder2/frequency.xlsx" + } + }, + TestInputPath{ + "./folder/subfolder2/../", + { + "custom_folder", + "custom_folder/clouds.jpg", + "custom_folder/subfolder", + "custom_folder/subfolder2", + "custom_folder/subfolder2/homework.doc", + "custom_folder/subfolder2/The quick brown fox.pdf", + "custom_folder/subfolder2/frequency.xlsx" + } + }, TestInputPath{ "folder/subfolder2", { @@ -1823,9 +1967,11 @@ TEST_CASE( "BitItemsVector: Indexing a directory as a file should fail", "[bitit BitItemsVector itemsVector; REQUIRE_THROWS( itemsVector.indexFile( BIT7Z_STRING( "." ) ) ); REQUIRE_THROWS( itemsVector.indexFile( BIT7Z_STRING( "dot.folder" ) ) ); + REQUIRE_THROWS( itemsVector.indexFile( BIT7Z_STRING( "dot.folder/../" ) ) ); REQUIRE_THROWS( itemsVector.indexFile( BIT7Z_STRING( "empty" ) ) ); REQUIRE_THROWS( itemsVector.indexFile( BIT7Z_STRING( "folder" ) ) ); REQUIRE_THROWS( itemsVector.indexFile( BIT7Z_STRING( "folder/subfolder" ) ) ); + REQUIRE_THROWS( itemsVector.indexFile( BIT7Z_STRING( "folder/subfolder/../.." ) ) ); REQUIRE_THROWS( itemsVector.indexFile( BIT7Z_STRING( "folder/subfolder2" ) ) ); } @@ -1849,10 +1995,14 @@ TEST_CASE( "BitItemsVector: Indexing a single file", "[bititemsvector]" ) { const auto testInput = GENERATE( TestFile{ "Lorem Ipsum.pdf", "Lorem Ipsum.pdf" }, + TestFile{ "empty/../Lorem Ipsum.pdf", "Lorem Ipsum.pdf" }, + TestFile{ "folder/subfolder2/../../Lorem Ipsum.pdf", "Lorem Ipsum.pdf" }, TestFile{ "italy.svg", "italy.svg" }, TestFile{ "noext", "noext" }, TestFile{ BIT7Z_NATIVE_STRING( "σαράντα δύο.txt" ), BIT7Z_NATIVE_STRING( "σαράντα δύο.txt" ) }, TestFile{ "folder/clouds.jpg", "folder/clouds.jpg" }, + TestFile{ "dot.folder/../folder/clouds.jpg", "clouds.jpg" }, + TestFile{ "folder/subfolder2/../clouds.jpg", "clouds.jpg" }, TestFile{ "folder/subfolder2/homework.doc", "folder/subfolder2/homework.doc" }, TestFile{ "folder/subfolder2/The quick brown fox.pdf", "folder/subfolder2/The quick brown fox.pdf" }, TestFile{ "folder/subfolder2/frequency.xlsx", "folder/subfolder2/frequency.xlsx" }, @@ -1862,9 +2012,13 @@ TEST_CASE( "BitItemsVector: Indexing a single file", "[bititemsvector]" ) { const auto testInput = GENERATE( TestFile{ "Lorem Ipsum.pdf", "Lorem Ipsum.pdf" }, + TestFile{ "empty/../Lorem Ipsum.pdf", "Lorem Ipsum.pdf" }, + TestFile{ "folder/subfolder2/../../Lorem Ipsum.pdf", "Lorem Ipsum.pdf" }, TestFile{ "italy.svg", "italy.svg" }, TestFile{ "noext", "noext" }, TestFile{ "folder/clouds.jpg", "folder/clouds.jpg" }, + TestFile{ "dot.folder/../folder/clouds.jpg", "clouds.jpg" }, + TestFile{ "folder/subfolder2/../clouds.jpg", "clouds.jpg" }, TestFile{ "folder/subfolder2/homework.doc", "folder/subfolder2/homework.doc" }, TestFile{ "folder/subfolder2/The quick brown fox.pdf", "folder/subfolder2/The quick brown fox.pdf" }, TestFile{ "folder/subfolder2/frequency.xlsx", "folder/subfolder2/frequency.xlsx" },