Skip to content

Commit

Permalink
cruft: parallelize globs matching
Browse files Browse the repository at this point in the history
The most expensive operation in cruft is filtering the existing paths
against the configured filters and excludes.  Perform that task in
parallel using the C++17 parallel algorithms.  One downside: because the
standard library does not implement this functionality itself, this
requires linking against TBB (Threading Building Blocks).

Sequenced:

    Benchmark 1: ./cruft -E ./explain/ -F ./rules/ -I ./ignore -R ./ruleset
      Time (mean ± σ):      7.179 s ±  0.079 s    [User: 6.366 s, System: 2.519 s]
      Range (min … max):    7.060 s …  7.291 s    10 runs

Parallel:

    Benchmark 1: ./cruft -E ./explain/ -F ./rules/ -I ./ignore -R ./ruleset
      Time (mean ± σ):      4.762 s ±  0.058 s    [User: 6.659 s, System: 2.414 s]
      Range (min … max):    4.682 s …  4.849 s    10 runs
  • Loading branch information
cgzones committed Jun 7, 2023
1 parent cd9de58 commit f0b73d0
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 26 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ dpkg_popen.o: dpkg_popen.cc dpkg.h
cruftold: $(SHARED_OBJS) $(CRUFT_OBJS) mlocate.o dpkg_popen.o
$(CXX) $(CXXFLAGS) $(LDFLAGS) $(CPPFLAGS) $(SHARED_OBJS) $(CRUFT_OBJS) mlocate.o dpkg_popen.o -lstdc++fs -pthread -o cruftold
cruft: $(SHARED_OBJS) $(CRUFT_OBJS) plocate.o dpkg_lib.o
$(CXX) $(CXXFLAGS) $(LDFLAGS) $(CPPFLAGS) $(SHARED_OBJS) $(CRUFT_OBJS) plocate.o dpkg_lib.o $(LIBDPKG_LIBS) -pthread -o cruft
$(CXX) $(CXXFLAGS) $(LDFLAGS) $(CPPFLAGS) $(SHARED_OBJS) $(CRUFT_OBJS) plocate.o dpkg_lib.o $(LIBDPKG_LIBS) -pthread -ltbb -o cruft

cpigsold: $(SHARED_OBJS) cpigs.o mlocate.o dpkg_popen.o
$(CXX) $(CXXFLAGS) $(LDFLAGS) $(CPPFLAGS) $(SHARED_OBJS) cpigs.o mlocate.o dpkg_popen.o -lstdc++fs -o cpigsold
Expand Down
80 changes: 55 additions & 25 deletions cruft.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,17 @@
#include <ctime>
#include <thread>

#ifdef __has_include
# if __has_include(<version>)
# include <version>
# endif
#endif

#if defined __cpp_lib_scoped_lock && defined __cpp_lib_parallel_algorithm
#include <execution>
#include <mutex>
#endif

#include <sys/stat.h>
#include <getopt.h>
#include <cstring>
Expand Down Expand Up @@ -132,6 +143,44 @@ static void one_file(const string& path)
cerr << "no matching package found\n";
}

// Return the entries of `extras` that are matched neither by any glob in
// `globs` (tested with myglob()) nor by an exact path comparison against
// any entry of `explain`.
//
// When the standard library advertises both scoped_lock and the parallel
// algorithms, the entries are examined with execution::par; survivors are
// appended under a mutex and the result is sorted afterwards.  Otherwise the
// entries are examined one after another and kept in their input order.
static vector<string> filter_cruft(const vector<string>& extras, const vector<owner>& globs, const vector<owner>& explain)
{
    vector<string> unexplained;

    // True when no glob and no explain entry accounts for `candidate`.
    // The globs are consulted first; explain is only scanned if none matched.
    const auto is_unexplained = [&globs, &explain](const auto& candidate) {
        const bool globbed = any_of(globs.begin(), globs.end(), [&candidate](const auto& filter) {
            return myglob(candidate, filter.path);
        });
        if (globbed)
            return false;

        return none_of(explain.begin(), explain.end(), [&candidate](const auto& entry) {
            return candidate == entry.path;
        });
    };

#if defined __cpp_lib_scoped_lock && defined __cpp_lib_parallel_algorithm
    // Several workers may append at once, so the shared vector is guarded.
    mutex guard;

    for_each(execution::par, extras.begin(), extras.end(), [&](const auto& candidate) {
        if (is_unexplained(candidate)) {
            scoped_lock<mutex> hold { guard };
            unexplained.push_back(candidate);
        }
    });

    // Insertion order under execution::par is arbitrary; sort the survivors.
    sort(execution::par, unexplained.begin(), unexplained.end());
#else
    for (const auto& candidate : extras) {
        if (is_unexplained(candidate))
            unexplained.push_back(candidate);
    }
#endif

    return unexplained;
}

// Processor time as reported by clock(), sampled once when this file-scope
// static is initialized.
// NOTE(review): presumably the baseline elapsed() measures against — its
// body is not visible here; confirm.
static clock_t beg = clock();

static void elapsed(const string& action)
Expand Down Expand Up @@ -332,38 +381,19 @@ int main(int argc, char *argv[])
elapsed("missing2");
if (debug) cerr << "count stat():" << count_stat << '\n';

// match the globs against reduced database
vector<owner> globs;
read_filters(filter_dir, ruleset_file, packages, globs);
elapsed("read filters");
vector<string> cruft3;
for (const auto& cr: cruft) {
bool match=false;
for (const auto& gl: globs) {
match=myglob(cr, gl.path);
if (match) break;
}
if (!match) cruft3.push_back(cr);
}
elapsed("extra vs globs");
if (debug) cerr << cruft3.size() << " files in cruft3 database\n\n";

// match the dynamic "explain" filters
vector<owner> explain;
read_explain(explain_dir, packages, explain);
elapsed("read explain");
vector<string> cruft4;
for (const auto& cr: cruft3) {
bool match=false;
for (const auto& ex: explain) {
match=(cr==ex.path);
if (match) break;
}
if (!match) cruft4.push_back(cr);
}
elapsed("extra vs explain");
if (debug) cerr << explain.size() << " explain entries\n";

if (debug) cerr << cruft4.size() << " files in cruft4 database\n";
// match the globs against reduced database
vector<string> cruft3 = filter_cruft(cruft, globs, explain);
elapsed("extra vs globs and explain");
if (debug) cerr << cruft3.size() << " files in cruft3 database\n";

//TODO: some smarter algo when run as non-root
// like checking the R/X bits of parent dir
Expand All @@ -374,7 +404,7 @@ int main(int argc, char *argv[])

//TODO: split by filesystem
cout << "---- unexplained: / ----\n";
for (const auto& cr: cruft4) {
for (const auto& cr: cruft3) {
cout << " " << cr;
auto bug = bugs.find(cr);
if (bug != bugs.end()) {
Expand Down
1 change: 1 addition & 0 deletions debian/control
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Build-Depends:
Build-Depends-Arch:
pkgconf,
libdpkg-dev,
libtbb-dev,
Standards-Version: 4.6.1.0
Homepage: https://github.com/a-detiste/cruft-ng/
Vcs-Git: https://github.com/a-detiste/cruft-ng.git
Expand Down

0 comments on commit f0b73d0

Please sign in to comment.