Skip to content

Commit

Permalink
cruft: parallelize globs matching
Browse files Browse the repository at this point in the history
The most expensive operation of cruft is filtering the existing
paths against the configured filters and excludes.  Perform that task in
parallel by using C++17 parallel algorithms.  One downside: the standard
library (libstdc++) does not implement the parallel backend itself, so
this requires linking against tbb (Threading Building Blocks).

Sequenced:

    Benchmark 1: ./cruft -E ./explain/ -F ./rules/ -I ./ignore -R ./ruleset
      Time (mean ± σ):      7.179 s ±  0.079 s    [User: 6.366 s, System: 2.519 s]
      Range (min … max):    7.060 s …  7.291 s    10 runs

Parallel:

    Benchmark 1: ./cruft -E ./explain/ -F ./rules/ -I ./ignore -R ./ruleset
      Time (mean ± σ):      4.762 s ±  0.058 s    [User: 6.659 s, System: 2.414 s]
      Range (min … max):    4.682 s …  4.849 s    10 runs
  • Loading branch information
cgzones committed Nov 9, 2022
1 parent f878160 commit 2308a90
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 27 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ dpkg_popen.o: dpkg_popen.cc dpkg.h
cruftold: $(SHARED_OBJS) dpkg_popen.o
$(CXX) $(CXXFLAGS) $(LDFLAGS) $(CPPFLAGS) $(SHARED_OBJS) dpkg_popen.o -lstdc++fs -o cruftold
cruft: $(SHARED_OBJS) dpkg_lib.o
$(CXX) $(CXXFLAGS) $(LDFLAGS) $(CPPFLAGS) $(SHARED_OBJS) dpkg_lib.o $(LIBDPKG_LIBS) -o cruft
$(CXX) $(CXXFLAGS) $(LDFLAGS) $(CPPFLAGS) $(SHARED_OBJS) dpkg_lib.o $(LIBDPKG_LIBS) -ltbb -o cruft

cpigsold: cpigs.o explain.o filters.o plocate.o shellexp.o usr_merge.o python.o dpkg_popen.o owner.o
$(CXX) $(CXXFLAGS) $(LDFLAGS) $(CPPFLAGS) cpigs.o explain.o filters.o plocate.o shellexp.o usr_merge.o python.o dpkg_popen.o owner.o -lstdc++fs -o cpigsold
Expand Down
82 changes: 56 additions & 26 deletions cruft.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,17 @@
#include <algorithm>
#include <ctime>

#ifdef __has_include
# if __has_include(<version>)
# include <version>
# endif
#endif

#if defined __cpp_lib_scoped_lock && defined __cpp_lib_parallel_algorithm
#include <execution>
#include <mutex>
#endif

#include <sys/stat.h>
#include <getopt.h>
#include <cstring>
Expand Down Expand Up @@ -119,6 +130,44 @@ static void one_file(const string& infile)
if (not matched) cerr << "no matching package found\n";
}

// Return the entries of `extras` that remain unexplained, i.e. those for
// which no entry of `globs` satisfies myglob(extra, glob.glob) and no entry
// of `explain` has a `glob` member equal to the path itself.
//
// When the standard library advertises both __cpp_lib_scoped_lock and
// __cpp_lib_parallel_algorithm the matching runs with execution::par and the
// returned vector is sorted; otherwise the matching runs sequentially and the
// result keeps the order of `extras`.
//
// The per-path verdicts are written into a pre-sized flag vector instead of
// being pushed into a shared result under a mutex, so the parallel region
// needs no locking; the result is collected afterwards on the calling thread.
static vector<string> filter_cruft(const vector<string>& extras, const vector<owner>& globs, const vector<owner>& explain)
{
    // One verdict per input path (1 = unexplained).  vector<char> rather
    // than vector<bool>: distinct char elements are distinct memory
    // locations and may be written concurrently, packed bits may not.
    vector<char> unexplained(extras.size(), 0);

    transform(
#if defined __cpp_lib_scoped_lock && defined __cpp_lib_parallel_algorithm
        execution::par,
#endif
        extras.begin(), extras.end(), unexplained.begin(),
        [&](const string& extra) -> char {
            const bool filtered = any_of(globs.begin(), globs.end(), [&](const auto& filter) {
                return myglob(extra, filter.glob);
            });
            if (filtered) {
                return 0;
            }

            // The explain entries are only consulted when no filter matched,
            // and are compared for exact equality rather than globbed.
            const bool explained = any_of(explain.begin(), explain.end(), [&](const auto& expl) {
                return extra == expl.glob;
            });
            return explained ? 0 : 1;
        });

    // Sequential gather: cheap compared to the matching above.
    vector<string> result;
    for (size_t i = 0; i < extras.size(); ++i) {
        if (unexplained[i]) {
            result.push_back(extras[i]);
        }
    }

#if defined __cpp_lib_scoped_lock && defined __cpp_lib_parallel_algorithm
    // Kept so the parallel build still returns a sorted list, exactly as
    // it did when results were appended in arbitrary thread order.
    sort(execution::par, result.begin(), result.end());
#endif

    return result;
}

static clock_t beg = clock();

static void elapsed(const string& action)
Expand Down Expand Up @@ -318,38 +367,19 @@ int main(int argc, char *argv[])
elapsed("missing2");
if (debug) cerr << "count stat():" << count_stat << '\n';

// match the globs against reduced database
vector<owner> globs;
read_filters(filter_dir, ruleset_file, packages,globs);
read_filters(filter_dir, ruleset_file, packages, globs);
elapsed("read filters");
vector<string> cruft3;
for (const auto& cr: cruft) {
bool match=false;
for (const auto& gl: globs) {
match=myglob(cr, gl.glob);
if (match) break;
}
if (!match) cruft3.push_back(cr);
}
elapsed("extra vs globs");
if (debug) cerr << cruft3.size() << " files in cruft3 database\n\n";

// match the dynamic "explain" filters
vector<owner> explain;
read_explain(explain_dir, packages, explain);
elapsed("read explain");
vector<string> cruft4;
for (const auto& cr: cruft3) {
bool match=false;
for (const auto& ex: explain) {
match=(cr==ex.glob);
if (match) break;
}
if (!match) cruft4.push_back(cr);
}
elapsed("extra vs explain");
if (debug) cerr << explain.size() << " explain entries\n";

if (debug) cerr << cruft4.size() << " files in cruft4 database\n";
// match the globs against reduced database
vector<string> cruft3 = filter_cruft(cruft, globs, explain);
elapsed("extra vs globs and explain");
if (debug) cerr << cruft3.size() << " files in cruft3 database\n";

//TODO: some smarter algo when run as non-root
// like checking the R/X bits of parent dir
Expand All @@ -360,7 +390,7 @@ int main(int argc, char *argv[])

//TODO: split by filesystem
cout << "---- unexplained: / ----\n";
for (const auto& cr: cruft4) {
for (const auto& cr: cruft3) {
cout << " " << cr;
auto bug = bugs.find(cr);
if (bug != bugs.end()) {
Expand Down
1 change: 1 addition & 0 deletions debian/control
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Maintainer: Alexandre Detiste <[email protected]>
Build-Depends:
pkg-config,
libdpkg-dev,
libtbb-dev,
debhelper-compat (= 13),
Standards-Version: 4.6.1.0
Homepage: https://github.com/a-detiste/cruft-ng/
Expand Down

0 comments on commit 2308a90

Please sign in to comment.