Skip to content

Commit

Permalink
Merge pull request #379 from openzim/clone_check
Browse files Browse the repository at this point in the history
Do not detect clone entry as duplicated content.
  • Loading branch information
mgautierfr authored Dec 18, 2023
2 parents 95af479 + 48616c5 commit bb6d10f
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 27 deletions.
2 changes: 1 addition & 1 deletion meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ if static_linkage
endif
endif

libzim_dep = dependency('libzim', version : '>=8.0.0', static:static_linkage)
libzim_dep = dependency('libzim', version : '>=9.1.0', static:static_linkage)
with_xapian_support = compiler.has_header_symbol('zim/zim.h', 'LIBZIM_WITH_XAPIAN')

find_library_in_compiler = meson.version().version_compare('>=0.31.0')
Expand Down
44 changes: 18 additions & 26 deletions src/zimcheck/checks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,30 +14,10 @@
#include <mutex>
#include <thread>
#include <queue>
#include <optional>
#include <zim/archive.h>
#include <zim/item.h>

// std::hash support so that LogTag can be used as a key of our unordered_map.
// Only needed for pre-C++14 toolchains; can be removed in c++14.
// NOTE(review): presumably LogTag is an enumeration - confirm against its declaration.
namespace std {
template <>
struct hash<LogTag> {
  size_t operator() (const LogTag& tag) const
  {
    // The hash is simply the value of the tag converted to size_t.
    return static_cast<size_t>(tag);
  }
};
}

// std::hash support so that TestType can be used as a key of our unordered_map.
// Only needed for pre-C++14 toolchains; can be removed in c++14.
// NOTE(review): presumably TestType is an enumeration - confirm against its declaration.
namespace std {
template <>
struct hash<TestType> {
  size_t operator() (const TestType& testType) const
  {
    // The hash is simply the value of the test type converted to size_t.
    return static_cast<size_t>(testType);
  }
};
}

// std::hash support so that MsgId can be used as a key of our unordered_map.
// Only needed for pre-C++14 toolchains; can be removed in c++14.
// NOTE(review): presumably MsgId is an enumeration - confirm against its declaration.
namespace std {
template <>
struct hash<MsgId> {
  size_t operator() (const MsgId& id) const
  {
    // The hash is simply the value of the message id converted to size_t.
    return static_cast<size_t>(id);
  }
};
}

namespace
{

Expand Down Expand Up @@ -113,6 +93,11 @@ SortedMsgParams sortedMsgParams(const MsgParams& msgParams)
return SortedMsgParams(msgParams.begin(), msgParams.end());
}

// Tells whether two items designate the same stored blob, i.e. whether they
// report both the same cluster index and the same blob index.
//
// The redundancy check uses this to skip such pairs instead of comparing
// their data and reporting them as duplicated content.
//
// Returns true only when both indices match; the blob index is not queried
// when the cluster indices already differ.
bool areAliases(const zim::Item& i1, const zim::Item& i2)
{
  if (i1.getClusterIndex() != i2.getClusterIndex()) {
    return false;
  }
  return i1.getBlobIndex() == i2.getBlobIndex();
}

} // unnamed namespace

namespace JSON
Expand Down Expand Up @@ -487,15 +472,22 @@ void ArticleChecker::detect_redundant_articles()
progress.report();
auto l = it.second;
while ( !l.empty() ) {
const auto e1 = archive.getEntryByPath(l.front());
// The way we have constructed `l`, e1 MUST BE an item
const auto e1 = archive.getEntryByPath(l.front()).getItem();
l.pop_front();
if ( !l.empty() ) {
// The way we have constructed `l`, e1 MUST BE an item
const std::string s1 = e1.getItem().getData();
std::optional<std::string> s1;
decltype(l) articlesDifferentFromE1;
for(auto other : l) {
auto e2 = archive.getEntryByPath(other);
std::string s2 = e2.getItem().getData();
// The way we have constructed `l`, e2 MUST BE an item
const auto e2 = archive.getEntryByPath(other).getItem();
if (areAliases(e1, e2)) {
continue;
}
if (!s1) {
s1 = e1.getData();
}
std::string s2 = e2.getData();
if (s1 != s2 ) {
articlesDifferentFromE1.push_back(other);
continue;
Expand Down

0 comments on commit bb6d10f

Please sign in to comment.