From 560fff8eeba6ae4b554ff4b3de4010b2cdc94ecc Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 28 Feb 2022 18:00:40 +0100 Subject: [PATCH] feat(dedupe): simplify deduplication logic --- middleware/dedupe.js | 84 +++++++--------------------------- test/unit/middleware/dedupe.js | 35 ++++++++++++++ 2 files changed, 51 insertions(+), 68 deletions(-) diff --git a/middleware/dedupe.js b/middleware/dedupe.js index c2f015fd3..3d43d715a 100644 --- a/middleware/dedupe.js +++ b/middleware/dedupe.js @@ -12,19 +12,6 @@ const formatLog = (hit) => { return [name, zip, hit._id].filter(Boolean).join(' '); }; -/** - * Deduplication workflow: - * - * 1. iterate over results starting at position 0 - * 2. on each iteration search for duplicate candidates: - * 2.1 at higher positions in array - * 2.2 not contained in the skip-list - * 3. from the list of candidates, select a preferred master record - * 4. push master record on to return array - * 5. add non-master candidates to a skip-list - * 6. continue down list until end - */ - function dedupeResults(req, res, next) { // do nothing if request data is invalid @@ -33,72 +20,33 @@ function dedupeResults(req, res, next) { // do nothing if no result data is invalid if( _.isUndefined(res) || !_.isArray(res.data) || _.isEmpty(res.data) ){ return next(); } - // loop through data items and only copy unique items to unique - const unique = []; - - // maintain a skip-list - const skip = []; - // use the user agent language to improve deduplication const lang = _.get(req, 'clean.lang.iso6393'); - // 1. iterate over res.data - res.data.forEach((place, ppos) => { - - // skip records in the skip-list - if (skip.includes(place)){ return; } - - // 2. search for duplicate candidates - const candidates = res.data.filter((candidate, cpos) => { - - // 2.1 at higher positions in array - if (cpos <= ppos) { return false; } - - // 2.2 not contained in the skip-list - if (skip.includes(candidate)) { return false; } + // maintain a set of inferior records (by their array offsets) + const inferior = new Set(); + for (var i = 0; i < (res.data.length-1); i++) { + for (var j = (i+1); j < res.data.length; j++) { - // true if the two records are considered duplicates - return !isDifferent(place, candidate, lang); - }); + // ensure these two records are considered duplicates + if (isDifferent(res.data[i], res.data[j], lang)) { continue; } - // 3. select a preferred master record + // decide which of the two records was 'inferior' + // note: $preference equals true when $j is preferred and vice versa + const preference = isPreferred(res.data[i], res.data[j]); + inferior.add(preference ? i : j); - // simple case where no candidates were found - if (candidates.length === 0){ - unique.push(place); - return; - } - - // by default we consider the candidate with the lowest index as master - let master = place; - - // iterate over candidates looking for one which is preferred to - // the currently selected master - candidates.forEach(candidate => { - if (isPreferred(master, candidate)){ - master = candidate; - } - }); - - // logging - if (master !== place) { + // logging logger.debug('[dupe][replacing]', { query: req.clean.text, - previous: formatLog(place), - hit: formatLog(master) + superior: formatLog(res.data[preference ? j : i]), + inferior: formatLog(res.data[preference ? i : j]), }); } + } - // 4. push master record on to return array - unique.push(master); - - // 5. add non-master candidates to a skip-list - candidates.forEach(candidate => { - skip.push(candidate); - }); - }); - - // replace the original data with only the unique hits + // remove inferior records, return the remaining results + const unique = res.data.filter((v, o) => !inferior.has(o)); const maxElements = _.get(req, 'clean.size', undefined); res.data = unique.slice(0, maxElements); diff --git a/test/unit/middleware/dedupe.js b/test/unit/middleware/dedupe.js index 92fd3443b..f43e8ae70 100644 --- a/test/unit/middleware/dedupe.js +++ b/test/unit/middleware/dedupe.js @@ -820,6 +820,41 @@ module.exports.tests.priority = function(test, common) { t.end(); }); }); + + test('A->B B->C dependency graph', function (t) { + var req = { + clean: { + text: 'A B C', + size: 10 + } + }; + var res = { + data: [ + { + 'source': 'example', + 'source_id': 'A', + 'layer': 'test', + 'name': { 'default': ['name1'] } + }, { + 'source': 'example', + 'source_id': 'B', + 'layer': 'test', + 'name': { 'default': ['name1', 'name2'] } + }, { + 'source': 'example', + 'source_id': 'C', + 'layer': 'test', + 'name': { 'default': ['name2'] } + } + ] + }; + + dedupe(req, res, () => { + t.equal(res.data.length, 1, 'results are deduped'); + t.equal(res.data[0].source_id, 'A'); + t.end(); + }); + }); }; module.exports.all = function (tape, common) {