Skip to content

Commit

Permalink
feat(dedupe): simplify deduplication logic
Browse files Browse the repository at this point in the history
  • Loading branch information
missinglink committed Mar 2, 2022
1 parent bc53aee commit 93ec12d
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 68 deletions.
84 changes: 16 additions & 68 deletions middleware/dedupe.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,6 @@ const formatLog = (hit) => {
return [name, zip, hit._id].filter(Boolean).join(' ');
};

/**
* Deduplication workflow:
*
* 1. iterate over results starting at position 0
* 2. on each iteration search for duplicate candidates:
* 2.1 at higher positions in array
* 2.2 not contained in the skip-list
* 3. from the list of candidates, select a preferred master record
* 4. push master record on to return array
* 5. add non-master candidates to a skip-list
* 6. continue down list until end
*/

function dedupeResults(req, res, next) {

// do nothing if request data is invalid
Expand All @@ -33,72 +20,33 @@ function dedupeResults(req, res, next) {
// do nothing if result data is missing, not an array, or empty
if( _.isUndefined(res) || !_.isArray(res.data) || _.isEmpty(res.data) ){ return next(); }

// loop through data items and only copy unique items to unique
const unique = [];

// maintain a skip-list
const skip = [];

// use the user agent language to improve deduplication
const lang = _.get(req, 'clean.lang.iso6393');

// 1. iterate over res.data
res.data.forEach((place, ppos) => {

// skip records in the skip-list
if (skip.includes(place)){ return; }

// 2. search for duplicate candidates
const candidates = res.data.filter((candidate, cpos) => {

// 2.1 at higher positions in array
if (cpos <= ppos) { return false; }

// 2.2 not contained in the skip-list
if (skip.includes(candidate)) { return false; }
// maintain a set of inferior records (by their array offsets)
const inferior = new Set();
for (var i = 0; i < (res.data.length-1); i++) {
for (var j = (i+1); j < res.data.length; j++) {

// true if the two records are considered duplicates
return !isDifferent(place, candidate, lang);
});
// ensure these two records are considered duplicates
if (isDifferent(res.data[i], res.data[j], lang)) { continue; }

// 3. select a preferred master record
// decide which of the two records is 'inferior'
// note: `preference` is true when record j is preferred over record i
// (so i is marked inferior); otherwise j is marked inferior
const preference = isPreferred(res.data[i], res.data[j]);
inferior.add(preference ? i : j);

// simple case where no candidates were found
if (candidates.length === 0){
unique.push(place);
return;
}

// by default we consider the candidate with the lowest index as master
let master = place;

// iterate over candidates looking for one which is preferred to
// the currently selected master
candidates.forEach(candidate => {
if (isPreferred(master, candidate)){
master = candidate;
}
});

// logging
if (master !== place) {
// logging
logger.debug('[dupe][replacing]', {
query: req.clean.text,
previous: formatLog(place),
hit: formatLog(master)
superior: formatLog(res.data[preference ? j : i]),
inferior: formatLog(res.data[preference ? i : j]),
});
}
}

// 4. push master record on to return array
unique.push(master);

// 5. add non-master candidates to a skip-list
candidates.forEach(candidate => {
skip.push(candidate);
});
});

// replace the original data with only the unique hits
// remove inferior records, return the remaining results
const unique = res.data.filter((v, o) => !inferior.has(o));
const maxElements = _.get(req, 'clean.size', undefined);
res.data = unique.slice(0, maxElements);

Expand Down
35 changes: 35 additions & 0 deletions test/unit/middleware/dedupe.js
Original file line number Diff line number Diff line change
Expand Up @@ -820,6 +820,41 @@ module.exports.tests.priority = function(test, common) {
t.end();
});
});

test('A->B->C dependency graph', function (t) {
  // builds a 'locality' record from the 'example' source with the given
  // source_id and list of default names
  const makeRecord = (sourceId, names) => ({
    source: 'example',
    source_id: sourceId,
    layer: 'locality',
    name: { default: names }
  });

  // A and B share 'name1', B and C share 'name2', while A and C have no
  // name in common: the records only form a chain A -> B -> C
  const records = [
    makeRecord('A', ['name1']),
    makeRecord('B', ['name1', 'name2']),
    makeRecord('C', ['name2'])
  ];

  const request = { clean: { text: 'A B C', size: 10 } };
  const response = { data: records };

  // the whole chain is expected to collapse to the first record only
  const verify = () => {
    t.equal(response.data.length, 1, 'results are deduped');
    t.equal(response.data[0].source_id, 'A');
    t.end();
  };

  dedupe(request, response, verify);
});
};

module.exports.all = function (tape, common) {
Expand Down

0 comments on commit 93ec12d

Please sign in to comment.