From ec7def00371e2919ac0faeb36bc0c4159c48a50a Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 17 Aug 2021 13:32:05 +0200 Subject: [PATCH] feat(diff-identity): dedupe by comparing to identity of parent property --- helper/diffPlaces.js | 58 +++++++++++++++++++++- test/unit/helper/diffPlaces.js | 89 ++++++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 2 deletions(-) diff --git a/helper/diffPlaces.js b/helper/diffPlaces.js index 0094f212b..d11ef0ba3 100644 --- a/helper/diffPlaces.js +++ b/helper/diffPlaces.js @@ -124,13 +124,14 @@ function isNameDifferent(item1, item2, requestLanguage){ // iterate over all the languages in item2, comparing them to the // 'default' name of item1 and also against the language requested by the user. for( let lang in names2 ){ + if (!names2.hasOwnProperty(lang)) { continue; } // do not iterate over inherited properties if( !isPropertyDifferent({[lang]: names1.default}, names2, lang) ){ return false; } if( requestLanguage && !isPropertyDifferent({[lang]: names1[requestLanguage]}, names2, lang) ){ return false; } } - // iterate over all the languages in item1, comparing them to the - // 'default' name of item2 and also against the language requested by the user. + // as above, but inverse for( let lang in names1 ){ + if (!names1.hasOwnProperty(lang)) { continue; } // do not iterate over inherited properties if( !isPropertyDifferent({[lang]: names2.default}, names1, lang) ){ return false; } if( requestLanguage && !isPropertyDifferent({[lang]: names2[requestLanguage]}, names1, lang) ){ return false; } } @@ -138,6 +139,57 @@ function isNameDifferent(item1, item2, requestLanguage){ return true; } +// convenience function to return a GID (or null in case of an error) +// note: supports scalar values else takes the first element of an Array. +const GID = (item) => { + const parts = _.compact([ + _.first(_.castArray(_.get(item, 'source'))), + _.first(_.castArray(_.get(item, 'layer'))), + _.first(_.castArray(_.get(item, 'source_id'))) + ]); + + return (parts.length === 3) ? parts.join(':') : null; +}; + +// convenience function to return a *parent GID (or null in case of an error) +// note: supports scalar values else takes the first element of an Array. +// note: defaults the 'source' to 'whosonfirst' for backwards compatibility +const parentGID = (item, layer) => { + const source = _.first(_.compact(_.castArray(_.get(item, ['parent', `${layer}_source`])))); + const source_id = _.get(item, ['parent', `${layer}_id`]); + + return GID({ layer, source: (source || 'whosonfirst'), source_id }); +}; + +/** + * Compare the item properties and return true for any of the following predicates, else false + * - IsEqual -- The two items are save the same source, source_id and layer + * - IsEquivalant -- The two items are otherwise equalivant (as per corresponding code comments) + */ +function isEquivalentIdentity(item1, item2) { + + // Both items must be on the same layer + if (item1.layer !== item2.layer) { return false; } + + // Generate a GID value for each item + const gid1 = GID(item1); + const gid2 = GID(item2); + + // Invalid GID(s) + if (gid1 === null || gid2 === null) { return false; } + + // Equal GIDs + if (gid1 === gid2) { return true; } + + // Equivalant GID ($item1 == $item2.parent[$item1.layer]) + if (gid1 === parentGID(item2, item1.layer)) { return true; } + + // Equivalant GID ($item2 = $item1.parent[$item2.layer]) + if (gid2 === parentGID(item1, item2.layer)) { return true; } + + return false; +} + /** * Compare the address_parts properties if they exist. * Returns false if the objects are the same, else true. @@ -174,6 +226,7 @@ function isAddressDifferent(item1, item2){ * Optionally provide $requestLanguage (req.clean.lang.iso6393) to improve name deduplication. */ function isDifferent(item1, item2, requestLanguage){ + if( isEquivalentIdentity( item1, item2 ) ){ return false; } if( isLayerDifferent( item1, item2 ) ){ return true; } if( isParentHierarchyDifferent( item1, item2 ) ){ return true; } if( isNameDifferent( item1, item2, requestLanguage ) ){ return true; } @@ -245,3 +298,4 @@ module.exports.isDifferent = isDifferent; module.exports.layerPreferences = layerPreferences; module.exports.isNameDifferent = isNameDifferent; module.exports.normalizeString = normalizeString; +module.exports.isEquivalentIdentity = isEquivalentIdentity; diff --git a/test/unit/helper/diffPlaces.js b/test/unit/helper/diffPlaces.js index 2304ad469..cfe00f5c6 100644 --- a/test/unit/helper/diffPlaces.js +++ b/test/unit/helper/diffPlaces.js @@ -1,6 +1,7 @@ const isDifferent = require('../../../helper/diffPlaces').isDifferent; const isNameDifferent = require('../../../helper/diffPlaces').isNameDifferent; const normalizeString = require('../../../helper/diffPlaces').normalizeString; +const isEquivalentIdentity = require('../../../helper/diffPlaces').isEquivalentIdentity; module.exports.tests = {}; @@ -539,6 +540,94 @@ module.exports.tests.isNameDifferent = function (test, common) { }); }; +module.exports.tests.isEquivalentIdentity = function (test, common) { + test('basic equivalence', function (t) { + t.true(isEquivalentIdentity( + { source: 'test', source_id: '1', layer: 'example' }, + { source: 'test', source_id: '1', layer: 'example' } + ), 'same source, source_id and layer'); + + t.false(isEquivalentIdentity( + { source: 'test', source_id: '1', layer: 'example' }, + { source: 'foo', source_id: '1', layer: 'example' } + ), 'different source'); + + t.false(isEquivalentIdentity( + { source: 'test', source_id: '1', layer: 'example' }, + { source: 'test', source_id: '2', layer: 'example' } + ), 'different source_id'); + + // two records sharing the same source, source_id but with + // differing layer are considered not equal. + // in practice this should be rare/never happen but if it + // becomes an issue would could consider making this stricter. + t.false(isEquivalentIdentity( + { source: 'test', source_id: '1', layer: 'example' }, + { source: 'test', source_id: '1', layer: 'foo' } + ), 'same source, source_id and layer'); + + // if either record fails to generate a valid GID then they are + // considered not equivalent. + t.false(isEquivalentIdentity( + { source: 'test', source_id: '1' }, + { source: 'test', source_id: '1', layer: 'example' } + ), 'invalid GID'); + + t.end(); + }); + + test('equivalence via parent hierarchy', function (t) { + t.true(isEquivalentIdentity( + { source: 'foo', source_id: '1', layer: 'example' }, + { source: 'bar', source_id: '2', layer: 'example', parent: { + example_id: '1', + example_source: 'foo' + } + }), 'match parent GID'); + + t.false(isEquivalentIdentity( + { source: 'foo', source_id: '1', layer: 'example' }, + { source: 'bar', source_id: '2', layer: 'example', parent: { + foo_id: '1', + foo_source: 'foo' + } + }), 'parent name must be from same layer'); + + t.false(isEquivalentIdentity( + { source: 'foo', source_id: '1', layer: 'example' }, + { source: 'bar', source_id: '2', layer: 'example', parent: { + foo_id: '1', + example_source: 'foo' + } + }), 'parent name must have the same source_id'); + + t.false(isEquivalentIdentity( + { source: 'foo', source_id: '1', layer: 'example' }, + { source: 'bar', source_id: '2', layer: 'example', parent: { + example_id: '1', + example_source: 'not_foo' + } + }), 'parent name must have the same source'); + + t.true(isEquivalentIdentity( + { source: 'whosonfirst', source_id: '1', layer: 'example' }, + { source: 'test', source_id: '2', layer: 'example', parent: { + example_id: '1' + } + }), 'parent source defaults to "whosonfirst"'); + + t.true(isEquivalentIdentity( + { source: 'whosonfirst', source_id: '1', layer: 'example' }, + { source: 'test', source_id: '2', layer: 'example', parent: { + example_id: ['1'], + example_source: [null] + } + }), 'parent source defaults to "whosonfirst" (arrays)'); + + t.end(); + }); +}; + module.exports.tests.normalizeString = function (test, common) { test('lowercase', function (t) { t.equal(normalizeString('Foo Bar'), 'foo bar');