Skip to content

Commit

Permalink
feat(diff-identity): dedupe by comparing to identity of parent property
Browse files Browse the repository at this point in the history
  • Loading branch information
missinglink committed Nov 12, 2021
1 parent 35efea5 commit ec7def0
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 2 deletions.
58 changes: 56 additions & 2 deletions helper/diffPlaces.js
Original file line number Diff line number Diff line change
Expand Up @@ -124,20 +124,72 @@ function isNameDifferent(item1, item2, requestLanguage){
// iterate over all the languages in item2, comparing them to the
// 'default' name of item1 and also against the language requested by the user.
for( let lang in names2 ){
if (!names2.hasOwnProperty(lang)) { continue; } // do not iterate over inherited properties
if( !isPropertyDifferent({[lang]: names1.default}, names2, lang) ){ return false; }
if( requestLanguage && !isPropertyDifferent({[lang]: names1[requestLanguage]}, names2, lang) ){ return false; }
}

// iterate over all the languages in item1, comparing them to the
// 'default' name of item2 and also against the language requested by the user.
// as above, but inverse
for( let lang in names1 ){
if (!names1.hasOwnProperty(lang)) { continue; } // do not iterate over inherited properties
if( !isPropertyDifferent({[lang]: names2.default}, names1, lang) ){ return false; }
if( requestLanguage && !isPropertyDifferent({[lang]: names2[requestLanguage]}, names1, lang) ){ return false; }
}

return true;
}

// convenience function to return a GID (or null in case of an error)
// note: supports scalar values else takes the first element of an Array.
const GID = (item) => {
const parts = _.compact([
_.first(_.castArray(_.get(item, 'source'))),
_.first(_.castArray(_.get(item, 'layer'))),
_.first(_.castArray(_.get(item, 'source_id')))
]);

return (parts.length === 3) ? parts.join(':') : null;
};

// convenience function to return a *parent GID (or null in case of an error)
// note: supports scalar values else takes the first element of an Array.
// note: defaults the 'source' to 'whosonfirst' for backwards compatibility
const parentGID = (item, layer) => {
const source = _.first(_.compact(_.castArray(_.get(item, ['parent', `${layer}_source`]))));
const source_id = _.get(item, ['parent', `${layer}_id`]);

return GID({ layer, source: (source || 'whosonfirst'), source_id });
};

/**
* Compare the item properties and return true for any of the following predicates, else false
* - IsEqual -- The two items are save the same source, source_id and layer
* - IsEquivalant -- The two items are otherwise equalivant (as per corresponding code comments)
*/
function isEquivalentIdentity(item1, item2) {

// Both items must be on the same layer
if (item1.layer !== item2.layer) { return false; }

// Generate a GID value for each item
const gid1 = GID(item1);
const gid2 = GID(item2);

// Invalid GID(s)
if (gid1 === null || gid2 === null) { return false; }

// Equal GIDs
if (gid1 === gid2) { return true; }

// Equivalant GID ($item1 == $item2.parent[$item1.layer])
if (gid1 === parentGID(item2, item1.layer)) { return true; }

// Equivalant GID ($item2 = $item1.parent[$item2.layer])
if (gid2 === parentGID(item1, item2.layer)) { return true; }

return false;
}

/**
* Compare the address_parts properties if they exist.
* Returns false if the objects are the same, else true.
Expand Down Expand Up @@ -174,6 +226,7 @@ function isAddressDifferent(item1, item2){
* Optionally provide $requestLanguage (req.clean.lang.iso6393) to improve name deduplication.
*/
function isDifferent(item1, item2, requestLanguage){
if( isEquivalentIdentity( item1, item2 ) ){ return false; }
if( isLayerDifferent( item1, item2 ) ){ return true; }
if( isParentHierarchyDifferent( item1, item2 ) ){ return true; }
if( isNameDifferent( item1, item2, requestLanguage ) ){ return true; }
Expand Down Expand Up @@ -245,3 +298,4 @@ module.exports.isDifferent = isDifferent;
module.exports.layerPreferences = layerPreferences;
module.exports.isNameDifferent = isNameDifferent;
module.exports.normalizeString = normalizeString;
module.exports.isEquivalentIdentity = isEquivalentIdentity;
89 changes: 89 additions & 0 deletions test/unit/helper/diffPlaces.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
const isDifferent = require('../../../helper/diffPlaces').isDifferent;
const isNameDifferent = require('../../../helper/diffPlaces').isNameDifferent;
const normalizeString = require('../../../helper/diffPlaces').normalizeString;
const isEquivalentIdentity = require('../../../helper/diffPlaces').isEquivalentIdentity;

module.exports.tests = {};

Expand Down Expand Up @@ -539,6 +540,94 @@ module.exports.tests.isNameDifferent = function (test, common) {
});
};

module.exports.tests.isEquivalentIdentity = function (test, common) {
test('basic equivalence', function (t) {
t.true(isEquivalentIdentity(
{ source: 'test', source_id: '1', layer: 'example' },
{ source: 'test', source_id: '1', layer: 'example' }
), 'same source, source_id and layer');

t.false(isEquivalentIdentity(
{ source: 'test', source_id: '1', layer: 'example' },
{ source: 'foo', source_id: '1', layer: 'example' }
), 'different source');

t.false(isEquivalentIdentity(
{ source: 'test', source_id: '1', layer: 'example' },
{ source: 'test', source_id: '2', layer: 'example' }
), 'different source_id');

// two records sharing the same source, source_id but with
// differing layer are considered not equal.
// in practice this should be rare/never happen but if it
// becomes an issue would could consider making this stricter.
t.false(isEquivalentIdentity(
{ source: 'test', source_id: '1', layer: 'example' },
{ source: 'test', source_id: '1', layer: 'foo' }
), 'same source, source_id and layer');

// if either record fails to generate a valid GID then they are
// considered not equivalent.
t.false(isEquivalentIdentity(
{ source: 'test', source_id: '1' },
{ source: 'test', source_id: '1', layer: 'example' }
), 'invalid GID');

t.end();
});

test('equivalence via parent hierarchy', function (t) {
t.true(isEquivalentIdentity(
{ source: 'foo', source_id: '1', layer: 'example' },
{ source: 'bar', source_id: '2', layer: 'example', parent: {
example_id: '1',
example_source: 'foo'
}
}), 'match parent GID');

t.false(isEquivalentIdentity(
{ source: 'foo', source_id: '1', layer: 'example' },
{ source: 'bar', source_id: '2', layer: 'example', parent: {
foo_id: '1',
foo_source: 'foo'
}
}), 'parent name must be from same layer');

t.false(isEquivalentIdentity(
{ source: 'foo', source_id: '1', layer: 'example' },
{ source: 'bar', source_id: '2', layer: 'example', parent: {
foo_id: '1',
example_source: 'foo'
}
}), 'parent name must have the same source_id');

t.false(isEquivalentIdentity(
{ source: 'foo', source_id: '1', layer: 'example' },
{ source: 'bar', source_id: '2', layer: 'example', parent: {
example_id: '1',
example_source: 'not_foo'
}
}), 'parent name must have the same source');

t.true(isEquivalentIdentity(
{ source: 'whosonfirst', source_id: '1', layer: 'example' },
{ source: 'test', source_id: '2', layer: 'example', parent: {
example_id: '1'
}
}), 'parent source defaults to "whosonfirst"');

t.true(isEquivalentIdentity(
{ source: 'whosonfirst', source_id: '1', layer: 'example' },
{ source: 'test', source_id: '2', layer: 'example', parent: {
example_id: ['1'],
example_source: [null]
}
}), 'parent source defaults to "whosonfirst" (arrays)');

t.end();
});
};

module.exports.tests.normalizeString = function (test, common) {
test('lowercase', function (t) {
t.equal(normalizeString('Foo Bar'), 'foo bar');
Expand Down

0 comments on commit ec7def0

Please sign in to comment.