Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dedupe Geonames records with WOF concordances #1606

Merged
merged 2 commits into from
Mar 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions helper/diffPlaces.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ const unicode = require('./unicode');
const placeTypes = require('./placeTypes');
const canonicalLayers = require('../helper/type_mapping').getCanonicalLayers();
const field = require('../helper/fieldValue');
const codec = require('pelias-model').codec;

// only consider these layers as synonymous for deduplication purposes.
// when performing inter-layer deduping, layers coming earlier in this list take
Expand Down Expand Up @@ -186,11 +187,42 @@ function isAddressDifferent(item1, item2){
return false;
}

/**
 * Check whether a Who's on First record and a Geonames record are linked by
 * the `gn:id` concordance stored on the Who's on First record.
 *
 * The check only applies when one of the two items has source 'whosonfirst'
 * and the other has source 'geonames' (in either order). The concordances are
 * read from `addendum.concordances` on the WOF record and decoded with the
 * pelias-model codec.
 *
 * @param {object} item1 - first record being compared
 * @param {object} item2 - second record being compared
 * @returns {boolean} true only when the decoded `gn:id` is a number whose
 *   string form equals the Geonames record's `source_id`; false in every
 *   other case, including missing or undecodable concordances.
 */
function isGeonamesConcordanceSame(item1, item2) {
  const items = [item1, item2];

  const wof_record = items.find(i => i.source === 'whosonfirst');
  const gn_record = items.find(i => i.source === 'geonames');

  // must have found one wof and one gn record or this check does not apply
  if (!wof_record || !gn_record) { return false; }

  const concordances = _.get(wof_record, 'addendum.concordances');

  if (!concordances) {
    return false;
  }

  let json;
  try {
    json = codec.decode(concordances);
  } catch (e) {
    // an undecodable payload means there is no usable concordance; this is an
    // optional shortcut, so fall through to the regular checks in the caller
    // rather than aborting the whole comparison
    return false;
  }

  // a payload that decodes to null or a primitive carries no concordances
  if (json === null || typeof json !== 'object') { return false; }

  const concordance_id = json['gn:id'];

  if (!concordance_id || !_.isNumber(concordance_id)) { return false; }

  // only records with a matching concordance pass this check
  // (source_id is compared as a string, hence the toString())
  return concordance_id.toString() === gn_record.source_id;
}

/**
* Compare the two records and return true if they differ and false if same.
* Optionally provide $requestLanguage (req.clean.lang.iso6393) to improve name deduplication.
*/
function isDifferent(item1, item2, requestLanguage){
// records that share a geonames concordance are the same, regardless of any other checks
if( isGeonamesConcordanceSame( item1, item2 ) ){ return false; }

if( isLayerDifferent( item1, item2 ) ){ return true; }
if( isParentHierarchyDifferent( item1, item2 ) ){ return true; }
if( isNameDifferent( item1, item2, requestLanguage ) ){ return true; }
Expand Down
25 changes: 25 additions & 0 deletions test/unit/helper/diffPlaces.js
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,31 @@ module.exports.tests.layerDependentNormalization = function (test, common) {
});
};

// Deduplication of Geonames records against Who's on First records that
// carry a matching `gn:id` concordance.
module.exports.tests.geonames = function (test, common) {
  test('geonames record with concordance is the same, regardless of anything else', (t) => {
    // WOF record whose encoded concordances point at Geonames id 123
    const wof_record = {
      source: 'whosonfirst',
      source_id: '345',
      name: { default: 'Different name' },
      addendum: { concordances: '{ "gn:id": 123 }' }
    };

    // Geonames record with that id but a deliberately different name
    const gn_record = {
      source: 'geonames',
      source_id: '123',
      name: { default: 'One name' }
    };

    const differs = isDifferent(gn_record, wof_record);

    t.false(differs, 'should be the same based on concordance');
    t.end();
  });
};

module.exports.all = function (tape, common) {

function test(name, testFunction) {
Expand Down