Skip to content

Commit

Permalink
feat(dedupe): Check Geonames<->WOF concordances
Browse files Browse the repository at this point in the history
These concordances can be trusted over any other signals and really help
us remove lots of bad Geonames data.
  • Loading branch information
orangejulius committed Mar 4, 2022
1 parent bc53aee commit e305b87
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 0 deletions.
33 changes: 33 additions & 0 deletions helper/diffPlaces.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ const unicode = require('./unicode');
const placeTypes = require('./placeTypes');
const canonicalLayers = require('../helper/type_mapping').getCanonicalLayers();
const field = require('../helper/fieldValue');
const codec = require('pelias-model').codec;

// only consider these layers as synonymous for deduplication purposes.
// when performing inter-layer deduping, layers coming earlier in this list take
Expand Down Expand Up @@ -169,11 +170,43 @@ function isAddressDifferent(item1, item2){
return false;
}

/**
 * Check whether a Geonames record and a Who's On First (WOF) record are linked
 * through the 'gn:id' concordance stored on the WOF record.
 *
 * The check only applies when exactly one item has source 'geonames' and the
 * other has source 'whosonfirst' (in either order). The WOF record's
 * `addendum.concordances` value is decoded with `codec.decode` and its numeric
 * 'gn:id' is compared, as a string, with the Geonames record's `source_id`.
 *
 * @param {object} item1 - first record being compared
 * @param {object} item2 - second record being compared
 * @returns {boolean} true only when the WOF concordance points at the Geonames
 *   record; false when the check does not apply, the concordances are missing
 *   or cannot be decoded, or the ids do not match.
 */
function isGeonamesConcordanceSame(item1, item2) {
  let wof_record;
  let gn_record;

  if (item1.source === 'geonames' && item2.source === 'whosonfirst') {
    gn_record = item1;
    wof_record = item2;
  } else if (item2.source === 'geonames' && item1.source === 'whosonfirst') {
    gn_record = item2;
    wof_record = item1;
  } else {
    // could not match to one geonames and one wof record, so this check does not apply
    return false;
  }

  const concordances = _.get(wof_record, 'addendum.concordances');

  if (!concordances) {
    return false;
  }

  let json;
  try {
    json = codec.decode(concordances);
  } catch (e) {
    // Deliberate best-effort: a concordance payload that cannot be decoded must
    // not abort deduplication; fall through to the remaining comparisons instead.
    return false;
  }

  // a payload that decodes to null/undefined carries no concordances to inspect
  if (json === null || json === undefined) {
    return false;
  }

  const concordance_id = json['gn:id'];

  // only a non-zero numeric id is accepted; source_id is compared as a string
  if (concordance_id && typeof concordance_id === 'number' && concordance_id.toString() === gn_record.source_id) {
    return true;
  }

  return false;
}

/**
* Compare the two records and return true if they differ and false if same.
* Optionally provide $requestLanguage (req.clean.lang.iso6393) to improve name deduplication.
*/
function isDifferent(item1, item2, requestLanguage){
// records that share a geonames concordance are the same, regardless of any other checks
if( isGeonamesConcordanceSame( item1, item2 ) ){ return false; }

if( isLayerDifferent( item1, item2 ) ){ return true; }
if( isParentHierarchyDifferent( item1, item2 ) ){ return true; }
if( isNameDifferent( item1, item2, requestLanguage ) ){ return true; }
Expand Down
25 changes: 25 additions & 0 deletions test/unit/helper/diffPlaces.js
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,31 @@ module.exports.tests.normalizeString = function (test, common) {
});
};

// Geonames <-> Who's On First concordance handling in isDifferent().
module.exports.tests.geonames = function (test, common) {
  const title = 'geonames record with concordance is the same, regardless of anything else';

  test(title, (t) => {
    // The WOF fixture carries a 'gn:id' concordance (123) equal to the geonames
    // fixture's source_id ('123'), while the two default names deliberately differ.
    const wofFixture = {
      source: 'whosonfirst',
      source_id: '345',
      name: { default: 'Different name' },
      addendum: { concordances: '{ "gn:id": 123 }' }
    };
    const geonamesFixture = {
      source: 'geonames',
      source_id: '123',
      name: { 'default': 'One name' }
    };

    const differs = isDifferent(geonamesFixture, wofFixture);

    t.false(differs, 'should be the same based on concordance');
    t.end();
  });
};

module.exports.all = function (tape, common) {

function test(name, testFunction) {
Expand Down

0 comments on commit e305b87

Please sign in to comment.