From 5619a125b1b7deba562ffbd692e272085ef0923b Mon Sep 17 00:00:00 2001 From: Julian Simioni Date: Mon, 28 Feb 2022 13:46:03 -0500 Subject: [PATCH] feat(dedupe): Handle Geonames 'City of' prefixes A common cause of deduplication errors is Geonames locality/localadmin records that start with 'City of'. Our name comparison logic is fairly conservative: it only looks at things like punctuation, diacriticals, etc. Otherwise, we have to treat differing names as meaning that the underlying records represent genuinely different places. Getting too far away from this general stance could be dangerous, but we can handle specific outliers just fine. Geonames records that start with 'City of' are one of these cases. Often, there is a Geonames `locality` record with just the name (like 'New York'), and then a Geonames `localadmin` record with the 'City of' prefix. Usually only one of those records will have a WOF concordance, so this is still helpful even combined with https://github.com/pelias/api/pull/1606 --- helper/diffPlaces.js | 32 ++++++++++++++++++++++++++++++-- test/unit/helper/diffPlaces.js | 15 +++++++++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/helper/diffPlaces.js b/helper/diffPlaces.js index 0094f212b..7105b66de 100644 --- a/helper/diffPlaces.js +++ b/helper/diffPlaces.js @@ -100,13 +100,41 @@ function isParentHierarchyDifferent(item1, item2){ }); } +/* Generate a 'name' value for comparison + * This includes normalizations for specific dataset features + */ +function nameForComparison(name) { + // recurse into object properties if this is an object + if (_.isPlainObject(name)) { + const new_object = {}; + Object.keys(name).forEach((key) => { + new_object[key] = nameForComparison(name[key]); + }); + + return new_object; + } + + // otherwise, only handle strings + if (!_.isString(name)) { + return name; + } + + const city_of_regex = new RegExp(/City of (.*)/, 'i'); + const matches = name.match(city_of_regex); + if (matches) { + 
return matches[1]; + } + + return name; +} + /** * Compare the name properties if they exist. * Returns false if the objects are the same, else true. */ function isNameDifferent(item1, item2, requestLanguage){ - let names1 = _.get(item1, 'name'); - let names2 = _.get(item2, 'name'); + let names1 = nameForComparison(_.get(item1, 'name')); + let names2 = nameForComparison(_.get(item2, 'name')); // check if these are plain 'ol javascript objects let isPojo1 = _.isPlainObject(names1); diff --git a/test/unit/helper/diffPlaces.js b/test/unit/helper/diffPlaces.js index 2304ad469..025d75f68 100644 --- a/test/unit/helper/diffPlaces.js +++ b/test/unit/helper/diffPlaces.js @@ -539,6 +539,21 @@ module.exports.tests.isNameDifferent = function (test, common) { }); }; +module.exports.tests.nameForcomparison = function (test, common) { + test('geonames City of', function (t) { + t.false(isNameDifferent( + { name: { default: 'City of New York' } }, + { name: { default: 'New York' } } + ), 'Geonames \'City of\' prefix is ignored'); + + t.false(isNameDifferent( + { name: { en: 'City of New York' } }, + { name: { default: 'New York' } } + ), 'Geonames \'City of\' prefix is ignored across languages'); + t.end(); + }); +}; + module.exports.tests.normalizeString = function (test, common) { test('lowercase', function (t) { t.equal(normalizeString('Foo Bar'), 'foo bar');