Skip to content

Commit

Permalink
fix(dedupe): improved deduplication between USA ZIP vs ZIP+4 properties
Browse files Browse the repository at this point in the history
  • Loading branch information
missinglink committed Jul 12, 2024
1 parent 7457f15 commit a11659d
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 5 deletions.
29 changes: 24 additions & 5 deletions helper/diffPlaces.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,12 @@ function isLayerDifferent(item1, item2){
return false;
}

/**
 * return true if the item's parent country code equals the supplied code
 *
 * The raw `parent.country_a` value is read defensively (a missing item or
 * parent yields `undefined`) and handed to `field.getStringValue` before a
 * strict comparison against `code`.
 *
 * NOTE(review): `field.getStringValue` is defined outside this view; the unit
 * tests pass `country_a` as a one-element array, so it presumably unwraps
 * arrays to a single string -- confirm.
 *
 * @param {object} item - document which may carry `parent.country_a`
 * @param {string} code - country code to compare against, e.g. 'USA'
 * @returns {boolean}
 */
function isCountryCode(item, code) {
  const rawCountry = item?.parent?.country_a;
  const countryCode = field.getStringValue(rawCountry);
  return code === countryCode;
}

/**
 * return true if the item is a region-layer record whose country code is 'USA'
 *
 * The country check is delegated to `isCountryCode`, which reads
 * `parent.country_a` with optional chaining, so an item without a `parent`
 * no longer throws here. The previous inline check required `country_a` to be
 * an array and was followed by a second, unreachable `return`.
 *
 * @param {object} item - document with `layer` and (optionally) `parent.country_a`
 * @returns {boolean}
 */
function isUsState(item) {
  return isCountryCode(item, 'USA') && item.layer === 'region';
}

// Geonames records in the locality and localadmin layer are parented by themselves
Expand Down Expand Up @@ -206,7 +209,7 @@ function isAddressDifferent(item1, item2){
// only compare zip if both records have it, otherwise just ignore and assume it's the same
// since by this time we've already compared parent hierarchies
if( _.has(address1, 'zip') && _.has(address2, 'zip') ){
if( isPropertyDifferent(address1, address2, 'zip') ){ return true; }
if( isZipDifferent(item1, item2) ){ return true; }
}

return false;
Expand Down Expand Up @@ -255,10 +258,26 @@ function isDifferent(item1, item2, requestLanguage){
return false;
}

/**
 * return true if zip codes are different
 *
 * When both records are in the USA only the first word of the normalized zip
 * is compared, so a ZIP+4 (98036-6119) is treated as equal to its plain ZIP
 * (98036). For every other combination of countries the zips are compared
 * with the default normalizer of `isPropertyDifferent`.
 *
 * NOTE(review): this relies on `normalizeString` (defined outside this view)
 * separating the +4 suffix from the ZIP with a space -- confirm.
 *
 * @param {object} item1 - first record; its zip is read from `address_parts.zip`
 * @param {object} item2 - second record; its zip is read from `address_parts.zip`
 * @returns {boolean}
 */
function isZipDifferent(item1, item2) {
  const parts1 = _.get(item1, 'address_parts');
  const parts2 = _.get(item2, 'address_parts');

  const bothInUsa = isCountryCode(item1, 'USA') && isCountryCode(item2, 'USA');
  if (!bothInUsa) {
    return isPropertyDifferent(parts1, parts2, 'zip');
  }

  // handle USA ZIP+4 vs ZIP: keep only the leading word of the normalized value
  const leadingWord = (zip) => {
    const words = normalizeString(zip).split(' ');
    return _.first(words);
  };
  return isPropertyDifferent(parts1, parts2, 'zip', leadingWord);
}

/**
* return true if properties are different
*/
function isPropertyDifferent(item1, item2, prop ){
function isPropertyDifferent(item1, item2, prop, normalizer = normalizeString ){

// if neither item has prop, we consider them the same
if( !_.has(item1, prop) && !_.has(item2, prop) ){ return false; }
Expand All @@ -274,7 +293,7 @@ function isPropertyDifferent(item1, item2, prop ){
let prop1StringValue = field.getStringValue( prop1[i] );
for( let j=0; j<prop2.length; j++ ){
let prop2StringValue = field.getStringValue( prop2[j] );
if( normalizeString( prop1StringValue ) === normalizeString( prop2StringValue ) ){
if( normalizer( prop1StringValue ) === normalizer( prop2StringValue ) ){
return false;
}
}
Expand Down
52 changes: 52 additions & 0 deletions test/unit/helper/diffPlaces.js
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,58 @@ module.exports.tests.dedupe = function(test, common) {
t.end();
});

test('ZIP vs ZIP+4', function(t) {
  // build a USA address record that differs from its sibling only by zip
  const usaAddress = (zip) => ({
    'parent': {
      'country_a': ['USA']
    },
    'address_parts': {
      'number': '1',
      'street': 'Main Street',
      'zip': zip
    }
  });

  const plainZip = usaAddress('90210');
  const zipPlusFour = usaAddress('90210-1111');

  // a ZIP and a ZIP+4 sharing the same five digits are expected to dedupe
  t.false(isDifferent(plainZip, zipPlusFour), 'should be the same');
  t.end();
});

test('ZIP vs ZIP+4 functionality does not apply for non-USA documents', function(t) {
  // both records carry a non-USA country code, so the ZIP+4 leniency must not apply
  const item1 = {
    'parent': {
      'country_a': ['NOT']
    },
    'address_parts': {
      'number': '1',
      'street': 'Main Street',
      'zip': '90210'
    }
  };
  const item2 = {
    'parent': {
      'country_a': ['NOT']
    },
    'address_parts': {
      'number': '1',
      'street': 'Main Street',
      'zip': '90210-1111'
    }
  };

  // t.true asserts the records ARE different; the message now says so
  t.true(isDifferent(item1, item2), 'should be different');
  t.end();
});

test('completely empty objects', function(t) {
var item1 = {};
var item2 = {};
Expand Down

0 comments on commit a11659d

Please sign in to comment.