From 7d28b15f3cebee2d5739266f82cb3f0daa0f17f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Fio=C5=82ka?= <35661939+JanF01@users.noreply.github.com> Date: Tue, 15 Aug 2023 10:44:01 +0200 Subject: [PATCH] feat(poland): classifying more addresses for poland (#174) --- classifier/PlaceClassifier.js | 2 +- classifier/PostcodeClassifier.js | 2 +- classifier/StreetPrefixClassifier.js | 2 +- classifier/StreetSuffixClassifier.js | 2 +- classifier/scheme/street.js | 38 +++++++++++++++++++ .../dictionaries/libpostal/pl/place_names.txt | 1 + .../dictionaries/libpostal/pl/synonyms.txt | 1 + test/address.pol.test.js | 22 +++++++++++ 8 files changed, 66 insertions(+), 4 deletions(-) create mode 100644 resources/pelias/dictionaries/libpostal/pl/place_names.txt create mode 100644 resources/pelias/dictionaries/libpostal/pl/synonyms.txt diff --git a/classifier/PlaceClassifier.js b/classifier/PlaceClassifier.js index 67967896..9820b6bc 100644 --- a/classifier/PlaceClassifier.js +++ b/classifier/PlaceClassifier.js @@ -9,7 +9,7 @@ class PlaceClassifier extends WordClassifier { setup () { // load index tokens this.index = {} - libpostal.load(this.index, ['fr', 'de', 'en'], 'place_names.txt') + libpostal.load(this.index, ['fr', 'de', 'en', 'pl'], 'place_names.txt') libpostal.generatePlurals(this.index) } diff --git a/classifier/PostcodeClassifier.js b/classifier/PostcodeClassifier.js index c263c30e..5352c2d4 100644 --- a/classifier/PostcodeClassifier.js +++ b/classifier/PostcodeClassifier.js @@ -10,7 +10,7 @@ const dictPath = path.join(__dirname, `../resources/chromium-i18n/ssl-address`) // const countryCodes = fs.readdirSync(dictPath) // .filter(p => p.endsWith('.json')) // .map(p => p.split('.')[0]) -const countryCodes = ['us', 'gb', 'fr', 'de', 'es', 'pt', 'au', 'nz', 'kr', 'jp', 'in', 'ru', 'br', 'nl'] +const countryCodes = ['us', 'gb', 'fr', 'de', 'es', 'pt', 'au', 'nz', 'kr', 'jp', 'in', 'ru', 'br', 'nl', 'pl'] class PostcodeClassifier extends WordClassifier { setup () { diff --git a/classifier/StreetPrefixClassifier.js b/classifier/StreetPrefixClassifier.js index a0f8bde4..fd6a8fbd 100644 --- a/classifier/StreetPrefixClassifier.js +++ b/classifier/StreetPrefixClassifier.js @@ -7,7 +7,7 @@ const libpostal = require('../resources/libpostal/libpostal') // prefix languages // languages which use a street prefix instead of a suffix -const prefix = ['fr', 'ca', 'es', 'pt', 'ro'] +const prefix = ['fr', 'ca', 'es', 'pt', 'ro', 'pl'] class StreetPrefixClassifier extends WordClassifier { setup () { diff --git a/classifier/StreetSuffixClassifier.js b/classifier/StreetSuffixClassifier.js index 6f34bc55..0f3fb33b 100644 --- a/classifier/StreetSuffixClassifier.js +++ b/classifier/StreetSuffixClassifier.js @@ -7,7 +7,7 @@ const libpostal = require('../resources/libpostal/libpostal') // prefix languages // languages which use a street prefix instead of a suffix -const prefix = ['fr', 'ca', 'es', 'pt', 'ro'] +const prefix = ['fr', 'ca', 'es', 'pt', 'ro', 'pl'] class StreetSuffixClassifier extends WordClassifier { setup () { diff --git a/classifier/scheme/street.js b/classifier/scheme/street.js index 9df7ae43..d06f1a89 100644 --- a/classifier/scheme/street.js +++ b/classifier/scheme/street.js @@ -216,6 +216,44 @@ module.exports = [ } ] }, + { + // Aleja Wojska Polskiego + confidence: 0.91, + Class: StreetClassification, + scheme: [ + { + is: ['StreetPrefixClassification'], + not: ['StreetClassification', 'IntersectionClassification'] + }, + { + is: ['PlaceClassification'], + not: ['StreetClassification', 'IntersectionClassification'] + }, + { + is: ['AlphaClassification', 'GivenNameClassification', 'PersonClassification'], + not: ['StreetClassification', 'StreetPrefixClassification'] + } + ] + }, + { + // Aleja 11 Listopada + confidence: 0.84, + Class: StreetClassification, + scheme: [ + { + is: ['StreetPrefixClassification'], + not: ['StreetClassification', 'IntersectionClassification'] + }, + { + is: ['NumericClassification'], + not: ['StreetClassification', 'IntersectionClassification'] + }, + { + is: ['AlphaClassification', 'GivenNameClassification', 'PersonClassification'], + not: ['StreetClassification', 'StreetPrefixClassification'] + } + ] + }, { // Boulevard du Général Charles De Gaulle confidence: 0.81, diff --git a/resources/pelias/dictionaries/libpostal/pl/place_names.txt b/resources/pelias/dictionaries/libpostal/pl/place_names.txt new file mode 100644 index 00000000..528e8d14 --- /dev/null +++ b/resources/pelias/dictionaries/libpostal/pl/place_names.txt @@ -0,0 +1 @@ +wojsko|wojska|wojsk diff --git a/resources/pelias/dictionaries/libpostal/pl/synonyms.txt b/resources/pelias/dictionaries/libpostal/pl/synonyms.txt new file mode 100644 index 00000000..12a0f279 --- /dev/null +++ b/resources/pelias/dictionaries/libpostal/pl/synonyms.txt @@ -0,0 +1 @@ +polskiego|polski|pol diff --git a/test/address.pol.test.js b/test/address.pol.test.js index 981b3f48..19c52b48 100644 --- a/test/address.pol.test.js +++ b/test/address.pol.test.js @@ -5,6 +5,28 @@ const testcase = (test, common) => { { street: 'Szewska' }, { housenumber: '6' }, { locality: 'Kraków' } ]) + + assert('aleja Wojska Polskiego 178', [ + { street: 'aleja Wojska Polskiego' }, { housenumber: '178' } + ]) + + assert('aleja 29 listopada 11', [ + { street: 'aleja 29 listopada' }, { housenumber: '11' } + ]) + + assert('aleja Wojska 178', [ + { street: 'aleja Wojska' }, { housenumber: '178' } + ]) + + assert('Ulica Strzelecka 12, Nowy Sącz', [ + { street: 'Ulica Strzelecka' }, { housenumber: '12' }, + { locality: 'Nowy Sącz' } + ]) + + assert('Żorska 11, 47-400', [ + { street: 'Żorska' }, { housenumber: '11' }, + { postcode: '47-400' } + ]) } module.exports.all = (tape, common) => {