From c2490d1ed5e8aa4ca360894865e7c923c30fed98 Mon Sep 17 00:00:00 2001 From: cquest Date: Wed, 14 Feb 2018 23:16:59 +0100 Subject: [PATCH 01/28] more cleanup BP/CS/TSA/CIDEX/CEDEX --- addok_france/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index 779ab89..c3b01eb 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -51,8 +51,9 @@ def clean_query(q): q = re.sub(r'([\d]{5})', r' \1 ', q, flags=re.IGNORECASE) + q = re.sub(r'(^| )(b\.?p\.?|cs|tsa|cidex) *(n *|no *|)[\d]* ?', '', q, flags=re.IGNORECASE) + q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2', q, flags=re.IGNORECASE) q = re.sub('c(e|é)dex ?[\d]*', '', q, flags=re.IGNORECASE) - q = re.sub(r'\b(bp|cs|tsa|cidex) *[\d]*', '', q, flags=re.IGNORECASE) q = re.sub('\d{,2}(e|[eè]me) ([eé]tage)', '', q, flags=re.IGNORECASE) q = re.sub(' {2,}', ' ', q, flags=re.IGNORECASE) q = re.sub('[ -]s/[ -]', ' sur ', q, flags=re.IGNORECASE) From 253cb541fdd32defb4070ce496ecaf15d42ebf74 Mon Sep 17 00:00:00 2001 From: cquest Date: Wed, 14 Feb 2018 23:35:42 +0100 Subject: [PATCH 02/28] =?UTF-8?q?cleanup=20BP/CS/TSA/CIDEX=20N=C2=B0=20+?= =?UTF-8?q?=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- addok_france/utils.py | 5 ++++- tests/test_utils.py | 20 +++++++++++++++----- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index c3b01eb..093fbce 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -51,8 +51,11 @@ def clean_query(q): q = re.sub(r'([\d]{5})', r' \1 ', q, flags=re.IGNORECASE) - q = re.sub(r'(^| )(b\.?p\.?|cs|tsa|cidex) *(n *|no *|)[\d]* ?', '', q, flags=re.IGNORECASE) + print(q) + q = re.sub(r'(^| )(b\.?p\.?|cs|tsa|cidex) *(n(o|°|) *|)[\d]+ *', r'\1', q, flags=re.IGNORECASE) + print(q) q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2', q, flags=re.IGNORECASE) + print(q) q = re.sub('c(e|é)dex ?[\d]*', '', q, flags=re.IGNORECASE) q = re.sub('\d{,2}(e|[eè]me) ([eé]tage)', '', q, flags=re.IGNORECASE) q = re.sub(' {2,}', ' ', q, flags=re.IGNORECASE) diff --git a/tests/test_utils.py b/tests/test_utils.py index 969ee91..9ea13cc 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -13,13 +13,17 @@ @pytest.mark.parametrize("input,expected", [ ("2 allée Jules Guesde 31068 TOULOUSE CEDEX 7", - "2 allée Jules Guesde 31068 TOULOUSE"), + "2 allée Jules Guesde 31 TOULOUSE"), ("7, avenue Léon-Blum 31507 Toulouse Cedex 5", - "7, avenue Léon-Blum 31507 Toulouse"), + "7, avenue Léon-Blum 31 Toulouse"), ("159, avenue Jacques-Douzans 31604 Muret Cedex", - "159, avenue Jacques-Douzans 31604 Muret"), + "159, avenue Jacques-Douzans 31 Muret"), ("2 allée Jules Guesde BP 7015 31068 TOULOUSE", "2 allée Jules Guesde 31068 TOULOUSE"), + ("2 allée Jules Guesde B.P. 7015 31068 TOULOUSE", + "2 allée Jules Guesde 31068 TOULOUSE"), + ("2 allée Jules Guesde B.P. N 7015 31068 TOULOUSE", + "2 allée Jules Guesde 31068 TOULOUSE"), ("BP 80111 159, avenue Jacques-Douzans 31604 Muret", "159, avenue Jacques-Douzans 31604 Muret"), ("12, place de l'Hôtel-de-Ville BP 46 02150 Sissonne", @@ -27,7 +31,7 @@ ("6, rue Winston-Churchill CS 40055 60321 Compiègne", "6, rue Winston-Churchill 60321 Compiègne"), ("BP 80111 159, avenue Jacques-Douzans 31604 Muret Cedex", - "159, avenue Jacques-Douzans 31604 Muret"), + "159, avenue Jacques-Douzans 31 Muret"), ("BP 20169 Cite administrative - 8e étage Rue Gustave-Delory 59017 Lille", "Cite administrative - Rue Gustave-Delory 59017 Lille"), ("12e étage Rue Gustave-Delory 59017 Lille", @@ -52,9 +56,15 @@ ("32bis Rue des Vosges93290", "32bis Rue des Vosges 93290"), ("20 avenue de Ségur TSA 30719 75334 Paris Cedex 07", - "20 avenue de Ségur 75334 Paris"), + "20 avenue de Ségur 75 Paris"), + ("20 avenue de Ségur TSA No30719 75334 Paris Cedex 07", + "20 avenue de Ségur 75 Paris"), + ("20 avenue de Ségur TSA N 30719 75334 Paris Cedex 07", + "20 avenue de Ségur 75 Paris"), ("20 rue saint germain CIDEX 304 89110 Poilly-sur-tholon", "20 rue saint germain 89110 Poilly-sur-tholon"), + ("20 rue saint germain CIDEX N°304 89110 Poilly-sur-tholon", + "20 rue saint germain 89110 Poilly-sur-tholon"), ]) def test_clean_query(input, expected): assert clean_query(input) == expected From 3f75ccfc8c15d27a70e602f60ef29d03fa154fa2 Mon Sep 17 00:00:00 2001 From: cquest Date: Wed, 14 Feb 2018 23:36:22 +0100 Subject: [PATCH 03/28] do not break on queries like "12bis" --- addok_france/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index 093fbce..28efe54 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -120,14 +120,17 @@ def flag_housenumber(tokens): def fold_ordinal(s): """3bis => 3b.""" - if s[0].isdigit() and not s.isdigit(): + if s is not None and s !='' and s[0].isdigit() and not s.isdigit(): try: number, ordinal = FOLD_PATTERN.findall(s)[0] except (IndexError, ValueError): pass else: - s = s.update('{}{}'.format(number, + try: + s = s.update('{}{}'.format(number, FOLD.get(ordinal.lower(), ordinal))) + except: + pass return s From 6d59df33d5dfb4b7fc09da4f4c2abf2844c46701 Mon Sep 17 00:00:00 2001 From: cquest Date: Wed, 14 Feb 2018 23:38:36 +0100 Subject: [PATCH 04/28] print() removed --- addok_france/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index 28efe54..265cc01 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -51,11 +51,8 @@ def clean_query(q): q = re.sub(r'([\d]{5})', r' \1 ', q, flags=re.IGNORECASE) - print(q) q = re.sub(r'(^| )(b\.?p\.?|cs|tsa|cidex) *(n(o|°|) *|)[\d]+ *', r'\1', q, flags=re.IGNORECASE) - print(q) q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2', q, flags=re.IGNORECASE) - print(q) q = re.sub('c(e|é)dex ?[\d]*', '', q, flags=re.IGNORECASE) q = re.sub('\d{,2}(e|[eè]me) ([eé]tage)', '', q, flags=re.IGNORECASE) q = re.sub(' {2,}', ' ', q, flags=re.IGNORECASE) From 4e5be21774d2f9b437119ff15160f7652a9be5d0 Mon Sep 17 00:00:00 2001 From: cquest Date: Wed, 14 Feb 2018 23:58:20 +0100 Subject: [PATCH 05/28] cleanup phone/fax numbers --- addok_france/utils.py | 3 ++- tests/test_utils.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index 265cc01..205594a 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -50,11 +50,12 @@ def clean_query(q): - q = re.sub(r'([\d]{5})', r' \1 ', q, flags=re.IGNORECASE) q = re.sub(r'(^| )(b\.?p\.?|cs|tsa|cidex) *(n(o|°|) *|)[\d]+ *', r'\1', q, flags=re.IGNORECASE) q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2', q, flags=re.IGNORECASE) + q = re.sub(r'([^\d ])([\d]{5})([^\d]|$)', r'\1 \2 ', q, flags=re.IGNORECASE) q = re.sub('c(e|é)dex ?[\d]*', '', q, flags=re.IGNORECASE) q = re.sub('\d{,2}(e|[eè]me) ([eé]tage)', '', q, flags=re.IGNORECASE) + q = re.sub(r'((fax|t[eé]l|t[eé]l[eé]copieur)[ :,\.]*|)(\d{10}|[0-9][0-9][ -\./]\d\d[-\./ ]\d\d[-\./ ]\d\d[-\./ ]\d\d)', '', q, flags=re.IGNORECASE) q = re.sub(' {2,}', ' ', q, flags=re.IGNORECASE) q = re.sub('[ -]s/[ -]', ' sur ', q, flags=re.IGNORECASE) q = re.sub('[ -]s/s[ -]', ' sous ', q, flags=re.IGNORECASE) diff --git a/tests/test_utils.py b/tests/test_utils.py index 9ea13cc..5e4ce00 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -65,6 +65,16 @@ "20 rue saint germain 89110 Poilly-sur-tholon"), ("20 rue saint germain CIDEX N°304 89110 Poilly-sur-tholon", "20 rue saint germain 89110 Poilly-sur-tholon"), + ("20 rue saint germain 89110 Poilly-sur-tholon 01.23.45.67.89", + "20 rue saint germain 89110 Poilly-sur-tholon"), + ("32bis Rue des Vosges93290 fax: 0123456789", + "32bis Rue des Vosges 93290"), + ("32bis Rue des Vosges 93290 tel 01 23 45 67 89", + "32bis Rue des Vosges 93290"), + ("32bis Rue des Vosges 93290 telecopieur. 01/23/45/67/89", + "32bis Rue des Vosges 93290"), + ("32bis Rue des Vosges 93290 télécopieur, 01-23-45-67-89", + "32bis Rue des Vosges 93290"), ]) def test_clean_query(input, expected): assert clean_query(input) == expected From 454ca5006542d430677727e4474297a23e63fba2 Mon Sep 17 00:00:00 2001 From: cquest Date: Thu, 15 Feb 2018 00:27:54 +0100 Subject: [PATCH 06/28] fold initiales: F F I > F F I FFI, etc --- addok_france/utils.py | 1 + tests/test_utils.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/addok_france/utils.py b/addok_france/utils.py index 205594a..28100b8 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -60,6 +60,7 @@ def clean_query(q): q = re.sub('[ -]s/[ -]', ' sur ', q, flags=re.IGNORECASE) q = re.sub('[ -]s/s[ -]', ' sous ', q, flags=re.IGNORECASE) q = re.sub('^lieux?[ -]?dits?\\b(?=.)', '', q, flags=re.IGNORECASE) + q = re.sub(r'(^| )(([A-Z]) ([A-Z]) (([A-Z]) )?(([A-Z]) )?(([A-Z])( |$))?)', r'\1\2\3\4\6\8\10 ', q, flags=re.IGNORECASE) q = q.strip() return q diff --git a/tests/test_utils.py b/tests/test_utils.py index 5e4ce00..e5f128d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -75,6 +75,8 @@ "32bis Rue des Vosges 93290"), ("32bis Rue des Vosges 93290 télécopieur, 01-23-45-67-89", "32bis Rue des Vosges 93290"), + ("10 BLD DES F F I 85300 CHALLANS", + "10 BLD DES F F I FFI 85300 CHALLANS"), ]) def test_clean_query(input, expected): assert clean_query(input) == expected From 045c8f03a734ef823b41c399a411e6354b2049c5 Mon Sep 17 00:00:00 2001 From: cquest Date: Thu, 15 Feb 2018 01:02:29 +0100 Subject: [PATCH 07/28] boite postale --- addok_france/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index 28100b8..f1640d2 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -50,7 +50,7 @@ def clean_query(q): - q = re.sub(r'(^| )(b\.?p\.?|cs|tsa|cidex) *(n(o|°|) *|)[\d]+ *', r'\1', q, flags=re.IGNORECASE) + q = re.sub(r'(^| )(boite postale|b\.?p\.?|cs|tsa|cidex) *(n(o|°|) *|)[\d]+ *', r'\1', q, flags=re.IGNORECASE) q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2', q, flags=re.IGNORECASE) q = re.sub(r'([^\d ])([\d]{5})([^\d]|$)', r'\1 \2 ', q, flags=re.IGNORECASE) q = re.sub('c(e|é)dex ?[\d]*', '', q, flags=re.IGNORECASE) From 2261c92108b33e999779ae6eca56ed851aa17b36 Mon Sep 17 00:00:00 2001 From: Christian Quest Date: Fri, 16 Feb 2018 18:56:47 +0100 Subject: [PATCH 08/28] WIP fold_initials "F F I" > "FFI" --- addok_france/__init__.py | 1 + addok_france/utils.py | 15 +++++++++++++++ tests/test_utils.py | 30 +++++++++++++++++++++++++++++- 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/addok_france/__init__.py b/addok_france/__init__.py index 2b20ccb..8f48d8c 100644 --- a/addok_france/__init__.py +++ b/addok_france/__init__.py @@ -14,6 +14,7 @@ extract_address = yielder(utils.extract_address) glue_ordinal = utils.glue_ordinal fold_ordinal = yielder(utils.fold_ordinal) +fold_initials = yielder(utils.fold_initials) flag_housenumber = utils.flag_housenumber make_labels = utils.make_labels remove_leading_zeros = yielder(utils.remove_leading_zeros) diff --git a/addok_france/utils.py b/addok_france/utils.py index f1640d2..4b4f94a 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -133,6 +133,21 @@ def fold_ordinal(s): return s +def fold_initials(tokens): + initials = [] + for _, token, next_ in neighborhood(tokens): + if len(token)==1: + initials.append(token) + else: + if len(initial)>2: + initials[0].update("".join(initials)) + yield initials[0] + else: + for tk in initials: + yield tk + yield token + + def remove_leading_zeros(s): """0003 => 3.""" # Limit digits from 1 to 3 in order to avoid processing postcodes. diff --git a/tests/test_utils.py b/tests/test_utils.py index e5f128d..445ae1f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,7 +8,7 @@ from addok.helpers.text import Token from addok_france.utils import (clean_query, extract_address, flag_housenumber, fold_ordinal, glue_ordinal, make_labels, - remove_leading_zeros) + remove_leading_zeros, fold_initials) @pytest.mark.parametrize("input,expected", [ @@ -353,3 +353,31 @@ def test_make_municipality_labels(config): '59000 Lille', 'Lille 59000', ] + + +@pytest.mark.parametrize("inputs,expected", [ + (['6', 'bis'], ['6bis']), + (['6'], ['6']), + (['6', 'avenue'], ['6', 'avenue']), + (['60', 'bis', 'avenue'], ['60bis', 'avenue']), + (['600', 'ter', 'avenue'], ['600ter', 'avenue']), + (['6', 'quinquies', 'avenue'], ['6quinquies', 'avenue']), + (['60', 'sexies', 'avenue'], ['60sexies', 'avenue']), + (['600', 'quater', 'avenue'], ['600quater', 'avenue']), + (['6', 's', 'avenue'], ['6s', 'avenue']), + (['60b', 'avenue'], ['60b', 'avenue']), + (['600', 'b', 'avenue'], ['600b', 'avenue']), + (['241', 'r', 'de'], ['241', 'r', 'de']), + (['120', 'r', 'renard'], ['120', 'r', 'renard']), + (['241', 'r', 'rue'], ['241r', 'rue']), + (['place', 'des', 'terreaux'], ['place', 'des', 'terreaux']), + (['rue', 'du', 'bis'], ['rue', 'du', 'bis']), +]) +@pytest.mark.parametrize("input,expected", [ + (['allee','a','b','c'], ['allee','abc']), + (['allee','a','b','c','toto'], ['allee','abc','toto']), + (['allee','a','b','c','d'], ['allee','abcd']), + (['allee','a','b','c','d','e'], ['allee','abcde']), +]) +def test_fold_initials(input, expected): + assert fold_initials(Token(input)) == expected From 5f11ac443104c8c90fe3c0825a5ffa8ad2409875 Mon Sep 17 00:00:00 2001 From: Christian Quest Date: Fri, 16 Feb 2018 19:48:27 +0100 Subject: [PATCH 09/28] fold_initials + tests --- addok_france/utils.py | 15 +++++++++------ tests/test_utils.py | 27 ++++++--------------------- 2 files changed, 15 insertions(+), 27 deletions(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index 4b4f94a..6697a78 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -127,24 +127,27 @@ def fold_ordinal(s): else: try: s = s.update('{}{}'.format(number, - FOLD.get(ordinal.lower(), ordinal))) + FOLD.get(ordinal.lower(), ordinal))) except: pass return s def fold_initials(tokens): + """ folds 'F F I' into 'FFI' """ initials = [] for _, token, next_ in neighborhood(tokens): - if len(token)==1: + isinitial = len(token) == 1 and token.isalpha() + if isinitial: initials.append(token) - else: - if len(initial)>2: - initials[0].update("".join(initials)) - yield initials[0] + if not next_ or not isinitial: + if len(initials) > 2: + yield initials[0].update("".join(initials)) else: for tk in initials: yield tk + initials = [] + if not isinitial: yield token diff --git a/tests/test_utils.py b/tests/test_utils.py index 445ae1f..c08f55b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -356,28 +356,13 @@ def test_make_municipality_labels(config): @pytest.mark.parametrize("inputs,expected", [ - (['6', 'bis'], ['6bis']), - (['6'], ['6']), - (['6', 'avenue'], ['6', 'avenue']), - (['60', 'bis', 'avenue'], ['60bis', 'avenue']), - (['600', 'ter', 'avenue'], ['600ter', 'avenue']), - (['6', 'quinquies', 'avenue'], ['6quinquies', 'avenue']), - (['60', 'sexies', 'avenue'], ['60sexies', 'avenue']), - (['600', 'quater', 'avenue'], ['600quater', 'avenue']), - (['6', 's', 'avenue'], ['6s', 'avenue']), - (['60b', 'avenue'], ['60b', 'avenue']), - (['600', 'b', 'avenue'], ['600b', 'avenue']), - (['241', 'r', 'de'], ['241', 'r', 'de']), - (['120', 'r', 'renard'], ['120', 'r', 'renard']), - (['241', 'r', 'rue'], ['241r', 'rue']), - (['place', 'des', 'terreaux'], ['place', 'des', 'terreaux']), - (['rue', 'du', 'bis'], ['rue', 'du', 'bis']), -]) -@pytest.mark.parametrize("input,expected", [ - (['allee','a','b','c'], ['allee','abc']), (['allee','a','b','c','toto'], ['allee','abc','toto']), + (['allee','a','b','c','toto','d','e','f'], ['allee','abc','toto','def']), + (['allee','a','2','c','toto'], ['allee','a','2','c','toto']), + (['allee','a','b','c'], ['allee','abc']), (['allee','a','b','c','d'], ['allee','abcd']), (['allee','a','b','c','d','e'], ['allee','abcde']), ]) -def test_fold_initials(input, expected): - assert fold_initials(Token(input)) == expected +def test_fold_initials(inputs, expected): + tokens = [Token(input_) for input_ in inputs] + assert list(fold_initials(tokens)) == expected From 330169a89d923e9bb2d39339d2661db178711389 Mon Sep 17 00:00:00 2001 From: Christian Quest Date: Fri, 16 Feb 2018 19:48:58 +0100 Subject: [PATCH 10/28] pep8 --- addok_france/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index 6697a78..ec2dd55 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -111,7 +111,7 @@ def flag_housenumber(tokens): found = False for previous, token, next_ in neighborhood(tokens): if ((token.is_first or (next_ and TYPES_PATTERN.match(next_))) - and NUMBER_PATTERN.match(token) and not found): + and NUMBER_PATTERN.match(token) and not found): token.kind = 'housenumber' found = True yield token @@ -119,7 +119,7 @@ def flag_housenumber(tokens): def fold_ordinal(s): """3bis => 3b.""" - if s is not None and s !='' and s[0].isdigit() and not s.isdigit(): + if s is not None and s != '' and s[0].isdigit() and not s.isdigit(): try: number, ordinal = FOLD_PATTERN.findall(s)[0] except (IndexError, ValueError): From fd5d9ee94f81cb0f13c8f61aea8303ab089885b1 Mon Sep 17 00:00:00 2001 From: Christian Quest Date: Fri, 16 Feb 2018 20:25:13 +0100 Subject: [PATCH 11/28] pep8 --- addok_france/utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index ec2dd55..ab6235a 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -50,9 +50,12 @@ def clean_query(q): - q = re.sub(r'(^| )(boite postale|b\.?p\.?|cs|tsa|cidex) *(n(o|°|) *|)[\d]+ *', r'\1', q, flags=re.IGNORECASE) - q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2', q, flags=re.IGNORECASE) - q = re.sub(r'([^\d ])([\d]{5})([^\d]|$)', r'\1 \2 ', q, flags=re.IGNORECASE) + q = re.sub(r'(^| )(boite postale|b\.?p\.?|cs|tsa|cidex) *(n(o|°|) *|)[\d]+ *', + r'\1', q, flags=re.IGNORECASE) + q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2', + q, flags=re.IGNORECASE) + q = re.sub(r'([^\d ])([\d]{5})([^\d]|$)', r'\1 \2 ', + q, flags=re.IGNORECASE) q = re.sub('c(e|é)dex ?[\d]*', '', q, flags=re.IGNORECASE) q = re.sub('\d{,2}(e|[eè]me) ([eé]tage)', '', q, flags=re.IGNORECASE) q = re.sub(r'((fax|t[eé]l|t[eé]l[eé]copieur)[ :,\.]*|)(\d{10}|[0-9][0-9][ -\./]\d\d[-\./ ]\d\d[-\./ ]\d\d[-\./ ]\d\d)', '', q, flags=re.IGNORECASE) @@ -60,7 +63,8 @@ def clean_query(q): q = re.sub('[ -]s/[ -]', ' sur ', q, flags=re.IGNORECASE) q = re.sub('[ -]s/s[ -]', ' sous ', q, flags=re.IGNORECASE) q = re.sub('^lieux?[ -]?dits?\\b(?=.)', '', q, flags=re.IGNORECASE) - q = re.sub(r'(^| )(([A-Z]) ([A-Z]) (([A-Z]) )?(([A-Z]) )?(([A-Z])( |$))?)', r'\1\2\3\4\6\8\10 ', q, flags=re.IGNORECASE) + q = re.sub(r'(^| )(([A-Z]) ([A-Z]) (([A-Z]) )?(([A-Z]) )?(([A-Z])( |$))?)', + r'\1\2\3\4\6\8\10 ', q, flags=re.IGNORECASE) q = q.strip() return q From 3ccc267cf7d1630663a40936bdf7f479efc96676 Mon Sep 17 00:00:00 2001 From: Christian Quest Date: Fri, 16 Feb 2018 20:29:10 +0100 Subject: [PATCH 12/28] pep8 --- tests/test_utils.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index c08f55b..62962b5 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -356,12 +356,18 @@ def test_make_municipality_labels(config): @pytest.mark.parametrize("inputs,expected", [ - (['allee','a','b','c','toto'], ['allee','abc','toto']), - (['allee','a','b','c','toto','d','e','f'], ['allee','abc','toto','def']), - (['allee','a','2','c','toto'], ['allee','a','2','c','toto']), - (['allee','a','b','c'], ['allee','abc']), - (['allee','a','b','c','d'], ['allee','abcd']), - (['allee','a','b','c','d','e'], ['allee','abcde']), + (['allee', 'a', 'b', 'c', 'toto'], + ['allee', 'abc', 'toto']), + (['allee', 'a', 'b', 'c', 'toto', 'd', 'e', 'f'], + ['allee', 'abc', 'toto', 'def']), + (['allee', 'a', '2', 'c', 'toto'], + ['allee', 'a', '2', 'c', 'toto']), + (['allee', 'a', 'b', 'c'], + ['allee', 'abc']), + (['allee', 'a', 'b', 'c', 'd'], + ['allee', 'abcd']), + (['allee', 'a', 'b', 'c', 'd', 'e'], + ['allee', 'abcde']), ]) def test_fold_initials(inputs, expected): tokens = [Token(input_) for input_ in inputs] From 9b90da3a0a597ec36ee84a5476a5c765f053ff96 Mon Sep 17 00:00:00 2001 From: cquest Date: Fri, 16 Feb 2018 22:52:48 +0100 Subject: [PATCH 13/28] separate PR --- addok_france/utils.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index ab6235a..e4414ce 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -123,17 +123,14 @@ def flag_housenumber(tokens): def fold_ordinal(s): """3bis => 3b.""" - if s is not None and s != '' and s[0].isdigit() and not s.isdigit(): + if s[0].isdigit() and not s.isdigit(): try: number, ordinal = FOLD_PATTERN.findall(s)[0] except (IndexError, ValueError): pass else: - try: - s = s.update('{}{}'.format(number, - FOLD.get(ordinal.lower(), ordinal))) - except: - pass + s = s.update('{}{}'.format(number, + FOLD.get(ordinal.lower(), ordinal))) return s From a3017f85751fcc945221a12d320787a1320ef77f Mon Sep 17 00:00:00 2001 From: cquest Date: Sun, 18 Feb 2018 17:25:26 +0100 Subject: [PATCH 14/28] fold_initials is a yielder --- addok_france/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/addok_france/__init__.py b/addok_france/__init__.py index 8f48d8c..4f55576 100644 --- a/addok_france/__init__.py +++ b/addok_france/__init__.py @@ -14,7 +14,7 @@ extract_address = yielder(utils.extract_address) glue_ordinal = utils.glue_ordinal fold_ordinal = yielder(utils.fold_ordinal) -fold_initials = yielder(utils.fold_initials) +fold_initials = utils.fold_initials flag_housenumber = utils.flag_housenumber make_labels = utils.make_labels remove_leading_zeros = yielder(utils.remove_leading_zeros) From bb92a1d35daf5e8c5cd5db222a2dda0162b9f1ed Mon Sep 17 00:00:00 2001 From: cquest Date: Wed, 21 Feb 2018 17:18:51 +0100 Subject: [PATCH 15/28] fold_initials > glue_initials --- addok_france/__init__.py | 2 +- addok_france/utils.py | 6 ++---- tests/test_utils.py | 8 ++++---- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/addok_france/__init__.py b/addok_france/__init__.py index 4f55576..c1ae1b7 100644 --- a/addok_france/__init__.py +++ b/addok_france/__init__.py @@ -14,7 +14,7 @@ extract_address = yielder(utils.extract_address) glue_ordinal = utils.glue_ordinal fold_ordinal = yielder(utils.fold_ordinal) -fold_initials = utils.fold_initials +glue_initials = utils.glue_initials flag_housenumber = utils.flag_housenumber make_labels = utils.make_labels remove_leading_zeros = yielder(utils.remove_leading_zeros) diff --git a/addok_france/utils.py b/addok_france/utils.py index e4414ce..c7d001c 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -63,8 +63,6 @@ def clean_query(q): q = re.sub('[ -]s/[ -]', ' sur ', q, flags=re.IGNORECASE) q = re.sub('[ -]s/s[ -]', ' sous ', q, flags=re.IGNORECASE) q = re.sub('^lieux?[ -]?dits?\\b(?=.)', '', q, flags=re.IGNORECASE) - q = re.sub(r'(^| )(([A-Z]) ([A-Z]) (([A-Z]) )?(([A-Z]) )?(([A-Z])( |$))?)', - r'\1\2\3\4\6\8\10 ', q, flags=re.IGNORECASE) q = q.strip() return q @@ -134,8 +132,8 @@ def fold_ordinal(s): return s -def fold_initials(tokens): - """ folds 'F F I' into 'FFI' """ +def glue_initials(tokens): + """ glue 'F F I' into 'FFI' """ initials = [] for _, token, next_ in neighborhood(tokens): isinitial = len(token) == 1 and token.isalpha() diff --git a/tests/test_utils.py b/tests/test_utils.py index 62962b5..eea4772 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,7 +8,7 @@ from addok.helpers.text import Token from addok_france.utils import (clean_query, extract_address, flag_housenumber, fold_ordinal, glue_ordinal, make_labels, - remove_leading_zeros, fold_initials) + remove_leading_zeros, glue_initials) @pytest.mark.parametrize("input,expected", [ @@ -76,7 +76,7 @@ ("32bis Rue des Vosges 93290 télécopieur, 01-23-45-67-89", "32bis Rue des Vosges 93290"), ("10 BLD DES F F I 85300 CHALLANS", - "10 BLD DES F F I FFI 85300 CHALLANS"), + "10 BLD DES F F I 85300 CHALLANS"), # done by glue_initials ]) def test_clean_query(input, expected): assert clean_query(input) == expected @@ -369,6 +369,6 @@ def test_make_municipality_labels(config): (['allee', 'a', 'b', 'c', 'd', 'e'], ['allee', 'abcde']), ]) -def test_fold_initials(inputs, expected): +def test_glue_initials(inputs, expected): tokens = [Token(input_) for input_ in inputs] - assert list(fold_initials(tokens)) == expected + assert list(glue_initials(tokens)) == expected From 01cfa2a66360afdcab4814aafb040343c6db7778 Mon Sep 17 00:00:00 2001 From: cquest Date: Sun, 18 Feb 2018 22:55:30 +0100 Subject: [PATCH 16/28] glue usual words like 'MONT' 'VAL' 'LE' 'LA' 'L' in an additionnal token --- addok_france/__init__.py | 1 + addok_france/utils.py | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/addok_france/__init__.py b/addok_france/__init__.py index 2b20ccb..42ca4ab 100644 --- a/addok_france/__init__.py +++ b/addok_france/__init__.py @@ -14,6 +14,7 @@ extract_address = yielder(utils.extract_address) glue_ordinal = utils.glue_ordinal fold_ordinal = yielder(utils.fold_ordinal) +fold_words = utils.fold_words flag_housenumber = utils.flag_housenumber make_labels = utils.make_labels remove_leading_zeros = yielder(utils.remove_leading_zeros) diff --git a/addok_france/utils.py b/addok_france/utils.py index 779ab89..b1f4fba 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -127,6 +127,16 @@ def fold_ordinal(s): return s +FOLD_WORDS = ["mont", "val", "le", "la", "l"] + +def fold_words(tokens): + """ folds 'MONT GRIFFON' into 'MONTGRIFFON' """ + for _, token, next_ in neighborhood(tokens): + yield token + if token in FOLD_WORDS and next_ and next_.isalpha() and len(next_)>2: + yield token.update(token+next_) + + def remove_leading_zeros(s): """0003 => 3.""" # Limit digits from 1 to 3 in order to avoid processing postcodes. From f6e2e0b2c6a2f039f0f738f2ff7b61b93fcb44d4 Mon Sep 17 00:00:00 2001 From: cquest Date: Wed, 21 Feb 2018 17:13:52 +0100 Subject: [PATCH 17/28] flod_words > glue_words --- addok_france/__init__.py | 2 +- addok_france/utils.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/addok_france/__init__.py b/addok_france/__init__.py index 42ca4ab..10507f5 100644 --- a/addok_france/__init__.py +++ b/addok_france/__init__.py @@ -14,7 +14,7 @@ extract_address = yielder(utils.extract_address) glue_ordinal = utils.glue_ordinal fold_ordinal = yielder(utils.fold_ordinal) -fold_words = utils.fold_words +glue_words = utils.glue_words flag_housenumber = utils.flag_housenumber make_labels = utils.make_labels remove_leading_zeros = yielder(utils.remove_leading_zeros) diff --git a/addok_france/utils.py b/addok_france/utils.py index b1f4fba..a5174e2 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -127,13 +127,13 @@ def fold_ordinal(s): return s -FOLD_WORDS = ["mont", "val", "le", "la", "l"] +GLUE_WORDS = ["mont", "val", "le", "la", "l", "champ"] -def fold_words(tokens): - """ folds 'MONT GRIFFON' into 'MONTGRIFFON' """ +def glue_words(tokens): + """ glue 'MONT GRIFFON' into 'MONTGRIFFON' """ for _, token, next_ in neighborhood(tokens): yield token - if token in FOLD_WORDS and next_ and next_.isalpha() and len(next_)>2: + if token in GLUE_WORDS and next_ and next_.isalpha() and len(next_)>2: yield token.update(token+next_) From 221584ac16dcd3b2b63728b3c6dab4092f50fa80 Mon Sep 17 00:00:00 2001 From: cquest Date: Wed, 21 Feb 2018 17:40:20 +0100 Subject: [PATCH 18/28] glue_words test --- tests/test_utils.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 969ee91..efc4dc3 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,7 +8,7 @@ from addok.helpers.text import Token from addok_france.utils import (clean_query, extract_address, flag_housenumber, fold_ordinal, glue_ordinal, make_labels, - remove_leading_zeros) + remove_leading_zeros, glue_words) @pytest.mark.parametrize("input,expected", [ @@ -331,3 +331,15 @@ def test_make_municipality_labels(config): '59000 Lille', 'Lille 59000', ] + + +@pytest.mark.parametrize("inputs,expected", [ + (['mont', 'griffon'], ['mont', 'montgriffon', 'griffon']), + (['champ', 'vallon'], ['champ', 'champvallon', 'vallon']), + (['val', 'suzon'], ['val', 'valsuzon', 'suzon']), + (['l', 'a', 'peu', 'pres'], ['l', 'a', 'peu', 'pres']), + (['l', 'un', 'des'], ['l', 'un', 'des']), +]) +def test_glue_ordinal(inputs, expected): + tokens = [Token(input_) for input_ in inputs] + assert list(glue_words(tokens)) == expected From 60acc2337e0d61baacc5dbcaad712557e31868a6 Mon Sep 17 00:00:00 2001 From: cquest Date: Tue, 29 Oct 2019 09:13:20 +0100 Subject: [PATCH 19/28] more tests --- tests/test_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_utils.py b/tests/test_utils.py index 43f8967..8d0949e 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -62,6 +62,8 @@ "20 avenue de Ségur 75 Paris"), ("20 avenue de Ségur TSA N 30719 75334 Paris Cedex 07", "20 avenue de Ségur 75 Paris"), + ("20 avenue de Ségur TSA N°30719 75334 Paris Cedex 07", + "20 avenue de Ségur 75 Paris"), ("20 rue saint germain CIDEX 304 89110 Poilly-sur-tholon", "20 rue saint germain 89110 Poilly-sur-tholon"), ("20 rue saint germain CIDEX N°304 89110 Poilly-sur-tholon", From 5ecc80f0469f17d1501e03335f34e470486d4bac Mon Sep 17 00:00:00 2001 From: cquest Date: Tue, 29 Oct 2019 09:13:39 +0100 Subject: [PATCH 20/28] test 0 manquant sur postcode --- tests/test_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_utils.py b/tests/test_utils.py index 8d0949e..ea0c608 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -80,6 +80,10 @@ "32bis Rue des Vosges 93290"), ("10 BLD DES F F I 85300 CHALLANS", "10 BLD DES F F I 85300 CHALLANS"), # done by glue_initials + ("6 rue de suisse 6000 Nice", + "6 rue de suisse 06000 Nice"), + ("6000 rue de suisse 6000 Nice", + "6000 rue de suisse 06000 Nice"), ]) def test_clean_query(input, expected): assert clean_query(input) == expected From 3b2faa76151e8e27c62719253cb562855d3441ff Mon Sep 17 00:00:00 2001 From: cquest Date: Tue, 29 Oct 2019 09:15:19 +0100 Subject: [PATCH 21/28] =?UTF-8?q?prise=20en=20compte=20des=20abr=C3=A9viat?= =?UTF-8?q?ions=20dans=20l'extraction?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- addok_france/utils.py | 19 +++++++++++-------- tests/test_utils.py | 5 +++++ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index be93f80..b240962 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -1,14 +1,17 @@ import re TYPES = [ - 'av(enue)?', 'r(ue)?', 'b(oulevar)?d', 'all[ée]es?', 'impasse', 'place', - 'chemin', 'rocade', 'route', 'l[ôo]tissement', 'mont[ée]e', 'c[ôo]te', - 'clos', 'champ', 'bois', 'taillis', 'boucle', 'passage', 'domaine', - 'étang', 'etang', 'quai', 'desserte', 'pré', 'porte', 'square', 'mont', - 'r[ée]sidence', 'parc', 'cours?', 'promenade', 'hameau', 'faubourg', - 'ilot', 'berges?', 'via', 'cit[ée]', 'sent(e|ier)', 'rond[- ][Pp]oint', - 'pas(se)?', 'carrefour', 'traverse', 'giratoire', 'esplanade', 'voie', - 'chauss[ée]e', + 'av(enue)?', 'r(ue)?', 'b(oulevar|l?v?)?d', 'all([ée]es?)?', 'imp(asse)?', 'pl(ace)?', + 'che?(m(in)?)?', 'rocade', 'r(ou)?te', 'l[ôo]t(issement)?', 'mont[ée]e', 'c[ôo]te', + 'clos', 'ch(am)?p', 'bois', 'taillis', 'b(ou)?cle', 'pass(age)?', 'dom(aine)?', + '[ée]ta?ng', 'desserte', 'pré', 'porte', 'squ?(are)?', 'mont', + 'r[ée]s(idence)?', 'parc', 'cours?', 'pro?m(enade)?', 'ham(eau)?', 'f(aubour|b|bour)?g', + 'ilot', 'ber(ges?)?', 'via', 'cit[ée]', 'sent(e|ier)', 'rond[- ][Pp]oint', 'rd?pt', + 'pas(se)?', 'carr?(efour)?', 'trav(erse)?', 'giratoire', 'espl?(anade)?', 'voie', + 'chauss[ée]e', 'aer(odrome)?', 'gr(ande?)?', 'gr(e|es|s)?', 'anc(ien(ne)?)?', 'c(en)?tre', + 'devi(ation)?', 'dig(ue)?', 'embr(anchement)?', 'jard(in)?', 'j(et)?te', 'p(asserel)?le', + 'p(or)?te', 'p(lace)?tte', 'p(arvis|rv|vr)', 'q(ua|rt)(ier)?', 'qu?(ai)?', + 'r(uel)?le','t(erra)?sse','tunn?(el)?', 'viad(uc)?', 'v(il)?la', ] TYPES_REGEX = '|'.join( map(lambda x: '[{}{}]{}'.format(x[0], x[0].upper(), x[1:]), TYPES) diff --git a/tests/test_utils.py b/tests/test_utils.py index ea0c608..0229d28 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -131,6 +131,11 @@ def test_clean_query(input, expected): "boulevard jean larrieu 44000 mont de marsan"), ("PARC D ACTIVITE DE SAUMATY 26 AV ANDRE ROUSSIN 13016 MARSEILLE 16", "26 AV ANDRE ROUSSIN 13016 MARSEILLE 16"), + # Abréviations + ("resid goelands 28 bis imp des petrels 76460 Saint-valery-en-caux", + "28 bis imp des petrels 76460 Saint-valery-en-caux"), + ("bla bla bl 28 r des moulins", + "28 r des moulins"), ("Non matching pattern", "Non matching pattern"), ]) From 25c34618ebe3ab6ab2731a2e8ee3a9f8d335ddaf Mon Sep 17 00:00:00 2001 From: cquest Date: Tue, 29 Oct 2019 09:15:58 +0100 Subject: [PATCH 22/28] suppression de bte/boite/case postale --- addok_france/utils.py | 2 +- tests/test_utils.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index b240962..8e70c28 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -53,7 +53,7 @@ def clean_query(q): - q = re.sub(r'(^| )(boite postale|b\.?p\.?|cs|tsa|cidex) *(n(o|°|) *|)[\d]+ *', + q = re.sub(r'(^| )((b(oi)?te|case) postale|b\.?p\.?|cs|tsa|cidex) *(n(o|°|) *|)[\d]+ *', r'\1', q, flags=re.IGNORECASE) q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2', q, flags=re.IGNORECASE) diff --git a/tests/test_utils.py b/tests/test_utils.py index 0229d28..56af1c6 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -29,6 +29,12 @@ "159, avenue Jacques-Douzans 31604 Muret"), ("12, place de l'Hôtel-de-Ville BP 46 02150 Sissonne", "12, place de l'Hôtel-de-Ville 02150 Sissonne"), + ("12, place de l'Hôtel-de-Ville boite postale 46 02150 Sissonne", + "12, place de l'Hôtel-de-Ville 02150 Sissonne"), + ("12, place de l'Hôtel-de-Ville case postale 46 02150 Sissonne", + "12, place de l'Hôtel-de-Ville 02150 Sissonne"), + ("12, place de l'Hôtel-de-Ville bte postale 46 02150 Sissonne", + "12, place de l'Hôtel-de-Ville 02150 Sissonne"), ("6, rue Winston-Churchill CS 40055 60321 Compiègne", "6, rue Winston-Churchill 60321 Compiègne"), ("BP 80111 159, avenue Jacques-Douzans 31604 Muret Cedex", From 35709fe274f3b33a635450a613149c85addf1421 Mon Sep 17 00:00:00 2001 From: cquest Date: Tue, 29 Oct 2019 09:16:46 +0100 Subject: [PATCH 23/28] =?UTF-8?q?prise=20en=20compte=20de=20"1er=20=C3=A9t?= =?UTF-8?q?age"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- addok_france/utils.py | 2 +- tests/test_utils.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index 8e70c28..6e042bf 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -60,7 +60,7 @@ def clean_query(q): q = re.sub(r'([^\d ])([\d]{5})([^\d]|$)', r'\1 \2 ', q, flags=re.IGNORECASE) q = re.sub(r'c(e|é)dex ?[\d]*', '', q, flags=re.IGNORECASE) - q = re.sub(r'\d{,2}(e|[eè]me) ([eé]tage)', '', q, flags=re.IGNORECASE) + q = re.sub(r'\d{,2}(e|[eè]me|er) ([eé]tage)', '', q, flags=re.IGNORECASE) q = re.sub(r'((fax|t[eé]l|t[eé]l[eé]copieur)[ :,\.]*|)(\d{10}|[0-9][0-9][ -\./]\d\d[-\./ ]\d\d[-\./ ]\d\d[-\./ ]\d\d)', '', q, flags=re.IGNORECASE) q = re.sub(r' {2,}', ' ', q, flags=re.IGNORECASE) q = re.sub(r'[ -]s/[ -]', ' sur ', q, flags=re.IGNORECASE) diff --git a/tests/test_utils.py b/tests/test_utils.py index 56af1c6..a1f59d5 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -43,6 +43,8 @@ "Cite administrative - Rue Gustave-Delory 59017 Lille"), ("12e étage Rue Gustave-Delory 59017 Lille", "Rue Gustave-Delory 59017 Lille"), + ("Rue Gustave-Delory 1er étage 59017 Lille", + "Rue Gustave-Delory 59017 Lille"), ("12eme étage Rue Gustave-Delory 59017 Lille", "Rue Gustave-Delory 59017 Lille"), ("12ème étage Rue Gustave-Delory 59017 Lille", From aaf5f9592d29c342bd31a502f3a299f9e3ef6a96 Mon Sep 17 00:00:00 2001 From: cquest Date: Tue, 29 Oct 2019 09:17:14 +0100 Subject: [PATCH 24/28] ss -> sous --- addok_france/utils.py | 2 +- tests/test_utils.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index 6e042bf..90a5516 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -64,7 +64,7 @@ def clean_query(q): q = re.sub(r'((fax|t[eé]l|t[eé]l[eé]copieur)[ :,\.]*|)(\d{10}|[0-9][0-9][ -\./]\d\d[-\./ ]\d\d[-\./ ]\d\d[-\./ ]\d\d)', '', q, flags=re.IGNORECASE) q = re.sub(r' {2,}', ' ', q, flags=re.IGNORECASE) q = re.sub(r'[ -]s/[ -]', ' sur ', q, flags=re.IGNORECASE) - q = re.sub(r'[ -]s/s[ -]', ' sous ', q, flags=re.IGNORECASE) + q = re.sub(r'[ -]s/?s[ -]', ' sous ', q, flags=re.IGNORECASE) q = re.sub(r'^lieux?[ -]?dits?\b(?=.)', '', q, flags=re.IGNORECASE) q = re.sub(r' (\d{4}) ', r' 0\1 ', q, flags=re.IGNORECASE) q = q.strip() diff --git a/tests/test_utils.py b/tests/test_utils.py index a1f59d5..c8129ce 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -54,6 +54,7 @@ ("air s/ l'adour", "air sur l'adour"), ("air-s/-l'adour", "air sur l'adour"), ("Saint Didier s/s Ecouves", "Saint Didier sous Ecouves"), + ("Saint Didier ss Ecouves", "Saint Didier sous Ecouves"), ("La Chapelle-aux-Brocs", "La Chapelle-aux-Brocs"), ("Lieu-Dit Les Chênes", "Les Chênes"), ("Lieu Dit Les Chênes", "Les Chênes"), From 30e3f3b4c3ce05d9105d3c9f7a894de04376b925 Mon Sep 17 00:00:00 2001 From: cquest Date: Sun, 8 Nov 2020 17:08:37 +0100 Subject: [PATCH 25/28] handle StopIteration exception (empty tokens) --- addok_france/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index 90a5516..37f98af 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -84,7 +84,11 @@ def neighborhood(iterable, first=None, last=None): """ iterator = iter(iterable) previous = first - current = next(iterator) # Throws StopIteration if empty. + try: + current = next(iterator) + except StopIteration: # StopIteration if empty. + return + for next_ in iterator: yield (previous, current, next_) previous = current From 2a5e3b2cfb7ce0fc4c15a6ee6f67170febd918bb Mon Sep 17 00:00:00 2001 From: cquest Date: Sun, 8 Nov 2020 17:51:51 +0100 Subject: [PATCH 26/28] avoid bad transform in TYPES_REGEX due to leading [ --- addok_france/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index 37f98af..b9cc507 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -4,7 +4,7 @@ 'av(enue)?', 'r(ue)?', 'b(oulevar|l?v?)?d', 'all([ée]es?)?', 'imp(asse)?', 'pl(ace)?', 'che?(m(in)?)?', 'rocade', 'r(ou)?te', 'l[ôo]t(issement)?', 'mont[ée]e', 'c[ôo]te', 'clos', 'ch(am)?p', 'bois', 'taillis', 'b(ou)?cle', 'pass(age)?', 'dom(aine)?', - '[ée]ta?ng', 'desserte', 'pré', 'porte', 'squ?(are)?', 'mont', + 'eta?ng', 'éta?ng', 'desserte', 'pré', 'porte', 'squ?(are)?', 'mont', 'r[ée]s(idence)?', 'parc', 'cours?', 'pro?m(enade)?', 'ham(eau)?', 'f(aubour|b|bour)?g', 'ilot', 'ber(ges?)?', 'via', 'cit[ée]', 'sent(e|ier)', 'rond[- ][Pp]oint', 'rd?pt', 'pas(se)?', 'carr?(efour)?', 'trav(erse)?', 'giratoire', 'espl?(anade)?', 'voie', From 1e1b9cadaaa85758ac0c881e0815fd6be7713297 Mon Sep 17 00:00:00 2001 From: cquest Date: Sun, 22 Nov 2020 18:18:25 +0100 Subject: [PATCH 27/28] added: glue_refs, to glue D 412 into D412 --- addok_france/__init__.py | 1 + addok_france/utils.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/addok_france/__init__.py b/addok_france/__init__.py index 2aed9e7..8424187 100644 --- a/addok_france/__init__.py +++ b/addok_france/__init__.py @@ -19,3 +19,4 @@ flag_housenumber = utils.flag_housenumber make_labels = utils.make_labels remove_leading_zeros = yielder(utils.remove_leading_zeros) +glue_refs = utils.glue_refs diff --git a/addok_france/utils.py b/addok_france/utils.py index b9cc507..e4a74e7 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -169,6 +169,25 @@ def glue_initials(tokens): yield token +GLUE_REFS = re.compile(r'^(a|n|rn|d|rd|m|rm)[0-9]+$', flags=re.IGNORECASE) + +def glue_refs(tokens): + ref = None + for _, token, next_ in neighborhood(tokens): + print(ref, token, next_) + if next_ and GLUE_REFS.match(token+next_): + ref = token+next_ + elif next_ and ref and GLUE_REFS.match(ref+next_): + ref = ref+next_ + elif ref: + yield token.update(re.sub(r'^r(n|d)', r'\1', ref)) + ref = None + elif GLUE_REFS.match(token): + yield token.update(re.sub(r'^r(n|d)', r'\1', token)) + else: + yield token + + def remove_leading_zeros(s): """0003 => 3.""" # Limit digits from 1 to 3 in order to avoid processing postcodes. From 3000457f01fb4489944e926fbd87c7173e4bc8c2 Mon Sep 17 00:00:00 2001 From: cquest Date: Sun, 22 Nov 2020 18:27:59 +0100 Subject: [PATCH 28/28] glue_refs test --- tests/test_utils.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index c8129ce..c6bc65f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -9,7 +9,7 @@ from addok_france.utils import (clean_query, extract_address, flag_housenumber, fold_ordinal, glue_ordinal, make_labels, remove_leading_zeros, glue_words, - glue_initials) + glue_initials, glue_refs) @pytest.mark.parametrize("input,expected", [ @@ -175,6 +175,18 @@ def test_glue_ordinal(inputs, expected): assert list(glue_ordinal(tokens)) == expected +@pytest.mark.parametrize("inputs,expected", [ + (['d', '412'], ['d412']), + (['rd', '30'], ['d30']), + (['d', '30', 'a', '4'], ['d30', 'a4']), + (['route', 'd', '30', 'a', '4','b'], ['route', 'd30', 'a4', 'b']), + (['route', '30', 'a'], ['route', '30', 'a']), +]) +def test_glue_refs(inputs, expected): + tokens = [Token(input_) for input_ in inputs] + assert list(glue_refs(tokens)) == expected + + @pytest.mark.parametrize("inputs,expected", [ (['6b'], True), (['6'], True),