diff --git a/addok_france/__init__.py b/addok_france/__init__.py index 2b20ccb..8424187 100644 --- a/addok_france/__init__.py +++ b/addok_france/__init__.py @@ -14,6 +14,9 @@ extract_address = yielder(utils.extract_address) glue_ordinal = utils.glue_ordinal fold_ordinal = yielder(utils.fold_ordinal) +glue_words = utils.glue_words +glue_initials = utils.glue_initials flag_housenumber = utils.flag_housenumber make_labels = utils.make_labels remove_leading_zeros = yielder(utils.remove_leading_zeros) +glue_refs = utils.glue_refs diff --git a/addok_france/utils.py b/addok_france/utils.py index 779ab89..e4a74e7 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -1,14 +1,17 @@ import re TYPES = [ - 'av(enue)?', 'r(ue)?', 'b(oulevar)?d', 'all[ée]es?', 'impasse', 'place', - 'chemin', 'rocade', 'route', 'l[ôo]tissement', 'mont[ée]e', 'c[ôo]te', - 'clos', 'champ', 'bois', 'taillis', 'boucle', 'passage', 'domaine', - 'étang', 'etang', 'quai', 'desserte', 'pré', 'porte', 'square', 'mont', - 'r[ée]sidence', 'parc', 'cours?', 'promenade', 'hameau', 'faubourg', - 'ilot', 'berges?', 'via', 'cit[ée]', 'sent(e|ier)', 'rond[- ][Pp]oint', - 'pas(se)?', 'carrefour', 'traverse', 'giratoire', 'esplanade', 'voie', - 'chauss[ée]e', + 'av(enue)?', 'r(ue)?', 'b(oulevar|l?v?)?d', 'all([ée]es?)?', 'imp(asse)?', 'pl(ace)?', + 'che?(m(in)?)?', 'rocade', 'r(ou)?te', 'l[ôo]t(issement)?', 'mont[ée]e', 'c[ôo]te', + 'clos', 'ch(am)?p', 'bois', 'taillis', 'b(ou)?cle', 'pass(age)?', 'dom(aine)?', + 'eta?ng', 'éta?ng', 'desserte', 'pré', 'porte', 'squ?(are)?', 'mont', + 'r[ée]s(idence)?', 'parc', 'cours?', 'pro?m(enade)?', 'ham(eau)?', 'f(aubour|b|bour)?g', + 'ilot', 'ber(ges?)?', 'via', 'cit[ée]', 'sent(e|ier)', 'rond[- ][Pp]oint', 'rd?pt', + 'pas(se)?', 'carr?(efour)?', 'trav(erse)?', 'giratoire', 'espl?(anade)?', 'voie', + 'chauss[ée]e', 'aer(odrome)?', 'gr(ande?)?', 'gr(e|es|s)?', 'anc(ien(ne)?)?', 'c(en)?tre', + 'devi(ation)?', 'dig(ue)?', 'embr(anchement)?', 'jard(in)?', 'j(et)?te', 
def clean_query(q):
    """Remove postal noise from a raw address query string.

    The substitutions are applied in a fixed order, each one working on the
    result of the previous one:

    1. delivery codes (``boite/bte/case postale``, ``BP``, ``B.P.``, ``CS``,
       ``TSA``, ``CIDEX``), optionally followed by ``n``/``no``/``n°``, and
       their number are removed;
    2. when a 5-digit number is followed somewhere by ``cedex``, only its
       first two digits are kept (presumably the department number) and the
       ``cedex`` mention with its optional number is dropped;
    3. a 5-digit number glued to the preceding word is detached with spaces;
    4. any remaining ``cedex`` mention is removed;
    5. floor mentions (``8e étage``, ``1er étage``, ...) are removed;
    6. phone/fax numbers (10 digits, or five pairs of digits separated by
       ``-``, ``.``, ``/`` or a space), with an optional ``fax``/``tel``/
       ``telecopieur`` label, are removed;
    7. runs of spaces are collapsed;
    8. ``s/`` becomes ``sur`` and ``s/s`` or ``ss`` becomes ``sous``;
    9. a leading ``lieu-dit`` is removed;
    10. a 4-digit number surrounded by spaces gets a leading ``0``.

    :param q: raw query string.
    :return: the cleaned query, stripped of leading/trailing whitespace.
    """
    # Delivery codes. Group 1 (start of string or the leading space) is kept
    # so that the surrounding words stay separated.
    q = re.sub(r'(^| )((b(oi)?te|case) postale|b\.?p\.?|cs|tsa|cidex)'
               r' *(n(o|°|) *|)[\d]+ *',
               r'\1', q, flags=re.IGNORECASE)
    # CEDEX postcode: keep the first two digits and whatever sits between the
    # postcode and the "cedex" keyword.
    q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2',
               q, flags=re.IGNORECASE)
    # 5-digit number stuck to the previous word ("Vosges93290").
    q = re.sub(r'([^\d ])([\d]{5})([^\d]|$)', r'\1 \2 ',
               q, flags=re.IGNORECASE)
    # "cedex" mentions not preceded by a 5-digit number.
    q = re.sub(r'c(e|é)dex ?[\d]*', '', q, flags=re.IGNORECASE)
    # Floor mentions.
    q = re.sub(r'\d{,2}(e|[eè]me|er) ([eé]tage)', '', q, flags=re.IGNORECASE)
    # Phone / fax numbers. The "-" is placed first in every separator class so
    # it is a literal dash: written as "[ -\./]" it would define a range from
    # space to "." and silently accept "," "+" "'" etc. as a separator.
    q = re.sub(r'((fax|t[eé]l|t[eé]l[eé]copieur)[ :,\.]*|)'
               r'(\d{10}|[0-9][0-9][-\./ ]\d\d[-\./ ]\d\d[-\./ ]\d\d[-\./ ]\d\d)',
               '', q, flags=re.IGNORECASE)
    # The removals above can leave several consecutive spaces behind.
    q = re.sub(r' {2,}', ' ', q, flags=re.IGNORECASE)
    q = re.sub(r'[ -]s/[ -]', ' sur ', q, flags=re.IGNORECASE)
    q = re.sub(r'[ -]s/?s[ -]', ' sous ', q, flags=re.IGNORECASE)
    q = re.sub(r'^lieux?[ -]?dits?\b(?=.)', '', q, flags=re.IGNORECASE)
    # 4-digit number between spaces: add the leading zero.
    q = re.sub(r' (\d{4}) ', r' 0\1 ', q, flags=re.IGNORECASE)
    q = q.strip()
    return q
GLUE_WORDS = ["mont", "val", "le", "la", "l", "champ"]


def glue_words(tokens):
    """Add a glued variant after a known prefix word.

    'MONT GRIFFON' yields 'MONT', 'MONTGRIFFON', 'GRIFFON': the original
    tokens are kept and the concatenation is inserted between them.

    A token equal to the token that follows it is not yielded. The glued
    variant is only produced when the token is one of ``GLUE_WORDS`` and the
    next token is purely alphabetic and longer than two characters.

    :param tokens: iterable of token objects (str-like, with an ``update``
        method returning a token carrying the new value).
    :return: generator of tokens.
    """
    for _, token, next_ in neighborhood(tokens):
        if token != next_:
            yield token
        if (token in GLUE_WORDS and next_ and next_.isalpha()
                and len(next_) > 2):
            yield token.update(token + next_)


def glue_initials(tokens):
    """Merge runs of three or more single-letter tokens ('F F I' => 'FFI').

    Runs of one or two single letters are yielded unchanged.

    :param tokens: iterable of token objects (str-like, with an ``update``
        method returning a token carrying the new value).
    :return: generator of tokens.
    """
    initials = []
    for _, token, next_ in neighborhood(tokens):
        isinitial = len(token) == 1 and token.isalpha()
        if isinitial:
            initials.append(token)
        # The run ends on the last token or on the first non-initial token.
        if not next_ or not isinitial:
            if len(initials) > 2:
                yield initials[0].update("".join(initials))
            else:
                for tk in initials:
                    yield tk
            initials = []
        if not isinitial:
            yield token


GLUE_REFS = re.compile(r'^(a|n|rn|d|rd|m|rm)[0-9]+$', flags=re.IGNORECASE)


def glue_refs(tokens):
    """Merge a road-reference prefix with its number ('d', '412' => 'd412').

    Consecutive tokens are accumulated for as long as their concatenation
    matches ``GLUE_REFS``; the accumulated reference is yielded once it can no
    longer be extended. A leading 'rn' or 'rd' is reduced to 'n' or 'd'
    ('rd', '30' => 'd30'), both for glued references and for tokens that
    already are a full reference.

    :param tokens: iterable of token objects (str-like, with an ``update``
        method returning a token carrying the new value).
    :return: generator of tokens.
    """
    ref = None  # Reference being accumulated, not yet yielded.
    for _, token, next_ in neighborhood(tokens):
        if next_ and GLUE_REFS.match(token + next_):
            # Start a new reference; it is emitted on a later iteration.
            ref = token + next_
        elif next_ and ref and GLUE_REFS.match(ref + next_):
            ref = ref + next_
        elif ref:
            # The current token is already part of `ref`: emit the glued form.
            yield token.update(re.sub(r'^r(n|d)', r'\1', ref))
            ref = None
        elif GLUE_REFS.match(token):
            yield token.update(re.sub(r'^r(n|d)', r'\1', token))
        else:
            yield token
diff --git a/tests/test_utils.py b/tests/test_utils.py index 969ee91..c6bc65f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,30 +8,43 @@ from addok.helpers.text import Token from addok_france.utils import (clean_query, extract_address, flag_housenumber, fold_ordinal, glue_ordinal, make_labels, - remove_leading_zeros) + remove_leading_zeros, glue_words, + glue_initials, glue_refs) @pytest.mark.parametrize("input,expected", [ ("2 allée Jules Guesde 31068 TOULOUSE CEDEX 7", - "2 allée Jules Guesde 31068 TOULOUSE"), + "2 allée Jules Guesde 31 TOULOUSE"), ("7, avenue Léon-Blum 31507 Toulouse Cedex 5", - "7, avenue Léon-Blum 31507 Toulouse"), + "7, avenue Léon-Blum 31 Toulouse"), ("159, avenue Jacques-Douzans 31604 Muret Cedex", - "159, avenue Jacques-Douzans 31604 Muret"), + "159, avenue Jacques-Douzans 31 Muret"), ("2 allée Jules Guesde BP 7015 31068 TOULOUSE", "2 allée Jules Guesde 31068 TOULOUSE"), + ("2 allée Jules Guesde B.P. 7015 31068 TOULOUSE", + "2 allée Jules Guesde 31068 TOULOUSE"), + ("2 allée Jules Guesde B.P. 
N 7015 31068 TOULOUSE", + "2 allée Jules Guesde 31068 TOULOUSE"), ("BP 80111 159, avenue Jacques-Douzans 31604 Muret", "159, avenue Jacques-Douzans 31604 Muret"), ("12, place de l'Hôtel-de-Ville BP 46 02150 Sissonne", "12, place de l'Hôtel-de-Ville 02150 Sissonne"), + ("12, place de l'Hôtel-de-Ville boite postale 46 02150 Sissonne", + "12, place de l'Hôtel-de-Ville 02150 Sissonne"), + ("12, place de l'Hôtel-de-Ville case postale 46 02150 Sissonne", + "12, place de l'Hôtel-de-Ville 02150 Sissonne"), + ("12, place de l'Hôtel-de-Ville bte postale 46 02150 Sissonne", + "12, place de l'Hôtel-de-Ville 02150 Sissonne"), ("6, rue Winston-Churchill CS 40055 60321 Compiègne", "6, rue Winston-Churchill 60321 Compiègne"), ("BP 80111 159, avenue Jacques-Douzans 31604 Muret Cedex", - "159, avenue Jacques-Douzans 31604 Muret"), + "159, avenue Jacques-Douzans 31 Muret"), ("BP 20169 Cite administrative - 8e étage Rue Gustave-Delory 59017 Lille", "Cite administrative - Rue Gustave-Delory 59017 Lille"), ("12e étage Rue Gustave-Delory 59017 Lille", "Rue Gustave-Delory 59017 Lille"), + ("Rue Gustave-Delory 1er étage 59017 Lille", + "Rue Gustave-Delory 59017 Lille"), ("12eme étage Rue Gustave-Delory 59017 Lille", "Rue Gustave-Delory 59017 Lille"), ("12ème étage Rue Gustave-Delory 59017 Lille", @@ -41,6 +54,7 @@ ("air s/ l'adour", "air sur l'adour"), ("air-s/-l'adour", "air sur l'adour"), ("Saint Didier s/s Ecouves", "Saint Didier sous Ecouves"), + ("Saint Didier ss Ecouves", "Saint Didier sous Ecouves"), ("La Chapelle-aux-Brocs", "La Chapelle-aux-Brocs"), ("Lieu-Dit Les Chênes", "Les Chênes"), ("Lieu Dit Les Chênes", "Les Chênes"), @@ -52,9 +66,33 @@ ("32bis Rue des Vosges93290", "32bis Rue des Vosges 93290"), ("20 avenue de Ségur TSA 30719 75334 Paris Cedex 07", - "20 avenue de Ségur 75334 Paris"), + "20 avenue de Ségur 75 Paris"), + ("20 avenue de Ségur TSA No30719 75334 Paris Cedex 07", + "20 avenue de Ségur 75 Paris"), + ("20 avenue de Ségur TSA N 30719 75334 Paris Cedex 07", + "20 
avenue de Ségur 75 Paris"), + ("20 avenue de Ségur TSA N°30719 75334 Paris Cedex 07", + "20 avenue de Ségur 75 Paris"), ("20 rue saint germain CIDEX 304 89110 Poilly-sur-tholon", "20 rue saint germain 89110 Poilly-sur-tholon"), + ("20 rue saint germain CIDEX N°304 89110 Poilly-sur-tholon", + "20 rue saint germain 89110 Poilly-sur-tholon"), + ("20 rue saint germain 89110 Poilly-sur-tholon 01.23.45.67.89", + "20 rue saint germain 89110 Poilly-sur-tholon"), + ("32bis Rue des Vosges93290 fax: 0123456789", + "32bis Rue des Vosges 93290"), + ("32bis Rue des Vosges 93290 tel 01 23 45 67 89", + "32bis Rue des Vosges 93290"), + ("32bis Rue des Vosges 93290 telecopieur. 01/23/45/67/89", + "32bis Rue des Vosges 93290"), + ("32bis Rue des Vosges 93290 télécopieur, 01-23-45-67-89", + "32bis Rue des Vosges 93290"), + ("10 BLD DES F F I 85300 CHALLANS", + "10 BLD DES F F I 85300 CHALLANS"), # done by glue_initials + ("6 rue de suisse 6000 Nice", + "6 rue de suisse 06000 Nice"), + ("6000 rue de suisse 6000 Nice", + "6000 rue de suisse 06000 Nice"), ]) def test_clean_query(input, expected): assert clean_query(input) == expected @@ -102,6 +140,11 @@ def test_clean_query(input, expected): "boulevard jean larrieu 44000 mont de marsan"), ("PARC D ACTIVITE DE SAUMATY 26 AV ANDRE ROUSSIN 13016 MARSEILLE 16", "26 AV ANDRE ROUSSIN 13016 MARSEILLE 16"), + # Abréviations + ("resid goelands 28 bis imp des petrels 76460 Saint-valery-en-caux", + "28 bis imp des petrels 76460 Saint-valery-en-caux"), + ("bla bla bl 28 r des moulins", + "28 r des moulins"), ("Non matching pattern", "Non matching pattern"), ]) @@ -132,6 +175,18 @@ def test_glue_ordinal(inputs, expected): assert list(glue_ordinal(tokens)) == expected +@pytest.mark.parametrize("inputs,expected", [ + (['d', '412'], ['d412']), + (['rd', '30'], ['d30']), + (['d', '30', 'a', '4'], ['d30', 'a4']), + (['route', 'd', '30', 'a', '4','b'], ['route', 'd30', 'a4', 'b']), + (['route', '30', 'a'], ['route', '30', 'a']), +]) +def 
test_glue_refs(inputs, expected): + tokens = [Token(input_) for input_ in inputs] + assert list(glue_refs(tokens)) == expected + + @pytest.mark.parametrize("inputs,expected", [ (['6b'], True), (['6'], True), @@ -331,3 +386,34 @@ def test_make_municipality_labels(config): '59000 Lille', 'Lille 59000', ] + + +@pytest.mark.parametrize("inputs,expected", [ + (['mont', 'griffon'], ['mont', 'montgriffon', 'griffon']), + (['champ', 'vallon'], ['champ', 'champvallon', 'vallon']), + (['val', 'suzon'], ['val', 'valsuzon', 'suzon']), + (['l', 'a', 'peu', 'pres'], ['l', 'a', 'peu', 'pres']), + (['l', 'un', 'des'], ['l', 'un', 'des']), +]) +def test_glue_ordinal(inputs, expected): + tokens = [Token(input_) for input_ in inputs] + assert list(glue_words(tokens)) == expected + + +@pytest.mark.parametrize("inputs,expected", [ + (['allee', 'a', 'b', 'c', 'toto'], + ['allee', 'abc', 'toto']), + (['allee', 'a', 'b', 'c', 'toto', 'd', 'e', 'f'], + ['allee', 'abc', 'toto', 'def']), + (['allee', 'a', '2', 'c', 'toto'], + ['allee', 'a', '2', 'c', 'toto']), + (['allee', 'a', 'b', 'c'], + ['allee', 'abc']), + (['allee', 'a', 'b', 'c', 'd'], + ['allee', 'abcd']), + (['allee', 'a', 'b', 'c', 'd', 'e'], + ['allee', 'abcde']), +]) +def test_glue_initials(inputs, expected): + tokens = [Token(input_) for input_ in inputs] + assert list(glue_initials(tokens)) == expected