Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better cleaning #8

Open
wants to merge 29 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
c2490d1
more cleanup BP/CS/TSA/CIDEX/CEDEX
cquest Feb 14, 2018
253cb54
cleanup BP/CS/TSA/CIDEX N° + tests
cquest Feb 14, 2018
3f75ccf
do not break on queries like "12bis"
cquest Feb 14, 2018
6d59df3
print() removed
cquest Feb 14, 2018
4e5be21
cleanup phone/fax numbers
cquest Feb 14, 2018
454ca50
fold initiales: F F I > F F I FFI, etc
cquest Feb 14, 2018
045c8f0
boite postale
cquest Feb 15, 2018
2261c92
WIP fold_initials "F F I" > "FFI"
cquest Feb 16, 2018
5f11ac4
fold_initials + tests
cquest Feb 16, 2018
330169a
pep8
cquest Feb 16, 2018
fd5d9ee
pep8
cquest Feb 16, 2018
3ccc267
pep8
cquest Feb 16, 2018
9b90da3
separate PR
cquest Feb 16, 2018
a3017f8
fold_initials is a yielder
cquest Feb 18, 2018
bb92a1d
fold_initials > glue_initials
cquest Feb 21, 2018
01cfa2a
glue usual words like 'MONT' 'VAL' 'LE' 'LA' 'L' in an additional token
cquest Feb 18, 2018
f6e2e0b
fold_words > glue_words
cquest Feb 21, 2018
221584a
glue_words test
cquest Feb 21, 2018
904ac5c
Merge branch 'glue_initials' into glue_words
cquest Oct 29, 2019
60acc23
more tests
cquest Oct 29, 2019
5ecc80f
test 0 manquant sur postcode
cquest Oct 29, 2019
3b2faa7
prise en compte des abréviations dans l'extraction
cquest Oct 29, 2019
25c3461
suppression de bte/boite/case postale
cquest Oct 29, 2019
35709fe
prise en compte de "1er étage"
cquest Oct 29, 2019
aaf5f95
ss -> sous
cquest Oct 29, 2019
30e3f3b
handle StopIteration exception (empty tokens)
cquest Nov 8, 2020
2a5e3b2
avoid bad transform in TYPES_REGEX due to leading [
cquest Nov 8, 2020
1e1b9ca
added: glue_refs, to glue D 412 into D412
cquest Nov 22, 2020
3000457
glue_refs test
cquest Nov 22, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions addok_france/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
# Public aliases for the helpers implemented in ``utils``.
# Some helpers are wrapped with ``yielder``; the others are exposed as-is.
extract_address = yielder(utils.extract_address)
glue_ordinal = utils.glue_ordinal
fold_ordinal = yielder(utils.fold_ordinal)
glue_words = utils.glue_words  # generator over a token stream
glue_initials = utils.glue_initials  # generator over a token stream
flag_housenumber = utils.flag_housenumber
make_labels = utils.make_labels
remove_leading_zeros = yielder(utils.remove_leading_zeros)
glue_refs = utils.glue_refs  # generator over a token stream
99 changes: 80 additions & 19 deletions addok_france/utils.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import re

# Regex fragments describing French street types and their abbreviations.
# They are joined with '|' to build TYPES_REGEX, where the first character
# of each fragment is turned into a "[xX]" class: every fragment must
# therefore start with a plain letter, never with '[' or '('.
# NOTE(review): the first eight lines of entries look like the pre-change
# (unabbreviated) versions of the entries that follow them and seem to be
# superseded by those — confirm whether both sets are meant to coexist.
TYPES = [
'av(enue)?', 'r(ue)?', 'b(oulevar)?d', 'all[ée]es?', 'impasse', 'place',
'chemin', 'rocade', 'route', 'l[ôo]tissement', 'mont[ée]e', 'c[ôo]te',
'clos', 'champ', 'bois', 'taillis', 'boucle', 'passage', 'domaine',
'étang', 'etang', 'quai', 'desserte', 'pré', 'porte', 'square', 'mont',
'r[ée]sidence', 'parc', 'cours?', 'promenade', 'hameau', 'faubourg',
'ilot', 'berges?', 'via', 'cit[ée]', 'sent(e|ier)', 'rond[- ][Pp]oint',
'pas(se)?', 'carrefour', 'traverse', 'giratoire', 'esplanade', 'voie',
'chauss[ée]e',
# Abbreviation-aware forms (e.g. 'imp' for 'impasse', 'bd'/'blvd' for
# 'boulevard').
'av(enue)?', 'r(ue)?', 'b(oulevar|l?v?)?d', 'all([ée]es?)?', 'imp(asse)?', 'pl(ace)?',
'che?(m(in)?)?', 'rocade', 'r(ou)?te', 'l[ôo]t(issement)?', 'mont[ée]e', 'c[ôo]te',
'clos', 'ch(am)?p', 'bois', 'taillis', 'b(ou)?cle', 'pass(age)?', 'dom(aine)?',
'eta?ng', 'éta?ng', 'desserte', 'pré', 'porte', 'squ?(are)?', 'mont',
'r[ée]s(idence)?', 'parc', 'cours?', 'pro?m(enade)?', 'ham(eau)?', 'f(aubour|b|bour)?g',
'ilot', 'ber(ges?)?', 'via', 'cit[ée]', 'sent(e|ier)', 'rond[- ][Pp]oint', 'rd?pt',
'pas(se)?', 'carr?(efour)?', 'trav(erse)?', 'giratoire', 'espl?(anade)?', 'voie',
'chauss[ée]e', 'aer(odrome)?', 'gr(ande?)?', 'gr(e|es|s)?', 'anc(ien(ne)?)?', 'c(en)?tre',
'devi(ation)?', 'dig(ue)?', 'embr(anchement)?', 'jard(in)?', 'j(et)?te', 'p(asserel)?le',
'p(or)?te', 'p(lace)?tte', 'p(arvis|rv|vr)', 'q(ua|rt)(ier)?', 'qu?(ai)?',
'r(uel)?le','t(erra)?sse','tunn?(el)?', 'viad(uc)?', 'v(il)?la',
]
TYPES_REGEX = '|'.join(
map(lambda x: '[{}{}]{}'.format(x[0], x[0].upper(), x[1:]), TYPES)
Expand Down Expand Up @@ -50,14 +53,20 @@


def clean_query(q):
    """Strip postal noise from a raw French address query.

    The substitutions are applied in order (later ones rely on earlier
    ones):

    1. drop post-office box references (BP, CS, TSA, CIDEX, "boite/case
       postale"), with an optional "N", "No" or "N°" before the number;
    2. when a CEDEX follows a postcode, keep only the two leading digits
       of that postcode and drop the CEDEX part;
    3. insert spaces around a 5-digit postcode glued to a preceding word;
    4. drop any remaining CEDEX mention;
    5. drop floor indications ("8e étage", "1er étage", ...);
    6. drop 10-digit phone/fax numbers, with their optional label;
    7. collapse runs of spaces;
    8. expand "s/" to "sur" and "s/s" or "ss" to "sous";
    9. drop a leading "lieu-dit";
    10. restore the leading zero of a 4-digit postcode found between
        two spaces.

    :param q: the raw query string.
    :return: the cleaned query, stripped of surrounding whitespace.
    """
    q = re.sub(
        r'(^| )((b(oi)?te|case) postale|b\.?p\.?|cs|tsa|cidex)'
        r' *(n(o|°|) *|)[\d]+ *',
        r'\1', q, flags=re.IGNORECASE)
    q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2',
               q, flags=re.IGNORECASE)
    # Use a lookahead for the character following the postcode: consuming
    # it would drop it from the output ("Vosges93290Paris" -> "... aris").
    q = re.sub(r'([^\d ])([\d]{5})(?!\d)', r'\1 \2 ',
               q, flags=re.IGNORECASE)
    q = re.sub(r'c(e|é)dex ?[\d]*', '', q, flags=re.IGNORECASE)
    q = re.sub(r'\d{,2}(e|[eè]me|er) ([eé]tage)', '', q,
               flags=re.IGNORECASE)
    # Every separator class lists '-' first so that it is a literal dash
    # and not a range (" -." would span from space to dot).
    q = re.sub(
        r'((fax|t[eé]l|t[eé]l[eé]copieur)[ :,\.]*|)'
        r'(\d{10}|[0-9][0-9][-\./ ]\d\d[-\./ ]\d\d[-\./ ]\d\d[-\./ ]\d\d)',
        '', q, flags=re.IGNORECASE)
    q = re.sub(r' {2,}', ' ', q, flags=re.IGNORECASE)
    q = re.sub(r'[ -]s/[ -]', ' sur ', q, flags=re.IGNORECASE)
    q = re.sub(r'[ -]s/?s[ -]', ' sous ', q, flags=re.IGNORECASE)
    q = re.sub(r'^lieux?[ -]?dits?\b(?=.)', '', q, flags=re.IGNORECASE)
    q = re.sub(r' (\d{4}) ', r' 0\1 ', q, flags=re.IGNORECASE)
    q = q.strip()
    return q

Expand All @@ -75,7 +84,11 @@ def neighborhood(iterable, first=None, last=None):
"""
iterator = iter(iterable)
previous = first
current = next(iterator) # Throws StopIteration if empty.
try:
current = next(iterator)
except StopIteration: # StopIteration if empty.
return

for next_ in iterator:
yield (previous, current, next_)
previous = current
Expand Down Expand Up @@ -108,7 +121,7 @@ def flag_housenumber(tokens):
found = False
for previous, token, next_ in neighborhood(tokens):
if ((token.is_first or (next_ and TYPES_PATTERN.match(next_)))
and NUMBER_PATTERN.match(token) and not found):
and NUMBER_PATTERN.match(token) and not found):
token.kind = 'housenumber'
found = True
yield token
Expand All @@ -123,10 +136,58 @@ def fold_ordinal(s):
pass
else:
s = s.update('{}{}'.format(number,
FOLD.get(ordinal.lower(), ordinal)))
FOLD.get(ordinal.lower(), ordinal)))
return s


GLUE_WORDS = ["mont", "val", "le", "la", "l", "champ"]


def glue_words(tokens):
    """Add a glued variant after usual prefix words.

    'MONT GRIFFON' produces the extra token 'MONTGRIFFON' between the two
    original tokens. A token is glued to its successor only when the token
    is listed in GLUE_WORDS and the successor is purely alphabetic and
    longer than two characters. A token equal to its successor is not
    yielded on its own.
    """
    for _, current, following in neighborhood(tokens):
        if current != following:
            yield current
        if current not in GLUE_WORDS or not following:
            continue
        if following.isalpha() and len(following) > 2:
            yield current.update(current + following)


def glue_initials(tokens):
    """Glue runs of single-letter tokens: 'F F I' becomes 'FFI'.

    Only runs of more than two consecutive one-letter alphabetic tokens
    are merged (into an update of the first token of the run); shorter
    runs are yielded unchanged. Every other token is passed through.
    """
    pending = []
    for _, current, following in neighborhood(tokens):
        single_letter = len(current) == 1 and current.isalpha()
        if single_letter:
            pending.append(current)
            if following:
                # The run may still grow: decide on the next token.
                continue
        # End of a run (or no run at all): flush what was collected.
        if len(pending) > 2:
            yield pending[0].update("".join(pending))
        else:
            yield from pending
        pending = []
        if not single_letter:
            yield current


GLUE_REFS = re.compile(r'^(a|n|rn|d|rd|m|rm)[0-9]+$', flags=re.IGNORECASE)


def glue_refs(tokens):
    """Glue road references split across tokens: 'D 412' becomes 'D412'.

    Consecutive tokens are concatenated for as long as the result matches
    GLUE_REFS (a road prefix followed by digits); the glued reference is
    yielded in place of the last token of the run. A leading 'r' is
    dropped from 'rn'/'rd' references ('rd30' becomes 'd30'). Tokens that
    are not part of a reference are yielded unchanged.
    """
    # NOTE(review): the 'r' stripping below is case-sensitive, unlike
    # GLUE_REFS — presumably tokens are already lowercased; confirm.
    ref = None
    for _, token, next_ in neighborhood(tokens):
        if next_ and GLUE_REFS.match(token + next_):
            # Start a reference; nothing is yielded until it is complete.
            ref = token + next_
        elif next_ and ref and GLUE_REFS.match(ref + next_):
            # The pending reference can absorb one more token.
            ref = ref + next_
        elif ref:
            # The current token was the last one absorbed: emit the result.
            yield token.update(re.sub(r'^r(n|d)', r'\1', ref))
            ref = None
        elif GLUE_REFS.match(token):
            yield token.update(re.sub(r'^r(n|d)', r'\1', token))
        else:
            yield token


def remove_leading_zeros(s):
"""0003 => 3."""
# Limit digits from 1 to 3 in order to avoid processing postcodes.
Expand Down
98 changes: 92 additions & 6 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,30 +8,43 @@
from addok.helpers.text import Token
from addok_france.utils import (clean_query, extract_address, flag_housenumber,
fold_ordinal, glue_ordinal, make_labels,
remove_leading_zeros)
remove_leading_zeros, glue_words,
glue_initials, glue_refs)


@pytest.mark.parametrize("input,expected", [
("2 allée Jules Guesde 31068 TOULOUSE CEDEX 7",
"2 allée Jules Guesde 31068 TOULOUSE"),
"2 allée Jules Guesde 31 TOULOUSE"),
("7, avenue Léon-Blum 31507 Toulouse Cedex 5",
"7, avenue Léon-Blum 31507 Toulouse"),
"7, avenue Léon-Blum 31 Toulouse"),
("159, avenue Jacques-Douzans 31604 Muret Cedex",
"159, avenue Jacques-Douzans 31604 Muret"),
"159, avenue Jacques-Douzans 31 Muret"),
("2 allée Jules Guesde BP 7015 31068 TOULOUSE",
"2 allée Jules Guesde 31068 TOULOUSE"),
("2 allée Jules Guesde B.P. 7015 31068 TOULOUSE",
"2 allée Jules Guesde 31068 TOULOUSE"),
("2 allée Jules Guesde B.P. N 7015 31068 TOULOUSE",
"2 allée Jules Guesde 31068 TOULOUSE"),
("BP 80111 159, avenue Jacques-Douzans 31604 Muret",
"159, avenue Jacques-Douzans 31604 Muret"),
("12, place de l'Hôtel-de-Ville BP 46 02150 Sissonne",
"12, place de l'Hôtel-de-Ville 02150 Sissonne"),
("12, place de l'Hôtel-de-Ville boite postale 46 02150 Sissonne",
"12, place de l'Hôtel-de-Ville 02150 Sissonne"),
("12, place de l'Hôtel-de-Ville case postale 46 02150 Sissonne",
"12, place de l'Hôtel-de-Ville 02150 Sissonne"),
("12, place de l'Hôtel-de-Ville bte postale 46 02150 Sissonne",
"12, place de l'Hôtel-de-Ville 02150 Sissonne"),
("6, rue Winston-Churchill CS 40055 60321 Compiègne",
"6, rue Winston-Churchill 60321 Compiègne"),
("BP 80111 159, avenue Jacques-Douzans 31604 Muret Cedex",
"159, avenue Jacques-Douzans 31604 Muret"),
"159, avenue Jacques-Douzans 31 Muret"),
("BP 20169 Cite administrative - 8e étage Rue Gustave-Delory 59017 Lille",
"Cite administrative - Rue Gustave-Delory 59017 Lille"),
("12e étage Rue Gustave-Delory 59017 Lille",
"Rue Gustave-Delory 59017 Lille"),
("Rue Gustave-Delory 1er étage 59017 Lille",
"Rue Gustave-Delory 59017 Lille"),
("12eme étage Rue Gustave-Delory 59017 Lille",
"Rue Gustave-Delory 59017 Lille"),
("12ème étage Rue Gustave-Delory 59017 Lille",
Expand All @@ -41,6 +54,7 @@
("air s/ l'adour", "air sur l'adour"),
("air-s/-l'adour", "air sur l'adour"),
("Saint Didier s/s Ecouves", "Saint Didier sous Ecouves"),
("Saint Didier ss Ecouves", "Saint Didier sous Ecouves"),
("La Chapelle-aux-Brocs", "La Chapelle-aux-Brocs"),
("Lieu-Dit Les Chênes", "Les Chênes"),
("Lieu Dit Les Chênes", "Les Chênes"),
Expand All @@ -52,9 +66,33 @@
("32bis Rue des Vosges93290",
"32bis Rue des Vosges 93290"),
("20 avenue de Ségur TSA 30719 75334 Paris Cedex 07",
"20 avenue de Ségur 75334 Paris"),
"20 avenue de Ségur 75 Paris"),
("20 avenue de Ségur TSA No30719 75334 Paris Cedex 07",
"20 avenue de Ségur 75 Paris"),
("20 avenue de Ségur TSA N 30719 75334 Paris Cedex 07",
"20 avenue de Ségur 75 Paris"),
("20 avenue de Ségur TSA N°30719 75334 Paris Cedex 07",
"20 avenue de Ségur 75 Paris"),
("20 rue saint germain CIDEX 304 89110 Poilly-sur-tholon",
"20 rue saint germain 89110 Poilly-sur-tholon"),
("20 rue saint germain CIDEX N°304 89110 Poilly-sur-tholon",
"20 rue saint germain 89110 Poilly-sur-tholon"),
("20 rue saint germain 89110 Poilly-sur-tholon 01.23.45.67.89",
"20 rue saint germain 89110 Poilly-sur-tholon"),
("32bis Rue des Vosges93290 fax: 0123456789",
"32bis Rue des Vosges 93290"),
("32bis Rue des Vosges 93290 tel 01 23 45 67 89",
"32bis Rue des Vosges 93290"),
("32bis Rue des Vosges 93290 telecopieur. 01/23/45/67/89",
"32bis Rue des Vosges 93290"),
("32bis Rue des Vosges 93290 télécopieur, 01-23-45-67-89",
"32bis Rue des Vosges 93290"),
("10 BLD DES F F I 85300 CHALLANS",
"10 BLD DES F F I 85300 CHALLANS"), # done by glue_initials
("6 rue de suisse 6000 Nice",
"6 rue de suisse 06000 Nice"),
("6000 rue de suisse 6000 Nice",
"6000 rue de suisse 06000 Nice"),
])
def test_clean_query(input, expected):
assert clean_query(input) == expected
Expand Down Expand Up @@ -102,6 +140,11 @@ def test_clean_query(input, expected):
"boulevard jean larrieu 44000 mont de marsan"),
("PARC D ACTIVITE DE SAUMATY 26 AV ANDRE ROUSSIN 13016 MARSEILLE 16",
"26 AV ANDRE ROUSSIN 13016 MARSEILLE 16"),
# Abréviations
("resid goelands 28 bis imp des petrels 76460 Saint-valery-en-caux",
"28 bis imp des petrels 76460 Saint-valery-en-caux"),
("bla bla bl 28 r des moulins",
"28 r des moulins"),
("Non matching pattern",
"Non matching pattern"),
])
Expand Down Expand Up @@ -132,6 +175,18 @@ def test_glue_ordinal(inputs, expected):
assert list(glue_ordinal(tokens)) == expected


@pytest.mark.parametrize("inputs,expected", [
    (['d', '412'], ['d412']),
    (['rd', '30'], ['d30']),
    (['d', '30', 'a', '4'], ['d30', 'a4']),
    (['route', 'd', '30', 'a', '4', 'b'], ['route', 'd30', 'a4', 'b']),
    (['route', '30', 'a'], ['route', '30', 'a']),
])
def test_glue_refs(inputs, expected):
    """Road references split over several tokens are glued back."""
    tokens = list(map(Token, inputs))
    glued = list(glue_refs(tokens))
    assert glued == expected


@pytest.mark.parametrize("inputs,expected", [
(['6b'], True),
(['6'], True),
Expand Down Expand Up @@ -331,3 +386,34 @@ def test_make_municipality_labels(config):
'59000 Lille',
'Lille 59000',
]


@pytest.mark.parametrize("inputs,expected", [
    (['mont', 'griffon'], ['mont', 'montgriffon', 'griffon']),
    (['champ', 'vallon'], ['champ', 'champvallon', 'vallon']),
    (['val', 'suzon'], ['val', 'valsuzon', 'suzon']),
    (['l', 'a', 'peu', 'pres'], ['l', 'a', 'peu', 'pres']),
    (['l', 'un', 'des'], ['l', 'un', 'des']),
])
def test_glue_words(inputs, expected):
    """Usual prefix words get an extra glued token; short words do not."""
    # Named test_glue_words (not test_glue_ordinal) so that it does not
    # shadow the glue_ordinal test defined earlier in this module.
    tokens = [Token(input_) for input_ in inputs]
    assert list(glue_words(tokens)) == expected


@pytest.mark.parametrize("inputs,expected", [
    (['allee', 'a', 'b', 'c', 'toto'], ['allee', 'abc', 'toto']),
    (['allee', 'a', 'b', 'c', 'toto', 'd', 'e', 'f'],
     ['allee', 'abc', 'toto', 'def']),
    (['allee', 'a', '2', 'c', 'toto'], ['allee', 'a', '2', 'c', 'toto']),
    (['allee', 'a', 'b', 'c'], ['allee', 'abc']),
    (['allee', 'a', 'b', 'c', 'd'], ['allee', 'abcd']),
    (['allee', 'a', 'b', 'c', 'd', 'e'], ['allee', 'abcde']),
])
def test_glue_initials(inputs, expected):
    """Runs of three or more single letters are merged into one token."""
    tokens = list(map(Token, inputs))
    glued = list(glue_initials(tokens))
    assert glued == expected