-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsoundexpy.py
92 lines (78 loc) · 1.91 KB
/
soundexpy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import re
"""
This module encodes a string using Soundex, as described by
http://en.wikipedia.org/w/index.php?title=Soundex&oldid=466065377
Only strings with the letters A-Z and of length >= 2 are supported.
"""
# Characters stripped by normalize(): the vowels plus H, W and Y, and anything
# that is not an upper-case ASCII letter.
invalid_re = re.compile("[AEHIOUWY]|[^A-Z]")

# Soundex digit assigned to each coded consonant.
charsubs = {'B': '1', 'F': '1', 'P': '1', 'V': '1',
            'C': '2', 'G': '2', 'J': '2', 'K': '2',
            'Q': '2', 'S': '2', 'X': '2', 'Z': '2',
            'D': '3', 'T': '3', 'L': '4', 'M': '5',
            'N': '5', 'R': '6'}

# Letters that separate two consonants sharing a digit, so that both are
# coded.  H and W are deliberately absent: they are skipped without
# separating anything.
_VOWELS = frozenset("AEIOUY")


def normalize(s):
    """ Returns a copy of s without invalid chars and repeated letters.

    The first character is upper-cased and always kept.  From the rest of
    the upper-cased string every character matched by invalid_re is removed,
    then runs of the same letter are collapsed to one.  The kept first
    character is not compared with the letter that follows it, so
    normalize("lloyd") is "LLD".  An empty string is returned unchanged.

    soundex() does not call this helper: once the vowels are gone it is no
    longer possible to tell whether two equal digits were adjacent or were
    separated by a vowel.  It is kept for callers that use it directly.

    >>> normalize("ashcraft")
    'ASCRFT'
    >>> normalize("lloyd")
    'LLD'
    """
    if not s:
        return ""
    kept = [s[0].upper()]
    previous = None
    for c in invalid_re.sub("", s.upper()[1:]):
        if c != previous:
            kept.append(c)
            previous = c
    return "".join(kept)


def soundex(s):
    """ Encode a string using Soundex.

    Takes a string and returns its Soundex representation: the upper-cased
    first character followed by three digits.  Returns None when s has
    fewer than two characters.

    Rules applied to the characters after the first one:
    - consonants are replaced by their digit from charsubs;
    - a consonant whose digit equals the digit of the preceding coded
      letter (including the first letter) is dropped;
    - H, W and characters outside A-Z are skipped and do not separate
      equal digits;
    - A, E, I, O, U and Y are not coded but do separate equal digits, so
      the consonant after them is coded again;
    - the result is cut or zero-padded to one letter plus three digits.

    >>> soundex("ashcraft")
    'A261'
    >>> soundex("ashcroft")
    'A261'
    >>> soundex("rubin")
    'R150'
    >>> soundex("robert")
    'R163'
    >>> soundex("rupert")
    'R163'
    >>> soundex("euler")
    'E460'
    >>> soundex("ellery")
    'E460'
    >>> soundex("gauss")
    'G200'
    >>> soundex("ghosh")
    'G200'
    >>> soundex("hilbert")
    'H416'
    >>> soundex("heilbronn")
    'H416'
    >>> soundex("knuth")
    'K530'
    >>> soundex("kant")
    'K530'
    >>> soundex("lloyd")
    'L300'
    >>> soundex("ladd")
    'L300'
    >>> soundex("lukasiewicz")
    'L222'
    >>> soundex("lissajous")
    'L222'
    >>> soundex("tymczak")
    'T522'
    >>> soundex("pfister")
    'P236'
    """
    if len(s) < 2:
        return None

    word = s.upper()
    enc = word[0]
    # Seed with the first letter's own digit so that a following consonant
    # with the same digit is folded into it (the F in "Pfister").
    last = charsubs.get(word[0])
    for c in word[1:]:
        if len(enc) == 4:
            break
        if c in _VOWELS:
            # A vowel breaks the run: the next consonant is coded even if
            # it carries the same digit as the previous one.
            last = None
            continue
        code = charsubs.get(c)
        if code is None:
            # H, W or a non-letter: ignored, and the run is NOT broken.
            continue
        if code != last:
            enc += code
        last = code
    return enc.ljust(4, '0')
if __name__ == "__main__":
    # Run as a script: check the examples embedded in the docstrings.
    from doctest import testmod
    testmod()