Skip to content

Commit

Permalink
Add fixup for common address parsing problems
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrew-Dickinson committed Feb 14, 2024
1 parent bf77982 commit 65d125d
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 23 deletions.
43 changes: 34 additions & 9 deletions src/meshdb/utils/spreadsheet_import/building/pelias.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,37 @@
import logging
import re
from typing import List, Optional, Tuple

from django.conf import os
from typing import List, Tuple

import inflect
import requests
from django.conf import os

from meshdb.utils.spreadsheet_import.building.constants import (
DatabaseAddress,
NormalizedAddressVariant,
)
from meshdb.utils.spreadsheet_import.building.constants import DatabaseAddress, NormalizedAddressVariant
from meshdb.utils.spreadsheet_import.building.us_state_codes import convert_state_name_to_code

PELIAS_ADDRESS_PARSER_URL = os.environ.get("PELIAS_ADDRESS_PARSER_URL")


def adjust_index_down(index_val: int, start_val: int, shift_amount: int) -> int:
    """Map an index past a removed span back onto the shorter string.

    Describes the removal of ``shift_amount`` characters that began at
    position ``start_val``.

    :param index_val: index into the longer string
    :param start_val: position at which the removed characters began
    :param shift_amount: number of characters that were removed
    :return: ``index_val`` unchanged when it lies before ``start_val``;
        otherwise ``index_val - shift_amount``, but never less than
        ``start_val``
    """
    if index_val < start_val:
        return index_val

    # An index that sits exactly at start_val, or inside the removed span, has
    # no counterpart after the removal point. Clamp it to start_val rather than
    # shifting it to before the span, which would yield inverted (start > end)
    # or even negative ranges.
    return max(start_val, index_val - shift_amount)


def call_pelias_parser(address_str: str) -> List[Tuple[float, dict, dict]]:
response = requests.get(PELIAS_ADDRESS_PARSER_URL, params={"text": address_str})
# "Bowery" is a weird street that Pelias doesn't handle well. Here we
# (incorrectly) add "Street" for pelias parsing, which we will look for and
# remove in the results
pattern = re.compile("bowery", re.IGNORECASE)
modified_addr = pattern.sub("Bowery Street", address_str)

bowery_detected = modified_addr != address_str
bowery_word_end = None
if bowery_detected:
bowery_word_end = re.search(pattern, address_str).end()

response = requests.get(PELIAS_ADDRESS_PARSER_URL, params={"text": modified_addr})
output = []

for solution in response.json()["solutions"]:
Expand All @@ -44,8 +58,19 @@ def call_pelias_parser(address_str: str) -> List[Tuple[float, dict, dict]]:
]:
continue

if label == "street" and bowery_detected:
# Remove our "Bowery Street" hack from above
classification["value"] = classification["value"].replace("Bowery Street", "Bowery")

components[label] = classification["value"]
indices[label] = (classification["start"], classification["end"])

if bowery_detected:
indices[label] = (
adjust_index_down(classification["start"], bowery_word_end, 7),
adjust_index_down(classification["end"], bowery_word_end, 7),
)
else:
indices[label] = (classification["start"], classification["end"])

output.append((solution["score"], components, indices))

Expand Down
83 changes: 69 additions & 14 deletions src/meshdb/utils/spreadsheet_import/building/resolve_address.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import dataclasses
import logging
import re
from typing import Callable, List, Optional, Tuple

import geopy.distance
Expand Down Expand Up @@ -85,6 +85,55 @@ def osm_location_is_in_nyc(osm_raw_addr: dict) -> bool:
)


# A street-type abbreviation glued onto the street number ("45st", "5AVE")
_ST_NO_SPACE_PATTERN = re.compile(r"(\d+)[Ss][Tt]")
_AVE_NO_SPACE_PATTERN = re.compile(r"(\d+)[Aa][Vv][Ee]")

# An East/West prefix glued onto the street number ("E45", "w45"). The word
# boundary keeps this from splitting words that merely end in "e"/"w".
_EAST_WEST_NO_SPACE_PATTERN = re.compile(r"\b([EeWw])(\d+)")

# Literal typos (matched case-insensitively) and their replacements. The typo
# text is escaped so that it is never interpreted as regex syntax.
_SIMPLE_TYPO_PATTERNS = [
    (re.compile(re.escape(typo), re.IGNORECASE), fix)
    for typo, fix in {
        "steet": "Street",
        "avue": "Avenue",
        "concoourse": "Concourse",
        ";": ",",
        "Aveune": "Avenue",
        "nlvd": "Boulevard",
    }.items()
]

# Specific addresses that are commonly written without their "Street" suffix.
# The lookbehind stops "410 Grand" from matching inside "1410 Grand", the word
# boundary stops it from matching "410 Grandview", and the lookahead leaves
# addresses alone when a street type already follows ("410 Grand Street",
# "410 Grand Concourse").
_STREET_TYPE_LOOKAHEAD = r"(?!\s+(?:St|Street|Ave|Avenue|Blvd|Boulevard|Concourse)\b)"
_MISSING_STREET_SUFFIX_PATTERNS = [
    (
        re.compile(r"(?<!\d)" + re.escape(partial) + r"\b" + _STREET_TYPE_LOOKAHEAD, re.IGNORECASE),
        fix,
    )
    for partial, fix in {
        "410 Grand": "410 Grand Street",
        "460 Grand": "460 Grand Street",
        "131 Broome": "131 Broome Street",
    }.items()
]


def _split_number_from_st(match: re.Match) -> str:
    """Return the replacement text for one ``<digits>st`` match.

    Well-formed English ordinals ("1st", "21st", but not "11st") that end the
    word are returned untouched; everything else ("45st", "45Street") gets a
    space and a normalized "St" inserted after the number.
    """
    number = match[1]
    following_char = match.string[match.end() : match.end() + 1]
    is_ordinal = number.endswith("1") and not number.endswith("11") and not following_char.isalnum()
    if is_ordinal:
        return match[0]
    return f"{number} St"


def fixup_bad_address(bad_address: str) -> str:
    """Apply a series of heuristic repairs to a malformed address string.

    The repairs, in order:

    1. Collapse every run of whitespace to a single space and trim the ends.
    2. Insert the missing space in ``<number>st`` / ``<number>ave`` (every
       occurrence), leaving genuine ordinals such as "1st" alone.
    3. Insert the missing space in the first ``E<number>`` / ``W<number>``.
    4. Replace a fixed list of known typos (case-insensitive).
    5. Append "Street" to a fixed list of addresses known to be written
       without it, unless a street type already follows.

    :param bad_address: the raw address text
    :return: the repaired address; inputs that match none of the rules come
        back with only their whitespace normalized
    """
    # Multiple spaces between sections can confuse Pelias
    modified_addr = " ".join(bad_address.split())

    modified_addr = _ST_NO_SPACE_PATTERN.sub(_split_number_from_st, modified_addr)
    modified_addr = _AVE_NO_SPACE_PATTERN.sub(r"\1 Ave", modified_addr)

    # Only the first occurrence is split (as before), so that a later unit
    # designator such as "Apt W3" is not rewritten as well
    modified_addr = _EAST_WEST_NO_SPACE_PATTERN.sub(r"\1 \2", modified_addr, count=1)

    # Typos are corrected before the suffix rules run, so that e.g.
    # "410 Grand steet" is seen as already having its "Street"
    for pattern, fix in _SIMPLE_TYPO_PATTERNS:
        modified_addr = pattern.sub(fix, modified_addr)

    for pattern, fix in _MISSING_STREET_SUFFIX_PATTERNS:
        modified_addr = pattern.sub(fix, modified_addr)

    return modified_addr


class AddressParser:
def __init__(self):
self.geolocator = get_geolocator()
Expand Down Expand Up @@ -331,19 +380,25 @@ def parse_address(
# for dropped edits, to avoid runtime errors
add_dropped_edit = lambda x: None

row.address = row.address.strip(
input_address = row.address.strip(
"., " # Leading / trailing whitespace and punctuation can cause issues
# and should never be semantically meaningful
)

pelias_response = call_pelias_parser(row.address)
pelias_response = call_pelias_parser(input_address)
if not pelias_response:
raise AddressError(f"Invalid address: '{row.address}'. No components detected")
logging.warning(f"Detected invalid address '{input_address}'. Trying some common substitutions to fix it")
input_address = fixup_bad_address(input_address)
pelias_response = call_pelias_parser(input_address)
if not pelias_response:
raise AddressError(
f"Invalid address: '{input_address}'. No components detected, even after attempting fixes"
)

try:
if pelias_response[0][0] < PELIAS_SCORE_WARNING_THRESHOLD:
logging.debug(
f"Got low score of {pelias_response[0][0]} from " f"Pelias when parsing address '{row.address}'"
f"Got low score of {pelias_response[0][0]} from " f"Pelias when parsing address '{input_address}'"
)

required_components = ["housenumber", "street"]
Expand All @@ -352,41 +407,41 @@ def parse_address(
# to parse into a specific building with OSM or the NYC Planning API,
# Fall back to string parsing
raise AddressError(
f"Invalid address: '{row.address}'. All of "
f"Invalid address: '{input_address}'. All of "
f"{required_components} are required and at least one is missing"
)

osm_db_addr = pelias_to_database_address_components(
row.address, pelias_response[0], NormalizedAddressVariant.OSMNominatim
input_address, pelias_response[0], NormalizedAddressVariant.OSMNominatim
)
normalized_osm_addr = database_address_components_to_normalized_address_string(osm_db_addr)

closest_osm_location = self._get_closest_osm_location(normalized_osm_addr, (row.latitude, row.longitude))

if not closest_osm_location:
raise AddressError(f"Unable to find '{row.address}' in OSM database")
raise AddressError(f"Unable to find '{input_address}' in OSM database")

if closest_osm_location.raw["type"] in ["postcode", "administrative", "neighbourhood"]:
# Fall back to string parsing for vague place descriptions
raise AddressError(
f"Address '{row.address}' is not substantial enough to resolve " f"to a specific place"
f"Address '{input_address}' is not substantial enough to resolve to a specific place"
)

if osm_location_is_in_nyc(closest_osm_location.raw["address"]):
# We are in NYC, call the city planning API
result = self._find_nyc_building(
row.address, pelias_response[0], (row.latitude, row.longitude), row.bin
input_address, pelias_response[0], (row.latitude, row.longitude), row.bin
)
else:
# We are not in NYC, the best we can do is the OSM geolocation
r_addr = closest_osm_location.raw["address"]

for prop in ["house_number", "road", "ISO3166-2-lvl4", "postcode"]:
if prop not in r_addr:
raise AddressError(f"Invalid address '{row.address}' - {prop} not found in OSM data")
raise AddressError(f"Invalid address '{input_address}' - {prop} not found in OSM data")

if not any(prop in r_addr for prop in ["city", "town", "village"]):
raise AddressError(f"Invalid address '{row.address}' - city/town/village not found in OSM data")
raise AddressError(f"Invalid address '{input_address}' - city/town/village not found in OSM data")

city, state = convert_osm_city_village_suburb_nonsense(r_addr)

Expand All @@ -406,11 +461,11 @@ def parse_address(
)
except AddressError:
logging.debug(
f"Error locating '{row.address}'. Falling back to string parsing. "
f"Error locating '{input_address}'. Falling back to string parsing. "
f"Is this address valid and located in the NYC metro area?"
)
return self._parse_pelias_result_to_answer_and_fill_gaps_with_geocode(
row.address,
input_address,
pelias_response[0],
sources=[AddressTruthSource.PeliasStringParsing],
spreadsheet_latlon=(row.latitude, row.longitude),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from meshdb.utils.spreadsheet_import.building.resolve_address import call_pelias_parser, fixup_bad_address


def test_fixup():
    """Check fixup_bad_address against known-bad inputs and their expected repairs."""
    expectations = [
        # Street number glued onto "St"
        ("244 E 45st New York", "244 E 45 St New York"),
        ("244 E 45St New York", "244 E 45 St New York"),
        ("244 E 45ST New York", "244 E 45 St New York"),
        ("244 E 45 St New York", "244 E 45 St New York"),
        # Street number glued onto "Ave"
        ("244 5Ave New York", "244 5 Ave New York"),
        ("244 5AVE New York", "244 5 Ave New York"),
        ("244 5 Ave New York", "244 5 Ave New York"),
        # Misspelled "Street", in various capitalizations
        ("357 13th Steet Apt #2", "357 13th Street Apt #2"),
        ("357 13th STEET Apt #2", "357 13th Street Apt #2"),
        ("357 13th steet Apt #2", "357 13th Street Apt #2"),
        # Other simple typos
        ("357 6th Avue", "357 6th Avenue"),
        ("357 6th steet", "357 6th Street"),
        ("357 Grand concoourse", "357 Grand Concourse"),
        # East/West prefix glued onto the street number
        ("244 E45 St New York", "244 E 45 St New York"),
        ("244 e45St New York", "244 e 45 St New York"),
        ("244 W45St New York", "244 W 45 St New York"),
        ("244 Abc nlvd New York", "244 Abc Boulevard New York"),
        # Semicolon used in place of a comma
        ("244 W 45St New York", "244 W 45 St New York"),
        ("244 W 45St New York; 10023", "244 W 45 St New York, 10023"),
    ]

    for raw_address, expected_address in expectations:
        assert fixup_bad_address(raw_address) == expected_address


def test_pelias_bowery():
    """Check the components and indices call_pelias_parser reports for Bowery addresses."""
    # If this is failing, it's probably because you don't have connectivity
    # to a pelias parser, maybe you need to run it with Docker?

    # Each case: (input address, expected components, expected index ranges)
    cases = [
        (
            "123 Bowery, New York, NY",
            {"housenumber": "123", "locality": "New York", "region": "NY", "street": "Bowery"},
            {"housenumber": (0, 3), "locality": (12, 20), "region": (22, 24), "street": (4, 10)},
        ),
        (
            "123 Bowery",
            {"housenumber": "123", "street": "Bowery"},
            {"housenumber": (0, 3), "street": (4, 10)},
        ),
    ]

    for address, expected_components, expected_indices in cases:
        # Each solution is a (score, components, indices) tuple; inspect the first one
        top_solution = call_pelias_parser(address)[0]
        assert top_solution[1] == expected_components
        assert top_solution[2] == expected_indices

0 comments on commit 65d125d

Please sign in to comment.