Skip to content

Commit

Permalink
Address handling fixes in import script (#188)
Browse files Browse the repository at this point in the history
- Include spreadsheet distance error calculations in building notes
- Reduce warning distance to 100 meters
- Convert to using spreadsheet lat/lon with a warning about discrepancies
- Add fixup for common address parsing problems
  • Loading branch information
Andrew-Dickinson committed Feb 14, 2024
1 parent a85142e commit 3777746
Show file tree
Hide file tree
Showing 4 changed files with 185 additions and 49 deletions.
43 changes: 34 additions & 9 deletions src/meshdb/utils/spreadsheet_import/building/pelias.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,37 @@
import logging
import re
from typing import List, Optional, Tuple

from django.conf import os
from typing import List, Tuple

import inflect
import requests
from django.conf import os

from meshdb.utils.spreadsheet_import.building.constants import (
DatabaseAddress,
NormalizedAddressVariant,
)
from meshdb.utils.spreadsheet_import.building.constants import DatabaseAddress, NormalizedAddressVariant
from meshdb.utils.spreadsheet_import.building.us_state_codes import convert_state_name_to_code

PELIAS_ADDRESS_PARSER_URL = os.environ.get("PELIAS_ADDRESS_PARSER_URL")


def adjust_index_down(index_val: int, start_val: int, shift_amount: int):
    """Shift an index left by ``shift_amount`` if it sits at or after ``start_val``.

    Indices strictly before ``start_val`` are returned untouched. This is used
    to map character offsets in a string that had text inserted at
    ``start_val`` back onto offsets in the string before the insertion.
    """
    if index_val < start_val:
        # Located before the threshold, so nothing to compensate for
        return index_val

    return index_val - shift_amount


def call_pelias_parser(address_str: str) -> List[Tuple[float, dict, dict]]:
response = requests.get(PELIAS_ADDRESS_PARSER_URL, params={"text": address_str})
# "Bowery" is a weird street that Pelias doesn't handle well. Here we
# (incorrectly) add "Street" for pelias parsing, which we will look for and
# remove in the results
pattern = re.compile("bowery", re.IGNORECASE)
modified_addr = pattern.sub("Bowery Street", address_str)

bowery_detected = modified_addr != address_str
bowery_word_end = None
if bowery_detected:
bowery_word_end = re.search(pattern, address_str).end()

response = requests.get(PELIAS_ADDRESS_PARSER_URL, params={"text": modified_addr})
output = []

for solution in response.json()["solutions"]:
Expand All @@ -44,8 +58,19 @@ def call_pelias_parser(address_str: str) -> List[Tuple[float, dict, dict]]:
]:
continue

if label == "street" and bowery_detected:
# Remove our "Bowery Street" hack from above
classification["value"] = classification["value"].replace("Bowery Street", "Bowery")

components[label] = classification["value"]
indices[label] = (classification["start"], classification["end"])

if bowery_detected:
indices[label] = (
adjust_index_down(classification["start"], bowery_word_end, 7),
adjust_index_down(classification["end"], bowery_word_end, 7),
)
else:
indices[label] = (classification["start"], classification["end"])

output.append((solution["score"], components, indices))

Expand Down
101 changes: 69 additions & 32 deletions src/meshdb/utils/spreadsheet_import/building/resolve_address.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import dataclasses
import logging
import re
from typing import Callable, List, Optional, Tuple

import geopy.distance
Expand Down Expand Up @@ -85,6 +85,55 @@ def osm_location_is_in_nyc(osm_raw_addr: dict) -> bool:
)


def fixup_bad_address(bad_address: str) -> str:
    """Apply heuristic repairs to an address string that could not be parsed.

    The repairs are, in order:

    1. Collapse runs of whitespace into single spaces.
    2. Split a number glued to "st" ("45st" -> "45 St"). Genuine ordinals
       such as "1st", "21st" or "101st" are left untouched.
    3. Split a number glued to "ave" ("5Ave" -> "5 Ave").
    4. Split an E/W direction glued to a number ("E45" -> "E 45").
    5. Replace a list of known typos (case-insensitively).
    6. Append "Street" to a few known addresses that are commonly written
       without a street type, unless a street type already follows.

    Steps 2-4 only touch the first occurrence they find.

    :param bad_address: the raw address text
    :return: the repaired address text (unchanged if no rule applied)
    """
    # Multiple spaces between sections can confuse Pelias
    modified_addr = " ".join(bad_address.split())

    # "45st" -> "45 St". A number ending in 1 (but not 11) followed by a
    # standalone "st" is a real ordinal ("21st"), which must not be split or
    # "21st Street" turns into "21 St Street".
    for st_match in re.finditer(r"(\d+)st", modified_addr, re.IGNORECASE):
        number = st_match[1]
        following_char = modified_addr[st_match.end() : st_match.end() + 1]
        is_ordinal = number.endswith("1") and not number.endswith("11") and not following_char.isalpha()
        if is_ordinal:
            continue
        modified_addr = modified_addr[: st_match.start()] + number + " St" + modified_addr[st_match.end() :]
        break

    # "5Ave" -> "5 Ave" (first occurrence only)
    modified_addr = re.sub(r"(\d+)ave", r"\1 Ave", modified_addr, count=1, flags=re.IGNORECASE)

    # "E45" -> "E 45", preserving the case of the direction letter
    modified_addr = re.sub(r"([EeWw])(\d+)", r"\1 \2", modified_addr, count=1)

    simple_typo_substitutions = {
        "steet": "Street",
        "avue": "Avenue",
        "concoourse": "Concourse",
        ";": ",",
        "Aveune": "Avenue",
        "nlvd": "Boulevard",
    }
    for typo, fix in simple_typo_substitutions.items():
        modified_addr = re.sub(re.escape(typo), fix, modified_addr, flags=re.IGNORECASE)

    # Known addresses that tend to be entered without a street type. The word
    # boundaries stop "1410 Grand" / "410 Grandview" from matching, and the
    # lookahead stops us from producing "Grand Street Street" or from
    # clobbering a different street such as "Grand Concourse".
    no_street_type_follows = r"\b(?!\s+(?:St|Street|Ave|Avenue|Concourse|Blvd|Boulevard)\b)"
    missing_street_type_fixes = {
        "410 Grand": "410 Grand Street",
        "460 Grand": "460 Grand Street",
        "131 Broome": "131 Broome Street",
    }
    for partial, fix in missing_street_type_fixes.items():
        modified_addr = re.sub(
            r"\b" + re.escape(partial) + no_street_type_follows,
            fix,
            modified_addr,
            flags=re.IGNORECASE,
        )

    return modified_addr


class AddressParser:
def __init__(self):
self.geolocator = get_geolocator()
Expand Down Expand Up @@ -331,19 +380,25 @@ def parse_address(
# for dropped edits, to avoid runtime errors
add_dropped_edit = lambda x: None

row.address = row.address.strip(
input_address = row.address.strip(
"., " # Leading / trailing whitespace and punctuation can cause issues
# and should never be semantically meaningful
)

pelias_response = call_pelias_parser(row.address)
pelias_response = call_pelias_parser(input_address)
if not pelias_response:
raise AddressError(f"Invalid address: '{row.address}'. No components detected")
logging.warning(f"Detected invalid address '{input_address}'. Trying some common substitutions to fix it")
input_address = fixup_bad_address(input_address)
pelias_response = call_pelias_parser(input_address)
if not pelias_response:
raise AddressError(
f"Invalid address: '{input_address}'. No components detected, even after attempting fixes"
)

try:
if pelias_response[0][0] < PELIAS_SCORE_WARNING_THRESHOLD:
logging.debug(
f"Got low score of {pelias_response[0][0]} from " f"Pelias when parsing address '{row.address}'"
f"Got low score of {pelias_response[0][0]} from " f"Pelias when parsing address '{input_address}'"
)

required_components = ["housenumber", "street"]
Expand All @@ -352,41 +407,41 @@ def parse_address(
# to parse into a specific building with OSM or the NYC Planning API,
# Fall back to string parsing
raise AddressError(
f"Invalid address: '{row.address}'. All of "
f"Invalid address: '{input_address}'. All of "
f"{required_components} are required and at least one is missing"
)

osm_db_addr = pelias_to_database_address_components(
row.address, pelias_response[0], NormalizedAddressVariant.OSMNominatim
input_address, pelias_response[0], NormalizedAddressVariant.OSMNominatim
)
normalized_osm_addr = database_address_components_to_normalized_address_string(osm_db_addr)

closest_osm_location = self._get_closest_osm_location(normalized_osm_addr, (row.latitude, row.longitude))

if not closest_osm_location:
raise AddressError(f"Unable to find '{row.address}' in OSM database")
raise AddressError(f"Unable to find '{input_address}' in OSM database")

if closest_osm_location.raw["type"] in ["postcode", "administrative", "neighbourhood"]:
# Fall back to string parsing for vague place descriptions
raise AddressError(
f"Address '{row.address}' is not substantial enough to resolve " f"to a specific place"
f"Address '{input_address}' is not substantial enough to resolve to a specific place"
)

if osm_location_is_in_nyc(closest_osm_location.raw["address"]):
# We are in NYC, call the city planning API
result = self._find_nyc_building(
row.address, pelias_response[0], (row.latitude, row.longitude), row.bin
input_address, pelias_response[0], (row.latitude, row.longitude), row.bin
)
else:
# We are not in NYC, the best we can do is the OSM geolocation
r_addr = closest_osm_location.raw["address"]

for prop in ["house_number", "road", "ISO3166-2-lvl4", "postcode"]:
if prop not in r_addr:
raise AddressError(f"Invalid address '{row.address}' - {prop} not found in OSM data")
raise AddressError(f"Invalid address '{input_address}' - {prop} not found in OSM data")

if not any(prop in r_addr for prop in ["city", "town", "village"]):
raise AddressError(f"Invalid address '{row.address}' - city/town/village not found in OSM data")
raise AddressError(f"Invalid address '{input_address}' - city/town/village not found in OSM data")

city, state = convert_osm_city_village_suburb_nonsense(r_addr)

Expand All @@ -406,32 +461,14 @@ def parse_address(
)
except AddressError:
logging.debug(
f"Error locating '{row.address}'. Falling back to string parsing. "
f"Error locating '{input_address}'. Falling back to string parsing. "
f"Is this address valid and located in the NYC metro area?"
)
return self._parse_pelias_result_to_answer_and_fill_gaps_with_geocode(
row.address,
input_address,
pelias_response[0],
sources=[AddressTruthSource.PeliasStringParsing],
spreadsheet_latlon=(row.latitude, row.longitude),
)

error_vs_google = geopy.distance.geodesic(result.discovered_lat_lon, (row.latitude, row.longitude)).m
if error_vs_google > 200:
add_dropped_edit(
DroppedModification(
[row.id],
row.id,
result.discovered_bin if result.discovered_bin else result.address.street_address,
"lat_long_discrepancy_vs_spreadsheet",
str(result.discovered_lat_lon),
str((row.latitude, row.longitude)),
)
)
logging.debug(
f"Mismatch vs spreadsheet of {error_vs_google} meters for address '{row.address}'"
f" for install # {row.id}. Wrong borough or city? We think this address is in "
f"{result.address.city}, {result.address.state}"
)

return result
48 changes: 40 additions & 8 deletions src/meshdb/utils/spreadsheet_import/parse_building.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
from typing import Callable, Optional, Tuple

from geopy import Nominatim
import geopy.distance

from meshapi import models
from meshapi.exceptions import AddressError
Expand Down Expand Up @@ -197,8 +197,36 @@ def get_or_create_building(
except AddressError as e:
return None

latitude = address_result.discovered_lat_lon[0] if address_result.discovered_lat_lon else row.latitude
longitude = address_result.discovered_lat_lon[1] if address_result.discovered_lat_lon else row.longitude
distance_warning = ""
error_vs_google = geopy.distance.geodesic(address_result.discovered_lat_lon, (row.latitude, row.longitude)).m
if error_vs_google > 100:
add_dropped_edit(
DroppedModification(
[row.id],
row.id,
address_result.discovered_bin
if address_result.discovered_bin
else address_result.address.street_address,
"lat_long_discrepancy_vs_spreadsheet",
str(address_result.discovered_lat_lon),
str((row.latitude, row.longitude)),
)
)
distance_warning = (
f"WARNING: Mismatch vs spreadsheet lat/lon {str((row.latitude, row.longitude))} "
f"of {error_vs_google} meters\n"
)
logging.debug(
f"Mismatch vs spreadsheet of {error_vs_google} meters for address '{row.address}'"
f" for install # {row.id}. Wrong borough or city? We think this address is in "
f"{address_result.address.city}, {address_result.address.state}"
)

addr_latitude = address_result.discovered_lat_lon[0] if address_result.discovered_lat_lon else None
addr_longitude = address_result.discovered_lat_lon[1] if address_result.discovered_lat_lon else None

latitude = row.latitude
longitude = row.longitude
altitude = (
# TODO: Change this to match new DOB ID if changed from spreadsheet?
# Would require another API call
Expand Down Expand Up @@ -232,6 +260,11 @@ def get_or_create_building(

return existing_building

addr_truth_sources = (
",".join(source.value for source in address_result.truth_sources)
if address_result.truth_sources
else None # This is so we throw an exception and notice when we are missing sources
)
return Building(
bin=address_result.discovered_bin,
street_address=address_result.address.street_address,
Expand All @@ -250,15 +283,14 @@ def get_or_create_building(
for source in [AddressTruthSource.NYCPlanningLabs, AddressTruthSource.OSMNominatim]
)
),
address_truth_sources=",".join(source.value for source in address_result.truth_sources)
if address_result.truth_sources
else None, # This is so we throw an exception and notice when we are missing sources
address_truth_sources=addr_truth_sources,
primary_nn=row.nn if row.nn else None,
node_name=row.nodeName if row.nodeName else None,
# Let's not throw away the spreadsheet location information, just in case it's useful in
# chasing down a mis-parsed address in the future
notes=f"Spreadsheet Address: {row.address}\n"
f"Spreadsheet Neighborhood: {row.neighborhood}\n"
f"Spreadsheet BIN: {dob_bin}\n"
f"Spreadsheet Coordinates: {row.latitude}, {row.longitude}, {row.altitude}\n",
f"Spreadsheet BIN: {dob_bin}\n\n"
f"Our Expected Coordinates (from {addr_truth_sources}): {addr_latitude}, {addr_longitude}\n"
f"{distance_warning}",
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from meshdb.utils.spreadsheet_import.building.resolve_address import call_pelias_parser, fixup_bad_address


def test_fixup():
    """Check fixup_bad_address() against a table of (raw input, expected output) pairs."""
    expectations = [
        # Number glued to "st"
        ("244 E 45st New York", "244 E 45 St New York"),
        ("244 E 45St New York", "244 E 45 St New York"),
        ("244 E 45ST New York", "244 E 45 St New York"),
        ("244 E 45 St New York", "244 E 45 St New York"),
        # Number glued to "ave"
        ("244 5Ave New York", "244 5 Ave New York"),
        ("244 5AVE New York", "244 5 Ave New York"),
        ("244 5 Ave New York", "244 5 Ave New York"),
        # Misspelled street types, in assorted letter cases
        ("357 13th Steet Apt #2", "357 13th Street Apt #2"),
        ("357 13th STEET Apt #2", "357 13th Street Apt #2"),
        ("357 13th steet Apt #2", "357 13th Street Apt #2"),
        ("357 6th Avue", "357 6th Avenue"),
        ("357 6th steet", "357 6th Street"),
        ("357 Grand concoourse", "357 Grand Concourse"),
        # East / West direction glued to the street number
        ("244 E45 St New York", "244 E 45 St New York"),
        ("244 e45St New York", "244 e 45 St New York"),
        ("244 W45St New York", "244 W 45 St New York"),
        ("244 Abc nlvd New York", "244 Abc Boulevard New York"),
        # Several fixes combined, including ";" -> ","
        ("244 W 45St New York", "244 W 45 St New York"),
        ("244 W 45St New York; 10023", "244 W 45 St New York, 10023"),
    ]

    for raw_address, expected in expectations:
        assert fixup_bad_address(raw_address) == expected


def test_pelias_bowery():
    """Check the components and character indices returned for "Bowery" addresses."""
    # If this is failing, it's probably because you don't have connectivity
    # to a pelias parser, maybe you need to run it with Docker?

    with_city = call_pelias_parser("123 Bowery, New York, NY")
    _, components, indices = with_city[0]
    assert components == {"housenumber": "123", "locality": "New York", "region": "NY", "street": "Bowery"}
    assert indices == {"housenumber": (0, 3), "locality": (12, 20), "region": (22, 24), "street": (4, 10)}

    street_only = call_pelias_parser("123 Bowery")
    _, components, indices = street_only[0]
    assert components == {"housenumber": "123", "street": "Bowery"}
    assert indices == {"housenumber": (0, 3), "street": (4, 10)}

0 comments on commit 3777746

Please sign in to comment.