diff --git a/src/geocoders/libpostal.rs b/src/geocoders/libpostal.rs
index 4086377..fa998c5 100644
--- a/src/geocoders/libpostal.rs
+++ b/src/geocoders/libpostal.rs
@@ -9,7 +9,7 @@ use crate::{addresses::Address, Result};
 
 use super::{Geocoded, Geocoder};
 
-static COLUMN_NAMES: &[&str] = &[
+pub(crate) static COLUMN_NAMES: &[&str] = &[
     // From
     // https://github.com/OpenCageData/address-formatting/blob/master/conf/components.yaml.
     "archipelago",
diff --git a/src/geocoders/mod.rs b/src/geocoders/mod.rs
index a7869d7..37273d8 100644
--- a/src/geocoders/mod.rs
+++ b/src/geocoders/mod.rs
@@ -19,6 +19,7 @@ pub mod cache;
 pub mod invalid_record_skipper;
 pub mod libpostal;
 pub mod normalizer;
+pub mod paired;
 pub mod smarty;
 
 /// A `hyper` client shared between multiple workers.
diff --git a/src/geocoders/paired.rs b/src/geocoders/paired.rs
new file mode 100644
index 0000000..6c05bd0
--- /dev/null
+++ b/src/geocoders/paired.rs
@@ -0,0 +1,106 @@
+//! Paired geocoder. Runs two geocoders over the same addresses and merges
+//! their output columns into a single result per address.
+//!
+//! Why not both?
+
+use async_trait::async_trait;
+
+use crate::geocoders::{Geocoded, Geocoder};
+use crate::{addresses::Address, Result};
+
+/// A geocoder that runs two geocoders and returns both results.
+pub struct Paired {
+    /// The first geocoder.
+    fst: Box<dyn Geocoder>,
+    /// The second geocoder.
+    snd: Box<dyn Geocoder>,
+    /// The column names output by this geocoder. Includes both sets
+    /// of column names.
+    column_names: Vec<String>,
+    /// The configuration key for this geocoder.
+    config_key: String,
+}
+
+impl Paired {
+    /// Create a new pair of geocoders.
+    ///
+    /// The combined column names are `fst`'s followed by `snd`'s, and the
+    /// configuration key is the two underlying keys joined with `+`.
+    pub fn new(fst: Box<dyn Geocoder>, snd: Box<dyn Geocoder>) -> Paired {
+        let column_names = fst
+            .column_names()
+            .iter()
+            .chain(snd.column_names())
+            .cloned()
+            .collect();
+        let config_key =
+            format!("{}+{}", fst.configuration_key(), snd.configuration_key());
+        Paired {
+            fst,
+            snd,
+            column_names,
+            config_key,
+        }
+    }
+}
+
+#[async_trait]
+impl Geocoder for Paired {
+    fn tag(&self) -> &str {
+        "pair"
+    }
+
+    fn configuration_key(&self) -> &str {
+        &self.config_key
+    }
+
+    fn column_names(&self) -> &[String] {
+        &self.column_names
+    }
+
+    /// Geocode `addresses` with both geocoders and merge the results.
+    ///
+    /// Each entry holds the first geocoder's column values followed by the
+    /// second's, matching `column_names`. If only one geocoder produced a
+    /// result for an address, the other's columns are left blank; if neither
+    /// did, the entry is `None`.
+    ///
+    /// # Errors
+    ///
+    /// Propagates an error from either geocoder; the second is not run if
+    /// the first fails.
+    async fn geocode_addresses(
+        &self,
+        addresses: &[Address],
+    ) -> Result<Vec<Option<Geocoded>>> {
+        let fst = self.fst.geocode_addresses(addresses).await?;
+        let snd = self.snd.geocode_addresses(addresses).await?;
+        let fst_width = self.fst.column_names().len();
+        let snd_width = self.snd.column_names().len();
+
+        // Merge per address: chaining the two result vectors would yield two
+        // half-width entries per address instead of one entry that matches
+        // `column_names`.
+        //
+        // NOTE(review): assumes both geocoders return one entry per address,
+        // in order, and that `Geocoded` is built from `column_values` alone --
+        // confirm against the trait and struct definitions.
+        Ok(fst
+            .into_iter()
+            .zip(snd)
+            .map(|pair| match pair {
+                (None, None) => None,
+                (fst, snd) => {
+                    let mut column_values = fst
+                        .map(|geocoded| geocoded.column_values)
+                        .unwrap_or_else(|| vec![String::new(); fst_width]);
+                    column_values.extend(
+                        snd.map(|geocoded| geocoded.column_values)
+                            .unwrap_or_else(|| vec![String::new(); snd_width]),
+                    );
+                    Some(Geocoded { column_values })
+                }
+            })
+            .collect())
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index a836a26..b51829b 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -33,7 +33,6 @@ mod pipeline;
 mod server;
 mod unpack_vec;
 
-use crate::addresses::AddressColumnSpec;
 use crate::geocoders::{
     cache::Cache, invalid_record_skipper::InvalidRecordSkipper, libpostal::LibPostal,
     normalizer::Normalizer, shared_http_client, smarty::Smarty, Geocoder,
@@ -42,6 +41,7 @@ use crate::geocoders::{
 use crate::key_value_stores::KeyValueStore;
 use crate::pipeline::{geocode_stdio, OnDuplicateColumns, CONCURRENCY, GEOCODE_SIZE};
 use crate::server::run_server;
+use crate::{addresses::AddressColumnSpec, geocoders::paired::Paired};
 
 #[cfg(all(feature = "jemallocator", not(target_env = "msvc")))]
 #[global_allocator]
@@ -143,6 +143,10 @@ struct Opt {
     #[arg(long = "normalize")]
     normalize: bool,
 
+    /// Include normalization-related columns.
+    #[arg(long = "include-normalizer-columns", requires = "normalize")]
+    include_normalizer_columns: bool,
+
     /// Limit the speed with which we access external geocoding APIs. Does not
     /// affect the cache or local geocoding.
     #[arg(long = "max-addresses-per-second")]
@@ -272,7 +276,11 @@ async fn main() -> Result<()> {
 
     // If we were asked, normalize addresses a bit first.
     if opt.normalize {
-        geocoder = Box::new(Normalizer::new(geocoder));
+        let normalizer = Normalizer::new(geocoder);
+        geocoder = Box::new(normalizer);
+        if opt.include_normalizer_columns {
+            geocoder = Box::new(Paired::new(geocoder, Box::new(LibPostal::new())));
+        }
     }
 
     // Decide which command to run.