Skip to content

Commit

Permalink
Allow including secondary output from libpostal
Browse files Browse the repository at this point in the history
We add a new command-line flag allowing the user
to combine regular geocoder output with columns
from libpostal.
  • Loading branch information
emk committed Mar 24, 2024
1 parent c3da225 commit 58512c5
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 3 deletions.
2 changes: 1 addition & 1 deletion src/geocoders/libpostal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use crate::{addresses::Address, Result};

use super::{Geocoded, Geocoder};

static COLUMN_NAMES: &[&str] = &[
pub(crate) static COLUMN_NAMES: &[&str] = &[
// From
// https://github.com/OpenCageData/address-formatting/blob/master/conf/components.yaml.
"archipelago",
Expand Down
1 change: 1 addition & 0 deletions src/geocoders/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ pub mod cache;
pub mod invalid_record_skipper;
pub mod libpostal;
pub mod normalizer;
pub mod paired;
pub mod smarty;

/// A `hyper` client shared between multiple workers.
Expand Down
65 changes: 65 additions & 0 deletions src/geocoders/paired.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
//! Paired geocoder. Run two geocoders, return both.
//!
//! ¿Por qué no los dos?
use async_trait::async_trait;

use crate::geocoders::{Geocoded, Geocoder};
use crate::{addresses::Address, Result};

/// A geocoder that wraps two other geocoders and reports the output of both.
///
/// The combined column names and configuration key are computed once in
/// `Paired::new` and stored here, so the accessor methods can hand out
/// borrows instead of rebuilding them on every call.
pub struct Paired {
/// The first wrapped geocoder. Its column names come first in
/// `column_names`, and its configuration key comes first in `config_key`.
fst: Box<dyn Geocoder>,
/// The second wrapped geocoder. Its column names and configuration key
/// follow those of `fst`.
snd: Box<dyn Geocoder>,
/// The column names output by this geocoder: every column name of `fst`,
/// followed by every column name of `snd`.
column_names: Vec<String>,
/// The configuration key for this geocoder, built as the two wrapped
/// geocoders' configuration keys joined by `+`.
config_key: String,
}

impl Paired {
    /// Build a `Paired` geocoder out of two existing geocoders.
    ///
    /// The resulting column names are those of `fst` followed by those of
    /// `snd`, and the resulting configuration key is the two configuration
    /// keys joined by a single `+`. Both are computed here, once, and cached
    /// on the returned value.
    pub fn new(fst: Box<dyn Geocoder>, snd: Box<dyn Geocoder>) -> Paired {
        // Copy the two column-name slices into one owned vector, first
        // geocoder's columns first.
        let (head, tail) = (fst.column_names(), snd.column_names());
        let mut column_names = Vec::with_capacity(head.len() + tail.len());
        column_names.extend_from_slice(head);
        column_names.extend_from_slice(tail);

        let config_key = [fst.configuration_key(), snd.configuration_key()].join("+");

        Self {
            fst,
            snd,
            column_names,
            config_key,
        }
    }
}

#[async_trait]
impl Geocoder for Paired {
    /// Short tag for this geocoder; always the literal `"pair"`.
    fn tag(&self) -> &str {
        "pair"
    }

    /// The configuration key cached when this value was constructed.
    fn configuration_key(&self) -> &str {
        self.config_key.as_str()
    }

    /// The combined column names cached when this value was constructed.
    fn column_names(&self) -> &[String] {
        self.column_names.as_slice()
    }

    /// Run both wrapped geocoders over `addresses`, first `fst` and then
    /// `snd`, returning early with the error if either one fails.
    ///
    /// The returned vector holds every result from the first geocoder
    /// followed by every result from the second, so its length is the sum of
    /// the two result lengths.
    ///
    /// NOTE(review): if callers expect exactly one entry per input address
    /// carrying the columns of both geocoders, this concatenation would need
    /// to become a per-address merge. Confirm against the `Geocoder`
    /// contract.
    async fn geocode_addresses(
        &self,
        addresses: &[Address],
    ) -> Result<Vec<Option<Geocoded>>> {
        let mut combined = self.fst.geocode_addresses(addresses).await?;
        let trailing = self.snd.geocode_addresses(addresses).await?;
        combined.extend(trailing);
        Ok(combined)
    }
}
12 changes: 10 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ mod pipeline;
mod server;
mod unpack_vec;

use crate::addresses::AddressColumnSpec;
use crate::geocoders::{
cache::Cache, invalid_record_skipper::InvalidRecordSkipper, libpostal::LibPostal,
normalizer::Normalizer, shared_http_client, smarty::Smarty, Geocoder,
Expand All @@ -42,6 +41,7 @@ use crate::geocoders::{
use crate::key_value_stores::KeyValueStore;
use crate::pipeline::{geocode_stdio, OnDuplicateColumns, CONCURRENCY, GEOCODE_SIZE};
use crate::server::run_server;
use crate::{addresses::AddressColumnSpec, geocoders::paired::Paired};

#[cfg(all(feature = "jemallocator", not(target_env = "msvc")))]
#[global_allocator]
Expand Down Expand Up @@ -143,6 +143,10 @@ struct Opt {
#[arg(long = "normalize")]
normalize: bool,

/// Include normalization-related columns.
#[arg(long = "include-normalizer-columns", requires = "normalize")]
include_normalizer_columns: bool,

/// Limit the speed with which we access external geocoding APIs. Does not
/// affect the cache or local geocoding.
#[arg(long = "max-addresses-per-second")]
Expand Down Expand Up @@ -272,7 +276,11 @@ async fn main() -> Result<()> {

// If we were asked, normalize addresses a bit first.
if opt.normalize {
geocoder = Box::new(Normalizer::new(geocoder));
let normalizer = Normalizer::new(geocoder);
geocoder = Box::new(normalizer);
if opt.include_normalizer_columns {
geocoder = Box::new(Paired::new(geocoder, Box::new(LibPostal::new())));
}
}

// Decide which command to run.
Expand Down

0 comments on commit 58512c5

Please sign in to comment.