Skip to content

Commit

Permalink
Fix code to append libpostal and write tests
Browse files Browse the repository at this point in the history
  • Loading branch information
emk committed Mar 25, 2024
1 parent 58512c5 commit a22912e
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 15 deletions.
7 changes: 7 additions & 0 deletions src/geocoders/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,13 @@ impl Geocoded {
pub fn contains_null_bytes(&self) -> bool {
self.column_values.iter().any(|s| s.contains('\0'))
}

/// Concatenate two `Geocoded` values.
pub fn concat(&self, other: &Geocoded) -> Geocoded {
let mut column_values = self.column_values.clone();
column_values.extend(other.column_values.iter().cloned());
Geocoded { column_values }
}
}

/// Abstract geocoding interface.
Expand Down
65 changes: 58 additions & 7 deletions src/geocoders/paired.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,40 +4,78 @@
use async_trait::async_trait;

use crate::format_err;
use crate::geocoders::{Geocoded, Geocoder};
use crate::{addresses::Address, Result};

/// A geocoder that runs two geocoders and returns both results.
pub struct Paired {
/// The first geocoder.
fst: Box<dyn Geocoder>,

/// The second geocoder.
snd: Box<dyn Geocoder>,

/// An empty set of columns with the same width as the first geocoder.
fst_empty_output: Geocoded,

/// An empty set of columns with the same width as the second geocoder.
snd_empty_output: Geocoded,

/// The column names output by this geocoder. Includes both sets
/// of column names.
column_names: Vec<String>,

/// The configuration key for this geocoder.
config_key: String,
}

impl Paired {
/// Create a new pair of geocoders.
pub fn new(fst: Box<dyn Geocoder>, snd: Box<dyn Geocoder>) -> Paired {
let column_names = fst
/// Create a new geocoder which returns the results of two geocoders. The
/// columns from the first geocoder are returned first, followed by the
/// columns from the second geocoder. The column names from the second
/// geocoder are prefixed with the tag of the second geocoder.
pub fn new(
fst: Box<dyn Geocoder>,
snd_label: &str,
snd: Box<dyn Geocoder>,
) -> Paired {
let fst_column_names = fst.column_names().iter().cloned();
let snd_column_names = snd
.column_names()
.iter()
.chain(snd.column_names())
.cloned()
.collect();
.map(|c| format!("{}_{}", snd_label, c));
let fst_empty_output = Geocoded {
column_values: vec!["".to_owned(); fst.column_names().len()],
};
let snd_empty_output = Geocoded {
column_values: vec!["".to_owned(); snd.column_names().len()],
};
let column_names = fst_column_names.chain(snd_column_names).collect();
let config_key =
format!("{}+{}", fst.configuration_key(), snd.configuration_key());
Paired {
fst,
snd,
fst_empty_output,
snd_empty_output,
column_names,
config_key,
}
}

fn combine_geocoder_results(
&self,
fst: Option<Geocoded>,
snd: Option<Geocoded>,
) -> Option<Geocoded> {
match (fst, snd) {
(None, None) => None,
(Some(f), None) => Some(f.concat(&self.snd_empty_output)),
(None, Some(s)) => Some(self.fst_empty_output.concat(&s)),
(Some(f), Some(s)) => Some(f.concat(&s)),
}
}
}

#[async_trait]
Expand All @@ -60,6 +98,19 @@ impl Geocoder for Paired {
) -> Result<Vec<Option<Geocoded>>> {
let fst = self.fst.geocode_addresses(addresses).await?;
let snd = self.snd.geocode_addresses(addresses).await?;
Ok(fst.into_iter().chain(snd.into_iter()).collect())
if fst.len() != snd.len() {
return Err(format_err!(
"Geocoders returned different numbers of results: {} from {} vs {} from {}",
fst.len(),
self.fst.tag(),
snd.len(),
self.snd.tag(),
));
}
Ok(fst
.into_iter()
.zip(snd)
.map(|(f, s)| self.combine_geocoder_results(f, s))
.collect())
}
}
21 changes: 13 additions & 8 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,9 @@ struct Opt {
#[arg(long = "normalize")]
normalize: bool,

/// Include normalization-related columns.
#[arg(long = "include-normalizer-columns", requires = "normalize")]
include_normalizer_columns: bool,
/// Include libpostal columns in addition to another geocoder's output.
#[arg(long = "include-libpostal")]
include_libpostal: bool,

/// Limit the speed with which we access external geocoding APIs. Does not
/// affect the cache or local geocoding.
Expand Down Expand Up @@ -276,11 +276,16 @@ async fn main() -> Result<()> {

// If we were asked, normalize addresses a bit first.
if opt.normalize {
let normalizer = Normalizer::new(geocoder);
geocoder = Box::new(normalizer);
if opt.include_normalizer_columns {
geocoder = Box::new(Paired::new(geocoder, Box::new(LibPostal::new())));
}
geocoder = Box::new(Normalizer::new(geocoder));
}

// Include libpostal columns in the output if requested.
if opt.include_libpostal {
geocoder = Box::new(Paired::new(
geocoder,
"libpostal",
Box::new(LibPostal::new()),
));
}

// Decide which command to run.
Expand Down
34 changes: 34 additions & 0 deletions tests/specs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -255,3 +255,37 @@ fn skip_records_with_empty_house_number_and_street() {
assert!(output.stdout_str().contains("New York"));
assert!(output.stdout_str().contains("Provo"));
}

#[test]
#[ignore]
fn append_libpostal() {
let testdir = TestDir::new("geocode-csv", "append_libpostal");

testdir.create_file(
"spec.json",
r#"{
"gc": {
"house_number_and_street": [
"address_1",
"address_2"
],
"city": "city",
"state": "state",
"postcode": "zip_code"
}
}"#,
);
let output = testdir
.cmd()
.arg("--license=us-core-enterprise-cloud")
.arg("--spec=spec.json")
.arg("--include-libpostal")
.output_with_stdin(SIMPLE_CSV)
.tee_output()
.expect_success();
assert!(output.stdout_str().contains("gc_addressee"));
assert!(output.stdout_str().contains("Commercial"));
assert!(output.stdout_str().contains("Residential"));
assert!(output.stdout_str().contains("40.21"));
assert!(output.stdout_str().contains("gc_libpostal_city"));
}

0 comments on commit a22912e

Please sign in to comment.