From 4e009609a4ecaa8557468e57cbad097c32033353 Mon Sep 17 00:00:00 2001 From: Jay Kickliter Date: Wed, 26 Jul 2023 12:13:56 -0600 Subject: [PATCH] Add disktree --- Cargo.toml | 11 ++- benches/benches.rs | 52 +++++++++- src/disktree/dptr.rs | 82 ++++++++++++++++ src/disktree/iter.rs | 176 ++++++++++++++++++++++++++++++++++ src/disktree/mod.rs | 213 +++++++++++++++++++++++++++++++++++++++++ src/disktree/node.rs | 48 ++++++++++ src/disktree/tree.rs | 119 +++++++++++++++++++++++ src/disktree/varint.rs | 86 +++++++++++++++++ src/disktree/writer.rs | 129 +++++++++++++++++++++++++ src/error.rs | 85 +++++++++++++++- src/hex_tree_map.rs | 2 +- src/hex_tree_set.rs | 2 +- src/lib.rs | 4 + tests/tests.rs | 2 +- 14 files changed, 1001 insertions(+), 10 deletions(-) create mode 100644 src/disktree/dptr.rs create mode 100644 src/disktree/iter.rs create mode 100644 src/disktree/mod.rs create mode 100644 src/disktree/node.rs create mode 100755 src/disktree/tree.rs create mode 100644 src/disktree/varint.rs create mode 100644 src/disktree/writer.rs diff --git a/Cargo.toml b/Cargo.toml index f304887..06d3c09 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,18 +20,25 @@ all-features = true [features] default = [] +disktree = [ + "byteorder", + "memmap", + "serde", +] serde = ["dep:serde"] [dependencies] +byteorder = { version = "1", optional = true } +memmap = { version = "0.7", optional = true } serde = { version = "1", optional = true, features = ["derive"] } [dev-dependencies] -byteorder = "1" +bincode = { version = "1.3.3" } criterion = { version = "0.3", features = ["html_reports"] } geo = "0.26.0" -geo-types = "0.7" h3o = { version = "0.4.0", features = ["geo"] } h3ron = "0.15.1" +tempfile = "3" [dev-dependencies.h3-lorawan-regions] git = "https://github.com/JayKickliter/h3-lorawan-regions.git" diff --git a/benches/benches.rs b/benches/benches.rs index e0bdeb2..4cb8e22 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -1,6 +1,5 @@ use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; -use geo::polygon; -use geo_types::coord; +use geo::{coord, polygon}; use h3_lorawan_regions::{ compact::US915 as COMPACT_US915_INDICES, nocompact::US915 as PLAIN_US915_INDICES, }; @@ -9,7 +8,7 @@ use h3o::{ CellIndex, Resolution, }; use h3ron::H3Cell; -use hextree::{compaction::EqCompactor, Cell, HexTreeMap, HexTreeSet}; +use hextree::{compaction::EqCompactor, disktree::DiskTreeMap, Cell, HexTreeMap, HexTreeSet}; use std::convert::TryFrom; fn set_lookup(c: &mut Criterion) { @@ -49,6 +48,50 @@ fn set_lookup(c: &mut Criterion) { } } +fn disk_set_lookup(c: &mut Criterion) { + let mut group = c.benchmark_group("US915 DiskTreeSet lookup"); + + let us915_disk_set = { + let us915_set: HexTreeSet = PLAIN_US915_INDICES + .iter() + .map(|&idx| Cell::try_from(idx).unwrap()) + .collect(); + let mut file = tempfile::tempfile().unwrap(); + us915_set + .to_disktree(&mut file, |_, _| Ok::<(), std::io::Error>(())) + .unwrap(); + DiskTreeMap::memmap(file).unwrap() + }; + + let tarpon_springs = coord! {x: -82.753822, y: 28.15215}; + let gulf_of_mexico = coord! {x: -83.101920, y: 28.128096}; + let paris = coord! {x: 2.340340, y: 48.868680}; + + for resolution in [0, 4, 8, 12, 15] { + let tarpon_springs = + Cell::try_from(*H3Cell::from_coordinate(tarpon_springs, resolution).unwrap()).unwrap(); + let gulf_of_mexico = + Cell::try_from(*H3Cell::from_coordinate(gulf_of_mexico, resolution).unwrap()).unwrap(); + let paris = Cell::try_from(*H3Cell::from_coordinate(paris, resolution).unwrap()).unwrap(); + + group.bench_with_input( + BenchmarkId::new("Tarpon Spring", resolution), + &tarpon_springs, + |b, &cell| b.iter(|| us915_disk_set.contains(cell)), + ); + + group.bench_with_input( + BenchmarkId::new("Gulf of Mexico", resolution), + &gulf_of_mexico, + |b, &cell| b.iter(|| us915_disk_set.contains(cell)), + ); + + group.bench_with_input(BenchmarkId::new("Paris", resolution), &paris, |b, &cell| { + b.iter(|| us915_disk_set.contains(cell)) + }); + } +} + fn set_construction(c: &mut Criterion) { let mut group = c.benchmark_group("US915 HexTreeSet construction"); @@ -256,8 +299,9 @@ fn subtree_iter(c: &mut Criterion) { criterion_group!( benches, - subtree_iter, set_lookup, + disk_set_lookup, + subtree_iter, map_lookup, set_iteration, map_iteration, diff --git a/src/disktree/dptr.rs b/src/disktree/dptr.rs new file mode 100644 index 0000000..2837b4b --- /dev/null +++ b/src/disktree/dptr.rs @@ -0,0 +1,82 @@ +use crate::Result; +use std::{ + io::{Read, Write}, + mem::size_of, +}; + +/// A 'disk' pointer. +#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)] +#[repr(transparent)] +pub(crate) struct Dptr(u64); + +impl Dptr { + #[allow(clippy::cast_possible_truncation)] + const MAX: u64 = 2_u64.pow(Self::DISK_REPR_SZ as u32 * 8) - 1; + const DISK_REPR_SZ: usize = 5; + const NULL: u64 = 0; + + pub(crate) const fn is_null(self) -> bool { + self.0 == Self::NULL + } + + pub(crate) const fn null() -> Dptr { + Dptr(Self::NULL) + } + + pub(crate) const fn size() -> u64 { + Self::DISK_REPR_SZ as u64 + } + + /// Read 5 bytes from disk and parses them as little-endian `u64`. + pub(crate) fn read(src: &mut R) -> Result + where + R: Read, + { + let mut buf = [0u8; size_of::()]; + src.read_exact(&mut buf[..Self::DISK_REPR_SZ])?; + let dptr = u64::from_le_bytes(buf); + Ok(dptr.into()) + } + + /// Read 5 * `n` bytes from disk, for up to n=7, and parses them as + /// little-endian `u64`s. + pub(crate) fn read_n(src: &mut R, n: usize) -> Result> + where + R: Read, + { + debug_assert!(n <= 7); + let mut buf = [0; Self::DISK_REPR_SZ * 7]; + src.read_exact(&mut buf[..(Self::DISK_REPR_SZ * n)])?; + Ok(buf[..(Self::DISK_REPR_SZ * n)] + .chunks(Self::DISK_REPR_SZ) + .map(|chunk| { + let mut buf = [0u8; size_of::()]; + buf[..Self::DISK_REPR_SZ].copy_from_slice(chunk); + u64::from_le_bytes(buf) + }) + .map(Dptr) + .collect()) + } + + /// Writes the 5 lower bytes of a `u64` to disk. + pub(crate) fn write(self, dst: &mut W) -> Result + where + W: Write, + { + let buf = self.0.to_le_bytes(); + Ok(dst.write_all(&buf[..Self::DISK_REPR_SZ])?) + } +} + +impl From for u64 { + fn from(Dptr(raw): Dptr) -> u64 { + raw + } +} + +impl From for Dptr { + fn from(raw: u64) -> Dptr { + assert!(raw <= Self::MAX); + Dptr(raw) + } +} diff --git a/src/disktree/iter.rs b/src/disktree/iter.rs new file mode 100644 index 0000000..f39ad71 --- /dev/null +++ b/src/disktree/iter.rs @@ -0,0 +1,176 @@ +use crate::{ + cell::CellStack, + disktree::{dptr::Dptr, tree::HDR_SZ, varint}, + error::Result, + Cell, +}; +use byteorder::ReadBytesExt; +use std::io::{Cursor, Seek, SeekFrom}; + +pub(crate) struct Iter<'a> { + cell_stack: CellStack, + curr_node: Option<(u8, Dptr)>, + disktree_buf: &'a [u8], + disktree_csr: Cursor<&'a [u8]>, + node_stack: Vec>, + recycle_bin: Vec>, +} + +enum Node { + // File position for the fist byte of value data. + Leaf(Dptr), + // (H3 Cell digit, file position of child's node tag) + Parent(Vec<(u8, Dptr)>), +} + +impl<'a> Iter<'a> { + fn seek_to(&mut self, dptr: Dptr) -> Result { + Ok(Dptr::from( + self.disktree_csr.seek(SeekFrom::Start(u64::from(dptr)))?, + )) + } + + fn read_base_nodes(rdr: &mut Cursor<&[u8]>) -> Result> { + let mut buf = Vec::with_capacity(122); + rdr.seek(SeekFrom::Start(HDR_SZ))?; + for digit in 0..122 { + let dptr = Dptr::read(rdr)?; + if !dptr.is_null() { + buf.push((digit, dptr)); + } + } + buf.reverse(); + Ok(buf) + } + + // `pos` is a position in the file of this node's tag. + fn read_node(&mut self, dptr: Dptr) -> Result { + let dptr = self.seek_to(dptr)?; + let node_tag = self.disktree_csr.read_u8()?; + if 0 == node_tag & 0b1000_0000 { + Ok(Node::Leaf(dptr)) + } else { + let mut children = self.node_buf(); + let n_children = (node_tag & 0b0111_1111).count_ones() as usize; + let child_dptrs = Dptr::read_n(&mut self.disktree_csr, n_children)?; + children.extend( + (0..7) + .rev() + .filter(|digit| node_tag & (1 << digit) != 0) + .zip(child_dptrs.into_iter().rev()), + ); + Ok(Node::Parent(children)) + } + } + + /// Returns a recycled node buffer if available, otherwise + /// allocates a new one. + /// + /// See [`Iter::recycle_node_buf`]. + fn node_buf(&mut self) -> Vec<(u8, Dptr)> { + let buf = self + .recycle_bin + .pop() + .unwrap_or_else(|| Vec::with_capacity(7)); + debug_assert!(buf.is_empty()); + buf + } + + /// Accepts a used, empty, node buffer for later reuse. + /// + /// See [`Iter::node_buf`]. + fn recycle_node_buf(&mut self, buf: Vec<(u8, Dptr)>) { + debug_assert!(buf.is_empty()); + self.recycle_bin.push(buf); + } + + // We've encountered an IO error. We're still going to return + // `Some` with the contents of the user's deserializer, but let's + // make sure we never yield another value by clearing stack. + fn stop_yielding(&mut self) { + self.node_stack.clear(); + self.curr_node = None; + } + + pub(crate) fn new(disktree_buf: &'a [u8]) -> Result> { + let mut disktree_csr = Cursor::new(disktree_buf); + let mut cell_stack = CellStack::new(); + let mut node_stack = Vec::new(); + let recycle_bin = Vec::new(); + let mut base_nodes = Self::read_base_nodes(&mut disktree_csr)?; + let curr_node = base_nodes.pop(); + node_stack.push(base_nodes); + if let Some((digit, _)) = curr_node { + cell_stack.push(digit); + } + Ok(Self { + cell_stack, + curr_node, + disktree_buf, + disktree_csr, + recycle_bin, + node_stack, + }) + } +} + +impl<'a> Iterator for Iter<'a> { + type Item = Result<(Cell, &'a [u8])>; + + fn next(&mut self) -> Option { + while self.curr_node.is_none() { + if let Some(mut dptrs) = self.node_stack.pop() { + self.cell_stack.pop(); + if let Some((digit, dptr)) = dptrs.pop() { + self.cell_stack.push(digit); + self.curr_node = Some((digit, dptr)); + self.node_stack.push(dptrs); + } else { + self.recycle_node_buf(dptrs); + } + } else { + break; + } + } + while let Some((digit, dptr)) = self.curr_node { + self.cell_stack.swap(digit); + match self.read_node(dptr) { + Err(e) => { + self.stop_yielding(); + return Some(Err(e)); + } + Ok(Node::Parent(mut children)) => { + if let Some((digit, dptr)) = children.pop() { + self.cell_stack.push(digit); + self.curr_node = Some((digit, dptr)); + self.node_stack.push(children); + } else { + self.recycle_node_buf(children); + } + } + Ok(Node::Leaf(dptr)) => { + self.curr_node = None; + if let Err(e) = self.seek_to(dptr) { + self.stop_yielding(); + return Some(Err(e)); + } + match varint::read(&mut self.disktree_csr) { + Err(e) => { + self.stop_yielding(); + return Some(Err(e)); + } + Ok((val_len, _n_read)) => { + let pos = self.disktree_csr.position() as usize; + let val_buf = &self.disktree_buf[pos..][..val_len as usize]; + return Some(Ok(( + *self.cell_stack.cell().expect("corrupted cell-stack"), + val_buf, + ))); + } + } + } + }; + } + None + } +} diff --git a/src/disktree/mod.rs b/src/disktree/mod.rs new file mode 100644 index 0000000..8eadf67 --- /dev/null +++ b/src/disktree/mod.rs @@ -0,0 +1,213 @@ +//! An on-disk hextree. + +#[cfg(not(target_pointer_width = "64"))] +compile_warning!("disktree may silently fail on non-64bit systems"); + +pub use tree::DiskTreeMap; + +mod dptr; +mod iter; +mod node; +mod tree; +mod varint; +mod writer; + +#[cfg(test)] +mod tests { + use super::*; + use byteorder::{LittleEndian as LE, ReadBytesExt}; + use serde::{Deserialize, Serialize}; + use std::collections::HashMap; + + #[test] + fn test_roundtrip_monaco() { + use crate::{compaction::EqCompactor, Cell, HexTreeMap}; + let idx_bytes = include_bytes!("../../assets/monaco.res12.h3idx"); + let rdr = &mut idx_bytes.as_slice(); + let mut cells = Vec::new(); + while let Ok(idx) = rdr.read_u64::() { + cells.push(Cell::from_raw(idx).unwrap()); + } + + #[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Serialize)] + enum Region { + Monaco, + } + + // Construct map with a compactor that automatically combines + // cells with the same save value. + let mut monaco = HexTreeMap::with_compactor(EqCompactor); + + // Now extend the map with cells and a region value. + monaco.extend(cells.iter().copied().zip(std::iter::repeat(Region::Monaco))); + + // You can see in the map above that our set covers Point 1 (green + // check) but not Point 2 (red x), let's test that. + // Lat/lon 43.73631, 7.42418 @ res 12 + let point_1 = Cell::from_raw(0x8c3969a41da15ff).unwrap(); + // Lat/lon 43.73008, 7.42855 @ res 12 + let point_2 = Cell::from_raw(0x8c3969a415065ff).unwrap(); + + let file = tempfile::NamedTempFile::new().unwrap(); + let (mut file, path) = file.keep().unwrap(); + println!("disktree path: {path:?}"); + monaco + .to_disktree(&mut file, |wtr, val| bincode::serialize_into(wtr, val)) + .unwrap(); + let monaco_disktree = DiskTreeMap::open(path).unwrap(); + + assert_eq!(monaco.get(point_2).unzip().1, None); + assert_eq!(monaco.get(point_1).unzip().1, Some(&Region::Monaco)); + + for (ht_cell, &ht_val) in monaco.iter() { + let now = std::time::Instant::now(); + let (dt_cell, val_buf) = monaco_disktree.get(ht_cell).unwrap().unwrap(); + let dt_val = bincode::deserialize_from(val_buf).unwrap(); + let lookup_duration = now.elapsed(); + println!("loookup of {dt_cell} took {lookup_duration:?}"); + assert_eq!(ht_val, dt_val); + assert_eq!(ht_cell, dt_cell); + } + } + + #[test] + fn test_variable_sized_vals() { + use crate::{Cell, HexTreeMap}; + + let (keeper_cells, test_cells): (Vec, Vec) = { + let idx_bytes = include_bytes!("../../assets/monaco.res12.h3idx"); + let rdr = &mut idx_bytes.as_slice(); + let mut cells = Vec::new(); + while let Ok(idx) = rdr.read_u64::() { + cells.push(Cell::from_raw(idx).unwrap()); + } + let (l, r) = cells.split_at(625); + (l.to_vec(), r.to_vec()) + }; + + assert_eq!(keeper_cells.len(), 625); + assert_eq!(test_cells.len(), 200); + + fn cell_to_value(cell: &Cell) -> Vec { + use std::hash::{Hash, Hasher}; + let mut s = std::collections::hash_map::DefaultHasher::new(); + cell.hash(&mut s); + // Generate length between 0..=0xFFFF; + let len = match s.finish() & 0xFFFF { + len if len.trailing_ones() == 8 => 0, + len => len, + }; + // assert_ne!(len, 0); + (0..len).map(|idx| idx as u8).collect::>() + } + + let mut zero_len_val_cnt = 0; + + let monaco_hashmap: HashMap<&Cell, Vec> = { + let mut map = HashMap::new(); + for cell in &keeper_cells { + let val = cell_to_value(cell); + if val.is_empty() { + zero_len_val_cnt += 1; + } + map.insert(cell, val); + } + map + }; + + // Ensure we get at least one 0-length value. + assert_ne!(zero_len_val_cnt, 0); + + let monaco_hextree: HexTreeMap<&[u8]> = { + let mut map = HexTreeMap::new(); + for (cell, val) in &monaco_hashmap { + map.insert(**cell, val.as_slice()) + } + map + }; + + let monaco_disktree: DiskTreeMap<_> = { + let file = tempfile::NamedTempFile::new().unwrap(); + let (mut file, path) = file.keep().unwrap(); + monaco_hextree + .to_disktree(&mut file, |wtr, val| wtr.write_all(val)) + .unwrap(); + let _ = file; + DiskTreeMap::open(path).unwrap() + }; + + // Assert neither hashmap nor disktree contain reserved cells. + for cell in test_cells { + assert!(monaco_hashmap.get(&cell).is_none()); + assert!(!monaco_disktree.contains(cell).unwrap()); + } + + // Assert disktree contains all the same values as the + // hashmap. + for (cell, val) in monaco_hashmap + .iter() + .map(|(cell, vec)| (**cell, vec.as_slice())) + { + assert_eq!((cell, val), monaco_disktree.get(cell).unwrap().unwrap()) + } + + // Assert hashmap contains all the same values as the + // disktree. + for (cell, val) in monaco_disktree.iter().unwrap().map(|entry| entry.unwrap()) { + assert_eq!( + (cell, val), + ( + cell, + monaco_hashmap.get(&cell).map(|vec| vec.as_slice()).unwrap() + ) + ) + } + } + + #[test] + fn test_iter() { + use crate::{Cell, HexTreeMap}; + let idx_bytes = include_bytes!("../../assets/monaco.res12.h3idx"); + let rdr = &mut idx_bytes.as_slice(); + let mut cells = Vec::new(); + while let Ok(idx) = rdr.read_u64::() { + cells.push(Cell::from_raw(idx).unwrap()); + } + + // Construct map with a compactor that automatically combines + // cells with the same save value. + let mut monaco = HexTreeMap::new(); + + // Now extend the map with cells and a region value. + monaco.extend(cells.iter().copied().zip(cells.iter().copied())); + + let file = tempfile::NamedTempFile::new().unwrap(); + let (mut file, path) = file.keep().unwrap(); + println!("disktree path: {path:?}"); + monaco + .to_disktree(&mut file, |wtr, val| bincode::serialize_into(wtr, val)) + .unwrap(); + let monaco_disktree = DiskTreeMap::open(path).unwrap(); + + // Create the iterator with the user-defined deserialzer. + let disktree_iter = monaco_disktree.iter().unwrap(); + let start = std::time::Instant::now(); + let mut disktree_collection = Vec::new(); + for res in disktree_iter { + let (cell, val_buf) = res.unwrap(); + disktree_collection.push((cell, bincode::deserialize_from(val_buf).unwrap())); + } + let elapsed = start.elapsed(); + println!("{elapsed:?}"); + let start = std::time::Instant::now(); + let hextree_collection: Vec<_> = monaco.iter().map(|(k, v)| (k, *v)).collect(); + let elapsed = start.elapsed(); + println!("{elapsed:?}"); + + assert_eq!( + hextree_collection, + disktree_collection, + "iterating a disktree should yield identically ordered elements as the hextree tree it was derived from" + ); + } +} diff --git a/src/disktree/node.rs b/src/disktree/node.rs new file mode 100644 index 0000000..011374f --- /dev/null +++ b/src/disktree/node.rs @@ -0,0 +1,48 @@ +use crate::{ + disktree::{dptr::Dptr, varint}, + error::Result, +}; +use byteorder::ReadBytesExt; +use std::{ + io::{Read, Seek}, + mem::size_of, + ops::Range, +}; + +// Enough bytes to read node tag and 7 child dptrs. +const NODE_BUF_SZ: usize = size_of::() + 7 * Dptr::size() as usize; + +pub(crate) enum Node { + // value_begin..value_end + Leaf(Range), + // (H3 Cell digit, file position of child's node tag) + Parent([Option; 7]), +} + +impl Node { + pub(crate) fn read(rdr: &mut R) -> Result + where + R: Seek + Read, + { + let start_pos = rdr.stream_position()?; + let mut buf = [0u8; NODE_BUF_SZ]; + let bytes_read = rdr.read(&mut buf)?; + let buf_rdr = &mut &buf[..bytes_read]; + let node_tag = buf_rdr.read_u8()?; + if 0 == node_tag & 0b1000_0000 { + let (val_len, n_read) = varint::read(&mut &buf[..bytes_read])?; + let begin = (start_pos + n_read) as usize; + let end = begin + val_len as usize; + Ok(Node::Leaf(begin..end)) + } else { + let mut children: [Option; 7] = [None, None, None, None, None, None, None]; + for (_digit, child) in (0..7) + .zip(children.iter_mut()) + .filter(|(digit, _)| node_tag & (1 << digit) != 0) + { + *child = Some(Dptr::read(buf_rdr)?); + } + Ok(Node::Parent(children)) + } + } +} diff --git a/src/disktree/tree.rs b/src/disktree/tree.rs new file mode 100755 index 0000000..62fcad1 --- /dev/null +++ b/src/disktree/tree.rs @@ -0,0 +1,119 @@ +use crate::{ + digits::Digits, + disktree::{dptr::Dptr, iter::Iter, node::Node}, + error::Result, + Cell, Error, +}; +use byteorder::ReadBytesExt; +use memmap::{Mmap, MmapOptions}; +use std::{ + fs::File, + io::{Cursor, Read, Seek, SeekFrom}, + ops::Range, + path::Path, +}; + +pub(crate) const HDR_MAGIC: &[u8] = b"hextree\0"; +pub(crate) const HDR_SZ: u64 = HDR_MAGIC.len() as u64 + 1; + +/// An on-disk hextree map. +pub struct DiskTreeMap(B); + +impl DiskTreeMap { + /// Opens a `DiskTree` at the specified path. + pub fn open>(path: P) -> Result { + let file = File::open(path)?; + Self::memmap(file) + } + + /// Memory maps the provided disktree-containing. + pub fn memmap(file: File) -> Result { + #[allow(unsafe_code)] + let mm = unsafe { MmapOptions::new().map(&file)? }; + Self::with_buf(mm) + } +} + +impl> DiskTreeMap { + /// Opens a `DiskTree` with a provided buffer. + pub fn with_buf(buf: B) -> Result { + let mut csr = Cursor::new(buf); + let magic = { + let mut buf = [0_u8; HDR_MAGIC.len()]; + csr.read_exact(&mut buf)?; + buf + }; + if magic != HDR_MAGIC { + return Err(Error::NotDisktree); + } + + let version = { + // We use 0xFE as a version offset since it is much less + // likely to randomly appear than 0; + 0xFE - csr.read_u8()? + }; + match version { + 0 => Ok(Self(csr.into_inner())), + unsupported_version => Err(Error::Version(unsupported_version)), + } + } + + /// Returns `(Cell, &[u8])`, if present. + pub fn get(&self, cell: Cell) -> Result> { + let base_cell_pos = Self::base_cell_dptr(cell); + let mut csr = Cursor::new(self.0.as_ref()); + csr.seek(SeekFrom::Start(base_cell_pos.into()))?; + let node_dptr = Dptr::read(&mut csr)?; + if node_dptr.is_null() { + return Ok(None); + } + let digits = Digits::new(cell); + if let Some((cell, range)) = Self::_get(&mut csr, 0, node_dptr, cell, digits)? { + let val_bytes = &self.0.as_ref()[range]; + Ok(Some((cell, val_bytes))) + } else { + Ok(None) + } + } + + /// Returns `true` if the tree fully contains `cell`. + pub fn contains(&self, cell: Cell) -> Result { + self.get(cell).map(|opt| opt.is_some()) + } + + /// Returns an iterator visiting all `(Cell, &[u8])` pairs in + /// arbitrary order. + pub fn iter(&self) -> Result>> { + Iter::new(self.0.as_ref()) + } + + fn _get( + csr: &mut Cursor<&[u8]>, + res: u8, + node_dptr: Dptr, + cell: Cell, + mut digits: Digits, + ) -> Result)>> { + csr.seek(SeekFrom::Start(node_dptr.into()))?; + let node = Node::read(csr)?; + match (digits.next(), node) { + (None, Node::Leaf(range)) => Ok(Some((cell, range))), + (Some(_), Node::Leaf(range)) => Ok(Some(( + cell.to_parent(res).expect("invalid condition"), + range, + ))), + (Some(digit), Node::Parent(children)) => match children[digit as usize] { + None => Ok(None), + Some(dptr) => Self::_get(csr, res + 1, dptr, cell, digits), + }, + // No digits left, but `self` isn't full, so this cell + // can't fully contain the target. + (None, _) => Ok(None), + } + } + + /// Returns the DPtr to a base (res0) cell dptr. + fn base_cell_dptr(cell: Cell) -> Dptr { + Dptr::from(HDR_SZ + Dptr::size() * (cell.base() as u64)) + } +} diff --git a/src/disktree/varint.rs b/src/disktree/varint.rs new file mode 100644 index 0000000..fc6b7ac --- /dev/null +++ b/src/disktree/varint.rs @@ -0,0 +1,86 @@ +use crate::error::{Error, Result}; +use byteorder::{BigEndian as BE, ReadBytesExt, WriteBytesExt}; +use std::io::{Read, Write}; + +// 134_217_727 +// 2^27 - 1 +#[allow(dead_code)] +const MAX_VARINT_VAL: u32 = 0x7FF_FFFF; + +pub(crate) fn write(mut wtr: W, value: u32) -> Result { + if value < 0x40 { + // 01xx_xxxx + wtr.write_u8((value | 0x40) as u8)?; + Ok(1) + } else if value < 0x2000 { + // 001x_xxxx xxxx_xxxx + wtr.write_u16::((value | 0x2000) as u16)?; + Ok(2) + } else if value < 0x10_0000 { + // 0001_xxxx xxxx_xxxx xxxx_xxxx + let value = value | 0x10_0000; + wtr.write_u8((value >> 16) as u8)?; + wtr.write_u16::((value & 0xffff) as u16)?; + Ok(3) + } else if value < 0x800_0000 { + // 0000_1xxx xxxx_xxxx xxxx_xxxx xxxx_xxxx + wtr.write_u32::(value | 0x800_0000)?; + Ok(4) + } else { + Err(Error::Varint(value)) + } +} + +pub(crate) fn read(mut rdr: R) -> Result<(u32, u64)> { + let a = rdr.read_u8()?; + match a.leading_zeros() { + 1 => { + // 01xx_xxxx + let val = (a & 0x3F) as u32; + Ok((val, 1)) + } + 2 => { + // 001x_xxxx xxxx_xxxx + let a = (a & 0x1F) as u32; + let b = rdr.read_u8()? as u32; + let val = a << 8 | b; + Ok((val, 2)) + } + 3 => { + // 0001_xxxx xxxx_xxxx + let a = (a & 0x0F) as u32; + let b = rdr.read_u16::()? as u32; + let val = a << 16 | b; + Ok((val, 3)) + } + 4 => { + // 0000_1xxx xxxx_xxxx xxxx_xxxx + let a = (a & 0x07) as u32; + let b = rdr.read_u8()? as u32; + let c = rdr.read_u16::()? as u32; + let val = a << 24 | b << 16 | c; + Ok((val, 4)) + } + _ => Err(Error::Varint(a as u32)), + } +} + +#[cfg(test)] +mod tests { + use super::{read, write, MAX_VARINT_VAL}; + + #[test] + fn test_varint() { + let mut buf = Vec::new(); + for val in 0..=MAX_VARINT_VAL { + write(&mut buf, val).unwrap(); + assert!(buf[0].leading_zeros() > 0); + let (r_val, _n) = read(&mut &buf[..]).unwrap(); + assert_eq!(val, r_val); + buf.clear(); + } + for val in MAX_VARINT_VAL + 1..=u32::MAX { + assert!(write(&mut buf, val).is_err()); + } + } +} diff --git a/src/disktree/writer.rs b/src/disktree/writer.rs new file mode 100644 index 0000000..afd5fc3 --- /dev/null +++ b/src/disktree/writer.rs @@ -0,0 +1,129 @@ +use crate::{ + compaction::Compactor, + disktree::{dptr::Dptr, tree::HDR_MAGIC, varint}, + error::{Error, Result}, + node::Node, + HexTreeMap, +}; +use byteorder::WriteBytesExt; +use std::io::{Seek, SeekFrom, Write}; + +impl> HexTreeMap { + /// Write self to disk. + pub fn to_disktree(&self, wtr: W, f: F) -> Result + where + W: Write + Seek, + F: Fn(&mut dyn Write, &V) -> std::result::Result<(), E>, + E: std::error::Error + Sync + Send + 'static, + { + DiskTreeWriter::new(wtr).write(self, f) + } +} + +pub(crate) struct DiskTreeWriter { + scratch_pad: Vec, + wtr: W, +} + +impl DiskTreeWriter { + pub fn new(wtr: W) -> Self { + let scratch_pad = Vec::new(); + Self { wtr, scratch_pad } + } +} + +impl DiskTreeWriter { + pub fn write(&mut self, hextree: &HexTreeMap, mut f: F) -> Result + where + F: Fn(&mut dyn Write, &V) -> std::result::Result<(), E>, + E: std::error::Error + Sync + Send + 'static, + { + // Write magic string + self.wtr.write_all(HDR_MAGIC)?; + // Write version field + const VERSION: u8 = 0; + self.wtr.write_u8(0xFE - VERSION)?; + + let mut fixups: Vec<(Dptr, &Node)> = Vec::new(); + + // Write base cells placeholder offsets. + for base in hextree.nodes.iter() { + match base.as_deref() { + None => Dptr::null().write(&mut self.wtr)?, + Some(node) => { + fixups.push((self.pos()?, node)); + Dptr::null().write(&mut self.wtr)? + } + } + } + + for (fixee_dptr, node) in fixups { + let node_dptr = self.write_node(node, &mut f)?; + self.seek_to(fixee_dptr)?; + node_dptr.write(&mut self.wtr)?; + } + + Ok(()) + } + + fn write_node(&mut self, node: &Node, f: &mut F) -> Result + where + F: Fn(&mut dyn Write, &V) -> std::result::Result<(), E>, + E: std::error::Error + Sync + Send + 'static, + { + let node_pos: Dptr = self.wtr.seek(SeekFrom::End(0))?.into(); + let mut node_fixups: Vec<(Dptr, &Node)> = Vec::new(); + match node { + Node::Leaf(val) => { + self.scratch_pad.clear(); + f(&mut self.scratch_pad, val).map_err(|e| Error::Writer(Box::new(e)))?; + let val_len = self.scratch_pad.len() as u64; + varint::write(&mut self.wtr, val_len as u32)?; + self.wtr.write_all(&self.scratch_pad)?; + } + Node::Parent(children) => { + let tag_pos = self.pos()?; + // Write a dummy value so children have accurate + // stream position information. + self.wtr.write_u8(0b1000_0000)?; + let mut tag = 0; + for child in children.iter() { + match child.as_deref() { + None => { + // "insert" a 0 into the tag denoting that + // this node is empty. + tag >>= 1; + } + Some(node) => { + // "insert" a 1 into the tag denoting that + // this node is empty. + tag = (tag >> 1) | 0b1000_0000; + node_fixups.push((self.pos()?, node)); + Dptr::null().write(&mut self.wtr)?; + } + }; + } + self.seek_to(tag_pos)?; + // Make the top bit 1 as a sentinel. + tag = (tag >> 1) | 0b1000_0000; + self.wtr.write_u8(tag)?; + } + }; + + for (fixee_dptr, node) in node_fixups { + let node_dptr = self.write_node(node, f)?; + self.seek_to(fixee_dptr)?; + node_dptr.write(&mut self.wtr)?; + } + + Ok(node_pos) + } + + fn pos(&mut self) -> Result { + Ok(Dptr::from(self.wtr.stream_position()?)) + } + + fn seek_to(&mut self, dptr: Dptr) -> Result { + Ok(Dptr::from(self.wtr.seek(SeekFrom::Start(u64::from(dptr)))?)) + } +} diff --git a/src/error.rs b/src/error.rs index 56963b0..ba8110a 100644 --- a/src/error.rs +++ b/src/error.rs @@ -7,14 +7,97 @@ pub type Result = std::result::Result; pub enum Error { /// An invalid raw source value was used for an H3 cell. Index(u64), + + /// An io error. + #[cfg(feature = "disktree")] + Io(std::io::Error), + + /// Not a disktree. + #[cfg(feature = "disktree")] + NotDisktree, + + /// Unsupported version. + #[cfg(feature = "disktree")] + Version(u8), + + /// Invalid value tag found in disktree. + #[cfg(feature = "disktree")] + InvalidTag(u8, u64), + + /// Invalid value size bytes found in disktree header. + #[cfg(feature = "disktree")] + Varint(u32), + + /// User-provided serializer failed. + #[cfg(feature = "disktree")] + Writer(Box), } -impl std::error::Error for Error {} +#[cfg(feature = "disktree")] +impl std::convert::From for Error { + fn from(other: std::io::Error) -> Self { + Error::Io(other) + } +} + +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + Error::Index(_) => None, + + #[cfg(feature = "disktree")] + Error::Io(inner) => inner.source(), + + #[cfg(feature = "disktree")] + Error::NotDisktree => None, + + #[cfg(feature = "disktree")] + Error::Version(_) => None, + + #[cfg(feature = "disktree")] + Error::InvalidTag(_, _) => None, + + #[cfg(feature = "disktree")] + Error::Varint(_) => None, + + #[cfg(feature = "disktree")] + Error::Writer(inner) => inner.source(), + } + } +} impl std::fmt::Display for Error { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { Error::Index(bits) => write!(f, "raw u64 is not a valid H3 index: {bits}"), + + #[cfg(feature = "disktree")] + Error::Io(io_error) => io_error.fmt(f), + + #[cfg(feature = "disktree")] + Error::NotDisktree => { + write!(f, "file missing magic header") + } + + #[cfg(feature = "disktree")] + Error::Version(version) => { + write!(f, "unsupported version, got {version}") + } + + #[cfg(feature = "disktree")] + Error::InvalidTag(tag, pos) => { + write!(f, "invalid tag, got {tag}, pos {pos}") + } + + #[cfg(feature = "disktree")] + Error::Varint(val) => { + write!(f, "byte sequence is not a valid varint, got {val}") + } + + #[cfg(feature = "disktree")] + Error::Writer(writer_error) => { + write!(f, "provided writer returned an error, got {writer_error}") + } } } } diff --git a/src/hex_tree_map.rs b/src/hex_tree_map.rs index 983fa3e..d5e63fe 100644 --- a/src/hex_tree_map.rs +++ b/src/hex_tree_map.rs @@ -65,7 +65,7 @@ use std::{cmp::PartialEq, iter::FromIterator}; #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct HexTreeMap { /// All h3 0 base cell indices in the tree - nodes: Box<[Option>>]>, + pub(crate) nodes: Box<[Option>>]>, /// User-provided compator. Defaults to the null compactor. compactor: C, } diff --git a/src/hex_tree_set.rs b/src/hex_tree_set.rs index 2eeed54..ca2471a 100644 --- a/src/hex_tree_set.rs +++ b/src/hex_tree_set.rs @@ -17,7 +17,7 @@ use std::iter::FromIterator; /// /// ``` /// # fn main() -> Result<(), Box> { -/// use geo_types::coord; +/// use geo::coord; /// use hextree::{Cell, HexTreeSet}; /// # /// # use byteorder::{LittleEndian as LE, ReadBytesExt}; diff --git a/src/lib.rs b/src/lib.rs index efeeb82..6d35835 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -46,6 +46,8 @@ mod cell; pub mod compaction; mod digits; +#[cfg(feature = "disktree")] +pub mod disktree; mod entry; mod error; pub mod hex_tree_map; @@ -57,3 +59,5 @@ pub use crate::cell::Cell; pub use crate::hex_tree_map::HexTreeMap; pub use crate::hex_tree_set::HexTreeSet; pub use error::{Error, Result}; +#[cfg(feature = "disktree")] +pub use memmap; diff --git a/tests/tests.rs b/tests/tests.rs index 38bb8d4..76493c2 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -1,4 +1,4 @@ -use geo_types::coord; +use geo::coord; use h3_lorawan_regions as regions; use h3ron::H3Cell; use hextree::{compaction::EqCompactor, Cell, HexTreeMap, HexTreeSet};