Skip to content

Commit

Permalink
Merge pull request #36 from JayKickliter/jsk/add-disk-repr
Browse files Browse the repository at this point in the history
Add on DiskTree: an on-disk hextree
  • Loading branch information
JayKickliter authored Jan 5, 2024
2 parents 730f71d + c4d8108 commit 917b819
Show file tree
Hide file tree
Showing 15 changed files with 1,003 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ jobs:
- name: Check formatting
run: cargo fmt --check
- name: Tests
run: cargo test
run: cargo test --release --all-features
12 changes: 10 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,26 @@ all-features = true

[features]
default = []
disktree = [
"byteorder",
"memmap",
"serde",
]
serde = ["dep:serde"]

[dependencies]
byteorder = { version = "1", optional = true }
memmap = { version = "0.7", optional = true }
serde = { version = "1", optional = true, features = ["derive"] }

[dev-dependencies]
byteorder = "1"
bincode = { version = "1.3.3" }
byteorder = { version = "1" }
criterion = { version = "0.3", features = ["html_reports"] }
geo = "0.26.0"
geo-types = "0.7"
h3o = { version = "0.4.0", features = ["geo"] }
h3ron = "0.15.1"
tempfile = "3"

[dev-dependencies.h3-lorawan-regions]
git = "https://github.com/JayKickliter/h3-lorawan-regions.git"
Expand Down
52 changes: 48 additions & 4 deletions benches/benches.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
use geo::polygon;
use geo_types::coord;
use geo::{coord, polygon};
use h3_lorawan_regions::{
compact::US915 as COMPACT_US915_INDICES, nocompact::US915 as PLAIN_US915_INDICES,
};
Expand All @@ -9,7 +8,7 @@ use h3o::{
CellIndex, Resolution,
};
use h3ron::H3Cell;
use hextree::{compaction::EqCompactor, Cell, HexTreeMap, HexTreeSet};
use hextree::{compaction::EqCompactor, disktree::DiskTreeMap, Cell, HexTreeMap, HexTreeSet};
use std::convert::TryFrom;

fn set_lookup(c: &mut Criterion) {
Expand Down Expand Up @@ -49,6 +48,50 @@ fn set_lookup(c: &mut Criterion) {
}
}

fn disk_set_lookup(c: &mut Criterion) {
let mut group = c.benchmark_group("US915 DiskTreeSet lookup");

let us915_disk_set = {
let us915_set: HexTreeSet = PLAIN_US915_INDICES
.iter()
.map(|&idx| Cell::try_from(idx).unwrap())
.collect();
let mut file = tempfile::tempfile().unwrap();
us915_set
.to_disktree(&mut file, |_, _| Ok::<(), std::io::Error>(()))
.unwrap();
DiskTreeMap::memmap(file).unwrap()
};

let tarpon_springs = coord! {x: -82.753822, y: 28.15215};
let gulf_of_mexico = coord! {x: -83.101920, y: 28.128096};
let paris = coord! {x: 2.340340, y: 48.868680};

for resolution in [0, 4, 8, 12, 15] {
let tarpon_springs =
Cell::try_from(*H3Cell::from_coordinate(tarpon_springs, resolution).unwrap()).unwrap();
let gulf_of_mexico =
Cell::try_from(*H3Cell::from_coordinate(gulf_of_mexico, resolution).unwrap()).unwrap();
let paris = Cell::try_from(*H3Cell::from_coordinate(paris, resolution).unwrap()).unwrap();

group.bench_with_input(
BenchmarkId::new("Tarpon Spring", resolution),
&tarpon_springs,
|b, &cell| b.iter(|| us915_disk_set.contains(cell)),
);

group.bench_with_input(
BenchmarkId::new("Gulf of Mexico", resolution),
&gulf_of_mexico,
|b, &cell| b.iter(|| us915_disk_set.contains(cell)),
);

group.bench_with_input(BenchmarkId::new("Paris", resolution), &paris, |b, &cell| {
b.iter(|| us915_disk_set.contains(cell))
});
}
}

fn set_construction(c: &mut Criterion) {
let mut group = c.benchmark_group("US915 HexTreeSet construction");

Expand Down Expand Up @@ -256,8 +299,9 @@ fn subtree_iter(c: &mut Criterion) {

criterion_group!(
benches,
subtree_iter,
set_lookup,
disk_set_lookup,
subtree_iter,
map_lookup,
set_iteration,
map_iteration,
Expand Down
82 changes: 82 additions & 0 deletions src/disktree/dptr.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
use crate::Result;
use std::{
io::{Read, Write},
mem::size_of,
};

/// A 'disk' pointer.
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
#[repr(transparent)]
pub(crate) struct Dptr(u64);

impl Dptr {
#[allow(clippy::cast_possible_truncation)]
const MAX: u64 = 2_u64.pow(Self::DISK_REPR_SZ as u32 * 8) - 1;
const DISK_REPR_SZ: usize = 5;
const NULL: u64 = 0;

pub(crate) const fn is_null(self) -> bool {
self.0 == Self::NULL
}

pub(crate) const fn null() -> Dptr {
Dptr(Self::NULL)
}

pub(crate) const fn size() -> u64 {
Self::DISK_REPR_SZ as u64
}

/// Read 5 bytes from disk and parses them as little-endian `u64`.
pub(crate) fn read<R>(src: &mut R) -> Result<Self>
where
R: Read,
{
let mut buf = [0u8; size_of::<u64>()];
src.read_exact(&mut buf[..Self::DISK_REPR_SZ])?;
let dptr = u64::from_le_bytes(buf);
Ok(dptr.into())
}

/// Read 5 * `n` bytes from disk, for up to n=7, and parses them as
/// little-endian `u64`s.
pub(crate) fn read_n<R>(src: &mut R, n: usize) -> Result<Vec<Dptr>>
where
R: Read,
{
debug_assert!(n <= 7);
let mut buf = [0; Self::DISK_REPR_SZ * 7];
src.read_exact(&mut buf[..(Self::DISK_REPR_SZ * n)])?;
Ok(buf[..(Self::DISK_REPR_SZ * n)]
.chunks(Self::DISK_REPR_SZ)
.map(|chunk| {
let mut buf = [0u8; size_of::<u64>()];
buf[..Self::DISK_REPR_SZ].copy_from_slice(chunk);
u64::from_le_bytes(buf)
})
.map(Dptr)
.collect())
}

/// Writes the 5 lower bytes of a `u64` to disk.
pub(crate) fn write<W>(self, dst: &mut W) -> Result
where
W: Write,
{
let buf = self.0.to_le_bytes();
Ok(dst.write_all(&buf[..Self::DISK_REPR_SZ])?)
}
}

impl From<Dptr> for u64 {
fn from(Dptr(raw): Dptr) -> u64 {
raw
}
}

impl From<u64> for Dptr {
fn from(raw: u64) -> Dptr {
assert!(raw <= Self::MAX);
Dptr(raw)
}
}
176 changes: 176 additions & 0 deletions src/disktree/iter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
use crate::{
cell::CellStack,
disktree::{dptr::Dptr, tree::HDR_SZ, varint},
error::Result,
Cell,
};
use byteorder::ReadBytesExt;
use std::io::{Cursor, Seek, SeekFrom};

pub(crate) struct Iter<'a> {
cell_stack: CellStack,
curr_node: Option<(u8, Dptr)>,
disktree_buf: &'a [u8],
disktree_csr: Cursor<&'a [u8]>,
node_stack: Vec<Vec<(u8, Dptr)>>,
recycle_bin: Vec<Vec<(u8, Dptr)>>,
}

enum Node {
// File position for the fist byte of value data.
Leaf(Dptr),
// (H3 Cell digit, file position of child's node tag)
Parent(Vec<(u8, Dptr)>),
}

impl<'a> Iter<'a> {
fn seek_to(&mut self, dptr: Dptr) -> Result<Dptr> {
Ok(Dptr::from(
self.disktree_csr.seek(SeekFrom::Start(u64::from(dptr)))?,
))
}

fn read_base_nodes(rdr: &mut Cursor<&[u8]>) -> Result<Vec<(u8, Dptr)>> {
let mut buf = Vec::with_capacity(122);
rdr.seek(SeekFrom::Start(HDR_SZ))?;
for digit in 0..122 {
let dptr = Dptr::read(rdr)?;
if !dptr.is_null() {
buf.push((digit, dptr));
}
}
buf.reverse();
Ok(buf)
}

// `pos` is a position in the file of this node's tag.
fn read_node(&mut self, dptr: Dptr) -> Result<Node> {
let dptr = self.seek_to(dptr)?;
let node_tag = self.disktree_csr.read_u8()?;
if 0 == node_tag & 0b1000_0000 {
Ok(Node::Leaf(dptr))
} else {
let mut children = self.node_buf();
let n_children = (node_tag & 0b0111_1111).count_ones() as usize;
let child_dptrs = Dptr::read_n(&mut self.disktree_csr, n_children)?;
children.extend(
(0..7)
.rev()
.filter(|digit| node_tag & (1 << digit) != 0)
.zip(child_dptrs.into_iter().rev()),
);
Ok(Node::Parent(children))
}
}

/// Returns a recycled node buffer if available, otherwise
/// allocates a new one.
///
/// See [`Iter::recycle_node_buf`].
fn node_buf(&mut self) -> Vec<(u8, Dptr)> {
let buf = self
.recycle_bin
.pop()
.unwrap_or_else(|| Vec::with_capacity(7));
debug_assert!(buf.is_empty());
buf
}

/// Accepts a used, empty, node buffer for later reuse.
///
/// See [`Iter::node_buf`].
fn recycle_node_buf(&mut self, buf: Vec<(u8, Dptr)>) {
debug_assert!(buf.is_empty());
self.recycle_bin.push(buf);
}

// We've encountered an IO error. We're still going to return
// `Some` with the contents of the user's deserializer, but let's
// make sure we never yield another value by clearing stack.
fn stop_yielding(&mut self) {
self.node_stack.clear();
self.curr_node = None;
}

pub(crate) fn new(disktree_buf: &'a [u8]) -> Result<Iter<'a>> {
let mut disktree_csr = Cursor::new(disktree_buf);
let mut cell_stack = CellStack::new();
let mut node_stack = Vec::new();
let recycle_bin = Vec::new();
let mut base_nodes = Self::read_base_nodes(&mut disktree_csr)?;
let curr_node = base_nodes.pop();
node_stack.push(base_nodes);
if let Some((digit, _)) = curr_node {
cell_stack.push(digit);
}
Ok(Self {
cell_stack,
curr_node,
disktree_buf,
disktree_csr,
recycle_bin,
node_stack,
})
}
}

impl<'a> Iterator for Iter<'a> {
type Item = Result<(Cell, &'a [u8])>;

fn next(&mut self) -> Option<Self::Item> {
while self.curr_node.is_none() {
if let Some(mut dptrs) = self.node_stack.pop() {
self.cell_stack.pop();
if let Some((digit, dptr)) = dptrs.pop() {
self.cell_stack.push(digit);
self.curr_node = Some((digit, dptr));
self.node_stack.push(dptrs);
} else {
self.recycle_node_buf(dptrs);
}
} else {
break;
}
}
while let Some((digit, dptr)) = self.curr_node {
self.cell_stack.swap(digit);
match self.read_node(dptr) {
Err(e) => {
self.stop_yielding();
return Some(Err(e));
}
Ok(Node::Parent(mut children)) => {
if let Some((digit, dptr)) = children.pop() {
self.cell_stack.push(digit);
self.curr_node = Some((digit, dptr));
self.node_stack.push(children);
} else {
self.recycle_node_buf(children);
}
}
Ok(Node::Leaf(dptr)) => {
self.curr_node = None;
if let Err(e) = self.seek_to(dptr) {
self.stop_yielding();
return Some(Err(e));
}
match varint::read(&mut self.disktree_csr) {
Err(e) => {
self.stop_yielding();
return Some(Err(e));
}
Ok((val_len, _n_read)) => {
let pos = self.disktree_csr.position() as usize;
let val_buf = &self.disktree_buf[pos..][..val_len as usize];
return Some(Ok((
*self.cell_stack.cell().expect("corrupted cell-stack"),
val_buf,
)));
}
}
}
};
}
None
}
}
Loading

0 comments on commit 917b819

Please sign in to comment.