Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add on DiskTree: an on-disk hextree #36

Merged
merged 1 commit into from
Jan 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ jobs:
- name: Check formatting
run: cargo fmt --check
- name: Tests
run: cargo test
run: cargo test --release --all-features
12 changes: 10 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,26 @@ all-features = true

[features]
default = []
disktree = [
"byteorder",
"memmap",
"serde",
]
serde = ["dep:serde"]

[dependencies]
byteorder = { version = "1", optional = true }
memmap = { version = "0.7", optional = true }
serde = { version = "1", optional = true, features = ["derive"] }

[dev-dependencies]
byteorder = "1"
bincode = { version = "1.3.3" }
byteorder = { version = "1" }
criterion = { version = "0.3", features = ["html_reports"] }
geo = "0.26.0"
geo-types = "0.7"
h3o = { version = "0.4.0", features = ["geo"] }
h3ron = "0.15.1"
tempfile = "3"

[dev-dependencies.h3-lorawan-regions]
git = "https://github.com/JayKickliter/h3-lorawan-regions.git"
Expand Down
52 changes: 48 additions & 4 deletions benches/benches.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
use geo::polygon;
use geo_types::coord;
use geo::{coord, polygon};
use h3_lorawan_regions::{
compact::US915 as COMPACT_US915_INDICES, nocompact::US915 as PLAIN_US915_INDICES,
};
Expand All @@ -9,7 +8,7 @@ use h3o::{
CellIndex, Resolution,
};
use h3ron::H3Cell;
use hextree::{compaction::EqCompactor, Cell, HexTreeMap, HexTreeSet};
use hextree::{compaction::EqCompactor, disktree::DiskTreeMap, Cell, HexTreeMap, HexTreeSet};
use std::convert::TryFrom;

fn set_lookup(c: &mut Criterion) {
Expand Down Expand Up @@ -49,6 +48,50 @@ fn set_lookup(c: &mut Criterion) {
}
}

fn disk_set_lookup(c: &mut Criterion) {
let mut group = c.benchmark_group("US915 DiskTreeSet lookup");

let us915_disk_set = {
let us915_set: HexTreeSet = PLAIN_US915_INDICES
.iter()
.map(|&idx| Cell::try_from(idx).unwrap())
.collect();
let mut file = tempfile::tempfile().unwrap();
us915_set
.to_disktree(&mut file, |_, _| Ok::<(), std::io::Error>(()))
.unwrap();
DiskTreeMap::memmap(file).unwrap()
};

let tarpon_springs = coord! {x: -82.753822, y: 28.15215};
let gulf_of_mexico = coord! {x: -83.101920, y: 28.128096};
let paris = coord! {x: 2.340340, y: 48.868680};

for resolution in [0, 4, 8, 12, 15] {
let tarpon_springs =
Cell::try_from(*H3Cell::from_coordinate(tarpon_springs, resolution).unwrap()).unwrap();
let gulf_of_mexico =
Cell::try_from(*H3Cell::from_coordinate(gulf_of_mexico, resolution).unwrap()).unwrap();
let paris = Cell::try_from(*H3Cell::from_coordinate(paris, resolution).unwrap()).unwrap();

group.bench_with_input(
BenchmarkId::new("Tarpon Spring", resolution),
&tarpon_springs,
|b, &cell| b.iter(|| us915_disk_set.contains(cell)),
);

group.bench_with_input(
BenchmarkId::new("Gulf of Mexico", resolution),
&gulf_of_mexico,
|b, &cell| b.iter(|| us915_disk_set.contains(cell)),
);

group.bench_with_input(BenchmarkId::new("Paris", resolution), &paris, |b, &cell| {
b.iter(|| us915_disk_set.contains(cell))
});
}
}

fn set_construction(c: &mut Criterion) {
let mut group = c.benchmark_group("US915 HexTreeSet construction");

Expand Down Expand Up @@ -256,8 +299,9 @@ fn subtree_iter(c: &mut Criterion) {

criterion_group!(
benches,
subtree_iter,
set_lookup,
disk_set_lookup,
subtree_iter,
map_lookup,
set_iteration,
map_iteration,
Expand Down
82 changes: 82 additions & 0 deletions src/disktree/dptr.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
use crate::Result;
use std::{
io::{Read, Write},
mem::size_of,
};

/// A 'disk' pointer.
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
#[repr(transparent)]
pub(crate) struct Dptr(u64);

impl Dptr {
#[allow(clippy::cast_possible_truncation)]
const MAX: u64 = 2_u64.pow(Self::DISK_REPR_SZ as u32 * 8) - 1;
const DISK_REPR_SZ: usize = 5;
const NULL: u64 = 0;

pub(crate) const fn is_null(self) -> bool {
self.0 == Self::NULL
}

pub(crate) const fn null() -> Dptr {
Dptr(Self::NULL)
}

pub(crate) const fn size() -> u64 {
Self::DISK_REPR_SZ as u64
}

/// Read 5 bytes from disk and parses them as little-endian `u64`.
pub(crate) fn read<R>(src: &mut R) -> Result<Self>
where
R: Read,
{
let mut buf = [0u8; size_of::<u64>()];
src.read_exact(&mut buf[..Self::DISK_REPR_SZ])?;
let dptr = u64::from_le_bytes(buf);
Ok(dptr.into())
}

/// Read 5 * `n` bytes from disk, for up to n=7, and parses them as
/// little-endian `u64`s.
pub(crate) fn read_n<R>(src: &mut R, n: usize) -> Result<Vec<Dptr>>
where
R: Read,
{
debug_assert!(n <= 7);
let mut buf = [0; Self::DISK_REPR_SZ * 7];
src.read_exact(&mut buf[..(Self::DISK_REPR_SZ * n)])?;
Ok(buf[..(Self::DISK_REPR_SZ * n)]
.chunks(Self::DISK_REPR_SZ)
.map(|chunk| {
let mut buf = [0u8; size_of::<u64>()];
buf[..Self::DISK_REPR_SZ].copy_from_slice(chunk);
u64::from_le_bytes(buf)
})
.map(Dptr)
.collect())
}

/// Writes the 5 lower bytes of a `u64` to disk.
pub(crate) fn write<W>(self, dst: &mut W) -> Result
where
W: Write,
{
let buf = self.0.to_le_bytes();
Ok(dst.write_all(&buf[..Self::DISK_REPR_SZ])?)
}
}

impl From<Dptr> for u64 {
fn from(Dptr(raw): Dptr) -> u64 {
raw
}
}

impl From<u64> for Dptr {
fn from(raw: u64) -> Dptr {
assert!(raw <= Self::MAX);
Dptr(raw)
}
}
176 changes: 176 additions & 0 deletions src/disktree/iter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
use crate::{
cell::CellStack,
disktree::{dptr::Dptr, tree::HDR_SZ, varint},
error::Result,
Cell,
};
use byteorder::ReadBytesExt;
use std::io::{Cursor, Seek, SeekFrom};

pub(crate) struct Iter<'a> {
cell_stack: CellStack,
curr_node: Option<(u8, Dptr)>,
disktree_buf: &'a [u8],
disktree_csr: Cursor<&'a [u8]>,
node_stack: Vec<Vec<(u8, Dptr)>>,
recycle_bin: Vec<Vec<(u8, Dptr)>>,
}

enum Node {
// File position for the fist byte of value data.
Leaf(Dptr),
// (H3 Cell digit, file position of child's node tag)
Parent(Vec<(u8, Dptr)>),
}

impl<'a> Iter<'a> {
fn seek_to(&mut self, dptr: Dptr) -> Result<Dptr> {
Ok(Dptr::from(
self.disktree_csr.seek(SeekFrom::Start(u64::from(dptr)))?,
))
}

fn read_base_nodes(rdr: &mut Cursor<&[u8]>) -> Result<Vec<(u8, Dptr)>> {
let mut buf = Vec::with_capacity(122);
rdr.seek(SeekFrom::Start(HDR_SZ))?;
for digit in 0..122 {
let dptr = Dptr::read(rdr)?;
if !dptr.is_null() {
buf.push((digit, dptr));
}
}
buf.reverse();
Ok(buf)
}

// `pos` is a position in the file of this node's tag.
fn read_node(&mut self, dptr: Dptr) -> Result<Node> {
let dptr = self.seek_to(dptr)?;
let node_tag = self.disktree_csr.read_u8()?;
if 0 == node_tag & 0b1000_0000 {
Ok(Node::Leaf(dptr))
} else {
let mut children = self.node_buf();
let n_children = (node_tag & 0b0111_1111).count_ones() as usize;
let child_dptrs = Dptr::read_n(&mut self.disktree_csr, n_children)?;
children.extend(
(0..7)
.rev()
.filter(|digit| node_tag & (1 << digit) != 0)
.zip(child_dptrs.into_iter().rev()),
);
Ok(Node::Parent(children))
}
}

/// Returns a recycled node buffer if available, otherwise
/// allocates a new one.
///
/// See [`Iter::recycle_node_buf`].
fn node_buf(&mut self) -> Vec<(u8, Dptr)> {
let buf = self
.recycle_bin
.pop()
.unwrap_or_else(|| Vec::with_capacity(7));
debug_assert!(buf.is_empty());
buf
}

/// Accepts a used, empty, node buffer for later reuse.
///
/// See [`Iter::node_buf`].
fn recycle_node_buf(&mut self, buf: Vec<(u8, Dptr)>) {
debug_assert!(buf.is_empty());
self.recycle_bin.push(buf);
}

// We've encountered an IO error. We're still going to return
// `Some` with the contents of the user's deserializer, but let's
// make sure we never yield another value by clearing stack.
fn stop_yielding(&mut self) {
self.node_stack.clear();
self.curr_node = None;
}

pub(crate) fn new(disktree_buf: &'a [u8]) -> Result<Iter<'a>> {
let mut disktree_csr = Cursor::new(disktree_buf);
let mut cell_stack = CellStack::new();
let mut node_stack = Vec::new();
let recycle_bin = Vec::new();
let mut base_nodes = Self::read_base_nodes(&mut disktree_csr)?;
let curr_node = base_nodes.pop();
node_stack.push(base_nodes);
if let Some((digit, _)) = curr_node {
cell_stack.push(digit);
}
Ok(Self {
cell_stack,
curr_node,
disktree_buf,
disktree_csr,
recycle_bin,
node_stack,
})
}
}

impl<'a> Iterator for Iter<'a> {
type Item = Result<(Cell, &'a [u8])>;

fn next(&mut self) -> Option<Self::Item> {
while self.curr_node.is_none() {
if let Some(mut dptrs) = self.node_stack.pop() {
self.cell_stack.pop();
if let Some((digit, dptr)) = dptrs.pop() {
self.cell_stack.push(digit);
self.curr_node = Some((digit, dptr));
self.node_stack.push(dptrs);
} else {
self.recycle_node_buf(dptrs);
}
} else {
break;
}
}
while let Some((digit, dptr)) = self.curr_node {
self.cell_stack.swap(digit);
match self.read_node(dptr) {
Err(e) => {
self.stop_yielding();
return Some(Err(e));
}
Ok(Node::Parent(mut children)) => {
if let Some((digit, dptr)) = children.pop() {
self.cell_stack.push(digit);
self.curr_node = Some((digit, dptr));
self.node_stack.push(children);
} else {
self.recycle_node_buf(children);
}
}
Ok(Node::Leaf(dptr)) => {
self.curr_node = None;
if let Err(e) = self.seek_to(dptr) {
self.stop_yielding();
return Some(Err(e));
}
match varint::read(&mut self.disktree_csr) {
Err(e) => {
self.stop_yielding();
return Some(Err(e));
}
Ok((val_len, _n_read)) => {
let pos = self.disktree_csr.position() as usize;
let val_buf = &self.disktree_buf[pos..][..val_len as usize];
return Some(Ok((
*self.cell_stack.cell().expect("corrupted cell-stack"),
val_buf,
)));
}
}
}
};
}
None
}
}
Loading
Loading