Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EXP: implement some intermediate APIs on top of Rust using pyo3 #428

Closed
wants to merge 38 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
480f319
refactor & rename & consolidate
ctb Aug 17, 2024
e6b1c5b
remove 'lower'
ctb Aug 17, 2024
3bef912
trying out a lower level (index) API
ctb Aug 17, 2024
ba2e967
fix
ctb Aug 17, 2024
5c7a760
compiling...
ctb Aug 17, 2024
87b4563
y'know ...tests
ctb Aug 17, 2024
92c9f3c
add __len__
ctb Aug 17, 2024
02e3e7d
victoir
ctb Aug 17, 2024
9209b63
more better
ctb Aug 17, 2024
a7a723e
add as load_from plugin
ctb Aug 17, 2024
5c673a3
I mean it seems to work, right?
ctb Aug 17, 2024
c8bf074
provide wrapper objects, for now
ctb Aug 17, 2024
8d4fdb5
shrug
ctb Aug 18, 2024
153f246
Merge branch 'main' of github.com:sourmash-bio/sourmash_plugin_branch…
ctb Aug 18, 2024
0d7a556
add cargo doc output for private fn
ctb Aug 18, 2024
df753db
Merge branch 'main' of github.com:sourmash-bio/sourmash_plugin_branch…
ctb Aug 18, 2024
1da0cf3
add a few comments/docs
ctb Aug 18, 2024
2e7f027
switch to dev version of sourmash
ctb Aug 18, 2024
6b9e00f
tracking
ctb Aug 18, 2024
2747935
cleaner
ctb Aug 18, 2024
4f49ef8
cleanup
ctb Aug 18, 2024
af1c82d
load rocksdb natively
ctb Aug 18, 2024
53924d6
foo
ctb Aug 18, 2024
7c73b1a
update to latest sourmash
ctb Aug 19, 2024
34f1430
Merge branch 'ctb_misc2' into branch_api
ctb Aug 19, 2024
e5faed8
Merge branch 'main' of github.com:sourmash-bio/sourmash_plugin_branch…
ctb Aug 19, 2024
7649375
Merge branch 'main' of github.com:sourmash-bio/sourmash_plugin_branch…
ctb Aug 19, 2024
e4618f0
Merge branch 'ctb_misc_cleanup' into ctb_misc2
ctb Aug 19, 2024
3462f92
cargo fmt
ctb Aug 19, 2024
9823ef6
upd
ctb Aug 20, 2024
bfb5053
upd
ctb Aug 20, 2024
c311a69
fix fmt
ctb Aug 20, 2024
28b43d8
MRG: create `MultiCollection` for collections that span multiple file…
ctb Aug 20, 2024
a1b19ae
clippy fixes
ctb Aug 20, 2024
51a14ac
compiling again
ctb Aug 20, 2024
99bd174
cleanup
ctb Aug 20, 2024
083eaf2
Merge branch 'ctb_misc2' into branch_api
ctb Aug 20, 2024
7d2e0c8
foo
ctb Aug 20, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .cargo/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[build]
rustdocflags = ["--document-private-items"]
44 changes: 22 additions & 22 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ crate-type = ["cdylib"]
pyo3 = { version = "0.22.2", features = ["extension-module", "anyhow"] }
rayon = "1.10.0"
serde = { version = "1.0.208", features = ["derive"] }
sourmash = { version = "0.15.0", features = ["branchwater"] }
sourmash = { git = "https://github.com/sourmash-bio/sourmash.git", branch = "more_rs_updates", features = ["branchwater"] }
#sourmash = { version = "0.15.0", features = ["branchwater"] }
serde_json = "1.0.125"
niffler = "2.4.0"
log = "0.4.22"
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ authors = [
requires = ["maturin>=1.4.0,<2"]
build-backend = "maturin"

[project.entry-points."sourmash.load_from"]
collection_reader = "sourmash_plugin_branchwater:load_collection"

[project.entry-points."sourmash.cli_script"]
manysearch = "sourmash_plugin_branchwater:Branchwater_Manysearch"
multisearch = "sourmash_plugin_branchwater:Branchwater_Multisearch"
Expand Down
165 changes: 165 additions & 0 deletions src/branch_api.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
//! Lower-level Python API implementation for sourmash_plugin_branchwater.
use pyo3::prelude::*;

use crate::utils::build_selection;
use crate::utils::load_collection;
use crate::utils::ReportType;
use crate::utils::multicollection::MultiCollection;
use sourmash::collection::Collection;
use sourmash::manifest::{Manifest, Record};
use pyo3::types::{IntoPyDict, PyDict, PyList};

/// Python-visible wrapper that owns a single sourmash manifest `Record`.
#[pyclass]
pub struct BranchRecord {
    /// The wrapped manifest record; not exposed to Python as an attribute.
record: Record,
}

#[pymethods]
impl BranchRecord {
    /// Return an owned copy of the wrapped record's name.
    pub fn get_name(&self) -> PyResult<String> {
        let name = self.record.name().clone();
        Ok(name)
    }

    /// Build a Python dict describing this record.
    ///
    /// The dict holds the keys `ksize`, `moltype`, `scaled`, `num`,
    /// `with_abundance` and `n_hashes`, each converted to a Python object.
    /// `moltype` is stringified before conversion.
    ///
    /// Marked `#[getter]`, so pyo3 exposes it as a read-only property
    /// (presumably named `as_row`, following pyo3's `get_` prefix rule).
    #[getter]
    pub fn get_as_row<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyDict>> {
        let rec = &self.record;
        let columns: Vec<(&str, PyObject)> = vec![
            ("ksize", rec.ksize().to_object(py)),
            ("moltype", rec.moltype().to_string().to_object(py)),
            ("scaled", rec.scaled().to_object(py)),
            ("num", rec.num().to_object(py)),
            ("with_abundance", rec.with_abundance().to_object(py)),
            ("n_hashes", rec.n_hashes().to_object(py)),
        ];
        Ok(columns.into_py_dict_bound(py))
    }
}

/*
impl<T, I> IntoPyDict for I
where
T: PyDictItem
I: IntoIterator<Item = T>
fn into_py_dict(self, py: Python<'_>) -> Bound<'_, PyDict> {
let dict = PyDict::new(py);
for item in self {
dict.set_item(item.key(), item.value())
.expect("Failed to set_item on dict");
}
dict
}
}
*/

/// Python-visible wrapper that owns a sourmash `Manifest`.
#[pyclass]
pub struct BranchManifest {
    /// The wrapped manifest; not exposed to Python as an attribute.
manifest: Manifest,
}

#[pymethods]
impl BranchManifest {
    /// Number of records in the wrapped manifest (backs Python's `len()`).
    pub fn __len__(&self) -> PyResult<usize> {
        Ok(self.manifest.len())
    }

    /// Always returns `true`; no validation of row values is performed here.
    pub fn _check_row_values(&self) -> PyResult<bool> {
        Ok(true)
    }

    /// Return one Python dict per manifest record, in manifest iteration order.
    ///
    /// Each dict is produced by `BranchRecord::get_as_row` on a clone of the
    /// record.
    ///
    /// # Errors
    ///
    /// Returns the first error raised while building a row dict, instead of
    /// panicking inside the Python call.
    #[getter]
    pub fn get_rows<'py>(&self, py: Python<'py>) -> PyResult<Vec<Bound<'py, PyDict>>> {
        self.manifest
            .iter()
            .map(|record| {
                // BranchRecord owns its Record, so a clone is needed to reuse
                // its row-building logic.
                BranchRecord {
                    record: record.clone(),
                }
                .get_as_row(py)
            })
            // Collecting into PyResult short-circuits on the first failure.
            .collect()
    }
}

/// Python-visible wrapper around a loaded `MultiCollection`.
///
/// `location`, `is_database` and `has_manifest` are readable from Python
/// through `#[pyo3(get)]`; the collection itself is only reachable through
/// the methods defined on this class.
#[pyclass]
pub struct BranchCollection {
    /// Location string this collection was loaded from.
#[pyo3(get)]
pub location: String,

    /// NOTE(review): presumably mirrors the sourmash Python index attribute of
    /// the same name; `api_load_collection` always sets it to `false`.
#[pyo3(get)]
pub is_database: bool,

    /// NOTE(review): presumably mirrors the sourmash Python index attribute of
    /// the same name; `api_load_collection` always sets it to `true`.
#[pyo3(get)]
pub has_manifest: bool,

    /// The wrapped collection; not exposed to Python as an attribute.
collection: MultiCollection,
}

#[pymethods]
impl BranchCollection {
pub fn __len__(&self) -> PyResult<usize> {
Ok(self.collection.len())
}

#[getter]
pub fn get_manifest(&self) -> PyResult<Py<BranchManifest>> {
let manifest: Manifest = self.collection.manifest().clone();
let obj =
Python::with_gil(|py| Py::new(py, BranchManifest { manifest: manifest }).unwrap());
Ok(obj)
}
pub fn get_first_record(&self) -> PyResult<Py<BranchRecord>> {
let records: Vec<_> = self.collection.iter().collect();
let first_record = records.first().unwrap().1;

// @CTB: can I turn this into something automatic?
let obj = Python::with_gil(|py| {
Py::new(
py,
BranchRecord {
record: first_record.clone(),
},
)
.unwrap()
});
Ok(obj)
}

#[getter]
pub fn get_rows(&self) -> PyResult<Vec<BranchRecord>> {
let records: Vec<_> = self.collection.iter().collect();

let obj = records
.iter()
.map(|x| {
BranchRecord {
record: x.1.clone(),
}
})
.collect();

// @CTB: this does the GIL grabbing as needed?
Ok(obj)
}
}

#[pyfunction]
pub fn api_load_collection(
location: String,
ksize: u8,
scaled: usize,
moltype: String,
) -> PyResult<Py<BranchCollection>> {
let selection = build_selection(ksize, scaled, &moltype);

let collection = load_collection(&location, &selection, ReportType::Query, true).unwrap();
let obj = Python::with_gil(|py| {
Py::new(
py,
BranchCollection {
location: location,
collection,
is_database: false,
has_manifest: true,
},
)
.unwrap()
});
Ok(obj)
}
2 changes: 1 addition & 1 deletion src/fastgather.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ pub fn fastgather(
)
}
// get single query sig and minhash
let query_sig = query_collection.sig_for_dataset(0)?; // need this for original md5sum
let query_sig = query_collection.get_first_sig().unwrap();
let query_sig_ds = query_sig.clone().select(selection)?; // downsample
let query_mh = match query_sig_ds.minhash() {
Some(query_mh) => query_mh,
Expand Down
6 changes: 3 additions & 3 deletions src/fastmultigather.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,11 @@ pub fn fastmultigather(
let skipped_paths = AtomicUsize::new(0);
let failed_paths = AtomicUsize::new(0);

query_collection.par_iter().for_each(|(_idx, record)| {
query_collection.par_iter().for_each(|(c, _idx, record)| {
// increment counter of # of queries. q: could we instead use the _idx from par_iter(), or will it vary based on thread?
let _i = processed_queries.fetch_add(1, atomic::Ordering::SeqCst);
// Load query sig (downsampling happens here)
match query_collection.sig_from_record(record) {
match c.sig_from_record(record) {
Ok(query_sig) => {
let name = query_sig.name();
let prefix = name.split(' ').next().unwrap_or_default().to_string();
Expand Down Expand Up @@ -133,7 +133,7 @@ pub fn fastmultigather(
if let Ok(mut file) = File::create(&sig_filename) {
let unique_hashes: HashSet<u64> = hashes.into_iter().collect();
let mut new_mh = KmerMinHash::new(
query_mh.scaled().try_into().unwrap(),
query_mh.scaled(),
query_mh.ksize().try_into().unwrap(),
query_mh.hash_function().clone(),
query_mh.seed(),
Expand Down
16 changes: 15 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
/// Python interface Rust code for sourmash_plugin_branchwater.
//! Rust-to-Python interface code for sourmash_plugin_branchwater, using pyo3.
//!
//! If you're using Rust, you're probably most interested in
//! [utils](utils/index.html)

use pyo3::prelude::*;

#[macro_use]
extern crate simple_error;

mod branch_api;
mod utils;
use crate::utils::build_selection;
use crate::utils::is_revindex_database;
Expand Down Expand Up @@ -106,6 +111,7 @@ fn do_fastgather(
}

#[pyfunction]
#[allow(clippy::too_many_arguments)]
#[pyo3(signature = (query_filenames, siglist_path, threshold_bp, ksize, scaled, moltype, output_path=None, save_matches=false))]
fn do_fastmultigather(
query_filenames: String,
Expand Down Expand Up @@ -322,8 +328,11 @@ fn do_cluster(
}
}

/// Module interface for the `sourmash_plugin_branchwater` extension module.

#[pymodule]
fn sourmash_plugin_branchwater(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
// top level 'scripts' commands
m.add_function(wrap_pyfunction!(do_manysearch, m)?)?;
m.add_function(wrap_pyfunction!(do_fastgather, m)?)?;
m.add_function(wrap_pyfunction!(do_fastmultigather, m)?)?;
Expand All @@ -334,5 +343,10 @@ fn sourmash_plugin_branchwater(_py: Python, m: &Bound<'_, PyModule>) -> PyResult
m.add_function(wrap_pyfunction!(do_multisearch, m)?)?;
m.add_function(wrap_pyfunction!(do_pairwise, m)?)?;
m.add_function(wrap_pyfunction!(do_cluster, m)?)?;

// lower level API stuff
m.add_class::<branch_api::BranchCollection>()?;
m.add_function(wrap_pyfunction!(branch_api::api_load_collection, m)?)?;

Ok(())
}
Loading
Loading