diff --git a/Cargo.lock b/Cargo.lock index b631a48..4e6b015 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -29,15 +29,6 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "aho-corasick" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" -dependencies = [ - "memchr", -] - [[package]] name = "anstream" version = "0.6.15" @@ -87,21 +78,6 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "anyhow" -version = "1.0.86" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" - -[[package]] -name = "approx" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6" -dependencies = [ - "num-traits", -] - [[package]] name = "async-compression" version = "0.4.12" @@ -136,79 +112,12 @@ dependencies = [ "rustc-demangle", ] -[[package]] -name = "bio" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8cbd545253762ecf9ef741f2c49f07c06a0ce4d041d74ee9c3f1ce0e2d5446e" -dependencies = [ - "anyhow", - "approx", - "bio-types", - "bit-set", - "bv", - "bytecount", - "csv", - "custom_derive", - "editdistancek", - "enum-map", - "fxhash", - "itertools", - "itertools-num", - "lazy_static", - "multimap", - "ndarray", - "newtype_derive", - "num-integer", - "num-traits", - "ordered-float", - "petgraph", - "rand", - "regex", - "serde", - "serde_derive", - "statrs", - "strum", - "strum_macros", - "thiserror", - "triple_accel", - "vec_map", -] - -[[package]] -name = "bio-types" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cc7edd677651969cc262a8dfb870f0c2266c3ceeaf863d742982e39699ff460" -dependencies = [ - "derive-new", - "lazy_static", - "regex", - "strum_macros", - "thiserror", -] - -[[package]] -name = "bit-set" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" -dependencies = [ - "bit-vec 0.8.0", -] - [[package]] name = "bit-vec" version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2c54ff287cfc0a34f38a6b832ea1bd8e448a330b3e40a50859e6488bee07f22" -[[package]] -name = "bit-vec" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" - [[package]] name = "bitflags" version = "2.6.0" @@ -234,28 +143,6 @@ dependencies = [ "serde", ] -[[package]] -name = "bv" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8834bb1d8ee5dc048ee3124f2c7c1afcc6bc9aed03f11e9dfd8c69470a5db340" -dependencies = [ - "feature-probe", - "serde", -] - -[[package]] -name = "bytecount" -version = "0.6.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" - -[[package]] -name = "bytemuck" -version = "1.16.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83" - [[package]] name = "byteorder" version = "1.5.0" @@ -390,44 +277,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "csv" -version = "1.3.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" -dependencies = [ - "csv-core", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "csv-core" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" -dependencies = [ - "memchr", -] - -[[package]] -name = "custom_derive" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9" - -[[package]] -name = "derive-new" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d150dea618e920167e5973d70ae6ece4385b7164e0d799fe7c122dd0a5d912ad" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "digest" version = "0.10.7" @@ -438,38 +287,12 @@ dependencies = [ "crypto-common", ] -[[package]] -name = "editdistancek" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e02df23d5b1c6f9e69fa603b890378123b93073df998a21e6e33b9db0a32613" - [[package]] name = "either" version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" -[[package]] -name = "enum-map" -version = "2.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6866f3bfdf8207509a033af1a75a7b08abda06bbaaeae6669323fd5a097df2e9" -dependencies = [ - "enum-map-derive", -] - -[[package]] -name = "enum-map-derive" -version = "0.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f282cfdfe92516eb26c2af8589c274c7c17681f5ecc03c18255fe741c6aa64eb" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "equivalent" version = "1.0.1" @@ -488,12 +311,6 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" -[[package]] -name = "feature-probe" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "835a3dc7d1ec9e75e2b5fb4ba75396837112d2060b03f7d43bc1897c7f7211da" - [[package]] name = "fixedbitset" version = "0.4.2" @@ -575,22 +392,14 @@ dependencies = [ "slab", ] -[[package]] -name = "fxhash" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" -dependencies = [ - "byteorder", -] - [[package]] name = "gen" version = "0.1.0" dependencies = [ - "bio", "clap", "include_dir", + "intervaltree", + "itertools", "noodles", "petgraph", "rusqlite", @@ -608,17 +417,6 @@ dependencies = [ "version_check", ] -[[package]] -name = "getrandom" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" -dependencies = [ - "cfg-if", - "libc", - "wasi", -] - [[package]] name = "gimli" version = "0.29.0" @@ -678,6 +476,15 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "intervaltree" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "270bc34e57047cab801a8c871c124d9dc7132f6473c6401f645524f4e6edd111" +dependencies = [ + "smallvec", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.1" @@ -693,27 +500,6 @@ 
dependencies = [ "either", ] -[[package]] -name = "itertools-num" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a872a22f9e6f7521ca557660adb96dd830e54f0f490fa115bb55dd69d38b27e7" -dependencies = [ - "num-traits", -] - -[[package]] -name = "itoa" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" - -[[package]] -name = "lazy_static" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" - [[package]] name = "lexical-core" version = "0.8.5" @@ -784,12 +570,6 @@ version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" -[[package]] -name = "libm" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" - [[package]] name = "libsqlite3-sys" version = "0.28.0" @@ -818,16 +598,6 @@ dependencies = [ "pkg-config", ] -[[package]] -name = "matrixmultiply" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9380b911e3e96d10c1f415da0876389aaf1b56759054eeb0de7df940c456ba1a" -dependencies = [ - "autocfg", - "rawpointer", -] - [[package]] name = "md-5" version = "0.10.6" @@ -853,66 +623,6 @@ dependencies = [ "adler", ] -[[package]] -name = "multimap" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "defc4c55412d89136f966bbb339008b474350e5e6e78d2714439c386b3137a03" -dependencies = [ - "serde", -] - -[[package]] -name = "nalgebra" -version = "0.32.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b5c17de023a86f59ed79891b2e5d5a94c705dbe904a5b5c9c952ea6221b03e4" -dependencies = [ - "approx", - "matrixmultiply", - "nalgebra-macros", - "num-complex", - "num-rational", - "num-traits", - "rand", - "rand_distr", - "simba", - "typenum", -] - -[[package]] -name = "nalgebra-macros" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "254a5372af8fc138e36684761d3c0cdb758a4410e938babcff1c860ce14ddbfc" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "ndarray" -version = "0.15.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" -dependencies = [ - "matrixmultiply", - "num-complex", - "num-integer", - "num-traits", - "rawpointer", -] - -[[package]] -name = "newtype_derive" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac8cd24d9f185bb7223958d8c1ff7a961b74b1953fd05dba7cc568a63b3861ec" -dependencies = [ - "rustc_version", -] - [[package]] name = "noodles" version = "0.78.0" @@ -922,6 +632,7 @@ dependencies = [ "noodles-bam", "noodles-bcf", "noodles-bgzf", + "noodles-core", "noodles-cram", "noodles-csi", "noodles-fasta", @@ -938,7 +649,7 @@ version = "0.65.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "406d4768f21c73e3075c0c0d77a5b21bc8b8169c8f0963122607cc410427b727" dependencies = [ - "bit-vec 0.7.0", + "bit-vec", "bstr", "byteorder", "bytes", @@ -1023,7 +734,7 @@ version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e4bc8001c54f1d8e47e1ac6041a5f27edc99b68bacea3fade9c89059de285aea" dependencies = [ - "bit-vec 0.7.0", + "bit-vec", "byteorder", "indexmap", "noodles-bgzf", @@ -1095,7 +806,7 @@ version = "0.43.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "545e16e229b7f8734b0a2a36bd4c98a5b70128663b16b5201ddadc0d09c28d4a" dependencies = [ - "bit-vec 0.7.0", + "bit-vec", "byteorder", "indexmap", "noodles-bgzf", @@ -1122,44 +833,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "num-complex" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" -dependencies = [ - "num-traits", -] - -[[package]] -name = "num-integer" -version = "0.1.46" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" -dependencies = [ - "num-traits", -] - -[[package]] -name = "num-rational" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" -dependencies = [ - "num-integer", - "num-traits", -] - -[[package]] -name = "num-traits" -version = "0.2.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" -dependencies = [ - "autocfg", - "libm", -] - [[package]] name = "object" version = "0.36.2" @@ -1175,21 +848,6 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" -[[package]] -name = "ordered-float" -version = "4.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a91171844676f8c7990ce64959210cd2eaef32c2612c50f9fae9f8aaa6065a6" -dependencies = [ - "num-traits", -] - -[[package]] -name = "paste" -version = "1.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" - [[package]] name = "percent-encoding" version = "2.3.1" @@ -1224,15 +882,6 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" -[[package]] -name = "ppv-lite86" -version = "0.2.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" -dependencies = [ - "zerocopy", -] - [[package]] name = "proc-macro2" version = "1.0.86" @@ -1251,81 +900,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom", -] - -[[package]] -name = "rand_distr" -version = "0.4.3" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" -dependencies = [ - "num-traits", - "rand", -] - -[[package]] -name = "rawpointer" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" - -[[package]] -name = "regex" -version = "1.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" -dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" - [[package]] name = "rusqlite" version = "0.31.0" @@ -1357,42 +931,6 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" -[[package]] -name = "rustc_version" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f5376ea5e30ce23c03eb77cbe4962b988deead10910c372b226388b594c084" -dependencies = [ - "semver", -] - -[[package]] -name = "rustversion" -version = "1.0.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" - -[[package]] -name = "ryu" -version = "1.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" - -[[package]] -name = "safe_arch" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3460605018fdc9612bce72735cba0d27efbcd9904780d44c7e3a9948f96148a" -dependencies = [ - "bytemuck", -] - -[[package]] -name = "semver" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac" - [[package]] name = "serde" version = "1.0.204" @@ -1424,19 +962,6 @@ dependencies = [ "digest", ] -[[package]] -name = "simba" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "061507c94fc6ab4ba1c9a0305018408e312e17c041eb63bef8aa726fa33aceae" -dependencies = [ - "approx", - "num-complex", - "num-traits", - "paste", - "wide", -] - [[package]] name = "slab" version = "0.4.9" @@ -1458,43 +983,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" -[[package]] -name = "statrs" -version = "0.17.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f697a07e4606a0a25c044de247e583a330dbb1731d11bc7350b81f48ad567255" -dependencies = [ - "approx", - "nalgebra", - "num-traits", - "rand", -] - [[package]] name = "strsim" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" -[[package]] -name = "strum" -version = 
"0.26.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" - -[[package]] -name = "strum_macros" -version = "0.26.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "rustversion", - "syn", -] - [[package]] name = "syn" version = "2.0.72" @@ -1506,26 +1000,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "thiserror" -version = "1.0.63" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" -dependencies = [ - "thiserror-impl", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.63" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "tokio" version = "1.39.2" @@ -1550,12 +1024,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "triple_accel" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22048bc95dfb2ffd05b1ff9a756290a009224b60b2f0e7525faeee7603851e63" - [[package]] name = "typenum" version = "1.17.0" @@ -1580,37 +1048,12 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" -[[package]] -name = "vec_map" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" -dependencies = [ - "serde", -] - [[package]] name = "version_check" version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" -[[package]] -name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" - -[[package]] -name = "wide" -version = "0.7.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "901e8597c777fa042e9e245bd56c0dc4418c5db3f845b6ff94fbac732c6a0692" -dependencies = [ - "bytemuck", - "safe_arch", -] - [[package]] name = "windows-sys" version = "0.52.0" @@ -1699,7 +1142,6 @@ version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ - "byteorder", "zerocopy-derive", ] diff --git a/Cargo.toml b/Cargo.toml index 31ce0d3..c560b9f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,11 +4,12 @@ version = "0.1.0" edition = "2021" [dependencies] -bio = "2.0.0" clap = { version = "4.5.8", features = ["derive"] } include_dir = "0.7.4" +intervaltree = "0.2.7" +itertools = "0.13.0" rusqlite = { version = "0.31.0", features = ["bundled", "array"] } rusqlite_migration = { version = "1.2.0" , features = ["from-directory"]} sha2 = "0.10.8" -noodles = { version = "0.78.0", features = ["vcf", "fasta", "async"] } +noodles = { version = "0.78.0", features = ["core", "vcf", "fasta", "async"] } petgraph = "0.6.5" diff --git a/migrations/01-initial/up.sql b/migrations/01-initial/up.sql index 36d3c7d..13a6c70 100644 --- a/migrations/01-initial/up.sql +++ 
b/migrations/01-initial/up.sql @@ -9,7 +9,7 @@ CREATE TABLE sample ( CREATE TABLE sequence ( hash TEXT PRIMARY KEY NOT NULL, sequence_type TEXT NOT NULL, - sequence TEXT, + sequence TEXT NOT NULL, "length" INTEGER NOT NULL ); @@ -79,4 +79,41 @@ CREATE TABLE change_log ( FOREIGN KEY(path_id) REFERENCES path(id), FOREIGN KEY(sequence_hash) REFERENCES sequence(hash) ); -CREATE UNIQUE INDEX change_log_uidx ON change_log(hash); \ No newline at end of file +CREATE UNIQUE INDEX change_log_uidx ON change_log(hash); + +CREATE TABLE new_edges ( + id INTEGER PRIMARY KEY NOT NULL, + source_hash TEXT NOT NULL, + source_coordinate INTEGER NOT NULL, + source_strand TEXT NOT NULL, + target_hash TEXT NOT NULL, + target_coordinate INTEGER NOT NULL, + target_strand TEXT NOT NULL, + chromosome_index INTEGER NOT NULL, + phased INTEGER NOT NULL, + FOREIGN KEY(source_hash) REFERENCES sequence(hash), + FOREIGN KEY(target_hash) REFERENCES sequence(hash), + constraint chk_phased check (phased in (0, 1)) +); +CREATE UNIQUE INDEX new_edge_uidx ON new_edges(source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased); + +CREATE TABLE path_edges ( + id INTEGER PRIMARY KEY NOT NULL, + path_id INTEGER NOT NULL, + index_in_path INTEGER NOT NULL, + edge_id INTEGER NOT NULL, + FOREIGN KEY(edge_id) REFERENCES new_edges(id), + FOREIGN KEY(path_id) REFERENCES path(id) +); +CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, edge_id); + +CREATE TABLE block_group_edges ( + id INTEGER PRIMARY KEY NOT NULL, + block_group_id INTEGER NOT NULL, + edge_id INTEGER NOT NULL, + FOREIGN KEY(block_group_id) REFERENCES block_group(id), + FOREIGN KEY(edge_id) REFERENCES new_edges(id) +); +CREATE UNIQUE INDEX block_group_edges_uidx ON block_group_edges(block_group_id, edge_id); + +INSERT INTO sequence (hash, sequence_type, sequence, "length") values ("start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "OTHER", "start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", 64), ("end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", "OTHER", "end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", 64); diff --git a/src/main.rs b/src/main.rs index bea2e9d..31ea7c6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,11 +3,22 @@ use clap::{Parser, Subcommand}; use std::collections::{HashMap, HashSet}; use std::fmt::Debug; use std::path::PathBuf; +use std::{io, str}; -use bio::io::fasta; use gen::migrations::run_migrations; -use gen::models::{self, block::Block, edge::Edge, path::Path, sequence::Sequence, BlockGroup}; +use gen::models::{ + self, + block::Block, + block_group_edge::BlockGroupEdge, + edge::Edge, + new_edge::NewEdge, + path::{NewBlock, Path}, + path_edge::PathEdge, + sequence::Sequence, + BlockGroup, +}; use gen::{get_connection, parse_genotype}; +use noodles::fasta; use noodles::vcf; use noodles::vcf::variant::record::samples::series::value::genotype::Phasing; use noodles::vcf::variant::record::samples::series::Value; @@ -15,7 +26,6 @@ use noodles::vcf::variant::record::samples::{Sample, Series}; use noodles::vcf::variant::record::{AlternateBases, ReferenceBases, Samples}; use noodles::vcf::variant::Record; use rusqlite::{types::Value as SQLValue, Connection}; -use std::io; #[derive(Parser)] #[command(version, about, long_about = None)] @@ -66,27 +76,76 @@ enum Commands { fn import_fasta(fasta: &String, name: &str, shallow: bool, conn: &mut Connection) { // TODO: support gz - let mut reader = 
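// A minimal sketch for the "support gz" TODO just above, assuming a `flate2` dependency
// (not currently declared in this Cargo.toml) and the same noodles reader type used below:
//
//     use std::{fs::File, io::BufReader};
//     use flate2::read::MultiGzDecoder;
//     let file = File::open(fasta).unwrap();
//     let mut reader = noodles::fasta::io::Reader::new(BufReader::new(MultiGzDecoder::new(file)));
//     for result in reader.records() { /* same record handling as below */ }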
fasta::Reader::from_file(fasta).unwrap(); + let mut reader = fasta::io::reader::Builder.build_from_path(fasta).unwrap(); if !models::Collection::exists(conn, name) { let collection = models::Collection::create(conn, name); for result in reader.records() { let record = result.expect("Error during fasta record parsing"); - let sequence = String::from_utf8(record.seq().to_vec()).unwrap(); + let sequence = str::from_utf8(record.sequence().as_ref()) + .unwrap() + .to_string(); + let name = String::from_utf8(record.name().to_vec()).unwrap(); + let sequence_length = record.sequence().len() as i32; let seq_hash = Sequence::create(conn, "DNA", &sequence, !shallow); - let block_group = BlockGroup::create(conn, &collection.name, None, record.id()); - let block = Block::create( + let block_group = BlockGroup::create(conn, &collection.name, None, &name); + let block = Block::create(conn, &seq_hash, block_group.id, 0, sequence_length, "+"); + Edge::create(conn, None, Some(block.id), 0, 0); + Edge::create(conn, Some(block.id), None, 0, 0); + Path::create(conn, &name, block_group.id, vec![block.id]); + } + println!("Created it"); + } else { + println!("Collection {:1} already exists", name); + } +} + +fn new_import_fasta(fasta: &String, name: &str, shallow: bool, conn: &mut Connection) { + // TODO: support gz + let mut reader = fasta::io::reader::Builder.build_from_path(fasta).unwrap(); + + if !models::Collection::exists(conn, name) { + let collection = models::Collection::create(conn, name); + + for result in reader.records() { + let record = result.expect("Error during fasta record parsing"); + let sequence = str::from_utf8(record.sequence().as_ref()) + .unwrap() + .to_string(); + let name = String::from_utf8(record.name().to_vec()).unwrap(); + let sequence_length = record.sequence().len() as i32; + let seq_hash = Sequence::create(conn, "DNA", &sequence, !shallow); + let block_group = BlockGroup::create(conn, &collection.name, None, &name); + let edge_into = NewEdge::create( conn, - &seq_hash, - block_group.id, + NewEdge::PATH_START_HASH.to_string(), + 0, + "+".to_string(), + seq_hash.to_string(), + 0, + "+".to_string(), + 0, 0, - (sequence.len() as i32), - "+", ); - Edge::create(conn, None, Some(block.id), 0, 0); - Edge::create(conn, Some(block.id), None, 0, 0); - Path::create(conn, record.id(), block_group.id, vec![block.id]); + let edge_out_of = NewEdge::create( + conn, + seq_hash.to_string(), + sequence_length, + "+".to_string(), + NewEdge::PATH_END_HASH.to_string(), + 0, + "+".to_string(), + 0, + 0, + ); + BlockGroupEdge::bulk_create(conn, block_group.id, vec![edge_into.id, edge_out_of.id]); + Path::new_create( + conn, + &name, + block_group.id, + vec![edge_into.id, edge_out_of.id], + ); } println!("Created it"); } else { @@ -233,6 +292,156 @@ fn update_with_vcf( } } +fn new_update_with_vcf( + vcf_path: &String, + collection_name: &String, + fixed_genotype: String, + fixed_sample: String, + conn: &mut Connection, +) { + run_migrations(conn); + + let mut reader = vcf::io::reader::Builder::default() + .build_from_path(vcf_path) + .expect("Unable to parse"); + let header = reader.read_header().unwrap(); + let sample_names = header.sample_names(); + for name in sample_names { + models::Sample::create(conn, name); + } + if !fixed_sample.is_empty() { + models::Sample::create(conn, &fixed_sample); + } + let mut genotype = vec![]; + if !fixed_genotype.is_empty() { + genotype = parse_genotype(&fixed_genotype); + } + + for result in reader.records() { + let record = result.unwrap(); + let seq_name = 
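// Worked example of the coordinate conversion a few lines below (hedged): VCF positions are
// 1-based and end-inclusive, so a record with POS=3 and REF="AT" yields
//     ref_start = 3 - 1 = 2   // 0-based, inclusive
//     ref_end   = 4           // 0-based, exclusive (covers reference indices 2 and 3)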
record.reference_sequence_name().to_string(); + let ref_allele = record.reference_bases(); + // this converts the coordinates to be zero based, start inclusive, end exclusive + let ref_start = record.variant_start().unwrap().unwrap().get() - 1; + let ref_end = record.variant_end(&header).unwrap().get(); + let alt_bases = record.alternate_bases(); + let alt_alleles: Vec<_> = alt_bases.iter().collect::>().unwrap(); + // TODO: fix this duplication of handling an insert + if !fixed_sample.is_empty() && !genotype.is_empty() { + for (chromosome_index, genotype) in genotype.iter().enumerate() { + if let Some(gt) = genotype { + if gt.allele != 0 { + let alt_seq = alt_alleles[chromosome_index - 1]; + let phased = match gt.phasing { + Phasing::Phased => 1, + Phasing::Unphased => 0, + }; + // TODO: new sequence may not be real and be or some sort. Handle these. + let new_sequence_hash = Sequence::create(conn, "DNA", alt_seq, true); + let sequence = + Sequence::sequence_from_hash(conn, &new_sequence_hash).unwrap(); + let sample_bg_id = BlockGroup::get_or_create_sample_block_group( + conn, + collection_name, + &fixed_sample, + &seq_name, + ); + let sample_paths = Path::get_paths( + conn, + "select * from path where block_group_id = ?1 AND name = ?2", + vec![ + SQLValue::from(sample_bg_id), + SQLValue::from(seq_name.clone()), + ], + ); + let new_block = NewBlock { + id: 0, + sequence: sequence.clone(), + block_sequence: alt_seq.to_string(), + sequence_start: 0, + sequence_end: alt_seq.len() as i32, + path_start: ref_start as i32, + path_end: ref_end as i32, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change( + conn, + sample_bg_id, + &sample_paths[0], + ref_start as i32, + ref_end as i32, + &new_block, + chromosome_index as i32, + phased, + ); + } + } + } + } else { + for (sample_index, sample) in record.samples().iter().enumerate() { + let genotype = sample.get(&header, "GT"); + if genotype.is_some() { + if let Value::Genotype(genotypes) = genotype.unwrap().unwrap().unwrap() { + for (chromosome_index, gt) in genotypes.iter().enumerate() { + if gt.is_ok() { + let (allele, phasing) = gt.unwrap(); + let phased = match phasing { + Phasing::Phased => 1, + Phasing::Unphased => 0, + }; + let allele = allele.unwrap(); + if allele != 0 { + let alt_seq = alt_alleles[allele - 1]; + // TODO: new sequence may not be real and be or some sort. Handle these. 
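// The TODO above presumably means the ALT allele may not be literal bases; VCF also allows
// "*" and symbolic alleles such as "<DEL>". A minimal, illustrative guard (an assumption,
// not part of this change) if such records should simply be skipped for now:
//
//     if alt_seq == "*" || alt_seq.starts_with('<') {
//         continue;
//     }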
+ let new_sequence_hash = + Sequence::create(conn, "DNA", alt_seq, true); + let sequence = + Sequence::sequence_from_hash(conn, &new_sequence_hash) + .unwrap(); + let sample_bg_id = BlockGroup::get_or_create_sample_block_group( + conn, + collection_name, + &sample_names[sample_index], + &seq_name, + ); + let sample_paths = Path::get_paths( + conn, + "select * from path where block_group_id = ?1 AND name = ?2", + vec![ + SQLValue::from(sample_bg_id), + SQLValue::from(seq_name.clone()), + ], + ); + let new_block = NewBlock { + id: 0, + sequence: sequence.clone(), + block_sequence: alt_seq.to_string(), + sequence_start: 0, + sequence_end: alt_seq.len() as i32, + path_start: ref_start as i32, + path_end: ref_end as i32, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change( + conn, + sample_bg_id, + &sample_paths[0], + ref_start as i32, + ref_end as i32, + &new_block, + chromosome_index as i32, + phased, + ); + } + } + } + } + } + } + } + } +} + fn main() { let cli = Cli::parse(); @@ -242,7 +451,7 @@ fn main() { name, db, shallow, - }) => import_fasta(fasta, name, *shallow, &mut get_connection(db)), + }) => new_import_fasta(fasta, name, *shallow, &mut get_connection(db)), Some(Commands::Update { name, db, @@ -250,7 +459,7 @@ fn main() { vcf, genotype, sample, - }) => update_with_vcf( + }) => new_update_with_vcf( vcf, name, genotype.clone().unwrap_or("".to_string()), diff --git a/src/models.rs b/src/models.rs index 2f20026..fc7ea2f 100644 --- a/src/models.rs +++ b/src/models.rs @@ -1,3 +1,4 @@ +use itertools::Itertools; use petgraph::graphmap::DiGraphMap; use petgraph::Direction; use rusqlite::types::Value; @@ -7,14 +8,20 @@ use std::collections::{HashMap, HashSet}; use std::fmt::*; pub mod block; +pub mod block_group_edge; pub mod edge; +pub mod new_edge; pub mod path; +pub mod path_edge; pub mod sequence; use crate::graph::all_simple_paths; use crate::models::block::Block; +use crate::models::block_group_edge::BlockGroupEdge; use crate::models::edge::Edge; -use crate::models::path::{Path, PathBlock}; +use crate::models::new_edge::{EdgeData, NewEdge}; +use crate::models::path::{NewBlock, Path, PathBlock}; +use crate::models::path_edge::PathEdge; use crate::models::sequence::Sequence; use crate::{get_overlap, models}; @@ -91,6 +98,21 @@ pub struct BlockGroup { pub name: String, } +#[derive(Clone)] +pub struct GroupBlock { + pub id: i32, + pub sequence_hash: String, + pub sequence: String, + pub start: i32, + pub end: i32, +} + +#[derive(Eq, Hash, PartialEq)] +pub struct BlockKey { + pub sequence_hash: String, + pub coordinate: i32, +} + impl BlockGroup { pub fn create( conn: &Connection, @@ -214,11 +236,11 @@ impl BlockGroup { ); for path in existing_paths { - let mut new_blocks = vec![]; - for block in path.blocks { - new_blocks.push(*block_map.get(&block).unwrap()); - } - Path::create(conn, &path.name, target_block_group_id, new_blocks); + let edge_ids = PathEdge::edges_for(conn, path.id) + .into_iter() + .map(|edge| edge.id) + .collect(); + Path::new_create(conn, &path.name, target_block_group_id, edge_ids); } } @@ -274,17 +296,9 @@ impl BlockGroup { } let sequence_hashes = block_map .values() - .map(|block| format!("\"{id}\"", id = block.sequence_hash)) - .collect::>() - .join(","); - let mut sequence_map = HashMap::new(); - for sequence in Sequence::get_sequences( - conn, - &format!("select * from sequence where hash in ({sequence_hashes})"), - vec![], - ) { - sequence_map.insert(sequence.hash, sequence.sequence); - } + .map(|block| block.sequence_hash.clone()) + .collect::>(); + 
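// `Sequence::sequences_by_hash` is not part of this diff; from its call sites it is assumed
// to look roughly like this sketch, a single `hash IN (...)` query keyed by hash:
//
//     pub fn sequences_by_hash(conn: &Connection, hashes: Vec<String>) -> HashMap<String, Sequence> {
//         let formatted = hashes.iter().map(|h| format!("\"{h}\"")).collect::<Vec<_>>().join(",");
//         Sequence::get_sequences(conn, &format!("select * from sequence where hash in ({formatted})"), vec![])
//             .into_iter()
//             .map(|seq| (seq.hash.clone(), seq))
//             .collect()
//     }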
let sequence_map = Sequence::sequences_by_hash(conn, sequence_hashes); let block_ids = block_map .keys() .map(|id| format!("{id}")) @@ -321,7 +335,8 @@ impl BlockGroup { let block = block_map.get(&start_node).unwrap(); let block_sequence = sequence_map.get(&block.sequence_hash).unwrap(); sequences.insert( - block_sequence[(block.start as usize)..(block.end as usize)].to_string(), + block_sequence.sequence[(block.start as usize)..(block.end as usize)] + .to_string(), ); } else { for path in all_simple_paths(&graph, start_node, *end_node) { @@ -330,7 +345,8 @@ impl BlockGroup { let block = block_map.get(&node).unwrap(); let block_sequence = sequence_map.get(&block.sequence_hash).unwrap(); current_sequence.push_str( - &block_sequence[(block.start as usize)..(block.end as usize)], + &block_sequence.sequence + [(block.start as usize)..(block.end as usize)], ); } sequences.insert(current_sequence); @@ -341,6 +357,195 @@ impl BlockGroup { sequences } + pub fn blocks_from_edges(conn: &Connection, edges: Vec) -> Vec { + let mut sequence_hashes = HashSet::new(); + for edge in &edges { + if edge.source_hash != NewEdge::PATH_START_HASH { + sequence_hashes.insert(edge.source_hash.clone()); + } + if edge.target_hash != NewEdge::PATH_END_HASH { + sequence_hashes.insert(edge.target_hash.clone()); + } + } + + let mut boundary_edges_by_hash = HashMap::>::new(); + for edge in edges { + if (edge.source_hash == edge.target_hash) + && (edge.target_coordinate == edge.source_coordinate) + { + boundary_edges_by_hash + .entry(edge.source_hash.clone()) + .and_modify(|current_edges| current_edges.push(edge.clone())) + .or_insert_with(|| vec![edge.clone()]); + } + } + + let sequences_by_hash = + Sequence::sequences_by_hash(conn, sequence_hashes.into_iter().collect::>()); + let mut blocks = vec![]; + + let mut block_index = 0; + for (hash, sequence) in sequences_by_hash.into_iter() { + let sequence_edges = boundary_edges_by_hash.get(&hash); + if sequence_edges.is_some() { + let sorted_sequence_edges: Vec = sequence_edges + .unwrap() + .iter() + .sorted_by(|edge1, edge2| { + Ord::cmp(&edge1.source_coordinate, &edge2.source_coordinate) + }) + .cloned() + .collect(); + let first_edge = sorted_sequence_edges[0].clone(); + let start = 0; + let end = first_edge.source_coordinate; + let block_sequence = sequence.sequence[start as usize..end as usize].to_string(); + let first_block = GroupBlock { + id: block_index, + sequence_hash: hash.clone(), + sequence: block_sequence, + start, + end, + }; + blocks.push(first_block); + block_index += 1; + for (into, out_of) in sorted_sequence_edges.clone().into_iter().tuple_windows() { + let start = into.target_coordinate; + let end = out_of.source_coordinate; + let block_sequence = + sequence.sequence[start as usize..end as usize].to_string(); + let block = GroupBlock { + id: block_index, + sequence_hash: hash.clone(), + sequence: block_sequence, + start, + end, + }; + blocks.push(block); + block_index += 1; + } + let last_edge = &sorted_sequence_edges[sorted_sequence_edges.len() - 1]; + let start = last_edge.target_coordinate; + let end = sequence.sequence.len() as i32; + let block_sequence = sequence.sequence[start as usize..end as usize].to_string(); + let last_block = GroupBlock { + id: block_index, + sequence_hash: hash.clone(), + sequence: block_sequence, + start, + end, + }; + blocks.push(last_block); + block_index += 1; + } else { + blocks.push(GroupBlock { + id: block_index, + sequence_hash: hash.clone(), + sequence: sequence.sequence.clone(), + start: 0, + end: 
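// Worked example for blocks_from_edges (hedged): a 10 bp sequence with boundary edges
// (same hash, same source and target coordinate) at 4 and 7 is cut into GroupBlocks
// covering [0, 4), [4, 7) and [7, 10); a sequence with no boundary edges, as in this
// branch, stays a single block covering [0, len).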
sequence.sequence.len() as i32, + }); + block_index += 1; + } + } + blocks + } + + pub fn new_get_all_sequences(conn: &Connection, block_group_id: i32) -> HashSet { + let edges = BlockGroupEdge::edges_for_block_group(conn, block_group_id); + let blocks = BlockGroup::blocks_from_edges(conn, edges.clone()); + + let blocks_by_start = blocks + .clone() + .into_iter() + .map(|block| { + ( + BlockKey { + sequence_hash: block.sequence_hash, + coordinate: block.start, + }, + block.id, + ) + }) + .collect::>(); + let blocks_by_end = blocks + .clone() + .into_iter() + .map(|block| { + ( + BlockKey { + sequence_hash: block.sequence_hash, + coordinate: block.end, + }, + block.id, + ) + }) + .collect::>(); + let blocks_by_id = blocks + .clone() + .into_iter() + .map(|block| (block.id, block)) + .collect::>(); + + let mut graph: DiGraphMap = DiGraphMap::new(); + for block in blocks { + graph.add_node(block.id); + } + for edge in edges { + let source_key = BlockKey { + sequence_hash: edge.source_hash, + coordinate: edge.source_coordinate, + }; + let source_id = blocks_by_end.get(&source_key); + let target_key = BlockKey { + sequence_hash: edge.target_hash, + coordinate: edge.target_coordinate, + }; + let target_id = blocks_by_start.get(&target_key); + if let Some(source_id_value) = source_id { + if let Some(target_id_value) = target_id { + graph.add_edge(*source_id_value, *target_id_value, ()); + } + } + } + + let mut start_nodes = vec![]; + let mut end_nodes = vec![]; + for node in graph.nodes() { + let has_incoming = graph.neighbors_directed(node, Direction::Incoming).next(); + let has_outgoing = graph.neighbors_directed(node, Direction::Outgoing).next(); + if has_incoming.is_none() { + start_nodes.push(node); + } + if has_outgoing.is_none() { + end_nodes.push(node); + } + } + let mut sequences = HashSet::::new(); + + for start_node in start_nodes { + for end_node in &end_nodes { + // TODO: maybe make all_simple_paths return a single path id where start == end + if start_node == *end_node { + let block = blocks_by_id.get(&start_node).unwrap(); + sequences.insert(block.sequence.clone()); + } else { + for path in all_simple_paths(&graph, start_node, *end_node) { + let mut current_sequence = "".to_string(); + for node in path { + let block = blocks_by_id.get(&node).unwrap(); + let block_sequence = block.sequence.clone(); + current_sequence.push_str(&block_sequence); + } + sequences.insert(current_sequence); + } + } + } + } + + sequences + } + #[allow(clippy::ptr_arg)] #[allow(clippy::too_many_arguments)] pub fn insert_change( @@ -426,20 +631,19 @@ impl BlockGroup { // |----range---| let start_split_point = block.start + start - path_start; let end_split_point = block.start + end - path_start; - let mut next_block; - if start_split_point == block.start { + let next_block = if start_split_point == block.start { if let Some(pb) = previous_block { new_edges.push((Some(pb.id), Some(new_block_id))); } - next_block = block.clone(); + block.clone() } else { let (left_block, right_block) = Block::split(conn, block, start_split_point, chromosome_index, phased) .unwrap(); Block::delete(conn, block.id); new_edges.push((Some(left_block.id), Some(new_block_id))); - next_block = right_block.clone(); - } + right_block.clone() + }; if end_split_point == next_block.start { new_edges.push((Some(new_block_id), Some(next_block.id))); @@ -506,6 +710,122 @@ impl BlockGroup { change.save(conn); } + + #[allow(clippy::ptr_arg)] + #[allow(clippy::too_many_arguments)] + #[allow(clippy::needless_late_init)] + pub fn 
new_insert_change( + conn: &mut Connection, + block_group_id: i32, + path: &Path, + start: i32, + end: i32, + new_block: &NewBlock, + chromosome_index: i32, + phased: i32, + ) { + let tree = Path::intervaltree_for(conn, path); + + let start_blocks: Vec = + tree.query_point(start).map(|x| x.value.clone()).collect(); + assert_eq!(start_blocks.len(), 1); + // NOTE: This may not be used but needs to be initialized here instead of inside the if + // statement that uses it, so that the borrow checker is happy + let previous_start_blocks: Vec = tree + .query_point(start - 1) + .map(|x| x.value.clone()) + .collect(); + assert_eq!(previous_start_blocks.len(), 1); + let start_block; + if start_blocks[0].path_start == start { + // First part of this block will be replaced/deleted, need to get previous block to add + // edge including it + start_block = &previous_start_blocks[0]; + } else { + start_block = &start_blocks[0]; + } + + let end_blocks: Vec = tree.query_point(end).map(|x| x.value.clone()).collect(); + assert_eq!(end_blocks.len(), 1); + let end_block = &end_blocks[0]; + + let mut new_edges = vec![]; + + if new_block.sequence_start == new_block.sequence_end { + // Deletion + let new_edge = EdgeData { + source_hash: start_block.sequence.hash.clone(), + source_coordinate: start - start_block.path_start + start_block.sequence_start, + source_strand: "+".to_string(), + target_hash: end_block.sequence.hash.clone(), + target_coordinate: end - end_block.path_start + end_block.sequence_start, + target_strand: "+".to_string(), + chromosome_index, + phased, + }; + new_edges.push(new_edge); + } else { + // Insertion/replacement + let new_start_edge = EdgeData { + source_hash: start_block.sequence.hash.clone(), + source_coordinate: start - start_block.path_start + start_block.sequence_start, + source_strand: "+".to_string(), + target_hash: new_block.sequence.hash.clone(), + target_coordinate: new_block.sequence_start, + target_strand: "+".to_string(), + chromosome_index, + phased, + }; + let new_end_edge = EdgeData { + source_hash: new_block.sequence.hash.clone(), + source_coordinate: new_block.sequence_end, + source_strand: "+".to_string(), + target_hash: end_block.sequence.hash.clone(), + target_coordinate: end - end_block.path_start + end_block.sequence_start, + target_strand: "+".to_string(), + chromosome_index, + phased, + }; + new_edges.push(new_start_edge); + new_edges.push(new_end_edge); + } + + // NOTE: Add edges marking the existing part of the sequence that is being substituted out, + // so we can retrieve it as one node of the overall graph + if start < start_block.path_end { + let split_coordinate = start - start_block.path_start + start_block.sequence_start; + let new_split_start_edge = EdgeData { + source_hash: start_block.sequence.hash.clone(), + source_coordinate: split_coordinate, + source_strand: "+".to_string(), + target_hash: start_block.sequence.hash.clone(), + target_coordinate: split_coordinate, + target_strand: "+".to_string(), + chromosome_index, + phased, + }; + new_edges.push(new_split_start_edge); + } + + if end > end_block.path_start { + let split_coordinate = end - end_block.path_start + end_block.sequence_start; + let new_split_end_edge = EdgeData { + source_hash: end_block.sequence.hash.clone(), + source_coordinate: split_coordinate, + source_strand: "+".to_string(), + target_hash: end_block.sequence.hash.clone(), + target_coordinate: split_coordinate, + target_strand: "+".to_string(), + chromosome_index, + phased, + }; + + new_edges.push(new_split_end_edge); + } + + let 
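// Worked example, using the setup_multipath fixture in the tests below (four 10 bp blocks:
// A, T, C, G): inserting a 4 bp sequence N over path coordinates 7..15 yields four edges,
// A@7 -> N@0 and N@4 -> T@5 plus the same-coordinate split edges A@7 -> A@7 and T@5 -> T@5,
// which is why those tests expect the alternate sequence
// "AAAAAAANNNNTTTTTCCCCCCCCCCGGGGGGGGGG".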
edge_ids = NewEdge::bulk_create(conn, new_edges); + BlockGroupEdge::bulk_create(conn, block_group_id, edge_ids); + } } pub struct ChangeLog { @@ -583,7 +903,6 @@ impl ChangeLog { mod tests { use super::*; use crate::migrations::run_migrations; - use std::hash::Hash; fn get_connection() -> Connection { let mut conn = Connection::open_in_memory() @@ -843,4 +1162,428 @@ mod tests { ]) ); } + + fn setup_multipath(conn: &Connection) -> (i32, Path) { + let a_seq_hash = Sequence::create(conn, "DNA", "AAAAAAAAAA", true); + let t_seq_hash = Sequence::create(conn, "DNA", "TTTTTTTTTT", true); + let c_seq_hash = Sequence::create(conn, "DNA", "CCCCCCCCCC", true); + let g_seq_hash = Sequence::create(conn, "DNA", "GGGGGGGGGG", true); + let _collection = Collection::create(conn, "test"); + let block_group = BlockGroup::create(conn, "test", None, "hg19"); + let edge0 = NewEdge::create( + conn, + NewEdge::PATH_START_HASH.to_string(), + 0, + "+".to_string(), + a_seq_hash.clone(), + 0, + "+".to_string(), + 0, + 0, + ); + let edge1 = NewEdge::create( + conn, + a_seq_hash, + 10, + "+".to_string(), + t_seq_hash.clone(), + 0, + "+".to_string(), + 0, + 0, + ); + let edge2 = NewEdge::create( + conn, + t_seq_hash, + 10, + "+".to_string(), + c_seq_hash.clone(), + 0, + "+".to_string(), + 0, + 0, + ); + let edge3 = NewEdge::create( + conn, + c_seq_hash, + 10, + "+".to_string(), + g_seq_hash.clone(), + 0, + "+".to_string(), + 0, + 0, + ); + let edge4 = NewEdge::create( + conn, + g_seq_hash, + 10, + "+".to_string(), + NewEdge::PATH_END_HASH.to_string(), + 0, + "+".to_string(), + 0, + 0, + ); + BlockGroupEdge::bulk_create( + conn, + block_group.id, + vec![edge0.id, edge1.id, edge2.id, edge3.id, edge4.id], + ); + let path = Path::new_create( + conn, + "chr1", + block_group.id, + vec![edge0.id, edge1.id, edge2.id, edge3.id, edge4.id], + ); + (block_group.id, path) + } + + #[test] + fn insert_and_deletion_new_get_all() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 7, + path_end: 15, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 7, 15, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAANNNNTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + + let deletion_sequence_hash = Sequence::create(&conn, "DNA", "", true); + let deletion_sequence = + Sequence::sequence_from_hash(&conn, &deletion_sequence_hash).unwrap(); + let deletion = NewBlock { + id: 0, + sequence: deletion_sequence.clone(), + block_sequence: deletion_sequence.sequence.clone(), + sequence_start: 0, + sequence_end: 0, + path_start: 19, + path_end: 31, + strand: "+".to_string(), + }; + + // take out an entire block. 
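// For context: on the A/T/C/G fixture, path coordinates 19..31 span the last T, the entire
// C block, and the first G, so the deletion path reads as 10 A's + 9 T's + 9 G's
// ("AAAAAAAAAATTTTTTTTTGGGGGGGGG").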
+ BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 19, 31, &deletion, 1, 0); + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAANNNNTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAAATTTTTTTTTGGGGGGGGG".to_string(), + "AAAAAAANNNNTTTTGGGGGGGGG".to_string(), + ]) + ) + } + + #[test] + fn simple_insert_new_get_all() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 7, + path_end: 15, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 7, 15, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAANNNNTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } + + #[test] + fn insert_on_block_boundary_middle_new() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 15, + path_end: 15, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 15, 15, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAAATTTTTNNNNTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } + + #[test] + fn insert_within_block_new() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 12, + path_end: 17, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 12, 17, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAAATTNNNNTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } + + #[test] + fn insert_on_block_boundary_start_new() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + 
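// For context: path coordinate 10 is the boundary between the A and T blocks, so start
// equals the path_start of the block found at 10. This exercises the previous_start_blocks
// lookup in new_insert_change, and the expected result below splices the N's in
// immediately after the ten A's.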
block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 10, + path_end: 10, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 10, 10, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAAANNNNTTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } + + #[test] + fn insert_on_block_boundary_end_new() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 9, + path_end: 9, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 9, 9, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAANNNNATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } + + #[test] + fn insert_across_entire_block_boundary_new() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 10, + path_end: 20, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 10, 20, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAAANNNNCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } + + #[test] + fn insert_across_two_blocks_new() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 15, + path_end: 25, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 15, 25, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAAATTTTTNNNNCCCCCGGGGGGGGGG".to_string() + ]) + ); + } + + #[test] + fn insert_spanning_blocks_new() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let insert_sequence = Sequence::sequence_from_hash(&conn, 
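// For context: the 5..35 range below runs from inside the first block to inside the last
// one, so everything between the first five A's and the last five G's is replaced and the
// expected alternate sequence collapses to "AAAAANNNNGGGGG".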
&insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 5, + path_end: 35, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 5, 35, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAANNNNGGGGG".to_string() + ]) + ); + } + + #[test] + fn simple_deletion_new() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let deletion_sequence_hash = Sequence::create(&conn, "DNA", "", true); + let deletion_sequence = + Sequence::sequence_from_hash(&conn, &deletion_sequence_hash).unwrap(); + let deletion = NewBlock { + id: 0, + sequence: deletion_sequence.clone(), + block_sequence: deletion_sequence.sequence.clone(), + sequence_start: 0, + sequence_end: 0, + path_start: 19, + path_end: 31, + strand: "+".to_string(), + }; + + // take out an entire block. + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 19, 31, &deletion, 1, 0); + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAAATTTTTTTTTGGGGGGGGG".to_string(), + ]) + ) + } + + #[test] + fn doesnt_apply_same_insert_twice_new() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 7, + path_end: 15, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 7, 15, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAANNNNTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 7, 15, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAANNNNTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } } diff --git a/src/models/block_group_edge.rs b/src/models/block_group_edge.rs new file mode 100644 index 0000000..9a19675 --- /dev/null +++ b/src/models/block_group_edge.rs @@ -0,0 +1,54 @@ +use crate::models::new_edge::NewEdge; +use rusqlite::types::Value; +use rusqlite::{params_from_iter, Connection}; + +#[derive(Clone, Debug)] +pub struct BlockGroupEdge { + pub id: i32, + pub block_group_id: i32, + pub edge_id: i32, +} + +impl BlockGroupEdge { + pub fn bulk_create(conn: &Connection, block_group_id: i32, edge_ids: Vec) { + let mut rows_to_insert = vec![]; + for edge_id in edge_ids { + let row = format!("({0}, {1})", block_group_id, edge_id); + rows_to_insert.push(row); + } + let formatted_rows_to_insert = rows_to_insert.join(", "); + + let insert_statement = 
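// For illustration: the statement built below is plain interpolation of integer ids, so
// for block_group_id 1 and edge_ids [5, 6] it reads
//
//     INSERT OR IGNORE INTO block_group_edges (block_group_id, edge_id) VALUES (1, 5), (1, 6);
//
// INSERT OR IGNORE relies on the block_group_edges_uidx unique index from the migration
// to keep repeated bulk_create calls idempotent.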
+            format!(
+                "INSERT OR IGNORE INTO block_group_edges (block_group_id, edge_id) VALUES {0};",
+                formatted_rows_to_insert
+            );
+        let _ = conn.execute(&insert_statement, ());
+    }
+
+    pub fn edges_for_block_group(conn: &Connection, block_group_id: i32) -> Vec<NewEdge> {
+        let query = format!(
+            "select * from block_group_edges where block_group_id = {};",
+            block_group_id
+        );
+        let block_group_edges = BlockGroupEdge::query(conn, &query, vec![]);
+        let edge_ids = block_group_edges
+            .into_iter()
+            .map(|block_group_edge| block_group_edge.edge_id)
+            .collect();
+        NewEdge::bulk_load(conn, edge_ids)
+    }
+
+    pub fn query(conn: &Connection, query: &str, placeholders: Vec<Value>) -> Vec<BlockGroupEdge> {
+        let mut stmt = conn.prepare(query).unwrap();
+        let rows = stmt
+            .query_map(params_from_iter(placeholders), |row| {
+                Ok(BlockGroupEdge {
+                    id: row.get(0)?,
+                    block_group_id: row.get(1)?,
+                    edge_id: row.get(2)?,
+                })
+            })
+            .unwrap();
+        rows.map(|row| row.unwrap()).collect()
+    }
+}
diff --git a/src/models/new_edge.rs b/src/models/new_edge.rs
new file mode 100644
index 0000000..4ee3c8a
--- /dev/null
+++ b/src/models/new_edge.rs
@@ -0,0 +1,386 @@
+use rusqlite::types::Value;
+use rusqlite::{params_from_iter, types::Value as SQLValue, Connection};
+use std::collections::{HashMap, HashSet};
+use std::hash::RandomState;
+
+#[derive(Clone, Debug)]
+pub struct NewEdge {
+    pub id: i32,
+    pub source_hash: String,
+    pub source_coordinate: i32,
+    pub source_strand: String,
+    pub target_hash: String,
+    pub target_coordinate: i32,
+    pub target_strand: String,
+    pub chromosome_index: i32,
+    pub phased: i32,
+}
+
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct EdgeData {
+    pub source_hash: String,
+    pub source_coordinate: i32,
+    pub source_strand: String,
+    pub target_hash: String,
+    pub target_coordinate: i32,
+    pub target_strand: String,
+    pub chromosome_index: i32,
+    pub phased: i32,
+}
+
+impl NewEdge {
+    pub const PATH_START_HASH: &'static str =
+        "start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy";
+    pub const PATH_END_HASH: &'static str =
+        "end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz";
+
+    #[allow(clippy::too_many_arguments)]
+    pub fn create(
+        conn: &Connection,
+        source_hash: String,
+        source_coordinate: i32,
+        source_strand: String,
+        target_hash: String,
+        target_coordinate: i32,
+        target_strand: String,
+        chromosome_index: i32,
+        phased: i32,
+    ) -> NewEdge {
+        let query = "INSERT INTO new_edges (source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8) RETURNING *";
+        let id_query = "select id from new_edges where source_hash = ?1 and source_coordinate = ?2 and source_strand = ?3 and target_hash = ?4 and target_coordinate = ?5 and target_strand = ?6 and chromosome_index = ?7 and phased = ?8";
+        let placeholders: Vec<SQLValue> = vec![
+            source_hash.clone().into(),
+            source_coordinate.into(),
+            source_strand.clone().into(),
+            target_hash.clone().into(),
+            target_coordinate.into(),
+            target_strand.clone().into(),
+            chromosome_index.into(),
+            phased.into(),
+        ];
+
+        let mut stmt = conn.prepare(query).unwrap();
+        match stmt.query_row(params_from_iter(&placeholders), |row| {
+            Ok(NewEdge {
+                id: row.get(0)?,
+                source_hash: row.get(1)?,
+                source_coordinate: row.get(2)?,
+                source_strand: row.get(3)?,
+                target_hash: row.get(4)?,
+                target_coordinate: row.get(5)?,
+                target_strand: row.get(6)?,
+                chromosome_index: row.get(7)?,
+                phased: row.get(8)?,
+            })
+        }) {
+            Ok(edge) => edge,
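+            // If the insert trips a uniqueness constraint, the edge already exists;
+            // look up and return the existing row by its full key instead of failing.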
+            Err(rusqlite::Error::SqliteFailure(err, details)) => {
+                if err.code == rusqlite::ErrorCode::ConstraintViolation {
+                    println!("{err:?} {details:?}");
+                    NewEdge {
+                        id: conn
+                            .query_row(id_query, params_from_iter(&placeholders), |row| row.get(0))
+                            .unwrap(),
+                        source_hash,
+                        source_coordinate,
+                        source_strand,
+                        target_hash,
+                        target_coordinate,
+                        target_strand,
+                        chromosome_index,
+                        phased,
+                    }
+                } else {
+                    panic!("something bad happened querying the database")
+                }
+            }
+            Err(_) => {
+                panic!("something bad happened querying the database")
+            }
+        }
+    }
+
+    pub fn bulk_load(conn: &Connection, edge_ids: Vec<i32>) -> Vec<NewEdge> {
+        let formatted_edge_ids = edge_ids
+            .into_iter()
+            .map(|edge_id| edge_id.to_string())
+            .collect::<Vec<String>>()
+            .join(",");
+        let query = format!("select id, source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased from new_edges where id in ({});", formatted_edge_ids);
+        NewEdge::query(conn, &query, vec![])
+    }
+
+    pub fn query(conn: &Connection, query: &str, placeholders: Vec<Value>) -> Vec<NewEdge> {
+        let mut stmt = conn.prepare(query).unwrap();
+        let rows = stmt
+            .query_map(params_from_iter(placeholders), |row| {
+                Ok(NewEdge {
+                    id: row.get(0)?,
+                    source_hash: row.get(1)?,
+                    source_coordinate: row.get(2)?,
+                    source_strand: row.get(3)?,
+                    target_hash: row.get(4)?,
+                    target_coordinate: row.get(5)?,
+                    target_strand: row.get(6)?,
+                    chromosome_index: row.get(7)?,
+                    phased: row.get(8)?,
+                })
+            })
+            .unwrap();
+        let mut edges = vec![];
+        for row in rows {
+            edges.push(row.unwrap());
+        }
+        edges
+    }
+
+    pub fn bulk_create(conn: &Connection, edges: Vec<EdgeData>) -> Vec<i32> {
+        let mut edge_rows = vec![];
+        for edge in &edges {
+            let source_hash = format!("\"{0}\"", edge.source_hash);
+            let source_strand = format!("\"{0}\"", edge.source_strand);
+            let target_hash = format!("\"{0}\"", edge.target_hash);
+            let target_strand = format!("\"{0}\"", edge.target_strand);
+            let edge_row = format!(
+                "({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7})",
+                source_hash,
+                edge.source_coordinate,
+                source_strand,
+                target_hash,
+                edge.target_coordinate,
+                target_strand,
+                edge.chromosome_index,
+                edge.phased
+            );
+            edge_rows.push(edge_row);
+        }
+        let formatted_edge_rows = edge_rows.join(", ");
+
+        let select_statement = format!("SELECT * FROM new_edges WHERE (source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased) in ({0});", formatted_edge_rows);
+        let existing_edges = NewEdge::query(conn, &select_statement, vec![]);
+        let mut existing_edge_ids: Vec<i32> = existing_edges
+            .clone()
+            .into_iter()
+            .map(|edge| edge.id)
+            .collect();
+
+        let existing_edge_set = HashSet::<EdgeData, RandomState>::from_iter(
+            existing_edges.into_iter().map(NewEdge::to_data),
+        );
+        let mut edges_to_insert = HashSet::new();
+        for edge in &edges {
+            if !existing_edge_set.contains(edge) {
+                edges_to_insert.insert(edge);
+            }
+        }
+
+        let mut edge_rows_to_insert = vec![];
+        for edge in edges_to_insert {
+            let source_hash = format!("\"{0}\"", edge.source_hash);
+            let target_hash = format!("\"{0}\"", edge.target_hash);
+            let source_strand = format!("\"{0}\"", edge.source_strand);
+            let target_strand = format!("\"{0}\"", edge.target_strand);
+            let edge_row = format!(
+                "({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7})",
+                source_hash,
+                edge.source_coordinate,
+                source_strand,
+                target_hash,
+                edge.target_coordinate,
+                target_strand,
+                edge.chromosome_index,
+                edge.phased
+            );
+            edge_rows_to_insert.push(edge_row);
+        }
+
+        if edge_rows_to_insert.is_empty() {
+            return
+                existing_edge_ids;
+        }
+
+        let formatted_edge_rows_to_insert = edge_rows_to_insert.join(", ");
+
+        let insert_statement = format!("INSERT INTO new_edges (source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased) VALUES {0} RETURNING (id);", formatted_edge_rows_to_insert);
+        let mut stmt = conn.prepare(&insert_statement).unwrap();
+        let rows = stmt.query_map([], |row| row.get(0)).unwrap();
+        let mut edge_ids: Vec<i32> = vec![];
+        for row in rows {
+            edge_ids.push(row.unwrap());
+        }
+
+        existing_edge_ids.extend(edge_ids);
+        existing_edge_ids
+    }
+
+    pub fn to_data(edge: NewEdge) -> EdgeData {
+        EdgeData {
+            source_hash: edge.source_hash,
+            source_coordinate: edge.source_coordinate,
+            source_strand: edge.source_strand,
+            target_hash: edge.target_hash,
+            target_coordinate: edge.target_coordinate,
+            target_strand: edge.target_strand,
+            chromosome_index: edge.chromosome_index,
+            phased: edge.phased,
+        }
+    }
+}
+
+mod tests {
+    use rusqlite::Connection;
+    // Note this useful idiom: importing names from outer (for mod tests) scope.
+    use super::*;
+
+    use crate::migrations::run_migrations;
+    use crate::models::{sequence::Sequence, Collection};
+
+    fn get_connection() -> Connection {
+        let mut conn = Connection::open_in_memory()
+            .unwrap_or_else(|_| panic!("Error opening in memory test db"));
+        rusqlite::vtab::array::load_module(&conn).unwrap();
+        run_migrations(&mut conn);
+        conn
+    }
+
+    #[test]
+    fn test_bulk_create() {
+        let conn = &mut get_connection();
+        Collection::create(conn, "test collection");
+        let sequence1_hash = Sequence::create(conn, "DNA", "ATCGATCG", true);
+        let edge1 = EdgeData {
+            source_hash: NewEdge::PATH_START_HASH.to_string(),
+            source_coordinate: -1,
+            source_strand: "+".to_string(),
+            target_hash: sequence1_hash.clone(),
+            target_coordinate: 1,
+            target_strand: "+".to_string(),
+            chromosome_index: 0,
+            phased: 0,
+        };
+        let sequence2_hash = Sequence::create(conn, "DNA", "AAAAAAAA", true);
+        let edge2 = EdgeData {
+            source_hash: sequence1_hash.clone(),
+            source_coordinate: 2,
+            source_strand: "+".to_string(),
+            target_hash: sequence2_hash.clone(),
+            target_coordinate: 3,
+            target_strand: "+".to_string(),
+            chromosome_index: 0,
+            phased: 0,
+        };
+        let edge3 = EdgeData {
+            source_hash: sequence2_hash.clone(),
+            source_coordinate: 4,
+            source_strand: "+".to_string(),
+            target_hash: NewEdge::PATH_END_HASH.to_string(),
+            target_coordinate: -1,
+            target_strand: "+".to_string(),
+            chromosome_index: 0,
+            phased: 0,
+        };
+
+        let edge_ids = NewEdge::bulk_create(conn, vec![edge1, edge2, edge3]);
+        assert_eq!(edge_ids.len(), 3);
+        let edges = NewEdge::bulk_load(conn, edge_ids);
+        assert_eq!(edges.len(), 3);
+
+        let edges_by_source_hash = edges
+            .into_iter()
+            .map(|edge| (edge.source_hash.clone(), edge))
+            .collect::<HashMap<String, NewEdge>>();
+
+        let edge_result1 = edges_by_source_hash.get(NewEdge::PATH_START_HASH).unwrap();
+        assert_eq!(edge_result1.source_coordinate, -1);
+        assert_eq!(edge_result1.target_hash, sequence1_hash);
+        assert_eq!(edge_result1.target_coordinate, 1);
+        let edge_result2 = edges_by_source_hash.get(&sequence1_hash).unwrap();
+        assert_eq!(edge_result2.source_coordinate, 2);
+        assert_eq!(edge_result2.target_hash, sequence2_hash);
+        assert_eq!(edge_result2.target_coordinate, 3);
+        let edge_result3 = edges_by_source_hash.get(&sequence2_hash).unwrap();
+        assert_eq!(edge_result3.source_coordinate, 4);
+        assert_eq!(edge_result3.target_hash, NewEdge::PATH_END_HASH);
+        assert_eq!(edge_result3.target_coordinate, -1);
+    }
+
+    #[test]
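+    // Verifies that bulk_create returns the id of an edge that was already inserted
+    // individually, rather than creating a duplicate row.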
+    fn test_bulk_create_with_existing_edge() {
+        let conn = &mut get_connection();
+        Collection::create(conn, "test collection");
+        let sequence1_hash = Sequence::create(conn, "DNA", "ATCGATCG", true);
+        // NOTE: Create one edge ahead of time to confirm an existing row ID gets returned in the bulk create
+        let existing_edge = NewEdge::create(
+            conn,
+            NewEdge::PATH_START_HASH.to_string(),
+            -1,
+            "+".to_string(),
+            sequence1_hash.clone(),
+            1,
+            "+".to_string(),
+            0,
+            0,
+        );
+        assert_eq!(existing_edge.source_hash, NewEdge::PATH_START_HASH);
+        assert_eq!(existing_edge.source_coordinate, -1);
+        assert_eq!(existing_edge.target_hash, sequence1_hash);
+        assert_eq!(existing_edge.target_coordinate, 1);
+
+        let edge1 = EdgeData {
+            source_hash: NewEdge::PATH_START_HASH.to_string(),
+            source_coordinate: -1,
+            source_strand: "+".to_string(),
+            target_hash: sequence1_hash.clone(),
+            target_coordinate: 1,
+            target_strand: "+".to_string(),
+            chromosome_index: 0,
+            phased: 0,
+        };
+        let sequence2_hash = Sequence::create(conn, "DNA", "AAAAAAAA", true);
+        let edge2 = EdgeData {
+            source_hash: sequence1_hash.clone(),
+            source_coordinate: 2,
+            source_strand: "+".to_string(),
+            target_hash: sequence2_hash.clone(),
+            target_coordinate: 3,
+            target_strand: "+".to_string(),
+            chromosome_index: 0,
+            phased: 0,
+        };
+        let edge3 = EdgeData {
+            source_hash: sequence2_hash.clone(),
+            source_coordinate: 4,
+            source_strand: "+".to_string(),
+            target_hash: NewEdge::PATH_END_HASH.to_string(),
+            target_coordinate: -1,
+            target_strand: "+".to_string(),
+            chromosome_index: 0,
+            phased: 0,
+        };
+
+        let edge_ids = NewEdge::bulk_create(conn, vec![edge1, edge2, edge3]);
+        assert_eq!(edge_ids.len(), 3);
+        let edges = NewEdge::bulk_load(conn, edge_ids);
+        assert_eq!(edges.len(), 3);
+
+        let edges_by_source_hash = edges
+            .into_iter()
+            .map(|edge| (edge.source_hash.clone(), edge))
+            .collect::<HashMap<String, NewEdge>>();
+
+        let edge_result1 = edges_by_source_hash.get(NewEdge::PATH_START_HASH).unwrap();
+
+        assert_eq!(edge_result1.id, existing_edge.id);
+
+        assert_eq!(edge_result1.source_coordinate, -1);
+        assert_eq!(edge_result1.target_hash, sequence1_hash);
+        assert_eq!(edge_result1.target_coordinate, 1);
+        let edge_result2 = edges_by_source_hash.get(&sequence1_hash).unwrap();
+        assert_eq!(edge_result2.source_coordinate, 2);
+        assert_eq!(edge_result2.target_hash, sequence2_hash);
+        assert_eq!(edge_result2.target_coordinate, 3);
+        let edge_result3 = edges_by_source_hash.get(&sequence2_hash).unwrap();
+        assert_eq!(edge_result3.source_coordinate, 4);
+        assert_eq!(edge_result3.target_hash, NewEdge::PATH_END_HASH);
+        assert_eq!(edge_result3.target_coordinate, -1);
+    }
+}
diff --git a/src/models/path.rs b/src/models/path.rs
index d561089..b1ac76d 100644
--- a/src/models/path.rs
+++ b/src/models/path.rs
@@ -1,12 +1,14 @@
-use crate::models::block::Block;
-use crate::models::edge::Edge;
+use crate::models::{block::Block, new_edge::NewEdge, path_edge::PathEdge, sequence::Sequence};
+use intervaltree::IntervalTree;
+use itertools::Itertools;
 use petgraph::graphmap::DiGraphMap;
 use petgraph::prelude::Dfs;
 use petgraph::Direction;
 use rusqlite::types::Value;
 use rusqlite::{params_from_iter, Connection};
+use std::collections::{HashMap, HashSet};
 
-#[derive(Debug)]
+#[derive(Clone, Debug)]
 pub struct Path {
     pub id: i32,
     pub name: String,
@@ -47,6 +49,18 @@ pub fn revcomp(seq: &str) -> String {
         .unwrap()
 }
 
+#[derive(Clone, Debug)]
+pub struct NewBlock {
+    pub id: i32,
+    pub sequence: Sequence,
+    pub block_sequence: String,
+    pub sequence_start: i32,
+    pub
+        sequence_end: i32,
+    pub path_start: i32,
+    pub path_end: i32,
+    pub strand: String,
+}
+
 impl Path {
     pub fn create(conn: &Connection, name: &str, block_group_id: i32, blocks: Vec<i32>) -> Path {
         let query = "INSERT INTO path (name, block_group_id) VALUES (?1, ?2) RETURNING (id)";
@@ -75,6 +89,33 @@ impl Path {
         path
     }
 
+    pub fn new_create(
+        conn: &Connection,
+        name: &str,
+        block_group_id: i32,
+        edge_ids: Vec<i32>,
+    ) -> Path {
+        let query = "INSERT INTO path (name, block_group_id) VALUES (?1, ?2) RETURNING (id)";
+        let mut stmt = conn.prepare(query).unwrap();
+        let mut rows = stmt
+            .query_map((name, block_group_id), |row| {
+                Ok(Path {
+                    id: row.get(0)?,
+                    name: name.to_string(),
+                    block_group_id,
+                    blocks: vec![],
+                })
+            })
+            .unwrap();
+        let path = rows.next().unwrap().unwrap();
+
+        for (index, edge_id) in edge_ids.iter().enumerate() {
+            PathEdge::create(conn, path.id, index.try_into().unwrap(), *edge_id);
+        }
+
+        path
+    }
+
     pub fn get(conn: &mut Connection, path_id: i32) -> Path {
         let query = "SELECT id, block_group_id, name from path where id = ?1;";
         let mut stmt = conn.prepare(query).unwrap();
@@ -100,7 +141,7 @@ impl Path {
                     id: path_id,
                     block_group_id: row.get(1)?,
                     name: row.get(2)?,
-                    blocks: PathBlock::get_blocks(conn, path_id),
+                    blocks: vec![],
                 })
             })
             .unwrap();
@@ -124,6 +165,105 @@ impl Path {
         }
         sequence
     }
+
+    pub fn new_sequence(conn: &Connection, path: Path) -> String {
+        let blocks = Path::blocks_for(conn, &path);
+        blocks
+            .into_iter()
+            .map(|block| block.block_sequence)
+            .collect::<Vec<String>>()
+            .join("")
+    }
+
+    pub fn edge_pairs_to_block(
+        block_id: i32,
+        path: &Path,
+        into: NewEdge,
+        out_of: NewEdge,
+        sequences_by_hash: &HashMap<String, Sequence>,
+        current_path_length: i32,
+    ) -> NewBlock {
+        if into.target_hash != out_of.source_hash {
+            panic!(
+                "Consecutive edges in path {0} don't share the same block",
+                path.id
+            );
+        }
+
+        let sequence = sequences_by_hash.get(&into.target_hash).unwrap();
+        let start = into.target_coordinate;
+        let end = out_of.source_coordinate;
+
+        let strand;
+        let block_sequence_length;
+
+        if into.target_strand == out_of.source_strand {
+            strand = into.target_strand;
+            block_sequence_length = end - start;
+        } else {
+            panic!(
+                "Edge pair with target_strand/source_strand mismatch for path {}",
+                path.id
+            );
+        }
+
+        let block_sequence = if strand == "-" {
+            revcomp(&sequence.sequence[start as usize..end as usize])
+        } else {
+            sequence.sequence[start as usize..end as usize].to_string()
+        };
+
+        NewBlock {
+            id: block_id,
+            sequence: sequence.clone(),
+            block_sequence,
+            sequence_start: start,
+            sequence_end: end,
+            path_start: current_path_length,
+            path_end: current_path_length + block_sequence_length,
+            strand: strand.to_string(),
+        }
+    }
+
+    pub fn blocks_for(conn: &Connection, path: &Path) -> Vec<NewBlock> {
+        let edges = PathEdge::edges_for(conn, path.id);
+        let mut sequence_hashes = HashSet::new();
+        for edge in &edges {
+            if edge.source_hash != NewEdge::PATH_START_HASH {
+                sequence_hashes.insert(edge.source_hash.clone());
+            }
+            if edge.target_hash != NewEdge::PATH_END_HASH {
+                sequence_hashes.insert(edge.target_hash.clone());
+            }
+        }
+        let sequences_by_hash =
+            Sequence::sequences_by_hash(conn, sequence_hashes.into_iter().collect());
+
+        let mut blocks = vec![];
+        let mut path_length = 0;
+        for (index, (into, out_of)) in edges.into_iter().tuple_windows().enumerate() {
+            let block = Path::edge_pairs_to_block(
+                index as i32,
+                path,
+                into,
+                out_of,
+                &sequences_by_hash,
+                path_length,
+            );
+            path_length += block.block_sequence.len() as i32;
+            blocks.push(block);
+        }
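+        // Each consecutive pair of edges (via tuple_windows) brackets one block;
+        // path_start/path_end accumulate the block lengths along the path.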
+        blocks
+    }
+
+    pub fn intervaltree_for(conn: &Connection, path: &Path) -> IntervalTree<i32, NewBlock> {
+        let blocks = Path::blocks_for(conn, path);
+        let tree: IntervalTree<i32, NewBlock> = blocks
+            .into_iter()
+            .map(|block| (block.path_start..block.path_end, block))
+            .collect();
+        tree
+    }
 }
 
 #[derive(Debug)]
@@ -275,7 +415,7 @@ mod tests {
     use super::*;
 
     use crate::migrations::run_migrations;
-    use crate::models::{sequence::Sequence, BlockGroup, Collection};
+    use crate::models::{sequence::Sequence, BlockGroup, Collection, Edge};
 
     fn get_connection() -> Connection {
         let mut conn = Connection::open_in_memory()
@@ -343,4 +483,107 @@ mod tests {
         assert_eq!(revcomp("CNNNNA"), "TNNNNG");
         assert_eq!(revcomp("cNNgnAt"), "aTncNNg");
     }
+
+    #[test]
+    fn test_intervaltree() {
+        let conn = &mut get_connection();
+        Collection::create(conn, "test collection");
+        let block_group = BlockGroup::create(conn, "test collection", None, "test block group");
+        let sequence1_hash = Sequence::create(conn, "DNA", "ATCGATCG", true);
+        let edge1 = NewEdge::create(
+            conn,
+            NewEdge::PATH_START_HASH.to_string(),
+            -1,
+            "+".to_string(),
+            sequence1_hash.clone(),
+            0,
+            "+".to_string(),
+            0,
+            0,
+        );
+        let sequence2_hash = Sequence::create(conn, "DNA", "AAAAAAAA", true);
+        let edge2 = NewEdge::create(
+            conn,
+            sequence1_hash.clone(),
+            8,
+            "+".to_string(),
+            sequence2_hash.clone(),
+            1,
+            "+".to_string(),
+            0,
+            0,
+        );
+        let sequence3_hash = Sequence::create(conn, "DNA", "CCCCCCCC", true);
+        let edge3 = NewEdge::create(
+            conn,
+            sequence2_hash.clone(),
+            8,
+            "+".to_string(),
+            sequence3_hash.clone(),
+            1,
+            "+".to_string(),
+            0,
+            0,
+        );
+        let sequence4_hash = Sequence::create(conn, "DNA", "GGGGGGGG", true);
+        let edge4 = NewEdge::create(
+            conn,
+            sequence3_hash.clone(),
+            8,
+            "+".to_string(),
+            sequence4_hash.clone(),
+            1,
+            "+".to_string(),
+            0,
+            0,
+        );
+        let edge5 = NewEdge::create(
+            conn,
+            sequence4_hash.clone(),
+            8,
+            "+".to_string(),
+            NewEdge::PATH_END_HASH.to_string(),
+            -1,
+            "+".to_string(),
+            0,
+            0,
+        );
+
+        let path = Path::new_create(
+            conn,
+            "chr1",
+            block_group.id,
+            vec![edge1.id, edge2.id, edge3.id, edge4.id, edge5.id],
+        );
+        let tree = Path::intervaltree_for(conn, &path);
+        let blocks1: Vec<_> = tree.query_point(2).map(|x| x.value.clone()).collect();
+        assert_eq!(blocks1.len(), 1);
+        let block1 = &blocks1[0];
+        assert_eq!(block1.sequence.hash, sequence1_hash);
+        assert_eq!(block1.sequence_start, 0);
+        assert_eq!(block1.sequence_end, 8);
+        assert_eq!(block1.path_start, 0);
+        assert_eq!(block1.path_end, 8);
+        assert_eq!(block1.strand, "+");
+
+        let blocks2: Vec<_> = tree.query_point(12).map(|x| x.value.clone()).collect();
+        assert_eq!(blocks2.len(), 1);
+        let block2 = &blocks2[0];
+        assert_eq!(block2.sequence.hash, sequence2_hash);
+        assert_eq!(block2.sequence_start, 1);
+        assert_eq!(block2.sequence_end, 8);
+        assert_eq!(block2.path_start, 8);
+        assert_eq!(block2.path_end, 15);
+        assert_eq!(block2.strand, "+");
+
+        let blocks4: Vec<_> = tree.query_point(25).map(|x| x.value.clone()).collect();
+        assert_eq!(blocks4.len(), 1);
+        let block4 = &blocks4[0];
+        assert_eq!(block4.sequence.hash, sequence4_hash);
+        assert_eq!(block4.sequence_start, 1);
+        assert_eq!(block4.sequence_end, 8);
+        assert_eq!(block4.path_start, 22);
+        assert_eq!(block4.path_end, 29);
+        assert_eq!(block4.strand, "+");
+    }
 }
diff --git a/src/models/path_edge.rs b/src/models/path_edge.rs
new file mode 100644
index 0000000..f7205fe
--- /dev/null
+++ b/src/models/path_edge.rs
@@ -0,0 +1,263 @@
+use crate::models::{new_edge::NewEdge, path::Path};
+use
+    rusqlite::types::Value;
+use rusqlite::{params_from_iter, Connection};
+use std::collections::HashMap;
+
+#[derive(Clone, Debug)]
+pub struct PathEdge {
+    pub id: i32,
+    pub path_id: i32,
+    pub index_in_path: i32,
+    pub edge_id: i32,
+}
+
+impl PathEdge {
+    pub fn create(conn: &Connection, path_id: i32, index_in_path: i32, edge_id: i32) -> PathEdge {
+        let query =
+            "INSERT INTO path_edges (path_id, index_in_path, edge_id) VALUES (?1, ?2, ?3) RETURNING (id)";
+        let mut stmt = conn.prepare(query).unwrap();
+        let mut rows = stmt
+            .query_map((path_id, index_in_path, edge_id), |row| {
+                Ok(PathEdge {
+                    id: row.get(0)?,
+                    path_id,
+                    index_in_path,
+                    edge_id,
+                })
+            })
+            .unwrap();
+        match rows.next().unwrap() {
+            Ok(res) => res,
+            Err(rusqlite::Error::SqliteFailure(err, details)) => {
+                if err.code == rusqlite::ErrorCode::ConstraintViolation {
+                    println!("{err:?} {details:?}");
+                    let mut placeholders = vec![path_id];
+                    let query = "SELECT id from path_edges where path_id = ?1 AND edge_id = ?2;";
+                    placeholders.push(edge_id);
+                    println!("{query} {placeholders:?}");
+                    PathEdge {
+                        id: conn
+                            .query_row(query, params_from_iter(&placeholders), |row| row.get(0))
+                            .unwrap(),
+                        path_id,
+                        index_in_path,
+                        edge_id,
+                    }
+                } else {
+                    panic!("something bad happened querying the database")
+                }
+            }
+            Err(_) => {
+                panic!("something bad happened querying the database")
+            }
+        }
+    }
+
+    pub fn query(conn: &Connection, query: &str, placeholders: Vec<Value>) -> Vec<PathEdge> {
+        let mut stmt = conn.prepare(query).unwrap();
+        let rows = stmt
+            .query_map(params_from_iter(placeholders), |row| {
+                Ok(PathEdge {
+                    id: row.get(0)?,
+                    path_id: row.get(1)?,
+                    index_in_path: row.get(2)?,
+                    edge_id: row.get(3)?,
+                })
+            })
+            .unwrap();
+        let mut objs = vec![];
+        for row in rows {
+            objs.push(row.unwrap());
+        }
+        objs
+    }
+
+    pub fn edges_for(conn: &Connection, path_id: i32) -> Vec<NewEdge> {
+        let path_edges = PathEdge::query(
+            conn,
+            "select * from path_edges where path_id = ?1 order by index_in_path ASC",
+            vec![Value::from(path_id)],
+        );
+        let edge_ids = path_edges.into_iter().map(|path_edge| path_edge.edge_id);
+        let edges = NewEdge::bulk_load(conn, edge_ids.clone().collect());
+        let edges_by_id = edges
+            .into_iter()
+            .map(|edge| (edge.id, edge))
+            .collect::<HashMap<i32, NewEdge>>();
+        edge_ids
+            .into_iter()
+            .map(|edge_id| edges_by_id[&edge_id].clone())
+            .collect::<Vec<NewEdge>>()
+    }
+}
+
+mod tests {
+    use rusqlite::Connection;
+    // Note this useful idiom: importing names from outer (for mod tests) scope.
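+    // These tests thread a five-edge path across four sequences and check both
+    // forward and reverse-complement traversal.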
+    use super::*;
+
+    use crate::migrations::run_migrations;
+    use crate::models::{sequence::Sequence, BlockGroup, Collection};
+
+    fn get_connection() -> Connection {
+        let mut conn = Connection::open_in_memory()
+            .unwrap_or_else(|_| panic!("Error opening in memory test db"));
+        rusqlite::vtab::array::load_module(&conn).unwrap();
+        run_migrations(&mut conn);
+        conn
+    }
+
+    #[test]
+    fn test_gets_sequence() {
+        let conn = &mut get_connection();
+        Collection::create(conn, "test collection");
+        let block_group = BlockGroup::create(conn, "test collection", None, "test block group");
+        let sequence1_hash = Sequence::create(conn, "DNA", "ATCGATCG", true);
+        let edge1 = NewEdge::create(
+            conn,
+            NewEdge::PATH_START_HASH.to_string(),
+            -123,
+            "+".to_string(),
+            sequence1_hash.clone(),
+            0,
+            "+".to_string(),
+            0,
+            0,
+        );
+        let sequence2_hash = Sequence::create(conn, "DNA", "AAAAAAAA", true);
+        let edge2 = NewEdge::create(
+            conn,
+            sequence1_hash.clone(),
+            8,
+            "+".to_string(),
+            sequence2_hash.clone(),
+            1,
+            "+".to_string(),
+            0,
+            0,
+        );
+        let sequence3_hash = Sequence::create(conn, "DNA", "CCCCCCCC", true);
+        let edge3 = NewEdge::create(
+            conn,
+            sequence2_hash.clone(),
+            8,
+            "+".to_string(),
+            sequence3_hash.clone(),
+            1,
+            "+".to_string(),
+            0,
+            0,
+        );
+        let sequence4_hash = Sequence::create(conn, "DNA", "GGGGGGGG", true);
+        let edge4 = NewEdge::create(
+            conn,
+            sequence3_hash.clone(),
+            8,
+            "+".to_string(),
+            sequence4_hash.clone(),
+            1,
+            "+".to_string(),
+            0,
+            0,
+        );
+        let edge5 = NewEdge::create(
+            conn,
+            sequence4_hash.clone(),
+            8,
+            "+".to_string(),
+            NewEdge::PATH_END_HASH.to_string(),
+            -1,
+            "+".to_string(),
+            0,
+            0,
+        );
+
+        let path = Path::new_create(
+            conn,
+            "chr1",
+            block_group.id,
+            vec![edge1.id, edge2.id, edge3.id, edge4.id, edge5.id],
+        );
+        assert_eq!(
+            Path::new_sequence(conn, path),
+            "ATCGATCGAAAAAAACCCCCCCGGGGGGG"
+        );
+    }
+
+    #[test]
+    fn test_gets_sequence_with_rc() {
+        let conn = &mut get_connection();
+        Collection::create(conn, "test collection");
+        let block_group = BlockGroup::create(conn, "test collection", None, "test block group");
+        let sequence1_hash = Sequence::create(conn, "DNA", "ATCGATCG", true);
+        let edge5 = NewEdge::create(
+            conn,
+            sequence1_hash.clone(),
+            8,
+            "-".to_string(),
+            NewEdge::PATH_END_HASH.to_string(),
+            0,
+            "-".to_string(),
+            0,
+            0,
+        );
+        let sequence2_hash = Sequence::create(conn, "DNA", "AAAAAAAA", true);
+        let edge4 = NewEdge::create(
+            conn,
+            sequence2_hash.clone(),
+            7,
+            "-".to_string(),
+            sequence1_hash.clone(),
+            0,
+            "-".to_string(),
+            0,
+            0,
+        );
+        let sequence3_hash = Sequence::create(conn, "DNA", "CCCCCCCC", true);
+        let edge3 = NewEdge::create(
+            conn,
+            sequence3_hash.clone(),
+            7,
+            "-".to_string(),
+            sequence2_hash.clone(),
+            0,
+            "-".to_string(),
+            0,
+            0,
+        );
+        let sequence4_hash = Sequence::create(conn, "DNA", "GGGGGGGG", true);
+        let edge2 = NewEdge::create(
+            conn,
+            sequence4_hash.clone(),
+            7,
+            "-".to_string(),
+            sequence3_hash.clone(),
+            0,
+            "-".to_string(),
+            0,
+            0,
+        );
+        let edge1 = NewEdge::create(
+            conn,
+            NewEdge::PATH_START_HASH.to_string(),
+            -1,
+            "-".to_string(),
+            sequence4_hash.clone(),
+            0,
+            "-".to_string(),
+            0,
+            0,
+        );
+
+        let path = Path::new_create(
+            conn,
+            "chr1",
+            block_group.id,
+            vec![edge1.id, edge2.id, edge3.id, edge4.id, edge5.id],
+        );
+        assert_eq!(
+            Path::new_sequence(conn, path),
+            "CCCCCCCGGGGGGGTTTTTTTCGATCGAT"
+        );
+    }
+}
diff --git a/src/models/sequence.rs b/src/models/sequence.rs
index 1854e54..708e668 100644
--- a/src/models/sequence.rs
+++
b/src/models/sequence.rs
@@ -1,8 +1,9 @@
 use rusqlite::types::Value;
 use rusqlite::{params_from_iter, Connection};
 use sha2::{Digest, Sha256};
+use std::collections::HashMap;
 
-#[derive(Debug)]
+#[derive(Clone, Debug)]
 pub struct Sequence {
     pub hash: String,
     pub sequence_type: String,
@@ -45,11 +46,7 @@ impl Sequence {
         obj_hash
     }
 
-    pub fn get_sequences(
-        conn: &Connection,
-        query: &str,
-        placeholders: Vec<Value>,
-    ) -> Vec<Sequence> {
+    pub fn sequences(conn: &Connection, query: &str, placeholders: Vec<Value>) -> Vec<Sequence> {
         let mut stmt = conn.prepare_cached(query).unwrap();
         let rows = stmt
             .query_map(params_from_iter(placeholders), |row| {
@@ -67,4 +64,26 @@ impl Sequence {
         }
         objs
     }
+
+    pub fn sequences_by_hash(conn: &Connection, hashes: Vec<String>) -> HashMap<String, Sequence> {
+        let joined_hashes = &hashes
+            .into_iter()
+            .map(|hash| format!("\"{}\"", hash))
+            .collect::<Vec<String>>()
+            .join(",");
+        let sequences = Sequence::sequences(
+            conn,
+            &format!("select * from sequence where hash in ({0})", joined_hashes),
+            vec![],
+        );
+        sequences
+            .into_iter()
+            .map(|sequence| (sequence.hash.clone(), sequence))
+            .collect::<HashMap<String, Sequence>>()
+    }
+
+    pub fn sequence_from_hash(conn: &Connection, hash: &str) -> Option<Sequence> {
+        let sequences_by_hash = Sequence::sequences_by_hash(conn, vec![hash.to_string()]);
+        sequences_by_hash.get(hash).cloned()
+    }
 }
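// A minimal usage sketch (not part of the diff above): how the new models compose, following
// the tests in this change. It assumes the module layout shown here (`crate::models::...`) and
// a migrated connection like the one the test helpers build; the block group id, the path name
// "example_path", and the sequences below are hypothetical.
use crate::models::{
    block_group_edge::BlockGroupEdge,
    new_edge::{EdgeData, NewEdge},
    path::Path,
    sequence::Sequence,
};
use rusqlite::Connection;

fn example_path_roundtrip(conn: &Connection, block_group_id: i32) -> String {
    // Two sequences chained start -> seq1 -> seq2 -> end, all on the forward strand.
    let seq1 = Sequence::create(conn, "DNA", "ATCGATCG", true);
    let seq2 = Sequence::create(conn, "DNA", "AAAAAAAA", true);
    let edges = vec![
        EdgeData {
            source_hash: NewEdge::PATH_START_HASH.to_string(),
            source_coordinate: -1,
            source_strand: "+".to_string(),
            target_hash: seq1.clone(),
            target_coordinate: 0,
            target_strand: "+".to_string(),
            chromosome_index: 0,
            phased: 0,
        },
        EdgeData {
            source_hash: seq1.clone(),
            source_coordinate: 8,
            source_strand: "+".to_string(),
            target_hash: seq2.clone(),
            target_coordinate: 0,
            target_strand: "+".to_string(),
            chromosome_index: 0,
            phased: 0,
        },
        EdgeData {
            source_hash: seq2.clone(),
            source_coordinate: 8,
            source_strand: "+".to_string(),
            target_hash: NewEdge::PATH_END_HASH.to_string(),
            target_coordinate: -1,
            target_strand: "+".to_string(),
            chromosome_index: 0,
            phased: 0,
        },
    ];
    // bulk_create dedupes against rows already in new_edges and returns ids for every edge.
    let edge_ids = NewEdge::bulk_create(conn, edges);
    // Attach the edges to a block group, then thread a path through them in order.
    BlockGroupEdge::bulk_create(conn, block_group_id, edge_ids.clone());
    let path = Path::new_create(conn, "example_path", block_group_id, edge_ids);
    // Walking consecutive edge pairs yields the concatenation "ATCGATCG" + "AAAAAAAA".
    Path::new_sequence(conn, path)
}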