From 9bb27f15f1468427868b5309417c20877675d95d Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 23 Oct 2024 17:10:32 -0400 Subject: [PATCH 01/22] Add edge-style updates --- .../edit-yeast-and-export-fasta.ipynb | 281 ++++++++++++++++++ src/updates.rs | 1 + src/updates/fasta.rs | 51 ++++ 3 files changed, 333 insertions(+) create mode 100644 examples/yeast_editing/edit-yeast-and-export-fasta.ipynb create mode 100644 src/updates/fasta.rs diff --git a/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb b/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb new file mode 100644 index 0000000..90f2760 --- /dev/null +++ b/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb @@ -0,0 +1,281 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "f91a0d12-58e8-45d5-9b19-eaa32c0b612b", + "metadata": {}, + "outputs": [], + "source": [ + "# Clean up any lingering old files\n", + "!rm -rf .gen\n", + "!rm *.db" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c2cff742-e105-4eeb-9bb3-1c3ddc28aebb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-10-23 15:30:08-- http://sgd-archive.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_R64-1-1_20110203.tgz\n", + "Resolving sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)... 52.92.237.59, 52.92.178.219, 52.92.201.163, ...\n", + "Connecting to sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)|52.92.237.59|:80... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 17152288 (16M) [application/x-tar]\n", + "Saving to: ‘S288C_reference_genome_R64-1-1_20110203.tgz’\n", + "\n", + "S288C_reference_gen 100%[===================>] 16.36M 7.77MB/s in 2.1s \n", + "\n", + "2024-10-23 15:30:11 (7.77 MB/s) - ‘S288C_reference_genome_R64-1-1_20110203.tgz’ saved [17152288/17152288]\n", + "\n" + ] + } + ], + "source": [ + "# Download yeast genome files\n", + "!wget http://sgd-archive.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_R64-1-1_20110203.tgz" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "318ef937-dc28-437e-a847-143bca61bf1a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "x S288C_reference_genome_R64-1-1_20110203/\n", + "x S288C_reference_genome_R64-1-1_20110203/S288C_reference_sequence_R64-1-1_20110203.fsa\n", + "x S288C_reference_genome_R64-1-1_20110203/gene_association_R64-1-1_20110205.sgd\n", + "x S288C_reference_genome_R64-1-1_20110203/saccharomyces_cerevisiae_R64-1-1_20110208.gff\n", + "x S288C_reference_genome_R64-1-1_20110203/other_features_genomic_R64-1-1_20110203.fasta\n", + "x S288C_reference_genome_R64-1-1_20110203/rna_coding_R64-1-1_20110203.fasta\n", + "x S288C_reference_genome_R64-1-1_20110203/NotFeature_R64-1-1_20110203.fasta\n", + "x S288C_reference_genome_R64-1-1_20110203/orf_trans_all_R64-1-1_20110203.fasta\n", + "x S288C_reference_genome_R64-1-1_20110203/orf_coding_all_R64-1-1_20110203.fasta\n" + ] + } + ], + "source": [ + "!tar -xzvf S288C_reference_genome_R64-1-1_20110203.tgz" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9a779f3a-5585-43b4-8f7b-0f78bfc1cd4a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/hofer/bin/gen\n" + ] + } + ], + "source": [ + "!which gen" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "15c93c14-de3d-4bfe-9ccd-38e6bc22e38b", + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Gen repository initialized.\n" + ] + } + ], + "source": [ + "!gen init" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a890b7fd-7220-4888-b1d2-46da2ceb599b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Default database set to yeast.db\n", + "Default collection set to genome\n" + ] + } + ], + "source": [ + "!gen defaults --database yeast.db --collection genome" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1d23efeb-d239-43fb-a2ee-e00c7f0741c5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created it\n" + ] + } + ], + "source": [ + "!gen import --fasta S288C_reference_genome_R64-1-1_20110203\\/S288C_reference_sequence_R64-1-1_20110203.fsa" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "406421ff-4000-4418-b4f2-7a62c6087172", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rm: cassette-edit.fa: No such file or directory\n" + ] + } + ], + "source": [ + "!rm cassette-edit.fa\n", + "!echo \">ref|NC_001138|\" >> cassette-edit.fa\n", + "!echo \"ATCGATCG\" >> cassette-edit.fa" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2e2f6fd6-552e-43bb-af90-d7f3d566f5fe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m\u001b[31merror:\u001b[0m unexpected argument '\u001b[33m--start\u001b[0m' found\n", + "\n", + "\u001b[1m\u001b[4mUsage:\u001b[0m \u001b[1mgen update\u001b[0m <--name |--fasta |--vcf |--genotype |--sample >\n", + "\n", + "For more information, try '\u001b[1m--help\u001b[0m'.\n" + ] + } + ], + "source": [ + "!gen update --fasta cassette-edit.fa --start 3 --end 5" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f63ce7c0-20a8-4a06-965a-c750f3f2669a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m\u001b[31merror:\u001b[0m unrecognized subcommand '\u001b[33mlist\u001b[0m'\n", + "\n", + "\u001b[1m\u001b[4mUsage:\u001b[0m \u001b[1mgen\u001b[0m [OPTIONS] [COMMAND]\n", + "\n", + "For more information, try '\u001b[1m--help\u001b[0m'.\n" + ] + } + ], + "source": [ + "!gen list contigs" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e5b2fe67-e050-40af-b170-4382e5f18787", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m\u001b[31merror:\u001b[0m unrecognized subcommand '\u001b[33mlist\u001b[0m'\n", + "\n", + "\u001b[1m\u001b[4mUsage:\u001b[0m \u001b[1mgen\u001b[0m [OPTIONS] [COMMAND]\n", + "\n", + "For more information, try '\u001b[1m--help\u001b[0m'.\n" + ] + } + ], + "source": [ + "!gen list paths ref\\|NC_001138\\|" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "2fc670e3-a9a6-4155-b0dc-0d313bb81311", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m\u001b[31merror:\u001b[0m unexpected argument '\u001b[33m--fasta\u001b[0m' found\n", + "\n", + "\u001b[1m\u001b[4mUsage:\u001b[0m \u001b[1mgen export\u001b[0m [OPTIONS] \u001b[1m--gfa\u001b[0m \n", + "\n", + "For more information, try '\u001b[1m--help\u001b[0m'.\n" + ] + } + ], + "source": [ + "!gen export --fasta edited-yeast-genome.fa" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"723c463a-6159-4257-b34d-bd878efea004", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/updates.rs b/src/updates.rs index c4ab81a..62f5913 100644 --- a/src/updates.rs +++ b/src/updates.rs @@ -1,2 +1,3 @@ +pub mod fasta; pub mod library; pub mod vcf; diff --git a/src/updates/fasta.rs b/src/updates/fasta.rs new file mode 100644 index 0000000..3fcb2fe --- /dev/null +++ b/src/updates/fasta.rs @@ -0,0 +1,51 @@ +use noodles::fasta; +use rusqlite::{session, types::Value as SQLValue, Connection}; + +use crate::models::{ + file_types::FileTypes, + metadata, + operations::{FileAddition, Operation}, + path::Path, +}; +use crate::operation_management; + +pub fn update_with_fasta( + conn: &Connection, + operation_conn: &Connection, + name: &str, + path_name: &str, + start_coordinate: i64, + end_coordinate: i64, + fasta_file_path: &str, +) -> std::io::Result<()> { + let mut session = session::Session::new(conn).unwrap(); + operation_management::attach_session(&mut session); + // let change = FileAddition::create(operation_conn, library_file_path, FileTypes::CSV); + let change = FileAddition::create(operation_conn, fasta_file_path, FileTypes::Fasta); + + let db_uuid = metadata::get_db_uuid(conn); + + let operation = Operation::create( + operation_conn, + &db_uuid, + name.to_string(), + "fasta_update", + change.id, + ); + + let mut fasta_reader = fasta::io::reader::Builder.build_from_path(fasta_file_path)?; + + let path = Path::get_paths( + conn, + "select * from path where name = ?1", + vec![SQLValue::from(path_name.to_string())], + )[0] + .clone(); + + println!("Updated with fasta file: {}", fasta_file_path); + let mut output = Vec::new(); + session.changeset_strm(&mut output).unwrap(); + operation_management::write_changeset(conn, &operation, &output); + + Ok(()) +} From 629e12014cb725cc47ab6066ff7f2cf8d879c7b7 Mon Sep 17 00:00:00 2001 From: hofer Date: Fri, 25 Oct 2024 15:38:31 -0400 Subject: [PATCH 02/22] Mostly finish update with fasta, add in operation path model --- migrations/core/01-initial/up.sql | 8 ++ src/models.rs | 1 + src/models/operation_path.rs | 56 +++++++++++++ src/models/path.rs | 55 +++++++++++++ src/operation_management.rs | 36 +++++++++ src/updates/fasta.rs | 127 +++++++++++++++++++++++++----- 6 files changed, 263 insertions(+), 20 deletions(-) create mode 100644 src/models/operation_path.rs diff --git a/migrations/core/01-initial/up.sql b/migrations/core/01-initial/up.sql index 16e2dd4..c93a372 100644 --- a/migrations/core/01-initial/up.sql +++ b/migrations/core/01-initial/up.sql @@ -141,6 +141,14 @@ CREATE TABLE block_group_edges ( ) STRICT; CREATE UNIQUE INDEX block_group_edges_uidx ON block_group_edges(block_group_id, edge_id); +CREATE TABLE operation_paths ( + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, + operation_id INTEGER NOT NULL, + path_id INTEGER NOT NULL, + FOREIGN KEY(path_id) REFERENCES path(id) +) STRICT; +CREATE UNIQUE INDEX operation_paths_uidx ON operation_paths(operation_id, path_id); + INSERT INTO gen_metadata (db_uuid) values (lower( hex(randomblob(4)) || '-' || hex(randomblob(2)) || '-' || '4' || substr(hex( 
randomblob(2)), 2) || '-' || diff --git a/src/models.rs b/src/models.rs index 84fc4d6..5a7d4bf 100644 --- a/src/models.rs +++ b/src/models.rs @@ -6,6 +6,7 @@ pub mod edge; pub mod file_types; pub mod metadata; pub mod node; +pub mod operation_path; pub mod operations; pub mod path; pub mod path_edge; diff --git a/src/models/operation_path.rs b/src/models/operation_path.rs new file mode 100644 index 0000000..c3a80e1 --- /dev/null +++ b/src/models/operation_path.rs @@ -0,0 +1,56 @@ +use rusqlite::{params_from_iter, types::Value as SQLValue, Connection}; + +#[derive(Clone, Debug)] +pub struct OperationPath { + pub id: i64, + pub operation_id: i64, + pub path_id: i64, +} + +impl OperationPath { + pub fn create(conn: &Connection, operation_id: i64, path_id: i64) -> i64 { + let insert_statement = + "INSERT INTO operation_paths (operation_id, path_id) VALUES (?1, ?2) RETURNING (id);"; + let mut stmt = conn.prepare_cached(insert_statement).unwrap(); + let mut rows = stmt + .query_map( + params_from_iter(vec![SQLValue::from(operation_id), SQLValue::from(path_id)]), + |row| row.get(0), + ) + .unwrap(); + match rows.next().unwrap() { + Ok(res) => res, + Err(rusqlite::Error::SqliteFailure(err, details)) => { + if err.code == rusqlite::ErrorCode::ConstraintViolation { + println!("{err:?} {details:?}"); + let placeholders = vec![operation_id, path_id]; + let query = "SELECT id from operation_paths where id = ?1 and path_id = ?2;"; + conn.query_row(query, params_from_iter(&placeholders), |row| row.get(0)) + .unwrap() + } else { + panic!("something bad happened querying the database") + } + } + Err(_) => { + panic!("something bad happened querying the database") + } + } + } + + // pub fn query(conn: &Connection, query: &str, placeholders: Vec) -> Vec { + // let mut stmt = conn.prepare(query).unwrap(); + // let mut objs = vec![]; + // let rows = stmt + // .query_map(params_from_iter(placeholders), |row| { + // Ok(Node { + // id: row.get(0)?, + // sequence_hash: row.get(1)?, + // }) + // }) + // .unwrap(); + // for row in rows { + // objs.push(row.unwrap()); + // } + // objs + // } +} diff --git a/src/models/path.rs b/src/models/path.rs index 09e7533..abacbdf 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -537,6 +537,61 @@ impl Path { }) .clone() .collect() + + pub fn new_path_for( + &self, + conn: &Connection, + path_start: i64, + path_end: i64, + edge_to_new_node: &Edge, + edge_from_new_node: &Edge, + ) -> Path { + // Creates a new path from the current one by replacing all edges between path_start and + // path_end with the input edges that are to and from a new node + let tree = Path::intervaltree_for(conn, self); + let block_with_start = tree.query_point(path_start).next().unwrap().value.clone(); + let block_with_end = tree.query_point(path_end).next().unwrap().value.clone(); + + let edges = PathEdge::edges_for_path(conn, self.id); + let edges_by_source = edges + .iter() + .map(|edge| ((edge.source_node_id, edge.source_coordinate), edge)) + .collect::>(); + let edges_by_target = edges + .iter() + .map(|edge| ((edge.target_node_id, edge.target_coordinate), edge)) + .collect::>(); + let edge_before_new_node = edges_by_target + .get(&(block_with_start.node_id, block_with_start.sequence_start)) + .unwrap(); + let edge_after_new_node = edges_by_source + .get(&(block_with_end.node_id, block_with_end.sequence_end)) + .unwrap(); + + let mut new_edge_ids = vec![]; + let mut before_new_node = true; + let mut after_new_node = false; + for edge in &edges { + if before_new_node { + 
new_edge_ids.push(edge.id); + if edge.id == edge_before_new_node.id { + before_new_node = false; + new_edge_ids.push(edge_to_new_node.id); + new_edge_ids.push(edge_from_new_node.id); + } + } else if after_new_node { + new_edge_ids.push(edge.id); + } else if edge.id == edge_after_new_node.id { + after_new_node = true; + new_edge_ids.push(edge.id); + } + } + + let new_name = format!( + "{}-start-{}-end-{}-node-{}", + self.name, path_start, path_end, edge_to_new_node.target_node_id + ); + Path::create(conn, &new_name, self.block_group_id, &new_edge_ids) } } diff --git a/src/operation_management.rs b/src/operation_management.rs index 965b323..7732b07 100644 --- a/src/operation_management.rs +++ b/src/operation_management.rs @@ -17,6 +17,7 @@ use crate::models::block_group_edge::BlockGroupEdge; use crate::models::collection::Collection; use crate::models::edge::{Edge, EdgeData}; use crate::models::file_types::FileTypes; +use crate::models::metadata; use crate::models::node::Node; use crate::models::operations::{ Branch, FileAddition, Operation, OperationState, OperationSummary, @@ -800,6 +801,41 @@ pub fn move_to(conn: &Connection, operation_conn: &Connection, operation: &Opera } } +pub fn start_operation<'a>( + conn: &'a Connection, + operation_conn: &'a Connection, + file_path: &'a str, + file_type: FileTypes, + operation_description: &'a str, + name: &'a str, +) -> (session::Session<'a>, Operation) { + let mut session = session::Session::new(conn).unwrap(); + attach_session(&mut session); + let change = FileAddition::create(operation_conn, file_path, file_type); + let db_uuid = metadata::get_db_uuid(conn); + let operation = Operation::create( + operation_conn, + &db_uuid, + name.to_string(), + operation_description, + change.id, + ); + (session, operation) +} + +pub fn end_operation( + conn: &Connection, + operation_conn: &Connection, + operation: &Operation, + session: &mut session::Session, + summary_str: &str, +) { + OperationSummary::create(operation_conn, operation.id, summary_str); + let mut output = Vec::new(); + session.changeset_strm(&mut output).unwrap(); + write_changeset(conn, operation, &output); +} + pub fn attach_session(session: &mut session::Session) { for table in [ "collections", diff --git a/src/updates/fasta.rs b/src/updates/fasta.rs index 3fcb2fe..20e9218 100644 --- a/src/updates/fasta.rs +++ b/src/updates/fasta.rs @@ -1,13 +1,19 @@ use noodles::fasta; -use rusqlite::{session, types::Value as SQLValue, Connection}; +use rusqlite::{types::Value as SQLValue, Connection}; +use std::collections::HashMap; +use std::{io, str}; use crate::models::{ + block_group::{BlockGroup, PathCache, PathChange}, + edge::Edge, file_types::FileTypes, - metadata, - operations::{FileAddition, Operation}, - path::Path, + node::Node, + operation_path::OperationPath, + path::{Path, PathBlock}, + sequence::Sequence, + strand::Strand, }; -use crate::operation_management; +use crate::{calculate_hash, operation_management}; pub fn update_with_fasta( conn: &Connection, @@ -17,20 +23,14 @@ pub fn update_with_fasta( start_coordinate: i64, end_coordinate: i64, fasta_file_path: &str, -) -> std::io::Result<()> { - let mut session = session::Session::new(conn).unwrap(); - operation_management::attach_session(&mut session); - // let change = FileAddition::create(operation_conn, library_file_path, FileTypes::CSV); - let change = FileAddition::create(operation_conn, fasta_file_path, FileTypes::Fasta); - - let db_uuid = metadata::get_db_uuid(conn); - - let operation = Operation::create( +) -> io::Result<()> { + 
let (mut session, operation) = operation_management::start_operation( + conn, operation_conn, - &db_uuid, - name.to_string(), + fasta_file_path, + FileTypes::Fasta, "fasta_update", - change.id, + name, ); let mut fasta_reader = fasta::io::reader::Builder.build_from_path(fasta_file_path)?; @@ -42,10 +42,97 @@ pub fn update_with_fasta( )[0] .clone(); + // Assuming just one entry in the fasta file + let record = fasta_reader.records().next().ok_or_else(|| { + io::Error::new(io::ErrorKind::InvalidData, "No records found in fasta file") + })??; + + let sequence = str::from_utf8(record.sequence().as_ref()) + .unwrap() + .to_string(); + let seq = Sequence::new() + .sequence_type("DNA") + .sequence(&sequence) + .save(conn); + let node_id = Node::create( + conn, + &seq.hash, + calculate_hash(&format!( + "{path_id}:{ref_start}-{ref_end}->{sequence_hash}", + path_id = path.id, + ref_start = 0, + ref_end = seq.length, + sequence_hash = seq.hash + )), + ); + + let path_block = PathBlock { + id: -1, + node_id, + block_sequence: sequence, + sequence_start: 0, + sequence_end: seq.length, + path_start: start_coordinate, + path_end: end_coordinate, + strand: Strand::Forward, + }; + + let path_change = PathChange { + block_group_id: path.block_group_id, + path: path.clone(), + start: start_coordinate, + end: end_coordinate, + block: path_block, + chromosome_index: 0, + phased: 0, + }; + + let path_cache = PathCache { + cache: HashMap::new(), + intervaltree_cache: HashMap::new(), + conn, + }; + + BlockGroup::insert_changes(conn, &vec![path_change], &path_cache); + + let edge_to_new_node = Edge::query( + conn, + "select * from edge where target_node_id = ?1", + vec![SQLValue::from(node_id)], + )[0] + .clone(); + let edge_from_new_node = Edge::query( + conn, + "select * from edge where source_node_id = ?1", + vec![SQLValue::from(node_id)], + )[0] + .clone(); + let new_path = path.new_path_for( + conn, + start_coordinate, + end_coordinate, + &edge_to_new_node, + &edge_from_new_node, + ); + + // let operation_paths = OperationPath::paths_for_operation(conn, operation.id); + // for operation_path in operation_paths { + // if operation_path.path_id != path.id { + // OperationPath::create(conn, operation.id, operation_path.path_id); + // } + // } + // OperationPath::create(conn, operation.id, new_path.id); + + let summary_str = format!(" {}: 1 change", new_path.name); + operation_management::end_operation( + conn, + operation_conn, + &operation, + &mut session, + &summary_str, + ); + println!("Updated with fasta file: {}", fasta_file_path); - let mut output = Vec::new(); - session.changeset_strm(&mut output).unwrap(); - operation_management::write_changeset(conn, &operation, &output); Ok(()) } From ca5def493be747f06f35ed694d3ffa08c21acfda Mon Sep 17 00:00:00 2001 From: hofer Date: Fri, 25 Oct 2024 15:48:55 -0400 Subject: [PATCH 03/22] Complete setup for operation paths --- src/imports/fasta.rs | 2 ++ src/models/operation_path.rs | 37 +++++++++++++++++++----------------- src/updates/fasta.rs | 19 +++++++++--------- 3 files changed, 32 insertions(+), 26 deletions(-) diff --git a/src/imports/fasta.rs b/src/imports/fasta.rs index 4244563..77d4eb5 100644 --- a/src/imports/fasta.rs +++ b/src/imports/fasta.rs @@ -10,6 +10,7 @@ use crate::models::{ edge::Edge, metadata, node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}, + operation_path::OperationPath, path::Path, sequence::Sequence, strand::Strand, @@ -104,6 +105,7 @@ pub fn import_fasta( ); BlockGroupEdge::bulk_create(conn, block_group.id, &[edge_into.id, 
edge_out_of.id]); let path = Path::create(conn, &name, block_group.id, &[edge_into.id, edge_out_of.id]); + OperationPath::create(conn, operation.id, path.id); summary.entry(path.name).or_insert(sequence_length); } let mut summary_str = "".to_string(); diff --git a/src/models/operation_path.rs b/src/models/operation_path.rs index c3a80e1..c6fa9fa 100644 --- a/src/models/operation_path.rs +++ b/src/models/operation_path.rs @@ -1,4 +1,5 @@ -use rusqlite::{params_from_iter, types::Value as SQLValue, Connection}; +use crate::models::traits::Query; +use rusqlite::{params_from_iter, types::Value as SQLValue, Connection, Row}; #[derive(Clone, Debug)] pub struct OperationPath { @@ -7,6 +8,17 @@ pub struct OperationPath { pub path_id: i64, } +impl Query for OperationPath { + type Model = OperationPath; + fn process_row(row: &Row) -> Self::Model { + OperationPath { + id: row.get(0).unwrap(), + operation_id: row.get(1).unwrap(), + path_id: row.get(2).unwrap(), + } + } +} + impl OperationPath { pub fn create(conn: &Connection, operation_id: i64, path_id: i64) -> i64 { let insert_statement = @@ -37,20 +49,11 @@ impl OperationPath { } } - // pub fn query(conn: &Connection, query: &str, placeholders: Vec) -> Vec { - // let mut stmt = conn.prepare(query).unwrap(); - // let mut objs = vec![]; - // let rows = stmt - // .query_map(params_from_iter(placeholders), |row| { - // Ok(Node { - // id: row.get(0)?, - // sequence_hash: row.get(1)?, - // }) - // }) - // .unwrap(); - // for row in rows { - // objs.push(row.unwrap()); - // } - // objs - // } + pub fn paths_for_operation(conn: &Connection, operation_id: i64) -> Vec { + let select_statement = format!( + "SELECT * FROM operation_paths WHERE operation_id = {0};", + operation_id + ); + OperationPath::query(conn, &select_statement, vec![]) + } } diff --git a/src/updates/fasta.rs b/src/updates/fasta.rs index 20e9218..a555ff9 100644 --- a/src/updates/fasta.rs +++ b/src/updates/fasta.rs @@ -80,6 +80,7 @@ pub fn update_with_fasta( let path_change = PathChange { block_group_id: path.block_group_id, path: path.clone(), + path_accession: None, start: start_coordinate, end: end_coordinate, block: path_block, @@ -87,13 +88,13 @@ pub fn update_with_fasta( phased: 0, }; - let path_cache = PathCache { + let mut path_cache = PathCache { cache: HashMap::new(), intervaltree_cache: HashMap::new(), conn, }; - BlockGroup::insert_changes(conn, &vec![path_change], &path_cache); + BlockGroup::insert_changes(conn, &vec![path_change], &mut path_cache); let edge_to_new_node = Edge::query( conn, @@ -115,13 +116,13 @@ pub fn update_with_fasta( &edge_from_new_node, ); - // let operation_paths = OperationPath::paths_for_operation(conn, operation.id); - // for operation_path in operation_paths { - // if operation_path.path_id != path.id { - // OperationPath::create(conn, operation.id, operation_path.path_id); - // } - // } - // OperationPath::create(conn, operation.id, new_path.id); + let operation_paths = OperationPath::paths_for_operation(conn, operation.id); + for operation_path in operation_paths { + if operation_path.path_id != path.id { + OperationPath::create(conn, operation.id, operation_path.path_id); + } + } + OperationPath::create(conn, operation.id, new_path.id); let summary_str = format!(" {}: 1 change", new_path.name); operation_management::end_operation( From f73928bde2d39229d81aae753510bd20a26be17a Mon Sep 17 00:00:00 2001 From: hofer Date: Fri, 25 Oct 2024 19:25:37 -0400 Subject: [PATCH 04/22] Add export fasta operation --- src/exports.rs | 1 + src/exports/fasta.rs | 24 
+++++++++++++++++++++ src/main.rs | 51 ++++++++++++++++++++++++++++++++++---------- 3 files changed, 65 insertions(+), 11 deletions(-) create mode 100644 src/exports/fasta.rs diff --git a/src/exports.rs b/src/exports.rs index 5bcbabe..da35a12 100644 --- a/src/exports.rs +++ b/src/exports.rs @@ -1 +1,2 @@ +pub mod fasta; pub mod gfa; diff --git a/src/exports/fasta.rs b/src/exports/fasta.rs new file mode 100644 index 0000000..eedaa0e --- /dev/null +++ b/src/exports/fasta.rs @@ -0,0 +1,24 @@ +use noodles::fasta; +use rusqlite::{types::Value as SQLValue, Connection}; +use std::fs::File; +use std::io; +use std::path::PathBuf; + +use crate::models::{block_group::BlockGroup, operation_path::OperationPath, path::Path}; + +pub fn export_fasta(conn: &Connection, operation_id: i64, filename: &PathBuf) { + let operation_paths = OperationPath::paths_for_operation(conn, operation_id); + for operation_path in operation_paths { + let path = Path::get(conn, operation_path.path_id); + let block_group = BlockGroup::get_by_id(conn, path.block_group_id); + + let file = File::create(filename).unwrap(); + let mut writer = fasta::io::Writer::new(file); + + let definition = fasta::record::Definition::new(block_group.name, None); + let sequence = fasta::record::Sequence::from(Path::sequence(conn, path).into_bytes()); + let record = fasta::Record::new(definition, sequence); + + let _ = writer.write_record(&record); + } +} diff --git a/src/main.rs b/src/main.rs index 481be75..864c72e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,6 +3,7 @@ use clap::{Parser, Subcommand}; use gen::config; use gen::config::get_operation_connection; +use gen::exports::fasta::export_fasta; use gen::exports::gfa::export_gfa; use gen::get_connection; use gen::imports::fasta::import_fasta; @@ -10,6 +11,7 @@ use gen::imports::gfa::import_gfa; use gen::models::metadata; use gen::models::operations::{setup_db, Branch, OperationState}; use gen::operation_management; +use gen::updates::fasta::update_with_fasta; use gen::updates::library::update_with_library; use gen::updates::vcf::update_with_vcf; use rusqlite::{types::Value, Connection}; @@ -20,7 +22,7 @@ use std::str; #[derive(Parser)] #[command(version, about, long_about = None)] struct Cli { - /// The path to the database you wish to utilize + // The path to the database you wish to utilize #[arg(short, long)] db: Option, #[command(subcommand)] @@ -36,18 +38,18 @@ fn get_default_collection(conn: &Connection) -> Option { #[derive(Subcommand)] enum Commands { - /// Import a new sequence collection. + // Import a new sequence collection. 
Import { - /// Fasta file path + // Fasta file path #[arg(short, long)] fasta: Option, - /// GFA file path + // GFA file path #[arg(short, long)] gfa: Option, - /// The name of the collection to store the entry under + // The name of the collection to store the entry under #[arg(short, long)] name: Option, - /// Don't store the sequence in the database, instead store the filename + // Don't store the sequence in the database, instead store the filename #[arg(short, long, action)] shallow: bool, }, @@ -133,15 +135,18 @@ enum Commands { id: i64, }, Export { - /// The name of the collection to export + // The name of the collection to export #[arg(short, long)] name: Option, - /// The name of the GFA file to export to + // The name of the GFA file to export to #[arg(short, long)] - gfa: String, + gfa: Option, /// An optional sample name #[arg(short, long)] sample: Option, + // The name of the fasta file to export to + #[arg(short, long)] + fasta: Option, }, Defaults { /// The default database to use @@ -258,6 +263,18 @@ fn main() { &parts.clone().unwrap(), library_path, ); + } else if let Some(fasta_path) = fasta { + // NOTE: This has to go after library because the library update also uses a fasta + // file + update_with_fasta( + &conn, + &operation_conn, + name, + &path_name.clone().unwrap(), + start.unwrap(), + end.unwrap(), + fasta_path, + ); } else if let Some(vcf_path) = vcf { update_with_vcf( vcf_path, @@ -387,13 +404,25 @@ fn main() { Some(Commands::Reset { id }) => { operation_management::reset(&conn, &operation_conn, &db_uuid, *id); } - Some(Commands::Export { name, gfa, sample }) => { + Some(Commands::Export { + name, + gfa, + sample, + fasta, + }) => { let name = &name.clone().unwrap_or_else(|| { get_default_collection(&operation_conn) .expect("No collection specified and default not setup.") }); conn.execute("BEGIN TRANSACTION", []).unwrap(); - export_gfa(&conn, name, &PathBuf::from(gfa), sample.clone()); + if let Some(gfa_path) = gfa { + export_gfa(&conn, name, &PathBuf::from(gfa_path), sample.clone()); + } else if let Some(fasta_path) = fasta { + let current_op = OperationState::get_operation(&operation_conn, &db_uuid).unwrap(); + export_fasta(&conn, current_op, &PathBuf::from(fasta_path)); + } else { + panic!("No file type specified for export."); + } conn.execute("END TRANSACTION", []).unwrap(); } None => {} From ec5d01391659f14c2cc8af9982971ceacb110b77 Mon Sep 17 00:00:00 2001 From: hofer Date: Mon, 28 Oct 2024 15:47:33 -0400 Subject: [PATCH 05/22] Some fixes --- .../edit-yeast-and-export-fasta.ipynb | 140 +++++------------- src/exports/fasta.rs | 13 +- src/main.rs | 60 ++++---- src/updates/fasta.rs | 18 +-- 4 files changed, 91 insertions(+), 140 deletions(-) diff --git a/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb b/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb index 90f2760..d7d9568 100644 --- a/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb +++ b/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 18, "id": "f91a0d12-58e8-45d5-9b19-eaa32c0b612b", "metadata": {}, "outputs": [], @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "c2cff742-e105-4eeb-9bb3-1c3ddc28aebb", "metadata": {}, "outputs": [ @@ -22,16 +22,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "--2024-10-23 15:30:08-- 
http://sgd-archive.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_R64-1-1_20110203.tgz\n", - "Resolving sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)... 52.92.237.59, 52.92.178.219, 52.92.201.163, ...\n", - "Connecting to sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)|52.92.237.59|:80... connected.\n", + "--2024-10-28 15:20:30-- http://sgd-archive.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_R64-1-1_20110203.tgz\n", + "Resolving sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)... 52.92.239.139, 52.92.227.147, 52.92.164.155, ...\n", + "Connecting to sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)|52.92.239.139|:80... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 17152288 (16M) [application/x-tar]\n", - "Saving to: ‘S288C_reference_genome_R64-1-1_20110203.tgz’\n", + "Saving to: ‘S288C_reference_genome_R64-1-1_20110203.tgz.2’\n", "\n", - "S288C_reference_gen 100%[===================>] 16.36M 7.77MB/s in 2.1s \n", + "S288C_reference_gen 100%[===================>] 16.36M 1.14MB/s in 16s \n", "\n", - "2024-10-23 15:30:11 (7.77 MB/s) - ‘S288C_reference_genome_R64-1-1_20110203.tgz’ saved [17152288/17152288]\n", + "2024-10-28 15:20:47 (1.02 MB/s) - ‘S288C_reference_genome_R64-1-1_20110203.tgz.2’ saved [17152288/17152288]\n", "\n" ] } @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "id": "318ef937-dc28-437e-a847-143bca61bf1a", "metadata": {}, "outputs": [ @@ -69,25 +69,7 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "9a779f3a-5585-43b4-8f7b-0f78bfc1cd4a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/Users/hofer/bin/gen\n" - ] - } - ], - "source": [ - "!which gen" - ] - }, - { - "cell_type": "code", - "execution_count": 3, + "execution_count": 19, "id": "15c93c14-de3d-4bfe-9ccd-38e6bc22e38b", "metadata": {}, "outputs": [ @@ -105,7 +87,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 20, "id": "a890b7fd-7220-4888-b1d2-46da2ceb599b", "metadata": {}, "outputs": [ @@ -124,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 21, "id": "1d23efeb-d239-43fb-a2ee-e00c7f0741c5", "metadata": {}, "outputs": [ @@ -142,27 +124,19 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 22, "id": "406421ff-4000-4418-b4f2-7a62c6087172", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "rm: cassette-edit.fa: No such file or directory\n" - ] - } - ], + "outputs": [], "source": [ "!rm cassette-edit.fa\n", - "!echo \">ref|NC_001138|\" >> cassette-edit.fa\n", - "!echo \"ATCGATCG\" >> cassette-edit.fa" + "!echo \">ref|NC_001138|\\n\" >> cassette-edit.fa\n", + "!echo \"ATCGATCG\\n\" >> cassette-edit.fa" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 24, "id": "2e2f6fd6-552e-43bb-af90-d7f3d566f5fe", "metadata": {}, "outputs": [ @@ -170,65 +144,37 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[1m\u001b[31merror:\u001b[0m unexpected argument '\u001b[33m--start\u001b[0m' found\n", - "\n", - "\u001b[1m\u001b[4mUsage:\u001b[0m \u001b[1mgen update\u001b[0m <--name |--fasta |--vcf |--genotype |--sample >\n", - "\n", - "For more information, try '\u001b[1m--help\u001b[0m'.\n" - ] - } - ], - "source": [ - "!gen update --fasta cassette-edit.fa --start 3 --end 5" - ] - }, - { - "cell_type": "code", - "execution_count": 10, 
- "id": "f63ce7c0-20a8-4a06-965a-c750f3f2669a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1m\u001b[31merror:\u001b[0m unrecognized subcommand '\u001b[33mlist\u001b[0m'\n", - "\n", - "\u001b[1m\u001b[4mUsage:\u001b[0m \u001b[1mgen\u001b[0m [OPTIONS] [COMMAND]\n", - "\n", - "For more information, try '\u001b[1m--help\u001b[0m'.\n" - ] - } - ], - "source": [ - "!gen list contigs" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "e5b2fe67-e050-40af-b170-4382e5f18787", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1m\u001b[31merror:\u001b[0m unrecognized subcommand '\u001b[33mlist\u001b[0m'\n", - "\n", - "\u001b[1m\u001b[4mUsage:\u001b[0m \u001b[1mgen\u001b[0m [OPTIONS] [COMMAND]\n", - "\n", - "For more information, try '\u001b[1m--help\u001b[0m'.\n" + "here1\n", + "operation parent ID: 1\n", + "here2\n", + "here2\n", + "here2\n", + "here2\n", + "here2\n", + "here2\n", + "here2\n", + "here2\n", + "here2\n", + "here2\n", + "here2\n", + "here2\n", + "here2\n", + "here2\n", + "here2\n", + "here2\n", + "here2\n", + "here3\n", + "Updated with fasta file: cassette-edit.fa\n" ] } ], "source": [ - "!gen list paths ref\\|NC_001138\\|" + "!gen update --fasta cassette-edit.fa --start 3 --end 5 --path-name ref\\|NC_001138\\|" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 25, "id": "2fc670e3-a9a6-4155-b0dc-0d313bb81311", "metadata": {}, "outputs": [ @@ -236,11 +182,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[1m\u001b[31merror:\u001b[0m unexpected argument '\u001b[33m--fasta\u001b[0m' found\n", - "\n", - "\u001b[1m\u001b[4mUsage:\u001b[0m \u001b[1mgen export\u001b[0m [OPTIONS] \u001b[1m--gfa\u001b[0m \n", - "\n", - "For more information, try '\u001b[1m--help\u001b[0m'.\n" + "Exported to file edited-yeast-genome.fa\n" ] } ], diff --git a/src/exports/fasta.rs b/src/exports/fasta.rs index eedaa0e..066a36c 100644 --- a/src/exports/fasta.rs +++ b/src/exports/fasta.rs @@ -8,17 +8,24 @@ use crate::models::{block_group::BlockGroup, operation_path::OperationPath, path pub fn export_fasta(conn: &Connection, operation_id: i64, filename: &PathBuf) { let operation_paths = OperationPath::paths_for_operation(conn, operation_id); + println!("here1"); + + let file = File::create(filename).unwrap(); + let mut writer = fasta::io::Writer::new(file); + for operation_path in operation_paths { + println!("here2"); let path = Path::get(conn, operation_path.path_id); let block_group = BlockGroup::get_by_id(conn, path.block_group_id); - let file = File::create(filename).unwrap(); - let mut writer = fasta::io::Writer::new(file); - let definition = fasta::record::Definition::new(block_group.name, None); let sequence = fasta::record::Sequence::from(Path::sequence(conn, path).into_bytes()); + println!("sequence length: {}", sequence.len()); + println!("definition: {}", definition); let record = fasta::Record::new(definition, sequence); let _ = writer.write_record(&record); } + + println!("Exported to file {}", filename.display()); } diff --git a/src/main.rs b/src/main.rs index 864c72e..77cdae7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -53,21 +53,21 @@ enum Commands { #[arg(short, long, action)] shallow: bool, }, - /// Update a sequence collection with new data + // Update a sequence collection with new data Update { - /// The name of the collection to update + // The name of the collection to update #[arg(short, long)] name: Option, - /// A 
fasta file to insert + // A fasta file to insert #[arg(short, long)] fasta: Option, - /// A VCF file to incorporate + // A VCF file to incorporate #[arg(short, long)] vcf: Option, - /// If no genotype is provided, enter the genotype to assign variants + // If no genotype is provided, enter the genotype to assign variants #[arg(short, long)] genotype: Option, - /// If no sample is provided, enter the sample to associate variants to + // If no sample is provided, enter the sample to associate variants to #[arg(short, long)] sample: Option, /// Use the given sample as the parent sample for changes. @@ -76,61 +76,61 @@ enum Commands { /// A CSV with combinatorial library information #[arg(short, long)] library: Option, - /// A fasta with the combinatorial library parts + // A fasta with the combinatorial library parts #[arg(long)] parts: Option, - /// The name of the path to add the library to + // The name of the path to add the library to #[arg(short, long)] path_name: Option, - /// The start coordinate for the region to add the library to + // The start coordinate for the region to add the library to #[arg(long)] start: Option, - /// The end coordinate for the region to add the library to + // The end coordinate for the region to add the library to #[arg(short, long)] end: Option, }, - /// Initialize a gen repository + // Initialize a gen repository Init {}, - /// Manage and create branches + // Manage and create branches Branch { - /// Create a branch with the given name + // Create a branch with the given name #[arg(long, action)] create: bool, - /// Delete a given branch + // Delete a given branch #[arg(short, long, action)] delete: bool, - /// Checkout a given branch + // Checkout a given branch #[arg(long, action)] checkout: bool, - /// List all branches + // List all branches #[arg(short, long, action)] list: bool, - /// The branch name + // The branch name #[clap(index = 1)] branch_name: Option, }, - /// Migrate a database to a given operation + // Migrate a database to a given operation Checkout { - /// The branch identifier to migrate to + // The branch identifier to migrate to #[arg(short, long)] branch: Option, - /// The operation id to move to + // The operation id to move to #[clap(index = 1)] id: Option, }, Reset { - /// The operation id to reset to + // The operation id to reset to #[clap(index = 1)] id: i64, }, - /// View operations carried out against a database + // View operations carried out against a database Operations { - /// The branch to list operations for + // The branch to list operations for #[arg(short, long)] branch: Option, }, Apply { - /// The operation id to apply + // The operation id to apply #[clap(index = 1)] id: i64, }, @@ -141,7 +141,7 @@ enum Commands { // The name of the GFA file to export to #[arg(short, long)] gfa: Option, - /// An optional sample name + // An optional sample name #[arg(short, long)] sample: Option, // The name of the fasta file to export to @@ -149,10 +149,10 @@ enum Commands { fasta: Option, }, Defaults { - /// The default database to use + // The default database to use #[arg(short, long)] database: Option, - /// The default collection to use + // The default collection to use #[arg(short, long)] collection: Option, }, @@ -213,6 +213,7 @@ fn main() { shallow, }) => { conn.execute("BEGIN TRANSACTION", []).unwrap(); + operation_conn.execute("BEGIN TRANSACTION", []).unwrap(); let name = &name.clone().unwrap_or_else(|| { get_default_collection(&operation_conn) .expect("No collection specified and default not setup.") @@ -233,6 
+234,7 @@ fn main() { ); } conn.execute("END TRANSACTION", []).unwrap(); + operation_conn.execute("END TRANSACTION", []).unwrap(); } Some(Commands::Update { name, @@ -248,6 +250,7 @@ fn main() { coordinate_frame, }) => { conn.execute("BEGIN TRANSACTION", []).unwrap(); + operation_conn.execute("BEGIN TRANSACTION", []).unwrap(); let name = &name.clone().unwrap_or_else(|| { get_default_collection(&operation_conn) .expect("No collection specified and default not setup.") @@ -290,6 +293,7 @@ fn main() { } conn.execute("END TRANSACTION", []).unwrap(); + operation_conn.execute("END TRANSACTION", []).unwrap(); } Some(Commands::Operations { branch }) => { let current_op = OperationState::get_operation(&operation_conn, &db_uuid) @@ -415,6 +419,7 @@ fn main() { .expect("No collection specified and default not setup.") }); conn.execute("BEGIN TRANSACTION", []).unwrap(); + operation_conn.execute("BEGIN TRANSACTION", []).unwrap(); if let Some(gfa_path) = gfa { export_gfa(&conn, name, &PathBuf::from(gfa_path), sample.clone()); } else if let Some(fasta_path) = fasta { @@ -424,6 +429,7 @@ fn main() { panic!("No file type specified for export."); } conn.execute("END TRANSACTION", []).unwrap(); + operation_conn.execute("END TRANSACTION", []).unwrap(); } None => {} // these will never be handled by this method as we search for them earlier. diff --git a/src/updates/fasta.rs b/src/updates/fasta.rs index a555ff9..f9c36cb 100644 --- a/src/updates/fasta.rs +++ b/src/updates/fasta.rs @@ -1,6 +1,5 @@ use noodles::fasta; use rusqlite::{types::Value as SQLValue, Connection}; -use std::collections::HashMap; use std::{io, str}; use crate::models::{ @@ -88,23 +87,18 @@ pub fn update_with_fasta( phased: 0, }; - let mut path_cache = PathCache { - cache: HashMap::new(), - intervaltree_cache: HashMap::new(), - conn, - }; - - BlockGroup::insert_changes(conn, &vec![path_change], &mut path_cache); + let interval_tree = Path::intervaltree_for(conn, &path); + BlockGroup::insert_change(conn, &path_change, &interval_tree); let edge_to_new_node = Edge::query( conn, - "select * from edge where target_node_id = ?1", + "select * from edges where target_node_id = ?1", vec![SQLValue::from(node_id)], )[0] .clone(); let edge_from_new_node = Edge::query( conn, - "select * from edge where source_node_id = ?1", + "select * from edges where source_node_id = ?1", vec![SQLValue::from(node_id)], )[0] .clone(); @@ -116,12 +110,14 @@ pub fn update_with_fasta( &edge_from_new_node, ); - let operation_paths = OperationPath::paths_for_operation(conn, operation.id); + let operation_parent_id = operation.parent_id.unwrap(); + let operation_paths = OperationPath::paths_for_operation(conn, operation_parent_id); for operation_path in operation_paths { if operation_path.path_id != path.id { OperationPath::create(conn, operation.id, operation_path.path_id); } } + OperationPath::create(conn, operation.id, new_path.id); let summary_str = format!(" {}: 1 change", new_path.name); From 43c86327738eab4f67e91e90ea00ca0eda6b3415 Mon Sep 17 00:00:00 2001 From: hofer Date: Mon, 28 Oct 2024 15:51:15 -0400 Subject: [PATCH 06/22] Delete prints --- src/exports/fasta.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/exports/fasta.rs b/src/exports/fasta.rs index 066a36c..95dd6b4 100644 --- a/src/exports/fasta.rs +++ b/src/exports/fasta.rs @@ -8,20 +8,16 @@ use crate::models::{block_group::BlockGroup, operation_path::OperationPath, path pub fn export_fasta(conn: &Connection, operation_id: i64, filename: &PathBuf) { let operation_paths = 
OperationPath::paths_for_operation(conn, operation_id); - println!("here1"); let file = File::create(filename).unwrap(); let mut writer = fasta::io::Writer::new(file); for operation_path in operation_paths { - println!("here2"); let path = Path::get(conn, operation_path.path_id); let block_group = BlockGroup::get_by_id(conn, path.block_group_id); let definition = fasta::record::Definition::new(block_group.name, None); let sequence = fasta::record::Sequence::from(Path::sequence(conn, path).into_bytes()); - println!("sequence length: {}", sequence.len()); - println!("definition: {}", definition); let record = fasta::Record::new(definition, sequence); let _ = writer.write_record(&record); From f7c94348b30063d398b60e1c917d2191704911f7 Mon Sep 17 00:00:00 2001 From: hofer Date: Mon, 28 Oct 2024 15:53:37 -0400 Subject: [PATCH 07/22] Update notebook --- .../edit-yeast-and-export-fasta.ipynb | 101 +++++++++++------- 1 file changed, 65 insertions(+), 36 deletions(-) diff --git a/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb b/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb index d7d9568..d07dfbd 100644 --- a/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb +++ b/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 18, + "execution_count": 26, "id": "f91a0d12-58e8-45d5-9b19-eaa32c0b612b", "metadata": {}, "outputs": [], @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 27, "id": "c2cff742-e105-4eeb-9bb3-1c3ddc28aebb", "metadata": {}, "outputs": [ @@ -22,16 +22,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "--2024-10-28 15:20:30-- http://sgd-archive.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_R64-1-1_20110203.tgz\n", - "Resolving sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)... 52.92.239.139, 52.92.227.147, 52.92.164.155, ...\n", - "Connecting to sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)|52.92.239.139|:80... connected.\n", + "--2024-10-28 15:48:13-- http://sgd-archive.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_R64-1-1_20110203.tgz\n", + "Resolving sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)... 52.92.193.27, 52.92.210.27, 52.92.210.243, ...\n", + "Connecting to sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)|52.92.193.27|:80... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 17152288 (16M) [application/x-tar]\n", - "Saving to: ‘S288C_reference_genome_R64-1-1_20110203.tgz.2’\n", + "Saving to: ‘S288C_reference_genome_R64-1-1_20110203.tgz.3’\n", "\n", - "S288C_reference_gen 100%[===================>] 16.36M 1.14MB/s in 16s \n", + "S288C_reference_gen 100%[===================>] 16.36M 1.07MB/s in 19s \n", "\n", - "2024-10-28 15:20:47 (1.02 MB/s) - ‘S288C_reference_genome_R64-1-1_20110203.tgz.2’ saved [17152288/17152288]\n", + "2024-10-28 15:48:32 (878 KB/s) - ‘S288C_reference_genome_R64-1-1_20110203.tgz.3’ saved [17152288/17152288]\n", "\n" ] } @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 28, "id": "318ef937-dc28-437e-a847-143bca61bf1a", "metadata": {}, "outputs": [ @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 29, "id": "15c93c14-de3d-4bfe-9ccd-38e6bc22e38b", "metadata": {}, "outputs": [ @@ -87,7 +87,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 30, "id": "a890b7fd-7220-4888-b1d2-46da2ceb599b", "metadata": {}, "outputs": [ @@ -106,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 31, "id": "1d23efeb-d239-43fb-a2ee-e00c7f0741c5", "metadata": {}, "outputs": [ @@ -124,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 32, "id": "406421ff-4000-4418-b4f2-7a62c6087172", "metadata": {}, "outputs": [], @@ -136,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 33, "id": "2e2f6fd6-552e-43bb-af90-d7f3d566f5fe", "metadata": {}, "outputs": [ @@ -144,26 +144,6 @@ "name": "stdout", "output_type": "stream", "text": [ - "here1\n", - "operation parent ID: 1\n", - "here2\n", - "here2\n", - "here2\n", - "here2\n", - "here2\n", - "here2\n", - "here2\n", - "here2\n", - "here2\n", - "here2\n", - "here2\n", - "here2\n", - "here2\n", - "here2\n", - "here2\n", - "here2\n", - "here2\n", - "here3\n", "Updated with fasta file: cassette-edit.fa\n" ] } @@ -174,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 35, "id": "2fc670e3-a9a6-4155-b0dc-0d313bb81311", "metadata": {}, "outputs": [ @@ -192,9 +172,58 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "id": "723c463a-6159-4257-b34d-bd878efea004", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AAAATAAAGGTAGTAAGTAGCTTTTGGTTGAACATCCGGGTAAGAGACAACAGGGCTTGG\n", + "AGGAGACGTACATGAGGGCTATTTAGGGCTATTTAGGGCTATGTAGAAGTGCTGTAGGGC\n", + "TAAAGAACAGGGTTTCATTTTCATTTTTTTTTTT\n", + ">ref|NC_001138| [org=Saccharomyces cerevisiae] [strain=S288C] [moltype=genomic] [chromosome=VI]\n", + "GATCTCGCAAGTGCATTCCTAGACTTAATTCATATCTGCTCCTCAACTGTCGATGATGCC\n", + "TGCTAAACTGCAGCTTGACGTACTGCGGACCCTGCAGTCCAGCGCTCGTCATGGAACGCA\n", + "AACGCTGAAAAACTCCAACTTTCTCGAGCGCTTCCACAAAGACCGTATCGTCTTTTGCCT\n" + ] + } + ], + "source": [ + "!grep -C 3 NC_001138 S288C_reference_genome_R64-1-1_20110203\\/S288C_reference_sequence_R64-1-1_20110203.fsa" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "98c8c254-4304-495d-8a9e-3505d21cafe7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CCCGCGGCGGGCGGACCCCGAAGGAGTGAGGGACCCCTCCCTAATGGGAGGGGGACCGAACCCCTTTTTAAGAAGGAGTC\n", + "CATATATATATATTAATAAAAAAAAGTAATATATATATATATATTGGAATAGTTATATTATTATACAGAAATATGCTTAA\n", + "TTATAATATAATATCCATA\n", + ">ref|NC_001138|\n", + 
"GATATCGATCGCGCAAGTGCATTCCTAGACTTAATTCATATCTGCTCCTCAACTGTCGATGATGCCTGCTAAACTGCAGC\n", + "TTGACGTACTGCGGACCCTGCAGTCCAGCGCTCGTCATGGAACGCAAACGCTGAAAAACTCCAACTTTCTCGAGCGCTTC\n", + "CACAAAGACCGTATCGTCTTTTGCCTCCCATTCTTCCCGGCACTTTTTCTCGTCCCAGTTCAAAAAGTACTGCAGCACCT\n" + ] + } + ], + "source": [ + "!grep -C 3 NC_001138 edited-yeast-genome.fa\n", + "# Note that ATCGATCG appears starting after the third base pair, edit successful." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b499ed79-5705-400f-971d-4d7584efb4d3", + "metadata": {}, "outputs": [], "source": [] } From 63b13e6b6c1956002b3f46ac52ceeb7a78177080 Mon Sep 17 00:00:00 2001 From: hofer Date: Mon, 28 Oct 2024 16:01:34 -0400 Subject: [PATCH 08/22] Final notebook edits, remove unused imports --- .../edit-yeast-and-export-fasta.ipynb | 54 ++++++++++++------- src/exports/fasta.rs | 3 +- src/updates/fasta.rs | 2 +- 3 files changed, 37 insertions(+), 22 deletions(-) diff --git a/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb b/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb index d07dfbd..1bb8a28 100644 --- a/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb +++ b/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb @@ -1,9 +1,17 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "765587cd-3c78-4973-bfb2-06824e3a6f32", + "metadata": {}, + "source": [ + "In this notebook, we'll download the fasta for a yeast genome and edit it on the command line by replacing a short subsequence of NC_001138 with our own subsequence, ATCGATCG" + ] + }, { "cell_type": "code", - "execution_count": 26, - "id": "f91a0d12-58e8-45d5-9b19-eaa32c0b612b", + "execution_count": 1, + "id": "b350d406-ea1d-41b4-b051-d5e79065ce2f", "metadata": {}, "outputs": [], "source": [ @@ -14,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 2, "id": "c2cff742-e105-4eeb-9bb3-1c3ddc28aebb", "metadata": {}, "outputs": [ @@ -22,16 +30,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "--2024-10-28 15:48:13-- http://sgd-archive.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_R64-1-1_20110203.tgz\n", - "Resolving sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)... 52.92.193.27, 52.92.210.27, 52.92.210.243, ...\n", - "Connecting to sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)|52.92.193.27|:80... connected.\n", + "--2024-10-28 16:00:21-- http://sgd-archive.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_R64-1-1_20110203.tgz\n", + "Resolving sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)... 52.92.181.51, 52.92.176.179, 52.92.194.19, ...\n", + "Connecting to sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)|52.92.181.51|:80... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 17152288 (16M) [application/x-tar]\n", - "Saving to: ‘S288C_reference_genome_R64-1-1_20110203.tgz.3’\n", + "Saving to: ‘S288C_reference_genome_R64-1-1_20110203.tgz’\n", "\n", - "S288C_reference_gen 100%[===================>] 16.36M 1.07MB/s in 19s \n", + "S288C_reference_gen 100%[===================>] 16.36M 1.43MB/s in 13s \n", "\n", - "2024-10-28 15:48:32 (878 KB/s) - ‘S288C_reference_genome_R64-1-1_20110203.tgz.3’ saved [17152288/17152288]\n", + "2024-10-28 16:00:35 (1.28 MB/s) - ‘S288C_reference_genome_R64-1-1_20110203.tgz’ saved [17152288/17152288]\n", "\n" ] } @@ -43,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 3, "id": "318ef937-dc28-437e-a847-143bca61bf1a", "metadata": {}, "outputs": [ @@ -69,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 4, "id": "15c93c14-de3d-4bfe-9ccd-38e6bc22e38b", "metadata": {}, "outputs": [ @@ -87,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 5, "id": "a890b7fd-7220-4888-b1d2-46da2ceb599b", "metadata": {}, "outputs": [ @@ -106,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 6, "id": "1d23efeb-d239-43fb-a2ee-e00c7f0741c5", "metadata": {}, "outputs": [ @@ -124,10 +132,18 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 7, "id": "406421ff-4000-4418-b4f2-7a62c6087172", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rm: cassette-edit.fa: No such file or directory\n" + ] + } + ], "source": [ "!rm cassette-edit.fa\n", "!echo \">ref|NC_001138|\\n\" >> cassette-edit.fa\n", @@ -136,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 8, "id": "2e2f6fd6-552e-43bb-af90-d7f3d566f5fe", "metadata": {}, "outputs": [ @@ -154,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 9, "id": "2fc670e3-a9a6-4155-b0dc-0d313bb81311", "metadata": {}, "outputs": [ @@ -172,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 10, "id": "723c463a-6159-4257-b34d-bd878efea004", "metadata": {}, "outputs": [ @@ -196,7 +212,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 11, "id": "98c8c254-4304-495d-8a9e-3505d21cafe7", "metadata": {}, "outputs": [ diff --git a/src/exports/fasta.rs b/src/exports/fasta.rs index 95dd6b4..95f3083 100644 --- a/src/exports/fasta.rs +++ b/src/exports/fasta.rs @@ -1,7 +1,6 @@ use noodles::fasta; -use rusqlite::{types::Value as SQLValue, Connection}; +use rusqlite::Connection; use std::fs::File; -use std::io; use std::path::PathBuf; use crate::models::{block_group::BlockGroup, operation_path::OperationPath, path::Path}; diff --git a/src/updates/fasta.rs b/src/updates/fasta.rs index f9c36cb..44de602 100644 --- a/src/updates/fasta.rs +++ b/src/updates/fasta.rs @@ -3,7 +3,7 @@ use rusqlite::{types::Value as SQLValue, Connection}; use std::{io, str}; use crate::models::{ - block_group::{BlockGroup, PathCache, PathChange}, + block_group::{BlockGroup, PathChange}, edge::Edge, file_types::FileTypes, node::Node, From 32166efb954f452da19f8a29349fc7d0c3ad5caf Mon Sep 17 00:00:00 2001 From: hofer Date: Tue, 29 Oct 2024 12:57:55 -0400 Subject: [PATCH 09/22] More tests, fix warnings --- fixtures/aaaaaaaa.fa | 2 + fixtures/simple.fa | 2 +- fixtures/tttttttt.fa | 2 + src/exports/fasta.rs | 118 +++++++++++ src/exports/gfa.rs | 1 + src/main.rs | 6 +- src/models/block_group.rs | 33 ++-- src/updates/fasta.rs | 
399 ++++++++++++++++++++++++++++++++++++-- src/updates/vcf.rs | 1 + 9 files changed, 530 insertions(+), 34 deletions(-) create mode 100644 fixtures/aaaaaaaa.fa create mode 100644 fixtures/tttttttt.fa diff --git a/fixtures/aaaaaaaa.fa b/fixtures/aaaaaaaa.fa new file mode 100644 index 0000000..c619080 --- /dev/null +++ b/fixtures/aaaaaaaa.fa @@ -0,0 +1,2 @@ +>m123 +AAAAAAAA \ No newline at end of file diff --git a/fixtures/simple.fa b/fixtures/simple.fa index c4b62d0..fe9b332 100644 --- a/fixtures/simple.fa +++ b/fixtures/simple.fa @@ -1,2 +1,2 @@ >m123 -ATCGATCGATCGATCGATCGGGAACACACAGAGA \ No newline at end of file +ATCGATCGATCGATCGATCGGGAACACACAGAGA diff --git a/fixtures/tttttttt.fa b/fixtures/tttttttt.fa new file mode 100644 index 0000000..a306315 --- /dev/null +++ b/fixtures/tttttttt.fa @@ -0,0 +1,2 @@ +>m123 +TTTTTTTT diff --git a/src/exports/fasta.rs b/src/exports/fasta.rs index 95f3083..109964a 100644 --- a/src/exports/fasta.rs +++ b/src/exports/fasta.rs @@ -24,3 +24,121 @@ pub fn export_fasta(conn: &Connection, operation_id: i64, filename: &PathBuf) { println!("Exported to file {}", filename.display()); } + +#[cfg(test)] +mod tests { + // Note this useful idiom: importing names from outer (for mod tests) scope. + use super::*; + use crate::imports::fasta::import_fasta; + use crate::models::{ + metadata, + operations::{setup_db, OperationState}, + }; + use crate::test_helpers::{get_connection, get_operation_connection, setup_gen_dir}; + use crate::updates::fasta::update_with_fasta; + use noodles::fasta; + use std::path::PathBuf; + use std::{io, str}; + use tempdir::TempDir; + + #[test] + fn test_import_then_export() { + setup_gen_dir(); + let mut fasta_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_path.push("fixtures/simple.fa"); + let conn = &get_connection(None); + let db_uuid = metadata::get_db_uuid(conn); + let op_conn = &get_operation_connection(None); + setup_db(op_conn, &db_uuid); + + let collection = "test".to_string(); + + import_fasta( + &fasta_path.to_str().unwrap().to_string(), + &collection, + false, + conn, + op_conn, + ); + let current_op_id = OperationState::get_operation(op_conn, &db_uuid).unwrap(); + let tmp_dir = TempDir::new("export_fasta_files").unwrap().into_path(); + let filename = tmp_dir.join("out.fa"); + export_fasta(conn, current_op_id, &filename); + + let mut fasta_reader = fasta::io::reader::Builder + .build_from_path(filename) + .unwrap(); + let record = fasta_reader + .records() + .next() + .ok_or_else(|| { + io::Error::new(io::ErrorKind::InvalidData, "No records found in fasta file") + }) + .unwrap() + .unwrap(); + + let sequence = str::from_utf8(record.sequence().as_ref()) + .unwrap() + .to_string(); + assert_eq!(sequence, "ATCGATCGATCGATCGATCGGGAACACACAGAGA"); + } + + #[test] + fn test_import_fasta_update_with_fasta_export() { + /* + Graph after fasta update: + AT ----> CGA ------> TCGATCGATCGATCGGGAACACACAGAGA + \-> AAAAAAAA --/ + */ + setup_gen_dir(); + let mut fasta_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_path.push("fixtures/simple.fa"); + let mut fasta_update_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_update_path.push("fixtures/aaaaaaaa.fa"); + let conn = &get_connection(None); + let db_uuid = metadata::get_db_uuid(conn); + let op_conn = &get_operation_connection(None); + setup_db(op_conn, &db_uuid); + + let collection = "test".to_string(); + + import_fasta( + &fasta_path.to_str().unwrap().to_string(), + &collection, + false, + conn, + op_conn, + ); + let _ = update_with_fasta( + conn, + op_conn, + 
&collection, + "m123", + 2, + 5, + fasta_update_path.to_str().unwrap(), + ); + + let current_op_id = OperationState::get_operation(op_conn, &db_uuid).unwrap(); + let tmp_dir = TempDir::new("export_fasta_files").unwrap().into_path(); + let filename = tmp_dir.join("out.fa"); + export_fasta(conn, current_op_id, &filename); + + let mut fasta_reader = fasta::io::reader::Builder + .build_from_path(filename) + .unwrap(); + let record = fasta_reader + .records() + .next() + .ok_or_else(|| { + io::Error::new(io::ErrorKind::InvalidData, "No records found in fasta file") + }) + .unwrap() + .unwrap(); + + let sequence = str::from_utf8(record.sequence().as_ref()) + .unwrap() + .to_string(); + assert_eq!(sequence, "ATAAAAAAAATCGATCGATCGATCGGGAACACACAGAGA"); + } +} diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs index 3617e22..7fff442 100644 --- a/src/exports/gfa.rs +++ b/src/exports/gfa.rs @@ -8,6 +8,7 @@ use crate::models::{ path::Path, path_edge::PathEdge, strand::Strand, + traits::*, }; use itertools::Itertools; use petgraph::prelude::DiGraphMap; diff --git a/src/main.rs b/src/main.rs index 77cdae7..621994b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -82,6 +82,9 @@ enum Commands { // The name of the path to add the library to #[arg(short, long)] path_name: Option, + // The name of the contig to update + #[arg(long)] + contig_name: Option, // The start coordinate for the region to add the library to #[arg(long)] start: Option, @@ -245,6 +248,7 @@ fn main() { genotype, sample, path_name, + contig_name, start, end, coordinate_frame, @@ -273,7 +277,7 @@ fn main() { &conn, &operation_conn, name, - &path_name.clone().unwrap(), + &contig_name.clone().unwrap(), start.unwrap(), end.unwrap(), fasta_path, diff --git a/src/models/block_group.rs b/src/models/block_group.rs index 8e62f20..12433a6 100644 --- a/src/models/block_group.rs +++ b/src/models/block_group.rs @@ -5,7 +5,7 @@ use intervaltree::IntervalTree; use itertools::Itertools; use petgraph::graphmap::DiGraphMap; use petgraph::Direction; -use rusqlite::{params_from_iter, types::Value as SQLValue, Connection}; +use rusqlite::{params_from_iter, types::Value as SQLValue, Connection, Row}; use serde::{Deserialize, Serialize}; use crate::graph::{ @@ -158,25 +158,6 @@ impl BlockGroup { } } - pub fn query(conn: &Connection, query: &str, placeholders: Vec) -> Vec { - let mut stmt = conn.prepare(query).unwrap(); - let rows = stmt - .query_map(params_from_iter(placeholders), |row| { - Ok(BlockGroup { - id: row.get(0)?, - collection_name: row.get(1)?, - sample_name: row.get(2)?, - name: row.get(3)?, - }) - }) - .unwrap(); - let mut objs = vec![]; - for row in rows { - objs.push(row.unwrap()); - } - objs - } - pub fn get_by_id(conn: &Connection, id: i64) -> BlockGroup { let query = "SELECT * FROM block_groups WHERE id = ?1"; let mut stmt = conn.prepare(query).unwrap(); @@ -821,6 +802,18 @@ impl BlockGroup { } } +impl Query for BlockGroup { + type Model = BlockGroup; + fn process_row(row: &Row) -> Self::Model { + BlockGroup { + id: row.get(0).unwrap(), + collection_name: row.get(1).unwrap(), + sample_name: row.get(2).unwrap(), + name: row.get(3).unwrap(), + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/updates/fasta.rs b/src/updates/fasta.rs index 44de602..38775c0 100644 --- a/src/updates/fasta.rs +++ b/src/updates/fasta.rs @@ -11,14 +11,15 @@ use crate::models::{ path::{Path, PathBlock}, sequence::Sequence, strand::Strand, + traits::*, }; use crate::{calculate_hash, operation_management}; pub fn update_with_fasta( conn: &Connection, 
operation_conn: &Connection, - name: &str, - path_name: &str, + collection_name: &str, + contig_name: &str, start_coordinate: i64, end_coordinate: i64, fasta_file_path: &str, @@ -29,17 +30,38 @@ pub fn update_with_fasta( fasta_file_path, FileTypes::Fasta, "fasta_update", - name, + collection_name, ); let mut fasta_reader = fasta::io::reader::Builder.build_from_path(fasta_file_path)?; - let path = Path::get_paths( - conn, - "select * from path where name = ?1", - vec![SQLValue::from(path_name.to_string())], - )[0] - .clone(); + let block_groups = BlockGroup::query(conn, "select * from block_group where collection_name = ?1 AND sample_name is null and name = ?2;", vec![SQLValue::from(collection_name.to_string()), SQLValue::from(contig_name.to_string())]); + if block_groups.is_empty() { + panic!("No block group found for contig {}", contig_name); + } + let block_group_id = block_groups[0].id; + + let operation_parent_id = operation.parent_id.unwrap(); + let operation_paths = OperationPath::paths_for_operation(conn, operation_parent_id); + + let path_ids = operation_paths + .iter() + .map(|operation_path| operation_path.path_id.to_string()) + .collect::>(); + let query = format!( + "select * from path where block_group_id = {block_group_id} and id in ({path_ids});", + path_ids = path_ids.join(", ") + ); + let paths = Path::query(conn, &query, vec![]); + + if paths.len() != 1 { + panic!( + "Expected one path for block group {} (contig name: {}", + block_group_id, contig_name + ); + } + + let path = paths[0].clone(); // Assuming just one entry in the fasta file let record = fasta_reader.records().next().ok_or_else(|| { @@ -77,7 +99,7 @@ pub fn update_with_fasta( }; let path_change = PathChange { - block_group_id: path.block_group_id, + block_group_id, path: path.clone(), path_accession: None, start: start_coordinate, @@ -110,8 +132,6 @@ pub fn update_with_fasta( &edge_from_new_node, ); - let operation_parent_id = operation.parent_id.unwrap(); - let operation_paths = OperationPath::paths_for_operation(conn, operation_parent_id); for operation_path in operation_paths { if operation_path.path_id != path.id { OperationPath::create(conn, operation.id, operation_path.path_id); @@ -133,3 +153,358 @@ pub fn update_with_fasta( Ok(()) } + +#[cfg(test)] +mod tests { + // Note this useful idiom: importing names from outer (for mod tests) scope. 
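
As a minimal, self-contained illustration of the splice arithmetic the tests below expect: start and end are treated as a half-open [start, end) range on the path sequence, so updating fixtures/simple.fa at 2..5 with fixtures/aaaaaaaa.fa keeps "AT", drops "CGA", and keeps the remainder. The splice helper here is purely illustrative and is not part of gen's API.

fn splice(original: &str, start: usize, end: usize, replacement: &str) -> String {
    // Keep [0, start), insert the replacement, keep [end, len).
    format!("{}{}{}", &original[..start], replacement, &original[end..])
}

fn main() {
    // The sequence from fixtures/simple.fa, edited with fixtures/aaaaaaaa.fa at [2, 5).
    let original = "ATCGATCGATCGATCGATCGGGAACACACAGAGA";
    let edited = splice(original, 2, 5, "AAAAAAAA");
    // Matches the second expected sequence in test_update_with_fasta below.
    assert_eq!(edited, "ATAAAAAAAATCGATCGATCGATCGGGAACACACAGAGA");
    println!("{edited}");
}
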
+ use super::*; + use crate::imports::fasta::import_fasta; + use crate::models::{metadata, operations::setup_db}; + use crate::test_helpers::{get_connection, get_operation_connection, setup_gen_dir}; + use std::collections::HashSet; + use std::path::PathBuf; + + #[test] + fn test_update_with_fasta() { + /* + Graph after fasta update: + AT ----> CGA ------> TCGATCGATCGATCGGGAACACACAGAGA + \-> AAAAAAAA --/ + */ + setup_gen_dir(); + let mut fasta_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_path.push("fixtures/simple.fa"); + let mut fasta_update_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_update_path.push("fixtures/aaaaaaaa.fa"); + let conn = &get_connection(None); + let db_uuid = metadata::get_db_uuid(conn); + let op_conn = &get_operation_connection(None); + setup_db(op_conn, &db_uuid); + + let collection = "test".to_string(); + + import_fasta( + &fasta_path.to_str().unwrap().to_string(), + &collection, + false, + conn, + op_conn, + ); + let _ = update_with_fasta( + conn, + op_conn, + &collection, + "m123", + 2, + 5, + fasta_update_path.to_str().unwrap(), + ); + let expected_sequences = vec![ + "ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string(), + "ATAAAAAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), + ]; + assert_eq!( + BlockGroup::get_all_sequences(conn, 1), + HashSet::from_iter(expected_sequences), + ); + } + + #[test] + fn test_update_within_update() { + /* + Graph after fasta updates: + AT --------------> CGA ----------------> TCGATCGATCGATCGGGAACACACAGAGA + \-> AA -----> AA -------> AAAA --/ + \--> TTTTTTTT --/ + */ + setup_gen_dir(); + let mut fasta_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_path.push("fixtures/simple.fa"); + let mut fasta_update1_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_update1_path.push("fixtures/aaaaaaaa.fa"); + let mut fasta_update2_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_update2_path.push("fixtures/tttttttt.fa"); + let conn = &get_connection(None); + let db_uuid = metadata::get_db_uuid(conn); + let op_conn = &get_operation_connection(None); + setup_db(op_conn, &db_uuid); + + let collection = "test".to_string(); + + import_fasta( + &fasta_path.to_str().unwrap().to_string(), + &collection, + false, + conn, + op_conn, + ); + let _ = update_with_fasta( + conn, + op_conn, + &collection, + "m123", + 2, + 5, + fasta_update1_path.to_str().unwrap(), + ); + // Second fasta update replacing part of the first update sequence + let _ = update_with_fasta( + conn, + op_conn, + &collection, + "m123", + 4, + 6, + fasta_update2_path.to_str().unwrap(), + ); + let expected_sequences = vec![ + "ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string(), + "ATAAAAAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), + "ATAATTTTTTTTAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), + ]; + assert_eq!( + BlockGroup::get_all_sequences(conn, 1), + HashSet::from_iter(expected_sequences), + ); + } + + #[test] + fn test_update_with_two_fastas_partial_leading_overlap() { + /* + Graph after fasta updates: + A --> T --------------> CGA ----------------> TCGATCGATCGATCGGGAACACACAGAGA + \ \-> AAAA -------> AAAA --/ + \--> TTTTTTTT --/ + */ + setup_gen_dir(); + let mut fasta_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_path.push("fixtures/simple.fa"); + let mut fasta_update1_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_update1_path.push("fixtures/aaaaaaaa.fa"); + let mut fasta_update2_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_update2_path.push("fixtures/tttttttt.fa"); + let conn = 
&get_connection(None); + let db_uuid = metadata::get_db_uuid(conn); + let op_conn = &get_operation_connection(None); + setup_db(op_conn, &db_uuid); + + let collection = "test".to_string(); + + import_fasta( + &fasta_path.to_str().unwrap().to_string(), + &collection, + false, + conn, + op_conn, + ); + let _ = update_with_fasta( + conn, + op_conn, + &collection, + "m123", + 2, + 5, + fasta_update1_path.to_str().unwrap(), + ); + // Second fasta update replacing parts of both the original and first update sequences + let _ = update_with_fasta( + conn, + op_conn, + &collection, + "m123", + 1, + 6, + fasta_update2_path.to_str().unwrap(), + ); + let expected_sequences = vec![ + "ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string(), + "ATAAAAAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), + "ATTTTTTTTAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), + ]; + assert_eq!( + BlockGroup::get_all_sequences(conn, 1), + HashSet::from_iter(expected_sequences), + ); + } + + #[test] + fn test_update_with_two_fastas_partial_trailing_overlap() { + /* + Graph after fasta updates: + A --> T --------------> CGA ----------------> TC --> GATCGATCGATCGGGAACACACAGAGA + \ \-----> AAAAAAAA ---------/ / + \-------------> TTTTTTTT ---------------------/ + */ + /* + Graph after fasta updates: + AT --------------> CGA ------------> TC --> GATCGATCGATCGGGAACACACAGAGA + \-> AAAA -------> AAAA ----/ / + \--> TTTTTTTT --------/ + */ + setup_gen_dir(); + let mut fasta_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_path.push("fixtures/simple.fa"); + let mut fasta_update1_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_update1_path.push("fixtures/aaaaaaaa.fa"); + let mut fasta_update2_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_update2_path.push("fixtures/tttttttt.fa"); + let conn = &get_connection(None); + let db_uuid = metadata::get_db_uuid(conn); + let op_conn = &get_operation_connection(None); + setup_db(op_conn, &db_uuid); + + let collection = "test".to_string(); + + import_fasta( + &fasta_path.to_str().unwrap().to_string(), + &collection, + false, + conn, + op_conn, + ); + let _ = update_with_fasta( + conn, + op_conn, + &collection, + "m123", + 2, + 5, + fasta_update1_path.to_str().unwrap(), + ); + // Second fasta update replacing parts of both the original and first update sequences + let _ = update_with_fasta( + conn, + op_conn, + &collection, + "m123", + 1, + 12, + fasta_update2_path.to_str().unwrap(), + ); + let expected_sequences = vec![ + "ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string(), + "ATAAAAAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), + "ATTTTTTTTGATCGATCGATCGGGAACACACAGAGA".to_string(), + ]; + assert_eq!( + BlockGroup::get_all_sequences(conn, 1), + HashSet::from_iter(expected_sequences), + ); + } + + #[test] + fn test_update_with_two_fastas_second_over_first() { + /* + Graph after fasta updates: + AT --------------> CGA ------------> TC --> GATCGATCGATCGGGAACACACAGAGA + \-> AAAA -------> AAAA ----/ / + \--> TTTTTTTT --------/ + */ + setup_gen_dir(); + let mut fasta_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_path.push("fixtures/simple.fa"); + let mut fasta_update1_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_update1_path.push("fixtures/aaaaaaaa.fa"); + let mut fasta_update2_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_update2_path.push("fixtures/tttttttt.fa"); + let conn = &get_connection(None); + let db_uuid = metadata::get_db_uuid(conn); + let op_conn = &get_operation_connection(None); + setup_db(op_conn, &db_uuid); + + let collection = 
"test".to_string(); + + import_fasta( + &fasta_path.to_str().unwrap().to_string(), + &collection, + false, + conn, + op_conn, + ); + let _ = update_with_fasta( + conn, + op_conn, + &collection, + "m123", + 2, + 5, + fasta_update1_path.to_str().unwrap(), + ); + // Second fasta update replacing parts of both the original and first update sequences + let _ = update_with_fasta( + conn, + op_conn, + &collection, + "m123", + 6, + 12, + fasta_update2_path.to_str().unwrap(), + ); + let expected_sequences = vec![ + "ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string(), + "ATAAAAAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), + "ATAAAATTTTTTTTGATCGATCGATCGGGAACACACAGAGA".to_string(), + ]; + assert_eq!( + BlockGroup::get_all_sequences(conn, 1), + HashSet::from_iter(expected_sequences), + ); + } + + #[test] + fn test_update_with_same_fasta_twice() { + /* + Graph after fasta updates: + AT --------------> CGA ----------------> TCGATCGATCGATCGGGAACACACAGAGA + \-> AA -----> AA -------> AAAA --/ + \--> AAAAAAAA --/ + */ + setup_gen_dir(); + let mut fasta_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_path.push("fixtures/simple.fa"); + let mut fasta_update_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fasta_update_path.push("fixtures/aaaaaaaa.fa"); + let conn = &get_connection(None); + let db_uuid = metadata::get_db_uuid(conn); + let op_conn = &get_operation_connection(None); + setup_db(op_conn, &db_uuid); + + let collection = "test".to_string(); + + import_fasta( + &fasta_path.to_str().unwrap().to_string(), + &collection, + false, + conn, + op_conn, + ); + let _ = update_with_fasta( + conn, + op_conn, + &collection, + "m123", + 2, + 5, + fasta_update_path.to_str().unwrap(), + ); + // Same fasta second time + let _ = update_with_fasta( + conn, + op_conn, + &collection, + "m123", + 4, + 6, + fasta_update_path.to_str().unwrap(), + ); + let expected_sequences = vec![ + "ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string(), + "ATAAAAAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), + "ATAAAAAAAAAAAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), + ]; + assert_eq!( + BlockGroup::get_all_sequences(conn, 1), + HashSet::from_iter(expected_sequences), + ); + } +} diff --git a/src/updates/vcf.rs b/src/updates/vcf.rs index bceb19b..701619c 100644 --- a/src/updates/vcf.rs +++ b/src/updates/vcf.rs @@ -12,6 +12,7 @@ use crate::models::{ sample::Sample, sequence::Sequence, strand::Strand, + traits::*, }; use crate::{calculate_hash, operation_management, parse_genotype}; use noodles::vcf; From fc16aa3cbeee367693d7ad05b4f656e63b05f9b3 Mon Sep 17 00:00:00 2001 From: hofer Date: Tue, 29 Oct 2024 13:57:48 -0400 Subject: [PATCH 10/22] Update notebook --- .../edit-yeast-and-export-fasta.ipynb | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb b/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb index 1bb8a28..f569eb0 100644 --- a/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb +++ b/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb @@ -30,16 +30,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "--2024-10-28 16:00:21-- http://sgd-archive.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_R64-1-1_20110203.tgz\n", - "Resolving sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)... 52.92.181.51, 52.92.176.179, 52.92.194.19, ...\n", - "Connecting to sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)|52.92.181.51|:80... 
connected.\n", + "--2024-10-29 13:55:03-- http://sgd-archive.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_R64-1-1_20110203.tgz\n", + "Resolving sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)... 52.218.228.50, 52.92.132.243, 52.92.207.43, ...\n", + "Connecting to sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)|52.218.228.50|:80... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 17152288 (16M) [application/x-tar]\n", "Saving to: ‘S288C_reference_genome_R64-1-1_20110203.tgz’\n", "\n", - "S288C_reference_gen 100%[===================>] 16.36M 1.43MB/s in 13s \n", + "S288C_reference_gen 100%[===================>] 16.36M 1.19MB/s in 16s \n", "\n", - "2024-10-28 16:00:35 (1.28 MB/s) - ‘S288C_reference_genome_R64-1-1_20110203.tgz’ saved [17152288/17152288]\n", + "2024-10-29 13:55:19 (1.04 MB/s) - ‘S288C_reference_genome_R64-1-1_20110203.tgz’ saved [17152288/17152288]\n", "\n" ] } @@ -152,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "2e2f6fd6-552e-43bb-af90-d7f3d566f5fe", "metadata": {}, "outputs": [ @@ -165,12 +165,12 @@ } ], "source": [ - "!gen update --fasta cassette-edit.fa --start 3 --end 5 --path-name ref\\|NC_001138\\|" + "!gen update --fasta cassette-edit.fa --start 3 --end 5 --contig-name ref\\|NC_001138\\|" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "2fc670e3-a9a6-4155-b0dc-0d313bb81311", "metadata": {}, "outputs": [ @@ -188,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "723c463a-6159-4257-b34d-bd878efea004", "metadata": {}, "outputs": [ @@ -212,7 +212,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "98c8c254-4304-495d-8a9e-3505d21cafe7", "metadata": {}, "outputs": [ From d55f7ff488049617e5ca57dca2bd2b91a9e42c8f Mon Sep 17 00:00:00 2001 From: hofer Date: Tue, 29 Oct 2024 14:03:02 -0400 Subject: [PATCH 11/22] Redo clap commenting --- src/main.rs | 72 ++++++++++++++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/src/main.rs b/src/main.rs index 621994b..e10b7ed 100644 --- a/src/main.rs +++ b/src/main.rs @@ -38,36 +38,36 @@ fn get_default_collection(conn: &Connection) -> Option { #[derive(Subcommand)] enum Commands { - // Import a new sequence collection. + /// Import a new sequence collection. 
Import { - // Fasta file path + /// Fasta file path #[arg(short, long)] fasta: Option, - // GFA file path + /// GFA file path #[arg(short, long)] gfa: Option, - // The name of the collection to store the entry under + /// The name of the collection to store the entry under #[arg(short, long)] name: Option, - // Don't store the sequence in the database, instead store the filename + /// Don't store the sequence in the database, instead store the filename #[arg(short, long, action)] shallow: bool, }, - // Update a sequence collection with new data + /// Update a sequence collection with new data Update { - // The name of the collection to update + /// The name of the collection to update #[arg(short, long)] name: Option, - // A fasta file to insert + /// A fasta file to insert #[arg(short, long)] fasta: Option, - // A VCF file to incorporate + /// A VCF file to incorporate #[arg(short, long)] vcf: Option, - // If no genotype is provided, enter the genotype to assign variants + /// If no genotype is provided, enter the genotype to assign variants #[arg(short, long)] genotype: Option, - // If no sample is provided, enter the sample to associate variants to + /// If no sample is provided, enter the sample to associate variants to #[arg(short, long)] sample: Option, /// Use the given sample as the parent sample for changes. @@ -76,86 +76,86 @@ enum Commands { /// A CSV with combinatorial library information #[arg(short, long)] library: Option, - // A fasta with the combinatorial library parts + /// A fasta with the combinatorial library parts #[arg(long)] parts: Option, - // The name of the path to add the library to + /// The name of the path to add the library to #[arg(short, long)] path_name: Option, - // The name of the contig to update + /// The name of the contig to update #[arg(long)] contig_name: Option, - // The start coordinate for the region to add the library to + /// The start coordinate for the region to add the library to #[arg(long)] start: Option, - // The end coordinate for the region to add the library to + /// The end coordinate for the region to add the library to #[arg(short, long)] end: Option, }, - // Initialize a gen repository + /// Initialize a gen repository Init {}, - // Manage and create branches + /// Manage and create branches Branch { - // Create a branch with the given name + /// Create a branch with the given name #[arg(long, action)] create: bool, - // Delete a given branch + /// Delete a given branch #[arg(short, long, action)] delete: bool, - // Checkout a given branch + /// Checkout a given branch #[arg(long, action)] checkout: bool, - // List all branches + /// List all branches #[arg(short, long, action)] list: bool, - // The branch name + /// The branch name #[clap(index = 1)] branch_name: Option, }, - // Migrate a database to a given operation + /// Migrate a database to a given operation Checkout { - // The branch identifier to migrate to + /// The branch identifier to migrate to #[arg(short, long)] branch: Option, - // The operation id to move to + /// The operation id to move to #[clap(index = 1)] id: Option, }, Reset { - // The operation id to reset to + /// The operation id to reset to #[clap(index = 1)] id: i64, }, - // View operations carried out against a database + /// View operations carried out against a database Operations { - // The branch to list operations for + /// The branch to list operations for #[arg(short, long)] branch: Option, }, Apply { - // The operation id to apply + /// The operation id to apply #[clap(index = 1)] id: i64, }, 
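
The switch from // to /// in this patch is functional, not cosmetic: clap's derive macros turn doc comments on commands and fields into the text printed by --help, whereas ordinary // comments never reach the macro at all. A minimal sketch of that behaviour, assuming clap 4 with the derive feature enabled; the struct below is a toy, not gen's actual CLI.

use clap::Parser;

/// A toy command-line tool used only to illustrate doc-comment help text.
#[derive(Parser)]
struct ToyArgs {
    /// Fasta file path (this sentence becomes the --help description)
    #[arg(short, long)]
    fasta: Option<String>,
}

fn main() {
    let args = ToyArgs::parse();
    println!("fasta = {:?}", args.fasta);
}

Running the toy binary with --help prints "Fasta file path ..." next to -f, --fasta, which is what this change enables for gen's real subcommand arguments.
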
Export { - // The name of the collection to export + /// The name of the collection to export #[arg(short, long)] name: Option, - // The name of the GFA file to export to + /// The name of the GFA file to export to #[arg(short, long)] gfa: Option, - // An optional sample name + /// An optional sample name #[arg(short, long)] sample: Option, - // The name of the fasta file to export to + /// The name of the fasta file to export to #[arg(short, long)] fasta: Option, }, Defaults { - // The default database to use + /// The default database to use #[arg(short, long)] database: Option, - // The default collection to use + /// The default collection to use #[arg(short, long)] collection: Option, }, From 1490dcca90613c84f46e05b4e7d4b871301d03fb Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 30 Oct 2024 12:14:41 -0400 Subject: [PATCH 12/22] Get rarray for queries working --- src/models/path.rs | 20 ++++++++++++++++++++ src/updates/fasta.rs | 17 ++++++++--------- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/src/models/path.rs b/src/models/path.rs index abacbdf..41a5229 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -4,6 +4,7 @@ use std::collections::{HashMap, HashSet}; use intervaltree::IntervalTree; use itertools::Itertools; use rusqlite::types::Value; +use rusqlite::Params; use rusqlite::{params_from_iter, Connection}; use serde::{Deserialize, Serialize}; @@ -147,6 +148,25 @@ impl Path { rows.next().unwrap().unwrap() } + pub fn full_query(conn: &Connection, query: &str, params: impl Params) -> Vec { + let mut stmt = conn.prepare(query).unwrap(); + let rows = stmt + .query_map(params, |row| { + let path_id = row.get(0).unwrap(); + Ok(Path { + id: path_id, + block_group_id: row.get(1)?, + name: row.get(2)?, + }) + }) + .unwrap(); + let mut paths = vec![]; + for row in rows { + paths.push(row.unwrap()); + } + paths + } + pub fn get_paths(conn: &Connection, query: &str, placeholders: Vec) -> Vec { let mut stmt = conn.prepare(query).unwrap(); let rows = stmt diff --git a/src/updates/fasta.rs b/src/updates/fasta.rs index 38775c0..7015551 100644 --- a/src/updates/fasta.rs +++ b/src/updates/fasta.rs @@ -1,6 +1,7 @@ use noodles::fasta; +use rusqlite; use rusqlite::{types::Value as SQLValue, Connection}; -use std::{io, str}; +use std::{io, rc::Rc, str}; use crate::models::{ block_group::{BlockGroup, PathChange}, @@ -46,17 +47,15 @@ pub fn update_with_fasta( let path_ids = operation_paths .iter() - .map(|operation_path| operation_path.path_id.to_string()) - .collect::>(); - let query = format!( - "select * from path where block_group_id = {block_group_id} and id in ({path_ids});", - path_ids = path_ids.join(", ") - ); - let paths = Path::query(conn, &query, vec![]); + .map(|operation_path| SQLValue::from(operation_path.path_id)) + .collect::>(); + let query = "select * from path where block_group_id = ?1 and id in rarray(?2);"; + let params = rusqlite::params!(SQLValue::from(block_group_id), Rc::new(path_ids)); + let paths = Path::full_query(conn, query, params); if paths.len() != 1 { panic!( - "Expected one path for block group {} (contig name: {}", + "Expected one path for block group {} (contig name: {})", block_group_id, contig_name ); } From cd1b8a64577a5003503670e95caf00f842fe8c88 Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 30 Oct 2024 13:14:35 -0400 Subject: [PATCH 13/22] Updates after rebase --- src/exports/fasta.rs | 2 +- src/models/path.rs | 118 ++++++++++++++++++++++++++++++++++++++++++- src/updates/fasta.rs | 2 +- 3 files changed, 119 insertions(+), 3 
deletions(-) diff --git a/src/exports/fasta.rs b/src/exports/fasta.rs index 109964a..f6e61a5 100644 --- a/src/exports/fasta.rs +++ b/src/exports/fasta.rs @@ -16,7 +16,7 @@ pub fn export_fasta(conn: &Connection, operation_id: i64, filename: &PathBuf) { let block_group = BlockGroup::get_by_id(conn, path.block_group_id); let definition = fasta::record::Definition::new(block_group.name, None); - let sequence = fasta::record::Sequence::from(Path::sequence(conn, path).into_bytes()); + let sequence = fasta::record::Sequence::from(path.sequence(conn).into_bytes()); let record = fasta::Record::new(definition, sequence); let _ = writer.write_record(&record); diff --git a/src/models/path.rs b/src/models/path.rs index 41a5229..4dcc843 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -557,6 +557,7 @@ impl Path { }) .clone() .collect() + } pub fn new_path_for( &self, @@ -568,7 +569,7 @@ impl Path { ) -> Path { // Creates a new path from the current one by replacing all edges between path_start and // path_end with the input edges that are to and from a new node - let tree = Path::intervaltree_for(conn, self); + let tree = self.intervaltree(conn); let block_with_start = tree.query_point(path_start).next().unwrap().value.clone(); let block_with_end = tree.query_point(path_end).next().unwrap().value.clone(); @@ -2482,4 +2483,119 @@ mod tests { assert_eq!(result_annotation.start, 0); assert_eq!(result_annotation.end, 4); } + + #[test] + fn test_new_path_for() { + let conn = &mut get_connection(None); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1 = Sequence::new() + .sequence_type("DNA") + .sequence("ATCGATCG") + .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str(), None); + let edge1 = Edge::create( + conn, + PATH_START_NODE_ID, + -123, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let sequence2 = Sequence::new() + .sequence_type("DNA") + .sequence("AAAAAAAA") + .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str(), None); + let edge2 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + node2_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge3 = Edge::create( + conn, + node2_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path1 = Path::create( + conn, + "chr1", + block_group.id, + &[edge1.id, edge2.id, edge3.id], + ); + assert_eq!(path1.sequence(conn), "ATCGATCGAAAAAAAA"); + + let sequence3 = Sequence::new() + .sequence_type("DNA") + .sequence("CCCCCCCC") + .save(conn); + let node3_id = Node::create(conn, sequence3.hash.as_str(), None); + let edge4 = Edge::create( + conn, + node1_id, + 4, + Strand::Forward, + node3_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge5 = Edge::create( + conn, + node3_id, + 8, + Strand::Forward, + node2_id, + 3, + Strand::Forward, + 0, + 0, + ); + + let path2 = path1.new_path_for(conn, 4, 11, &edge4, &edge5); + assert_eq!(path2.sequence(conn), "ATCGCCCCCCCCAAAAA"); + + let edge6 = Edge::create( + conn, + node1_id, + 4, + Strand::Forward, + node3_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge7 = Edge::create( + conn, + node3_id, + 8, + Strand::Forward, + node1_id, + 7, + Strand::Forward, + 0, + 0, + ); + + let path3 = path1.new_path_for(conn, 4, 7, &edge6, &edge7); + assert_eq!(path3.sequence(conn), "ATCGCCCCCCCCGAAAAAAAA"); + } } diff --git a/src/updates/fasta.rs b/src/updates/fasta.rs index 7015551..f9a7bea 100644 --- 
a/src/updates/fasta.rs +++ b/src/updates/fasta.rs @@ -108,7 +108,7 @@ pub fn update_with_fasta( phased: 0, }; - let interval_tree = Path::intervaltree_for(conn, &path); + let interval_tree = path.intervaltree(conn); BlockGroup::insert_change(conn, &path_change, &interval_tree); let edge_to_new_node = Edge::query( From e764c37be871cfe3e1b21a14e94ccec37c9de4a4 Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 30 Oct 2024 13:16:37 -0400 Subject: [PATCH 14/22] Rename method --- src/models/path.rs | 8 ++++---- src/updates/fasta.rs | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/models/path.rs b/src/models/path.rs index 4dcc843..23b23e5 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -559,7 +559,7 @@ impl Path { .collect() } - pub fn new_path_for( + pub fn new_path_with( &self, conn: &Connection, path_start: i64, @@ -2485,7 +2485,7 @@ mod tests { } #[test] - fn test_new_path_for() { + fn test_new_path_with() { let conn = &mut get_connection(None); Collection::create(conn, "test collection"); let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); @@ -2569,7 +2569,7 @@ mod tests { 0, ); - let path2 = path1.new_path_for(conn, 4, 11, &edge4, &edge5); + let path2 = path1.new_path_with(conn, 4, 11, &edge4, &edge5); assert_eq!(path2.sequence(conn), "ATCGCCCCCCCCAAAAA"); let edge6 = Edge::create( @@ -2595,7 +2595,7 @@ mod tests { 0, ); - let path3 = path1.new_path_for(conn, 4, 7, &edge6, &edge7); + let path3 = path1.new_path_with(conn, 4, 7, &edge6, &edge7); assert_eq!(path3.sequence(conn), "ATCGCCCCCCCCGAAAAAAAA"); } } diff --git a/src/updates/fasta.rs b/src/updates/fasta.rs index f9a7bea..eaf5508 100644 --- a/src/updates/fasta.rs +++ b/src/updates/fasta.rs @@ -123,7 +123,7 @@ pub fn update_with_fasta( vec![SQLValue::from(node_id)], )[0] .clone(); - let new_path = path.new_path_for( + let new_path = path.new_path_with( conn, start_coordinate, end_coordinate, From b2406be941d28a923ca9774fc01b4fc094f8133e Mon Sep 17 00:00:00 2001 From: hofer Date: Thu, 31 Oct 2024 10:15:52 -0400 Subject: [PATCH 15/22] Fixes after rebase --- migrations/core/01-initial/up.sql | 2 +- src/updates/fasta.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/migrations/core/01-initial/up.sql b/migrations/core/01-initial/up.sql index c93a372..fbb9e96 100644 --- a/migrations/core/01-initial/up.sql +++ b/migrations/core/01-initial/up.sql @@ -145,7 +145,7 @@ CREATE TABLE operation_paths ( id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, operation_id INTEGER NOT NULL, path_id INTEGER NOT NULL, - FOREIGN KEY(path_id) REFERENCES path(id) + FOREIGN KEY(path_id) REFERENCES paths(id) ) STRICT; CREATE UNIQUE INDEX operation_paths_uidx ON operation_paths(operation_id, path_id); diff --git a/src/updates/fasta.rs b/src/updates/fasta.rs index eaf5508..9fe0dfd 100644 --- a/src/updates/fasta.rs +++ b/src/updates/fasta.rs @@ -36,7 +36,7 @@ pub fn update_with_fasta( let mut fasta_reader = fasta::io::reader::Builder.build_from_path(fasta_file_path)?; - let block_groups = BlockGroup::query(conn, "select * from block_group where collection_name = ?1 AND sample_name is null and name = ?2;", vec![SQLValue::from(collection_name.to_string()), SQLValue::from(contig_name.to_string())]); + let block_groups = BlockGroup::query(conn, "select * from block_groups where collection_name = ?1 AND sample_name is null and name = ?2;", vec![SQLValue::from(collection_name.to_string()), SQLValue::from(contig_name.to_string())]); if block_groups.is_empty() { panic!("No block group 
found for contig {}", contig_name); } @@ -49,7 +49,7 @@ pub fn update_with_fasta( .iter() .map(|operation_path| SQLValue::from(operation_path.path_id)) .collect::>(); - let query = "select * from path where block_group_id = ?1 and id in rarray(?2);"; + let query = "select * from paths where block_group_id = ?1 and id in rarray(?2);"; let params = rusqlite::params!(SQLValue::from(block_group_id), Rc::new(path_ids)); let paths = Path::full_query(conn, query, params); From e8dafbb9afcff049ac5badad86941f63f62b4c24 Mon Sep 17 00:00:00 2001 From: hofer Date: Thu, 31 Oct 2024 11:37:02 -0400 Subject: [PATCH 16/22] Make update fasta header nonsense to make it clearer the contig name parameter is what matters --- .../edit-yeast-and-export-fasta.ipynb | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb b/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb index f569eb0..a726bc4 100644 --- a/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb +++ b/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb @@ -30,16 +30,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "--2024-10-29 13:55:03-- http://sgd-archive.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_R64-1-1_20110203.tgz\n", - "Resolving sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)... 52.218.228.50, 52.92.132.243, 52.92.207.43, ...\n", - "Connecting to sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)|52.218.228.50|:80... connected.\n", + "--2024-10-31 11:34:49-- http://sgd-archive.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_R64-1-1_20110203.tgz\n", + "Resolving sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)... 52.92.138.67, 52.92.208.19, 52.92.133.227, ...\n", + "Connecting to sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)|52.92.138.67|:80... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 17152288 (16M) [application/x-tar]\n", "Saving to: ‘S288C_reference_genome_R64-1-1_20110203.tgz’\n", "\n", - "S288C_reference_gen 100%[===================>] 16.36M 1.19MB/s in 16s \n", + "S288C_reference_gen 100%[===================>] 16.36M 739KB/s in 18s \n", "\n", - "2024-10-29 13:55:19 (1.04 MB/s) - ‘S288C_reference_genome_R64-1-1_20110203.tgz’ saved [17152288/17152288]\n", + "2024-10-31 11:35:07 (948 KB/s) - ‘S288C_reference_genome_R64-1-1_20110203.tgz’ saved [17152288/17152288]\n", "\n" ] } @@ -146,13 +146,14 @@ ], "source": [ "!rm cassette-edit.fa\n", - "!echo \">ref|NC_001138|\\n\" >> cassette-edit.fa\n", + "# NOTE: The header doesn't matter, gen uses the --contig-name argument to know which contig to apply the change to\n", + "!echo \">foo\\n\" >> cassette-edit.fa\n", "!echo \"ATCGATCG\\n\" >> cassette-edit.fa" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "2e2f6fd6-552e-43bb-af90-d7f3d566f5fe", "metadata": {}, "outputs": [ @@ -170,7 +171,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "2fc670e3-a9a6-4155-b0dc-0d313bb81311", "metadata": {}, "outputs": [ @@ -188,7 +189,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "723c463a-6159-4257-b34d-bd878efea004", "metadata": {}, "outputs": [ @@ -212,7 +213,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "98c8c254-4304-495d-8a9e-3505d21cafe7", "metadata": {}, "outputs": [ From 757b035fb70fe2edaac1c0766530772fada7468e Mon Sep 17 00:00:00 2001 From: hofer Date: Tue, 5 Nov 2024 14:45:02 -0500 Subject: [PATCH 17/22] Fixes after rebase --- src/models/path.rs | 4 ++-- src/test_helpers.rs | 1 + src/updates/fasta.rs | 12 ++++++------ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/models/path.rs b/src/models/path.rs index 23b23e5..5e26791 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -570,8 +570,8 @@ impl Path { // Creates a new path from the current one by replacing all edges between path_start and // path_end with the input edges that are to and from a new node let tree = self.intervaltree(conn); - let block_with_start = tree.query_point(path_start).next().unwrap().value.clone(); - let block_with_end = tree.query_point(path_end).next().unwrap().value.clone(); + let block_with_start = tree.query_point(path_start).next().unwrap().value; + let block_with_end = tree.query_point(path_end).next().unwrap().value; let edges = PathEdge::edges_for_path(conn, self.id); let edges_by_source = edges diff --git a/src/test_helpers.rs b/src/test_helpers.rs index a8792d1..385a5a5 100644 --- a/src/test_helpers.rs +++ b/src/test_helpers.rs @@ -18,6 +18,7 @@ use crate::models::node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}; use crate::models::path::Path; use crate::models::sequence::Sequence; use crate::models::strand::Strand; +use crate::models::traits::*; pub fn get_connection<'a>(db_path: impl Into>) -> Connection { let path: Option<&str> = db_path.into(); diff --git a/src/updates/fasta.rs b/src/updates/fasta.rs index 9fe0dfd..1006033 100644 --- a/src/updates/fasta.rs +++ b/src/updates/fasta.rs @@ -203,7 +203,7 @@ mod tests { "ATAAAAAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), ]; assert_eq!( - BlockGroup::get_all_sequences(conn, 1), + BlockGroup::get_all_sequences(conn, 1, false), HashSet::from_iter(expected_sequences), ); } @@ -262,7 +262,7 @@ mod tests { "ATAATTTTTTTTAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), ]; assert_eq!( - 
BlockGroup::get_all_sequences(conn, 1), + BlockGroup::get_all_sequences(conn, 1, false), HashSet::from_iter(expected_sequences), ); } @@ -321,7 +321,7 @@ mod tests { "ATTTTTTTTAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), ]; assert_eq!( - BlockGroup::get_all_sequences(conn, 1), + BlockGroup::get_all_sequences(conn, 1, false), HashSet::from_iter(expected_sequences), ); } @@ -386,7 +386,7 @@ mod tests { "ATTTTTTTTGATCGATCGATCGGGAACACACAGAGA".to_string(), ]; assert_eq!( - BlockGroup::get_all_sequences(conn, 1), + BlockGroup::get_all_sequences(conn, 1, false), HashSet::from_iter(expected_sequences), ); } @@ -445,7 +445,7 @@ mod tests { "ATAAAATTTTTTTTGATCGATCGATCGGGAACACACAGAGA".to_string(), ]; assert_eq!( - BlockGroup::get_all_sequences(conn, 1), + BlockGroup::get_all_sequences(conn, 1, false), HashSet::from_iter(expected_sequences), ); } @@ -502,7 +502,7 @@ mod tests { "ATAAAAAAAAAAAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), ]; assert_eq!( - BlockGroup::get_all_sequences(conn, 1), + BlockGroup::get_all_sequences(conn, 1, false), HashSet::from_iter(expected_sequences), ); } From 154778e94021377bff78bd40234a0b73949d8a17 Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 6 Nov 2024 15:07:22 -0500 Subject: [PATCH 18/22] Rework to use samples as basis for lineage --- migrations/core/01-initial/up.sql | 8 -- src/exports/fasta.rs | 41 +++++--- src/imports/fasta.rs | 7 +- src/main.rs | 22 ++-- src/models.rs | 1 - src/models/block_group.rs | 9 ++ src/models/operation_path.rs | 59 ----------- src/updates/fasta.rs | 160 ++++++++++++++++++++++-------- 8 files changed, 173 insertions(+), 134 deletions(-) delete mode 100644 src/models/operation_path.rs diff --git a/migrations/core/01-initial/up.sql b/migrations/core/01-initial/up.sql index fbb9e96..16e2dd4 100644 --- a/migrations/core/01-initial/up.sql +++ b/migrations/core/01-initial/up.sql @@ -141,14 +141,6 @@ CREATE TABLE block_group_edges ( ) STRICT; CREATE UNIQUE INDEX block_group_edges_uidx ON block_group_edges(block_group_id, edge_id); -CREATE TABLE operation_paths ( - id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, - operation_id INTEGER NOT NULL, - path_id INTEGER NOT NULL, - FOREIGN KEY(path_id) REFERENCES paths(id) -) STRICT; -CREATE UNIQUE INDEX operation_paths_uidx ON operation_paths(operation_id, path_id); - INSERT INTO gen_metadata (db_uuid) values (lower( hex(randomblob(4)) || '-' || hex(randomblob(2)) || '-' || '4' || substr(hex( randomblob(2)), 2) || '-' || diff --git a/src/exports/fasta.rs b/src/exports/fasta.rs index f6e61a5..9a1d1e5 100644 --- a/src/exports/fasta.rs +++ b/src/exports/fasta.rs @@ -1,19 +1,31 @@ use noodles::fasta; -use rusqlite::Connection; +use rusqlite::{types::Value as SQLValue, Connection}; use std::fs::File; use std::path::PathBuf; -use crate::models::{block_group::BlockGroup, operation_path::OperationPath, path::Path}; - -pub fn export_fasta(conn: &Connection, operation_id: i64, filename: &PathBuf) { - let operation_paths = OperationPath::paths_for_operation(conn, operation_id); +use crate::models::block_group::BlockGroup; +use crate::models::traits::*; + +pub fn export_fasta( + conn: &Connection, + collection_name: &str, + sample_name: &str, + filename: &PathBuf, +) { + let block_groups = BlockGroup::query( + conn, + "select * from block_groups where collection_name = ?1 AND sample_name = ?2;", + vec![ + SQLValue::from(collection_name.to_string()), + SQLValue::from(sample_name.to_string()), + ], + ); let file = File::create(filename).unwrap(); let mut writer = fasta::io::Writer::new(file); - for operation_path 
in operation_paths { - let path = Path::get(conn, operation_path.path_id); - let block_group = BlockGroup::get_by_id(conn, path.block_group_id); + for block_group in block_groups { + let path = BlockGroup::get_current_path(conn, block_group.id); let definition = fasta::record::Definition::new(block_group.name, None); let sequence = fasta::record::Sequence::from(path.sequence(conn).into_bytes()); @@ -30,10 +42,7 @@ mod tests { // Note this useful idiom: importing names from outer (for mod tests) scope. use super::*; use crate::imports::fasta::import_fasta; - use crate::models::{ - metadata, - operations::{setup_db, OperationState}, - }; + use crate::models::{metadata, operations::setup_db}; use crate::test_helpers::{get_connection, get_operation_connection, setup_gen_dir}; use crate::updates::fasta::update_with_fasta; use noodles::fasta; @@ -60,10 +69,9 @@ mod tests { conn, op_conn, ); - let current_op_id = OperationState::get_operation(op_conn, &db_uuid).unwrap(); let tmp_dir = TempDir::new("export_fasta_files").unwrap().into_path(); let filename = tmp_dir.join("out.fa"); - export_fasta(conn, current_op_id, &filename); + export_fasta(conn, &collection, "", &filename); let mut fasta_reader = fasta::io::reader::Builder .build_from_path(filename) @@ -113,16 +121,17 @@ mod tests { conn, op_conn, &collection, + "", + "new sample", "m123", 2, 5, fasta_update_path.to_str().unwrap(), ); - let current_op_id = OperationState::get_operation(op_conn, &db_uuid).unwrap(); let tmp_dir = TempDir::new("export_fasta_files").unwrap().into_path(); let filename = tmp_dir.join("out.fa"); - export_fasta(conn, current_op_id, &filename); + export_fasta(conn, &collection, "new sample", &filename); let mut fasta_reader = fasta::io::reader::Builder .build_from_path(filename) diff --git a/src/imports/fasta.rs b/src/imports/fasta.rs index 77d4eb5..d3220fc 100644 --- a/src/imports/fasta.rs +++ b/src/imports/fasta.rs @@ -10,8 +10,8 @@ use crate::models::{ edge::Edge, metadata, node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}, - operation_path::OperationPath, path::Path, + sample::Sample, sequence::Sequence, strand::Strand, }; @@ -49,6 +49,8 @@ pub fn import_fasta( name: name.to_string(), } }; + let sample_name = ""; + let _sample = Sample::create(conn, sample_name); let mut summary: HashMap = HashMap::new(); for result in reader.records() { @@ -80,7 +82,7 @@ pub fn import_fasta( hash = seq.hash )), ); - let block_group = BlockGroup::create(conn, &collection.name, None, &name); + let block_group = BlockGroup::create(conn, &collection.name, Some(sample_name), &name); let edge_into = Edge::create( conn, PATH_START_NODE_ID, @@ -105,7 +107,6 @@ pub fn import_fasta( ); BlockGroupEdge::bulk_create(conn, block_group.id, &[edge_into.id, edge_out_of.id]); let path = Path::create(conn, &name, block_group.id, &[edge_into.id, edge_out_of.id]); - OperationPath::create(conn, operation.id, path.id); summary.entry(path.name).or_insert(sequence_length); } let mut summary_str = "".to_string(); diff --git a/src/main.rs b/src/main.rs index e10b7ed..9573590 100644 --- a/src/main.rs +++ b/src/main.rs @@ -70,6 +70,9 @@ enum Commands { /// If no sample is provided, enter the sample to associate variants to #[arg(short, long)] sample: Option, + /// New sample name if we are updating with intentional edits + #[arg(long)] + new_sample: Option, /// Use the given sample as the parent sample for changes. 
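
In the reworked per-sample exporter above, the noodles calls do the actual FASTA writing. A minimal, runnable sketch of just those calls, mirroring the ones used in src/exports/fasta.rs; the output path, record name, and sequence below are placeholders, not values gen produces.

use noodles::fasta;
use std::fs::File;

fn main() -> std::io::Result<()> {
    let file = File::create("example-out.fa")?;
    let mut writer = fasta::io::Writer::new(file);

    // One record per block group: the block group name becomes the FASTA
    // header and the current path sequence becomes the record body.
    let definition = fasta::record::Definition::new("chr1".to_string(), None);
    let sequence = fasta::record::Sequence::from(b"ATCGATCGATCG".to_vec());
    let record = fasta::Record::new(definition, sequence);

    writer.write_record(&record)?;
    Ok(())
}
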
#[arg(long, alias = "cf")] coordinate_frame: Option, @@ -82,9 +85,9 @@ enum Commands { /// The name of the path to add the library to #[arg(short, long)] path_name: Option, - /// The name of the contig to update + /// The name of the region to update (eg "chr1") #[arg(long)] - contig_name: Option, + region_name: Option, /// The start coordinate for the region to add the library to #[arg(long)] start: Option, @@ -247,8 +250,9 @@ fn main() { parts, genotype, sample, + new_sample, path_name, - contig_name, + region_name, start, end, coordinate_frame, @@ -277,7 +281,9 @@ fn main() { &conn, &operation_conn, name, - &contig_name.clone().unwrap(), + &sample.clone().unwrap_or("".to_string()), + &new_sample.clone().unwrap(), + ®ion_name.clone().unwrap(), start.unwrap(), end.unwrap(), fasta_path, @@ -427,8 +433,12 @@ fn main() { if let Some(gfa_path) = gfa { export_gfa(&conn, name, &PathBuf::from(gfa_path), sample.clone()); } else if let Some(fasta_path) = fasta { - let current_op = OperationState::get_operation(&operation_conn, &db_uuid).unwrap(); - export_fasta(&conn, current_op, &PathBuf::from(fasta_path)); + export_fasta( + &conn, + name, + &sample.clone().unwrap_or("".to_string()), + &PathBuf::from(fasta_path), + ); } else { panic!("No file type specified for export."); } diff --git a/src/models.rs b/src/models.rs index 5a7d4bf..84fc4d6 100644 --- a/src/models.rs +++ b/src/models.rs @@ -6,7 +6,6 @@ pub mod edge; pub mod file_types; pub mod metadata; pub mod node; -pub mod operation_path; pub mod operations; pub mod path; pub mod path_edge; diff --git a/src/models/block_group.rs b/src/models/block_group.rs index 12433a6..87ea38d 100644 --- a/src/models/block_group.rs +++ b/src/models/block_group.rs @@ -800,6 +800,15 @@ impl BlockGroup { .collect(); tree } + + pub fn get_current_path(conn: &Connection, block_group_id: i64) -> Path { + let paths = Path::get_paths( + conn, + "SELECT * FROM paths WHERE block_group_id = ?1 ORDER BY id DESC", + vec![SQLValue::from(block_group_id)], + ); + paths[0].clone() + } } impl Query for BlockGroup { diff --git a/src/models/operation_path.rs b/src/models/operation_path.rs deleted file mode 100644 index c6fa9fa..0000000 --- a/src/models/operation_path.rs +++ /dev/null @@ -1,59 +0,0 @@ -use crate::models::traits::Query; -use rusqlite::{params_from_iter, types::Value as SQLValue, Connection, Row}; - -#[derive(Clone, Debug)] -pub struct OperationPath { - pub id: i64, - pub operation_id: i64, - pub path_id: i64, -} - -impl Query for OperationPath { - type Model = OperationPath; - fn process_row(row: &Row) -> Self::Model { - OperationPath { - id: row.get(0).unwrap(), - operation_id: row.get(1).unwrap(), - path_id: row.get(2).unwrap(), - } - } -} - -impl OperationPath { - pub fn create(conn: &Connection, operation_id: i64, path_id: i64) -> i64 { - let insert_statement = - "INSERT INTO operation_paths (operation_id, path_id) VALUES (?1, ?2) RETURNING (id);"; - let mut stmt = conn.prepare_cached(insert_statement).unwrap(); - let mut rows = stmt - .query_map( - params_from_iter(vec![SQLValue::from(operation_id), SQLValue::from(path_id)]), - |row| row.get(0), - ) - .unwrap(); - match rows.next().unwrap() { - Ok(res) => res, - Err(rusqlite::Error::SqliteFailure(err, details)) => { - if err.code == rusqlite::ErrorCode::ConstraintViolation { - println!("{err:?} {details:?}"); - let placeholders = vec![operation_id, path_id]; - let query = "SELECT id from operation_paths where id = ?1 and path_id = ?2;"; - conn.query_row(query, params_from_iter(&placeholders), |row| row.get(0)) 
- .unwrap() - } else { - panic!("something bad happened querying the database") - } - } - Err(_) => { - panic!("something bad happened querying the database") - } - } - } - - pub fn paths_for_operation(conn: &Connection, operation_id: i64) -> Vec { - let select_statement = format!( - "SELECT * FROM operation_paths WHERE operation_id = {0};", - operation_id - ); - OperationPath::query(conn, &select_statement, vec![]) - } -} diff --git a/src/updates/fasta.rs b/src/updates/fasta.rs index 1006033..9788c3c 100644 --- a/src/updates/fasta.rs +++ b/src/updates/fasta.rs @@ -1,26 +1,29 @@ use noodles::fasta; use rusqlite; use rusqlite::{types::Value as SQLValue, Connection}; -use std::{io, rc::Rc, str}; +use std::{io, str}; use crate::models::{ block_group::{BlockGroup, PathChange}, edge::Edge, file_types::FileTypes, node::Node, - operation_path::OperationPath, - path::{Path, PathBlock}, + path::PathBlock, + sample::Sample, sequence::Sequence, strand::Strand, traits::*, }; use crate::{calculate_hash, operation_management}; +#[allow(clippy::too_many_arguments)] pub fn update_with_fasta( conn: &Connection, operation_conn: &Connection, collection_name: &str, - contig_name: &str, + parent_sample_name: &str, + new_sample_name: &str, + region_name: &str, start_coordinate: i64, end_coordinate: i64, fasta_file_path: &str, @@ -36,31 +39,37 @@ pub fn update_with_fasta( let mut fasta_reader = fasta::io::reader::Builder.build_from_path(fasta_file_path)?; - let block_groups = BlockGroup::query(conn, "select * from block_groups where collection_name = ?1 AND sample_name is null and name = ?2;", vec![SQLValue::from(collection_name.to_string()), SQLValue::from(contig_name.to_string())]); - if block_groups.is_empty() { - panic!("No block group found for contig {}", contig_name); + let _new_sample = Sample::create(conn, new_sample_name); + let block_groups = BlockGroup::query( + conn, + "select * from block_groups where collection_name = ?1 AND sample_name = ?2;", + vec![ + SQLValue::from(collection_name.to_string()), + SQLValue::from(parent_sample_name.to_string()), + ], + ); + + let mut new_block_group_id = 0; + for block_group in block_groups { + let new_bg_id = BlockGroup::get_or_create_sample_block_group( + conn, + collection_name, + new_sample_name, + &block_group.name, + Some(parent_sample_name), + ) + .unwrap(); + BlockGroup::clone(conn, block_group.id, new_bg_id); + if block_group.name == region_name { + new_block_group_id = new_bg_id; + } } - let block_group_id = block_groups[0].id; - - let operation_parent_id = operation.parent_id.unwrap(); - let operation_paths = OperationPath::paths_for_operation(conn, operation_parent_id); - - let path_ids = operation_paths - .iter() - .map(|operation_path| SQLValue::from(operation_path.path_id)) - .collect::>(); - let query = "select * from paths where block_group_id = ?1 and id in rarray(?2);"; - let params = rusqlite::params!(SQLValue::from(block_group_id), Rc::new(path_ids)); - let paths = Path::full_query(conn, query, params); - - if paths.len() != 1 { - panic!( - "Expected one path for block group {} (contig name: {})", - block_group_id, contig_name - ); + + if new_block_group_id == 0 { + panic!("No block group found with region name: {}", region_name); } - let path = paths[0].clone(); + let path = BlockGroup::get_current_path(conn, new_block_group_id); // Assuming just one entry in the fasta file let record = fasta_reader.records().next().ok_or_else(|| { @@ -98,7 +107,7 @@ pub fn update_with_fasta( }; let path_change = PathChange { - block_group_id, + 
block_group_id: new_block_group_id, path: path.clone(), path_accession: None, start: start_coordinate, @@ -131,14 +140,6 @@ pub fn update_with_fasta( &edge_from_new_node, ); - for operation_path in operation_paths { - if operation_path.path_id != path.id { - OperationPath::create(conn, operation.id, operation_path.path_id); - } - } - - OperationPath::create(conn, operation.id, new_path.id); - let summary_str = format!(" {}: 1 change", new_path.name); operation_management::end_operation( conn, @@ -193,17 +194,29 @@ mod tests { conn, op_conn, &collection, + "", + "new sample", "m123", 2, 5, fasta_update_path.to_str().unwrap(), ); + let expected_sequences = vec![ "ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string(), "ATAAAAAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), ]; + let block_groups = BlockGroup::query( + conn, + "select * from block_groups where collection_name = ?1 AND sample_name = ?2;", + vec![ + SQLValue::from(collection), + SQLValue::from("new sample".to_string()), + ], + ); + assert_eq!(block_groups.len(), 1); assert_eq!( - BlockGroup::get_all_sequences(conn, 1, false), + BlockGroup::get_all_sequences(conn, block_groups[0].id, false), HashSet::from_iter(expected_sequences), ); } @@ -241,6 +254,8 @@ mod tests { conn, op_conn, &collection, + "", + "new sample", "m123", 2, 5, @@ -251,6 +266,8 @@ mod tests { conn, op_conn, &collection, + "", + "new sample", "m123", 4, 6, @@ -261,8 +278,17 @@ mod tests { "ATAAAAAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), "ATAATTTTTTTTAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), ]; + let block_groups = BlockGroup::query( + conn, + "select * from block_groups where collection_name = ?1 AND sample_name = ?2;", + vec![ + SQLValue::from(collection), + SQLValue::from("new sample".to_string()), + ], + ); + assert_eq!(block_groups.len(), 1); assert_eq!( - BlockGroup::get_all_sequences(conn, 1, false), + BlockGroup::get_all_sequences(conn, block_groups[0].id, false), HashSet::from_iter(expected_sequences), ); } @@ -300,6 +326,8 @@ mod tests { conn, op_conn, &collection, + "", + "new sample", "m123", 2, 5, @@ -310,6 +338,8 @@ mod tests { conn, op_conn, &collection, + "", + "new sample", "m123", 1, 6, @@ -320,8 +350,17 @@ mod tests { "ATAAAAAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), "ATTTTTTTTAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), ]; + let block_groups = BlockGroup::query( + conn, + "select * from block_groups where collection_name = ?1 AND sample_name = ?2;", + vec![ + SQLValue::from(collection), + SQLValue::from("new sample".to_string()), + ], + ); + assert_eq!(block_groups.len(), 1); assert_eq!( - BlockGroup::get_all_sequences(conn, 1, false), + BlockGroup::get_all_sequences(conn, block_groups[0].id, false), HashSet::from_iter(expected_sequences), ); } @@ -365,6 +404,8 @@ mod tests { conn, op_conn, &collection, + "", + "new sample", "m123", 2, 5, @@ -375,6 +416,8 @@ mod tests { conn, op_conn, &collection, + "", + "new sample", "m123", 1, 12, @@ -385,8 +428,17 @@ mod tests { "ATAAAAAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), "ATTTTTTTTGATCGATCGATCGGGAACACACAGAGA".to_string(), ]; + let block_groups = BlockGroup::query( + conn, + "select * from block_groups where collection_name = ?1 AND sample_name = ?2;", + vec![ + SQLValue::from(collection), + SQLValue::from("new sample".to_string()), + ], + ); + assert_eq!(block_groups.len(), 1); assert_eq!( - BlockGroup::get_all_sequences(conn, 1, false), + BlockGroup::get_all_sequences(conn, block_groups[0].id, false), HashSet::from_iter(expected_sequences), ); } @@ -424,6 +476,8 @@ mod tests { 
conn, op_conn, &collection, + "", + "new sample", "m123", 2, 5, @@ -434,6 +488,8 @@ mod tests { conn, op_conn, &collection, + "", + "new sample", "m123", 6, 12, @@ -444,8 +500,17 @@ mod tests { "ATAAAAAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), "ATAAAATTTTTTTTGATCGATCGATCGGGAACACACAGAGA".to_string(), ]; + let block_groups = BlockGroup::query( + conn, + "select * from block_groups where collection_name = ?1 AND sample_name = ?2;", + vec![ + SQLValue::from(collection), + SQLValue::from("new sample".to_string()), + ], + ); + assert_eq!(block_groups.len(), 1); assert_eq!( - BlockGroup::get_all_sequences(conn, 1, false), + BlockGroup::get_all_sequences(conn, block_groups[0].id, false), HashSet::from_iter(expected_sequences), ); } @@ -481,6 +546,8 @@ mod tests { conn, op_conn, &collection, + "", + "new sample", "m123", 2, 5, @@ -491,6 +558,8 @@ mod tests { conn, op_conn, &collection, + "", + "new sample", "m123", 4, 6, @@ -501,8 +570,17 @@ mod tests { "ATAAAAAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), "ATAAAAAAAAAAAAAATCGATCGATCGATCGGGAACACACAGAGA".to_string(), ]; + let block_groups = BlockGroup::query( + conn, + "select * from block_groups where collection_name = ?1 AND sample_name = ?2;", + vec![ + SQLValue::from(collection), + SQLValue::from("new sample".to_string()), + ], + ); + assert_eq!(block_groups.len(), 1); assert_eq!( - BlockGroup::get_all_sequences(conn, 1, false), + BlockGroup::get_all_sequences(conn, block_groups[0].id, false), HashSet::from_iter(expected_sequences), ); } From 5959b5ad4c72c826e5c9d7fa2bc33fa63c58942f Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 6 Nov 2024 18:27:31 -0500 Subject: [PATCH 19/22] Fix some bugs --- src/main.rs | 2 +- src/models/path.rs | 20 -------------------- src/operation_management.rs | 18 +++++++++--------- src/updates/vcf.rs | 6 +++--- 4 files changed, 13 insertions(+), 33 deletions(-) diff --git a/src/main.rs b/src/main.rs index 9573590..e899760 100644 --- a/src/main.rs +++ b/src/main.rs @@ -22,7 +22,7 @@ use std::str; #[derive(Parser)] #[command(version, about, long_about = None)] struct Cli { - // The path to the database you wish to utilize + /// The path to the database you wish to utilize #[arg(short, long)] db: Option, #[command(subcommand)] diff --git a/src/models/path.rs b/src/models/path.rs index 5e26791..decd87d 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -4,7 +4,6 @@ use std::collections::{HashMap, HashSet}; use intervaltree::IntervalTree; use itertools::Itertools; use rusqlite::types::Value; -use rusqlite::Params; use rusqlite::{params_from_iter, Connection}; use serde::{Deserialize, Serialize}; @@ -148,25 +147,6 @@ impl Path { rows.next().unwrap().unwrap() } - pub fn full_query(conn: &Connection, query: &str, params: impl Params) -> Vec { - let mut stmt = conn.prepare(query).unwrap(); - let rows = stmt - .query_map(params, |row| { - let path_id = row.get(0).unwrap(); - Ok(Path { - id: path_id, - block_group_id: row.get(1)?, - name: row.get(2)?, - }) - }) - .unwrap(); - let mut paths = vec![]; - for row in rows { - paths.push(row.unwrap()); - } - paths - } - pub fn get_paths(conn: &Connection, query: &str, placeholders: Vec) -> Vec { let mut stmt = conn.prepare(query).unwrap(); let rows = stmt diff --git a/src/operation_management.rs b/src/operation_management.rs index 7732b07..da4c861 100644 --- a/src/operation_management.rs +++ b/src/operation_management.rs @@ -1000,7 +1000,7 @@ mod tests { let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len(); assert_eq!(edge_count, 2); 
assert_eq!(node_count, 3); - assert_eq!(sample_count, 0); + assert_eq!(sample_count, 1); assert_eq!(op_count, 1); update_with_vcf( &vcf_path.to_str().unwrap().to_string(), @@ -1029,7 +1029,7 @@ mod tests { // * 1 node created by the initial fasta import // * 2 nodes created by the VCF update. See above explanation of edge count for more details. assert_eq!(node_count, 5); - assert_eq!(sample_count, 3); + assert_eq!(sample_count, 4); assert_eq!(op_count, 2); // revert back to state 1 where vcf samples and blockpaths do not exist @@ -1047,7 +1047,7 @@ mod tests { let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len(); assert_eq!(edge_count, 2); assert_eq!(node_count, 3); - assert_eq!(sample_count, 0); + assert_eq!(sample_count, 1); assert_eq!(op_count, 2); apply_changeset( @@ -1063,7 +1063,7 @@ mod tests { let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len(); assert_eq!(edge_count, 10); assert_eq!(node_count, 5); - assert_eq!(sample_count, 3); + assert_eq!(sample_count, 4); assert_eq!(op_count, 2); } @@ -1230,7 +1230,7 @@ mod tests { let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len(); assert_eq!(edge_count, 2); assert_eq!(node_count, 3); - assert_eq!(sample_count, 0); + assert_eq!(sample_count, 1); assert_eq!(op_count, 1); let branch_1 = Branch::create(operation_conn, &db_uuid, "branch_1"); @@ -1258,7 +1258,7 @@ mod tests { let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len(); assert_eq!(edge_count, 10); assert_eq!(node_count, 5); - assert_eq!(sample_count, 3); + assert_eq!(sample_count, 4); assert_eq!(op_count, 2); // checkout branch 2 @@ -1282,7 +1282,7 @@ mod tests { let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len(); assert_eq!(edge_count, 2); assert_eq!(node_count, 3); - assert_eq!(sample_count, 0); + assert_eq!(sample_count, 1); assert_eq!(op_count, 2); // apply vcf2 @@ -1301,7 +1301,7 @@ mod tests { let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len(); assert_eq!(edge_count, 6); assert_eq!(node_count, 4); - assert_eq!(sample_count, 1); + assert_eq!(sample_count, 2); assert_eq!(op_count, 3); // migrate to branch 1 again @@ -1323,7 +1323,7 @@ mod tests { let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len(); assert_eq!(edge_count, 10); assert_eq!(node_count, 5); - assert_eq!(sample_count, 3); + assert_eq!(sample_count, 4); assert_eq!(op_count, 3); } diff --git a/src/updates/vcf.rs b/src/updates/vcf.rs index 701619c..9db776f 100644 --- a/src/updates/vcf.rs +++ b/src/updates/vcf.rs @@ -244,7 +244,7 @@ pub fn update_with_vcf<'a>( collection_name, &fixed_sample, seq_name.clone(), - coordinate_frame, + Some(""), ); let sample_bg_id = sample_bg_id.expect("can't find sample bg....check this out more"); @@ -301,7 +301,7 @@ pub fn update_with_vcf<'a>( collection_name, &sample_names[sample_index], seq_name.clone(), - coordinate_frame, + Some(""), ); let sample_bg_id = @@ -977,7 +977,7 @@ mod tests { ); assert_eq!( - BlockGroup::get_all_sequences(conn, get_sample_bg(conn, None).id, true), + BlockGroup::get_all_sequences(conn, get_sample_bg(conn, "").id, true), HashSet::from_iter(vec!["ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string()]) ); assert_eq!( From 42ba7d2c5365422b2049da8c6fb5fdf785ee328e Mon Sep 17 00:00:00 2001 From: hofer Date: Thu, 7 Nov 2024 16:29:10 -0500 Subject: [PATCH 20/22] Fix tests --- src/exports/fasta.rs | 35 +++++++++------ 
src/imports/fasta.rs | 5 +-- src/main.rs | 4 +- src/operation_management.rs | 18 ++++---- src/updates/fasta.rs | 85 ++++++++++++++++++++----------------- src/updates/vcf.rs | 6 +-- 6 files changed, 84 insertions(+), 69 deletions(-) diff --git a/src/exports/fasta.rs b/src/exports/fasta.rs index 9a1d1e5..159dd17 100644 --- a/src/exports/fasta.rs +++ b/src/exports/fasta.rs @@ -9,17 +9,26 @@ use crate::models::traits::*; pub fn export_fasta( conn: &Connection, collection_name: &str, - sample_name: &str, + sample_name: Option<&str>, filename: &PathBuf, ) { - let block_groups = BlockGroup::query( - conn, - "select * from block_groups where collection_name = ?1 AND sample_name = ?2;", - vec![ - SQLValue::from(collection_name.to_string()), - SQLValue::from(sample_name.to_string()), - ], - ); + let block_groups; + if let Some(sample_name) = sample_name { + block_groups = BlockGroup::query( + conn, + "select * from block_groups where collection_name = ?1 AND sample_name = ?2;", + vec![ + SQLValue::from(collection_name.to_string()), + SQLValue::from(sample_name.to_string()), + ], + ); + } else { + block_groups = BlockGroup::query( + conn, + "select * from block_groups where collection_name = ?1 AND sample_name IS NULL;", + vec![SQLValue::from(collection_name.to_string())], + ); + } let file = File::create(filename).unwrap(); let mut writer = fasta::io::Writer::new(file); @@ -71,7 +80,7 @@ mod tests { ); let tmp_dir = TempDir::new("export_fasta_files").unwrap().into_path(); let filename = tmp_dir.join("out.fa"); - export_fasta(conn, &collection, "", &filename); + export_fasta(conn, &collection, None, &filename); let mut fasta_reader = fasta::io::reader::Builder .build_from_path(filename) @@ -121,8 +130,8 @@ mod tests { conn, op_conn, &collection, - "", - "new sample", + None, + "child sample", "m123", 2, 5, @@ -131,7 +140,7 @@ mod tests { let tmp_dir = TempDir::new("export_fasta_files").unwrap().into_path(); let filename = tmp_dir.join("out.fa"); - export_fasta(conn, &collection, "new sample", &filename); + export_fasta(conn, &collection, Some("child sample"), &filename); let mut fasta_reader = fasta::io::reader::Builder .build_from_path(filename) diff --git a/src/imports/fasta.rs b/src/imports/fasta.rs index d3220fc..4244563 100644 --- a/src/imports/fasta.rs +++ b/src/imports/fasta.rs @@ -11,7 +11,6 @@ use crate::models::{ metadata, node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}, path::Path, - sample::Sample, sequence::Sequence, strand::Strand, }; @@ -49,8 +48,6 @@ pub fn import_fasta( name: name.to_string(), } }; - let sample_name = ""; - let _sample = Sample::create(conn, sample_name); let mut summary: HashMap = HashMap::new(); for result in reader.records() { @@ -82,7 +79,7 @@ pub fn import_fasta( hash = seq.hash )), ); - let block_group = BlockGroup::create(conn, &collection.name, Some(sample_name), &name); + let block_group = BlockGroup::create(conn, &collection.name, None, &name); let edge_into = Edge::create( conn, PATH_START_NODE_ID, diff --git a/src/main.rs b/src/main.rs index e899760..00537cb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -281,7 +281,7 @@ fn main() { &conn, &operation_conn, name, - &sample.clone().unwrap_or("".to_string()), + sample.clone().as_deref(), &new_sample.clone().unwrap(), ®ion_name.clone().unwrap(), start.unwrap(), @@ -436,7 +436,7 @@ fn main() { export_fasta( &conn, name, - &sample.clone().unwrap_or("".to_string()), + sample.clone().as_deref(), &PathBuf::from(fasta_path), ); } else { diff --git a/src/operation_management.rs b/src/operation_management.rs index 
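
To make the Option<&str> convention above concrete: block groups imported straight from FASTA now carry a NULL sample_name, so export_fasta (and, via the main.rs change, the CLI's optional --sample flag through sample.clone().as_deref()) distinguishes the base assembly from a derived sample by passing None or Some(name). A minimal sketch of the two call shapes, mirroring the updated tests in this hunk; the output file names are illustrative only:

    // Export the unedited base assembly (block groups whose sample_name IS NULL).
    export_fasta(conn, &collection, None, &PathBuf::from("base.fa"));

    // Export a sample derived from it by update_with_fasta, addressed by name.
    export_fasta(conn, &collection, Some("child sample"), &PathBuf::from("edited.fa"));
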
da4c861..7732b07 100644 --- a/src/operation_management.rs +++ b/src/operation_management.rs @@ -1000,7 +1000,7 @@ mod tests { let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len(); assert_eq!(edge_count, 2); assert_eq!(node_count, 3); - assert_eq!(sample_count, 1); + assert_eq!(sample_count, 0); assert_eq!(op_count, 1); update_with_vcf( &vcf_path.to_str().unwrap().to_string(), @@ -1029,7 +1029,7 @@ mod tests { // * 1 node created by the initial fasta import // * 2 nodes created by the VCF update. See above explanation of edge count for more details. assert_eq!(node_count, 5); - assert_eq!(sample_count, 4); + assert_eq!(sample_count, 3); assert_eq!(op_count, 2); // revert back to state 1 where vcf samples and blockpaths do not exist @@ -1047,7 +1047,7 @@ mod tests { let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len(); assert_eq!(edge_count, 2); assert_eq!(node_count, 3); - assert_eq!(sample_count, 1); + assert_eq!(sample_count, 0); assert_eq!(op_count, 2); apply_changeset( @@ -1063,7 +1063,7 @@ mod tests { let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len(); assert_eq!(edge_count, 10); assert_eq!(node_count, 5); - assert_eq!(sample_count, 4); + assert_eq!(sample_count, 3); assert_eq!(op_count, 2); } @@ -1230,7 +1230,7 @@ mod tests { let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len(); assert_eq!(edge_count, 2); assert_eq!(node_count, 3); - assert_eq!(sample_count, 1); + assert_eq!(sample_count, 0); assert_eq!(op_count, 1); let branch_1 = Branch::create(operation_conn, &db_uuid, "branch_1"); @@ -1258,7 +1258,7 @@ mod tests { let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len(); assert_eq!(edge_count, 10); assert_eq!(node_count, 5); - assert_eq!(sample_count, 4); + assert_eq!(sample_count, 3); assert_eq!(op_count, 2); // checkout branch 2 @@ -1282,7 +1282,7 @@ mod tests { let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len(); assert_eq!(edge_count, 2); assert_eq!(node_count, 3); - assert_eq!(sample_count, 1); + assert_eq!(sample_count, 0); assert_eq!(op_count, 2); // apply vcf2 @@ -1301,7 +1301,7 @@ mod tests { let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len(); assert_eq!(edge_count, 6); assert_eq!(node_count, 4); - assert_eq!(sample_count, 2); + assert_eq!(sample_count, 1); assert_eq!(op_count, 3); // migrate to branch 1 again @@ -1323,7 +1323,7 @@ mod tests { let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len(); assert_eq!(edge_count, 10); assert_eq!(node_count, 5); - assert_eq!(sample_count, 4); + assert_eq!(sample_count, 3); assert_eq!(op_count, 3); } diff --git a/src/updates/fasta.rs b/src/updates/fasta.rs index 9788c3c..4dc81c0 100644 --- a/src/updates/fasta.rs +++ b/src/updates/fasta.rs @@ -21,7 +21,7 @@ pub fn update_with_fasta( conn: &Connection, operation_conn: &Connection, collection_name: &str, - parent_sample_name: &str, + parent_sample_name: Option<&str>, new_sample_name: &str, region_name: &str, start_coordinate: i64, @@ -40,14 +40,23 @@ pub fn update_with_fasta( let mut fasta_reader = fasta::io::reader::Builder.build_from_path(fasta_file_path)?; let _new_sample = Sample::create(conn, new_sample_name); - let block_groups = BlockGroup::query( - conn, - "select * from block_groups where collection_name = ?1 AND sample_name = ?2;", - vec![ - SQLValue::from(collection_name.to_string()), - 
SQLValue::from(parent_sample_name.to_string()), - ], - ); + let block_groups; + if let Some(parent_name) = parent_sample_name { + block_groups = BlockGroup::query( + conn, + "select * from block_groups where collection_name = ?1 AND sample_name = ?2;", + vec![ + SQLValue::from(collection_name.to_string()), + SQLValue::from(parent_name.to_string()), + ], + ); + } else { + block_groups = BlockGroup::query( + conn, + "select * from block_groups where collection_name = ?1 AND sample_name IS NULL;", + vec![SQLValue::from(collection_name.to_string())], + ); + } let mut new_block_group_id = 0; for block_group in block_groups { @@ -56,7 +65,7 @@ pub fn update_with_fasta( collection_name, new_sample_name, &block_group.name, - Some(parent_sample_name), + parent_sample_name, ) .unwrap(); BlockGroup::clone(conn, block_group.id, new_bg_id); @@ -194,8 +203,8 @@ mod tests { conn, op_conn, &collection, - "", - "new sample", + None, + "child sample", "m123", 2, 5, @@ -211,7 +220,7 @@ mod tests { "select * from block_groups where collection_name = ?1 AND sample_name = ?2;", vec![ SQLValue::from(collection), - SQLValue::from("new sample".to_string()), + SQLValue::from("child sample".to_string()), ], ); assert_eq!(block_groups.len(), 1); @@ -254,8 +263,8 @@ mod tests { conn, op_conn, &collection, - "", - "new sample", + None, + "child sample", "m123", 2, 5, @@ -266,8 +275,8 @@ mod tests { conn, op_conn, &collection, - "", - "new sample", + Some("child sample"), + "grandchild sample", "m123", 4, 6, @@ -283,7 +292,7 @@ mod tests { "select * from block_groups where collection_name = ?1 AND sample_name = ?2;", vec![ SQLValue::from(collection), - SQLValue::from("new sample".to_string()), + SQLValue::from("grandchild sample".to_string()), ], ); assert_eq!(block_groups.len(), 1); @@ -326,8 +335,8 @@ mod tests { conn, op_conn, &collection, - "", - "new sample", + None, + "child sample", "m123", 2, 5, @@ -338,8 +347,8 @@ mod tests { conn, op_conn, &collection, - "", - "new sample", + Some("child sample"), + "grandchild sample", "m123", 1, 6, @@ -355,7 +364,7 @@ mod tests { "select * from block_groups where collection_name = ?1 AND sample_name = ?2;", vec![ SQLValue::from(collection), - SQLValue::from("new sample".to_string()), + SQLValue::from("grandchild sample".to_string()), ], ); assert_eq!(block_groups.len(), 1); @@ -404,8 +413,8 @@ mod tests { conn, op_conn, &collection, - "", - "new sample", + None, + "child sample", "m123", 2, 5, @@ -416,8 +425,8 @@ mod tests { conn, op_conn, &collection, - "", - "new sample", + Some("child sample"), + "grandchild sample", "m123", 1, 12, @@ -433,7 +442,7 @@ mod tests { "select * from block_groups where collection_name = ?1 AND sample_name = ?2;", vec![ SQLValue::from(collection), - SQLValue::from("new sample".to_string()), + SQLValue::from("grandchild sample".to_string()), ], ); assert_eq!(block_groups.len(), 1); @@ -476,8 +485,8 @@ mod tests { conn, op_conn, &collection, - "", - "new sample", + None, + "child sample", "m123", 2, 5, @@ -488,8 +497,8 @@ mod tests { conn, op_conn, &collection, - "", - "new sample", + Some("child sample"), + "grandchild sample", "m123", 6, 12, @@ -505,7 +514,7 @@ mod tests { "select * from block_groups where collection_name = ?1 AND sample_name = ?2;", vec![ SQLValue::from(collection), - SQLValue::from("new sample".to_string()), + SQLValue::from("grandchild sample".to_string()), ], ); assert_eq!(block_groups.len(), 1); @@ -546,8 +555,8 @@ mod tests { conn, op_conn, &collection, - "", - "new sample", + None, + "child sample", "m123", 2, 5, @@ 
-558,8 +567,8 @@ mod tests { conn, op_conn, &collection, - "", - "new sample", + Some("child sample"), + "grandchild sample", "m123", 4, 6, @@ -575,7 +584,7 @@ mod tests { "select * from block_groups where collection_name = ?1 AND sample_name = ?2;", vec![ SQLValue::from(collection), - SQLValue::from("new sample".to_string()), + SQLValue::from("grandchild sample".to_string()), ], ); assert_eq!(block_groups.len(), 1); diff --git a/src/updates/vcf.rs b/src/updates/vcf.rs index 9db776f..701619c 100644 --- a/src/updates/vcf.rs +++ b/src/updates/vcf.rs @@ -244,7 +244,7 @@ pub fn update_with_vcf<'a>( collection_name, &fixed_sample, seq_name.clone(), - Some(""), + coordinate_frame, ); let sample_bg_id = sample_bg_id.expect("can't find sample bg....check this out more"); @@ -301,7 +301,7 @@ pub fn update_with_vcf<'a>( collection_name, &sample_names[sample_index], seq_name.clone(), - Some(""), + coordinate_frame, ); let sample_bg_id = @@ -977,7 +977,7 @@ mod tests { ); assert_eq!( - BlockGroup::get_all_sequences(conn, get_sample_bg(conn, "").id, true), + BlockGroup::get_all_sequences(conn, get_sample_bg(conn, None).id, true), HashSet::from_iter(vec!["ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string()]) ); assert_eq!( From b2d736a1f98ebde44728a41c1ddaa27ca420497c Mon Sep 17 00:00:00 2001 From: hofer Date: Thu, 7 Nov 2024 16:44:12 -0500 Subject: [PATCH 21/22] Address comments --- src/models/block_group.rs | 2 +- src/updates/fasta.rs | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/models/block_group.rs b/src/models/block_group.rs index 87ea38d..4aac2f2 100644 --- a/src/models/block_group.rs +++ b/src/models/block_group.rs @@ -180,7 +180,7 @@ impl BlockGroup { pub fn clone(conn: &Connection, source_block_group_id: i64, target_block_group_id: i64) { let existing_paths = Path::get_paths( conn, - "SELECT * from paths where block_group_id = ?1", + "SELECT * from paths where block_group_id = ?1 ORDER BY id ASC;", vec![SQLValue::from(source_block_group_id)], ); diff --git a/src/updates/fasta.rs b/src/updates/fasta.rs index 4dc81c0..704158d 100644 --- a/src/updates/fasta.rs +++ b/src/updates/fasta.rs @@ -68,7 +68,6 @@ pub fn update_with_fasta( parent_sample_name, ) .unwrap(); - BlockGroup::clone(conn, block_group.id, new_bg_id); if block_group.name == region_name { new_block_group_id = new_bg_id; } From 11c50f012e291665821743229673b38f62cc9136 Mon Sep 17 00:00:00 2001 From: hofer Date: Thu, 7 Nov 2024 17:02:09 -0500 Subject: [PATCH 22/22] Update notebook --- .../edit-yeast-and-export-fasta.ipynb | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb b/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb index a726bc4..7a7c432 100644 --- a/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb +++ b/examples/yeast_editing/edit-yeast-and-export-fasta.ipynb @@ -30,16 +30,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "--2024-10-31 11:34:49-- http://sgd-archive.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_R64-1-1_20110203.tgz\n", - "Resolving sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)... 52.92.138.67, 52.92.208.19, 52.92.133.227, ...\n", - "Connecting to sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)|52.92.138.67|:80... 
connected.\n", + "--2024-11-07 16:57:50-- http://sgd-archive.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_R64-1-1_20110203.tgz\n", + "Resolving sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)... 52.92.229.203, 52.92.136.147, 52.218.218.50, ...\n", + "Connecting to sgd-archive.yeastgenome.org (sgd-archive.yeastgenome.org)|52.92.229.203|:80... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 17152288 (16M) [application/x-tar]\n", "Saving to: ‘S288C_reference_genome_R64-1-1_20110203.tgz’\n", "\n", - "S288C_reference_gen 100%[===================>] 16.36M 739KB/s in 18s \n", + "S288C_reference_gen 100%[===================>] 16.36M 1.12MB/s in 18s \n", "\n", - "2024-10-31 11:35:07 (948 KB/s) - ‘S288C_reference_genome_R64-1-1_20110203.tgz’ saved [17152288/17152288]\n", + "2024-11-07 16:58:08 (945 KB/s) - ‘S288C_reference_genome_R64-1-1_20110203.tgz’ saved [17152288/17152288]\n", "\n" ] } @@ -166,12 +166,12 @@ } ], "source": [ - "!gen update --fasta cassette-edit.fa --start 3 --end 5 --contig-name ref\\|NC_001138\\|" + "!gen update --fasta cassette-edit.fa --new-sample edited-sample --start 3 --end 5 --region-name ref\\|NC_001138\\|" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "id": "2fc670e3-a9a6-4155-b0dc-0d313bb81311", "metadata": {}, "outputs": [ @@ -184,12 +184,12 @@ } ], "source": [ - "!gen export --fasta edited-yeast-genome.fa" + "!gen export --fasta edited-yeast-genome.fa --sample edited-sample" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "id": "723c463a-6159-4257-b34d-bd878efea004", "metadata": {}, "outputs": [ @@ -213,7 +213,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 14, "id": "98c8c254-4304-495d-8a9e-3505d21cafe7", "metadata": {}, "outputs": [ @@ -221,9 +221,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "CCCGCGGCGGGCGGACCCCGAAGGAGTGAGGGACCCCTCCCTAATGGGAGGGGGACCGAACCCCTTTTTAAGAAGGAGTC\n", - "CATATATATATATTAATAAAAAAAAGTAATATATATATATATATTGGAATAGTTATATTATTATACAGAAATATGCTTAA\n", - "TTATAATATAATATCCATA\n", + "TTTAGTGTTTGTTGCACGGCAGTAGCGAGAGACAAGTGGGAAAGAGTAGGATAAAAAGACAATCTATAAAAAGTAAACAT\n", + "AAAATAAAGGTAGTAAGTAGCTTTTGGTTGAACATCCGGGTAAGAGACAACAGGGCTTGGAGGAGACGTACATGAGGGCT\n", + "ATTTAGGGCTATTTAGGGCTATGTAGAAGTGCTGTAGGGCTAAAGAACAGGGTTTCATTTTCATTTTTTTTTTT\n", ">ref|NC_001138|\n", "GATATCGATCGCGCAAGTGCATTCCTAGACTTAATTCATATCTGCTCCTCAACTGTCGATGATGCCTGCTAAACTGCAGC\n", "TTGACGTACTGCGGACCCTGCAGTCCAGCGCTCGTCATGGAACGCAAACGCTGAAAAACTCCAACTTTCTCGAGCGCTTC\n",