From 57671cee2a7abde00fb93604cccff577680c2fbf Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Sat, 26 Oct 2024 11:17:28 +0300 Subject: [PATCH 01/23] Add scripsts to run case study report Signed-off-by: Awa Synthia --- inst/report/report_template.Rmd | 485 +++++++ inst/report/scripts/MolEvolData_class.R | 751 ++++++++++ inst/report/scripts/generate_report.R | 668 +++++++++ inst/report/scripts/run_molevolvr_pipeline.R | 1289 ++++++++++++++++++ inst/report/scripts/viz_utils.R | 742 ++++++++++ 5 files changed, 3935 insertions(+) create mode 100644 inst/report/report_template.Rmd create mode 100644 inst/report/scripts/MolEvolData_class.R create mode 100644 inst/report/scripts/generate_report.R create mode 100644 inst/report/scripts/run_molevolvr_pipeline.R create mode 100644 inst/report/scripts/viz_utils.R diff --git a/inst/report/report_template.Rmd b/inst/report/report_template.Rmd new file mode 100644 index 00000000..c42aa443 --- /dev/null +++ b/inst/report/report_template.Rmd @@ -0,0 +1,485 @@ +--- +title: MolEvolvR +output: + html_document: + self_contained: yes +params: + rs_interproscan_visualization: default + proximity_network: default + sunburst: default + data: default + queryDataTable: default + fastaDataText: default + heatmap: default + query_data: default + query_domarch_cols: default + query_iprDatabases: default + query_iprVisType: default + mainTable: default + DALinTable: default + DALinPlot: default + DANetwork: default + DA_Prot: default + domarch_cols: default + DA_Col: default + DACutoff: default + da_interproscan_visualization: default + phylo_sunburst_levels: default + phylo_sunburst: default + lineage_table: default + ## Tree + tree_msa_tool: default ##input + ## MSA + rep_accnums: default + msa_rep_num: default ##input + app_data: default + PhyloSelect: default ##input + acc_to_name: default + rval_phylo: default + query_pin: default + msa_reduce_by: default +--- + +--- +subtitle: "`r glue::glue('Retrieval Code: {params$query_pin}')`" +--- + + + + + +```{r mine,results="asis",engine="js", echo=FALSE} +// stumble through the tabset(s) +setTimeout(function(){ + document.querySelector('a#FullResultButton1').addEventListener('click', doIt); + function doIt(){ + document.querySelector('a[href*="#domain-architecture-2"]').click(); //Second(-2) Domain Architecture Section (reuse at different heading levels) + } +}, 100); // slow your roll + +setTimeout(function(){ + document.querySelector('a#FullResultButton2').addEventListener('click', doIt); + function doIt(){ + document.querySelector('a[href*="#phylogeny-1"]').click(); //Second Phylogeny(-1) Section (reuse at different heading levels) + } +}, 100); + +setTimeout(function(){ + document.querySelector('a#FullResultButton3').addEventListener('click', doIt); + function doIt(){ + document.querySelector('a[href*="#homolog-data"]').click(); //Homolog Data Section + } +}, 100); +``` + +```{r setup, include = FALSE} +## Globally Suppress warnings and messaging +knitr::opts_chunk$set(warning = FALSE, message = FALSE) +## Create Sunburst Report Function +cpcols <- c( + "#AFEEEE", "#DDA0DD", "#EE2C2C", "#CDBE70", "#B0B099", + "#8B2323", "#EE7600", "#EEC900", "chartreuse3", "#0000FF", + "#FFD900", "#32CD32", "maroon4", "cornflowerblue", "darkslateblue", + "#AB82FF", "#CD6889", "#FFA07A", "#FFFF00", "#228B22", + "#FFFFE0", "#FFEC8B", "peru", "#668B8B", "honeydew", + "#A020F0", "grey", "#8B4513", "#191970", "#00FF7F", + "lemonchiffon", "#66CDAA", "#5F9EA0", "#A2CD5A", "#556B2F", + "#EEAEEE", "thistle4", "#473C8B", "#FFB6C1", "#8B1C62", + "#FFE4B5", "black", "#FF7F50", "#FFB90F", "#FF69B4", "#836FFF", + "#757575", "#CD3333", "#EE7600", "#CDAD00", "#556B2F", "#7AC5CD" + ) +lineage_sunburst_report <- function(prot, lineage_column = "Lineage", + type = "sunburst", + levels = 2, colors = NULL, legendOrder = NULL, showLegend = TRUE, maxLevels = 5) { + lin_col <- sym(lineage_column) + + # ensure they don't exceed maxLevels, although this should + if (!is.null(maxLevels) && levels > maxLevels) { + levels <- maxLevels + } + + levels_vec <- c() + for (i in 1:levels) + { + levels_vec <- append(levels_vec, paste0("level", i)) + } + + # Take lineage column and break into the first to levels + prot <- prot %>% + select({{ lin_col }}) %>% + arrange(desc({{ lin_col }})) %>% + drop_na({{ lin_col }}) + protLevels <- prot %>% separate({{ lin_col }}, into = levels_vec, sep = ">") + # Count the occurrance of each group of levels + protLevels <- protLevels %>% + group_by_at(levels_vec) %>% + summarise(size = n()) + protLevels <- protLevels %>% arrange() + tree <- d3_nest(protLevels, value_cols = "size") + + # Plot sunburst + if (type == "sunburst") { + result <- sunburst(tree, legend = list(w = 225, h = 15, r = 5, s = 5), colors = cpcols, legendOrder = legendOrder) + } else if (type == "sund2b") { + result <- sund2b(tree) + } + + if (showLegend) { + return( + htmlwidgets::onRender( + result, + "function(el, x) { + jQuery('.sunburst-togglelegend', el) + .attr('checked', 'true') + .attr('data-html2canvas-ignore', 'true'); + jQuery('.sunburst-legend', el).css('visibility', ''); + + // create a button to download the sunburst + // (relies on html2canvas being included in the page) + + // FIXME: consider pulling this all out into a js library + // so that we can apply it to other components. + + const downloadBtn = jQuery('') + .css({'position': 'absolute', 'right': '5px', 'top': '5px'}) + .attr('data-html2canvas-ignore', 'true') + .appendTo(el); + + saveAs = (blob, fileName) => { + const link = document.createElement('a'); + link.download = fileName + link.href = URL.createObjectURL(blob); + link.click(); + URL.revokeObjectURL(link.href); + } + + downloadBtn.click(() => { + html2canvas(el, { scale: 4, logging: false }).then(canvas => { + canvas.toBlob(function(blob) { + saveAs(blob, 'sunburst.png'); + }); + }); + }); + }" + ) + ) + } + + return(result) +} +``` + +# {.tabset} + +## Results Summary + +An overview of the protein analysis. To view the full results, explore the additional tabs. + +### Domain Architecture + +Visualizations and summaries for protein domains. + +View full results + + +#### Interproscan Visualization + +```{r, echo=FALSE} +params$rs_interproscan_visualization +``` + +#### Proximity Network + +```{r, echo=FALSE} +params$proximity_network +``` + +### Phylogeny + +Visualizations for protein evolution. + +View full results + +#### Sunburst + +```{r, echo=FALSE} +library(dplyr) +library(tidyr) +lineage_sunburst_report(params$sunburst, "Lineage", levels = 2) +``` + +### Data + +Summary table of proteins including domain architectures, phylogeny, and homologs, when applicable. + +View full results + +```{r, echo=FALSE} +params$data +``` + + +## Query Data {.tabset} + +Input data, additional metadata, and preliminary analyses of query protein(s). + +### Data Table + +The data table provides a summary of the sequences submitted, or \"queried\", for analysis. The preview shown can be extended by using \"Add/remove column(s)\" to see info about other taxonomic classes as well as domain architecuture codes from databases other than the default (Pfam). + +```{r, echo=FALSE} +params$queryDataTable +``` + +### FASTA + +Uploaded amino acid FASTA sequence(s). + +`r glue::glue(params$fastaDataText)` + +### Query Heatmap + +A heatmap of submitted sequences and their respective taxonomic lineages. + +```{r, echo=FALSE} +params$heatmap +``` + +### Domain Architecture + +Visualizations and analyses of all query and homologous protein domains, structural or functional subunits, and their architectures. + +```{r, echo=FALSE} +if(is.null(params$query_iprDatabases)){ + choices <- params$query_domarch_cols + if ("Pfam" %in% choices & "Phobius" %in% choices) { + analysis_type <- c("Pfam", "Phobius") + } else { + default <- choices[1] + } + } else { + analysis_type <- params$query_iprDatabases + } + +if(is.null(params$query_iprVisType)){ + analysis_group <- 'Analysis' + } else { + analysis_group <- params$query_iprVisType + } + +if(length(params$query_domarch_cols) >= 1 ) { + DomArchPlot <- ipr2viz_web( + infile_ipr = params$query_data@ipr_path, + accessions = params$query_data@df$QueryName, + analysis = analysis_type, group_by = analysis_group, name = "Name" + ) + DomArchPlot +} else { + "No domains found in the input sequences." + } +``` + +## Homolog Data + +Full set of homologs of query sequences, including their lineage and domain architecture info. + +```{r, echo=FALSE} +params$mainTable +``` + +## Domain Architecture {.tabset} + +Summary and visualizations of protein motifs/subunits (domains) and their configurations within the query protein(s) (domain architectures). + +### Table + +```{r, echo=FALSE} +params$DALinTable +``` + +### Heatmap + +```{r, echo=FALSE} +params$DALinPlot +``` + +### Network + +```{r, echo=FALSE} +params$DANetwork +``` + +```{r, echo=FALSE} +if (length(params$domarch_cols) == 0) { + choices <- c("None") + vals <- "None" + selected <- "None" + } else { + vals <- params$domarch_cols + choices <- substring(vals, first = 9) + selected <- if (length(choices) >= 2) choices[1:2] else choices[1] + } + +if (is.null(params$DA_col)) { + da_col <- vals[[1]] + } else { + da_col <- params$DA_Col + } + +plot_data <- params$DA_Prot +plot_data[[da_col]] <- str_replace_all(plot_data[[da_col]], " ", "_") +wordcloud_element(plot_data, colname = da_col, cutoff = params$DACutoff, UsingRowsCutoff = F) +``` + +### Interproscan Visualization + +```{r, echo=FALSE} +params$da_interproscan_visualization +``` + +### UpSet Plot + +```{r, echo=FALSE} +upset_plot_data <- params$DA_Prot +upset_plot_data[[da_col]] <- str_replace_all(upset_plot_data[[da_col]], " ", "_") +final_plot <- upset.plot(upset_plot_data, colname = da_col, cutoff = params$DACutoff) +domains <- upset_plot_data %>% + dplyr::pull(da_col) +n_unique_domains <- domains %>% + unique() %>% + length() + +if (n_unique_domains > 1) { + final_plot + } else { + stringr::str_glue("UpSet plot requires more than 1 unique domain (only {n_unique_domains} present). Try selecting more proteins.") + } +``` + +## Phylogeny {.tabset} + +Visualizations of phyletic patterns, sequence similarity, and evolution of related proteins. + +### Sunburst + +```{r, echo=FALSE} +lineage_sunburst_report(params$phylo_sunburst, lineage_column = "Lineage_long_na", type = "sunburst", levels = params$phylo_sunburst_levels) +``` + +### Tree + +```{r, include=FALSE} +if (is.null(params$tree_msa_tool)) { + tree_msa_tool <- 'ClustalO' ## initialize with default value + } else { + tree_msa_tool <- params$tree_msa_tool + } + +if (is.null(params$msa_rep_num)) { + msa_rep_num <- 3 ## initialize with default value + } else { + msa_rep_num <- params$msa_rep_num + } +``` + +```{r, include=FALSE} +if(length(params$rep_accnums) >= 3 ) { + rep <- params$rep_accnums[1:msa_rep_num] + if (!params$rval_phylo) { + seqs <- readAAStringSet(params$app_data@fasta_path) + names(seqs) <- sub(" .*", "", names(seqs)) + query_accession <- params$app_data@df %>% filter(params$app_data@df$QueryName == params$PhyloSelect) + query_accession <- unique(query_accession$Query) + query <- seqs[query_accession] + names(query) <- params$PhyloSelect + query <- AAStringSet(query) + } + # Generate Fasta File + rep_fasta_path <- tempfile() + + acc2fa(rep, outpath = rep_fasta_path, "sequential") + rename_fasta(rep_fasta_path, rep_fasta_path, + replacement_function = map_acc2name, + acc2name = params$acc_to_name + ) + if (!params$rval_phylo) { + writeXStringSet(query, rep_fasta_path, append = TRUE) + } + + rep_msa_path <- tempfile() + alignFasta(rep_fasta_path, tree_msa_tool, rep_msa_path) +} +``` + +```{r, echo=FALSE} +if(length(params$rep_accnums) >= 3 ) { + seq_tree(fasta_filepath = rep_msa_path) + } else {"Not enough representative sequences: try changing the 'Reduce By' field."} +``` + +### MSA + +```{r, include=FALSE} +if(length(params$rep_accnums) >= 3 ) { + if (!params$rval_phylo) { + seqs <- readAAStringSet(params$app_data@fasta_path) + names(seqs) <- sub(" .*", "", names(seqs)) + query_accession <- params$app_data@df %>% filter(params$app_data@df$QueryName == params$PhyloSelect) + query_accession <- unique(query_accession$Query) + query <- seqs[query_accession] + names(query) <- params$PhyloSelect + query <- AAStringSet(query) + } + + # Generate Fasta File + rep_fasta_path <- tempfile() + acc2fa(rep, outpath = rep_fasta_path, "sequential") + rename_fasta(rep_fasta_path, rep_fasta_path, + replacement_function = map_acc2name, + acc2name = params$acc_to_name + ) + if (!params$rval_phylo) { + writeXStringSet(query, rep_fasta_path, append = TRUE) + } + + # Call MSA2PDF + msa_pdf_path <- tempfile() + msa_prefix <- "/data/research/jravilab/molevolvr_app/www/msa_figs/" + post_fix <- paste("msa", params$query_pin, params$PhyloSelect, params$msa_reduce_by, ".pdf", sep = "_") + + msa_pdf_path <- paste0(msa_prefix, post_fix) + + msa_pdf(fasta_path = rep_fasta_path, msa_pdf_path) + } +``` + +```{r msa_pdf, echo = FALSE, out.width = "95%", out.height = "800px"} +if(length(params$rep_accnums) >= 3 ) { + knitr::include_graphics(msa_pdf_path) + } else {"Not enough representative sequences: try changing the 'Reduce By' field."} +``` diff --git a/inst/report/scripts/MolEvolData_class.R b/inst/report/scripts/MolEvolData_class.R new file mode 100644 index 00000000..c50e2925 --- /dev/null +++ b/inst/report/scripts/MolEvolData_class.R @@ -0,0 +1,751 @@ +# Author(s): Samuel Chen +# Last modified: 2020 + +setClass("MolEvolData", + slots = list( + df = "data.frame", + fasta_path = "character", + msa_path = "character", + ipr_path = "character", + queries = "character", + cln_path = "character", + domainSeqs = "character", + domain_ipr_path = "character" + ) +) +setClass("queryData", + slots = list( + df = "data.frame", + fasta_path = "character", + msa_path = "character", + ipr_path = "character", + queries = "character", + cln_path = "character" + ) +) +setClass("blastUpload", + slots = list( + df = "character", + seqs = "character" + ) +) +setClass("iprUpload", + slots = list( + df = "character", + seqs = "character" + ) +) +setClass("seqUpload", + slots = list( + seqs = "character" + ) +) + +# Group by lineage + DA then take top 20 +top_acc <- function(cln_file, DA_col = "DomArch.Pfam", + lin_col = "Lineage", n = 20) { + lin_sym <- sym(lin_col) + DA_sym <- sym(DA_col) + + cln <- fread(cln_file, sep = "\t", fill = T) + + grouped <- cln %>% + group_by({{ lin_sym }}, {{ DA_sym }}) %>% + summarise(count = n()) %>% + arrange(-count) %>% + filter(!is.na({{ lin_sym }}) & !is.na({{ DA_sym }})) + + top_acc <- character(n) + for (r in 1:min(nrow(grouped), n)) + { + l <- (grouped %>% pull({{ lin_sym }}))[r] + DA <- (grouped %>% pull({{ DA_sym }}))[r] + + filt <- cln %>% filter({{ lin_sym }} == l & {{ DA_sym }} == DA) + + top <- filt[which(filt$PcPositive == max(filt$PcPositive))[1], ] + + top_acc[r] <- top$AccNum + } + top_acc <- top_acc[which(top_acc != "")] + return(top_acc) +} + + +combine_files_nopmap <- function(inpath, pattern, outpath, + delim = "\t", skip = 0, + col_names) { + source_files <- dir(path = inpath, pattern = pattern, recursive = T) + + source_files_path <- paste0(inpath, source_files) + + dt_list <- map(source_files_path, function(x) { + if (!grepl("query", x)) { + fread(x, sep = delim, skip = skip, fill = T) # , col.names = col_names) + } + }) + + combined <- rbindlist(dt_list, fill = T) + + fwrite(combined, outpath, sep = "\t") + + return(combined) +} + + +full_analysis_colnames <- c("AccNum", "QueryName", "STitle", "Species.x", + "TaxID.x", "Lineage", "PcPositive", "PcIdentity", + "AlnLength", "SAccNum", "SAllSeqID", "Mismatch", + "GapOpen", "QStart", "QEnd", "QLength", "SStart", + "SEnd", "SLength", "EValue", + "BitScore", "PcPosOrig", + "Name", "ClusterID", "DomArch.Pfam", + "DomArch.SMART", + "DomArch.CDD", "DomArch.TIGRFAM", + "DomArch.Phobius", + "DomArch.Gene3D", "DomArch.TMHMM", + "DomArch.SignalP_EUK", + "DomArch.SignalP_GRAM_NEGATIVE", + "DomArch.SignalP_GRAM_POSITIVE", + "Description", "Length", "TaxID.y", + "Species.y", "SourceDB", "Completeness") + + +process_wrapper_dir <- function(path, pinName, type = "full") { + if (type == "full") { + if (file.exists(paste0(path, "/cln_combined.tsv")) && file.exists(paste0(path, "/ipr_combined.tsv"))) { + query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv")) + query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv") + query_msa_path <- paste0(path, "/query_data/query_seqs.msa") + query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa") + query_data$Query <- query_data$AccNum + queries <- unique(query_data$Query) + cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv") + query_data <- query_data %>% + mutate(QueryName = Name) %>% + select(QueryName, Species, contains("Lineage"), contains("DomArch"), Query, Name) %>% + distinct(QueryName, .keep_all = TRUE) + query_wrapper_data <- new("queryData", + df = query_data, fasta_path = query_sequence_path, + msa_path = query_msa_path, ipr_path = query_ipr_path, queries = queries, cln_path = cln_path + ) + com_blast_data <- read_tsv(path, "/blast_combined.tsv") %>% arrange(desc(PcPositive)) + ipr_blast_path <- paste0(path, "/ipr_combined.tsv") + queries <- unique(com_blast_data$Query) + com_blast_data$Lineage <- as.factor(com_blast_data$Lineage) + com_blast_data <- com_blast_data %>% select(QueryName, everything()) + wrapper_data <- new("MolEvolData", + df = com_blast_data, queries = queries, ipr_path = ipr_blast_path, cln_path = com_blast_path, + fasta_path = query_sequence_path, msa_path = query_msa_path, domainSeqs = "", domain_ipr_path = "" + ) + + return(list(wrapper_data, query_wrapper_data)) + } else { + + # Get the query data + query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv")) + query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv") + query_msa_path <- paste0(path, "/query_data/query_seqs.msa") + query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa") + query_data$Query <- query_data$AccNum + # alignFasta(query_sequence_path, tool = "ClustalO", outpath = query_msa_path) + queries <- unique(query_data$Query) + cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv") + query_data <- query_data %>% + mutate(QueryName = Name) %>% + select(QueryName, Species, contains("Lineage"), contains("DomArch"), Query, Name) %>% + distinct(QueryName, .keep_all = TRUE) + query_wrapper_data <- new("queryData", + df = query_data, fasta_path = query_sequence_path, + msa_path = query_msa_path, ipr_path = query_ipr_path, queries = queries, cln_path = cln_path + ) + + query_data <- query_data %>% select(Query, QueryName) + + com_blast_path <- paste0(path, "/blast_combined.tsv") + ipr_blast_path <- paste0(path, "/ipr_combined.tsv") + com_blast_data <- combine_files_nopmap(paste0(path, "/"), + pattern = "*.full_analysis.tsv", skip = 0, + col_names = c(), outpath = com_blast_path, delim = "\t" + ) + + ipr_blast_data <- combine_files_nopmap(paste0(path, "/"), + pattern = "*.iprscan_cln.tsv", skip = 0, + col_names = ipr_colnames, outpath = ipr_blast_path, delim = "\t" + ) + com_blast_data <- merge(com_blast_data, query_data, by = "Query") + com_blast_data <- com_blast_data %>% arrange(desc(PcPositive)) + fwrite(com_blast_data, com_blast_path, sep = "\t") + + queries <- unique(com_blast_data$Query) + + com_blast_data$Lineage <- as.factor(com_blast_data$Lineage) + com_blast_data <- com_blast_data %>% select(QueryName, everything()) + + wrapper_data <- new("MolEvolData", + df = com_blast_data, queries = queries, ipr_path = ipr_blast_path, cln_path = com_blast_path, + fasta_path = query_sequence_path, msa_path = query_msa_path, domainSeqs = "", domain_ipr_path = "" + ) + + return(list(wrapper_data, query_wrapper_data)) + } + } else if (type == "dblast") { + query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv")) + query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv") + query_msa_path <- paste0(path, "/query_data/query_seqs.msa") + query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa") + alignFasta(query_sequence_path, tool = "ClustalO", outpath = query_msa_path) + cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv") + query_data <- query_data %>% mutate(Query = AccNum) + query_data <- query_data %>% + mutate(QueryName = Name) %>% + select(QueryName, Species, contains("Lineage"), contains("DomArch"), + Query, Name, AccNum) %>% + distinct(QueryName, .keep_all = TRUE) + queries <- unique(query_data$Query) + query_wrapper_data <- new("queryData", + df = query_data, + fasta_path = query_sequence_path, + msa_path = query_msa_path, + ipr_path = query_ipr_path, + queries = queries, cln_path = cln_path + ) + query_data <- query_data %>% select(Query, QueryName) + com_blast_path <- paste0(path, "/blast_combined.tsv") + com_blast_data <- combine_files_nopmap(paste0(path, "/"), + pattern = "*.blast.cln.tsv", + skip = 0, + col_names = c(), + outpath = com_blast_path, + delim = "\t" + ) + com_blast_data <- merge(com_blast_data, query_data, by = "Query") + queries <- unique(com_blast_data$Query) + com_blast_data$Lineage <- as.factor(com_blast_data$Lineage) + com_blast_data <- com_blast_data %>% + select(QueryName, everything()) %>% + arrange(desc(PcPositive)) + wrapper_data <- new("MolEvolData", + df = com_blast_data, queries = queries, + ipr_path = "", cln_path = com_blast_path, + fasta_path = query_sequence_path, + msa_path = query_msa_path, domainSeqs = "", + domain_ipr_path = "" + ) + return(list(wrapper_data, query_wrapper_data)) + } else if (type == "phylo") { + query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv")) + query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv") + query_msa_path <- paste0(path, "/query_data/query_seqs.msa") + cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv") + query_data <- query_data %>% mutate(Query = AccNum) + query_data <- query_data %>% + mutate(QueryName = Name) %>% + select(QueryName, Species, contains("Lineage"), contains("DomArch"), + Query, Name, AccNum) %>% + distinct(QueryName, .keep_all = TRUE) + queries <- unique(query_data$Query) + query_wrapper_data <- new("queryData", + df = query_data, + fasta_path = query_sequence_path, + msa_path = query_msa_path, + ipr_path = query_ipr_path, + queries = queries, cln_path = cln_path + ) + wrapper_data <- new("MolEvolData", + df = query_data, + fasta_path = query_sequence_path, + msa_path = query_msa_path, + ipr_path = query_ipr_path, + queries = queries, cln_path = cln_path, + domainSeqs = "", domain_ipr_path = "" + ) + return(list(wrapper_data, query_wrapper_data)) + } else if (type == "da") { + query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv")) + query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv") + query_msa_path <- paste0(path, "/query_data/query_seqs.msa") + query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa") + alignFasta(query_sequence_path, tool = "ClustalO", outpath = query_msa_path) + cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv") + query_data <- query_data %>% + mutate(QueryName = Name) %>% + select(QueryName, Species, contains("Lineage"), contains("DomArch"), + Query, Name) %>% + distinct(QueryName, .keep_all = TRUE) + queries <- unique(query_data$Query) + query_wrapper_data <- new("queryData", + df = query_data, + fasta_path = query_sequence_path, + msa_path = query_msa_path, + ipr_path = query_ipr_path, + queries = queries, cln_path = cln_path + ) + com_blast_path <- paste0(path, "/blast_combined.tsv") + com_blast_data <- combine_files_nopmap(paste0(path, "/"), + pattern = "*.full_analysis.tsv", + skip = 0, + col_names = c(), + outpath = com_blast_path, + delim = "\t" + ) + com_blast_data <- merge(com_blast_data, query_data, by = "Query") + com_blast_data <- com_blast_data %>% + select(QueryName, everything()) %>% + arrange(desc(PcPositive)) + wrapper_data <- new("MolEvolData", + df = com_blast_data, queries = queries, + ipr_path = "", cln_path = com_blast_path, + fasta_path = query_sequence_path, + msa_path = query_msa_path, + domainSeqs = "", domain_ipr_path = "" + ) + return(list(wrapper_data, query_wrapper_data)) + } else { + stop("Unrecognized type. Please use 'full', 'dblast', 'phylo', or 'da'.") + } +} + +drop_empty <- function(df) { + for (c in colnames(df)) { + if (all(is.na(df[, ..c])) || all(df[, ..c] == "")) { + df <- df %>% select(-contains(c)) + } + } + return(df) +} + + +clean_fetched <- function(df) { + df <- as.data.frame(df) + # df = df %>% cleanup_species() + + # Cleanup domarchs + cols <- colnames(df) + for (c in cols) { + if (grepl("^DomArch", c)) { + # Repeats + old <- c + new <- paste0(c, ".repeats") + df <- df %>% cleanup_domarch( + old = old, new = new, + domains_rename = NULL, # domains_rename, + domains_keep = NULL, # filter applied to only ClustName for now. + domains_ignore = NULL, # !! should check and remove soon! + repeat2s = FALSE, + remove_tails = F, # new! check below if it works! + remove_empty = F + ) + + new <- c + df <- df %>% cleanup_domarch( + old = old, new = new, + domains_rename = NULL, # domains_rename, + domains_keep = NULL, # filter applied to only ClustName for now. + domains_ignore = NULL, # !! should check and remove soon! + repeat2s = TRUE, + remove_tails = F, # new! check below if it works! + remove_empty = F + ) + } + } + + + return(df) +} + +# Description +# validation functions for accession numbers/headers for FASTA (validate_accnum_fasta) +# & accession number input (validate_accnum) submission types. + +library(Biostrings) +library(httr) +library(httr2) +library(rentrez) +library(shiny) + +.get_accnum_from_fasta <- function(biostrings_aa_string_set, verbose = FALSE) { + # parsing/cleaning accession numbers using the same methods as + # `upstream_scripts/00_submit_full.R`'s `get_sequences()` function + accnums <- c() + for (header in names(biostrings_aa_string_set)) { + # case 1 UnitProtKB formatted FASTA + # header: https://www.uniprot.org/help/fasta-headers + if (grepl("\\|", header)) { + accnum <- unlist(strsplit(header, "\\|"))[2] + # case 2 NCBI formatted FASTA + # header: https://www.ncbi.nlm.nih.gov/genbank/fastaformat + } else if (grepl(" ", header)) { + accnum <- unlist(strsplit(header, " "))[1] + # case 3 neither delimiter present; use the whole header + } else { + accnum <- header + } + # print debug info + if (verbose == TRUE) { + cat("header:", header, "\n") + cat("accnum_parsed:", accnum, "\n") + } + accnums <- append(accnums, accnum) + } + print(accnums) + return(accnums) +} + +validate_accnum_fasta <- function(text) { + # INPUT: text from input object that houses FASTA data + # validate the headers/accnums for FASTA submission + # Return: + # T: when no duplicate accnums after parsing + # F: when duplicate accnums are present after parsing + + # TRY write tmp file for biostrings parsing + path <- tryCatch( + expr = { + path <- tempfile() + write(text, path) + path + }, + error = function(e) { + NULL + } + ) + if (is.null(path)) { + cat("failed to write temp fasta during accnum validation\n", file = stderr()) + return(FALSE) # validation fail + } + + # TRY read seq + fasta <- tryCatch( + expr = { + fasta <- Biostrings::readAAStringSet(path) + fasta + }, + error = function(e) { + NULL + }, + finally = { + unlink(path) + } + ) + if (is.null(fasta)) { + cat("failed to read sequence during accnum validation\n", file = stderr()) + return(FALSE) # validation fail + } + + # parse accnums from fasta + accnums <- .get_accnum_from_fasta(fasta) + tb_accnum_counts <- tibble("frequencies" = table(accnums)) + # check for duplicates + if (any(tb_accnum_counts$frequencies > 1)) { + cat("duplicate headers found during fasta validation\n", file = stderr()) + return(FALSE) # validation fail + } else { + return(TRUE) + } +} + + +#' Test whether a single accession returns a valid protein from Entrez +is_accnum_valid_entrez <- function(accnum, verbose = FALSE) { + # empty accnum wil not raise an error from efetch, so test for this first + if (nchar(accnum) <= 0) {if (verbose) {warning("empty accnum")}; return(FALSE)} + + # try performing a POST of the accnum, followed by a GET request for a protein + # sequence + # rentrez::entrez_fetch() returns an error when there's no protein, + # so we're using try statements to handle this and return FALSE upon error + result <- tryCatch( + expr = { + result <- rentrez::entrez_fetch(db = "protein", id = accnum, rettype = "fasta") + if (verbose) {print(result)} + TRUE + }, + error = function(e) {if (verbose) {print(e)}; FALSE} + ) + return(result) +} + +#' Test whether a single accession returns a valid protein from EBI +is_accnum_valid_ebi <- function(accnum, verbose = FALSE) { + # validation: ensure there's some text to parse + if (nchar(accnum) <= 0) {if (verbose) {warning("empty accnum")}; return(FALSE)} + # construct a httr2 request to POST an accession number, then GET the fasta + url_base <- "https://www.ebi.ac.uk/proteins/api/proteins?accession=" + url_protein <- paste0(url_base, accnum) + req <- httr2::request(url_protein) |> + httr2::req_headers("Accept" = "text/x-fasta", "Content-type" = "application/x-www-form-urlencoded") + + # wrap in try since a failed HTTP request will raise an R error + try(httr2::req_perform(req)) + # get the HTTP response code + resp <- httr2::last_response() + # assign result based on HTTP response code ('200' is a success) + result <- ifelse(resp$status == 200, TRUE, FALSE) + if (verbose) { + msg <- stringr::str_glue( + "EBI protein query results:\n", + "\taccnum: {accnum}\t validation result: {result}\n" + ) |> print() + } + return(result) +} + +#' Perform a series of API reqs using NCBI entrez to validate accession numbers +perform_entrez_reqs <- function(accnums, verbose = FALSE, track_progress = FALSE) { + # API guidelines docs + # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html + # entrez recommends no more than 3 POSTs per second + # simple method, sleep for 1 second after every 3rd POST + i <- 0 + results <- vapply( + X = accnums, + FUN = function(accnum) { + result <- is_accnum_valid_entrez(accnum, verbose = TRUE) + i <<- i + 1L + if (i >= 3L) {Sys.sleep(1); print('sleeping for entrez API reqs'); i <<- 0L} + if (track_progress) {incProgress(1)} + result + }, + FUN.VALUE = logical(1) + ) + return(results) +} + +perform_ebi_reqs <- function(accnums, verbose = FALSE, track_progress = FALSE) { + # API guidelines docs + # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html + # EBI allows 200 POSTs per second + # simple method, sleep for 1 second after every 200th POST + i <- 0 + results <- vapply( + X = accnums, + FUN = function(accnum) { + result <- is_accnum_valid_ebi(accnum, verbose = TRUE) + i <<- i + 1L + if (i >= 200L) {Sys.sleep(1); i <<- 0L} + if (track_progress) incProgress(1) + result + }, + FUN.VALUE = logical(1) + ) + return(results) +} + +#' Validate accession numbers from MolEvolvR user input +validate_accnum <- function(text, verbose = FALSE, track_progress = FALSE, n_steps = integer()) { + # API guidelines docs + # entrez https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Usage_Guidelines_and_Requiremen + # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html + + # get accnums from three possible delimiters + # 1. comma separated values with any amount of space between them + # 2. one or more space delimiter + # 3. "\n" (newline) delimiter + accnums <- unlist(strsplit(text, "\\s*,\\s*|\\s+|\\n|,")) + + warning() + + if (track_progress) {incProgress(1 / n_steps, + message = "Testing for duplicate accession numbers . . .")} + # fail fast for duplicates + if (any(duplicated(accnums))) { + warning("duplicate accesion numbers found") + if (track_progress) {setProgress(n_steps, + message = "Duplicate accession numbers found")} + return(FALSE) + } + if (track_progress) {incProgress(1 / n_steps, + message = "Please wait (searching NCBI's protein database) . . .")} + # entrez API + entrez_results <- perform_entrez_reqs(accnums, verbose = verbose) + if (verbose) { + stringr::str_glue("rentrez + results:\n\t{paste0(entrez_results, collapse=",")}\n") |> + print() + } + + # EBI API + # if all of entrez resulted in success, then skip EBI. Else, try EBI + if (!all(entrez_results)) { + ebi_results <- perform_ebi_reqs(accnums, verbose = verbose) + if (verbose) { + stringr::str_glue("ebi results:\n\t{paste0(ebi_results, + collapse=",")}\n") |> + print() + } + if (track_progress) {incProgress(1 / n_steps, message = "Please wait + (searching EBI's protein database) . . .")} + # OR test each accnum result across both dbs + final_results <- ebi_results | entrez_results + } else { + if (track_progress) {setProgress(n_steps, message = "All accession + numbers validated")} + final_results <- entrez_results + } + failed_accnums <- accnums[which(!final_results)] + if (length(failed_accnums) >= 1) { + msg <- stringr::str_glue( + "The following accnums failed validation:\n", + "\t{paste0(failed_accnums, collapse=",")}\n" + ) + warning(msg) + if (track_progress) {setProgress(n_steps, message = "Accession number + validation failed")} + } + setProgress(n_steps, message = "Accession number validation finished") + return( + list( + "validation_result" = all(final_results), + "failed_accnums" = failed_accnums + ) + ) +} + +#=============================================================================== +# Description +# validation function for evalue input +#=============================================================================== +validate_evalue <- function(input_value) { + is_valid_evalue <- is.numeric(input_value) && input_value != 0 + return(is_valid_evalue) +} + +#=============================================================================== +# Author(s): JK +# Last modified: 2023_06 +#=============================================================================== + +library("Biostrings") +library("tidyverse") +#------------------------------------------------------------------------------- +.guess_seq_type <- function(single_fasta, dna_guess_cutoff = 0.9, other_guess_cutoff = 0.5) { + tb <- as_tibble(alphabetFrequency(single_fasta)) + n_other <- if ("other" %in% colnames(tb)) sum(unlist(tb["other"])) else 0 + + aa_cols <- setdiff(Biostrings::AA_ALPHABET, c("*", ".")) + tb_dna <- tb %>% select(all_of(Biostrings::DNA_ALPHABET)) + tb_aa <- tb %>% select(all_of(aa_cols)) + + total <- nchar(single_fasta) + est_dna_prop <- sum(unlist(tb_dna)) / total + est_aa_prop <- sum(unlist(tb_aa)) / total + other_prop <- n_other / total + cat( + names(single_fasta), "\n", + "estimated DNA alphabet proportion:", est_dna_prop, "\n", + "estimated AA alphabet proportion:", est_aa_prop, "\n", + "'other' alphabet proportion:", other_prop, "\n" + ) + + if (est_dna_prop >= dna_guess_cutoff) { + guess <- "DNA" + } else if (other_prop >= other_guess_cutoff) { + guess <- NA + } else { + guess <- "AA" + } + cat("Guess:", guess, "\n") + return(guess) +} + +.validate_seq_body <- function(text) { + # convert string to single letter character vector + individual_chars <- unlist(strsplit(text, "")) + # return the characters from input that are NOT in the AA_ALPHABET + invalid_chars <- setdiff(individual_chars, Biostrings::AA_ALPHABET) + n_invalid_chars <- length(invalid_chars) + + # 0 length character vector means the sequence contains + # characters that are all present in the AA_ALPHABET + # >0 length character vector means there are some characters + # which are NOT found in the AA_ALPHABET + if (n_invalid_chars == 0) { + return(TRUE) + } else { + warning("A sequence contains characters not found in the AA_ALPHABET") + return(FALSE) + } +} + +#------------------------------------------------------------------------------- +validate_fasta <- function(fasta_path, .type = "AA") { + # handle case of fasta being unreadable/unrecognized format + fasta <- tryCatch( + expr = Biostrings::readAAStringSet(fasta_path), + error = function(e) { + warning(paste0("error: failed to read sequence from: ", fasta_path)) + return(NULL) + } + ) + # failure to read yields quick return + if (is.null(fasta)) { + return(FALSE) # validation fail + } + + # require at least some characters for the sequence + if (!(sum(nchar(unlist(fasta))) >= 1)) { + warning(paste0("fasta does not have any sequence characters")) + return(FALSE) # validation fail + } + print(fasta) + + ### validate sequence BODY + # iteratively return boolean value for valid seq body (FALSE = invalid) + seq_body_validations <- c() + for (i in seq_along(1:length(fasta))) { + seq_body <- as.character(fasta[i]) + seq_body_validations <- c(.validate_seq_body(seq_body), seq_body_validations) + } + # require all sequences to have chars only in AA_ALPHABET + if (!(all(seq_body_validations))) { + warning("At least one sequence has non-IUPAC AA characters") + return(FALSE) # validation fail + } + + ### validate sequence TYPE + # iteratively guess the type (DNA, AA, or other of each seq in the FASTA file) + seq_types <- c() + for (idx in seq_along(1:length(fasta))) { + seq_types <- c(.guess_seq_type(fasta[idx]), seq_types) + } + + switch(.type, + "DNA" = { + anti_type <- "AA" + }, + "AA" = { + anti_type <- "DNA" + } + ) + + # if all seqs are of desired .type, only then TRUE + if (sum(is.na(seq_types)) != 0) { + return(FALSE) # validation fail + } else if (anti_type %in% seq_types) { + return(FALSE) # validation fail + } else { + return(TRUE) # validation success + } +} + +# Author(s): Samuel Chen, Lo Sosinski, Joe Burke +# Last modified: 2021 + +alpha_numeric <- c(0:9, letters, LETTERS) + +rand_string <- function(length, characters = alpha_numeric, post_fix = "", ignorelist = c()) { + str <- NULL + + while (is.null(str) || (str %in% ignorelist)) { + chars <- sample(characters, size = length, replace = TRUE) + str <- paste0(paste(chars, collapse = ""), post_fix) + } + + return(str) +} +strsplit_vect <- function(strings, pattern = "", pos) { + split_strings <- map(strsplit(strings, "_"), function(x) x[1]) %>% unlist() + + return(split_strings) +} diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R new file mode 100644 index 00000000..c8b4301f --- /dev/null +++ b/inst/report/scripts/generate_report.R @@ -0,0 +1,668 @@ +# Author(s): Awa Synthia +# Last modified: 2024 + +# get fasta of pathogen and/or drug +get_card_data <- function(pathogen = NULL, drug = NULL) { + destination_dir <- "CARD_data" + # Check if CARD data exists + if (!dir.exists(destination_dir)) { + dir.create(destination_dir) + } + if (!file.exists("CARD_data/aro_index.tsv")) { + # Step 1: Download CARD Data + download.file("https://card.mcmaster.ca/latest/data", "card_data.tar.bz2") + #unzip("card_data.tar.bz2", exdir = "CARD_data") + system(paste("tar -xjf card_data.tar.bz2 -C", destination_dir)) + } + + # Step 2: Open ARO_index.tsv + aro_data <- read.delim("CARD_data/aro_index.tsv", sep = "\t", header = TRUE) + + # Step 3: Map CARD Short Name + antibiotics <- read.delim("CARD_data/shortname_antibiotics.tsv", + sep = "\t", header = TRUE) + pathogens <- read.delim("CARD_data/shortname_pathogens.tsv", + sep = "\t", header = TRUE) + + aro_data <- aro_data %>% + mutate( + pathogen = str_extract(CARD.Short.Name, "^[^_]+"), + gene = str_extract(CARD.Short.Name, "(?<=_)[^_]+"), + drug = str_extract(CARD.Short.Name, "(?<=_)[^_]+$") + ) %>% + left_join(pathogens, by = c("pathogen" = "Abbreviation")) %>% + left_join(antibiotics, by = c("drug" = "AAC.Abbreviation")) + + # Sort names + aro_data <- aro_data %>% + arrange(Pathogen, Molecule) %>% + group_by(Pathogen, Molecule) + + # Filter data based on user input + filtered_data <- aro_data + + if (!is.null(pathogen)) { + filtered_data <- filtered_data %>% filter(Pathogen == !!pathogen) + } + + if (!is.null(drug)) { + filtered_data <- filtered_data %>% filter(Molecule == !!drug) + } + + # Check if filtered data is empty + if (nrow(filtered_data) == 0) { + stop("No data found for the specified pathogen and drug.") + } + + # Extract protein accessions + accessions <- filtered_data$Protein.Accession + + # Function to fetch FASTA sequence from NCBI + get_fasta <- function(accession) { + entrez_fetch(db = "protein", id = accession, rettype = "fasta") + } + + # Download sequences + fasta_sequences <- lapply(accessions, get_fasta) + + # Write sequences to a file + writeLines(unlist(fasta_sequences), "filtered_proteins.fasta") + + return("FASTA sequences downloaded to 'filtered_proteins.fasta'.") +} + +# Run analysis +run_analysis <- function( + dupload_type = "Fasta", + evalue = 0.00001, + accnum_fasta_input = "", + file_paths = list(accnum = NULL, fasta = tempfile(), msa = tempfile(), + blast = NULL, iprscan = NULL), + blast_db = "refseq_protein", + blast_hits = 100, + blast_eval = 0.0001, + acc_homology_analysis = TRUE, + acc_da_analysis = TRUE, + acc_phylogeny_analysis = FALSE, + report_template_path = "/report/report_template.Rmd", + output_file = file.path(tempdir(), "report.html"), + DASelect = "All", + mainSelect = NULL, + PhyloSelect = NULL, + q_heatmap_select = "All", + DACutoff = 95, + GCCutoff = 0.5, + query_select = NULL, + query_iprDatabases = NULL, + query_iprVisType = NULL, tree_msa_tool = "ClustalO", + levels = 2, + DA_Col = "DomArch.Pfam", + msa_rep_num = NULL, + msa_reduce_by = "Species", + rval_phylo = FALSE, + ... + +) { + + ##### Initialize Variables and Classes ##### + blast_upload_data <- new("blastUpload", df = "", seqs = "") + ipr_upload_data <- new("iprUpload", df = "", seqs = "") + sequence_upload_data <- new("seqUpload", seqs = "") + data <- new("MolEvolData", msa_path = tempfile(), + fasta_path = file_paths$fasta) + app_data <- new("MolEvolData", msa_path = tempfile(), + fasta_path = file_paths$fasta) + query_data <- new("queryData") + + if (length(dupload_type) != 1) { + stop("dupload_type must be a single value.") + } + + # Reset default analysis function + resetSettings <- function() { + acc_homology_analysis <<- FALSE + acc_da_analysis <<- FALSE + acc_phylogeny_analysis <<- FALSE + domain_split <<- FALSE + } + + fasta_set <- c("Fasta", "AccNum", "MSA") + + # Update settings based on upload type + updateUploadType <- function(dupload_type) { + switch(dupload_type, + "Fasta" = { + resetSettings() + acc_homology_analysis <<- TRUE + acc_da_analysis <<- TRUE + }, + "AccNum" = { + resetSettings() + acc_homology_analysis <<- TRUE + acc_da_analysis <<- TRUE + }, + "MSA" = { + resetSettings() + acc_homology_analysis <<- TRUE + acc_da_analysis <<- TRUE + }, + "BLAST Output" = { + resetSettings() + acc_phylogeny_analysis <<- TRUE + }, + "InterProScan Output" = { + resetSettings() + acc_da_analysis <<- TRUE + } + ) + } + + updateUploadType(dupload_type) + + ####### File Upload Functions ######## + + # Update function to return modified object + uploadAccNumFile <- function(file_path, seq_obj) { + accnum_data <- read_file(file_path) + seq_obj@seqs <- accnum_data + return(seq_obj) + } + + uploadFastaFile <- function(file_path, seq_obj) { + fasta_data <- read_file(file_path) + seq_obj@seqs <- fasta_data + return(seq_obj) + } + + uploadMSAFile <- function(file_path, seq_obj) { + msa_data <- read_file(file_path) + seq_obj@seqs <- msa_data + return(seq_obj) + } + + uploadBlastFile <- function(file_path, blast_obj) { + blast_obj@df <- file_path + return(blast_obj) + } + + uploadIPRScanFile <- function(file_path, ipr_obj) { + ipr_obj@df <- file_path + return(ipr_obj) + } + + # Now modify the calling section to reassign the modified objects + if (!is.null(file_paths$accnum)) { + sequence_upload_data <- uploadAccNumFile(file_paths$accnum, + sequence_upload_data) + } + if (!is.null(file_paths$fasta)) { + sequence_upload_data <- uploadFastaFile(file_paths$fasta, + sequence_upload_data) + } + if (!is.null(file_paths$msa)) { + sequence_upload_data <- uploadMSAFile(file_paths$msa, + sequence_upload_data) + } + if (!is.null(file_paths$blast)) { + blast_upload_data <- uploadBlastFile(file_paths$blast, + blast_upload_data) + } + if (!is.null(file_paths$iprscan)) { + ipr_upload_data <- uploadIPRScanFile(file_paths$iprscan, + ipr_upload_data) + } + + # Validation of inputs + fasta_data <- read_file(file_paths$fasta) + validate_and_process_inputs <- function(evalue, fasta_data) { + if (!validate_evalue(evalue)) { + return("Error: A numeric E-value is required. Please set a valid + value (e.g., 0.0001).") + } + + + if (!validate_accnum_fasta(fasta_data)) { + return("Error: Input for AccNum/Fasta cannot be empty or invalid.") + } + + sequence_vector <- unlist(strsplit(fasta_data, "\n")) + return(list(message = "Inputs are valid!", sequences = sequence_vector)) + } + + validation_result <- validate_and_process_inputs(evalue, fasta_data) + if (is.character(validation_result)) { + return(validation_result) + } + + # Phylogenetic Analysis Validation + phylo <- acc_phylogeny_analysis + if (phylo) { + # Validate phylogenetic analysis based on upload type + is_valid_phylo <- switch( + dupload_type, + "Fasta" = { + str_count(string = sequence_upload_data@seqs, ">") > 1 + }, + "AccNum" = { + accnums <- sequence_upload_data@seqs |> + strsplit("\\s*,\\s*|\\s+|\\n|,") |> + unlist() + length(accnums) > 1 + }, + "MSA" = { + str_count(string = sequence_upload_data@seqs, ">") > 1 + }, + FALSE # Default to FALSE if no valid type is found + ) + + if (!is_valid_phylo) { + return("Error: At least two sequences/identifiers are required for + phylogenetic analysis.") + } + } + # Fasta-like submissions + if (dupload_type %in% fasta_set) { + # Can have any combination of select options + type <- "" + script <- "" + postfix <- "" + + if (acc_homology_analysis && !phylo && acc_da_analysis) { + type <- "full" + postfix <- "full" + } + # Phylogenetic analysis, do full script but skip blast + else if (phylo && acc_da_analysis) { + type <- "phylo" + postfix <- "phylo" + } + else if (acc_da_analysis) { + type <- "da" + postfix <- "da" + } + else if (acc_homology_analysis) { + # Only run BLAST + type <- "dblast" + postfix <- "dblast" + } + else { + # Something went wrong, throw error + stop("Please select one of the above analyses (full, phylo, da, + dblast) to run.") + + return() + } + } + + # After uploading the sequence data, you would check the uploaded data + if (sequence_upload_data@seqs == "") { + stop("Error: Please upload a protein sequence") + } + OUT_PATH <- getwd() + unavailable_pins <- list.files(OUT_PATH) + unavailable_pins <- strsplit_vect(unavailable_pins, pattern = "_") + pinName <- rand_string(length = 6, post_fix = "", ignorelist = unavailable_pins) + pin_id <- strtrim(pinName, 6) + + dir <- paste0(OUT_PATH, pin_id, "_", postfix) + path <- paste0(dir, "/", pin_id, ".fa") + # Adding validation logic based on upload type + system(paste0("mkdir ", dir), wait = TRUE) + switch(dupload_type, + "Fasta" = { + # Validate sequence limit + if (str_count(sequence_upload_data@seqs, ">") > 200) { + stop("Error: Only submissions with less than 200 proteins + are accepted at this time. For analyses with more than 200 p + roteins please contact janani.ravi@cuanschutz.edu.") + } + + # Validate accession numbers for FASTA submission + if (!(validate_accnum_fasta(sequence_upload_data@seqs))) { + stop("Error: Please adjust the FASTA headers. Ensure a header + line for each sequence, no duplicate header names, and + no duplicate protein accession numbers.") + } + + # TRY load fasta for fasta validation + is_valid_aa_fasta <- tryCatch( + expr = { + tmp_file <- tempfile() + writeLines(sequence_upload_data@seqs, tmp_file) + validate_fasta(tmp_file) + }, + error = function(e) { + warning("Error: Failed to run input FASTA verification") + return(FALSE) # Return FALSE if an error occurs + }, + finally = { + unlink(tmp_file) + } + ) + + # Validate AA fasta + if (!(is_valid_aa_fasta)) { + stop("Error: The FASTA input could not be recognized as valid + Amino Acid (AA) sequence(s). MolEvolvR only accepts FASTA + formatted AA/protein sequences as input (not DNA/RNA). + For a short reference on FASTA format, review this + https://blast.ncbi.nlm.nih.gov/doc/blast-topics/short NCBI guide.") + } + + write(sequence_upload_data@seqs, path) + }, + "AccNum" = { + # Convert Acc to FASTA + accnum_vect <- unlist(strsplit(sequence_upload_data@seqs, "\\s*,\\s*|\\s+|\\n|,")) + if (length(accnum_vect) > 200) { + stop("Error: Only submissions with less than 200 proteins are + accepted at this time. For analyses with more than 200 + proteins please contact janani.ravi@cuanschutz.edu.") + } + + # if an error is raised, validation fails + validation_results <- purrr::map_lgl( + accnum_vect, + function(accnum) { + tryCatch( + expr = { + tmp <- tempfile( + pattern = paste0("molevolvr_acccnum_validation-", accnum, "-", "XXXXX"), + fileext = ".fa" + ) + acc2fa(accnum, tmp) + readAAStringSet(tmp) + TRUE + }, + error = function(e) { + FALSE + }, + finally = { + suppressWarnings(sink()) + unlink(tmp) + } + ) + } + ) + + # If any accession numbers fail, halt submission + if (!all(validation_results)) { + stop("Error: MolEvolvR could not locate sequences for the + following accession numbers: + [ {paste0(accnum_vect[which(!validation_results)], collapse = ', ')} ] + Please try submitting FASTA sequences instead.") + } else { + # Write a multifasta + acc2fa(accnum_vect, outpath = path) + } + }, + "MSA" = { + # Validate sequence limit + if (str_count(sequence_upload_data@seqs, ">") > 200) { + stop("Error: Only submissions with less than 200 proteins + are accepted at this time. For analyses with more than 200 + proteins please contact janani.ravi@cuanschutz.edu.") + } + + msa_temp <- str_replace_all(sequence_upload_data@seqs, "-", "") + write(msa_temp, path) + } + ) + + phylo <- if_else(phylo, "TRUE", "FALSE") + + # Clean FASTA headers + fasta <- Biostrings::readAAStringSet(filepath = path) + headers_original <- names(fasta) + headers_accnum <- names(fasta) |> purrr::map_chr(function(x) extractAccNum(x)) + fasta <- cleanFAHeaders(fasta) + headers_cleaned <- names(fasta) + + # Write a table to map original accnums to their cleaned version + readr::write_tsv( + x = tibble::tibble( + "header_original" = headers_original, + "header_accnum" = headers_accnum, + "header_clean" = headers_cleaned + ), + file = file.path(dir, "query-fasta_header-map.tsv") + ) + + # Remove '*' characters which are incompatible with interproscan + fasta <- gsub(pattern = "\\*", replacement = "X", + x = fasta) |> Biostrings::AAStringSet() + Biostrings::writeXStringSet(x = fasta, filepath = path) + + if (domain_split) { + submit_split_by_domain( + dir = dir, + sequences = path, + DB = blast_db, + NHITS = blast_hits, + EVAL = blast_eval, + phylo = phylo, + type = type, + job_code = pin_id, + ) + } else { + submit_full( + dir = dir, + sequences = path, + DB = blast_db, + NHITS = blast_hits, + EVAL = blast_eval, + phylo = phylo, + type = type, + job_code = pin_id, + ) + } + + # Additional handling for different upload types + if (dupload_type == "BLAST Output") { + dir <- paste0(OUT_PATH, pin_id, "_blast") + if (blast_upload_data@df == "") { + stop("Error: Please upload a BLAST file.") + } + if (blast_upload_data@seqs == "" && !blast_ncbi_check) { + stop("Error: Please provide a file containing sequences or check + the box to use fetch sequences for NCBI accession numbers.") + } + + if (tools::file_ext(blast_upload_data@df) == "tsv" || blast_upload_data@df == EX_BLASTOUTPUT) { + data <- read_tsv(blast_upload_data@df, col_names = web_blastp_hit_colnames) + } else { + data <- read_csv(blast_upload_data@df, col_names = web_blastp_hit_colnames) + } + + if (nrow(data) > 5005) { + stop("Error: BLAST output submissions are limited to 5000 proteins + at the moment. For analyses with more than 5000 proteins please + contact janani.ravi@cuanschutz.edu.") + } + + system(paste0("mkdir ", dir), wait = TRUE) + blast_path <- paste0(dir, "/", pin_id, ".wblast.tsv") + write_tsv(data, blast_path, col_names = FALSE) + + queries <- data$Query %>% unique() + writeLines(queries, paste0(dir, "/", "accs.txt")) + + if (blast_ncbi_check) { + submit_blast( + dir = dir, + blast = paste0(dir, "/", pin_id, ".wblast.tsv"), + seqs = paste0(dir, "/", "seqs.fa"), + ncbi = TRUE, + job_code = pin_id, + submitter_email = notify_email, + advanced_options = isolate(rvals_advanced_options |> reactiveValuesToList() |> unlist()) + ) + } else { + seqs <- read_file(blast_upload_data@seqs) + writeLines(seqs, paste0(dir, "/", "seqs.fa")) + submit_blast( + dir = dir, + blast = paste0(dir, "/", pin_id, ".wblast.tsv"), + seqs = paste0(dir, "/", "seqs.fa"), + ncbi = FALSE, + job_code = pin_id, + submitter_email = notify_email, + advanced_options = isolate(rvals_advanced_options |> reactiveValuesToList() |> unlist()) + ) + } + + } else if (dupload_type == "InterProScan Output") { + dir <- paste0(OUT_PATH, pin_id, "_ipr") + path <- paste0(dir, "/", pin_id, "_ipr.tsv") + if (ipr_upload_data()@df == "") { + stop("Error: Please upload an interproscan file.") + } + if (ipr_upload_data()@seqs == "" && !ipr_ncbi_check) { + stop("Error: Please provide a file containing sequences or check the + box to use fetch sequences for NCBI accession numbers.") + } + + ipr <- read_tsv(ipr_upload_data()@df, col_names = FALSE) + if (nrow(ipr) > 200) { + stop("Error: Only submissions with less than 200 proteins are + accepted at this time. For analyses with more than 200 proteins + please contact janani.ravi@cuanschutz.edu.") + } + + system(paste0("mkdir ", dir), wait = TRUE) + writeLines("AccNum\tSeqMD5Digest\tSLength\tAnalysis\tDB.ID\tSignDesc\tStartLoc\tStopLoc\tScore\tStatus\tRunDate\tIPRAcc\tIPRDesc\tGOTerms\tExtra\n", path) + write_tsv(ipr, path, col_names = FALSE, append = TRUE) + + ncbi <- if_else(ipr_ncbi_check, TRUE, FALSE) + blast <- if_else(acc_homology_analysis, TRUE, FALSE) + + if (ncbi) { + submit_ipr( + dir = dir, + ipr = path, + blast = acc_homology_analysis, + seqs = paste0(dir, "/", "seqs.fa"), + ncbi = TRUE, + DB = blast_db, + NHITS = blast_hits, + EVAL = blast_eval, + ) + } else { + seqs <- read_file(ipr_upload_data()@seqs) + writeLines(seqs, paste0(dir, "/", "seqs.fa")) + submit_ipr( + dir = dir, + ncbi = FALSE, + ipr = path, + blast = acc_homology_analysis, + seqs = paste0(dir, "/", "seqs.fa"), + DB = blast_db, + NHITS = blast_hits, + EVAL = blast_eval, + + ) + } + + } + + # Process results for results + fetched <- process_wrapper_dir(dir, pinName = pinName, type = type) + + # Assign fetched data to objects if the fetched list has the expected length + if (length(fetched) == 2) { + data <<- fetched[[1]] # Assign the first element to 'data' + app_data <<- fetched[[1]] # Assign the same to 'app_data' + query_data <<- fetched[[2]] # Assign the second element to 'query_data' + + r_nrow_initial(nrow(fetched[[1]]@df)) # Initialize row count + } + + domarch_cols_value <- get_domarch_cols(app_data, DASelect) + + query_domarch_cols_value <- get_domarch_columns(query_data) + + mainTable_value <- generate_data_table(data) + + queryDataTable_value <- generate_query_data_table(query_data, query_select) + + fastaDataText_value <- get_fasta_data(query_data@fasta_path) + + domainDataText_value <- get_domain_data() + + msaDataText_value <- get_msa_data(query_data@msa_path) + + rs_IprGenes_value <- generate_ipr_genes_visualization(data, app_data, input_rs_iprDatabases, input_rs_iprVisType) + + rval_rs_network_layout_value <- generate_rs_network_layout(data, app_data, cutoff = 100, layout = "nice") + + rs_data_table_value <- generate_data_table(data) + + da_IprGenes_value <- generate_da_ipr_genes_plot(app_data, da_iprDatabases, da_iprVisType, DASelect) + + query_heatmap_value <- generate_query_heatmap(query_data_df, heatmap_select = "All", heatmap_color = "blue") + + DA_Prot_value <- get_DA_Prot(app_data, validate_da, DASelect) + + DALinPlot_value <- generate_DA_heatmap_plot(DA_col, DACutoff, DA_Prot, DA_lin_color, ipr_path) + + DALinTable_value <- generate_DA_lin_table(DA_col, ipr_path, DAlin_count_table_DT) + + DANetwork_value <- generate_domain_network(DA_col, DACutoff, DA_Prot, networkLayout, ipr_path) + + phylogeny_prot_value <- filter_phylogeny_proteins(app_data, phylo_select) + + acc_to_name_value <- acc_to_name(app_data) + + ####### Report Generation ######## + + tryCatch({ + tempReport <- file.path(tempdir(), "report.Rmd") + file.copy(report_template_path, tempReport, overwrite = TRUE) + + # List of graphics to include in report + params <- list( + rs_interproscan_visualization = rs_IprGenes_value, + proximity_network = rval_rs_network_layout_value, + sunburst = data@df, + data = rs_data_table_value, + queryDataTable = queryDataTable_value, + fastaDataText = fastaDataText_value, + heatmap = query_heatmap_value, + query_data = query_data, + query_domarch_cols = query_domarch_cols_value, + query_iprDatabases = query_iprDatabases, + query_iprVisType = query_iprVisType, + mainTable = maintable_value, + DALinTable = DALinTable_value, + DALinPlot = DALinPlot_value, + DANetwork = DANetwork_value, + DA_Prot = DA_Prot_value, + domarch_cols = domarch_cols_value, + DA_Col = DA_Col, + DACutoff = DACutoff, + da_interproscan_visualization = da_IprGenes_value, + phylo_sunburst_levels = levels, + phylo_sunburst = phylogeny_prot_value, + tree_msa_tool = tree_msa_tool, + rep_accnums = rep_accnums(), + msa_rep_num = 10, + app_data = app_data, + PhyloSelect = PhyloSelect, + acc_to_name = acc_to_name_value, + rval_phylo = rval_phylo, + msa_reduce_by = msa_reduce_by + ) + + # Render RMarkdown report + rmarkdown::render(tempReport, + output_file = output_file, + params = params, + envir = new.env(parent = globalenv())) + }, error = function(e) { + return(paste("Error in report generation:", e$message)) + }) + + return("Initialization, upload, and + report generation completed successfully.") +} + diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R new file mode 100644 index 00000000..0ddc2761 --- /dev/null +++ b/inst/report/scripts/run_molevolvr_pipeline.R @@ -0,0 +1,1289 @@ +# Author(s): Awa Synthia +# Last modified: 2024 + +# Load necessary libraries +library(httr) +library(data.table) +library(readr) +library(rentrez) + +get_sequences <- function(sequences, + acc_file_path = "accs.txt", + dir_path = "~", + separate = TRUE) { + seqs <- readAAStringSet(sequences) + cln_names <- c() + for (accnum in names(seqs)) { + if (grepl("\\|", accnum)) { + accnum_cln <- strsplit(accnum, "\\|")[[1]][2] + accnum_cln <- strsplit(accnum_cln, " ")[[1]] + } else { + accnum_cln <- strsplit(accnum, " ")[[1]][1] + } + cln_names <- append(cln_names, accnum_cln) + write(accnum_cln, file = acc_file_path, append = TRUE) + if (separate) { + write(paste0(dir_path, "/", accnum_cln, ".faa"), + file = "input.txt", append = TRUE) + write(paste0(">", accnum_cln), + file = paste0(accnum_cln, ".faa"), append = TRUE) + write(toString(seqs[accnum]), + file = paste0(accnum_cln, ".faa"), append = TRUE) + } + } + names(seqs) <- cln_names + writeXStringSet(seqs, sequences, format = "fasta") + return(length(seqs)) +} +submit_full <- function( + dir = "/data/scratch", + DB = Sys.getenv("BLAST_DB", unset = "refseq"), + NHITS = Sys.getenv("BLAST_HITS", unset = 100), + EVAL = Sys.getenv("BLAST_EVALUE", unset = 0.00001), + sequences = "~/test.fa", + phylo = "FALSE", + by_domain = "FALSE", + domain_starting = "~/domain_seqs.fa", + type = "full", + job_code=NULL, + submitter_email=NULL, + advanced_options=NULL, + get_slurm_mails=FALSE +) { + # Set working directory + setwd(dir) + + advanced_options_names <- names(advanced_options[advanced_options == TRUE]) + + # Write job submission params to file + job_args <- list( + submission_type = type, + database = ifelse(phylo == FALSE, DB, NA), + nhits = ifelse(phylo == FALSE, NHITS, NA), + evalue = ifelse(phylo == FALSE, EVAL, NA), + submitter_email = submitter_email, + advanced_options = advanced_options_names, + job_code = job_code + ) + yml <- yaml::as.yaml(job_args) + write(yml, "job_args.yml") + + # Create a log file + write("START_DT\tSTOP_DT\tquery\tdblast\tacc2info\tdblast_cleanup\tacc2fa + \tblast_clust\tclust2table\tiprscan\tipr2lineage\tipr2da\tduration", + "logfile.tsv") + + # Process sequences (local handling) + if (phylo == "FALSE") { + # Split the sequences if needed, store them locally + num_seqs <- get_sequences(sequences, dir_path = dir, separate = TRUE) + + fasta <- Biostrings::readAAStringSet(sequences) + headers_original <- names(fasta) + headers_accnum <- names(fasta) |> purrr::map_chr(function(x) extractAccNum(x)) + + # Execute BLAST locally (instead of submitting jobs to the cluster) + for (i in 1:num_seqs) { + # Assume each sequence is saved separately + input_file <- paste0(dir, "/", headers_accnum[i], ".faa") + # output_file <- paste0(dir, "/blast_output_", i, ".txt") + + # Construct the local BLAST command (make sure 'blastn' is available locally) + run_molevolvr_pipeline(input_file, DB, NHITS, EVAL, is_query = F, type, i) + + #cmd <- sprintf( + # "deltablast -query %s -db %s -out %s -num_alignments %d -evalue %f -remote", + # input_file, DB, output_file, NHITS, EVAL + #) + + # Execute BLAST locally + #system(cmd) + + cat(sprintf("BLAST for sequence %d completed.\n", i), file=stderr()) + } + } else { + # Handle phylogenetic analysis if needed + cat("Phylogenetic analysis is not supported in the local version yet.\n") + } + + # Simulate query run locally + run_molevolvr_pipeline(sequences, DB, NHITS, EVAL, is_query = TRUE, type) + # cmd_query <- sprintf( + # "deltablast -query %s -db %s -out %s_query.txt -num_alignments %d -evalue %f -remote", + # sequences, DB, paste0(dir, "/query_output"), NHITS, EVAL + # ) + # system(cmd_query) + + cat("Query analysis completed.\n") + + # Status update + num_runs <- num_seqs + 1 + write(paste0("0/", num_runs, " analyses completed"), "status.txt") + + cat("All analyses completed locally.\n") +} + +# Define the main pipeline function +run_molevolvr_pipeline <- function(input_paths, db, nhits, eval, + is_query, type, i) { + + # Start time + start <- Sys.time() + OUTPATH <- getwd() # Set output directory to the current working directory + + # If IS_QUERY is True, handle query data + if (is_query == TRUE) { + + FILE <- input_paths + PREFIX <- "query_data" + OUTDIR <- file.path(OUTPATH, PREFIX) + dir.create(OUTDIR, showWarnings = FALSE) # Create output directory + + all_accnums_file <- file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")) + file.copy(FILE, all_accnums_file) + + # Parse accession numbers + accnums_file <- file.path(OUTPATH, "query-fasta_header-map.tsv") + parsed_accnums_file <- file.path(OUTDIR, "parsed_accnums.txt") + + if (file.exists(accnums_file)) { + parsed_accnums <- read.table(accnums_file, + header = FALSE, sep = "\t", + stringsAsFactors = FALSE) + writeLines(parsed_accnums$V2[-1], parsed_accnums_file) # Skip header + } + + # Copy starting_accs.txt if exists, or fallback to accs.txt + if (file.exists("starting_accs.txt")) { + file.copy("starting_accs.txt", + file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt"))) + } else { + file.copy("accs.txt", + file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt"))) + file.copy(parsed_accnums_file, + file.path(OUTDIR, paste0(PREFIX, ".parsed_accnums.txt"))) + } + + # setwd(OUTDIR) + + # Run acc2info + run_acc2info(parsed_accnums_file, PREFIX, OUTDIR) + + replace_accession_numbers(file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), + file.path(OUTPATH, "query-fasta_header-map.tsv"), + file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv"))) + + file.copy(file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), + file.path(OUTDIR, paste0(PREFIX, ".blast.cln.tsv"))) + + } else { + # Handle homolog data + FILE <- readLines(input_paths)[1] # Assume single file for local usage + F_value <- basename(FILE) + PREFIX <- sub("\\.faa$", "", basename(FILE)) + PREFIX <- gsub(">", "", PREFIX) # Extract prefix from file name + OUTDIR <- file.path(OUTPATH, paste0(PREFIX, "_", type)) + dir.create(OUTDIR, showWarnings = FALSE) # Create output directory + # setwd(OUTDIR) + + # Run DELTABLAST + run_deltablast(input_paths, PREFIX, OUTDIR, db, nhits, eval) + + # Run ACC2FA + convert_accnum_to_fasta(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")), + PREFIX, OUTDIR) + + # Run ACC2INFO + run_acc2info(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt")), + PREFIX, OUTDIR) + + # Clean up BLAST results + cleanup_blast(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")), + file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), + PREFIX, F) + + } + + # Sys.sleep(30) + + # Run BLASTCLUST + run_blastclust(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")), + PREFIX, OUTDIR ) + + # Convert clusters to table + clust2tbl(file.path(OUTDIR, paste0(PREFIX, ".bclust.L60S80.tsv")), + file.path(OUTDIR, paste0(PREFIX, ".blast.cln.tsv"))) + + # Run INTERPROSCAN + run_interproscan(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")), + PREFIX, OUTDIR) + new_header <- c("AccNum", "SeqMD5Digest", "SLength", "Analysis", "DB.ID", + "SignDesc", "StartLoc", "StopLoc", "Score", + "Status", "RunDate", "IPRAcc", "IPRDesc") + + temp_data <- read_tsv(file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")), + col_names = FALSE) + + colnames(temp_data) <- new_header + + write.table(temp_data, file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")), + sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE) + # Run IPR2LIN + ipr2lin(file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")), + file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), PREFIX) + + + # Optionally run IPR2DA if IS_QUERY is true + if (is_query == "T") { + ## perform ipr2da on iprscan results + da <- ipr2da(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")), + PREFIX, "NA") + + ## if blast results are provided, call append_ipr + if (is.null(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv"))) | + is.na(PREFIX)) { + print("No blast results provided, moving on.") + } else { + append_ipr(ipr_da = da, blast = "NA", prefix = PREFIX) + } + + file.copy(file.path(OUTDIR, paste0(PREFIX, ".ipr_domarch.tsv")), + file.path(OUTDIR, PREFIX, paste0(PREFIX, ".full_analysis.tsv"))) + } else { + # perform ipr2da on iprscan results + da <- ipr2da(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")), + PREFIX) + + ## if blast results are provided, call append_ipr + if (is.null(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv"))) | + is.na(PREFIX)) { + print("No blast results provided, moving on.") + } else { + append_ipr(ipr_da = da, + blast = file.path(OUTDIR, paste0(PREFIX, ".cln.clust.tsv")), + prefix = PREFIX) + } + } + + # Copy the input fasta file to the output directory + # file.copy(input_paths, OUTDIR) + + # Total run time + dur <- difftime(Sys.time(), start, units = "secs") + cat("\nTotal run time:", dur, "seconds\n") + + # Log the run times + logfile_path <- file.path(OUTPATH, "logfile.tsv") + log_data <- data.frame( + START_DT = format(start, "%d/%m/%Y-%H:%M:%S"), + STOP_DT = format(Sys.time(), "%d/%m/%Y-%H:%M:%S"), + PREFIX = PREFIX, + dur = dur + ) + + write.table(log_data, file = logfile_path, sep = "\t", + row.names = FALSE, col.names = FALSE, + append = TRUE, quote = FALSE) +} + +# Define the acc2info function +acc2info <- function(infile, prefix, outdir) { + # Ensure output directory exists + if (!dir.exists(outdir)) { + dir.create(outdir, recursive = TRUE) + } + + outfile <- file.path(outdir, paste0(prefix, ".acc2info.tsv")) + + # Print column names to the output file + # Print column names to outfile + + # Read the input file of accession numbers + acc_nums <- readLines(infile) + + # Random sleep to avoid overloading the server + # Sys.sleep(sample(1:10, 1)) + + # EPOST to get WebEnv and QueryKey + epost_response <- entrez_post(db = "protein", id = acc_nums) + webenv <- epost_response$WebEnv + query_key <- epost_response$QueryKey + # EFETCH to get the document summaries + docsums <- entrez_summary(db = "protein", web_history = epost_response) + + # Check if any atomic values exist in docsums + any_atomic <- FALSE + # Check if any value in the current docsum is atomic + for (docsum in docsums) { + # Check if any value in the current docsum is atomic + if (is.atomic(docsum)) { + any_atomic <- TRUE + break # Exit loop early if an atomic value is found + } + } + + + if (any_atomic) { + parsed_data <- data.frame( + AccNum = docsums$oslt$value, + AccNum.noV = docsums$caption, + FullAccNum = docsums$extra, + Description = docsums$title, + Length = docsums$slen, + TaxID = docsums$taxid, + Species = docsums$organism, + SourceDB = ifelse(is.null(docsums$sourcedb), NA, + docsums$sourcedb), + Completeness = ifelse(is.null(docsums$completeness), NA, + docsums$completeness), + stringsAsFactors = FALSE # Avoid factors in data frame + ) + } else { + # If no atomic values exist, use lapply to parse each docsum + parsed_data <- do.call(rbind, lapply(docsums, function(docsum) { + tryCatch({ + data.frame( + AccNum = docsum$oslt$value, + AccNum.noV = docsum$caption, + FullAccNum = docsum$extra, + Description = docsum$title, + Length = docsum$slen, + TaxID = docsum$taxid, + Species = docsum$organism, + SourceDB = ifelse(is.null(docsum$sourcedb), NA, + docsum$sourcedb), + Completeness = ifelse(is.null(docsum$completeness), NA, + docsum$completeness), + stringsAsFactors = FALSE # Avoid factors in data frame + ) + }, error = function(e) { + # Return a data frame filled with NA values in case of an error + return(data.frame( + AccNum = NA, + AccNum.noV = NA, + FullAccNum = NA, + Description = NA, + Length = NA, + TaxID = NA, + Species = NA, + SourceDB = NA, + Completeness = NA, + stringsAsFactors = FALSE + )) + }) + })) + } + + # Parse the fetched data + # Check if we got any data + if (nrow(parsed_data) == 0) { + cat("No data found for accession numbers in NCBI. Trying UniProt...\n") + + # Split input file into smaller chunks + split_files <- split(acc_nums, ceiling(seq_along(acc_nums) / 100)) + + for (i in seq_along(split_files)) { + accnum <- paste(split_files[[i]], collapse = ",") # Join with comma + url <- paste0("https://www.ebi.ac.uk/proteins/api/proteins?accession=", + accnum) + + response <- GET(url, accept("application/xml")) + + if (status_code(response) == 200) { + xml_content <- content(response, as = "text") + + # Parse the XML response + parsed_uniprot <- xml2::read_xml(xml_content) + + # Extract required elements (customize based on XML structure) + entries <- xml2::xml_find_all(parsed_uniprot, "//entry") + + for (entry in entries) { + accession <- xml2::xml_text(xml2::xml_find_first(entry, + ".//accession")) + full_name <- xml2::xml_text(xml2::xml_find_first(entry, + ".//fullName")) + length_seq <- xml2::xml_attr(xml2::xml_find_first(entry, + ".//sequence"), + "length") + db_reference <- xml2::xml_attr(xml2::xml_find_first(entry, + ".//dbReference"), + "id") + dataset <- xml2::xml_attr(entry, "dataset") + name <- xml2::xml_text(xml2::xml_find_first(entry, ".//name")) + + # Create a row of data + new_row <- data.frame(AccNum = accession, + AccNum.noV = gsub("\\|", "", accession), + FullAccNum = accession, + Description = full_name, + Length = as.integer(length_seq), + TaxID = db_reference, + Species = name, + SourceDB = dataset, + Completeness = "NA", # Adjust as needed + stringsAsFactors = FALSE) + + # Append to the outfile + write.table(new_row, file = outfile, sep = "\t", col.names = TRUE, + row.names = FALSE, quote = FALSE, append = TRUE) + } + } + } + } else { + # Append NCBI data to outfile + write.table(parsed_data, file = outfile, sep = "\t", col.names = TRUE, + row.names = FALSE, quote = FALSE, append = TRUE) + } + + cat("Data saved to:", outfile, "\n") +} + +acc2info_phylo <- function(infile, outdir) { + # Ensure output directory exists + if (!dir.exists(outdir)) { + dir.create(outdir, recursive = TRUE) + } + + outfile <- file.path(outdir, "acc2info.tsv") + + # Create a data frame to store results + results <- data.frame( + Caption = character(), + Extra = character(), + Title = character(), + Length = character(), + TaxID = character(), + Organism = character(), + SourceDB = character(), + Completeness = character(), + stringsAsFactors = FALSE + ) + + # Process accession numbers in batches to avoid overloading + for (acc in acc_nums) { + # Fetch data using epost and efetch + response <- GET(sprintf("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&format=xml&id=%s", + acc)) + + # Check if the request was successful + if (response$status_code == 200) { + # Parse the XML response + parsed_xml <- read_xml(content(response, "text")) + + # Extract relevant information + doc_summary <- xml_find_all(parsed_xml, ".//DocumentSummary") + + for (summary in doc_summary) { + results <- rbind(results, data.frame( + Caption = xml_text(xml_find_first(summary, ".//Caption")), + Extra = xml_text(xml_find_first(summary, ".//Extra")), + Title = xml_text(xml_find_first(summary, ".//Title")), + Length = xml_text(xml_find_first(summary, ".//Slen")), + TaxID = xml_text(xml_find_first(summary, ".//TaxId")), + Organism = xml_text(xml_find_first(summary, ".//Organism")), + SourceDB = xml_text(xml_find_first(summary, ".//SourceDb")), + Completeness = xml_text(xml_find_first(summary, ".//Completeness")), + stringsAsFactors = FALSE + )) + } + } else { + warning(sprintf("Failed to fetch data for accession %s: %s", + acc, response$status_code)) + } + } + + # Write results to the output file + write.table(results, file = outfile, sep = "\t", row.names = FALSE, + quote = FALSE) +} + +# Main function to run based on the prefix +run_acc2info <- function(infile, prefix, outdir) { + if (prefix == "NA") { + acc2info_phylo(infile, outdir) + } else { + acc2info(infile, prefix, outdir) + } +} + +substitute_accnum_for_acc2info <- function(df_acc2info, df_header_map) { + df_result <- df_header_map |> + # set column name in header map to match accnum col in acc2info + dplyr::rename(AccNum = header_accnum) |> + # join onto header map + dplyr::left_join(df_acc2info, by = "AccNum") |> + # deselect accnum + dplyr::select(-AccNum) |> + # set the accnum col to the cleaned form + dplyr::rename(AccNum = header_clean) |> + # rm excess columns from header map file + dplyr::select(-header_original) + return(df_result) +} + + +replace_accession_numbers <- function(path_acc2info, + path_query_header_map, path_out) { + + # Read the input files + df_acc2info <- read_tsv(path_acc2info) + df_query_header_map <- read_tsv(path_query_header_map) + + # Substitute accession numbers + df_acc2info_substituted <- substitute_accnum_for_acc2info(df_acc2info, + df_query_header_map) + + # Print the substituted dataframe + print("### df_acc2info_substituted") + print(df_acc2info_substituted) + + # Write the substituted dataframe to the output file + write_tsv(df_acc2info_substituted, file = path_out, col_names = TRUE) +} + + +run_deltablast <- function(infile, prefix, outdir, + db = "refseq_protein", + nhits = 5000, evalue = 1e-5, + threads = 10) { + + # Prepare output file path + # outfile <- file.path(outdir, paste0(prefix, ".dblast.tsv")) + + outfile <- paste0(outdir, "/" , prefix, ".dblast.tsv") + + # Print I/O messages + cat("\nNow processing:", infile, "\n") + cat("Running against:", db, "\n") + cat("E-value: ≤", evalue, "\n") + cat("Top", nhits, "hits/alignments\n") + cat("Output filepath:", outfile, "\n") + + # Designating database based on user input + dblast_db <- switch(db, + nr = "nr", + refseq_protein = "refseq_protein", + stop("Invalid database specified. + Choose either 'nr' or 'refseq'.")) + + # Core command for running DELTABLAST + command <- sprintf( + "deltablast -query %s -db %s -out %s -num_alignments %d -evalue %s -outfmt '6 qacc sacc sseqid sallseqid stitle sscinames staxids pident length mismatch gapopen qstart qend qlen sstart send slen evalue bitscore ppos' -remote", + infile, dblast_db, outfile, nhits, evalue + ) + # Run the DELTABLAST command + system(command, intern = TRUE) + + cat("DELTABLAST completed.\n") +} + + +# This script converts AccNum to Fasta using NCBI's EDirect or EBI's API + +convert_accnum_to_fasta <- function(infile, prefix, outdir) { + + # Create the output file path + outfile <- file.path(outdir, paste0(prefix, ".all_accnums.fa")) + + # Print status + cat("\n####################\n") + cat("BEGIN EDIRECT SEARCH\n") + cat("####################\n") + cat("Processing input file:", infile, "\n") + + # Create temporary files based on the number of columns in input file + temp_acc_file <- file.path(outdir, paste0(prefix, ".all_accnums.txt")) + cols <- length(strsplit(readLines(infile, n = 1), "\t")[[1]]) + + if (cols > 1) { + # Extract unique homolog accession numbers from the 2nd column + cat("Creating temp file with 2nd column unique accessions...\n") + system(sprintf("awk -F '\\t' '{ print $2 }' %s | sort -u > %s", infile, temp_acc_file)) + } else { + # Extract unique homolog accession numbers from the 1st column + cat("Creating temp file with 1st column unique accessions...\n") + system(sprintf("awk -F '\\t' '{ print $1 }' %s | sort -u > %s", infile, temp_acc_file)) + } + + # Split accessions into chunks of 1000 + system(sprintf("split -l 1000 -e %s %s/acc", temp_acc_file, outdir)) + + # Fetch FASTA sequences + cat("\nObtaining FASTA files\n") + acc_files <- list.files(path = outdir, pattern = "^acc", full.names = TRUE) + + for (x in acc_files) { + # Accession numbers in a comma-separated format + accnum <- paste(readLines(x), collapse = ",") + fasta_sequence <- entrez_fetch(db = "protein", id = accnum, + rettype = "fasta") + # Write the fetched sequence to the output file + writeLines(fasta_sequence, con = outfile) + } + + # Check if any sequences were retrieved + num_seqs <- as.numeric(system(sprintf("grep '>' %s | wc -l", outfile), + intern = TRUE)) + + # If no sequences found, try fetching from EBI API + if (num_seqs < 1) { + cat("\nNo sequences retrieved. Trying EBI API...\n") + for (x in acc_files) { + accnum <- paste(readLines(x), collapse = ",") + system(sprintf( + "curl -X GET --header 'Accept:text/x-fasta' 'https://www.ebi.ac.uk/proteins/api/proteins?accession=%s' >> %s", + accnum, outfile + )) + + } + } + + # Remove temporary files + cat("\nRemoving temporary files\n") + system(sprintf("rm %s/acc*", outdir)) + + # Print end message + cat("#####################\n") + cat("END OF EDIRECT SEARCH\n") + cat("#####################\n") +} + + +cleanup_blast <- function(infile_blast, acc2info, prefix, wblast = F) { + + outdir <- dirname(infile_blast) + # Load and clean acc2info file + acc2info_out <- fread(input = acc2info, sep = "\t", + header = TRUE, fill = TRUE) %>% + mutate(FullAccNum = gsub("\\|", "", FullAccNum)) %>% + mutate(FullAccNum = gsub(".*[a-z]", "", FullAccNum)) + + # If using web-blast output + if (wblast == "T") { + blast_out <- fread(input = infile_blast, sep = "\t", header = FALSE, + col.names = web_blastp_hit_colnames, fill = TRUE) + cleanedup_blast <- blast_out %>% + mutate(AccNum = gsub("\\|", "", AccNum)) %>% + mutate(AccNum = gsub(".*[a-z]", "", AccNum)) %>% + mutate(PcIdentity = round(as.double(PcIdentity), 2)) + + # Merge cleaned blast output with acc2info + cleanedup_blast <- merge(cleanedup_blast, acc2info_out, + by.x = "AccNum", + by.y = "FullAccNum", + all.x = TRUE) + names(cleanedup_blast)[names(cleanedup_blast) == "Species.y"] <- "Species" + + # Additional calculations for PcPositive + cleanedup_blast <- cleanedup_blast %>% + mutate(PcPosOrig = as.numeric(PcPosOrig)) %>% + mutate(AlnLength = as.numeric(AlnLength)) %>% + mutate(PcPositive = PcPosOrig) + + } else if (wblast == FALSE) { + # If using classic-blast output + blast_out <- read_tsv(file = infile_blast, col_names = cl_blast_colnames) + cleanedup_blast <- blast_out %>% + mutate(AccNum = gsub("\\|", "", AccNum)) %>% + mutate(AccNum = gsub(".*[a-z]", "", AccNum)) %>% + mutate(Species = gsub(";.*$", "", Species)) %>% + mutate(PcIdentity = round(PcIdentity, 2)) %>% + mutate(PcPositive = round((PcPosOrig * AlnLength / QLength), digits = 2)) + + # Merge cleaned blast output with acc2info + cleanedup_blast <- merge(cleanedup_blast, acc2info_out, + by.x = "AccNum", + by.y = "FullAccNum", + all.x = TRUE) %>% + select(-Species.x, -TaxID.x) + names(cleanedup_blast)[names(cleanedup_blast) == "Species.y"] <- "Species" + } + + # TaxID to lineage mapping + cleanedup_blast$TaxID <- as.integer(cleanedup_blast$TaxID) + lineage_map <- fread("~/awasyn/new_trial/lineage_lookup.txt", + header = TRUE, fill = TRUE, + colClasses = lineage_map_cols) + + # Merge with lineage map and clean up columns + mergedLins <- merge(cleanedup_blast, lineage_map, by = "TaxID", + all.x = TRUE) %>% + mutate(Species = Species.y, Spp.blast = Species.x) %>% + select(any_of(cl_blast_postcln_cols)) + + # Add names and prepare the output + blast_names <- addName(mergedLins) + + # Create output file name + file_name <- file.path(outdir, paste0(prefix, ".blast.cln.tsv")) + + # Write cleaned data to file + write_tsv(blast_names, file_name, col_names = TRUE) +} + + +# Function to run BLASTCLUST on given input +run_blastclust <- function(infile, suffix, outdir) { + + # Prepare output file path + outfile <- file.path(outdir, paste0(suffix, ".bclust.L60S80.tsv")) + + + # Print process messages + cat("\n#####################################\n") + cat("## Now running BLASTCLUST on file(s):", infile, "\n") + cat("#####################################\n") + + # Run BLASTCLUST + # blastclust_cmd <- paste("blastclust -i", infile, "-o", outfile, "-p T -L .6 -b T -S 80 -a 8") + cdhit_command <- sprintf( + "cd-hit -i %s -o %s -c 0.8 -aS 0.6 -T 8", + infile, outfile + ) + cat("\nPerforming BLASTCLUST analysis on", infile, "\n") + # system(blastclust_cmd) + + system(cdhit_command) + +} + +# Function to format blastclust output +clust2tbl <- function(clust, blast) { + clust_out <- read_tsv(file = clust, col_names = F) + blast_out <- read_tsv(file = blast, col_names = T) + ## Count the number of accession numbers in a cluster + # Counting number of spaces between acc. no. +1 + clust_out$NumAccs <- map(.x = clust_out$X1, function(x) { + (str_count(string = x, pattern = " ") + 1) + }) + ## Create empty vectors to store information + empty_vec <- c("ClusterID") + empty_vec2 <- c("RowNum") + ## Adding empty vectors to dataframe + clust_out[, empty_vec] <- NA + clust_out[, empty_vec2] <- NA + ## Counting number of rows to add to the RowNum column -- used for creating cluster name + rows <- as.numeric(rownames(clust_out)) %>% + str_pad(width = 4, pad = 0) + ## Add row number info to dataframe + clust_out[, "RowNum"] <- rows + # Name columns + colnam <- list("AccNum", "NoAccs", "ClusterID", "NumRows") + ## Create cluster name from row number (num of clusters) and num of accessions + clust_out <- clust_out %>% + `colnames<-`(colnam) %>% + mutate(ClusterID = paste0(NumRows, ".", NoAccs)) + + # Store data frame column as vector + myvar <- c("AccNum", "ClusterID") + ## Add only the 2 colummns wanted to a new varible + clusters <- clust_out[myvar] + # Initialize empty data frame + new_clust <- data.frame(ClusterID = character(0), + AccNum = character(0), + stringsAsFactors = F) + + ## Assigning each sseqid to a ClusterID + # Iterate over dataframe + for (i in 1:nrow(clusters)) { + # Extract the cluster name for this row + cname <- clusters$ClusterID[i] + # Split accession numbers by a space + # vals is a vecto of all accession numbers in a row + vals <- clusters$AccNum[i] %>% + strsplit(split = " ") %>% + unlist() + + # Iterate over each element in vals + for (v in vals) { + # add each accession num w/ corresponding cluster ID to new df + new_clust[nrow(new_clust) + 1, ] <- c(cname, v) + } + } + # blast_out$AccNum <- gsub("^>", "", blast_out$AccNum) + blast_out$AccNum <- paste0(">", blast_out$AccNum) + blast_clustnames <- merge(blast_out, new_clust, by = "AccNum") + + for (i in 1:nrow(blast_clustnames)) { + blast_clustnames$ClusterID[i] <- str_c(blast_clustnames$ClusterID[i]) + } + + first_prot <- as.data.frame(word(clust_out$AccNum), word(clust_out$ClusterID)) + + ## write the new file as a TSV file + newarg <- gsub(".bclust.L[0-9][0-9]S[0-9][0-9].tsv", "", clust) + # accnum + clusterID + write_tsv(new_clust, file = paste0(newarg, ".clustIDs"), append = F) + # first protein from every cluster + write_tsv(first_prot, file = paste0(newarg, ".clust_reps"), + col_names = F, append = F) + # cleaned up blast file + clusterIDs + write_tsv(blast_clustnames, file = paste0(newarg, ".cln.clust.tsv"), + col_names = T, append = F) +} + +# Function to run InterProScan +run_interproscan <- function(query_file, prefix, outdir) { + + # Start InterProScan run + cat("\n######################\n") + cat("BEGIN INTERPROSCAN RUN\n") + cat("Input file:", query_file, "\n") + cat("######################\n") + + # Output file path + outfile <- file.path(outdir, paste0(prefix, ".iprscan")) + + # Process the input query file with InterProScan + cat("Now processing", query_file, "\n") + + # Run InterProScan command + # Construct the command + command <- paste( + "/home/simple/iprdir/interproscan-5.70-102.0/interproscan.sh -i", + shQuote(query_file), + "-b", shQuote(outfile), + "-f TSV --cpu", Sys.getenv("INTERPROSCAN_CPUS", "4"), + "--appl Pfam,MobiDBlite,Phobius,Coils,SignalP_GRAM_POSITIVE,", + "SignalP_GRAM_NEGATIVE,Hamap,Gene3D,SignalP_EUK" + ) + + # Run the command + system(command) + + cat("##################\n") + cat("END OF IPRSCAN RUN\n") + cat("##################\n") +} + +ipr2lin <- function(ipr, acc2info, prefix) { + # read in iprscan results + # duplicate rows in iprscan file + ipr_in <- read_tsv(ipr, col_names = TRUE) %>% + mutate(DB.ID = gsub("G3DSA:", "", DB.ID)) + + acc2info_out <- fread(input = acc2info, sep = "\t", header = T, fill = T) %>% + mutate(FullAccNum = gsub("\\|", "", FullAccNum)) %>% + mutate(FullAccNum = gsub(".*[a-z]", "", FullAccNum)) + + # merge ipr file with acc2info file + ipr_in <- ipr_in %>% + # remove version number and any other suffices + mutate(AccNum.noV = gsub("\\.[0-9].*", "", AccNum)) + + ipr_tax <- left_join(ipr_in, acc2info_out, by = "AccNum") + + # read in lineage map + lineage_map <- fread("~/awasyn/new_trial/lineage_lookup.txt", + header = T, fill = T) + + # merge ipr+info w/ lineage + # both tables have a species column, but only + # the lineage_map (y) species column is kept + ipr_tax <- ipr_tax %>% + mutate(TaxID = as.numeric(TaxID)) + + ipr_lin <- left_join(ipr_tax, lineage_map, by = "TaxID") |> + mutate(Species = Species.y) %>% + select(-Species.x, -Species.y) + + # add lookup table to iprscan file + lookup_tbl <- fread(input = "~/awasyn/new_trial/cln_lookup_tbl.tsv", + sep = "\t", header = T, fill = T) %>% + distinct() + if ("AccNum.x" %in% names(ipr_lin)) { + ipr_lin <- ipr_lin %>% + # deselect the AccNum.y from the lineage table (y) and set + # the AccNum.x (x) from the ipr/acc2info tables to simply 'AccNum' + mutate(AccNum = AccNum.x) %>% + select(-AccNum.x, -AccNum.y) + } + # run add_name f(x) on ipr+lineage dataframe + ipr_lin <- ipr_lin %>% + addName() %>% + mutate(Name = gsub("^_", "", Name)) + + # add domarch info to iprscan + lineage df, only keep what's in x + ipr_cln <- left_join(ipr_lin, lookup_tbl, by = "DB.ID") + + # populate empty description/short name columns + for (i in 1:nrow(ipr_cln)) { + if ((is.na(ipr_cln$ShortName[i]) || ipr_cln$ShortName[i] == "") && + (is.na(ipr_cln$SignDesc[i]) || ipr_cln$SignDesc[i] == "-")) { + + ipr_cln$SignDesc[i] <- ipr_cln$IPRDesc[i] + if (length(ipr_cln$LookupTblDesc[i]) != 0) { + ipr_cln$ShortName[i] <- ipr_cln$LookupTblDesc[i] + } + } + if (is.na(ipr_cln$ShortName[i]) || ipr_cln$ShortName[i] == "") { + ipr_cln$ShortName[i] <- ipr_cln$SignDesc[i] + } + if (is.na(ipr_cln$SignDesc[i]) || ipr_cln$SignDesc[i] == "-") { + ipr_cln$SignDesc[i] <- ipr_cln$ShortName[i] + } + } + # rename unclear/duplicated columns + names(ipr_cln)[names(ipr_cln) == "Description.x"] <- "ProteinName" + names(ipr_cln)[names(ipr_cln) == "Description.y"] <- "LookupTblDesc" + # deselect the AccNum.noV from the lineage table (y) and set + # the AccNum.noV.x (x) from the ipr/acc2info tables to simply 'AccNum.noV' + ipr_cln <- ipr_cln |> dplyr::select(-AccNum.noV.y) |> + dplyr::mutate(AccNum.noV = AccNum.noV.x) |> + dplyr::select(-AccNum.noV.x) + # create label column to use in ipr2viz + ipr_cln <- ipr_cln %>% + mutate(Label = strtrim(ShortName, 30)) %>% + mutate(Label = gsub(", .*", "", Label)) %>% + mutate(Label = gsub("C-terminal region of a signal", + "C-term signal peptide", Label)) %>% + mutate(Label = gsub("N-terminal region of a signal", + "N-term signal peptide", Label)) %>% + mutate(Label = gsub("Twin arginine translocation \\(T", + "Tat signal profile", Label)) %>% + mutate(Label = gsub("GLUTATHIONE HYDROLASE PROENZYM", + "GLUTATHIONE HYDROLASE PROENZYME", Label)) %>% + mutate(Label = gsub("N-terminal nucleophile aminohy", + "Ntn hydrolases", Label)) %>% + mutate(Label = gsub("Region of a membrane-bound pro", + "cytoplasmic reg of mem-bound prot", + Label)) %>% + mutate(ShortName = gsub("Region of a membrane-bound protein predicted to be + outside the membrane, in the cytoplasm.", + "cytoplasmic reg of mem-bound prot", ShortName)) + outdir <- dirname(ipr) + # write results to file + write_tsv(ipr_cln, file.path(paste0(outdir, "/",paste0(prefix,".iprscan_cln.tsv")))) +} + +ipr2da <- function(infile_ipr, prefix, + analysis = c( + "Pfam", "SMART", "Phobius", + "Gene3D", "TMHMM", "SignalP_GRAM_POSITIVE", + "SUPERFAMILY", "MobiDBLite", "TIGRFAM", "PANTHER", "Coils" + )) { + # read in cleaned up iprscan results + ipr_in <- read_tsv(infile_ipr, col_names = T, col_types = ipr_cln_cols) + + # split dataframe into unique proteins + x <- split(x = ipr_in, f = ipr_in$AccNum) + + # plan(strategy = "multicore", .skip = T) + + # within each data.table + domarch <- map(x, function(y) { + # domarch <- future_map(x, function(y) { + acc_row <- data.frame(AccNum = y$AccNum[1], stringsAsFactors = F) + DAs <- data.frame(matrix(nrow = 1, ncol = length(analysis))) + DA <- y %>% + group_by(Analysis) %>% + arrange(StartLoc) + i <- 1 + for (a in analysis) { + a_da <- DA %>% filter(Analysis == a) + if (a == "SignalP_EUK" || a == "SignalP_GRAM_NEGATIVE" || + a == "SignalP_GRAM_POSITIVE") { + var_shortname <- "DB.ID" + } else { + var_shortname <- "ShortName" + } + var_shortname_sym <- sym(var_shortname) + a_da <- a_da %>% + ungroup() %>% + select({{ var_shortname_sym }}) %>% + filter(!is.na({{ var_shortname_sym }})) %>% + filter(!is.null({{ var_shortname_sym }})) %>% + pull(var_shortname) %>% + paste(collapse = "+") + DAs[1, i] <- a_da + i <- (i + 1) + } + + colnames(DAs) <- paste("DomArch", analysis, sep = ".") + return(cbind(acc_row, DAs)) + }) + + # select relevant rows from ipr input to add to domarch + ipr_select <- ipr_in %>% + select(Name, AccNum, Species, TaxID, Lineage, Lineage_long_na, + Lineage_long, Lineage_med, Lineage_short, ProteinName, + SourceDB, Completeness, AccNum.noV) %>% + distinct() + + # combine domarchs to one data frame, merge w/ acc2info + domarch2 <- do.call(rbind.data.frame, domarch) + + domarch_lins <- domarch2 %>% + merge(ipr_select, by = "AccNum", all.x = T) + + # save domarch_lins file + write_tsv(domarch_lins, + file = paste0(prefix, ".ipr_domarch.tsv"), + append = F, na = "NA" + ) + + # return domarch2 dataframe to append to blast results if given + return(domarch2) +} + +## function to add results from ipr2da to blast results +append_ipr <- function(ipr_da, blast, prefix) { + # ! an 'AccNum' or 'AccNum.noV' column is required in blast table for joining ! + blast_out <- read_tsv(blast, col_names = T) + if ("AccNum.noV" %in% colnames(blast_out)) { + ipr_da <- read_tsv(paste0(prefix, ".ipr_domarch.tsv"), col_names = T) + blast_ipr <- merge(blast_out, ipr_da, by = "AccNum.noV", all.x = T) + } else { + blast_ipr <- merge(blast_out, ipr_da, by = "AccNum", all.x = T) + } + + write_tsv(blast_ipr, file = paste0(prefix, ".full_analysis.tsv"), na = "NA") +} + +# Web BLAST output +web_blast_colnames <- c("Query", "AccNum", + "PcIdentity", "AlnLength", "Mismatch", "GapOpen", + "QStart", "QEnd", "SStart", "SEnd", + "EValue", "BitScore", "PcPosOrig", + "QSFrames") # specific to "blastx" + + +# BLAST Command line +cl_blast_colnames <- c("Query", "SAccNum", "AccNum", + "SAllSeqID", "STitle", "Species", "TaxID", + "PcIdentity", "AlnLength", "Mismatch", "GapOpen", + "QStart", "QEnd", "QLength", + "SStart", "SEnd", "SLength", + "EValue", "BitScore", "PcPosOrig", + "PcPositive", "ClusterID") # post-cleanup + +# IPRSCAN (web+command-line) +ipr_colnames <- c("AccNum", "SeqMD5Digest", "SLength", "Analysis", + "DB.ID", "SignDesc", "StartLoc", "StopLoc", "Score", + "Status", "RunDate", "IPRAcc", "IPRDesc") + +# RPSBLAST +rps_colnames <- c("AccNum", "DBID", "DBSeqID", + "PcIdentity", "PcPosOrig", # Ppos missing + "AlnLength", "Mismatch", + # Q here is Subject; S here is the matching domain; rename! + "SStart", "SEnd", "DStart", "DEnd", + "EValue", "BitScore", "TaxID") # TaxID missing (NA); remove? + +# IPG +ipg_colnames <- c("IPG.ID", "Source", "NucAccNum", + "NucStart", "NucStop", "Strand", + "AccNum", "ProtDesc", + "Species", "SppStrain", "AssemblyID") +# Final ColNames +combo_colnames <- c("Query", "AccNum", "Species", "TaxID", "Lineage", + "PcPositive", "ClusterID", + # "Leaf", # MISSING (useful for all dataviz) + # "AssemblyID", "GeneName", "ProtDesc", # MISSING NOW!?! + "DomArch.Pfam", "DomArch.COG", "DomArch.Gene3D", + "DomArch.TMHMM", "DomArch.Phobius", "DomArch.SignalP") + + +## ############ ## +## COLUMN names ## +## ############ ## +## BLAST +############ +## Web-BLAST +############ +## Downloaded as HIT-TABLE csv +# BLASTP and related protein BLASTs +web_blastp_hit_colnames <- c( + "Query", "AccNum", + "PcIdentity", "AlnLength", "Mismatch", "GapOpen", + "QStart", "QEnd", "SStart", "SEnd", + "EValue", "BitScore", "PcPosOrig" +) +# BLASTX +web_blastx_colnames <- c( + "Query", "AccNum", + "PcIdentity", "AlnLength", "Mismatch", "GapOpen", + "QStart", "QEnd", "SStart", "SEnd", + "EValue", "BitScore", "PcPosOrig", + "QSFrames" +) # specific to "blastx" + +## Downloaded as Descriptions csv +# BLASTP and related protein BLASTs +web_blastp_desc_colnames <- c( + "Description", "Species", "CommonName", "TaxID", + "BitScore", "TotalScore", + "PcQCover", "EValue", "PcIdentity", + "SLen", "AccNum" +) + +##################### +## Command line BLAST +##################### + +# pre-cleanup +cl_blast_colnames <- c( + "Query", "SAccNum", "AccNum", + "SAllSeqID", "STitle", "Species", "TaxID", + "PcIdentity", "AlnLength", "Mismatch", "GapOpen", + "QStart", "QEnd", "QLength", + "SStart", "SEnd", "SLength", + "EValue", "BitScore", "PcPosOrig" +) + +# post-cleanup +cl_blast_postcln_cols <- c( + "Query", "AccNum", + "STitle", "Species", "TaxID", "Lineage", "Lineage_long", + "Lineage_long_na", "Lineage_med", "Lineage_short", + "PcPositive", "PcIdentity", "AlnLength", + "SAccNum", "SAllSeqID", + "Mismatch", "GapOpen", + "QStart", "QEnd", "QLength", + "SStart", "SEnd", "SLength", + "EValue", "BitScore", "PcPosOrig", "QueryName" +) + +########## +## IPRSCAN +########## +# ipr_colnames_orig <- c("AccNum", "Seq_MD5_digest", "SeqLen", "Analysis", +# "DB_ID", "SignDesc", "StartLoc", "StopLoc", "Score", +# "Status", "RunDate", "IPRAcc", "IPRDesc") + +ipr_colnames <- c( + "AccNum", "SeqMD5Digest", "SLength", "Analysis", + "DB.ID", "SignDesc", "StartLoc", "StopLoc", "Score", + "Status", "RunDate", "IPRAcc", "IPRDesc" +) + +# post cleanup +########################## +## NEED TO BE REORDERED ## +########################## +ipr_cln_colnames <- c( + "DB.ID", "TaxID", "AccNum.noV", "AccNum", + "SeqMD5Digest", "SLength", "Analysis", "SignDesc", + "StartLoc", "StopLoc", "Score", "Status", "RunDate", + "IPRAcc", "IPRDesc", "FullAccNum", "ProteinName", + "Length", "SourceDB", "Completeness", "Lineage", + "Species", "Name", "ShortName", "LookupTblDesc", + "ID", "Label" +) + +########### +## RPSBLAST +########### +rps_colnames <- c( + "AccNum", "DB.ID", "DBSeqID", + "PcIdentity.Dom", "PcPosOrig.Dom", # "PcPos.Dom", # Ppos missing + "AlnLength", "Mismatch", + "SStart", "SEnd", "DStart", "DEnd", + "EValue", "BitScore", "TaxID" +) # TaxID missing (NA); remove? + +####################### +## IPG and Lineage maps +####################### +ipg_colnames <- c( + "IPG.ID", "Source", "NucAccNum", + "NucStart", "NucStop", "Strand", + "AccNum", "Description", + "Species", "Spp.Strain", "AssemblyID" +) + +################## +## Assembly files +## Genbank, Refseq +################## +assembly_colnames <- c( + "AssemblyID", + "bioproject", "biosample", "wgs_master", # not used + "RefseqCategory", "TaxID", "Spp.TaxID", + "Species", "Spp.Strain", + "isolate", "version_status", # not used + "assembly_level", "release_type", # not used + "GenomeStatus", + "seq_rel_date", "asm_name", "submitter", # not used + "AssemblyID.GBRS", + "paired_asm_comp", "ftp_path", # not used + "excluded_from_refseq", "relation_to_type_material" +) # not used +assembly_sub_colnames <- c( + "TaxID", "Spp.TaxID", "Species", "Spp.Strain", + "RefseqCategory", "GenomeStatus", + "AssemblyID", "AssemblyID.GBRS" +) + +################# +## Lookup tables +## in common_data +################# +lineage_lookup_colnames <- c("TaxID", "Species", "Lineage_long", + "Lineage_long_na", "Lineage_med", + "Lineage_short", "Lineage") +domarch_lookup_colnames <- c("DB.ID", "ShortName", "Description", "ID") +# !! SC and LS will fix other piecemeal files based on these + +###################### +## FINAL UPLOADED DATA +###################### +## Combined data frame that is loaded on to the webapp +combo_colnames <- c( + "Query", "UID", "AccNum", "Species", "TaxID", "Lineage", + "PcPositive", "ClusterID", "QueryName", + # "AssemblyID", "GeneName", "Description", # MISSING NOW!?! + "DomArch.Pfam", "DomArch.COG", "DomArch.Gene3D", + "DomArch.TMHMM", "DomArch.Phobius", "DomArch.SignalP", + "DomArch.SMART", "DomArch.TIGR" +) + + +################ +## read tsv colnames +################ +lookup_table_cols <- cols( + DB.ID = col_character(), + ShortName = col_character(), + Description = col_character(), + ID = col_character() +) + +iprscan_cols <- cols( + .default = col_character(), + TaxID = col_double(), + SLength = col_double(), + SignDesc = col_character(), + StartLoc = col_double(), + StopLoc = col_double(), + Score = col_double(), + Status = col_logical(), + IPRAcc = col_character(), + IPRDesc = col_character(), + Length = col_double(), + ShortName = col_character(), + LookupTblDesc = col_character(), + ID = col_character(), + Label = col_character() +) + +ipr_cln_cols <- cols( + .default = col_character(), + TaxID = col_double(), + SLength = col_double(), + StartLoc = col_double(), + StopLoc = col_double(), + Score = col_double(), + Status = col_logical(), + IPRAcc = col_logical(), + IPRDesc = col_logical(), + Length = col_double(), + ID = col_logical() +) + +lineage_map_cols <- c( + "double", + "character", + "character", "character", "character", "character", "character" +) diff --git a/inst/report/scripts/viz_utils.R b/inst/report/scripts/viz_utils.R new file mode 100644 index 00000000..d721855b --- /dev/null +++ b/inst/report/scripts/viz_utils.R @@ -0,0 +1,742 @@ +# Author(s): Awa Synthia +# Last modified: 2024 + +# Load necessary packages +library(dplyr) +library(stringr) +library(visNetwork) +library(DT) +library(plotly) + +# Function to generate the InterProScan Visualization +generate_ipr_genes_visualization <- function(data, app_data, + input_rs_iprDatabases, + input_rs_iprVisType) { + + # Check if analysis is loaded + if (nrow(data@df) == 0 || app_data@ipr_path == "") { + stop("Analysis data is not loaded properly or ipr_path is missing.") + } + + # If there is no cln_path or PcPositive column is NULL + if (length(data@cln_path) == 0 || is.null(data@df$PcPositive)) { + + # Set column name for accessing based on the data + n <- if ("name" %in% colnames(data)) { + "name" + } else { + "AccNum" + } + n <- "Name" # Hardcoded to "Name" based on original code + + # Call the `ipr2viz_web` function + ipr_plot <- ipr2viz_web( + infile_ipr = data@ipr_path, + accessions = data@df$Name, + analysis = input_rs_iprDatabases, + group_by = input_rs_iprVisType, + name = n + ) + + } else { + + # Call the `ipr2viz` function with additional arguments + ipr_plot <- ipr2viz( + infile_ipr = data@ipr_path, + infile_full = data@df, + accessions = unique(data@df$Name), + analysis = input_rs_iprDatabases, + group_by = input_rs_iprVisType, + topn = 20, # This value is hardcoded in the original code + query = "All" + ) + } + + # Return the plot object for further use + return(ipr_plot) +} + +# Function to generate the domain network layout visualization +generate_rs_network_layout <- function(data, app_data, + cutoff = 100, + layout = "nice") { + + # Check if analysis is loaded and app data has a valid ipr_path + if (nrow(data@df) == 0 || app_data@ipr_path == "") { + stop("Analysis not loaded or ipr_path missing.") + } + + # Extract column names and find the first matching "DomArch" column + cols <- colnames(data@df) + if (is.null(cols)) { + stop("Dataframe columns are missing.") + } + + col <- cols[grepl("DomArch.*", cols)][1] + if (is.null(col)) { + stop("No domain architecture column found.") + } + + # Clean up domain architecture columns in the data + df_data <- data@df %>% + mutate(across(tidyselect::starts_with("DomArch"), clean_string)) + + # Generate the network using the domain_network function + res_network <- domain_network( + df_data, + column = col, + domains_of_interest = ".*", + cutoff = cutoff, + layout = layout + ) + + # Validate that the result is not an error + if (res_network == "error") { + stop("Not enough nodes to construct a network.") + } + + return(res_network) +} + +# Function to generate the data table +generate_data_table <- function(data) { + if (nrow(data@df) == 0) { + stop("No data available. + Please ensure you have uploaded your data correctly.") + } + + # Define the columns to be shown + viewing_cols <- c( + "AccNum", "QueryName", "Name", "Lineage", "Species", + "Length", "PcPositive", "DomArch.Pfam" + ) + + # Extract the data + d <- data@df + + # Identify columns to hide (those not in viewing_cols) + hide_cols <- which(!colnames(d) %in% viewing_cols) - 1 + + # Add hyperlinks to the "AccNum" column + d$AccNum <- paste0("", d$AccNum, "") + + # Create the DataTable + dt <- DT::datatable(d, + rownames = FALSE, + filter = "top", # Enable filtering + extensions = c("Buttons"), # Enable buttons (e.g., CSV export) + escape = FALSE, # Allow HTML rendering (for the hyperlinks) + options = list( + dom = "frlBtip", # Layout for the table (filters, buttons, etc.) + pageLength = 10, # Number of rows per page + paging = TRUE, # Enable pagination + # Enable regex search + search = list(caseInsensitive = TRUE, regex = TRUE), + language = list( + searchPlaceholder = "Regex or Text...", + filterPlaceholder = "Test" + ), + buttons = list( + list( + extend = "colvis", + text = "Add/remove column(s)" + ), + list( + extend = "csv", + text = "Download", + filename = "molevolvr-data", + exportOptions = list( + # Export all data, not just visible page + modifier = list(page = "all") + ) + ) + ), + scrollX = FALSE, # Disable horizontal scrolling + fixedHeader = FALSE, # Disable fixed header + columnDefs = list( + # Hide columns not in 'viewing_cols' + list(visible = FALSE, targets = hide_cols) + ) + ) + ) + + return(dt) +} + +# Function to generate query data table +generate_query_data_table <- function(query_data, query_select = NULL) { + + # Check if analysis is loaded and data is available + if (nrow(query_data@df) == 0) { + stop("No data available. Please ensure you have uploaded your data + correctly.") + } + + # Define the columns to be shown + viewing_cols <- c("QueryName", "Species", "Lineage", "DomArch.Pfam") + + # If no specific queries are selected, display all data + if (is.null(query_select)) { + d <- query_data@df + } else { + # Filter the data for selected queries + d <- query_data@df %>% filter(QueryName %in% query_select) + d <- droplevels(d) + } + + # Identify columns to hide (those not in viewing_cols) + hide_cols <- which(!colnames(d) %in% viewing_cols) - 1 + + # Add hyperlinks to the "Query" column (replace 'Query' with the actual + # column name if different) + d$Query <- paste0("", d$Query, "") + + # Create and return the DataTable + dt <- DT::datatable(d, + rownames = FALSE, + filter = "top", # Enable filtering + extensions = c("Buttons"), # Enable buttons (e.g., CSV export) + callback = DT::JS(c( + '$(\'div.has-feedback input[type="search"]\').attr( "placeholder", "Search..." );' + )), + escape = FALSE, # Allow HTML rendering (for the hyperlinks) + options = list( + dom = "frlBtip", # Layout for the table (filters, buttons, etc.) + pageLength = 25, # Number of rows per page + paging = TRUE, # Enable pagination + # Enable regex search + search = list(caseInsensitive = TRUE, regex = TRUE), + language = list( + searchPlaceholder = "Regex or Text..." + ), + buttons = list( + list( + extend = "colvis", + text = "Add/remove column(s)" + ), + list( + extend = "csv", + text = "Download", + filename = "molevolvr-query", + exportOptions = list( + # Export all data, not just visible page + modifier = list(page = "all") + ) + ) + ), + scrollX = FALSE, # Disable horizontal scrolling + fixedHeader = FALSE, # Disable fixed header + columnDefs = list( + # Hide columns not in 'viewing_cols' + list(visible = FALSE, targets = hide_cols) + ) + ) + ) + + return(dt) +} + +# Function to read and return the FASTA file contents +read_fasta_data <- function(fasta_path) { + + # Check if analysis is loaded and the file path is not empty + if (fasta_path == "" || !file.exists(fasta_path)) { + stop("FASTA file path is invalid or the file does not exist.") + } + + # Read the content of the FASTA file + fasta_content <- read_file(fasta_path) + + return(fasta_content) +} + +get_fasta_data <- function(fasta_path) { + if (is.null(fasta_path) || fasta_path == "") { + stop("Error: FASTA path is not provided.") + } + return(read_file(fasta_path)) # Read and return the FASTA data +} + +# Function to get domain sequences (assumes `data` is a predefined object) +get_domain_data <- function() { + return(data@domainSeqs) # Return domain sequences +} + +# Function to get MSA data from a given path +get_msa_data <- function(msa_path) { + if (is.null(msa_path) || msa_path == "") { + stop("Error: MSA path is not provided.") + } + return(read_file(msa_path)) +} + +# Function to generate a heatmap +generate_query_heatmap <- function(query_data_df, + heatmap_select = "All", + heatmap_color = "blue") { + + # Check if analysis is loaded and query data exists + if (nrow(query_data_df) == 0) { + stop("No query data available.") + } + + # Filter queries based on user selection + if (heatmap_select == "All") { + queries <- unique(query_data_df$QueryName) + prot <- query_data_df %>% + filter(Lineage != "") %>% + tidyr::drop_na(Lineage) + } else { + queries <- heatmap_select + prot <- query_data_df %>% + filter(grepl(heatmap_select, QueryName, ignore.case = TRUE)) %>% + filter(Lineage != "") %>% + tidyr::drop_na(Lineage) + } + + # Validate that the Lineage column has values + if (all(unique(prot$Lineage) == "")) { + stop("Lineage column not found for selected proteins. + See the FAQ for possible reasons/solutions.") + } + + # Assuming `lineage.Query.plot` is a custom function for plotting + lineage.Query.plot(prot, queries = queries, colname = "QueryName", + cutoff = 100, color = heatmap_color) +} + +# Function to retrieve domain architecture columns +get_domarch_columns <- function(query_data_df) { + # Check if query data exists + if (nrow(query_data_df) == 0) { + stop("No query data available.") + } + + # Get column names + cols <- colnames(query_data_df) + + # Filter domain architecture columns, excluding repeats + domarch_cols <- cols[grepl("^DomArch", cols) & !grepl("repeats$", cols)] + + # Identify columns that are completely NA + na_cols <- names(query_data_df)[apply(query_data_df, 2, + function(x) all(is.na(x)))] + + # Remove NA columns from domain architecture columns + domarch_cols <- setdiff(domarch_cols, na_cols) + + # Include SignalP_GRAM_POSITIVE if present + if ("SignalP_GRAM_POSITIVE" %in% domarch_cols) { + domarch_cols <- setdiff(domarch_cols, "SignalP_GRAM_POSITIVE") # Remove first + domarch_cols <- append(domarch_cols, "SignalP_GRAM_POSITIVE") # Append it last + } + + # Remove prefix from column names + domarch_cols <- substring(domarch_cols, first = 9) + + return(domarch_cols) +} + +# Function to generate main data table +generate_main_table <- function(data_df, main_select = NULL) { + # Validate input data + if (nrow(data_df) == 0) { + stop("No data available. Please ensure you have uploaded your data + correctly. See Help documentation or contact JRaviLab + (janani.ravi[AT]cuanschutz[DOT]edu).") + } + + # Define columns to view + viewing_cols <- c("AccNum", "QueryName", "Name", "Lineage", + "Species", "Length", "PcPositive", + "DomArch.Pfam") + + # Filter data based on selection + if (!is.null(main_select)) { + d <- data_df %>% + filter(QueryName %in% main_select) %>% + droplevels() + } else { + d <- data_df + } + + # Identify columns to hide + hide_cols <- which(!colnames(d) %in% viewing_cols) + hide_cols <- hide_cols - 1 + + # Create hyperlinks for AccNum + d$AccNum <- paste0("", d$AccNum, "") + + # Generate DataTable + datatable_output <- DT::datatable(d, + rownames = FALSE, + filter = "top", + callback = DT::JS(c( + '$(\'div.has-feedback input[type="search"]\').attr( "placeholder", "Search..." );' + )), + extensions = c("Buttons"), + escape = FALSE, + options = list( + dom = "frlBtip", + pageLength = 25, + paging = TRUE, + search = list(caseInsensitive = TRUE, regex = TRUE), + language = list(searchPlaceholder = "Regex or Text...", + filterPlaceholder = "Test"), + buttons = list( + list(extend = "colvis", text = "Add/remove column(s)"), + list( + extend = "csv", + text = "Download", + filename = "molevolvr-homolog", + exportOptions = list(modifier = list(page = "all")) + ) + ), + scrollX = FALSE, + fixedHeader = FALSE, + columnDefs = list(list(visible = FALSE, targets = hide_cols)) + ) + ) + + return(datatable_output) +} + +# Function to generate Domain Architecture Linear Table +generate_DA_lin_table <- function(DA_col, ipr_path, DAlin_count_table_DT) { + # Check if ipr_path is valid + if (ipr_path == "") { + stop("InterPro path is empty.") + } + + # Check if DA_col is provided + if (is.null(DA_col)) { + stop("DA_col input is required.") + } + + # Generate the DAlin count table + DAlin_table <- DAlin_count_table_DT() + + return(DAlin_table) +} + +# Function to generate the Domain Architecture Lineage Plot +generate_DA_heatmap_plot <- function(DA_col, DACutoff, + DA_Prot, DA_lin_color, + analysis_loaded, ipr_path) { + # Check if ipr_path is valid + if (ipr_path == "") { + stop("InterPro path is empty.") + } + + # Filter the protein data for plotting + prot <- DA_Prot() %>% + filter(Lineage != "") %>% + drop_na(Lineage) + + # Create the plot + plot <- lineage.DA.plot( + prot, + colname = DA_col, + cutoff = DACutoff, + RowsCutoff = FALSE, + color = DA_lin_color + ) + + # Convert ggplot to plotly object + plot <- plotly::ggplotly(plot) + plot <- plot %>% plotly::layout(xaxis = list(side = "top")) + + return(plot) +} + +# Function to generate the Domain Architecture Network +generate_domain_network <- function(DA_col, DACutoff, DA_Prot, + networkLayout, ipr_path) { + # Check if ipr_path is valid + if (ipr_path == "") { + stop("InterPro path is empty.") + } + + # Prepare the selected protein data + dn_data <- DA_Prot() + col <- sym(DA_col) # Convert DA_col to a symbol for use with dplyr + dn_data[[col]] <- str_replace_all(dn_data[[col]], " ", "_") + + # Filter data based on the specified column + dn_data <- dn_data %>% + drop_na(!!col) %>% + filter(!!col != "") + + # Generate the domain network + res <- domain_network( + prot = dn_data, + column = col, + domains_of_interest = ".*", + cutoff = DACutoff, + layout = networkLayout + ) + + # Validate the result + if (res == "error") { + stop("Not enough nodes to construct a network. + Try increasing 'Total Count Cutoff'.") + } + + # Add export functionality + visNetwork::addExport(res) + res <- res %>% visExport( + type = "png", + name = "domain_architecture-network", + float = "left", + label = "Download plot", + style = " + background-color: #ffffff; + color: #333333; + border-color: #cccccc; + padding: 6px 12px; + font-size: 14px; + line-height: 1.42857143; + border-radius: 4px; + cursor: pointer; + display: inline-block; + margin-bottom: 0; + font-weight: normal; + text-align: center; + vertical-align: middle; + touch-action: manipulation; + user-select: none; + background-image: none; + text-decoration: none; + " + ) + + return(res) +} + +# Function to retrieve and clean Domain Architecture data +get_DA_Prot <- function(app_data, validate_da, DASelect) { + # Check if the ipr_path is valid + if (app_data@ipr_path == "") { + stop("InterPro path is empty.") + } + + # Validate domain architecture + validate_da() + + # Retrieve app data and clean domain architecture columns + df_app_data <- app_data@df + + # Domain architecture column cleanup + df_app_data <- df_app_data %>% + mutate(across(starts_with("DomArch"), clean_string)) + + # Filter based on user selection + if (DASelect == "All") { + return(df_app_data) + } else { + return(df_app_data %>% filter(grepl(DASelect, QueryName, + ignore.case = TRUE))) + } +} + +# Function to retrieve domain architecture columns +get_domarch_cols <- function(app_data, DASelect) { + # Check if app data DataFrame is not empty + if (nrow(app_data@df) <= 0) { + stop("No data available in app data.") + } + + # Check if ipr_path is valid + if (app_data@ipr_path == "") { + stop("InterPro path is empty.") + } + + # Get column names + cols <- colnames(app_data@df) + + # Filter domain architecture columns + domarch_cols <- cols[grepl("^DomArch", cols) & !grepl("repeats$", cols)] + + # Retrieve the data frame + domarch_data <- app_data@df + + # Filter data frame based on user selection + if (DASelect != "All") { + domarch_data <- domarch_data %>% filter(QueryName == DASelect) + } + + # Identify columns that are completely NA or empty + na_cols <- apply(domarch_data, 2, function(x) { + all(is.na(x)) || all(x == "") + }) + na_cols <- names(domarch_data)[na_cols] + + # Remove NA columns from the domain architecture columns + domarch_cols <- setdiff(domarch_cols, na_cols) + + # Ensure "SignalP_GRAM_POSITIVE" is included if it exists + if ("SignalP_GRAM_POSITIVE" %in% domarch_cols) { + domarch_cols <- append(domarch_cols[!domarch_cols %in% + "SignalP_GRAM_POSITIVE"], + "SignalP_GRAM_POSITIVE") + } + + return(domarch_cols) +} + +# Function to generate the IPR genes plot +generate_da_ipr_genes_plot <- function(app_data, da_iprDatabases, + da_iprVisType, DASelect) { + + if (app_data@ipr_path == "") { + stop("IPR path is not set.") + } + + # Validate the data frame + df <- app_data@df + if (nrow(df) == 0) { + stop("No data available. Please ensure you have uploaded + your data correctly.") + } + + if (is.null(da_iprDatabases) || length(da_iprDatabases) == 0) { + stop("Please select an analysis.") + } + + # Determine the name to use + if (length(app_data@cln_path) == 0 || is.null(df$PcPositive)) { + name_column <- ifelse("name" %in% colnames(df), "name", "AccNum") + name_column <- "Name" + + # Generate the plot using the web version + plot <- ipr2viz_web( + infile_ipr = app_data@ipr_path, + accessions = df$Name, + analysis = da_iprDatabases, + group_by = da_iprVisType, + name = name_column + ) + } else { + # Generate the plot using the local version + plot <- ipr2viz( + infile_ipr = app_data@ipr_path, + infile_full = df, + accessions = unique(df$Name), + analysis = da_iprDatabases, + group_by = da_iprVisType, + topn = 20, + query = DASelect + ) + } + + return(plot) +} + +# Function to filter proteins for phylogeny +filter_phylogeny_proteins <- function(app_data, phylo_select) { + # Validate the input app_data + if (!analysis_loaded()) { + stop("Analysis not loaded.") + } + + # Get the data frame from app_data + df <- app_data@df + + # Check if the data frame is empty + if (nrow(df) == 0) { + stop("No data available in app_data. + Please ensure it has been loaded correctly.") + } + + # Filter the data based on user selection + if (phylo_select == "All") { + filtered_df <- df + } else { + filtered_df <- df %>% + filter(grepl(phylo_select, QueryName, ignore.case = TRUE)) %>% + filter(Lineage != "") + } + + return(filtered_df) +} + +# Function to retrieve representative accession numbers +get_representative_accession_numbers <- function(app_data, phylo_select, + msa_reduce_by, msa_rep_num, + rval_phylo) { + + if (rval_phylo()) { + return(app_data@df$AccNum) + } else { + switch(msa_reduce_by, + "Species" = rep_acc_species(), + "Lineage" = rep_acc_lineage(), + "DomArch" = { + if (app_data@ipr_path == "") { + stop("IPR path is empty. Please ensure it is set correctly.") + } + seqs <- find_top_acc(infile_full = app_data@df, + n = msa_rep_num, + query = phylo_select) + if (is.null(seqs)) { + stop("No sequences found.") + } + return(seqs) + }, + "Full" = { + tmp <- app_data@df %>% filter(QueryName == phylo_select) + return(tmp$AccNum) + }, + stop("Invalid selection for msa_reduce_by.") + ) + } +} + +generate_domain_architecture_plot <- function(ipr_path, query_names, + analysis_type, group_by) { + # Check if the input path is provided + if (is.null(ipr_path) || ipr_path == "") { + stop("Error: Input path for IPR data is not provided.") + } + + # Validate that at least one domain is found + if (length(query_domarch_cols()) < 1) { + stop("Error: No domains found in the input sequences.") + } + + n <- "Name" + plot <- ipr2viz_web( + infile_ipr = ipr_path, + accessions = query_names, + analysis = analysis_type, + group_by = group_by, + name = n + ) + + return(plot) # Return the generated plot +} + +# Function to convert accessions to names +acc_to_name <- function(app_data) { + # Check if "AccNum" is a column in the data + if (!("AccNum" %in% colnames(app_data@df))) { + stop("Column 'AccNum' not found in data.") + } + + # Check if "Name" column exists and create the output data frame + if ("Name" %in% colnames(app_data@df)) { + # Select "AccNum" and "Name" + df <- select(app_data@df, "AccNum", "Name") + } else { + # Create a data frame with "AccNum" and assign "AccNum" to "Name" + df <- app_data@df %>% + select("AccNum") %>% + mutate(Name = AccNum) + } + + return(df) +} + From 36d0e4f7da73f2f1e71db0d58744555ad85ecc5d Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Sat, 26 Oct 2024 12:17:19 +0300 Subject: [PATCH 02/23] change path to local files Signed-off-by: Awa Synthia --- inst/report/scripts/run_molevolvr_pipeline.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R index 0ddc2761..b35cd9f2 100644 --- a/inst/report/scripts/run_molevolvr_pipeline.R +++ b/inst/report/scripts/run_molevolvr_pipeline.R @@ -702,7 +702,7 @@ cleanup_blast <- function(infile_blast, acc2info, prefix, wblast = F) { # TaxID to lineage mapping cleanedup_blast$TaxID <- as.integer(cleanedup_blast$TaxID) - lineage_map <- fread("~/awasyn/new_trial/lineage_lookup.txt", + lineage_map <- fread("~/data/lineage_lookup.txt", header = TRUE, fill = TRUE, colClasses = lineage_map_cols) @@ -841,7 +841,7 @@ run_interproscan <- function(query_file, prefix, outdir) { # Run InterProScan command # Construct the command command <- paste( - "/home/simple/iprdir/interproscan-5.70-102.0/interproscan.sh -i", + "~/interproscan-5.70-102.0/interproscan.sh -i", shQuote(query_file), "-b", shQuote(outfile), "-f TSV --cpu", Sys.getenv("INTERPROSCAN_CPUS", "4"), @@ -875,7 +875,7 @@ ipr2lin <- function(ipr, acc2info, prefix) { ipr_tax <- left_join(ipr_in, acc2info_out, by = "AccNum") # read in lineage map - lineage_map <- fread("~/awasyn/new_trial/lineage_lookup.txt", + lineage_map <- fread("~/data/lineage_lookup.txt", header = T, fill = T) # merge ipr+info w/ lineage @@ -889,7 +889,7 @@ ipr2lin <- function(ipr, acc2info, prefix) { select(-Species.x, -Species.y) # add lookup table to iprscan file - lookup_tbl <- fread(input = "~/awasyn/new_trial/cln_lookup_tbl.tsv", + lookup_tbl <- fread(input = "~/data/cln_lookup_tbl.tsv", sep = "\t", header = T, fill = T) %>% distinct() if ("AccNum.x" %in% names(ipr_lin)) { From f404cad1657f9ecd0c919801f450c87a1f3abf2a Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Sat, 26 Oct 2024 12:20:01 +0300 Subject: [PATCH 03/23] chane dupload_type to upload_type:) Signed-off-by: Awa Synthia --- inst/report/scripts/generate_report.R | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R index c8b4301f..59c02b76 100644 --- a/inst/report/scripts/generate_report.R +++ b/inst/report/scripts/generate_report.R @@ -73,7 +73,7 @@ get_card_data <- function(pathogen = NULL, drug = NULL) { # Run analysis run_analysis <- function( - dupload_type = "Fasta", + upload_type = "Fasta", evalue = 0.00001, accnum_fasta_input = "", file_paths = list(accnum = NULL, fasta = tempfile(), msa = tempfile(), @@ -114,8 +114,8 @@ run_analysis <- function( fasta_path = file_paths$fasta) query_data <- new("queryData") - if (length(dupload_type) != 1) { - stop("dupload_type must be a single value.") + if (length(upload_type) != 1) { + stop("upload_type must be a single value.") } # Reset default analysis function @@ -129,8 +129,8 @@ run_analysis <- function( fasta_set <- c("Fasta", "AccNum", "MSA") # Update settings based on upload type - updateUploadType <- function(dupload_type) { - switch(dupload_type, + updateUploadType <- function(upload_type) { + switch(upload_type, "Fasta" = { resetSettings() acc_homology_analysis <<- TRUE @@ -157,7 +157,7 @@ run_analysis <- function( ) } - updateUploadType(dupload_type) + updateUploadType(upload_type) ####### File Upload Functions ######## @@ -239,7 +239,7 @@ run_analysis <- function( if (phylo) { # Validate phylogenetic analysis based on upload type is_valid_phylo <- switch( - dupload_type, + upload_type, "Fasta" = { str_count(string = sequence_upload_data@seqs, ">") > 1 }, @@ -261,7 +261,7 @@ run_analysis <- function( } } # Fasta-like submissions - if (dupload_type %in% fasta_set) { + if (upload_type %in% fasta_set) { # Can have any combination of select options type <- "" script <- "" @@ -308,7 +308,7 @@ run_analysis <- function( path <- paste0(dir, "/", pin_id, ".fa") # Adding validation logic based on upload type system(paste0("mkdir ", dir), wait = TRUE) - switch(dupload_type, + switch(upload_type, "Fasta" = { # Validate sequence limit if (str_count(sequence_upload_data@seqs, ">") > 200) { @@ -458,7 +458,7 @@ run_analysis <- function( } # Additional handling for different upload types - if (dupload_type == "BLAST Output") { + if (upload_type == "BLAST Output") { dir <- paste0(OUT_PATH, pin_id, "_blast") if (blast_upload_data@df == "") { stop("Error: Please upload a BLAST file.") @@ -511,7 +511,7 @@ run_analysis <- function( ) } - } else if (dupload_type == "InterProScan Output") { + } else if (upload_type == "InterProScan Output") { dir <- paste0(OUT_PATH, pin_id, "_ipr") path <- paste0(dir, "/", pin_id, "_ipr.tsv") if (ipr_upload_data()@df == "") { From f9d2c654043ff6c308a30e0133a11ae522631fbc Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Fri, 1 Nov 2024 09:06:28 +0200 Subject: [PATCH 04/23] update pipeline Signed-off-by: Awa Synthia --- inst/report/scripts/generate_report.R | 67 ++++-- inst/report/scripts/run_molevolvr_pipeline.R | 28 +-- inst/report/scripts/viz_utils.R | 206 ++++++++++++++++--- 3 files changed, 244 insertions(+), 57 deletions(-) diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R index 59c02b76..7dd89030 100644 --- a/inst/report/scripts/generate_report.R +++ b/inst/report/scripts/generate_report.R @@ -2,6 +2,7 @@ # Last modified: 2024 # get fasta of pathogen and/or drug +#' @export get_card_data <- function(pathogen = NULL, drug = NULL) { destination_dir <- "CARD_data" # Check if CARD data exists @@ -72,6 +73,7 @@ get_card_data <- function(pathogen = NULL, drug = NULL) { } # Run analysis +#' @export run_analysis <- function( upload_type = "Fasta", evalue = 0.00001, @@ -94,12 +96,15 @@ run_analysis <- function( GCCutoff = 0.5, query_select = NULL, query_iprDatabases = NULL, - query_iprVisType = NULL, tree_msa_tool = "ClustalO", + query_iprVisType = "Analysis", tree_msa_tool = "ClustalO", levels = 2, DA_Col = "DomArch.Pfam", msa_rep_num = NULL, msa_reduce_by = "Species", rval_phylo = FALSE, + DA_lin_color = c("default", "viridis", "inferno", "magma", "plasma", "cividis"), + networkLayout = c("nice", "grid", "circle", "random"), + phylo_select = "All", ... ) { @@ -570,11 +575,11 @@ run_analysis <- function( # Assign fetched data to objects if the fetched list has the expected length if (length(fetched) == 2) { - data <<- fetched[[1]] # Assign the first element to 'data' - app_data <<- fetched[[1]] # Assign the same to 'app_data' - query_data <<- fetched[[2]] # Assign the second element to 'query_data' + data <- fetched[[1]] # Assign the first element to 'data' + app_data <- fetched[[1]] # Assign the same to 'app_data' + query_data <- fetched[[2]] # Assign the second element to 'query_data' - r_nrow_initial(nrow(fetched[[1]]@df)) # Initialize row count + r_nrow_initial <- nrow(fetched[[1]]@df) # Initialize row count } domarch_cols_value <- get_domarch_cols(app_data, DASelect) @@ -587,27 +592,55 @@ run_analysis <- function( fastaDataText_value <- get_fasta_data(query_data@fasta_path) - domainDataText_value <- get_domain_data() + domainDataText_value <- get_domain_data(data) msaDataText_value <- get_msa_data(query_data@msa_path) - rs_IprGenes_value <- generate_ipr_genes_visualization(data, app_data, input_rs_iprDatabases, input_rs_iprVisType) + rs_IprGenes_value <- generate_ipr_genes_visualization(data, + app_data, + input_rs_iprDatabases, + input_rs_iprVisType) - rval_rs_network_layout_value <- generate_rs_network_layout(data, app_data, cutoff = 100, layout = "nice") + rs_network_layout_value <- generate_rs_network_layout(data, + app_data, + cutoff = 100, + layout = "nice") rs_data_table_value <- generate_data_table(data) - da_IprGenes_value <- generate_da_ipr_genes_plot(app_data, da_iprDatabases, da_iprVisType, DASelect) + da_IprGenes_value <- generate_da_ipr_genes_plot(app_data, + da_iprDatabases, + da_iprVisType, + DASelect) - query_heatmap_value <- generate_query_heatmap(query_data_df, heatmap_select = "All", heatmap_color = "blue") + query_heatmap_value <- generate_query_heatmap(query_data_df, + heatmap_select = "All", + heatmap_color = "blue") - DA_Prot_value <- get_DA_Prot(app_data, validate_da, DASelect) + DA_Prot_value <- get_DA_Prot(app_data, DASelect) - DALinPlot_value <- generate_DA_heatmap_plot(DA_col, DACutoff, DA_Prot, DA_lin_color, ipr_path) + DALinPlot_value <- generate_DA_heatmap_plot(DA_col = "DomArch.Pfam", + DACutoff, + DA_Prot_value, + DA_lin_color = "viridis", + app_data@ipr_path) - DALinTable_value <- generate_DA_lin_table(DA_col, ipr_path, DAlin_count_table_DT) + DALin_TotalCounts_value <- DA_TotalCounts(DA_Prot_value, + DACutoff = 95, + DA_col = "DomArch.Pfam", + app_data) - DANetwork_value <- generate_domain_network(DA_col, DACutoff, DA_Prot, networkLayout, ipr_path) + DALinTable_value <- generate_DA_lin_table(DA_col = "DomArch.Pfam", + app_data@ipr_path, + DALin_TotalCounts_value) + + DANetwork_value <- generate_domain_network(DA_col = "DomArch.Pfam", + DACutoff, + DA_Prot_value, + networkLayout = "nice", + app_data@ipr_path) + + rep_accnums_value <- phylogeny_prot_value <- filter_phylogeny_proteins(app_data, phylo_select) @@ -622,7 +655,7 @@ run_analysis <- function( # List of graphics to include in report params <- list( rs_interproscan_visualization = rs_IprGenes_value, - proximity_network = rval_rs_network_layout_value, + proximity_network = rs_network_layout_value, sunburst = data@df, data = rs_data_table_value, queryDataTable = queryDataTable_value, @@ -632,7 +665,7 @@ run_analysis <- function( query_domarch_cols = query_domarch_cols_value, query_iprDatabases = query_iprDatabases, query_iprVisType = query_iprVisType, - mainTable = maintable_value, + mainTable = mainTable_value, DALinTable = DALinTable_value, DALinPlot = DALinPlot_value, DANetwork = DANetwork_value, @@ -644,7 +677,7 @@ run_analysis <- function( phylo_sunburst_levels = levels, phylo_sunburst = phylogeny_prot_value, tree_msa_tool = tree_msa_tool, - rep_accnums = rep_accnums(), + rep_accnums = rep_accnums_value, msa_rep_num = 10, app_data = app_data, PhyloSelect = PhyloSelect, diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R index b35cd9f2..d2d0b303 100644 --- a/inst/report/scripts/run_molevolvr_pipeline.R +++ b/inst/report/scripts/run_molevolvr_pipeline.R @@ -124,6 +124,7 @@ submit_full <- function( } # Define the main pipeline function +#' @export run_molevolvr_pipeline <- function(input_paths, db, nhits, eval, is_query, type, i) { @@ -182,7 +183,7 @@ run_molevolvr_pipeline <- function(input_paths, db, nhits, eval, F_value <- basename(FILE) PREFIX <- sub("\\.faa$", "", basename(FILE)) PREFIX <- gsub(">", "", PREFIX) # Extract prefix from file name - OUTDIR <- file.path(OUTPATH, paste0(PREFIX, "_", type)) + OUTDIR <- file.path(OUTPATH, paste0(PREFIX)) dir.create(OUTDIR, showWarnings = FALSE) # Create output directory # setwd(OUTDIR) @@ -234,7 +235,7 @@ run_molevolvr_pipeline <- function(input_paths, db, nhits, eval, # Optionally run IPR2DA if IS_QUERY is true - if (is_query == "T") { + if (is_query == TRUE) { ## perform ipr2da on iprscan results da <- ipr2da(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")), PREFIX, "NA") @@ -244,11 +245,10 @@ run_molevolvr_pipeline <- function(input_paths, db, nhits, eval, is.na(PREFIX)) { print("No blast results provided, moving on.") } else { - append_ipr(ipr_da = da, blast = "NA", prefix = PREFIX) + browser() + file.copy(file.path(OUTDIR, paste0(PREFIX, ".ipr_domarch.tsv")), + file.path(OUTDIR, paste0(PREFIX, ".full_analysis.tsv"))) } - - file.copy(file.path(OUTDIR, paste0(PREFIX, ".ipr_domarch.tsv")), - file.path(OUTDIR, PREFIX, paste0(PREFIX, ".full_analysis.tsv"))) } else { # perform ipr2da on iprscan results da <- ipr2da(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")), @@ -660,7 +660,7 @@ cleanup_blast <- function(infile_blast, acc2info, prefix, wblast = F) { mutate(FullAccNum = gsub(".*[a-z]", "", FullAccNum)) # If using web-blast output - if (wblast == "T") { + if (wblast == TRUE) { blast_out <- fread(input = infile_blast, sep = "\t", header = FALSE, col.names = web_blastp_hit_colnames, fill = TRUE) cleanedup_blast <- blast_out %>% @@ -702,7 +702,7 @@ cleanup_blast <- function(infile_blast, acc2info, prefix, wblast = F) { # TaxID to lineage mapping cleanedup_blast$TaxID <- as.integer(cleanedup_blast$TaxID) - lineage_map <- fread("~/data/lineage_lookup.txt", + lineage_map <- fread("~/awasyn/new_trial/lineage_lookup.txt", header = TRUE, fill = TRUE, colClasses = lineage_map_cols) @@ -841,7 +841,7 @@ run_interproscan <- function(query_file, prefix, outdir) { # Run InterProScan command # Construct the command command <- paste( - "~/interproscan-5.70-102.0/interproscan.sh -i", + "~/iprdir/interproscan-5.70-102.0/interproscan.sh -i", shQuote(query_file), "-b", shQuote(outfile), "-f TSV --cpu", Sys.getenv("INTERPROSCAN_CPUS", "4"), @@ -875,7 +875,7 @@ ipr2lin <- function(ipr, acc2info, prefix) { ipr_tax <- left_join(ipr_in, acc2info_out, by = "AccNum") # read in lineage map - lineage_map <- fread("~/data/lineage_lookup.txt", + lineage_map <- fread("~/awasyn/new_trial/lineage_lookup.txt", header = T, fill = T) # merge ipr+info w/ lineage @@ -889,7 +889,7 @@ ipr2lin <- function(ipr, acc2info, prefix) { select(-Species.x, -Species.y) # add lookup table to iprscan file - lookup_tbl <- fread(input = "~/data/cln_lookup_tbl.tsv", + lookup_tbl <- fread(input = "~/awasyn/new_trial/cln_lookup_tbl.tsv", sep = "\t", header = T, fill = T) %>% distinct() if ("AccNum.x" %in% names(ipr_lin)) { @@ -1019,7 +1019,7 @@ ipr2da <- function(infile_ipr, prefix, # save domarch_lins file write_tsv(domarch_lins, - file = paste0(prefix, ".ipr_domarch.tsv"), + file = paste0(prefix, "/", prefix, ".ipr_domarch.tsv"), append = F, na = "NA" ) @@ -1032,13 +1032,13 @@ append_ipr <- function(ipr_da, blast, prefix) { # ! an 'AccNum' or 'AccNum.noV' column is required in blast table for joining ! blast_out <- read_tsv(blast, col_names = T) if ("AccNum.noV" %in% colnames(blast_out)) { - ipr_da <- read_tsv(paste0(prefix, ".ipr_domarch.tsv"), col_names = T) + ipr_da <- read_tsv(paste0(prefix, "/", prefix, ".ipr_domarch.tsv"), col_names = T) blast_ipr <- merge(blast_out, ipr_da, by = "AccNum.noV", all.x = T) } else { blast_ipr <- merge(blast_out, ipr_da, by = "AccNum", all.x = T) } - write_tsv(blast_ipr, file = paste0(prefix, ".full_analysis.tsv"), na = "NA") + write_tsv(blast_ipr, file = paste0(prefix, "/", prefix, ".full_analysis.tsv"), na = "NA") } # Web BLAST output diff --git a/inst/report/scripts/viz_utils.R b/inst/report/scripts/viz_utils.R index d721855b..4d9fdfe6 100644 --- a/inst/report/scripts/viz_utils.R +++ b/inst/report/scripts/viz_utils.R @@ -10,8 +10,8 @@ library(plotly) # Function to generate the InterProScan Visualization generate_ipr_genes_visualization <- function(data, app_data, - input_rs_iprDatabases, - input_rs_iprVisType) { + input_rs_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"), + input_rs_iprVisType = "Analysis") { # Check if analysis is loaded if (nrow(data@df) == 0 || app_data@ipr_path == "") { @@ -30,7 +30,7 @@ generate_ipr_genes_visualization <- function(data, app_data, n <- "Name" # Hardcoded to "Name" based on original code # Call the `ipr2viz_web` function - ipr_plot <- ipr2viz_web( + ipr_plot <- plotIPR2VizWeb( infile_ipr = data@ipr_path, accessions = data@df$Name, analysis = input_rs_iprDatabases, @@ -41,7 +41,7 @@ generate_ipr_genes_visualization <- function(data, app_data, } else { # Call the `ipr2viz` function with additional arguments - ipr_plot <- ipr2viz( + ipr_plot <- plotIPR2Viz( infile_ipr = data@ipr_path, infile_full = data@df, accessions = unique(data@df$Name), @@ -79,10 +79,10 @@ generate_rs_network_layout <- function(data, app_data, # Clean up domain architecture columns in the data df_data <- data@df %>% - mutate(across(tidyselect::starts_with("DomArch"), clean_string)) + mutate(across(tidyselect::starts_with("DomArch"), cleanString)) # Generate the network using the domain_network function - res_network <- domain_network( + res_network <- createDomainNetwork( df_data, column = col, domains_of_interest = ".*", @@ -260,7 +260,7 @@ get_fasta_data <- function(fasta_path) { } # Function to get domain sequences (assumes `data` is a predefined object) -get_domain_data <- function() { +get_domain_data <- function(data) { return(data@domainSeqs) # Return domain sequences } @@ -302,8 +302,7 @@ generate_query_heatmap <- function(query_data_df, See the FAQ for possible reasons/solutions.") } - # Assuming `lineage.Query.plot` is a custom function for plotting - lineage.Query.plot(prot, queries = queries, colname = "QueryName", + plotLineageQuery(prot, queries = queries, colname = "QueryName", cutoff = 100, color = heatmap_color) } @@ -404,8 +403,89 @@ generate_main_table <- function(data_df, main_select = NULL) { return(datatable_output) } +total_counts <- function(prot, column = "DomArch", lineage_col = "Lineage", + cutoff = 90, RowsCutoff = FALSE, digits = 2 + # type = "GC" +) { + column <- sym(column) + + prot <- select(prot, {{ column }}, {{ lineage_col }}) %>% + filter(!is.na({{ column }}) & !is.na({{ lineage_col }})) %>% + filter({{ column }} != "") + + prot <- summarizeByLineage(prot, column, by = lineage_col, query = "all") + col_count <- prot %>% + group_by({{ column }}) %>% + summarise(totalcount = sum(count)) + + total <- left_join(prot, col_count, by = as_string(column)) + + sum_count <- sum(total$count) + total <- total %>% + mutate("IndividualCountPercent" = totalcount / sum_count * 100) %>% + arrange(-totalcount, -count) + + cumm_percent <- total %>% + select({{ column }}, totalcount) %>% + distinct() %>% + mutate("CumulativePercent" = 0) + total_counter <- 0 + for (x in length(cumm_percent$totalcount):1) { + total_counter <- total_counter + cumm_percent$totalcount[x] + cumm_percent$CumulativePercent[x] <- total_counter / sum_count * 100 + } + + cumm_percent <- cumm_percent %>% select(CumulativePercent, {{ column }}) + + total <- total %>% left_join(cumm_percent, by = as_string(column)) + + # Round the percentage columns + total$CumulativePercent <- total$CumulativePercent %>% round(digits = digits) + total$IndividualCountPercent <- total$IndividualCountPercent %>% round(digits = digits) + + if (RowsCutoff) { + # If total counts is being used for plotting based on number of rows, + # don't include other observations that fall below the cummulative percent cutoff + # , but that have the same 'totalcount' number as the cutoff observation + total <- total %>% filter(CumulativePercent >= 100 - cutoff - .0001) + return(total) + } + + # Include observations that fall below the cummulative percent cutoff, + # but that have the same 'totalcount' as the cutoff observation + t <- total %>% filter(CumulativePercent >= 100 - cutoff) + if (length(t) == 0) { + cutoff_count <- 0 + } else { + cutoff_count <- t$totalcount[nrow(t)] + } + + total <- total %>% + filter(totalcount >= cutoff_count) %>% + ungroup() + + return(total) +} + +DA_TotalCounts <- function(DA_Prot, DACutoff, DA_col, app_data) { + # Check if ipr_path is not empty + if (app_data@ipr_path == "") { + stop("ipr_path is missing.") + } + + # Calculate total counts with the specified cutoff and column + prot_tc <- total_counts(DA_Prot, cutoff = DACutoff, column = DA_col) + + # Replace all instances of ">" in the Lineage column with "_" + prot_tc$Lineage <- map(prot_tc$Lineage, ~ str_replace_all(.x, ">", "_")) %>% + unlist() + + # Return the processed data + prot_tc +} + # Function to generate Domain Architecture Linear Table -generate_DA_lin_table <- function(DA_col, ipr_path, DAlin_count_table_DT) { +generate_DA_lin_table <- function(DA_col, ipr_path, DA_TotalCounts_value) { # Check if ipr_path is valid if (ipr_path == "") { stop("InterPro path is empty.") @@ -416,8 +496,39 @@ generate_DA_lin_table <- function(DA_col, ipr_path, DAlin_count_table_DT) { stop("DA_col input is required.") } + da_col <- sym(DA_col) + + # Perform the data transformation + DA_TotalCounts_value %>% + group_by({{ da_col }}, totalcount, CumulativePercent) %>% + summarize(LineageCount = n()) %>% + select({{ da_col }}, LineageCount, totalcount, CumulativePercent) %>% + arrange(-totalcount) + # Generate the DAlin count table - DAlin_table <- DAlin_count_table_DT() + DAlin_table <- DT::datatable( + DA_TotalCounts_value, + selection = "single", + extensions = c("Buttons"), + options = list( + pageLength = 25, + dom = "frlBtip", + buttons = list( + list( + extend = "csv", + text = "Download", + filename = "MolEvolvR_domarch", + exportOptions = list( + modifier = list(page = "all") + ) + ) + ), + scrollX = FALSE, + paging = TRUE, + fixedHeader = FALSE, + fixedColumns = list(leftColumns = 0, rightColumns = 0) + ) + ) return(DAlin_table) } @@ -425,19 +536,19 @@ generate_DA_lin_table <- function(DA_col, ipr_path, DAlin_count_table_DT) { # Function to generate the Domain Architecture Lineage Plot generate_DA_heatmap_plot <- function(DA_col, DACutoff, DA_Prot, DA_lin_color, - analysis_loaded, ipr_path) { + ipr_path) { # Check if ipr_path is valid if (ipr_path == "") { stop("InterPro path is empty.") } # Filter the protein data for plotting - prot <- DA_Prot() %>% + prot <- DA_Prot %>% filter(Lineage != "") %>% drop_na(Lineage) # Create the plot - plot <- lineage.DA.plot( + plot <- plotLineageDA( prot, colname = DA_col, cutoff = DACutoff, @@ -461,7 +572,7 @@ generate_domain_network <- function(DA_col, DACutoff, DA_Prot, } # Prepare the selected protein data - dn_data <- DA_Prot() + dn_data <- DA_Prot col <- sym(DA_col) # Convert DA_col to a symbol for use with dplyr dn_data[[col]] <- str_replace_all(dn_data[[col]], " ", "_") @@ -471,7 +582,7 @@ generate_domain_network <- function(DA_col, DACutoff, DA_Prot, filter(!!col != "") # Generate the domain network - res <- domain_network( + res <- createDomainNetwork( prot = dn_data, column = col, domains_of_interest = ".*", @@ -517,21 +628,34 @@ generate_domain_network <- function(DA_col, DACutoff, DA_Prot, } # Function to retrieve and clean Domain Architecture data -get_DA_Prot <- function(app_data, validate_da, DASelect) { +get_DA_Prot <- function(app_data, DASelect) { # Check if the ipr_path is valid if (app_data@ipr_path == "") { stop("InterPro path is empty.") } # Validate domain architecture - validate_da() + # Check if ipr_path is not empty + if (app_data@ipr_path == "") { + stop("ipr_path is missing.") + } + + # Check if the data frame has rows + if (nrow(app_data@df) <= 0) { + stop("No data available. Please ensure you have uploaded your data correctly. See Help documentation or contact JRaviLab (janani.ravi[AT]cuanschutz[DOT]edu).") + } + + # Check if there are domain architecture columns + # if (length(domarch_cols) == 0) { + # stop("Please ensure uploaded data has domain architecture columns.") + # } # Retrieve app data and clean domain architecture columns df_app_data <- app_data@df # Domain architecture column cleanup df_app_data <- df_app_data %>% - mutate(across(starts_with("DomArch"), clean_string)) + mutate(across(starts_with("DomArch"), cleanString)) # Filter based on user selection if (DASelect == "All") { @@ -612,7 +736,7 @@ generate_da_ipr_genes_plot <- function(app_data, da_iprDatabases, name_column <- "Name" # Generate the plot using the web version - plot <- ipr2viz_web( + plot <- plotIPR2VizWeb( infile_ipr = app_data@ipr_path, accessions = df$Name, analysis = da_iprDatabases, @@ -621,7 +745,7 @@ generate_da_ipr_genes_plot <- function(app_data, da_iprDatabases, ) } else { # Generate the plot using the local version - plot <- ipr2viz( + plot <- plotIPR2Viz( infile_ipr = app_data@ipr_path, infile_full = df, accessions = unique(df$Name), @@ -637,11 +761,6 @@ generate_da_ipr_genes_plot <- function(app_data, da_iprDatabases, # Function to filter proteins for phylogeny filter_phylogeny_proteins <- function(app_data, phylo_select) { - # Validate the input app_data - if (!analysis_loaded()) { - stop("Analysis not loaded.") - } - # Get the data frame from app_data df <- app_data@df @@ -740,3 +859,38 @@ acc_to_name <- function(app_data) { return(df) } +rep_accnums <- function(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data) { + # If `phylo` is true, return all `AccNum` values from `app_data` + if (phylo) { + return(app_data@df$AccNum) + } else { + # Switch based on `msa_reduce_by` value + tmp <- filter(app_data@df, QueryName == PhyloSelect) + rep_acc_species <- RepresentativeAccNums(tmp, reduced = "Species", column = "AccNum") + + # Get representative accession numbers by "Lineage" + rep_acc_lineage <- RepresentativeAccNums(tmp, reduced = "Lineage", column = "AccNum") + switch(msa_reduce_by, + "Species" = rep_acc_species, + "Lineage" = rep_acc_lineage, + "DomArch" = { + # Check if `ipr_path` is not empty + if (app_data@ipr_path == "") { + stop("ipr_path is missing.") + } + # Find top sequences by accession number based on criteria + seqs <- getTopAccByLinDomArch(infile_full = app_data@df, n = msa_rep_num, query = PhyloSelect) + if (is.null(seqs)) { + stop("No sequences found.") + } + return(seqs) + }, + "Full" = { + # Filter data for matching `QueryName` and return `AccNum` + tmp <- app_data@df %>% filter(QueryName == PhyloSelect) + return(tmp$AccNum) + } + ) + } +} + From 57f8cc53ade1591d7922eae23da81447950a2c47 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Mon, 18 Nov 2024 11:02:38 +0200 Subject: [PATCH 05/23] fix bugs Signed-off-by: Awa Synthia --- R/ipr2viz.R | 8 +- R/networks_domarch.R | 20 +-- R/pre-msa-tree.R | 70 ++++---- R/summarize.R | 176 +++++++++---------- inst/report/report_template.Rmd | 28 +-- inst/report/scripts/generate_report.R | 24 +-- inst/report/scripts/run_molevolvr_pipeline.R | 11 +- inst/report/scripts/viz_utils.R | 46 +++-- 8 files changed, 212 insertions(+), 171 deletions(-) diff --git a/R/ipr2viz.R b/R/ipr2viz.R index e582ab09..a4769ed6 100644 --- a/R/ipr2viz.R +++ b/R/ipr2viz.R @@ -94,14 +94,14 @@ getTopAccByLinDomArch <- function(infile_full, cln_domarch <- cln %>% select(domarch_cols) col_counts <- colSums(is.na(cln_domarch)) DA_sym <- sym(names(which.min(col_counts))) - showNotification(paste0("Selecting representatives by unique ", DA_sym, " and lineage combinations")) + # showNotification(paste0("Selecting representatives by unique ", DA_sym, " and lineage combinations")) ## Group by Lineage, DomArch and reverse sort by group counts grouped <- cln %>% group_by({{ DA_sym }}, {{ lin_sym }}) %>% arrange(desc(PcPositive)) %>% summarise(count = n(), AccNum = dplyr::first(AccNum)) %>% arrange(-count) %>% - filter({{ lin_sym }} != "" && {{ DA_sym }} != "") + filter({{ lin_sym }} != "" & {{ DA_sym }} != "") top_acc <- grouped$AccNum[1:n] top_acc <- na.omit(top_acc) return(top_acc) @@ -180,7 +180,7 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), ipr_out <- read_tsv(infile_ipr, col_names = T, col_types = MolEvolvR::iprscan_cols) ipr_out <- ipr_out %>% filter(.data$Name %in% accessions) analysis_cols <- paste0("DomArch.", analysis) - infile_full <- infile_full %>% select(.data$analysis_cols, .data$Lineage_short, .data$QueryName, .data$PcPositive, .data$AccNum) + infile_full <- infile_full %>% select(analysis_cols, .data$Lineage_short, .data$QueryName, .data$PcPositive, .data$AccNum) ## To filter by Analysis analysis <- paste(analysis, collapse = "|") ## @SAM: This can't be set in stone since the analysis may change! @@ -212,7 +212,7 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), analysis_labeler <- analyses %>% pivot_wider(names_from = .data$Analysis, values_from = .data$Analysis) - lookup_tbl_path <- "/data/research/jravilab/common_data/cln_lookup_tbl.tsv" + lookup_tbl_path <- "~/awasyn/new_trial/cln_lookup_tbl.tsv" lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = MolEvolvR::lookup_table_cols) lookup_tbl <- lookup_tbl %>% select(-.data$ShortName) # Already has ShortName -- Just needs SignDesc diff --git a/R/networks_domarch.R b/R/networks_domarch.R index ae6fe8be..e4da7aca 100755 --- a/R/networks_domarch.R +++ b/R/networks_domarch.R @@ -24,16 +24,16 @@ #' A network of domains is returned based on shared domain architectures. #' #' @param prot A data frame that contains the column 'DomArch'. -#' @param column Name of column containing Domain architecture from which nodes +#' @param column Name of column containing Domain architecture from which nodes #' and edges are generated. #' @param domains_of_interest Character vector specifying domains of interest. -#' @param cutoff Integer. Only use domains that occur at or above the cutoff for +#' @param cutoff Integer. Only use domains that occur at or above the cutoff for #' total counts if cutoff_type is "Total Count". -#' Only use domains that appear in cutoff or greater lineages if cutoff_type is +#' Only use domains that appear in cutoff or greater lineages if cutoff_type is #' Lineage. #' @param layout Character. Layout type to be used for the network. Options are: #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"} -#' @param query_color Character. Color to represent the queried domain in the +#' @param query_color Character. Color to represent the queried domain in the #' network. #' #' @importFrom dplyr across add_row all_of distinct filter mutate pull select @@ -211,7 +211,7 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c visOptions(highlightNearest = TRUE) }, error = function(e) { - showNotification(toString(e)) + # showNotification(toString(e)) vis_g <- "error" }, finally = { @@ -231,18 +231,18 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c #' #' #' @param prot A data frame that contains the column 'DomArch'. -#' @param column Name of column containing Domain architecture from which nodes +#' @param column Name of column containing Domain architecture from which nodes #' and edges are generated. #' @param domains_of_interest Character vector specifying the domains of interest. -#' @param cutoff Integer. Only use domains that occur at or above the cutoff for +#' @param cutoff Integer. Only use domains that occur at or above the cutoff for #' total counts if cutoff_type is "Total Count". -#' Only use domains that appear in cutoff or greater lineages if cutoff_type is +#' Only use domains that appear in cutoff or greater lineages if cutoff_type is #' Lineage. #' @param layout Character. Layout type to be used for the network. Options are: #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"} -#' @param query_color Color that the nodes of the domains in the +#' @param query_color Color that the nodes of the domains in the #' domains_of_interest vector are colored -#' @param partner_color Color that the nodes that are not part of the +#' @param partner_color Color that the nodes that are not part of the #' domains_of_interest vector are colored #' @param border_color Color for the borders of the nodes. #' @param IsDirected Is the network directed? Set to false to eliminate arrows diff --git a/R/pre-msa-tree.R b/R/pre-msa-tree.R index 75cc375d..e15e7996 100644 --- a/R/pre-msa-tree.R +++ b/R/pre-msa-tree.R @@ -46,7 +46,7 @@ api_key <- Sys.getenv("ENTREZ_API_KEY", unset = "YOUR_KEY_HERE") #' @param y Delimitter. Default is space (" "). #' #' @importFrom rlang abort -#' +#' #' @return A character vector in title case. #' @export #' @@ -112,21 +112,21 @@ addLeaves2Alignment <- function(aln_file = "", lin_file = "data/rawdata_tsv/all_semiclean.txt", # !! finally change to all_clean.txt!! # lin_file="data/rawdata_tsv/PspA.txt", reduced = FALSE) { - + #Check if the alignment file is provided and exists if (nchar(aln_file) == 0) { abort("Error: Alignment file path must be provided.") } - + if (!file.exists(aln_file)) { abort(paste("Error: The alignment file '", aln_file, "' does not exist.")) } - + # Check if the lineage file exists if (!file.exists(lin_file)) { abort(paste("Error: The lineage file '", lin_file, "' does not exist.")) } - + # Check that the 'reduced' parameter is logical if (!is.logical(reduced) || length(reduced) != 1) { abort("Error: 'reduced' must be a single logical value (TRUE or FALSE).") @@ -249,15 +249,15 @@ addName <- function(data, if (!is.data.frame(data)) { abort("Error: The input 'data' must be a data frame") } - + # Check that the specified columns exist in the data required_cols <- c(accnum_col, spec_col, lin_col) missing_cols <- setdiff(required_cols, names(data)) if (length(missing_cols) > 0) { - abort(paste("Error: The following columns are missing from the data:", + abort(paste("Error: The following columns are missing from the data:", paste(missing_cols, collapse = ", "))) } - + cols <- c(accnum_col, "Kingdom", "Phylum", "Genus", "Spp") split_data <- data %>% separate( @@ -347,16 +347,16 @@ convertAlignment2FA <- function(aln_file = "", if (nchar(aln_file) == 0) { abort("Error: Alignment file path must be provided.") } - + if (!file.exists(aln_file)) { abort(paste("Error: The alignment file '", aln_file, "' does not exist.")) } - + # Check if the lineage file exists if (!file.exists(lin_file)) { abort(paste("Error: The lineage file '", lin_file, "' does not exist.")) } - + # Check that the 'reduced' parameter is logical if (!is.logical(reduced) || length(reduced) != 1) { abort("Error: 'reduced' must be a single logical value (TRUE or FALSE).") @@ -424,14 +424,14 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") { if (!is.data.frame(acc2name)) { abort("Error: acc2name must be a data frame.") } - + # Check if the specified columns exist in the data frame if (!(acc_col %in% colnames(acc2name))) { - abort("Error: The specified acc_col '", acc_col, "' does not exist in + abort("Error: The specified acc_col '", acc_col, "' does not exist in acc2name.") } if (!(name_col %in% colnames(acc2name))) { - abort("Error: The specified name_col '", name_col, "' does not exist in + abort("Error: The specified name_col '", name_col, "' does not exist in acc2name.") } @@ -475,7 +475,7 @@ rename_fasta <- function(fa_path, outpath, abort("Error: The input FASTA file does not exist at the specified path: ", fa_path) } - + # Check if the output path is writable outdir <- dirname(outpath) if (!dir.exists(outdir)) { @@ -541,20 +541,20 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"), reduced = F) { # Check if the alignment path exists if (!dir.exists(aln_path)) { - abort("Error: The alignment directory does not exist at the specified + abort("Error: The alignment directory does not exist at the specified path: ", aln_path) } - + # Check if the output path exists; if not, attempt to create it if (!dir.exists(fa_outpath)) { dir.create(fa_outpath, recursive = TRUE) - message("Note: The output directory did not exist and has been created: ", + message("Note: The output directory did not exist and has been created: ", fa_outpath) } - + # Check if the linear file exists if (!file.exists(lin_file)) { - abort("Error: The linear file does not exist at the specified path: ", + abort("Error: The linear file does not exist at the specified path: ", lin_file) } # library(here) @@ -626,7 +626,7 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { if (!is.character(accessions) || length(accessions) == 0) { abort("Error: 'accessions' must be a non-empty character vector.") } - + if (!dir.exists(dirname(outpath))) { abort("Error: The output directory does not exist: ", dirname(outpath)) } @@ -676,7 +676,7 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { id = accessions_partitioned[[x]], db = "protein", rettype = "fasta", - api_key = Sys.getenv("ENTREZ_API_KEY") + #api_key = Sys.getenv("ENTREZ_API_KEY") ) ) }) @@ -732,21 +732,21 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { createRepresentativeAccNum <- function(prot_data, reduced = "Lineage", accnum_col = "AccNum") { - + # Validate input if (!is.data.frame(prot_data)) { abort("Error: 'prot_data' must be a data frame.") } - + # Check if the reduced column exists in prot_data if (!(reduced %in% colnames(prot_data))) { - abort("Error: The specified reduced column '", reduced, "' does not + abort("Error: The specified reduced column '", reduced, "' does not exist in the data frame.") } - + # Check if the accnum_col exists in prot_data if (!(accnum_col %in% colnames(prot_data))) { - abort("Error: The specified accession number column '", accnum_col, "' + abort("Error: The specified accession number column '", accnum_col, "' does not exist in the data frame.") } # Get Unique reduced column and then bind the AccNums back to get one AccNum per reduced column @@ -808,10 +808,10 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { if (!file.exists(fasta_file)) { abort("Error: The FASTA file does not exist: ", fasta_file) } - - if (file_ext(fasta_file) != "fasta" && file_ext(fasta_file) != "fa") { - abort("Error: The specified file is not a valid FASTA file: ", fasta_file) - } + + # if (file_ext(fasta_file) != "fasta" && file_ext(fasta_file) != "fa") { + # abort("Error: The specified file is not a valid FASTA file: ", fasta_file) + # } fasta <- readAAStringSet(fasta_file) aligned <- switch(tool, @@ -857,23 +857,23 @@ writeMSA_AA2FA <- function(alignment, outpath) { if (!inherits(alignment, "AAMultipleAlignment")) { abort("Error: The alignment must be of type 'AAMultipleAlignment'.") } - + # Check the output path is a character string if (!is.character(outpath) || nchar(outpath) == 0) { abort("Error: Invalid output path specified.") } - + # Check if the output directory exists outdir <- dirname(outpath) if (!dir.exists(outdir)) { abort("Error: The output directory does not exist: ", outdir) } - l <- length(rownames(alignment)) + l <- length(names(unmasked(alignment))) fasta <- "" for (i in 1:l) { - fasta <- paste0(fasta, paste(">", rownames(alignment)[i]), "\n") + fasta <- paste0(fasta, paste(">", names(unmasked(alignment)[i])), "\n") seq <- toString(unmasked(alignment)[[i]]) fasta <- paste0(fasta, seq, "\n") } diff --git a/R/summarize.R b/R/summarize.R index e76a86da..350c533d 100644 --- a/R/summarize.R +++ b/R/summarize.R @@ -41,18 +41,18 @@ filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remo # Any row containing a domain in doms_remove will be removed # ^word$|(?<=\+)word$|(?<=\+)word(?=\+)|word(?=\+) - + # Check if prot is a data frame if (!is.data.frame(prot)) { abort("Error: 'prot' must be a data frame.") } - + # Check if the specified column exists in the data frame if (!column %in% names(prot)) { - abort(paste("Error: The specified column '", column, "' does not exist + abort(paste("Error: The specified column '", column, "' does not exist in the data frame.", sep = "")) } - + # If doms_keep or doms_remove are not provided, inform the user if (length(doms_keep) == 0 && length(doms_remove) == 0) { warning("Warning: No domains specified to keep or remove. Returning the @@ -109,25 +109,25 @@ filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remo #' countByColumn #' @description #' Function to obtain element counts (DA, GC) -#' -#' @param prot A data frame containing the dataset to analyze, typically with +#' +#' @param prot A data frame containing the dataset to analyze, typically with #' multiple columns including the one specified by the `column` parameter. -#' @param column A character string specifying the name of the column to analyze. +#' @param column A character string specifying the name of the column to analyze. #' The default is "DomArch". -#' @param min.freq An integer specifying the minimum frequency an element must +#' @param min.freq An integer specifying the minimum frequency an element must #' have to be included in the output. Default is 1. #' #' @importFrom dplyr arrange as_tibble filter select #' #' @return A tibble with two columns: #' \describe{ -#' \item{`column`}{The unique elements from the specified column +#' \item{`column`}{The unique elements from the specified column #' (e.g., "DomArch").} -#' \item{`freq`}{The frequency of each element, i.e., the number of times +#' \item{`freq`}{The frequency of each element, i.e., the number of times #' each element appears in the specified column.} #' } -#' The tibble is filtered to only include elements that have a frequency -#' greater than or equal to `min.freq` and does not include elements with `NA` +#' The tibble is filtered to only include elements that have a frequency +#' greater than or equal to `min.freq` and does not include elements with `NA` #' values or those starting with a hyphen ("-"). #' @export #' @@ -136,20 +136,20 @@ filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remo #' countByColumn(prot = my_data, column = "DomArch", min.freq = 10) #' } countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) { - + # Check if 'prot' is a data frame if (!is.data.frame(prot)) { abort("Error: 'prot' must be a data frame.") } - + # Check if the specified column exists in the data frame if (!column %in% names(prot)) { abort(paste("Error: The specified column '", column, "' does not exist in the data frame.", sep = "")) } - + # Check if min.freq is a positive integer - if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 || + if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 || floor(min.freq) != min.freq) { abort("Error: 'min.freq' must be a positive integer.") } @@ -171,14 +171,14 @@ countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) { #' Break string ELEMENTS into WORDS for domain architecture (DA) and genomic #' context (GC) #' -#' @param prot A dataframe containing the dataset to analyze. The specified +#' @param prot A dataframe containing the dataset to analyze. The specified #' `column` contains the string elements to be processed. -#' @param column A character string specifying the name of the column to analyze. +#' @param column A character string specifying the name of the column to analyze. #' Default is "DomArch". -#' @param conversion_type A character string specifying the type of conversion. +#' @param conversion_type A character string specifying the type of conversion. #' Two options are available: #' \describe{ -#' \item{`da2doms`}{Convert domain architectures into individual domains by +#' \item{`da2doms`}{Convert domain architectures into individual domains by #' replacing `+` symbols with spaces.} #' \item{`gc2da`}{Convert genomic context into domain architectures by #' replacing directional symbols (`<-`, `->`, and `|`) with spaces.} @@ -187,13 +187,13 @@ countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) { #' @importFrom dplyr pull #' @importFrom stringr str_replace_all #' -#' @return A single string where elements are delimited by spaces. The function -#' performs necessary substitutions based on the `conversion_type` and cleans up +#' @return A single string where elements are delimited by spaces. The function +#' performs necessary substitutions based on the `conversion_type` and cleans up #' extraneous characters like newlines, tabs, and multiple spaces. #' #' @examples #' \dontrun{ -#' tibble::tibble(DomArch = c("aaa+bbb", +#' tibble::tibble(DomArch = c("aaa+bbb", #' "a+b", "b+c", "b-c")) |> elements2Words() #' } #' @@ -202,20 +202,20 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms" if (!is.data.frame(prot)) { abort("Error: 'prot' must be a data frame.") } - + # Check if the specified column exists in the data frame if (!column %in% names(prot)) { - abort(paste("Error: The specified column '", column, "' does not exist in + abort(paste("Error: The specified column '", column, "' does not exist in the data frame.", sep = "")) } - + # Check for valid conversion_type values valid_types <- c("da2doms", "doms2da") if (!conversion_type %in% valid_types) { - abort(paste("Error: Invalid 'conversion_type'. Must be one of:", + abort(paste("Error: Invalid 'conversion_type'. Must be one of:", paste(valid_types, collapse = ", "))) } - + z1 <- prot %>% dplyr::pull(column) %>% str_replace_all("\\,", " ") %>% @@ -252,20 +252,20 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms" #' @description #' Get word counts (wc) [DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)] #' -#' @param string A character string containing the elements (words) to count. -#' This would typically be a space-delimited string representing domain +#' @param string A character string containing the elements (words) to count. +#' This would typically be a space-delimited string representing domain #' architectures or genomic contexts. #' #' @importFrom dplyr as_tibble filter arrange #' @importFrom stringr str_replace_all #' -#' @return A tibble (tbl_df) with two columns: +#' @return A tibble (tbl_df) with two columns: #' \describe{ -#' \item{`words`}{A column containing the individual words +#' \item{`words`}{A column containing the individual words #' (domains or domain architectures).} #' \item{`freq`}{A column containing the frequency counts for each word.} #' } -#' +#' #' #' @examples #' \dontrun{ @@ -279,7 +279,7 @@ words2WordCounts <- function(string) { if (!is.character(string) || length(string) != 1) { abort("Error: 'string' must be a single character vector.") } - + df_word_count <- string %>% # reduce spaces with length 2 or greater to a single space str_replace_all("\\s{2,}", " ") %>% @@ -311,14 +311,14 @@ words2WordCounts <- function(string) { #' filterByFrequency #' @description #' Function to filter based on frequencies -#' -#' @param x A tibble (tbl_df) containing at least two columns: one for +#' +#' @param x A tibble (tbl_df) containing at least two columns: one for #' elements (e.g., `words`) and one for their frequency (e.g., `freq`). -#' @param min.freq A numeric value specifying the minimum frequency threshold. -#' Only elements with frequencies greater than or equal to this value will be +#' @param min.freq A numeric value specifying the minimum frequency threshold. +#' Only elements with frequencies greater than or equal to this value will be #' retained. #' -#' @return A tibble with the same structure as `x`, but filtered to include +#' @return A tibble with the same structure as `x`, but filtered to include #' only rows where the frequency is greater than or equal to `min.freq`. #' #' @export @@ -328,18 +328,18 @@ words2WordCounts <- function(string) { #' filterByFrequency() #' } filterByFrequency <- function(x, min.freq) { - + # Check if 'x' is a data frame if (!is.data.frame(x)) { abort("Error: 'x' must be a data frame.") } - + # Check if 'min.freq' is a positive integer - if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 || + if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 || floor(min.freq) != min.freq) { abort("Error: 'min.freq' must be a positive integer.") } - + # Check if the 'freq' column exists in the data frame if (!"freq" %in% names(x)) { abort("Error: The data frame must contain a 'freq' column.") @@ -355,24 +355,24 @@ filterByFrequency <- function(x, min.freq) { #' @name MolEvolvR_summary #' @description #' A collection of summary functions for the MolEvolvR package. -#' +#' NULL #' summarizeByLineage #' #' @param prot A dataframe or tibble containing the data. -#' @param column A string representing the column to be summarized +#' @param column A string representing the column to be summarized #' (e.g., `DomArch`). Default is "DomArch". -#' @param by A string representing the grouping column (e.g., `Lineage`). +#' @param by A string representing the grouping column (e.g., `Lineage`). #' Default is "Lineage". -#' @param query A string specifying the query pattern for filtering the target +#' @param query A string specifying the query pattern for filtering the target #' column. Use "all" to skip filtering and include all rows. #' #' @importFrom dplyr arrange filter group_by summarise #' @importFrom rlang sym #' -#' @return A tibble summarizing the counts of occurrences of elements in -#' the `column`, grouped by the `by` column. The result includes the number +#' @return A tibble summarizing the counts of occurrences of elements in +#' the `column`, grouped by the `by` column. The result includes the number #' of occurrences (`count`) and is arranged in descending order of count. #' @rdname MolEvolvR_summary #' @export @@ -390,19 +390,19 @@ summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage" if (!is.data.frame(prot)) { abort("Error: 'prot' must be a data frame.") } - + # Check if the specified column exists in the data frame if (!column %in% names(prot)) { - abort(paste("Error: The specified column '", column, "' does not exist in + abort(paste("Error: The specified column '", column, "' does not exist in the data frame.", sep = "")) } - + # Check if the 'by' column exists in the data frame if (!by %in% names(prot)) { - abort(paste("Error: The specified 'by' column '", by, "' does not exist + abort(paste("Error: The specified 'by' column '", by, "' does not exist n the data frame.", sep = "")) } - + column <- sym(column) by <- sym(by) if (query == "all") { @@ -427,15 +427,15 @@ summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage" #' Function to summarize and retrieve counts by Domains & Domains+Lineage #' #' -#' @param x A dataframe or tibble containing the data. It must have columns +#' @param x A dataframe or tibble containing the data. It must have columns #' named `DomArch` and `Lineage`. #' #' @importFrom dplyr arrange count desc filter group_by summarise #' -#' @return A tibble summarizing the counts of unique domain architectures -#' (`DomArch`) per lineage (`Lineage`). The resulting table contains three -#' columns: `DomArch`, `Lineage`, and `count`, which indicates the frequency -#' of each domain architecture for each lineage. The results are arranged in +#' @return A tibble summarizing the counts of unique domain architectures +#' (`DomArch`) per lineage (`Lineage`). The resulting table contains three +#' columns: `DomArch`, `Lineage`, and `count`, which indicates the frequency +#' of each domain architecture for each lineage. The results are arranged in #' descending order of `count`. #' @rdname MolEvolvR_summary #' @@ -450,13 +450,13 @@ summarizeDomArch_ByLineage <- function(x) { if (!is.data.frame(x)) { abort("Error: 'x' must be a data frame.") } - + # Check if required columns exist in the data frame required_columns <- c("DomArch", "Lineage") missing_columns <- setdiff(required_columns, names(x)) - + if (length(missing_columns) > 0) { - abort(paste("Error: The following required columns are + abort(paste("Error: The following required columns are missing:", paste(missing_columns, collapse = ", "))) } x %>% @@ -472,16 +472,16 @@ summarizeDomArch_ByLineage <- function(x) { #' @description #' Function to retrieve counts of how many lineages a DomArch appears in #' -#' @param x A dataframe or tibble containing the data. It must have a column -#' named `DomArch` and a count column, such as `count`, which represents the +#' @param x A dataframe or tibble containing the data. It must have a column +#' named `DomArch` and a count column, such as `count`, which represents the #' occurrences of each architecture in various lineages. #' #' @importFrom dplyr arrange group_by filter summarise #' -#' @return A tibble summarizing each unique `DomArch`, along with the following +#' @return A tibble summarizing each unique `DomArch`, along with the following #' columns: #' - `totalcount`: The total occurrences of each `DomArch` across all lineages. -#' - `totallin`: The total number of unique lineages in which each `DomArch` +#' - `totallin`: The total number of unique lineages in which each `DomArch` #' appears. #' The results are arranged in descending order of `totallin` and `totalcount`. #' @rdname MolEvolvR_summary @@ -506,17 +506,17 @@ summarizeDomArch <- function(x) { #' summarizeGenContext_ByDomArchLineage #' -#' @param x A dataframe or tibble containing the data. It must have columns +#' @param x A dataframe or tibble containing the data. It must have columns #' named `GenContext`, `DomArch`, and `Lineage`. #' #' @importFrom dplyr arrange desc filter group_by n summarise #' -#' @return A tibble summarizing each unique combination of `GenContext`, +#' @return A tibble summarizing each unique combination of `GenContext`, #' `DomArch`, and `Lineage`, along with the following columns: #' - `GenContext`: The genomic context for each entry. #' - `DomArch`: The domain architecture for each entry. #' - `Lineage`: The lineage associated with each entry. -#' - `count`: The total number of occurrences for each combination of +#' - `count`: The total number of occurrences for each combination of #' `GenContext`, `DomArch`, and `Lineage`. #' #' The results are arranged in descending order of `count`. @@ -573,12 +573,12 @@ summarizeGenContext_ByLineage <- function(x) { #' summarizeGenContext #' -#' @param x A dataframe or tibble containing the data. It must have columns +#' @param x A dataframe or tibble containing the data. It must have columns #' named `GenContext`, `DomArch`, and `Lineage`. #' #' @importFrom dplyr arrange desc filter group_by n n_distinct summarise #' -#' @return A tibble summarizing each unique combination of `GenContext` and +#' @return A tibble summarizing each unique combination of `GenContext` and #' `Lineage`, along with the following columns: #' - `GenContext`: The genomic context for each entry. #' - `Lineage`: The lineage associated with each entry. @@ -623,13 +623,13 @@ summarizeGenContext <- function(x) { #' @param prot A data frame that must contain columns: #' \itemize{\item Either 'GenContext' or 'DomArch.norep' \item count} #' @param column Character. The column to summarize, default is "DomArch". -#' @param lineage_col Character. The name of the lineage column, default is +#' @param lineage_col Character. The name of the lineage column, default is #' "Lineage". -#' @param cutoff Numeric. Cutoff for total count. Counts below this cutoff value +#' @param cutoff Numeric. Cutoff for total count. Counts below this cutoff value #' will not be shown. Default is 0. -#' @param RowsCutoff Logical. If TRUE, filters based on cumulative percentage +#' @param RowsCutoff Logical. If TRUE, filters based on cumulative percentage #' cutoff. Default is FALSE. -#' @param digits Numeric. Number of decimal places for percentage columns. +#' @param digits Numeric. Number of decimal places for percentage columns. #' Default is 2. #' #' @@ -638,9 +638,9 @@ summarizeGenContext <- function(x) { #' #' @return A data frame with the following columns: #' - `{{ column }}`: Unique values from the specified column. -#' - `totalcount`: The total count of occurrences for each unique value in +#' - `totalcount`: The total count of occurrences for each unique value in #' the specified column. -#' - `IndividualCountPercent`: The percentage of each `totalcount` relative to +#' - `IndividualCountPercent`: The percentage of each `totalcount` relative to #' the overall count. #' - `CumulativePercent`: The cumulative percentage of total counts. #' @rdname MolEvolvR_summary @@ -661,28 +661,28 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col if (!is.data.frame(prot)) { abort("Error: 'prot' must be a data frame.") } - + # Check if the specified columns exist in the data frame required_columns <- c(column, lineage_col) missing_columns <- setdiff(required_columns, names(prot)) - + if (length(missing_columns) > 0) { - abort(paste("Error: The following required columns are missing:", + abort(paste("Error: The following required columns are missing:", paste(missing_columns, collapse = ", "))) } - + # Check that cutoff is a numeric value between 0 and 100 if (!is.numeric(cutoff) || length(cutoff) != 1 || cutoff < 0 || cutoff > 100) { abort("Error: 'cutoff' must be a numeric value between 0 and 100.") } - + # Check that digits is a non-negative integer - if (!is.numeric(digits) || length(digits) != 1 || digits < 0 || + if (!is.numeric(digits) || length(digits) != 1 || digits < 0 || floor(digits) != digits) { abort("Error: 'digits' must be a non-negative integer.") } - - column <- sym(column) + + # column <- sym(column) prot <- select(prot, {{ column }}, {{ lineage_col }}) %>% filter(!is.na({{ column }}) & !is.na({{ lineage_col }})) %>% @@ -690,10 +690,10 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col prot <- summarizeByLineage(prot, column, by = lineage_col, query = "all") col_count <- prot %>% - group_by({{ column }}) %>% + group_by(!!sym(column)) %>% summarise(totalcount = sum(count)) - total <- left_join(prot, col_count, by = as_string(column)) + total <- left_join(prot, col_count, by = column) sum_count <- sum(total$count) total <- total %>% @@ -845,7 +845,7 @@ findParalogs <- function(prot) { if (!is.data.frame(prot)) { abort("Error: 'prot' must be a data frame.") } - + # Remove eukaryotes prot <- prot %>% filter(!grepl("^eukaryota", Lineage)) paralogTable <- prot %>% diff --git a/inst/report/report_template.Rmd b/inst/report/report_template.Rmd index c42aa443..46dcf57e 100644 --- a/inst/report/report_template.Rmd +++ b/inst/report/report_template.Rmd @@ -353,7 +353,7 @@ if (is.null(params$DA_col)) { plot_data <- params$DA_Prot plot_data[[da_col]] <- str_replace_all(plot_data[[da_col]], " ", "_") -wordcloud_element(plot_data, colname = da_col, cutoff = params$DACutoff, UsingRowsCutoff = F) +createWordCloudElement(plot_data, colname = da_col, cutoff = params$DACutoff, UsingRowsCutoff = F) ``` ### Interproscan Visualization @@ -367,7 +367,7 @@ params$da_interproscan_visualization ```{r, echo=FALSE} upset_plot_data <- params$DA_Prot upset_plot_data[[da_col]] <- str_replace_all(upset_plot_data[[da_col]], " ", "_") -final_plot <- upset.plot(upset_plot_data, colname = da_col, cutoff = params$DACutoff) +final_plot <- plotUpSet(upset_plot_data, colname = da_col, cutoff = params$DACutoff) domains <- upset_plot_data %>% dplyr::pull(da_col) n_unique_domains <- domains %>% @@ -413,30 +413,33 @@ if(length(params$rep_accnums) >= 3 ) { if (!params$rval_phylo) { seqs <- readAAStringSet(params$app_data@fasta_path) names(seqs) <- sub(" .*", "", names(seqs)) - query_accession <- params$app_data@df %>% filter(params$app_data@df$QueryName == params$PhyloSelect) + query_accession <- params$app_data@df %>% filter(!duplicated(QueryName)) query_accession <- unique(query_accession$Query) query <- seqs[query_accession] - names(query) <- params$PhyloSelect + names(query) <- unique(params$app_data@df$QueryName) query <- AAStringSet(query) } # Generate Fasta File rep_fasta_path <- tempfile() - acc2fa(rep, outpath = rep_fasta_path, "sequential") + acc2FA(rep, outpath = rep_fasta_path, "sequential") rename_fasta(rep_fasta_path, rep_fasta_path, - replacement_function = map_acc2name, + replacement_function = mapAcc2Name, acc2name = params$acc_to_name ) if (!params$rval_phylo) { writeXStringSet(query, rep_fasta_path, append = TRUE) } - rep_msa_path <- tempfile() alignFasta(rep_fasta_path, tree_msa_tool, rep_msa_path) } ``` ```{r, echo=FALSE} +library(msa) +library(ape) +library(tidytree) +library(ggtree) if(length(params$rep_accnums) >= 3 ) { seq_tree(fasta_filepath = rep_msa_path) } else {"Not enough representative sequences: try changing the 'Reduce By' field."} @@ -449,7 +452,7 @@ if(length(params$rep_accnums) >= 3 ) { if (!params$rval_phylo) { seqs <- readAAStringSet(params$app_data@fasta_path) names(seqs) <- sub(" .*", "", names(seqs)) - query_accession <- params$app_data@df %>% filter(params$app_data@df$QueryName == params$PhyloSelect) + query_accession <- params$app_data@df %>% filter(!duplicated(QueryName)) query_accession <- unique(query_accession$Query) query <- seqs[query_accession] names(query) <- params$PhyloSelect @@ -458,9 +461,9 @@ if(length(params$rep_accnums) >= 3 ) { # Generate Fasta File rep_fasta_path <- tempfile() - acc2fa(rep, outpath = rep_fasta_path, "sequential") + acc2FA(rep, outpath = rep_fasta_path, "sequential") rename_fasta(rep_fasta_path, rep_fasta_path, - replacement_function = map_acc2name, + replacement_function = mapAcc2Name, acc2name = params$acc_to_name ) if (!params$rval_phylo) { @@ -469,16 +472,17 @@ if(length(params$rep_accnums) >= 3 ) { # Call MSA2PDF msa_pdf_path <- tempfile() - msa_prefix <- "/data/research/jravilab/molevolvr_app/www/msa_figs/" + msa_prefix <- "~/awasyn/MolEvolvRsHzFyk_full/msa_figs/" post_fix <- paste("msa", params$query_pin, params$PhyloSelect, params$msa_reduce_by, ".pdf", sep = "_") msa_pdf_path <- paste0(msa_prefix, post_fix) - msa_pdf(fasta_path = rep_fasta_path, msa_pdf_path) + createMSA_PDF(fasta_path = rep_fasta_path, msa_pdf_path) } ``` ```{r msa_pdf, echo = FALSE, out.width = "95%", out.height = "800px"} +# apt-get install texlive if(length(params$rep_accnums) >= 3 ) { knitr::include_graphics(msa_pdf_path) } else {"Not enough representative sequences: try changing the 'Reduce By' field."} diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R index 7dd89030..3ea331d5 100644 --- a/inst/report/scripts/generate_report.R +++ b/inst/report/scripts/generate_report.R @@ -90,16 +90,20 @@ run_analysis <- function( output_file = file.path(tempdir(), "report.html"), DASelect = "All", mainSelect = NULL, - PhyloSelect = NULL, + PhyloSelect = "All", q_heatmap_select = "All", DACutoff = 95, GCCutoff = 0.5, query_select = NULL, - query_iprDatabases = NULL, + query_iprDatabases = c( + "Pfam", "SMART", "Phobius", + "Gene3D", "TMHMM", "SignalP_GRAM_POSITIVE", + "SUPERFAMILY", "MobiDBLite", "TIGRFAM", "PANTHER", "Coils" + ), query_iprVisType = "Analysis", tree_msa_tool = "ClustalO", levels = 2, DA_Col = "DomArch.Pfam", - msa_rep_num = NULL, + msa_rep_num = 10, msa_reduce_by = "Species", rval_phylo = FALSE, DA_lin_color = c("default", "viridis", "inferno", "magma", "plasma", "cividis"), @@ -375,7 +379,7 @@ run_analysis <- function( pattern = paste0("molevolvr_acccnum_validation-", accnum, "-", "XXXXX"), fileext = ".fa" ) - acc2fa(accnum, tmp) + acc2FA(accnum, tmp) readAAStringSet(tmp) TRUE }, @@ -398,7 +402,7 @@ run_analysis <- function( Please try submitting FASTA sequences instead.") } else { # Write a multifasta - acc2fa(accnum_vect, outpath = path) + acc2FA(accnum_vect, outpath = path) } }, "MSA" = { @@ -584,7 +588,7 @@ run_analysis <- function( domarch_cols_value <- get_domarch_cols(app_data, DASelect) - query_domarch_cols_value <- get_domarch_columns(query_data) + query_domarch_cols_value <- get_domarch_columns(query_data@df) mainTable_value <- generate_data_table(data) @@ -613,7 +617,7 @@ run_analysis <- function( da_iprVisType, DASelect) - query_heatmap_value <- generate_query_heatmap(query_data_df, + query_heatmap_value <- generate_query_heatmap(query_data@df, heatmap_select = "All", heatmap_color = "blue") @@ -626,7 +630,7 @@ run_analysis <- function( app_data@ipr_path) DALin_TotalCounts_value <- DA_TotalCounts(DA_Prot_value, - DACutoff = 95, + DACutoff, DA_col = "DomArch.Pfam", app_data) @@ -640,7 +644,7 @@ run_analysis <- function( networkLayout = "nice", app_data@ipr_path) - rep_accnums_value <- + rep_accnums_value <- rep_accnums(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data) phylogeny_prot_value <- filter_phylogeny_proteins(app_data, phylo_select) @@ -678,7 +682,7 @@ run_analysis <- function( phylo_sunburst = phylogeny_prot_value, tree_msa_tool = tree_msa_tool, rep_accnums = rep_accnums_value, - msa_rep_num = 10, + msa_rep_num = 3, app_data = app_data, PhyloSelect = PhyloSelect, acc_to_name = acc_to_name_value, diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R index d2d0b303..2fc1fdbe 100644 --- a/inst/report/scripts/run_molevolvr_pipeline.R +++ b/inst/report/scripts/run_molevolvr_pipeline.R @@ -245,7 +245,6 @@ run_molevolvr_pipeline <- function(input_paths, db, nhits, eval, is.na(PREFIX)) { print("No blast results provided, moving on.") } else { - browser() file.copy(file.path(OUTDIR, paste0(PREFIX, ".ipr_domarch.tsv")), file.path(OUTDIR, paste0(PREFIX, ".full_analysis.tsv"))) } @@ -728,6 +727,7 @@ run_blastclust <- function(infile, suffix, outdir) { # Prepare output file path outfile <- file.path(outdir, paste0(suffix, ".bclust.L60S80.tsv")) + input_file <- file.path(outdir, paste0(suffix, ".bclust.L60S80.tsv.clstr")) # Print process messages @@ -741,10 +741,17 @@ run_blastclust <- function(infile, suffix, outdir) { "cd-hit -i %s -o %s -c 0.8 -aS 0.6 -T 8", infile, outfile ) + clean_command <- sprintf( + "awk '/^>Cluster/ {if(NR>1)printf \"\\n\"; next} /WP_/ {start=index($0, \"WP_\"); if(start) {end=index(substr($0, start), \"...\"); if (end == 0) end=length($0); printf \"%%s \", substr($0, start, end-1)}}' %s > %s", + input_file, + outfile + ) + cat("\nPerforming BLASTCLUST analysis on", infile, "\n") # system(blastclust_cmd) system(cdhit_command) + system(clean_command) } @@ -802,7 +809,7 @@ clust2tbl <- function(clust, blast) { } } # blast_out$AccNum <- gsub("^>", "", blast_out$AccNum) - blast_out$AccNum <- paste0(">", blast_out$AccNum) + # blast_out$AccNum <- paste0(">", blast_out$AccNum) blast_clustnames <- merge(blast_out, new_clust, by = "AccNum") for (i in 1:nrow(blast_clustnames)) { diff --git a/inst/report/scripts/viz_utils.R b/inst/report/scripts/viz_utils.R index 4d9fdfe6..41304360 100644 --- a/inst/report/scripts/viz_utils.R +++ b/inst/report/scripts/viz_utils.R @@ -91,8 +91,8 @@ generate_rs_network_layout <- function(data, app_data, ) # Validate that the result is not an error - if (res_network == "error") { - stop("Not enough nodes to construct a network.") + if (any(res_network == "error")) { + stop("Not enough nodes to construct a network.") } return(res_network) @@ -407,7 +407,7 @@ total_counts <- function(prot, column = "DomArch", lineage_col = "Lineage", cutoff = 90, RowsCutoff = FALSE, digits = 2 # type = "GC" ) { - column <- sym(column) + # column <- sym(column) prot <- select(prot, {{ column }}, {{ lineage_col }}) %>% filter(!is.na({{ column }}) & !is.na({{ lineage_col }})) %>% @@ -415,10 +415,10 @@ total_counts <- function(prot, column = "DomArch", lineage_col = "Lineage", prot <- summarizeByLineage(prot, column, by = lineage_col, query = "all") col_count <- prot %>% - group_by({{ column }}) %>% + group_by(!!sym(column)) %>% summarise(totalcount = sum(count)) - total <- left_join(prot, col_count, by = as_string(column)) + total <- left_join(prot, col_count, by = column) sum_count <- sum(total$count) total <- total %>% @@ -584,14 +584,14 @@ generate_domain_network <- function(DA_col, DACutoff, DA_Prot, # Generate the domain network res <- createDomainNetwork( prot = dn_data, - column = col, + column = DA_col, domains_of_interest = ".*", cutoff = DACutoff, layout = networkLayout ) # Validate the result - if (res == "error") { + if (any(res == "error")) { stop("Not enough nodes to construct a network. Try increasing 'Total Count Cutoff'.") } @@ -865,11 +865,11 @@ rep_accnums <- function(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data return(app_data@df$AccNum) } else { # Switch based on `msa_reduce_by` value - tmp <- filter(app_data@df, QueryName == PhyloSelect) - rep_acc_species <- RepresentativeAccNums(tmp, reduced = "Species", column = "AccNum") + tmp <- filter(app_data@df) + rep_acc_species <- createRepresentativeAccNum(tmp, reduced = "Species", accnum_col = "AccNum") # Get representative accession numbers by "Lineage" - rep_acc_lineage <- RepresentativeAccNums(tmp, reduced = "Lineage", column = "AccNum") + rep_acc_lineage <- createRepresentativeAccNum(tmp, reduced = "Lineage", accnum_col = "AccNum") switch(msa_reduce_by, "Species" = rep_acc_species, "Lineage" = rep_acc_lineage, @@ -894,3 +894,29 @@ rep_accnums <- function(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data } } +seq_tree <- function(fasta_filepath){ + my_seqs <- readAAStringSet(fasta_filepath) #, format="fasta", seek.first.rec=T) + my_seqs_msa <- msa(my_seqs) + my_seqs_msa_aln <- msaConvert(my_seqs_msa, type="seqinr::alignment") + + #below was commented out, does it need to change as one of the parameters? the bottom keeps + d <- dist.alignment(my_seqs_msa_aln, "identity") + #as.matrix(d)[2:5, "HBA1_Homo_sapiens", drop=FALSE] + + ## Phylogenetic tree + ## using package ape + ## build neighbor-joining tree + seqTree <- nj(d) + #plot(seqTree, main="Phylogenetic Tree of MSA") + groupInfo <- split(seqTree$tip.label, + gsub("_\\w+", "", seqTree$tip.label)) + seqTree <- groupOTU(seqTree, groupInfo) + # ggtree(seqTree, aes(color=group), + # layout='circular') + + # geom_tiplab(size=1, aes(angle=angle)) + #https://yulab-smu.top/treedata-book/chapter4.html + #offs <- 0 + tree <- ggtree(seqTree, branch.length = "dN_vs_dS") + theme_tree2(axis.line.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank()) + msaplot(tree, fasta=fasta_filepath, offset=0.5, bg_line = TRUE) + geom_tiplab(align=TRUE, linesize=0.5, size=3) + +} From 22e644507665de5e514bed2ba29db5f07e08c2bd Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Wed, 20 Nov 2024 02:59:59 +0200 Subject: [PATCH 06/23] camelcase case study functions Signed-off-by: Awa Synthia --- inst/report/report_template.Rmd | 20 ++--- inst/report/scripts/MolEvolData_class.R | 52 ++++++------ inst/report/scripts/generate_report.R | 62 +++++++-------- inst/report/scripts/run_molevolvr_pipeline.R | 83 ++++++++++---------- inst/report/scripts/viz_utils.R | 52 ++++++------ 5 files changed, 133 insertions(+), 136 deletions(-) diff --git a/inst/report/report_template.Rmd b/inst/report/report_template.Rmd index 46dcf57e..dddf357c 100644 --- a/inst/report/report_template.Rmd +++ b/inst/report/report_template.Rmd @@ -30,11 +30,11 @@ params: ## Tree tree_msa_tool: default ##input ## MSA - rep_accnums: default + repAccNums: default msa_rep_num: default ##input app_data: default PhyloSelect: default ##input - acc_to_name: default + acc2Name: default rval_phylo: default query_pin: default msa_reduce_by: default @@ -408,8 +408,8 @@ if (is.null(params$msa_rep_num)) { ``` ```{r, include=FALSE} -if(length(params$rep_accnums) >= 3 ) { - rep <- params$rep_accnums[1:msa_rep_num] +if(length(params$repAccNums) >= 3 ) { + rep <- params$repAccNums[1:msa_rep_num] if (!params$rval_phylo) { seqs <- readAAStringSet(params$app_data@fasta_path) names(seqs) <- sub(" .*", "", names(seqs)) @@ -425,7 +425,7 @@ if(length(params$rep_accnums) >= 3 ) { acc2FA(rep, outpath = rep_fasta_path, "sequential") rename_fasta(rep_fasta_path, rep_fasta_path, replacement_function = mapAcc2Name, - acc2name = params$acc_to_name + acc2name = params$acc2Name ) if (!params$rval_phylo) { writeXStringSet(query, rep_fasta_path, append = TRUE) @@ -440,15 +440,15 @@ library(msa) library(ape) library(tidytree) library(ggtree) -if(length(params$rep_accnums) >= 3 ) { - seq_tree(fasta_filepath = rep_msa_path) +if(length(params$repAccNums) >= 3 ) { + seqTree(fasta_filepath = rep_msa_path) } else {"Not enough representative sequences: try changing the 'Reduce By' field."} ``` ### MSA ```{r, include=FALSE} -if(length(params$rep_accnums) >= 3 ) { +if(length(params$repAccNums) >= 3 ) { if (!params$rval_phylo) { seqs <- readAAStringSet(params$app_data@fasta_path) names(seqs) <- sub(" .*", "", names(seqs)) @@ -464,7 +464,7 @@ if(length(params$rep_accnums) >= 3 ) { acc2FA(rep, outpath = rep_fasta_path, "sequential") rename_fasta(rep_fasta_path, rep_fasta_path, replacement_function = mapAcc2Name, - acc2name = params$acc_to_name + acc2name = params$acc2Name ) if (!params$rval_phylo) { writeXStringSet(query, rep_fasta_path, append = TRUE) @@ -483,7 +483,7 @@ if(length(params$rep_accnums) >= 3 ) { ```{r msa_pdf, echo = FALSE, out.width = "95%", out.height = "800px"} # apt-get install texlive -if(length(params$rep_accnums) >= 3 ) { +if(length(params$repAccNums) >= 3 ) { knitr::include_graphics(msa_pdf_path) } else {"Not enough representative sequences: try changing the 'Reduce By' field."} ``` diff --git a/inst/report/scripts/MolEvolData_class.R b/inst/report/scripts/MolEvolData_class.R index c50e2925..791f1d6a 100644 --- a/inst/report/scripts/MolEvolData_class.R +++ b/inst/report/scripts/MolEvolData_class.R @@ -72,7 +72,7 @@ top_acc <- function(cln_file, DA_col = "DomArch.Pfam", } -combine_files_nopmap <- function(inpath, pattern, outpath, +combineFilesNopmap <- function(inpath, pattern, outpath, delim = "\t", skip = 0, col_names) { source_files <- dir(path = inpath, pattern = pattern, recursive = T) @@ -111,7 +111,7 @@ full_analysis_colnames <- c("AccNum", "QueryName", "STitle", "Species.x", "Species.y", "SourceDB", "Completeness") -process_wrapper_dir <- function(path, pinName, type = "full") { +processWrapperDir <- function(path, pinName, type = "full") { if (type == "full") { if (file.exists(paste0(path, "/cln_combined.tsv")) && file.exists(paste0(path, "/ipr_combined.tsv"))) { query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv")) @@ -164,12 +164,12 @@ process_wrapper_dir <- function(path, pinName, type = "full") { com_blast_path <- paste0(path, "/blast_combined.tsv") ipr_blast_path <- paste0(path, "/ipr_combined.tsv") - com_blast_data <- combine_files_nopmap(paste0(path, "/"), + com_blast_data <- combineFilesNopmap(paste0(path, "/"), pattern = "*.full_analysis.tsv", skip = 0, col_names = c(), outpath = com_blast_path, delim = "\t" ) - ipr_blast_data <- combine_files_nopmap(paste0(path, "/"), + ipr_blast_data <- combineFilesNopmap(paste0(path, "/"), pattern = "*.iprscan_cln.tsv", skip = 0, col_names = ipr_colnames, outpath = ipr_blast_path, delim = "\t" ) @@ -212,7 +212,7 @@ process_wrapper_dir <- function(path, pinName, type = "full") { ) query_data <- query_data %>% select(Query, QueryName) com_blast_path <- paste0(path, "/blast_combined.tsv") - com_blast_data <- combine_files_nopmap(paste0(path, "/"), + com_blast_data <- combineFilesNopmap(paste0(path, "/"), pattern = "*.blast.cln.tsv", skip = 0, col_names = c(), @@ -282,7 +282,7 @@ process_wrapper_dir <- function(path, pinName, type = "full") { queries = queries, cln_path = cln_path ) com_blast_path <- paste0(path, "/blast_combined.tsv") - com_blast_data <- combine_files_nopmap(paste0(path, "/"), + com_blast_data <- combineFilesNopmap(paste0(path, "/"), pattern = "*.full_analysis.tsv", skip = 0, col_names = c(), @@ -355,8 +355,8 @@ clean_fetched <- function(df) { } # Description -# validation functions for accession numbers/headers for FASTA (validate_accnum_fasta) -# & accession number input (validate_accnum) submission types. +# validation functions for accession numbers/headers for FASTA (validateAccNumFasta) +# & accession number input (validateAccNum) submission types. library(Biostrings) library(httr) @@ -364,7 +364,7 @@ library(httr2) library(rentrez) library(shiny) -.get_accnum_from_fasta <- function(biostrings_aa_string_set, verbose = FALSE) { +getAccNumFromFasta <- function(biostrings_aa_string_set, verbose = FALSE) { # parsing/cleaning accession numbers using the same methods as # `upstream_scripts/00_submit_full.R`'s `get_sequences()` function accnums <- c() @@ -392,7 +392,7 @@ library(shiny) return(accnums) } -validate_accnum_fasta <- function(text) { +validateAccNumFasta <- function(text) { # INPUT: text from input object that houses FASTA data # validate the headers/accnums for FASTA submission # Return: @@ -434,7 +434,7 @@ validate_accnum_fasta <- function(text) { } # parse accnums from fasta - accnums <- .get_accnum_from_fasta(fasta) + accnums <- getAccNumFromFasta(fasta) tb_accnum_counts <- tibble("frequencies" = table(accnums)) # check for duplicates if (any(tb_accnum_counts$frequencies > 1)) { @@ -447,7 +447,7 @@ validate_accnum_fasta <- function(text) { #' Test whether a single accession returns a valid protein from Entrez -is_accnum_valid_entrez <- function(accnum, verbose = FALSE) { +isAccNumValidEntrez <- function(accnum, verbose = FALSE) { # empty accnum wil not raise an error from efetch, so test for this first if (nchar(accnum) <= 0) {if (verbose) {warning("empty accnum")}; return(FALSE)} @@ -467,7 +467,7 @@ is_accnum_valid_entrez <- function(accnum, verbose = FALSE) { } #' Test whether a single accession returns a valid protein from EBI -is_accnum_valid_ebi <- function(accnum, verbose = FALSE) { +isAccNumValidEbi <- function(accnum, verbose = FALSE) { # validation: ensure there's some text to parse if (nchar(accnum) <= 0) {if (verbose) {warning("empty accnum")}; return(FALSE)} # construct a httr2 request to POST an accession number, then GET the fasta @@ -492,7 +492,7 @@ is_accnum_valid_ebi <- function(accnum, verbose = FALSE) { } #' Perform a series of API reqs using NCBI entrez to validate accession numbers -perform_entrez_reqs <- function(accnums, verbose = FALSE, track_progress = FALSE) { +performEntrezReqs <- function(accnums, verbose = FALSE, track_progress = FALSE) { # API guidelines docs # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html # entrez recommends no more than 3 POSTs per second @@ -501,7 +501,7 @@ perform_entrez_reqs <- function(accnums, verbose = FALSE, track_progress = FALSE results <- vapply( X = accnums, FUN = function(accnum) { - result <- is_accnum_valid_entrez(accnum, verbose = TRUE) + result <- isAccNumValidEntrez(accnum, verbose = TRUE) i <<- i + 1L if (i >= 3L) {Sys.sleep(1); print('sleeping for entrez API reqs'); i <<- 0L} if (track_progress) {incProgress(1)} @@ -512,7 +512,7 @@ perform_entrez_reqs <- function(accnums, verbose = FALSE, track_progress = FALSE return(results) } -perform_ebi_reqs <- function(accnums, verbose = FALSE, track_progress = FALSE) { +performEbiReqs <- function(accnums, verbose = FALSE, track_progress = FALSE) { # API guidelines docs # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html # EBI allows 200 POSTs per second @@ -521,7 +521,7 @@ perform_ebi_reqs <- function(accnums, verbose = FALSE, track_progress = FALSE) { results <- vapply( X = accnums, FUN = function(accnum) { - result <- is_accnum_valid_ebi(accnum, verbose = TRUE) + result <- isAccNumValidEbi(accnum, verbose = TRUE) i <<- i + 1L if (i >= 200L) {Sys.sleep(1); i <<- 0L} if (track_progress) incProgress(1) @@ -533,7 +533,7 @@ perform_ebi_reqs <- function(accnums, verbose = FALSE, track_progress = FALSE) { } #' Validate accession numbers from MolEvolvR user input -validate_accnum <- function(text, verbose = FALSE, track_progress = FALSE, n_steps = integer()) { +validateAccNum <- function(text, verbose = FALSE, track_progress = FALSE, n_steps = integer()) { # API guidelines docs # entrez https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Usage_Guidelines_and_Requiremen # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html @@ -558,7 +558,7 @@ validate_accnum <- function(text, verbose = FALSE, track_progress = FALSE, n_ste if (track_progress) {incProgress(1 / n_steps, message = "Please wait (searching NCBI's protein database) . . .")} # entrez API - entrez_results <- perform_entrez_reqs(accnums, verbose = verbose) + entrez_results <- performEntrezReqs(accnums, verbose = verbose) if (verbose) { stringr::str_glue("rentrez results:\n\t{paste0(entrez_results, collapse=",")}\n") |> @@ -568,7 +568,7 @@ validate_accnum <- function(text, verbose = FALSE, track_progress = FALSE, n_ste # EBI API # if all of entrez resulted in success, then skip EBI. Else, try EBI if (!all(entrez_results)) { - ebi_results <- perform_ebi_reqs(accnums, verbose = verbose) + ebi_results <- performEbiReqs(accnums, verbose = verbose) if (verbose) { stringr::str_glue("ebi results:\n\t{paste0(ebi_results, collapse=",")}\n") |> @@ -606,7 +606,7 @@ validate_accnum <- function(text, verbose = FALSE, track_progress = FALSE, n_ste # Description # validation function for evalue input #=============================================================================== -validate_evalue <- function(input_value) { +validateEvalue <- function(input_value) { is_valid_evalue <- is.numeric(input_value) && input_value != 0 return(is_valid_evalue) } @@ -619,7 +619,7 @@ validate_evalue <- function(input_value) { library("Biostrings") library("tidyverse") #------------------------------------------------------------------------------- -.guess_seq_type <- function(single_fasta, dna_guess_cutoff = 0.9, other_guess_cutoff = 0.5) { +guessSeqType <- function(single_fasta, dna_guess_cutoff = 0.9, other_guess_cutoff = 0.5) { tb <- as_tibble(alphabetFrequency(single_fasta)) n_other <- if ("other" %in% colnames(tb)) sum(unlist(tb["other"])) else 0 @@ -649,7 +649,7 @@ library("tidyverse") return(guess) } -.validate_seq_body <- function(text) { +validateSeqBody <- function(text) { # convert string to single letter character vector individual_chars <- unlist(strsplit(text, "")) # return the characters from input that are NOT in the AA_ALPHABET @@ -669,7 +669,7 @@ library("tidyverse") } #------------------------------------------------------------------------------- -validate_fasta <- function(fasta_path, .type = "AA") { +validateFasta <- function(fasta_path, .type = "AA") { # handle case of fasta being unreadable/unrecognized format fasta <- tryCatch( expr = Biostrings::readAAStringSet(fasta_path), @@ -695,7 +695,7 @@ validate_fasta <- function(fasta_path, .type = "AA") { seq_body_validations <- c() for (i in seq_along(1:length(fasta))) { seq_body <- as.character(fasta[i]) - seq_body_validations <- c(.validate_seq_body(seq_body), seq_body_validations) + seq_body_validations <- c(validateSeqBody(seq_body), seq_body_validations) } # require all sequences to have chars only in AA_ALPHABET if (!(all(seq_body_validations))) { @@ -707,7 +707,7 @@ validate_fasta <- function(fasta_path, .type = "AA") { # iteratively guess the type (DNA, AA, or other of each seq in the FASTA file) seq_types <- c() for (idx in seq_along(1:length(fasta))) { - seq_types <- c(.guess_seq_type(fasta[idx]), seq_types) + seq_types <- c(guessSeqType(fasta[idx]), seq_types) } switch(.type, diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R index 3ea331d5..4ad5b076 100644 --- a/inst/report/scripts/generate_report.R +++ b/inst/report/scripts/generate_report.R @@ -2,8 +2,7 @@ # Last modified: 2024 # get fasta of pathogen and/or drug -#' @export -get_card_data <- function(pathogen = NULL, drug = NULL) { +getCardData <- function(pathogen = NULL, drug = NULL) { destination_dir <- "CARD_data" # Check if CARD data exists if (!dir.exists(destination_dir)) { @@ -73,8 +72,7 @@ get_card_data <- function(pathogen = NULL, drug = NULL) { } # Run analysis -#' @export -run_analysis <- function( +runAnalysis <- function( upload_type = "Fasta", evalue = 0.00001, accnum_fasta_input = "", @@ -224,13 +222,13 @@ run_analysis <- function( # Validation of inputs fasta_data <- read_file(file_paths$fasta) validate_and_process_inputs <- function(evalue, fasta_data) { - if (!validate_evalue(evalue)) { + if (!validateEvalue(evalue)) { return("Error: A numeric E-value is required. Please set a valid value (e.g., 0.0001).") } - if (!validate_accnum_fasta(fasta_data)) { + if (!validateAccNumFasta(fasta_data)) { return("Error: Input for AccNum/Fasta cannot be empty or invalid.") } @@ -327,7 +325,7 @@ run_analysis <- function( } # Validate accession numbers for FASTA submission - if (!(validate_accnum_fasta(sequence_upload_data@seqs))) { + if (!(validateAccNumFasta(sequence_upload_data@seqs))) { stop("Error: Please adjust the FASTA headers. Ensure a header line for each sequence, no duplicate header names, and no duplicate protein accession numbers.") @@ -338,7 +336,7 @@ run_analysis <- function( expr = { tmp_file <- tempfile() writeLines(sequence_upload_data@seqs, tmp_file) - validate_fasta(tmp_file) + validateFasta(tmp_file) }, error = function(e) { warning("Error: Failed to run input FASTA verification") @@ -454,7 +452,7 @@ run_analysis <- function( job_code = pin_id, ) } else { - submit_full( + runFull( dir = dir, sequences = path, DB = blast_db, @@ -575,7 +573,7 @@ run_analysis <- function( } # Process results for results - fetched <- process_wrapper_dir(dir, pinName = pinName, type = type) + fetched <- processWrapperDir(dir, pinName = pinName, type = type) # Assign fetched data to objects if the fetched list has the expected length if (length(fetched) == 2) { @@ -586,69 +584,69 @@ run_analysis <- function( r_nrow_initial <- nrow(fetched[[1]]@df) # Initialize row count } - domarch_cols_value <- get_domarch_cols(app_data, DASelect) + domarch_cols_value <- getDomArchCols(app_data, DASelect) - query_domarch_cols_value <- get_domarch_columns(query_data@df) + query_domarch_cols_value <- getDomArchCols(query_data@df) - mainTable_value <- generate_data_table(data) + mainTable_value <- getDataTable(data) - queryDataTable_value <- generate_query_data_table(query_data, query_select) + queryDataTable_value <- getQueryDataTable(query_data, query_select) - fastaDataText_value <- get_fasta_data(query_data@fasta_path) + fastaDataText_value <- getFastaData(query_data@fasta_path) - domainDataText_value <- get_domain_data(data) + domainDataText_value <- getDomData(data) - msaDataText_value <- get_msa_data(query_data@msa_path) + msaDataText_value <- getMSAData(query_data@msa_path) - rs_IprGenes_value <- generate_ipr_genes_visualization(data, + rs_IprGenes_value <- getIPRGenesVisualization(data, app_data, input_rs_iprDatabases, input_rs_iprVisType) - rs_network_layout_value <- generate_rs_network_layout(data, + rs_network_layout_value <- getRSNetworkLayout(data, app_data, cutoff = 100, layout = "nice") - rs_data_table_value <- generate_data_table(data) + rs_data_table_value <- getDataTable(data) - da_IprGenes_value <- generate_da_ipr_genes_plot(app_data, + da_IprGenes_value <- getDomArchIPRGenesPlot(app_data, da_iprDatabases, da_iprVisType, DASelect) - query_heatmap_value <- generate_query_heatmap(query_data@df, + query_heatmap_value <- getQueryHeatmap(query_data@df, heatmap_select = "All", heatmap_color = "blue") - DA_Prot_value <- get_DA_Prot(app_data, DASelect) + DA_Prot_value <- getDomArchProt(app_data, DASelect) - DALinPlot_value <- generate_DA_heatmap_plot(DA_col = "DomArch.Pfam", + DALinPlot_value <- getDomArchHeatmapPlot(DA_col = "DomArch.Pfam", DACutoff, DA_Prot_value, DA_lin_color = "viridis", app_data@ipr_path) - DALin_TotalCounts_value <- DA_TotalCounts(DA_Prot_value, + DALin_TotalCounts_value <- getDomArchTotalCounts(DA_Prot_value, DACutoff, DA_col = "DomArch.Pfam", app_data) - DALinTable_value <- generate_DA_lin_table(DA_col = "DomArch.Pfam", + DALinTable_value <- getDomArchLinearTable(DA_col = "DomArch.Pfam", app_data@ipr_path, DALin_TotalCounts_value) - DANetwork_value <- generate_domain_network(DA_col = "DomArch.Pfam", + DANetwork_value <- getDomNetwork(DA_col = "DomArch.Pfam", DACutoff, DA_Prot_value, networkLayout = "nice", app_data@ipr_path) - rep_accnums_value <- rep_accnums(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data) + rep_accnums_value <- repAccNums(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data) - phylogeny_prot_value <- filter_phylogeny_proteins(app_data, phylo_select) + phylogeny_prot_value <- filterPhylogenyProteins(app_data, phylo_select) - acc_to_name_value <- acc_to_name(app_data) + acc_to_name_value <- acc2Name(app_data) ####### Report Generation ######## @@ -681,11 +679,11 @@ run_analysis <- function( phylo_sunburst_levels = levels, phylo_sunburst = phylogeny_prot_value, tree_msa_tool = tree_msa_tool, - rep_accnums = rep_accnums_value, + repAccNums = rep_accnums_value, msa_rep_num = 3, app_data = app_data, PhyloSelect = PhyloSelect, - acc_to_name = acc_to_name_value, + acc2Name = acc_to_name_value, rval_phylo = rval_phylo, msa_reduce_by = msa_reduce_by ) diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R index 2fc1fdbe..ea52a3b5 100644 --- a/inst/report/scripts/run_molevolvr_pipeline.R +++ b/inst/report/scripts/run_molevolvr_pipeline.R @@ -7,7 +7,7 @@ library(data.table) library(readr) library(rentrez) -get_sequences <- function(sequences, +getSeqs <- function(sequences, acc_file_path = "accs.txt", dir_path = "~", separate = TRUE) { @@ -35,7 +35,7 @@ get_sequences <- function(sequences, writeXStringSet(seqs, sequences, format = "fasta") return(length(seqs)) } -submit_full <- function( +runFull <- function( dir = "/data/scratch", DB = Sys.getenv("BLAST_DB", unset = "refseq"), NHITS = Sys.getenv("BLAST_HITS", unset = 100), @@ -70,13 +70,13 @@ submit_full <- function( # Create a log file write("START_DT\tSTOP_DT\tquery\tdblast\tacc2info\tdblast_cleanup\tacc2fa - \tblast_clust\tclust2table\tiprscan\tipr2lineage\tipr2da\tduration", + \tblast_clust\tclust2table\tiprscan\tipr2lineage\tipr2DomArch\tduration", "logfile.tsv") # Process sequences (local handling) if (phylo == "FALSE") { # Split the sequences if needed, store them locally - num_seqs <- get_sequences(sequences, dir_path = dir, separate = TRUE) + num_seqs <- getSeqs(sequences, dir_path = dir, separate = TRUE) fasta <- Biostrings::readAAStringSet(sequences) headers_original <- names(fasta) @@ -89,7 +89,7 @@ submit_full <- function( # output_file <- paste0(dir, "/blast_output_", i, ".txt") # Construct the local BLAST command (make sure 'blastn' is available locally) - run_molevolvr_pipeline(input_file, DB, NHITS, EVAL, is_query = F, type, i) + runMolevolvrPipeline(input_file, DB, NHITS, EVAL, is_query = F, type, i) #cmd <- sprintf( # "deltablast -query %s -db %s -out %s -num_alignments %d -evalue %f -remote", @@ -107,7 +107,7 @@ submit_full <- function( } # Simulate query run locally - run_molevolvr_pipeline(sequences, DB, NHITS, EVAL, is_query = TRUE, type) + runMolevolvrPipeline(sequences, DB, NHITS, EVAL, is_query = TRUE, type) # cmd_query <- sprintf( # "deltablast -query %s -db %s -out %s_query.txt -num_alignments %d -evalue %f -remote", # sequences, DB, paste0(dir, "/query_output"), NHITS, EVAL @@ -124,8 +124,7 @@ submit_full <- function( } # Define the main pipeline function -#' @export -run_molevolvr_pipeline <- function(input_paths, db, nhits, eval, +runMolevolvrPipeline <- function(input_paths, db, nhits, eval, is_query, type, i) { # Start time @@ -168,9 +167,9 @@ run_molevolvr_pipeline <- function(input_paths, db, nhits, eval, # setwd(OUTDIR) # Run acc2info - run_acc2info(parsed_accnums_file, PREFIX, OUTDIR) + runAcc2Info(parsed_accnums_file, PREFIX, OUTDIR) - replace_accession_numbers(file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), + replaceAccNums(file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), file.path(OUTPATH, "query-fasta_header-map.tsv"), file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv"))) @@ -188,18 +187,18 @@ run_molevolvr_pipeline <- function(input_paths, db, nhits, eval, # setwd(OUTDIR) # Run DELTABLAST - run_deltablast(input_paths, PREFIX, OUTDIR, db, nhits, eval) + runDeltablast(input_paths, PREFIX, OUTDIR, db, nhits, eval) # Run ACC2FA - convert_accnum_to_fasta(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")), + convertAccNum2Fasta(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")), PREFIX, OUTDIR) # Run ACC2INFO - run_acc2info(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt")), + runAcc2Info(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt")), PREFIX, OUTDIR) # Clean up BLAST results - cleanup_blast(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")), + cleanupBlast(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")), file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), PREFIX, F) @@ -208,15 +207,15 @@ run_molevolvr_pipeline <- function(input_paths, db, nhits, eval, # Sys.sleep(30) # Run BLASTCLUST - run_blastclust(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")), + runBlastclust(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")), PREFIX, OUTDIR ) # Convert clusters to table - clust2tbl(file.path(OUTDIR, paste0(PREFIX, ".bclust.L60S80.tsv")), + clust2Table(file.path(OUTDIR, paste0(PREFIX, ".bclust.L60S80.tsv")), file.path(OUTDIR, paste0(PREFIX, ".blast.cln.tsv"))) # Run INTERPROSCAN - run_interproscan(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")), + runIPRScan(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")), PREFIX, OUTDIR) new_header <- c("AccNum", "SeqMD5Digest", "SLength", "Analysis", "DB.ID", "SignDesc", "StartLoc", "StopLoc", "Score", @@ -230,17 +229,17 @@ run_molevolvr_pipeline <- function(input_paths, db, nhits, eval, write.table(temp_data, file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")), sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE) # Run IPR2LIN - ipr2lin(file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")), + ipr2Linear(file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")), file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), PREFIX) - # Optionally run IPR2DA if IS_QUERY is true + # Optionally run ipr2DomArch if IS_QUERY is true if (is_query == TRUE) { - ## perform ipr2da on iprscan results - da <- ipr2da(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")), + ## perform ipr2DomArch on iprscan results + da <- ipr2DomArch(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")), PREFIX, "NA") - ## if blast results are provided, call append_ipr + ## if blast results are provided, call appendIPR if (is.null(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv"))) | is.na(PREFIX)) { print("No blast results provided, moving on.") @@ -249,16 +248,16 @@ run_molevolvr_pipeline <- function(input_paths, db, nhits, eval, file.path(OUTDIR, paste0(PREFIX, ".full_analysis.tsv"))) } } else { - # perform ipr2da on iprscan results - da <- ipr2da(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")), + # perform ipr2DomArch on iprscan results + da <- ipr2DomArch(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")), PREFIX) - ## if blast results are provided, call append_ipr + ## if blast results are provided, call appendIPR if (is.null(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv"))) | is.na(PREFIX)) { print("No blast results provided, moving on.") } else { - append_ipr(ipr_da = da, + appendIPR(ipr_da = da, blast = file.path(OUTDIR, paste0(PREFIX, ".cln.clust.tsv")), prefix = PREFIX) } @@ -438,7 +437,7 @@ acc2info <- function(infile, prefix, outdir) { cat("Data saved to:", outfile, "\n") } -acc2info_phylo <- function(infile, outdir) { +acc2InfoPhylo <- function(infile, outdir) { # Ensure output directory exists if (!dir.exists(outdir)) { dir.create(outdir, recursive = TRUE) @@ -498,15 +497,15 @@ acc2info_phylo <- function(infile, outdir) { } # Main function to run based on the prefix -run_acc2info <- function(infile, prefix, outdir) { +runAcc2Info <- function(infile, prefix, outdir) { if (prefix == "NA") { - acc2info_phylo(infile, outdir) + acc2InfoPhylo(infile, outdir) } else { acc2info(infile, prefix, outdir) } } -substitute_accnum_for_acc2info <- function(df_acc2info, df_header_map) { +subsAccnum4cc2Info <- function(df_acc2info, df_header_map) { df_result <- df_header_map |> # set column name in header map to match accnum col in acc2info dplyr::rename(AccNum = header_accnum) |> @@ -522,7 +521,7 @@ substitute_accnum_for_acc2info <- function(df_acc2info, df_header_map) { } -replace_accession_numbers <- function(path_acc2info, +replaceAccNums <- function(path_acc2info, path_query_header_map, path_out) { # Read the input files @@ -530,7 +529,7 @@ replace_accession_numbers <- function(path_acc2info, df_query_header_map <- read_tsv(path_query_header_map) # Substitute accession numbers - df_acc2info_substituted <- substitute_accnum_for_acc2info(df_acc2info, + df_acc2info_substituted <- subsAccnum4cc2Info(df_acc2info, df_query_header_map) # Print the substituted dataframe @@ -542,7 +541,7 @@ replace_accession_numbers <- function(path_acc2info, } -run_deltablast <- function(infile, prefix, outdir, +runDeltablast <- function(infile, prefix, outdir, db = "refseq_protein", nhits = 5000, evalue = 1e-5, threads = 10) { @@ -580,7 +579,7 @@ run_deltablast <- function(infile, prefix, outdir, # This script converts AccNum to Fasta using NCBI's EDirect or EBI's API -convert_accnum_to_fasta <- function(infile, prefix, outdir) { +convertAccNum2Fasta <- function(infile, prefix, outdir) { # Create the output file path outfile <- file.path(outdir, paste0(prefix, ".all_accnums.fa")) @@ -649,7 +648,7 @@ convert_accnum_to_fasta <- function(infile, prefix, outdir) { } -cleanup_blast <- function(infile_blast, acc2info, prefix, wblast = F) { +cleanupBlast <- function(infile_blast, acc2info, prefix, wblast = F) { outdir <- dirname(infile_blast) # Load and clean acc2info file @@ -723,7 +722,7 @@ cleanup_blast <- function(infile_blast, acc2info, prefix, wblast = F) { # Function to run BLASTCLUST on given input -run_blastclust <- function(infile, suffix, outdir) { +runBlastclust <- function(infile, suffix, outdir) { # Prepare output file path outfile <- file.path(outdir, paste0(suffix, ".bclust.L60S80.tsv")) @@ -756,7 +755,7 @@ run_blastclust <- function(infile, suffix, outdir) { } # Function to format blastclust output -clust2tbl <- function(clust, blast) { +clust2Table <- function(clust, blast) { clust_out <- read_tsv(file = clust, col_names = F) blast_out <- read_tsv(file = blast, col_names = T) ## Count the number of accession numbers in a cluster @@ -831,7 +830,7 @@ clust2tbl <- function(clust, blast) { } # Function to run InterProScan -run_interproscan <- function(query_file, prefix, outdir) { +runIPRScan <- function(query_file, prefix, outdir) { # Start InterProScan run cat("\n######################\n") @@ -864,7 +863,7 @@ run_interproscan <- function(query_file, prefix, outdir) { cat("##################\n") } -ipr2lin <- function(ipr, acc2info, prefix) { +ipr2Linear <- function(ipr, acc2info, prefix) { # read in iprscan results # duplicate rows in iprscan file ipr_in <- read_tsv(ipr, col_names = TRUE) %>% @@ -964,7 +963,7 @@ ipr2lin <- function(ipr, acc2info, prefix) { write_tsv(ipr_cln, file.path(paste0(outdir, "/",paste0(prefix,".iprscan_cln.tsv")))) } -ipr2da <- function(infile_ipr, prefix, +ipr2DomArch <- function(infile_ipr, prefix, analysis = c( "Pfam", "SMART", "Phobius", "Gene3D", "TMHMM", "SignalP_GRAM_POSITIVE", @@ -1034,8 +1033,8 @@ ipr2da <- function(infile_ipr, prefix, return(domarch2) } -## function to add results from ipr2da to blast results -append_ipr <- function(ipr_da, blast, prefix) { +## function to add results from ipr2DomArch to blast results +appendIPR <- function(ipr_da, blast, prefix) { # ! an 'AccNum' or 'AccNum.noV' column is required in blast table for joining ! blast_out <- read_tsv(blast, col_names = T) if ("AccNum.noV" %in% colnames(blast_out)) { diff --git a/inst/report/scripts/viz_utils.R b/inst/report/scripts/viz_utils.R index 41304360..573e0739 100644 --- a/inst/report/scripts/viz_utils.R +++ b/inst/report/scripts/viz_utils.R @@ -9,7 +9,7 @@ library(DT) library(plotly) # Function to generate the InterProScan Visualization -generate_ipr_genes_visualization <- function(data, app_data, +getIPRGenesVisualization <- function(data, app_data, input_rs_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"), input_rs_iprVisType = "Analysis") { @@ -57,7 +57,7 @@ generate_ipr_genes_visualization <- function(data, app_data, } # Function to generate the domain network layout visualization -generate_rs_network_layout <- function(data, app_data, +getRSNetworkLayout <- function(data, app_data, cutoff = 100, layout = "nice") { @@ -99,7 +99,7 @@ generate_rs_network_layout <- function(data, app_data, } # Function to generate the data table -generate_data_table <- function(data) { +getDataTable <- function(data) { if (nrow(data@df) == 0) { stop("No data available. Please ensure you have uploaded your data correctly.") @@ -165,7 +165,7 @@ generate_data_table <- function(data) { } # Function to generate query data table -generate_query_data_table <- function(query_data, query_select = NULL) { +getQueryDataTable <- function(query_data, query_select = NULL) { # Check if analysis is loaded and data is available if (nrow(query_data@df) == 0) { @@ -239,7 +239,7 @@ generate_query_data_table <- function(query_data, query_select = NULL) { } # Function to read and return the FASTA file contents -read_fasta_data <- function(fasta_path) { +readFastaData <- function(fasta_path) { # Check if analysis is loaded and the file path is not empty if (fasta_path == "" || !file.exists(fasta_path)) { @@ -252,7 +252,7 @@ read_fasta_data <- function(fasta_path) { return(fasta_content) } -get_fasta_data <- function(fasta_path) { +getFastaData <- function(fasta_path) { if (is.null(fasta_path) || fasta_path == "") { stop("Error: FASTA path is not provided.") } @@ -260,12 +260,12 @@ get_fasta_data <- function(fasta_path) { } # Function to get domain sequences (assumes `data` is a predefined object) -get_domain_data <- function(data) { +getDomData <- function(data) { return(data@domainSeqs) # Return domain sequences } # Function to get MSA data from a given path -get_msa_data <- function(msa_path) { +getMSAData <- function(msa_path) { if (is.null(msa_path) || msa_path == "") { stop("Error: MSA path is not provided.") } @@ -273,7 +273,7 @@ get_msa_data <- function(msa_path) { } # Function to generate a heatmap -generate_query_heatmap <- function(query_data_df, +getQueryHeatmap <- function(query_data_df, heatmap_select = "All", heatmap_color = "blue") { @@ -307,7 +307,7 @@ generate_query_heatmap <- function(query_data_df, } # Function to retrieve domain architecture columns -get_domarch_columns <- function(query_data_df) { +getDomArchCols <- function(query_data_df) { # Check if query data exists if (nrow(query_data_df) == 0) { stop("No query data available.") @@ -339,7 +339,7 @@ get_domarch_columns <- function(query_data_df) { } # Function to generate main data table -generate_main_table <- function(data_df, main_select = NULL) { +getMainTable <- function(data_df, main_select = NULL) { # Validate input data if (nrow(data_df) == 0) { stop("No data available. Please ensure you have uploaded your data @@ -403,7 +403,7 @@ generate_main_table <- function(data_df, main_select = NULL) { return(datatable_output) } -total_counts <- function(prot, column = "DomArch", lineage_col = "Lineage", +totalCounts <- function(prot, column = "DomArch", lineage_col = "Lineage", cutoff = 90, RowsCutoff = FALSE, digits = 2 # type = "GC" ) { @@ -467,14 +467,14 @@ total_counts <- function(prot, column = "DomArch", lineage_col = "Lineage", return(total) } -DA_TotalCounts <- function(DA_Prot, DACutoff, DA_col, app_data) { +getDomArchTotalCounts <- function(DA_Prot, DACutoff, DA_col, app_data) { # Check if ipr_path is not empty if (app_data@ipr_path == "") { stop("ipr_path is missing.") } # Calculate total counts with the specified cutoff and column - prot_tc <- total_counts(DA_Prot, cutoff = DACutoff, column = DA_col) + prot_tc <- totalCounts(DA_Prot, cutoff = DACutoff, column = DA_col) # Replace all instances of ">" in the Lineage column with "_" prot_tc$Lineage <- map(prot_tc$Lineage, ~ str_replace_all(.x, ">", "_")) %>% @@ -485,7 +485,7 @@ DA_TotalCounts <- function(DA_Prot, DACutoff, DA_col, app_data) { } # Function to generate Domain Architecture Linear Table -generate_DA_lin_table <- function(DA_col, ipr_path, DA_TotalCounts_value) { +getDomArchLinearTable <- function(DA_col, ipr_path, DA_TotalCounts_value) { # Check if ipr_path is valid if (ipr_path == "") { stop("InterPro path is empty.") @@ -534,7 +534,7 @@ generate_DA_lin_table <- function(DA_col, ipr_path, DA_TotalCounts_value) { } # Function to generate the Domain Architecture Lineage Plot -generate_DA_heatmap_plot <- function(DA_col, DACutoff, +getDomArchHeatmapPlot <- function(DA_col, DACutoff, DA_Prot, DA_lin_color, ipr_path) { # Check if ipr_path is valid @@ -564,7 +564,7 @@ generate_DA_heatmap_plot <- function(DA_col, DACutoff, } # Function to generate the Domain Architecture Network -generate_domain_network <- function(DA_col, DACutoff, DA_Prot, +getDomNetwork <- function(DA_col, DACutoff, DA_Prot, networkLayout, ipr_path) { # Check if ipr_path is valid if (ipr_path == "") { @@ -628,7 +628,7 @@ generate_domain_network <- function(DA_col, DACutoff, DA_Prot, } # Function to retrieve and clean Domain Architecture data -get_DA_Prot <- function(app_data, DASelect) { +getDomArchProt <- function(app_data, DASelect) { # Check if the ipr_path is valid if (app_data@ipr_path == "") { stop("InterPro path is empty.") @@ -667,7 +667,7 @@ get_DA_Prot <- function(app_data, DASelect) { } # Function to retrieve domain architecture columns -get_domarch_cols <- function(app_data, DASelect) { +getDomArchCols <- function(app_data, DASelect) { # Check if app data DataFrame is not empty if (nrow(app_data@df) <= 0) { stop("No data available in app data.") @@ -712,7 +712,7 @@ get_domarch_cols <- function(app_data, DASelect) { } # Function to generate the IPR genes plot -generate_da_ipr_genes_plot <- function(app_data, da_iprDatabases, +getDomArchIPRGenesPlot <- function(app_data, da_iprDatabases, da_iprVisType, DASelect) { if (app_data@ipr_path == "") { @@ -760,7 +760,7 @@ generate_da_ipr_genes_plot <- function(app_data, da_iprDatabases, } # Function to filter proteins for phylogeny -filter_phylogeny_proteins <- function(app_data, phylo_select) { +filterPhylogenyProteins <- function(app_data, phylo_select) { # Get the data frame from app_data df <- app_data@df @@ -783,7 +783,7 @@ filter_phylogeny_proteins <- function(app_data, phylo_select) { } # Function to retrieve representative accession numbers -get_representative_accession_numbers <- function(app_data, phylo_select, +getRepAccNum <- function(app_data, phylo_select, msa_reduce_by, msa_rep_num, rval_phylo) { @@ -814,7 +814,7 @@ get_representative_accession_numbers <- function(app_data, phylo_select, } } -generate_domain_architecture_plot <- function(ipr_path, query_names, +getDomArchPlot <- function(ipr_path, query_names, analysis_type, group_by) { # Check if the input path is provided if (is.null(ipr_path) || ipr_path == "") { @@ -839,7 +839,7 @@ generate_domain_architecture_plot <- function(ipr_path, query_names, } # Function to convert accessions to names -acc_to_name <- function(app_data) { +acc2Name <- function(app_data) { # Check if "AccNum" is a column in the data if (!("AccNum" %in% colnames(app_data@df))) { stop("Column 'AccNum' not found in data.") @@ -859,7 +859,7 @@ acc_to_name <- function(app_data) { return(df) } -rep_accnums <- function(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data) { +repAccNums <- function(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data) { # If `phylo` is true, return all `AccNum` values from `app_data` if (phylo) { return(app_data@df$AccNum) @@ -894,7 +894,7 @@ rep_accnums <- function(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data } } -seq_tree <- function(fasta_filepath){ +seqTree <- function(fasta_filepath){ my_seqs <- readAAStringSet(fasta_filepath) #, format="fasta", seek.first.rec=T) my_seqs_msa <- msa(my_seqs) my_seqs_msa_aln <- msaConvert(my_seqs_msa, type="seqinr::alignment") From 5f360b147aaf43bf221c423532b4d175af725eab Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Wed, 20 Nov 2024 03:05:34 +0200 Subject: [PATCH 07/23] trailing spaces in sum Signed-off-by: Awa Synthia --- R/summarize.R | 138 ++++++++++++++++++++++++++------------------------ 1 file changed, 73 insertions(+), 65 deletions(-) diff --git a/R/summarize.R b/R/summarize.R index 350c533d..9ec823de 100644 --- a/R/summarize.R +++ b/R/summarize.R @@ -36,7 +36,7 @@ #' filterByDomains() #' } filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remove = c(), - ignore.case = FALSE) { + ignore.case = FALSE) { # Only rows with a domain in doms_keep will be kept # Any row containing a domain in doms_remove will be removed @@ -213,7 +213,7 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms" valid_types <- c("da2doms", "doms2da") if (!conversion_type %in% valid_types) { abort(paste("Error: Invalid 'conversion_type'. Must be one of:", - paste(valid_types, collapse = ", "))) + paste(valid_types, collapse = ", "))) } z1 <- prot %>% @@ -221,16 +221,16 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms" str_replace_all("\\,", " ") %>% str_replace_all("\"", " ") switch(conversion_type, - da2doms = { - z2 <- z1 %>% - str_replace_all("\\+", " ") - }, - gc2da = { - z2 <- z1 %>% - str_replace_all("\\<-", " ") %>% - str_replace_all("-\\>", " ") %>% - str_replace_all("\\|", " ") - } + da2doms = { + z2 <- z1 %>% + str_replace_all("\\+", " ") + }, + gc2da = { + z2 <- z1 %>% + str_replace_all("\\<-", " ") %>% + str_replace_all("-\\>", " ") %>% + str_replace_all("\\|", " ") + } ) # str_replace_all("^c\\($", " ") %>% # remove "c(" # str_replace_all("\\)$", " ") %>% # remove ")" @@ -385,7 +385,7 @@ NULL #' } #' summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage", - query) { + query) { # Check if 'prot' is a data frame if (!is.data.frame(prot)) { abort("Error: 'prot' must be a data frame.") @@ -431,6 +431,7 @@ summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage" #' named `DomArch` and `Lineage`. #' #' @importFrom dplyr arrange count desc filter group_by summarise +#' @importFrom rlang .data #' #' @return A tibble summarizing the counts of unique domain architectures #' (`DomArch`) per lineage (`Lineage`). The resulting table contains three @@ -443,7 +444,7 @@ summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage" #' #' @examples #' \dontrun{ -#' summarizeDomArch_ByLineage() +#' summarizeDomArch_ByLineage(data1) #' } summarizeDomArch_ByLineage <- function(x) { # Check if 'x' is a data frame @@ -460,8 +461,8 @@ summarizeDomArch_ByLineage <- function(x) { missing:", paste(missing_columns, collapse = ", "))) } x %>% - filter(!grepl("^-$", DomArch)) %>% - group_by(DomArch, Lineage) %>% + filter(!grepl("^-$", .data$DomArch)) %>% + group_by(.data$DomArch, .data$Lineage) %>% summarise(count = n()) %>% # , bin=as.numeric(as.logical(n())) arrange(desc(count)) } @@ -476,7 +477,8 @@ summarizeDomArch_ByLineage <- function(x) { #' named `DomArch` and a count column, such as `count`, which represents the #' occurrences of each architecture in various lineages. #' -#' @importFrom dplyr arrange group_by filter summarise +#' @importFrom dplyr arrange group_by filter summarise desc +#' @importFrom rlang .data #' #' @return A tibble summarizing each unique `DomArch`, along with the following #' columns: @@ -489,7 +491,7 @@ summarizeDomArch_ByLineage <- function(x) { #' #' @examples #' \dontrun{ -#' summarizeDomArch() +#' summarizeDomArch(data1) #' } summarizeDomArch <- function(x) { # Check if 'x' is a data frame @@ -497,11 +499,14 @@ summarizeDomArch <- function(x) { abort("Error: 'x' must be a data frame.") } x %>% - group_by(DomArch) %>% - summarise(totalcount = sum(count), totallin = n()) %>% # totallin=n_distinct(Lineage), - arrange(desc(totallin), desc(totalcount)) %>% - filter(!grepl(" \\{n\\}", DomArch)) %>% - filter(!grepl("^-$", DomArch)) + group_by(.data$DomArch) %>% + summarise( + totalcount = sum(.data$count), + totallin = n() + ) %>% + arrange(desc(.data$totallin), desc(.data$totalcount)) %>% + filter(!grepl(" \\{n\\}", .data$DomArch)) %>% + filter(!grepl("^-$", .data$DomArch)) } #' summarizeGenContext_ByDomArchLineage @@ -510,6 +515,7 @@ summarizeDomArch <- function(x) { #' named `GenContext`, `DomArch`, and `Lineage`. #' #' @importFrom dplyr arrange desc filter group_by n summarise +#' @importFrom rlang .data #' #' @return A tibble summarizing each unique combination of `GenContext`, #' `DomArch`, and `Lineage`, along with the following columns: @@ -525,7 +531,7 @@ summarizeDomArch <- function(x) { #' #' @examples #' \dontrun{ -#' summarizeGenContext_ByDomArchLineage +#' summarizeGenContext_ByDomArchLineage(your_data) #' } summarizeGenContext_ByDomArchLineage <- function(x) { # Check if 'x' is a data frame @@ -533,28 +539,31 @@ summarizeGenContext_ByDomArchLineage <- function(x) { abort("Error: 'x' must be a data frame.") } x %>% - filter(!grepl("^-$", GenContext)) %>% - filter(!grepl("^-$", DomArch)) %>% - filter(!grepl("^-$", Lineage)) %>% - filter(!grepl("^NA$", DomArch)) %>% - group_by(GenContext, DomArch, Lineage) %>% - summarise(count = n()) %>% # , bin=as.numeric(as.logical(n())) - arrange(desc(count)) + filter(!grepl("^-$", .data$GenContext)) %>% + filter(!grepl("^-$", .data$DomArch)) %>% + filter(!grepl("^-$", .data$Lineage)) %>% + filter(!grepl("^NA$", .data$DomArch)) %>% + group_by(.data$GenContext, .data$DomArch, .data$Lineage) %>% + summarise(count = n()) %>% + arrange(desc(.data$count)) } #' summarizeGenContext_ByLineage #' -#' @param x A dataframe or tibble containing the data. +#' @param x A dataframe or tibble containing the data. It must have columns +#' named `GenContext`, `DomArch`, and `Lineage`. #' #' @importFrom dplyr arrange desc filter group_by n summarise +#' @importFrom rlang .data #' -#' @return Describe return, in detail +#' @return A tibble summarizing each unique combination of `GenContext` and `Lineage`, +#' along with the count of occurrences. The results are arranged in descending order of count. #' @rdname MolEvolvR_summary #' @export #' #' @examples #' \dontrun{ -#' summarizeGenContext_ByLineage() +#' summarizeGenContext_ByLineage(your_data) #' } summarizeGenContext_ByLineage <- function(x) { # Check if 'x' is a data frame @@ -562,36 +571,35 @@ summarizeGenContext_ByLineage <- function(x) { abort("Error: 'x' must be a data frame.") } x %>% - filter(!grepl("^-$", GenContext)) %>% - filter(!grepl("^-$", DomArch)) %>% - filter(!grepl("^-$", Lineage)) %>% - filter(!grepl("^NA$", DomArch)) %>% - group_by(GenContext, Lineage) %>% # DomArch.norep, - summarise(count = n()) %>% # , bin=as.numeric(as.logical(n())) - arrange(desc(count)) + filter(!grepl("^-$", .data$GenContext)) %>% + filter(!grepl("^-$", .data$DomArch)) %>% + filter(!grepl("^-$", .data$Lineage)) %>% + filter(!grepl("^NA$", .data$DomArch)) %>% + group_by(.data$GenContext, .data$Lineage) %>% + summarise(count = n()) %>% + arrange(desc(.data$count)) } #' summarizeGenContext #' #' @param x A dataframe or tibble containing the data. It must have columns -#' named `GenContext`, `DomArch`, and `Lineage`. +#' named `GenContext`, `DomArch`, `Lineage`, and `count`. #' -#' @importFrom dplyr arrange desc filter group_by n n_distinct summarise +#' @importFrom dplyr arrange desc filter group_by n_distinct summarise +#' @importFrom rlang .data #' -#' @return A tibble summarizing each unique combination of `GenContext` and -#' `Lineage`, along with the following columns: -#' - `GenContext`: The genomic context for each entry. -#' - `Lineage`: The lineage associated with each entry. -#' - `count`: The total number of occurrences for each combination of -#' `GenContext` and `Lineage`. +#' @return A tibble summarizing each unique `GenContext`, along with the following columns: +#' - `totalcount`: The total count for each `GenContext`. +#' - `totalDA`: The number of distinct `DomArch` for each `GenContext`. +#' - `totallin`: The number of distinct `Lineage` for each `GenContext`. #' -#' The results are arranged in descending order of `count`. +#' The results are arranged in descending order of `totalcount`, `totalDA`, and `totallin`. #' @rdname MolEvolvR_summary #' @export #' #' @examples #' \dontrun{ -#' summarizeGenContext() +#' summarizeGenContext(data1) #' } summarizeGenContext <- function(x) { # Check if 'x' is a data frame @@ -599,15 +607,15 @@ summarizeGenContext <- function(x) { abort("Error: 'x' must be a data frame.") } x %>% - group_by(GenContext) %>% + group_by(.data$GenContext) %>% summarise( - totalcount = sum(count), - totalDA = n_distinct(DomArch), - totallin = n_distinct(Lineage) - ) %>% # totallin=n_distinct(Lineage), - arrange(desc(totalcount), desc(totalDA), desc(totallin)) %>% - filter(!grepl(" \\{n\\}", GenContext)) %>% - filter(!grepl("^-$", GenContext)) + totalcount = sum(.data$count), + totalDA = n_distinct(.data$DomArch), + totallin = n_distinct(.data$Lineage) + ) %>% + arrange(desc(.data$totalcount), desc(.data$totalDA), desc(.data$totallin)) %>% + filter(!grepl(" \\{n\\}", .data$GenContext)) %>% + filter(!grepl("^-$", .data$GenContext)) } @@ -654,8 +662,8 @@ summarizeGenContext <- function(x) { #' totalGenContextOrDomArchCounts(pspa - gc_lin_counts, 0, "GC") #' } totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col = "Lineage", - cutoff = 90, RowsCutoff = FALSE, digits = 2 - # type = "GC" + cutoff = 90, RowsCutoff = FALSE, digits = 2 + # type = "GC" ) { # Check if 'prot' is a data frame if (!is.data.frame(prot)) { @@ -668,7 +676,7 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col if (length(missing_columns) > 0) { abort(paste("Error: The following required columns are missing:", - paste(missing_columns, collapse = ", "))) + paste(missing_columns, collapse = ", "))) } # Check that cutoff is a numeric value between 0 and 100 @@ -682,7 +690,7 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col abort("Error: 'digits' must be a non-negative integer.") } - # column <- sym(column) + column <- sym(column) prot <- select(prot, {{ column }}, {{ lineage_col }}) %>% filter(!is.na({{ column }}) & !is.na({{ lineage_col }})) %>% @@ -690,10 +698,10 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col prot <- summarizeByLineage(prot, column, by = lineage_col, query = "all") col_count <- prot %>% - group_by(!!sym(column)) %>% + group_by({{ column }}) %>% summarise(totalcount = sum(count)) - total <- left_join(prot, col_count, by = column) + total <- left_join(prot, col_count, by = as_string(column)) sum_count <- sum(total$count) total <- total %>% From e946d09052349390f28ca150e4c7696edb060ad5 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Wed, 20 Nov 2024 03:11:00 +0200 Subject: [PATCH 08/23] trailing spaces in sum Signed-off-by: Awa Synthia --- R/summarize.R | 208 +++++++++++++++++++++++++------------------------- 1 file changed, 104 insertions(+), 104 deletions(-) diff --git a/R/summarize.R b/R/summarize.R index 9ec823de..c9e965bd 100644 --- a/R/summarize.R +++ b/R/summarize.R @@ -36,23 +36,23 @@ #' filterByDomains() #' } filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remove = c(), - ignore.case = FALSE) { + ignore.case = FALSE) { # Only rows with a domain in doms_keep will be kept # Any row containing a domain in doms_remove will be removed # ^word$|(?<=\+)word$|(?<=\+)word(?=\+)|word(?=\+) - + # Check if prot is a data frame if (!is.data.frame(prot)) { abort("Error: 'prot' must be a data frame.") } - + # Check if the specified column exists in the data frame if (!column %in% names(prot)) { - abort(paste("Error: The specified column '", column, "' does not exist + abort(paste("Error: The specified column '", column, "' does not exist in the data frame.", sep = "")) } - + # If doms_keep or doms_remove are not provided, inform the user if (length(doms_keep) == 0 && length(doms_remove) == 0) { warning("Warning: No domains specified to keep or remove. Returning the @@ -109,25 +109,25 @@ filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remo #' countByColumn #' @description #' Function to obtain element counts (DA, GC) -#' -#' @param prot A data frame containing the dataset to analyze, typically with +#' +#' @param prot A data frame containing the dataset to analyze, typically with #' multiple columns including the one specified by the `column` parameter. -#' @param column A character string specifying the name of the column to analyze. +#' @param column A character string specifying the name of the column to analyze. #' The default is "DomArch". -#' @param min.freq An integer specifying the minimum frequency an element must +#' @param min.freq An integer specifying the minimum frequency an element must #' have to be included in the output. Default is 1. #' #' @importFrom dplyr arrange as_tibble filter select #' #' @return A tibble with two columns: #' \describe{ -#' \item{`column`}{The unique elements from the specified column +#' \item{`column`}{The unique elements from the specified column #' (e.g., "DomArch").} -#' \item{`freq`}{The frequency of each element, i.e., the number of times +#' \item{`freq`}{The frequency of each element, i.e., the number of times #' each element appears in the specified column.} #' } -#' The tibble is filtered to only include elements that have a frequency -#' greater than or equal to `min.freq` and does not include elements with `NA` +#' The tibble is filtered to only include elements that have a frequency +#' greater than or equal to `min.freq` and does not include elements with `NA` #' values or those starting with a hyphen ("-"). #' @export #' @@ -136,20 +136,20 @@ filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remo #' countByColumn(prot = my_data, column = "DomArch", min.freq = 10) #' } countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) { - + # Check if 'prot' is a data frame if (!is.data.frame(prot)) { abort("Error: 'prot' must be a data frame.") } - + # Check if the specified column exists in the data frame if (!column %in% names(prot)) { abort(paste("Error: The specified column '", column, "' does not exist in the data frame.", sep = "")) } - + # Check if min.freq is a positive integer - if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 || + if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 || floor(min.freq) != min.freq) { abort("Error: 'min.freq' must be a positive integer.") } @@ -171,14 +171,14 @@ countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) { #' Break string ELEMENTS into WORDS for domain architecture (DA) and genomic #' context (GC) #' -#' @param prot A dataframe containing the dataset to analyze. The specified +#' @param prot A dataframe containing the dataset to analyze. The specified #' `column` contains the string elements to be processed. -#' @param column A character string specifying the name of the column to analyze. +#' @param column A character string specifying the name of the column to analyze. #' Default is "DomArch". -#' @param conversion_type A character string specifying the type of conversion. +#' @param conversion_type A character string specifying the type of conversion. #' Two options are available: #' \describe{ -#' \item{`da2doms`}{Convert domain architectures into individual domains by +#' \item{`da2doms`}{Convert domain architectures into individual domains by #' replacing `+` symbols with spaces.} #' \item{`gc2da`}{Convert genomic context into domain architectures by #' replacing directional symbols (`<-`, `->`, and `|`) with spaces.} @@ -187,13 +187,13 @@ countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) { #' @importFrom dplyr pull #' @importFrom stringr str_replace_all #' -#' @return A single string where elements are delimited by spaces. The function -#' performs necessary substitutions based on the `conversion_type` and cleans up +#' @return A single string where elements are delimited by spaces. The function +#' performs necessary substitutions based on the `conversion_type` and cleans up #' extraneous characters like newlines, tabs, and multiple spaces. #' #' @examples #' \dontrun{ -#' tibble::tibble(DomArch = c("aaa+bbb", +#' tibble::tibble(DomArch = c("aaa+bbb", #' "a+b", "b+c", "b-c")) |> elements2Words() #' } #' @@ -202,35 +202,35 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms" if (!is.data.frame(prot)) { abort("Error: 'prot' must be a data frame.") } - + # Check if the specified column exists in the data frame if (!column %in% names(prot)) { - abort(paste("Error: The specified column '", column, "' does not exist in + abort(paste("Error: The specified column '", column, "' does not exist in the data frame.", sep = "")) } - + # Check for valid conversion_type values valid_types <- c("da2doms", "doms2da") if (!conversion_type %in% valid_types) { - abort(paste("Error: Invalid 'conversion_type'. Must be one of:", - paste(valid_types, collapse = ", "))) + abort(paste("Error: Invalid 'conversion_type'. Must be one of:", + paste(valid_types, collapse = ", "))) } - + z1 <- prot %>% dplyr::pull(column) %>% str_replace_all("\\,", " ") %>% str_replace_all("\"", " ") switch(conversion_type, - da2doms = { - z2 <- z1 %>% - str_replace_all("\\+", " ") - }, - gc2da = { - z2 <- z1 %>% - str_replace_all("\\<-", " ") %>% - str_replace_all("-\\>", " ") %>% - str_replace_all("\\|", " ") - } + da2doms = { + z2 <- z1 %>% + str_replace_all("\\+", " ") + }, + gc2da = { + z2 <- z1 %>% + str_replace_all("\\<-", " ") %>% + str_replace_all("-\\>", " ") %>% + str_replace_all("\\|", " ") + } ) # str_replace_all("^c\\($", " ") %>% # remove "c(" # str_replace_all("\\)$", " ") %>% # remove ")" @@ -252,20 +252,20 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms" #' @description #' Get word counts (wc) [DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)] #' -#' @param string A character string containing the elements (words) to count. -#' This would typically be a space-delimited string representing domain +#' @param string A character string containing the elements (words) to count. +#' This would typically be a space-delimited string representing domain #' architectures or genomic contexts. #' #' @importFrom dplyr as_tibble filter arrange #' @importFrom stringr str_replace_all #' -#' @return A tibble (tbl_df) with two columns: +#' @return A tibble (tbl_df) with two columns: #' \describe{ -#' \item{`words`}{A column containing the individual words +#' \item{`words`}{A column containing the individual words #' (domains or domain architectures).} #' \item{`freq`}{A column containing the frequency counts for each word.} #' } -#' +#' #' #' @examples #' \dontrun{ @@ -279,7 +279,7 @@ words2WordCounts <- function(string) { if (!is.character(string) || length(string) != 1) { abort("Error: 'string' must be a single character vector.") } - + df_word_count <- string %>% # reduce spaces with length 2 or greater to a single space str_replace_all("\\s{2,}", " ") %>% @@ -311,14 +311,14 @@ words2WordCounts <- function(string) { #' filterByFrequency #' @description #' Function to filter based on frequencies -#' -#' @param x A tibble (tbl_df) containing at least two columns: one for +#' +#' @param x A tibble (tbl_df) containing at least two columns: one for #' elements (e.g., `words`) and one for their frequency (e.g., `freq`). -#' @param min.freq A numeric value specifying the minimum frequency threshold. -#' Only elements with frequencies greater than or equal to this value will be +#' @param min.freq A numeric value specifying the minimum frequency threshold. +#' Only elements with frequencies greater than or equal to this value will be #' retained. #' -#' @return A tibble with the same structure as `x`, but filtered to include +#' @return A tibble with the same structure as `x`, but filtered to include #' only rows where the frequency is greater than or equal to `min.freq`. #' #' @export @@ -328,18 +328,18 @@ words2WordCounts <- function(string) { #' filterByFrequency() #' } filterByFrequency <- function(x, min.freq) { - + # Check if 'x' is a data frame if (!is.data.frame(x)) { abort("Error: 'x' must be a data frame.") } - + # Check if 'min.freq' is a positive integer - if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 || + if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 || floor(min.freq) != min.freq) { abort("Error: 'min.freq' must be a positive integer.") } - + # Check if the 'freq' column exists in the data frame if (!"freq" %in% names(x)) { abort("Error: The data frame must contain a 'freq' column.") @@ -355,24 +355,24 @@ filterByFrequency <- function(x, min.freq) { #' @name MolEvolvR_summary #' @description #' A collection of summary functions for the MolEvolvR package. -#' +#' NULL #' summarizeByLineage #' #' @param prot A dataframe or tibble containing the data. -#' @param column A string representing the column to be summarized +#' @param column A string representing the column to be summarized #' (e.g., `DomArch`). Default is "DomArch". -#' @param by A string representing the grouping column (e.g., `Lineage`). +#' @param by A string representing the grouping column (e.g., `Lineage`). #' Default is "Lineage". -#' @param query A string specifying the query pattern for filtering the target +#' @param query A string specifying the query pattern for filtering the target #' column. Use "all" to skip filtering and include all rows. #' #' @importFrom dplyr arrange filter group_by summarise #' @importFrom rlang sym #' -#' @return A tibble summarizing the counts of occurrences of elements in -#' the `column`, grouped by the `by` column. The result includes the number +#' @return A tibble summarizing the counts of occurrences of elements in +#' the `column`, grouped by the `by` column. The result includes the number #' of occurrences (`count`) and is arranged in descending order of count. #' @rdname MolEvolvR_summary #' @export @@ -385,24 +385,24 @@ NULL #' } #' summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage", - query) { + query) { # Check if 'prot' is a data frame if (!is.data.frame(prot)) { abort("Error: 'prot' must be a data frame.") } - + # Check if the specified column exists in the data frame if (!column %in% names(prot)) { - abort(paste("Error: The specified column '", column, "' does not exist in + abort(paste("Error: The specified column '", column, "' does not exist in the data frame.", sep = "")) } - + # Check if the 'by' column exists in the data frame if (!by %in% names(prot)) { - abort(paste("Error: The specified 'by' column '", by, "' does not exist + abort(paste("Error: The specified 'by' column '", by, "' does not exist n the data frame.", sep = "")) } - + column <- sym(column) by <- sym(by) if (query == "all") { @@ -427,16 +427,16 @@ summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage" #' Function to summarize and retrieve counts by Domains & Domains+Lineage #' #' -#' @param x A dataframe or tibble containing the data. It must have columns +#' @param x A dataframe or tibble containing the data. It must have columns #' named `DomArch` and `Lineage`. #' #' @importFrom dplyr arrange count desc filter group_by summarise #' @importFrom rlang .data #' -#' @return A tibble summarizing the counts of unique domain architectures -#' (`DomArch`) per lineage (`Lineage`). The resulting table contains three -#' columns: `DomArch`, `Lineage`, and `count`, which indicates the frequency -#' of each domain architecture for each lineage. The results are arranged in +#' @return A tibble summarizing the counts of unique domain architectures +#' (`DomArch`) per lineage (`Lineage`). The resulting table contains three +#' columns: `DomArch`, `Lineage`, and `count`, which indicates the frequency +#' of each domain architecture for each lineage. The results are arranged in #' descending order of `count`. #' @rdname MolEvolvR_summary #' @@ -451,13 +451,13 @@ summarizeDomArch_ByLineage <- function(x) { if (!is.data.frame(x)) { abort("Error: 'x' must be a data frame.") } - + # Check if required columns exist in the data frame required_columns <- c("DomArch", "Lineage") missing_columns <- setdiff(required_columns, names(x)) - + if (length(missing_columns) > 0) { - abort(paste("Error: The following required columns are + abort(paste("Error: The following required columns are missing:", paste(missing_columns, collapse = ", "))) } x %>% @@ -473,17 +473,17 @@ summarizeDomArch_ByLineage <- function(x) { #' @description #' Function to retrieve counts of how many lineages a DomArch appears in #' -#' @param x A dataframe or tibble containing the data. It must have a column -#' named `DomArch` and a count column, such as `count`, which represents the +#' @param x A dataframe or tibble containing the data. It must have a column +#' named `DomArch` and a count column, such as `count`, which represents the #' occurrences of each architecture in various lineages. #' #' @importFrom dplyr arrange group_by filter summarise desc #' @importFrom rlang .data #' -#' @return A tibble summarizing each unique `DomArch`, along with the following +#' @return A tibble summarizing each unique `DomArch`, along with the following #' columns: #' - `totalcount`: The total occurrences of each `DomArch` across all lineages. -#' - `totallin`: The total number of unique lineages in which each `DomArch` +#' - `totallin`: The total number of unique lineages in which each `DomArch` #' appears. #' The results are arranged in descending order of `totallin` and `totalcount`. #' @rdname MolEvolvR_summary @@ -501,7 +501,7 @@ summarizeDomArch <- function(x) { x %>% group_by(.data$DomArch) %>% summarise( - totalcount = sum(.data$count), + totalcount = sum(.data$count), totallin = n() ) %>% arrange(desc(.data$totallin), desc(.data$totalcount)) %>% @@ -511,18 +511,18 @@ summarizeDomArch <- function(x) { #' summarizeGenContext_ByDomArchLineage #' -#' @param x A dataframe or tibble containing the data. It must have columns +#' @param x A dataframe or tibble containing the data. It must have columns #' named `GenContext`, `DomArch`, and `Lineage`. #' #' @importFrom dplyr arrange desc filter group_by n summarise #' @importFrom rlang .data #' -#' @return A tibble summarizing each unique combination of `GenContext`, +#' @return A tibble summarizing each unique combination of `GenContext`, #' `DomArch`, and `Lineage`, along with the following columns: #' - `GenContext`: The genomic context for each entry. #' - `DomArch`: The domain architecture for each entry. #' - `Lineage`: The lineage associated with each entry. -#' - `count`: The total number of occurrences for each combination of +#' - `count`: The total number of occurrences for each combination of #' `GenContext`, `DomArch`, and `Lineage`. #' #' The results are arranged in descending order of `count`. @@ -550,13 +550,13 @@ summarizeGenContext_ByDomArchLineage <- function(x) { #' summarizeGenContext_ByLineage #' -#' @param x A dataframe or tibble containing the data. It must have columns +#' @param x A dataframe or tibble containing the data. It must have columns #' named `GenContext`, `DomArch`, and `Lineage`. #' #' @importFrom dplyr arrange desc filter group_by n summarise #' @importFrom rlang .data #' -#' @return A tibble summarizing each unique combination of `GenContext` and `Lineage`, +#' @return A tibble summarizing each unique combination of `GenContext` and `Lineage`, #' along with the count of occurrences. The results are arranged in descending order of count. #' @rdname MolEvolvR_summary #' @export @@ -582,7 +582,7 @@ summarizeGenContext_ByLineage <- function(x) { #' summarizeGenContext #' -#' @param x A dataframe or tibble containing the data. It must have columns +#' @param x A dataframe or tibble containing the data. It must have columns #' named `GenContext`, `DomArch`, `Lineage`, and `count`. #' #' @importFrom dplyr arrange desc filter group_by n_distinct summarise @@ -631,13 +631,13 @@ summarizeGenContext <- function(x) { #' @param prot A data frame that must contain columns: #' \itemize{\item Either 'GenContext' or 'DomArch.norep' \item count} #' @param column Character. The column to summarize, default is "DomArch". -#' @param lineage_col Character. The name of the lineage column, default is +#' @param lineage_col Character. The name of the lineage column, default is #' "Lineage". -#' @param cutoff Numeric. Cutoff for total count. Counts below this cutoff value +#' @param cutoff Numeric. Cutoff for total count. Counts below this cutoff value #' will not be shown. Default is 0. -#' @param RowsCutoff Logical. If TRUE, filters based on cumulative percentage +#' @param RowsCutoff Logical. If TRUE, filters based on cumulative percentage #' cutoff. Default is FALSE. -#' @param digits Numeric. Number of decimal places for percentage columns. +#' @param digits Numeric. Number of decimal places for percentage columns. #' Default is 2. #' #' @@ -646,9 +646,9 @@ summarizeGenContext <- function(x) { #' #' @return A data frame with the following columns: #' - `{{ column }}`: Unique values from the specified column. -#' - `totalcount`: The total count of occurrences for each unique value in +#' - `totalcount`: The total count of occurrences for each unique value in #' the specified column. -#' - `IndividualCountPercent`: The percentage of each `totalcount` relative to +#' - `IndividualCountPercent`: The percentage of each `totalcount` relative to #' the overall count. #' - `CumulativePercent`: The cumulative percentage of total counts. #' @rdname MolEvolvR_summary @@ -662,34 +662,34 @@ summarizeGenContext <- function(x) { #' totalGenContextOrDomArchCounts(pspa - gc_lin_counts, 0, "GC") #' } totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col = "Lineage", - cutoff = 90, RowsCutoff = FALSE, digits = 2 - # type = "GC" + cutoff = 90, RowsCutoff = FALSE, digits = 2 + # type = "GC" ) { # Check if 'prot' is a data frame if (!is.data.frame(prot)) { abort("Error: 'prot' must be a data frame.") } - + # Check if the specified columns exist in the data frame required_columns <- c(column, lineage_col) missing_columns <- setdiff(required_columns, names(prot)) - + if (length(missing_columns) > 0) { - abort(paste("Error: The following required columns are missing:", - paste(missing_columns, collapse = ", "))) + abort(paste("Error: The following required columns are missing:", + paste(missing_columns, collapse = ", "))) } - + # Check that cutoff is a numeric value between 0 and 100 if (!is.numeric(cutoff) || length(cutoff) != 1 || cutoff < 0 || cutoff > 100) { abort("Error: 'cutoff' must be a numeric value between 0 and 100.") } - + # Check that digits is a non-negative integer - if (!is.numeric(digits) || length(digits) != 1 || digits < 0 || + if (!is.numeric(digits) || length(digits) != 1 || digits < 0 || floor(digits) != digits) { abort("Error: 'digits' must be a non-negative integer.") } - + column <- sym(column) prot <- select(prot, {{ column }}, {{ lineage_col }}) %>% @@ -853,7 +853,7 @@ findParalogs <- function(prot) { if (!is.data.frame(prot)) { abort("Error: 'prot' must be a data frame.") } - + # Remove eukaryotes prot <- prot %>% filter(!grepl("^eukaryota", Lineage)) paralogTable <- prot %>% @@ -901,4 +901,4 @@ findParalogs <- function(prot) { # cat("Word counts for broken up domains from DAs and DAs from GCs. # \nFor e.g.: # DA.doms.wc <- query.sub$DA.doms %>% -# words2WordCounts()") +# words2WordCounts()") \ No newline at end of file From 424b0e8751db2428ee58ae02fcd383f4561eb25d Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Wed, 20 Nov 2024 03:23:32 +0200 Subject: [PATCH 09/23] trailing spaces Signed-off-by: Awa Synthia --- R/networks_domarch.R | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/R/networks_domarch.R b/R/networks_domarch.R index e4da7aca..84522943 100755 --- a/R/networks_domarch.R +++ b/R/networks_domarch.R @@ -24,16 +24,16 @@ #' A network of domains is returned based on shared domain architectures. #' #' @param prot A data frame that contains the column 'DomArch'. -#' @param column Name of column containing Domain architecture from which nodes +#' @param column Name of column containing Domain architecture from which nodes #' and edges are generated. #' @param domains_of_interest Character vector specifying domains of interest. -#' @param cutoff Integer. Only use domains that occur at or above the cutoff for +#' @param cutoff Integer. Only use domains that occur at or above the cutoff for #' total counts if cutoff_type is "Total Count". -#' Only use domains that appear in cutoff or greater lineages if cutoff_type is +#' Only use domains that appear in cutoff or greater lineages if cutoff_type is #' Lineage. #' @param layout Character. Layout type to be used for the network. Options are: #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"} -#' @param query_color Character. Color to represent the queried domain in the +#' @param query_color Character. Color to represent the queried domain in the #' network. #' #' @importFrom dplyr across add_row all_of distinct filter mutate pull select @@ -231,18 +231,18 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c #' #' #' @param prot A data frame that contains the column 'DomArch'. -#' @param column Name of column containing Domain architecture from which nodes +#' @param column Name of column containing Domain architecture from which nodes #' and edges are generated. #' @param domains_of_interest Character vector specifying the domains of interest. -#' @param cutoff Integer. Only use domains that occur at or above the cutoff for +#' @param cutoff Integer. Only use domains that occur at or above the cutoff for #' total counts if cutoff_type is "Total Count". -#' Only use domains that appear in cutoff or greater lineages if cutoff_type is +#' Only use domains that appear in cutoff or greater lineages if cutoff_type is #' Lineage. #' @param layout Character. Layout type to be used for the network. Options are: #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"} -#' @param query_color Color that the nodes of the domains in the +#' @param query_color Color that the nodes of the domains in the #' domains_of_interest vector are colored -#' @param partner_color Color that the nodes that are not part of the +#' @param partner_color Color that the nodes that are not part of the #' domains_of_interest vector are colored #' @param border_color Color for the borders of the nodes. #' @param IsDirected Is the network directed? Set to false to eliminate arrows @@ -364,4 +364,4 @@ createBinaryDomainNetwork <- function(prot, column = "DomArch", domains_of_inter "auto" = visIgraphLayout(vg, "layout.auto") ) vg -} +} \ No newline at end of file From 9ce780b652fe43bfa8098b35afab05a8f494b4e1 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Wed, 20 Nov 2024 03:35:03 +0200 Subject: [PATCH 10/23] trailing spaces Signed-off-by: Awa Synthia --- R/pre-msa-tree.R | 66 ++++++++++++++++++++++++------------------------ R/summarize.R | 6 ++--- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/R/pre-msa-tree.R b/R/pre-msa-tree.R index e15e7996..6e2e8fd7 100644 --- a/R/pre-msa-tree.R +++ b/R/pre-msa-tree.R @@ -46,7 +46,7 @@ api_key <- Sys.getenv("ENTREZ_API_KEY", unset = "YOUR_KEY_HERE") #' @param y Delimitter. Default is space (" "). #' #' @importFrom rlang abort -#' +#' #' @return A character vector in title case. #' @export #' @@ -112,21 +112,21 @@ addLeaves2Alignment <- function(aln_file = "", lin_file = "data/rawdata_tsv/all_semiclean.txt", # !! finally change to all_clean.txt!! # lin_file="data/rawdata_tsv/PspA.txt", reduced = FALSE) { - + #Check if the alignment file is provided and exists if (nchar(aln_file) == 0) { abort("Error: Alignment file path must be provided.") } - + if (!file.exists(aln_file)) { abort(paste("Error: The alignment file '", aln_file, "' does not exist.")) } - + # Check if the lineage file exists if (!file.exists(lin_file)) { abort(paste("Error: The lineage file '", lin_file, "' does not exist.")) } - + # Check that the 'reduced' parameter is logical if (!is.logical(reduced) || length(reduced) != 1) { abort("Error: 'reduced' must be a single logical value (TRUE or FALSE).") @@ -249,15 +249,15 @@ addName <- function(data, if (!is.data.frame(data)) { abort("Error: The input 'data' must be a data frame") } - + # Check that the specified columns exist in the data required_cols <- c(accnum_col, spec_col, lin_col) missing_cols <- setdiff(required_cols, names(data)) if (length(missing_cols) > 0) { - abort(paste("Error: The following columns are missing from the data:", + abort(paste("Error: The following columns are missing from the data:", paste(missing_cols, collapse = ", "))) } - + cols <- c(accnum_col, "Kingdom", "Phylum", "Genus", "Spp") split_data <- data %>% separate( @@ -347,16 +347,16 @@ convertAlignment2FA <- function(aln_file = "", if (nchar(aln_file) == 0) { abort("Error: Alignment file path must be provided.") } - + if (!file.exists(aln_file)) { abort(paste("Error: The alignment file '", aln_file, "' does not exist.")) } - + # Check if the lineage file exists if (!file.exists(lin_file)) { abort(paste("Error: The lineage file '", lin_file, "' does not exist.")) } - + # Check that the 'reduced' parameter is logical if (!is.logical(reduced) || length(reduced) != 1) { abort("Error: 'reduced' must be a single logical value (TRUE or FALSE).") @@ -424,14 +424,14 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") { if (!is.data.frame(acc2name)) { abort("Error: acc2name must be a data frame.") } - + # Check if the specified columns exist in the data frame if (!(acc_col %in% colnames(acc2name))) { - abort("Error: The specified acc_col '", acc_col, "' does not exist in + abort("Error: The specified acc_col '", acc_col, "' does not exist in acc2name.") } if (!(name_col %in% colnames(acc2name))) { - abort("Error: The specified name_col '", name_col, "' does not exist in + abort("Error: The specified name_col '", name_col, "' does not exist in acc2name.") } @@ -475,7 +475,7 @@ rename_fasta <- function(fa_path, outpath, abort("Error: The input FASTA file does not exist at the specified path: ", fa_path) } - + # Check if the output path is writable outdir <- dirname(outpath) if (!dir.exists(outdir)) { @@ -541,20 +541,20 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"), reduced = F) { # Check if the alignment path exists if (!dir.exists(aln_path)) { - abort("Error: The alignment directory does not exist at the specified + abort("Error: The alignment directory does not exist at the specified path: ", aln_path) } - + # Check if the output path exists; if not, attempt to create it if (!dir.exists(fa_outpath)) { dir.create(fa_outpath, recursive = TRUE) - message("Note: The output directory did not exist and has been created: ", + message("Note: The output directory did not exist and has been created: ", fa_outpath) } - + # Check if the linear file exists if (!file.exists(lin_file)) { - abort("Error: The linear file does not exist at the specified path: ", + abort("Error: The linear file does not exist at the specified path: ", lin_file) } # library(here) @@ -626,7 +626,7 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { if (!is.character(accessions) || length(accessions) == 0) { abort("Error: 'accessions' must be a non-empty character vector.") } - + if (!dir.exists(dirname(outpath))) { abort("Error: The output directory does not exist: ", dirname(outpath)) } @@ -676,7 +676,7 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { id = accessions_partitioned[[x]], db = "protein", rettype = "fasta", - #api_key = Sys.getenv("ENTREZ_API_KEY") + # api_key = Sys.getenv("ENTREZ_API_KEY") ) ) }) @@ -732,21 +732,21 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { createRepresentativeAccNum <- function(prot_data, reduced = "Lineage", accnum_col = "AccNum") { - + # Validate input if (!is.data.frame(prot_data)) { abort("Error: 'prot_data' must be a data frame.") } - + # Check if the reduced column exists in prot_data if (!(reduced %in% colnames(prot_data))) { - abort("Error: The specified reduced column '", reduced, "' does not + abort("Error: The specified reduced column '", reduced, "' does not exist in the data frame.") } - + # Check if the accnum_col exists in prot_data if (!(accnum_col %in% colnames(prot_data))) { - abort("Error: The specified accession number column '", accnum_col, "' + abort("Error: The specified accession number column '", accnum_col, "' does not exist in the data frame.") } # Get Unique reduced column and then bind the AccNums back to get one AccNum per reduced column @@ -808,7 +808,7 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { if (!file.exists(fasta_file)) { abort("Error: The FASTA file does not exist: ", fasta_file) } - + # if (file_ext(fasta_file) != "fasta" && file_ext(fasta_file) != "fa") { # abort("Error: The specified file is not a valid FASTA file: ", fasta_file) # } @@ -857,23 +857,23 @@ writeMSA_AA2FA <- function(alignment, outpath) { if (!inherits(alignment, "AAMultipleAlignment")) { abort("Error: The alignment must be of type 'AAMultipleAlignment'.") } - + # Check the output path is a character string if (!is.character(outpath) || nchar(outpath) == 0) { abort("Error: Invalid output path specified.") } - + # Check if the output directory exists outdir <- dirname(outpath) if (!dir.exists(outdir)) { abort("Error: The output directory does not exist: ", outdir) } - l <- length(names(unmasked(alignment))) + l <- length(names(unmasked((alignment)))) fasta <- "" for (i in 1:l) { - fasta <- paste0(fasta, paste(">", names(unmasked(alignment)[i])), "\n") + fasta <- paste0(fasta, paste(">", names(unmasked((alignment)))[i]), "\n") seq <- toString(unmasked(alignment)[[i]]) fasta <- paste0(fasta, seq, "\n") } @@ -955,4 +955,4 @@ getAccNumFromFA <- function(fasta_file) { # cfile <- read_delim("data/alignments/pspc.gismo.aln", delim=" ") # cfile <- as.data.frame(map(cfile,function(x) gsub("\\s+", "",x))) # colnames(cfile) <- c("AccNum", "Alignment") -# } +# } \ No newline at end of file diff --git a/R/summarize.R b/R/summarize.R index c9e965bd..b1fef89d 100644 --- a/R/summarize.R +++ b/R/summarize.R @@ -690,7 +690,7 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col abort("Error: 'digits' must be a non-negative integer.") } - column <- sym(column) + # column <- sym(column) prot <- select(prot, {{ column }}, {{ lineage_col }}) %>% filter(!is.na({{ column }}) & !is.na({{ lineage_col }})) %>% @@ -698,10 +698,10 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col prot <- summarizeByLineage(prot, column, by = lineage_col, query = "all") col_count <- prot %>% - group_by({{ column }}) %>% + group_by(!!sym(column)) %>% summarise(totalcount = sum(count)) - total <- left_join(prot, col_count, by = as_string(column)) + total <- left_join(prot, col_count, by = column) sum_count <- sum(total$count) total <- total %>% From 397c02c028cd429bcf157afbea06c560f12db633 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Wed, 20 Nov 2024 04:29:41 +0200 Subject: [PATCH 11/23] update paths and address suggestions Signed-off-by: Awa Synthia --- R/ipr2viz.R | 2 +- inst/report/scripts/MolEvolData_class.R | 31 -------------------- inst/report/scripts/generate_report.R | 6 ++-- inst/report/scripts/run_molevolvr_pipeline.R | 30 ++++++++++--------- 4 files changed, 20 insertions(+), 49 deletions(-) diff --git a/R/ipr2viz.R b/R/ipr2viz.R index a4769ed6..e5fed49f 100644 --- a/R/ipr2viz.R +++ b/R/ipr2viz.R @@ -212,7 +212,7 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), analysis_labeler <- analyses %>% pivot_wider(names_from = .data$Analysis, values_from = .data$Analysis) - lookup_tbl_path <- "~/awasyn/new_trial/cln_lookup_tbl.tsv" + system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE) lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = MolEvolvR::lookup_table_cols) lookup_tbl <- lookup_tbl %>% select(-.data$ShortName) # Already has ShortName -- Just needs SignDesc diff --git a/inst/report/scripts/MolEvolData_class.R b/inst/report/scripts/MolEvolData_class.R index 791f1d6a..f4998ee9 100644 --- a/inst/report/scripts/MolEvolData_class.R +++ b/inst/report/scripts/MolEvolData_class.R @@ -41,37 +41,6 @@ setClass("seqUpload", ) ) -# Group by lineage + DA then take top 20 -top_acc <- function(cln_file, DA_col = "DomArch.Pfam", - lin_col = "Lineage", n = 20) { - lin_sym <- sym(lin_col) - DA_sym <- sym(DA_col) - - cln <- fread(cln_file, sep = "\t", fill = T) - - grouped <- cln %>% - group_by({{ lin_sym }}, {{ DA_sym }}) %>% - summarise(count = n()) %>% - arrange(-count) %>% - filter(!is.na({{ lin_sym }}) & !is.na({{ DA_sym }})) - - top_acc <- character(n) - for (r in 1:min(nrow(grouped), n)) - { - l <- (grouped %>% pull({{ lin_sym }}))[r] - DA <- (grouped %>% pull({{ DA_sym }}))[r] - - filt <- cln %>% filter({{ lin_sym }} == l & {{ DA_sym }} == DA) - - top <- filt[which(filt$PcPositive == max(filt$PcPositive))[1], ] - - top_acc[r] <- top$AccNum - } - top_acc <- top_acc[which(top_acc != "")] - return(top_acc) -} - - combineFilesNopmap <- function(inpath, pattern, outpath, delim = "\t", skip = 0, col_names) { diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R index 4ad5b076..e7ab37b3 100644 --- a/inst/report/scripts/generate_report.R +++ b/inst/report/scripts/generate_report.R @@ -303,7 +303,7 @@ runAnalysis <- function( # After uploading the sequence data, you would check the uploaded data if (sequence_upload_data@seqs == "") { - stop("Error: Please upload a protein sequence") + stop("Error: Please upload a protein sequence.") } OUT_PATH <- getwd() unavailable_pins <- list.files(OUT_PATH) @@ -339,7 +339,7 @@ runAnalysis <- function( validateFasta(tmp_file) }, error = function(e) { - warning("Error: Failed to run input FASTA verification") + warning("Error: Failed to run input FASTA verification.") return(FALSE) # Return FALSE if an error occurs }, finally = { @@ -526,7 +526,7 @@ runAnalysis <- function( } if (ipr_upload_data()@seqs == "" && !ipr_ncbi_check) { stop("Error: Please provide a file containing sequences or check the - box to use fetch sequences for NCBI accession numbers.") + box to fetch sequences using NCBI accession numbers.") } ipr <- read_tsv(ipr_upload_data()@df, col_names = FALSE) diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R index ea52a3b5..7cf1ebe6 100644 --- a/inst/report/scripts/run_molevolvr_pipeline.R +++ b/inst/report/scripts/run_molevolvr_pipeline.R @@ -187,7 +187,7 @@ runMolevolvrPipeline <- function(input_paths, db, nhits, eval, # setwd(OUTDIR) # Run DELTABLAST - runDeltablast(input_paths, PREFIX, OUTDIR, db, nhits, eval) + runDELTABLAST(input_paths, PREFIX, OUTDIR, db, nhits, eval) # Run ACC2FA convertAccNum2Fasta(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")), @@ -207,7 +207,7 @@ runMolevolvrPipeline <- function(input_paths, db, nhits, eval, # Sys.sleep(30) # Run BLASTCLUST - runBlastclust(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")), + runCDHIT(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")), PREFIX, OUTDIR ) # Convert clusters to table @@ -541,7 +541,7 @@ replaceAccNums <- function(path_acc2info, } -runDeltablast <- function(infile, prefix, outdir, +runDELTABLAST <- function(infile, prefix, outdir, db = "refseq_protein", nhits = 5000, evalue = 1e-5, threads = 10) { @@ -700,9 +700,9 @@ cleanupBlast <- function(infile_blast, acc2info, prefix, wblast = F) { # TaxID to lineage mapping cleanedup_blast$TaxID <- as.integer(cleanedup_blast$TaxID) - lineage_map <- fread("~/awasyn/new_trial/lineage_lookup.txt", - header = TRUE, fill = TRUE, - colClasses = lineage_map_cols) + lineage_map <- fread( + system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE), + header = TRUE, fill = TRUE, colClasses = lineage_map_cols) # Merge with lineage map and clean up columns mergedLins <- merge(cleanedup_blast, lineage_map, by = "TaxID", @@ -722,7 +722,7 @@ cleanupBlast <- function(infile_blast, acc2info, prefix, wblast = F) { # Function to run BLASTCLUST on given input -runBlastclust <- function(infile, suffix, outdir) { +runCDHIT <- function(infile, suffix, outdir) { # Prepare output file path outfile <- file.path(outdir, paste0(suffix, ".bclust.L60S80.tsv")) @@ -734,13 +734,13 @@ runBlastclust <- function(infile, suffix, outdir) { cat("## Now running BLASTCLUST on file(s):", infile, "\n") cat("#####################################\n") - # Run BLASTCLUST + # Cluster sequences w/ BLASTCLUST/CD-HIT # blastclust_cmd <- paste("blastclust -i", infile, "-o", outfile, "-p T -L .6 -b T -S 80 -a 8") cdhit_command <- sprintf( "cd-hit -i %s -o %s -c 0.8 -aS 0.6 -T 8", infile, outfile ) - clean_command <- sprintf( + clean_cdhit_format <- sprintf( "awk '/^>Cluster/ {if(NR>1)printf \"\\n\"; next} /WP_/ {start=index($0, \"WP_\"); if(start) {end=index(substr($0, start), \"...\"); if (end == 0) end=length($0); printf \"%%s \", substr($0, start, end-1)}}' %s > %s", input_file, outfile @@ -750,7 +750,7 @@ runBlastclust <- function(infile, suffix, outdir) { # system(blastclust_cmd) system(cdhit_command) - system(clean_command) + system(clean_cdhit_format) } @@ -881,8 +881,9 @@ ipr2Linear <- function(ipr, acc2info, prefix) { ipr_tax <- left_join(ipr_in, acc2info_out, by = "AccNum") # read in lineage map - lineage_map <- fread("~/awasyn/new_trial/lineage_lookup.txt", - header = T, fill = T) + lineage_map <- fread( + system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE), + header = TRUE, fill = TRUE) # merge ipr+info w/ lineage # both tables have a species column, but only @@ -895,8 +896,9 @@ ipr2Linear <- function(ipr, acc2info, prefix) { select(-Species.x, -Species.y) # add lookup table to iprscan file - lookup_tbl <- fread(input = "~/awasyn/new_trial/cln_lookup_tbl.tsv", - sep = "\t", header = T, fill = T) %>% + lookup_tbl <- fread( + system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE), + sep = "\t", header = TRUE, fill = TRUE) %>% distinct() if ("AccNum.x" %in% names(ipr_lin)) { ipr_lin <- ipr_lin %>% From 16e957159ff950149a2e6ae3621c15ebd67ec0cc Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Wed, 20 Nov 2024 05:26:55 +0200 Subject: [PATCH 12/23] add generate case study function Signed-off-by: Awa Synthia --- .gitignore | 3 +- inst/report/report_template.Rmd | 10 +++++- inst/report/scripts/generate_report.R | 34 ++++++++++++++++++-- inst/report/scripts/run_molevolvr_pipeline.R | 7 +++- 4 files changed, 49 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index ef11006e..74dbc757 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .Rproj.user docs +common_data .Rhistory -.DS_Store \ No newline at end of file +.DS_Store diff --git a/inst/report/report_template.Rmd b/inst/report/report_template.Rmd index dddf357c..195e7887 100644 --- a/inst/report/report_template.Rmd +++ b/inst/report/report_template.Rmd @@ -472,7 +472,15 @@ if(length(params$repAccNums) >= 3 ) { # Call MSA2PDF msa_pdf_path <- tempfile() - msa_prefix <- "~/awasyn/MolEvolvRsHzFyk_full/msa_figs/" + msa_parent_dir <- dirname(params$app_data@fasta_path) + msa_subdir_path <- file.path(parent_dir, "msa_figs") + if (!dir.exists(subdir_path)) { + dir.create(subdir_path, recursive = TRUE) + message("Subdirectory 'msa_figs' created at: ", subdir_path) + } else { + message("Subdirectory 'msa_figs' already exists at: ", subdir_path) + } + msa_prefix <- paste0(msa_subdir_path, "/") post_fix <- paste("msa", params$query_pin, params$PhyloSelect, params$msa_reduce_by, ".pdf", sep = "_") msa_pdf_path <- paste0(msa_prefix, post_fix) diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R index e7ab37b3..6f7fcaef 100644 --- a/inst/report/scripts/generate_report.R +++ b/inst/report/scripts/generate_report.R @@ -1,6 +1,36 @@ # Author(s): Awa Synthia # Last modified: 2024 +getCaseStudyReport <- function(pathogen = NULL, drug = NULL, ...) { + cat("\n") + cat("\033[1;36m") # Cyan for the "MolEvolvR" header + cat("=========================================\n") + cat(" M o l E v o l R\n") + cat("=========================================\n") + cat("\033[0m") + + cat("\033[1;32m") + cat("\n>>>>> Case Study Report Generated <<<<<\n") + cat("\033[0m") + + # Step 1: Fetch the FASTA sequences of the given pathogen and drug + getCardData(pathogen, drug) + + # Check if the FASTA file exists after fetching + fasta_file <- "filtered_proteins.fasta" + if (!file.exists(fasta_file)) { + stop("Failed to retrieve FASTA sequences. The file + 'filtered_proteins.fasta' does not exist.") + } + # Step 2: Once the FASTA file is ready, call runAnalysis to run pipeline + message("FASTA file downloaded, running analysis...") + + # Pass the file to runAnalysis function + runAnalysis(file_paths = list(fasta = fasta_file)) + + message("Case study report completed") +} + # get fasta of pathogen and/or drug getCardData <- function(pathogen = NULL, drug = NULL) { destination_dir <- "CARD_data" @@ -84,8 +114,8 @@ runAnalysis <- function( acc_homology_analysis = TRUE, acc_da_analysis = TRUE, acc_phylogeny_analysis = FALSE, - report_template_path = "/report/report_template.Rmd", - output_file = file.path(tempdir(), "report.html"), + report_template_path = system.file("report", "report_template.Rmd", package = "MolEvolvR", mustWork = TRUE), + output_file = file.path(getwd(), "report.html"), DASelect = "All", mainSelect = NULL, PhyloSelect = "All", diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R index 7cf1ebe6..c9c0deeb 100644 --- a/inst/report/scripts/run_molevolvr_pipeline.R +++ b/inst/report/scripts/run_molevolvr_pipeline.R @@ -846,8 +846,13 @@ runIPRScan <- function(query_file, prefix, outdir) { # Run InterProScan command # Construct the command + + # get the path to the interproscan.sh script from the environment + # variable INTERPROSCAN_CMD, or assume it's on the path if unspecified + iprscan_cmd <- Sys.getenv("INTERPROSCAN_CMD", unset="interproscan.sh") + command <- paste( - "~/iprdir/interproscan-5.70-102.0/interproscan.sh -i", + iprscan_cmd, "-i", shQuote(query_file), "-b", shQuote(outfile), "-f TSV --cpu", Sys.getenv("INTERPROSCAN_CPUS", "4"), From 83ccfb751ad8f71663d637c6d3e82576feea09d6 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Wed, 20 Nov 2024 05:54:31 +0200 Subject: [PATCH 13/23] update generate function Signed-off-by: Awa Synthia --- inst/report/scripts/generate_report.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R index 6f7fcaef..06128453 100644 --- a/inst/report/scripts/generate_report.R +++ b/inst/report/scripts/generate_report.R @@ -10,7 +10,7 @@ getCaseStudyReport <- function(pathogen = NULL, drug = NULL, ...) { cat("\033[0m") cat("\033[1;32m") - cat("\n>>>>> Case Study Report Generated <<<<<\n") + cat("\n>>>>> Case Study Report Generation started... <<<<<\n") cat("\033[0m") # Step 1: Fetch the FASTA sequences of the given pathogen and drug @@ -26,7 +26,7 @@ getCaseStudyReport <- function(pathogen = NULL, drug = NULL, ...) { message("FASTA file downloaded, running analysis...") # Pass the file to runAnalysis function - runAnalysis(file_paths = list(fasta = fasta_file)) + runAnalysis(file_paths = list(fasta = fasta_file), ...) message("Case study report completed") } From 4bf5b00e32802c1a42997d510789a590af0ba174 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Wed, 20 Nov 2024 19:14:00 +0200 Subject: [PATCH 14/23] add getCaseStudyReport function Signed-off-by: Awa Synthia --- R/ipr2viz.R | 2 +- inst/report/scripts/generate_report.R | 18 +++++--- inst/report/scripts/run_molevolvr_pipeline.R | 43 ++++++++++++++++---- inst/report/scripts/viz_utils.R | 34 +++++++++------- 4 files changed, 69 insertions(+), 28 deletions(-) diff --git a/R/ipr2viz.R b/R/ipr2viz.R index e5fed49f..fb145c67 100644 --- a/R/ipr2viz.R +++ b/R/ipr2viz.R @@ -212,7 +212,7 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), analysis_labeler <- analyses %>% pivot_wider(names_from = .data$Analysis, values_from = .data$Analysis) - system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE) + lookup_tbl_path <- system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE) lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = MolEvolvR::lookup_table_cols) lookup_tbl <- lookup_tbl %>% select(-.data$ShortName) # Already has ShortName -- Just needs SignDesc diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R index 06128453..d1dd178b 100644 --- a/inst/report/scripts/generate_report.R +++ b/inst/report/scripts/generate_report.R @@ -1,6 +1,12 @@ # Author(s): Awa Synthia # Last modified: 2024 +# load functions to use +source("MolEvolData_class.R") +source("run_molevolvr_pipeline.R") +source("viz_utils.R") + +# example usage: getCaseStudyReport("Acinetobacter baumannii", "Beta-lactams") getCaseStudyReport <- function(pathogen = NULL, drug = NULL, ...) { cat("\n") cat("\033[1;36m") # Cyan for the "MolEvolvR" header @@ -616,7 +622,7 @@ runAnalysis <- function( domarch_cols_value <- getDomArchCols(app_data, DASelect) - query_domarch_cols_value <- getDomArchCols(query_data@df) + query_domarch_cols_value <- getQueryDomArchCols(query_data@df) mainTable_value <- getDataTable(data) @@ -630,8 +636,8 @@ runAnalysis <- function( rs_IprGenes_value <- getIPRGenesVisualization(data, app_data, - input_rs_iprDatabases, - input_rs_iprVisType) + query_iprDatabases, + query_iprVisType) rs_network_layout_value <- getRSNetworkLayout(data, app_data, @@ -641,9 +647,9 @@ runAnalysis <- function( rs_data_table_value <- getDataTable(data) da_IprGenes_value <- getDomArchIPRGenesPlot(app_data, - da_iprDatabases, - da_iprVisType, - DASelect) + query_iprDatabases, + query_iprVisType, + DASelect) query_heatmap_value <- getQueryHeatmap(query_data@df, heatmap_select = "All", diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R index c9c0deeb..94324599 100644 --- a/inst/report/scripts/run_molevolvr_pipeline.R +++ b/inst/report/scripts/run_molevolvr_pipeline.R @@ -740,18 +740,47 @@ runCDHIT <- function(infile, suffix, outdir) { "cd-hit -i %s -o %s -c 0.8 -aS 0.6 -T 8", infile, outfile ) - clean_cdhit_format <- sprintf( - "awk '/^>Cluster/ {if(NR>1)printf \"\\n\"; next} /WP_/ {start=index($0, \"WP_\"); if(start) {end=index(substr($0, start), \"...\"); if (end == 0) end=length($0); printf \"%%s \", substr($0, start, end-1)}}' %s > %s", - input_file, - outfile - ) + # clean_cdhit_format <- sprintf( + # "awk '/^>Cluster/ {if(NR>1)printf \"\\n\"; next} /WP_/ {start=index($0, \"WP_\"); if(start) {end=index(substr($0, start), \"...\"); if (end == 0) end=length($0); printf \"%%s \", substr($0, start, end-1)}}' %s > %s", + # input_file, + # outfile + # ) cat("\nPerforming BLASTCLUST analysis on", infile, "\n") # system(blastclust_cmd) system(cdhit_command) - system(clean_cdhit_format) + # system(clean_cdhit_format) + cleanCDHIT(input_file, outfile) + cat("Cleaning CDHIT results completed") +} +# extract_sequences("input_file.tsv.clstr", "output_file.tsv") +cleanCDHIT <- function(input_file, output_file) { + # Read the input file line by line + lines <- readLines(input_file) + + # Initialize an empty vector to store extracted identifiers + extracted <- c() + + # Loop through lines and extract content between ">" and "..." + for (line in lines) { + # Only process lines containing valid sequences (skip cluster headers) + if (grepl(">", line) && grepl("\\.\\.\\.", line)) { + # Extract the sequence identifier between ">" and "..." + match <- regmatches(line, regexpr(">(.*?)\\.\\.\\.", line)) + if (length(match) > 0) { + sequence <- gsub("^>", "", match) # Remove the leading ">" + sequence <- gsub("\\.\\.\\.$", "", sequence) # Remove trailing dots + extracted <- c(extracted, sequence) + } + } + } + + # Write the extracted sequences to the output file + writeLines(extracted, output_file) + + cat("Sequences extracted and written to:", output_file, "\n") } # Function to format blastclust output @@ -902,7 +931,7 @@ ipr2Linear <- function(ipr, acc2info, prefix) { # add lookup table to iprscan file lookup_tbl <- fread( - system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE), + system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE), sep = "\t", header = TRUE, fill = TRUE) %>% distinct() if ("AccNum.x" %in% names(ipr_lin)) { diff --git a/inst/report/scripts/viz_utils.R b/inst/report/scripts/viz_utils.R index 573e0739..aa62e126 100644 --- a/inst/report/scripts/viz_utils.R +++ b/inst/report/scripts/viz_utils.R @@ -10,8 +10,8 @@ library(plotly) # Function to generate the InterProScan Visualization getIPRGenesVisualization <- function(data, app_data, - input_rs_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"), - input_rs_iprVisType = "Analysis") { + query_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"), + query_iprVisType = "Analysis") { # Check if analysis is loaded if (nrow(data@df) == 0 || app_data@ipr_path == "") { @@ -33,8 +33,8 @@ getIPRGenesVisualization <- function(data, app_data, ipr_plot <- plotIPR2VizWeb( infile_ipr = data@ipr_path, accessions = data@df$Name, - analysis = input_rs_iprDatabases, - group_by = input_rs_iprVisType, + analysis = query_iprDatabases, + group_by = query_iprVisType, name = n ) @@ -45,8 +45,8 @@ getIPRGenesVisualization <- function(data, app_data, infile_ipr = data@ipr_path, infile_full = data@df, accessions = unique(data@df$Name), - analysis = input_rs_iprDatabases, - group_by = input_rs_iprVisType, + analysis = query_iprDatabases, + group_by = query_iprVisType, topn = 20, # This value is hardcoded in the original code query = "All" ) @@ -269,7 +269,13 @@ getMSAData <- function(msa_path) { if (is.null(msa_path) || msa_path == "") { stop("Error: MSA path is not provided.") } - return(read_file(msa_path)) + # Attempt to read the file and handle potential errors + if (file.exists(msa_path)) { + return(read_file(msa_path)) + } else { + warning(sprintf("Warning: Unable to read the file at path '%s'. Ignoring...", msa_path)) + return(NULL) # Return NULL if the file cannot be read + } } # Function to generate a heatmap @@ -307,7 +313,7 @@ getQueryHeatmap <- function(query_data_df, } # Function to retrieve domain architecture columns -getDomArchCols <- function(query_data_df) { +getQueryDomArchCols <- function(query_data_df) { # Check if query data exists if (nrow(query_data_df) == 0) { stop("No query data available.") @@ -712,8 +718,8 @@ getDomArchCols <- function(app_data, DASelect) { } # Function to generate the IPR genes plot -getDomArchIPRGenesPlot <- function(app_data, da_iprDatabases, - da_iprVisType, DASelect) { +getDomArchIPRGenesPlot <- function(app_data, query_iprDatabases, + query_iprVisType, DASelect) { if (app_data@ipr_path == "") { stop("IPR path is not set.") @@ -739,8 +745,8 @@ getDomArchIPRGenesPlot <- function(app_data, da_iprDatabases, plot <- plotIPR2VizWeb( infile_ipr = app_data@ipr_path, accessions = df$Name, - analysis = da_iprDatabases, - group_by = da_iprVisType, + analysis = query_iprDatabases, + group_by = query_iprVisType, name = name_column ) } else { @@ -749,8 +755,8 @@ getDomArchIPRGenesPlot <- function(app_data, da_iprDatabases, infile_ipr = app_data@ipr_path, infile_full = df, accessions = unique(df$Name), - analysis = da_iprDatabases, - group_by = da_iprVisType, + analysis = query_iprDatabases, + group_by = query_iprVisType, topn = 20, query = DASelect ) From e4af641bf998987e23a27b97e046cdf19fdaf91f Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Thu, 21 Nov 2024 00:58:02 +0200 Subject: [PATCH 15/23] move functions to R/ Signed-off-by: Awa Synthia --- NAMESPACE | 1 + {inst/report/scripts => R}/MolEvolData_class.R | 8 ++++---- {inst/report/scripts => R}/generate_report.R | 6 ++---- .../run_molevolvr_pipeline.R => R/run_pipeline.R | 4 ++-- {inst/report/scripts => R}/viz_utils.R | 2 +- inst/report/report_template.Rmd | 10 +++++----- 6 files changed, 15 insertions(+), 16 deletions(-) rename {inst/report/scripts => R}/MolEvolData_class.R (99%) rename {inst/report/scripts => R}/generate_report.R (99%) rename inst/report/scripts/run_molevolvr_pipeline.R => R/run_pipeline.R (99%) rename {inst/report/scripts => R}/viz_utils.R (99%) diff --git a/NAMESPACE b/NAMESPACE index 6ae464f6..1a2f3a99 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -49,6 +49,7 @@ export(findParalogs) export(formatJobArgumentsHTML) export(generateAllAlignments2FA) export(getAccNumFromFA) +export(getCaseStudyReport) export(getProcessRuntimeWeights) export(getTopAccByLinDomArch) export(mapAcc2Name) diff --git a/inst/report/scripts/MolEvolData_class.R b/R/MolEvolData_class.R similarity index 99% rename from inst/report/scripts/MolEvolData_class.R rename to R/MolEvolData_class.R index f4998ee9..bfdd846c 100644 --- a/inst/report/scripts/MolEvolData_class.R +++ b/R/MolEvolData_class.R @@ -415,7 +415,7 @@ validateAccNumFasta <- function(text) { } -#' Test whether a single accession returns a valid protein from Entrez +# Test whether a single accession returns a valid protein from Entrez isAccNumValidEntrez <- function(accnum, verbose = FALSE) { # empty accnum wil not raise an error from efetch, so test for this first if (nchar(accnum) <= 0) {if (verbose) {warning("empty accnum")}; return(FALSE)} @@ -435,7 +435,7 @@ isAccNumValidEntrez <- function(accnum, verbose = FALSE) { return(result) } -#' Test whether a single accession returns a valid protein from EBI +# Test whether a single accession returns a valid protein from EBI isAccNumValidEbi <- function(accnum, verbose = FALSE) { # validation: ensure there's some text to parse if (nchar(accnum) <= 0) {if (verbose) {warning("empty accnum")}; return(FALSE)} @@ -460,7 +460,7 @@ isAccNumValidEbi <- function(accnum, verbose = FALSE) { return(result) } -#' Perform a series of API reqs using NCBI entrez to validate accession numbers +# Perform a series of API reqs using NCBI entrez to validate accession numbers performEntrezReqs <- function(accnums, verbose = FALSE, track_progress = FALSE) { # API guidelines docs # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html @@ -501,7 +501,7 @@ performEbiReqs <- function(accnums, verbose = FALSE, track_progress = FALSE) { return(results) } -#' Validate accession numbers from MolEvolvR user input +# Validate accession numbers from MolEvolvR user input validateAccNum <- function(text, verbose = FALSE, track_progress = FALSE, n_steps = integer()) { # API guidelines docs # entrez https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Usage_Guidelines_and_Requiremen diff --git a/inst/report/scripts/generate_report.R b/R/generate_report.R similarity index 99% rename from inst/report/scripts/generate_report.R rename to R/generate_report.R index d1dd178b..98377448 100644 --- a/inst/report/scripts/generate_report.R +++ b/R/generate_report.R @@ -2,10 +2,8 @@ # Last modified: 2024 # load functions to use -source("MolEvolData_class.R") -source("run_molevolvr_pipeline.R") -source("viz_utils.R") +#' @export # example usage: getCaseStudyReport("Acinetobacter baumannii", "Beta-lactams") getCaseStudyReport <- function(pathogen = NULL, drug = NULL, ...) { cat("\n") @@ -687,7 +685,7 @@ runAnalysis <- function( ####### Report Generation ######## tryCatch({ - tempReport <- file.path(tempdir(), "report.Rmd") + tempReport <- file.path(dir, "report.Rmd") file.copy(report_template_path, tempReport, overwrite = TRUE) # List of graphics to include in report diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/R/run_pipeline.R similarity index 99% rename from inst/report/scripts/run_molevolvr_pipeline.R rename to R/run_pipeline.R index 94324599..ffcc4f52 100644 --- a/inst/report/scripts/run_molevolvr_pipeline.R +++ b/R/run_pipeline.R @@ -215,7 +215,7 @@ runMolevolvrPipeline <- function(input_paths, db, nhits, eval, file.path(OUTDIR, paste0(PREFIX, ".blast.cln.tsv"))) # Run INTERPROSCAN - runIPRScan(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")), + runIPRScan2(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")), PREFIX, OUTDIR) new_header <- c("AccNum", "SeqMD5Digest", "SLength", "Analysis", "DB.ID", "SignDesc", "StartLoc", "StopLoc", "Score", @@ -859,7 +859,7 @@ clust2Table <- function(clust, blast) { } # Function to run InterProScan -runIPRScan <- function(query_file, prefix, outdir) { +runIPRScan2 <- function(query_file, prefix, outdir) { # Start InterProScan run cat("\n######################\n") diff --git a/inst/report/scripts/viz_utils.R b/R/viz_utils.R similarity index 99% rename from inst/report/scripts/viz_utils.R rename to R/viz_utils.R index aa62e126..590f987e 100644 --- a/inst/report/scripts/viz_utils.R +++ b/R/viz_utils.R @@ -732,7 +732,7 @@ getDomArchIPRGenesPlot <- function(app_data, query_iprDatabases, your data correctly.") } - if (is.null(da_iprDatabases) || length(da_iprDatabases) == 0) { + if (is.null(query_iprDatabases) || length(query_iprVisType) == 0) { stop("Please select an analysis.") } diff --git a/inst/report/report_template.Rmd b/inst/report/report_template.Rmd index 195e7887..9d616e9e 100644 --- a/inst/report/report_template.Rmd +++ b/inst/report/report_template.Rmd @@ -473,12 +473,12 @@ if(length(params$repAccNums) >= 3 ) { # Call MSA2PDF msa_pdf_path <- tempfile() msa_parent_dir <- dirname(params$app_data@fasta_path) - msa_subdir_path <- file.path(parent_dir, "msa_figs") - if (!dir.exists(subdir_path)) { - dir.create(subdir_path, recursive = TRUE) - message("Subdirectory 'msa_figs' created at: ", subdir_path) + msa_subdir_path <- file.path(msa_parent_dir, "msa_figs") + if (!dir.exists(msa_subdir_path)) { + dir.create(msa_subdir_path, recursive = TRUE) + message("Subdirectory 'msa_figs' created at: ", msa_subdir_path) } else { - message("Subdirectory 'msa_figs' already exists at: ", subdir_path) + message("Subdirectory 'msa_figs' already exists at: ", msa_subdir_path) } msa_prefix <- paste0(msa_subdir_path, "/") post_fix <- paste("msa", params$query_pin, params$PhyloSelect, params$msa_reduce_by, ".pdf", sep = "_") From 7b50cbf60345736930a8bfb754d0122422647f9a Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Thu, 21 Nov 2024 23:12:45 +0200 Subject: [PATCH 16/23] add libraries to description Signed-off-by: Awa Synthia --- DESCRIPTION | 3 +++ R/MolEvolData_class.R | 3 --- R/combine_files.R | 16 ++++++++-------- R/run_pipeline.R | 14 -------------- R/viz_utils.R | 7 ------- man/combineFiles.Rd | 2 +- 6 files changed, 12 insertions(+), 33 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0fa9a949..fd0ba89a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,6 +40,7 @@ Imports: DBI, doParallel, dplyr, + DT, forcats, fs, furrr, @@ -51,6 +52,7 @@ Imports: here, htmltools, htmlwidgets, + httr, igraph, latexpdf, magrittr, @@ -73,6 +75,7 @@ Imports: tibble, tidyr, tinytex, + tidyverse, tools, UpSetR, viridis, diff --git a/R/MolEvolData_class.R b/R/MolEvolData_class.R index bfdd846c..427bd85b 100644 --- a/R/MolEvolData_class.R +++ b/R/MolEvolData_class.R @@ -584,9 +584,6 @@ validateEvalue <- function(input_value) { # Author(s): JK # Last modified: 2023_06 #=============================================================================== - -library("Biostrings") -library("tidyverse") #------------------------------------------------------------------------------- guessSeqType <- function(single_fasta, dna_guess_cutoff = 0.9, other_guess_cutoff = 0.5) { tb <- as_tibble(alphabetFrequency(single_fasta)) diff --git a/R/combine_files.R b/R/combine_files.R index 4f03b1d2..1c5ac6a2 100755 --- a/R/combine_files.R +++ b/R/combine_files.R @@ -24,22 +24,22 @@ #' #' @author Janani Ravi #' -#' @param inpath Character. The master directory path where the files reside. +#' @param inpath Character. The master directory path where the files reside. #' The search is recursive (i.e., it will look in subdirectories as well). -#' @param pattern Character. A search pattern to identify files to be combined. +#' @param pattern Character. A search pattern to identify files to be combined. #' Default is "*full_analysis.tsv". -#' @param delim Character. The delimiter used in the input files. -#' Default is tab ("\t"). -#' @param skip Integer. The number of lines to skip at the beginning of each file. +#' @param delim Character. The delimiter used in the input files. +#' Default is tab \code{"\t"}. +#' @param skip Integer. The number of lines to skip at the beginning of each file. #' Default is 0. -#' @param col_names Logical or character vector. If TRUE, the first row of each file -#' is treated as column names. Alternatively, a character vector can +#' @param col_names Logical or character vector. If TRUE, the first row of each file +#' is treated as column names. Alternatively, a character vector can #' be provided to specify custom column names. #' #' @importFrom purrr pmap_dfr #' @importFrom readr cols #' -#' @return A data frame containing the combined contents of all matched files. +#' @return A data frame containing the combined contents of all matched files. #' Each row will include a new column "ByFile" indicating the source file of the data. #' #' @export diff --git a/R/run_pipeline.R b/R/run_pipeline.R index ffcc4f52..c49b5f06 100644 --- a/R/run_pipeline.R +++ b/R/run_pipeline.R @@ -1,12 +1,6 @@ # Author(s): Awa Synthia # Last modified: 2024 -# Load necessary libraries -library(httr) -library(data.table) -library(readr) -library(rentrez) - getSeqs <- function(sequences, acc_file_path = "accs.txt", dir_path = "~", @@ -320,7 +314,6 @@ acc2info <- function(infile, prefix, outdir) { } } - if (any_atomic) { parsed_data <- data.frame( AccNum = docsums$oslt$value, @@ -520,7 +513,6 @@ subsAccnum4cc2Info <- function(df_acc2info, df_header_map) { return(df_result) } - replaceAccNums <- function(path_acc2info, path_query_header_map, path_out) { @@ -540,7 +532,6 @@ replaceAccNums <- function(path_acc2info, write_tsv(df_acc2info_substituted, file = path_out, col_names = TRUE) } - runDELTABLAST <- function(infile, prefix, outdir, db = "refseq_protein", nhits = 5000, evalue = 1e-5, @@ -576,9 +567,7 @@ runDELTABLAST <- function(infile, prefix, outdir, cat("DELTABLAST completed.\n") } - # This script converts AccNum to Fasta using NCBI's EDirect or EBI's API - convertAccNum2Fasta <- function(infile, prefix, outdir) { # Create the output file path @@ -647,7 +636,6 @@ convertAccNum2Fasta <- function(infile, prefix, outdir) { cat("#####################\n") } - cleanupBlast <- function(infile_blast, acc2info, prefix, wblast = F) { outdir <- dirname(infile_blast) @@ -720,7 +708,6 @@ cleanupBlast <- function(infile_blast, acc2info, prefix, wblast = F) { write_tsv(blast_names, file_name, col_names = TRUE) } - # Function to run BLASTCLUST on given input runCDHIT <- function(infile, suffix, outdir) { @@ -1090,7 +1077,6 @@ web_blast_colnames <- c("Query", "AccNum", "EValue", "BitScore", "PcPosOrig", "QSFrames") # specific to "blastx" - # BLAST Command line cl_blast_colnames <- c("Query", "SAccNum", "AccNum", "SAllSeqID", "STitle", "Species", "TaxID", diff --git a/R/viz_utils.R b/R/viz_utils.R index 590f987e..5549ff97 100644 --- a/R/viz_utils.R +++ b/R/viz_utils.R @@ -1,13 +1,6 @@ # Author(s): Awa Synthia # Last modified: 2024 -# Load necessary packages -library(dplyr) -library(stringr) -library(visNetwork) -library(DT) -library(plotly) - # Function to generate the InterProScan Visualization getIPRGenesVisualization <- function(data, app_data, query_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"), diff --git a/man/combineFiles.Rd b/man/combineFiles.Rd index 81464fa6..afbc6c5f 100644 --- a/man/combineFiles.Rd +++ b/man/combineFiles.Rd @@ -20,7 +20,7 @@ The search is recursive (i.e., it will look in subdirectories as well).} Default is "*full_analysis.tsv".} \item{delim}{Character. The delimiter used in the input files. -Default is tab ("\t").} +Default is tab \code{"\t"}.} \item{skip}{Integer. The number of lines to skip at the beginning of each file. Default is 0.} From beb01c2699f4d6d71083e2aa591cb8862ec1c594 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Thu, 21 Nov 2024 23:43:51 +0200 Subject: [PATCH 17/23] short fix Signed-off-by: Awa Synthia --- DESCRIPTION | 1 + R/run_pipeline.R | 3 +++ 2 files changed, 4 insertions(+) diff --git a/DESCRIPTION b/DESCRIPTION index fd0ba89a..e2ff7eca 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -53,6 +53,7 @@ Imports: htmltools, htmlwidgets, httr, + httr2, igraph, latexpdf, magrittr, diff --git a/R/run_pipeline.R b/R/run_pipeline.R index c49b5f06..b94ba001 100644 --- a/R/run_pipeline.R +++ b/R/run_pipeline.R @@ -1,6 +1,9 @@ # Author(s): Awa Synthia # Last modified: 2024 +# Temporal fix. should work normally in descr +library(readr) + getSeqs <- function(sequences, acc_file_path = "accs.txt", dir_path = "~", From a1a90f9c3854cf39fa6b78e665f25a7e9f678e26 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Fri, 22 Nov 2024 00:08:46 +0200 Subject: [PATCH 18/23] short fix Signed-off-by: Awa Synthia --- NAMESPACE | 9 +++++++++ R/generate_report.R | 3 ++- R/run_pipeline.R | 4 ++-- R/viz_utils.R | 3 +++ 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 1a2f3a99..b22ec462 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -99,6 +99,15 @@ export(wordcloud3) export(writeMSA_AA2FA) export(writeProcessRuntime2TSV) export(writeProcessRuntime2YML) +import(DT) +import(data.table) +import(dplyr) +import(httr) +import(plotly) +import(readr) +import(rentrez) +import(stringr) +import(visNetwork) importFrom(Biostrings,AAStringSet) importFrom(Biostrings,readAAStringSet) importFrom(Biostrings,toString) diff --git a/R/generate_report.R b/R/generate_report.R index 98377448..aa2872c8 100644 --- a/R/generate_report.R +++ b/R/generate_report.R @@ -1,7 +1,8 @@ # Author(s): Awa Synthia # Last modified: 2024 -# load functions to use +# import libs +#' @import readr stringr #' @export # example usage: getCaseStudyReport("Acinetobacter baumannii", "Beta-lactams") diff --git a/R/run_pipeline.R b/R/run_pipeline.R index b94ba001..24b08272 100644 --- a/R/run_pipeline.R +++ b/R/run_pipeline.R @@ -1,8 +1,8 @@ # Author(s): Awa Synthia # Last modified: 2024 -# Temporal fix. should work normally in descr -library(readr) +# import libs +#' @import readr data.table httr rentrez getSeqs <- function(sequences, acc_file_path = "accs.txt", diff --git a/R/viz_utils.R b/R/viz_utils.R index 5549ff97..6f4a2b3e 100644 --- a/R/viz_utils.R +++ b/R/viz_utils.R @@ -1,6 +1,9 @@ # Author(s): Awa Synthia # Last modified: 2024 +# import libs +#' @import dplyr stringr visNetwork DT plotly +#' # Function to generate the InterProScan Visualization getIPRGenesVisualization <- function(data, app_data, query_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"), From 3d0ca3a5079db266a5f3a18d8de7452a3f587927 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Fri, 22 Nov 2024 08:21:45 +0200 Subject: [PATCH 19/23] add fix Signed-off-by: Awa Synthia --- NAMESPACE | 2 ++ R/MolEvolData_class.R | 2 ++ 2 files changed, 4 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index b22ec462..eadec16a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -99,6 +99,7 @@ export(wordcloud3) export(writeMSA_AA2FA) export(writeProcessRuntime2TSV) export(writeProcessRuntime2YML) +import(Biostrings) import(DT) import(data.table) import(dplyr) @@ -107,6 +108,7 @@ import(plotly) import(readr) import(rentrez) import(stringr) +import(tidyverse) import(visNetwork) importFrom(Biostrings,AAStringSet) importFrom(Biostrings,readAAStringSet) diff --git a/R/MolEvolData_class.R b/R/MolEvolData_class.R index 427bd85b..ab496048 100644 --- a/R/MolEvolData_class.R +++ b/R/MolEvolData_class.R @@ -1,6 +1,8 @@ # Author(s): Samuel Chen # Last modified: 2020 +#' @import Biostrings tidyverse + setClass("MolEvolData", slots = list( df = "data.frame", From 9e11b4f69732c9b08850d7be5d4e951100679f87 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Sun, 24 Nov 2024 01:04:32 +0200 Subject: [PATCH 20/23] use iprscanr Signed-off-by: Awa Synthia --- R/generate_report.R | 18 +++---- R/ipr2viz.R | 6 +-- R/run_pipeline.R | 85 +++++++++++++++++++++------------ R/viz_utils.R | 4 +- inst/report/report_template.Rmd | 6 +-- man/getTopAccByLinDomArch.Rd | 2 +- 6 files changed, 73 insertions(+), 48 deletions(-) diff --git a/R/generate_report.R b/R/generate_report.R index aa2872c8..65b8364d 100644 --- a/R/generate_report.R +++ b/R/generate_report.R @@ -129,13 +129,13 @@ runAnalysis <- function( GCCutoff = 0.5, query_select = NULL, query_iprDatabases = c( - "Pfam", "SMART", "Phobius", - "Gene3D", "TMHMM", "SignalP_GRAM_POSITIVE", - "SUPERFAMILY", "MobiDBLite", "TIGRFAM", "PANTHER", "Coils" + "PfamA", "SMART", "Phobius", + "Gene3d", "TMHMM", "SignalP_GRAM_POSITIVE", + "SuperFamily", "MobiDBLite", "Panther", "Coils" ), query_iprVisType = "Analysis", tree_msa_tool = "ClustalO", levels = 2, - DA_Col = "DomArch.Pfam", + DA_Col = "DomArch.PfamA", msa_rep_num = 10, msa_reduce_by = "Species", rval_phylo = FALSE, @@ -496,6 +496,7 @@ runAnalysis <- function( phylo = phylo, type = type, job_code = pin_id, + APPLICATION = query_iprDatabases ) } @@ -656,7 +657,7 @@ runAnalysis <- function( DA_Prot_value <- getDomArchProt(app_data, DASelect) - DALinPlot_value <- getDomArchHeatmapPlot(DA_col = "DomArch.Pfam", + DALinPlot_value <- getDomArchHeatmapPlot(DA_col = "DomArch.PfamA", DACutoff, DA_Prot_value, DA_lin_color = "viridis", @@ -664,14 +665,14 @@ runAnalysis <- function( DALin_TotalCounts_value <- getDomArchTotalCounts(DA_Prot_value, DACutoff, - DA_col = "DomArch.Pfam", + DA_col = "DomArch.PfamA", app_data) - DALinTable_value <- getDomArchLinearTable(DA_col = "DomArch.Pfam", + DALinTable_value <- getDomArchLinearTable(DA_col = "DomArch.PfamA", app_data@ipr_path, DALin_TotalCounts_value) - DANetwork_value <- getDomNetwork(DA_col = "DomArch.Pfam", + DANetwork_value <- getDomNetwork(DA_col = "DomArch.PfamA", DACutoff, DA_Prot_value, networkLayout = "nice", @@ -728,6 +729,7 @@ runAnalysis <- function( output_file = output_file, params = params, envir = new.env(parent = globalenv())) + browseURL(output_file) }, error = function(e) { return(paste("Error in report generation:", e$message)) }) diff --git a/R/ipr2viz.R b/R/ipr2viz.R index fb145c67..51650761 100644 --- a/R/ipr2viz.R +++ b/R/ipr2viz.R @@ -79,7 +79,7 @@ themeGenes2 <- function() { #' n = 20, query = "specific_query_name") #' } getTopAccByLinDomArch <- function(infile_full, - DA_col = "DomArch.Pfam", + DA_col = "DomArch.PfamA", lin_col = "Lineage_short", n = 20, query) { @@ -187,7 +187,7 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), ## Getting top n accession numbers using getTopAccByLinDomArch() top_acc <- getTopAccByLinDomArch( infile_full = infile_full, - DA_col = "DomArch.Pfam", + DA_col = "DomArch.PfamA", ## @SAM, you could pick by the Analysis w/ max rows! lin_col = "Lineage_short", n = topn, query = query @@ -360,7 +360,7 @@ plotIPR2VizWeb <- function(infile_ipr, ## @SAM, colnames, merges, everything neeeds to be done now based on the ## combined lookup table from "common_data" - lookup_tbl_path <- "/data/research/jravilab/common_data/cln_lookup_tbl.tsv" + lookup_tbl_path <- system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE) lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = MolEvolvR::lookup_table_cols) ## Read IPR file and subset by Accessions diff --git a/R/run_pipeline.R b/R/run_pipeline.R index 24b08272..d3f60e98 100644 --- a/R/run_pipeline.R +++ b/R/run_pipeline.R @@ -45,7 +45,8 @@ runFull <- function( job_code=NULL, submitter_email=NULL, advanced_options=NULL, - get_slurm_mails=FALSE + get_slurm_mails=FALSE, + APPLICATION ) { # Set working directory setwd(dir) @@ -86,7 +87,7 @@ runFull <- function( # output_file <- paste0(dir, "/blast_output_", i, ".txt") # Construct the local BLAST command (make sure 'blastn' is available locally) - runMolevolvrPipeline(input_file, DB, NHITS, EVAL, is_query = F, type, i) + runMolevolvrPipeline(input_file, DB, NHITS, EVAL, is_query = F, type, i, APPLICATION = APPLICATION) #cmd <- sprintf( # "deltablast -query %s -db %s -out %s -num_alignments %d -evalue %f -remote", @@ -104,7 +105,7 @@ runFull <- function( } # Simulate query run locally - runMolevolvrPipeline(sequences, DB, NHITS, EVAL, is_query = TRUE, type) + runMolevolvrPipeline(sequences, DB, NHITS, EVAL, is_query = TRUE, type, APPLICATION = APPLICATION) # cmd_query <- sprintf( # "deltablast -query %s -db %s -out %s_query.txt -num_alignments %d -evalue %f -remote", # sequences, DB, paste0(dir, "/query_output"), NHITS, EVAL @@ -122,7 +123,7 @@ runFull <- function( # Define the main pipeline function runMolevolvrPipeline <- function(input_paths, db, nhits, eval, - is_query, type, i) { + is_query, type, i, APPLICATION) { # Start time start <- Sys.time() @@ -213,18 +214,8 @@ runMolevolvrPipeline <- function(input_paths, db, nhits, eval, # Run INTERPROSCAN runIPRScan2(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")), - PREFIX, OUTDIR) - new_header <- c("AccNum", "SeqMD5Digest", "SLength", "Analysis", "DB.ID", - "SignDesc", "StartLoc", "StopLoc", "Score", - "Status", "RunDate", "IPRAcc", "IPRDesc") - - temp_data <- read_tsv(file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")), - col_names = FALSE) - - colnames(temp_data) <- new_header + prefix = PREFIX, outdir = OUTDIR, appl = APPLICATION) - write.table(temp_data, file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")), - sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE) # Run IPR2LIN ipr2Linear(file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")), file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), PREFIX) @@ -849,7 +840,7 @@ clust2Table <- function(clust, blast) { } # Function to run InterProScan -runIPRScan2 <- function(query_file, prefix, outdir) { +runIPRScan2 <- function(query_file, prefix, outdir, appl) { # Start InterProScan run cat("\n######################\n") @@ -868,19 +859,42 @@ runIPRScan2 <- function(query_file, prefix, outdir) { # get the path to the interproscan.sh script from the environment # variable INTERPROSCAN_CMD, or assume it's on the path if unspecified - iprscan_cmd <- Sys.getenv("INTERPROSCAN_CMD", unset="interproscan.sh") - - command <- paste( - iprscan_cmd, "-i", - shQuote(query_file), - "-b", shQuote(outfile), - "-f TSV --cpu", Sys.getenv("INTERPROSCAN_CPUS", "4"), - "--appl Pfam,MobiDBlite,Phobius,Coils,SignalP_GRAM_POSITIVE,", - "SignalP_GRAM_NEGATIVE,Hamap,Gene3D,SignalP_EUK" - ) + # iprscan_cmd <- Sys.getenv("INTERPROSCAN_CMD", unset="interproscan.sh") + # + # command <- paste( + # iprscan_cmd, "-i", + # shQuote(query_file), + # "-b", shQuote(outfile), + # "-f TSV --cpu", Sys.getenv("INTERPROSCAN_CPUS", "4"), + # "--appl Pfam,MobiDBlite,Phobius,Coils,SignalP_GRAM_POSITIVE,", + # "SignalP_GRAM_NEGATIVE,Hamap,Gene3D,SignalP_EUK" + # ) # Run the command - system(command) + # system(command) + submit_ipr(path2seq = query_file, + outfolder = outdir, + appl = appl, + email = "jravilab.msu@gmail.com") + + new_header <- c("AccNum", "SeqMD5Digest", "SLength", "Analysis", "DB.ID", + "SignDesc", "StartLoc", "StopLoc", "Score", + "Status", "RunDate", "IPRAcc", "IPRDesc") + + temp_data <- read_tsv(file.path(outdir, paste0("ipr_joined.tsv"))) + + # Drop the last two columns + temp_data <- temp_data[, -((ncol(temp_data) - 1):ncol(temp_data))] + + # remove last 10 xters in AccNum + temp_data$file_name <- substr(temp_data$file_name, 1, + nchar(temp_data$file_name) - 12) + + colnames(temp_data) <- new_header + + write.table(temp_data, file.path(outdir, paste0(prefix, ".iprscan.tsv")), + sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE) + cat("##################\n") cat("END OF IPRSCAN RUN\n") @@ -893,6 +907,10 @@ ipr2Linear <- function(ipr, acc2info, prefix) { ipr_in <- read_tsv(ipr, col_names = TRUE) %>% mutate(DB.ID = gsub("G3DSA:", "", DB.ID)) + if (prefix == "query_data") { + ipr_in$AccNum <- sub("(.*)([0-9])([0-9])$", "\\1.\\2_\\3", ipr_in$AccNum) + } + acc2info_out <- fread(input = acc2info, sep = "\t", header = T, fill = T) %>% mutate(FullAccNum = gsub("\\|", "", FullAccNum)) %>% mutate(FullAccNum = gsub(".*[a-z]", "", FullAccNum)) @@ -991,9 +1009,9 @@ ipr2Linear <- function(ipr, acc2info, prefix) { ipr2DomArch <- function(infile_ipr, prefix, analysis = c( - "Pfam", "SMART", "Phobius", - "Gene3D", "TMHMM", "SignalP_GRAM_POSITIVE", - "SUPERFAMILY", "MobiDBLite", "TIGRFAM", "PANTHER", "Coils" + "PfamA", "SMART", "Phobius", + "Gene3d", "TMHMM", "SignalP_GRAM_POSITIVE", + "SuperFamily", "MobiDBLite", "Panther", "Coils" )) { # read in cleaned up iprscan results ipr_in <- read_tsv(infile_ipr, col_names = T, col_types = ipr_cln_cols) @@ -1012,7 +1030,12 @@ ipr2DomArch <- function(infile_ipr, prefix, group_by(Analysis) %>% arrange(StartLoc) i <- 1 - for (a in analysis) { + analysis_var <- c( + "Pfam", "SMART", "Phobius", + "Gene3D", "TMHMM", "SignalP_GRAM_POSITIVE", + "SUPERFAMILY", "MobiDBLite", "PANTHER", "Coils" + ) + for (a in analysis_var) { a_da <- DA %>% filter(Analysis == a) if (a == "SignalP_EUK" || a == "SignalP_GRAM_NEGATIVE" || a == "SignalP_GRAM_POSITIVE") { diff --git a/R/viz_utils.R b/R/viz_utils.R index 6f4a2b3e..4697204e 100644 --- a/R/viz_utils.R +++ b/R/viz_utils.R @@ -6,7 +6,7 @@ #' # Function to generate the InterProScan Visualization getIPRGenesVisualization <- function(data, app_data, - query_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"), + query_iprDatabases = c("PfamA", "Phobius", "TMHMM", "Gene3d"), query_iprVisType = "Analysis") { # Check if analysis is loaded @@ -829,7 +829,7 @@ getDomArchPlot <- function(ipr_path, query_names, } n <- "Name" - plot <- ipr2viz_web( + plot <- plotIPR2VizWeb( infile_ipr = ipr_path, accessions = query_names, analysis = analysis_type, diff --git a/inst/report/report_template.Rmd b/inst/report/report_template.Rmd index 9d616e9e..d2c9acde 100644 --- a/inst/report/report_template.Rmd +++ b/inst/report/report_template.Rmd @@ -277,8 +277,8 @@ Visualizations and analyses of all query and homologous protein domains, structu ```{r, echo=FALSE} if(is.null(params$query_iprDatabases)){ choices <- params$query_domarch_cols - if ("Pfam" %in% choices & "Phobius" %in% choices) { - analysis_type <- c("Pfam", "Phobius") + if ("PfamA" %in% choices & "Phobius" %in% choices) { + analysis_type <- c("PfamA", "Phobius") } else { default <- choices[1] } @@ -293,7 +293,7 @@ if(is.null(params$query_iprVisType)){ } if(length(params$query_domarch_cols) >= 1 ) { - DomArchPlot <- ipr2viz_web( + DomArchPlot <- plotIPR2VizWeb( infile_ipr = params$query_data@ipr_path, accessions = params$query_data@df$QueryName, analysis = analysis_type, group_by = analysis_group, name = "Name" diff --git a/man/getTopAccByLinDomArch.Rd b/man/getTopAccByLinDomArch.Rd index 0eeb0610..9e91d606 100644 --- a/man/getTopAccByLinDomArch.Rd +++ b/man/getTopAccByLinDomArch.Rd @@ -6,7 +6,7 @@ \usage{ getTopAccByLinDomArch( infile_full, - DA_col = "DomArch.Pfam", + DA_col = "DomArch.PfamA", lin_col = "Lineage_short", n = 20, query From 53c94dea592522ea655621666b788d30026d7ec8 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Sun, 24 Nov 2024 12:27:26 +0200 Subject: [PATCH 21/23] make it work Signed-off-by: Awa Synthia --- R/networks_domarch.R | 40 ++++++++++++++++----------------- inst/report/report_template.Rmd | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/R/networks_domarch.R b/R/networks_domarch.R index 84522943..b171b3a8 100755 --- a/R/networks_domarch.R +++ b/R/networks_domarch.R @@ -24,20 +24,20 @@ #' A network of domains is returned based on shared domain architectures. #' #' @param prot A data frame that contains the column 'DomArch'. -#' @param column Name of column containing Domain architecture from which nodes +#' @param column Name of column containing Domain architecture from which nodes #' and edges are generated. #' @param domains_of_interest Character vector specifying domains of interest. -#' @param cutoff Integer. Only use domains that occur at or above the cutoff for +#' @param cutoff Integer. Only use domains that occur at or above the cutoff for #' total counts if cutoff_type is "Total Count". -#' Only use domains that appear in cutoff or greater lineages if cutoff_type is +#' Only use domains that appear in cutoff or greater lineages if cutoff_type is #' Lineage. #' @param layout Character. Layout type to be used for the network. Options are: #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"} -#' @param query_color Character. Color to represent the queried domain in the +#' @param query_color Character. Color to represent the queried domain in the #' network. #' #' @importFrom dplyr across add_row all_of distinct filter mutate pull select -#' @importFrom igraph delete_vertices graph_from_edgelist vertex +#' @importFrom igraph delete_vertices graph_from_edgelist vertex V #' @importFrom purrr map #' @importFrom rlang sym #' @importFrom shiny showNotification @@ -174,21 +174,21 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c if ("X" %in% V(g)$name) { g <- delete_vertices(g, "X") } - V(g)$size <- as.numeric(wc[V(g)$name]) + igraph::V(g)$size <- as.numeric(wc[igraph::V(g)$name]) - V(g)$size <- (V(g)$size - min(V(g)$size)) / (max(V(g)$size) - min(V(g)$size)) * 20 + 10 # scaled by degree + igraph::V(g)$size <- (igraph::V(g)$size - min(igraph::V(g)$size)) / (max(igraph::V(g)$size) - min(igraph::V(g)$size)) * 20 + 10 # scaled by degree # setting vertex color by size - V(g)$color <- rainbow(5, alpha = .5)[round((V(g)$size - min(V(g)$size)) / (max(V(g)$size) - min(V(g)$size)) * 4 + 1)] - V(g)$frame.color <- V(g)$color + igraph::V(g)$color <- rainbow(5, alpha = .5)[round((igraph::V(g)$size - min(igraph::V(g)$size)) / (max(igraph::V(g)$size) - min(igraph::V(g)$size)) * 4 + 1)] + igraph::V(g)$frame.color <- igraph::V(g)$color # scaling edge width - E(g)$width <- e.sz - E(g)$width <- ifelse(log(E(g)$width) == 0, .3, log(E(g)$width)) + igraph::E(g)$width <- e.sz + igraph::E(g)$width <- ifelse(log(igraph::E(g)$width) == 0, .3, log(igraph::E(g)$width)) # coloring edges by width ew <- c(2.7, 4.5) - E(g)$color <- sapply( - E(g)$width, - function(x) if (x >= ew[1] && x <= ew[2]) E(g)$color <- adjustcolor("cadetblue", alpha.f = .7) else if (x > ew[2]) E(g)$color <- adjustcolor("maroon", alpha.f = .5) else E(g)$color <- "gray55" + igraph::E(g)$color <- sapply( + igraph::E(g)$width, + function(x) if (x >= ew[1] && x <= ew[2]) igraph::E(g)$color <- adjustcolor("cadetblue", alpha.f = .7) else if (x > ew[2]) igraph::E(g)$color <- adjustcolor("maroon", alpha.f = .5) else igraph::E(g)$color <- "gray55" ) vis_g <- visIgraph(g, type = "full") } @@ -231,18 +231,18 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c #' #' #' @param prot A data frame that contains the column 'DomArch'. -#' @param column Name of column containing Domain architecture from which nodes +#' @param column Name of column containing Domain architecture from which nodes #' and edges are generated. #' @param domains_of_interest Character vector specifying the domains of interest. -#' @param cutoff Integer. Only use domains that occur at or above the cutoff for +#' @param cutoff Integer. Only use domains that occur at or above the cutoff for #' total counts if cutoff_type is "Total Count". -#' Only use domains that appear in cutoff or greater lineages if cutoff_type is +#' Only use domains that appear in cutoff or greater lineages if cutoff_type is #' Lineage. #' @param layout Character. Layout type to be used for the network. Options are: #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"} -#' @param query_color Color that the nodes of the domains in the +#' @param query_color Color that the nodes of the domains in the #' domains_of_interest vector are colored -#' @param partner_color Color that the nodes that are not part of the +#' @param partner_color Color that the nodes that are not part of the #' domains_of_interest vector are colored #' @param border_color Color for the borders of the nodes. #' @param IsDirected Is the network directed? Set to false to eliminate arrows @@ -364,4 +364,4 @@ createBinaryDomainNetwork <- function(prot, column = "DomArch", domains_of_inter "auto" = visIgraphLayout(vg, "layout.auto") ) vg -} \ No newline at end of file +} diff --git a/inst/report/report_template.Rmd b/inst/report/report_template.Rmd index d2c9acde..ab02bdeb 100644 --- a/inst/report/report_template.Rmd +++ b/inst/report/report_template.Rmd @@ -455,7 +455,7 @@ if(length(params$repAccNums) >= 3 ) { query_accession <- params$app_data@df %>% filter(!duplicated(QueryName)) query_accession <- unique(query_accession$Query) query <- seqs[query_accession] - names(query) <- params$PhyloSelect + # names(query) <- params$PhyloSelect query <- AAStringSet(query) } From 3c57ec71f247d42efa8074a66f17a2215fba8ac7 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Sun, 24 Nov 2024 14:12:12 +0200 Subject: [PATCH 22/23] add iprscanr to desc Signed-off-by: Awa Synthia --- DESCRIPTION | 1 + NAMESPACE | 1 + R/run_pipeline.R | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index e2ff7eca..8933ae4c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -55,6 +55,7 @@ Imports: httr, httr2, igraph, + iprscanr, latexpdf, magrittr, msa, diff --git a/NAMESPACE b/NAMESPACE index eadec16a..a5a44a79 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -104,6 +104,7 @@ import(DT) import(data.table) import(dplyr) import(httr) +import(iprscanr) import(plotly) import(readr) import(rentrez) diff --git a/R/run_pipeline.R b/R/run_pipeline.R index d3f60e98..fe1fa4c9 100644 --- a/R/run_pipeline.R +++ b/R/run_pipeline.R @@ -2,7 +2,7 @@ # Last modified: 2024 # import libs -#' @import readr data.table httr rentrez +#' @import readr data.table httr rentrez iprscanr getSeqs <- function(sequences, acc_file_path = "accs.txt", From f6b379116d3db78164d9c7db98e475b6fedc7c02 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Mon, 25 Nov 2024 14:09:31 +0200 Subject: [PATCH 23/23] add iprscanr to desc suggests Signed-off-by: Awa Synthia --- DESCRIPTION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8933ae4c..a30025de 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -55,7 +55,6 @@ Imports: httr, httr2, igraph, - iprscanr, latexpdf, magrittr, msa, @@ -87,4 +86,5 @@ Imports: XVector, yaml Suggests: - knitr + knitr, + iprscanr