From 57671cee2a7abde00fb93604cccff577680c2fbf Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Sat, 26 Oct 2024 11:17:28 +0300
Subject: [PATCH 01/23] Add scripsts to run case study report

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 inst/report/report_template.Rmd              |  485 +++++++
 inst/report/scripts/MolEvolData_class.R      |  751 ++++++++++
 inst/report/scripts/generate_report.R        |  668 +++++++++
 inst/report/scripts/run_molevolvr_pipeline.R | 1289 ++++++++++++++++++
 inst/report/scripts/viz_utils.R              |  742 ++++++++++
 5 files changed, 3935 insertions(+)
 create mode 100644 inst/report/report_template.Rmd
 create mode 100644 inst/report/scripts/MolEvolData_class.R
 create mode 100644 inst/report/scripts/generate_report.R
 create mode 100644 inst/report/scripts/run_molevolvr_pipeline.R
 create mode 100644 inst/report/scripts/viz_utils.R
diff --git a/inst/report/report_template.Rmd b/inst/report/report_template.Rmd
new file mode 100644
index 00000000..c42aa443
--- /dev/null
+++ b/inst/report/report_template.Rmd
@@ -0,0 +1,485 @@
+---
+title: MolEvolvR
+output:
+  html_document: 
+    self_contained: yes
+params:
+  rs_interproscan_visualization: default
+  proximity_network: default
+  sunburst: default
+  data: default
+  queryDataTable: default
+  fastaDataText: default
+  heatmap: default
+  query_data: default
+  query_domarch_cols: default
+  query_iprDatabases: default
+  query_iprVisType: default
+  mainTable: default
+  DALinTable: default
+  DALinPlot: default
+  DANetwork: default
+  DA_Prot: default
+  domarch_cols: default
+  DA_Col: default
+  DACutoff: default
+  da_interproscan_visualization: default
+  phylo_sunburst_levels: default
+  phylo_sunburst: default
+  lineage_table: default
+  ## Tree
+  tree_msa_tool: default ##input
+  ## MSA
+  rep_accnums: default
+  msa_rep_num: default ##input
+  app_data: default
+  PhyloSelect: default ##input
+  acc_to_name: default
+  rval_phylo: default
+  query_pin: default
+  msa_reduce_by: default
+---
+
+---
+subtitle: "`r glue::glue('Retrieval Code: {params$query_pin}')`"
+---
+
+<style type="text/css">
+.main-container {
+  max-width: 100% !important;
+  margin: auto;
+}
+</style>
+
+<style>
+.full-results-button {
+  vertical-align: middle;
+  // -moz-box-shadow: 0px 10px 14px -7px #000000;
+  // -webkit-box-shadow: 0px 10px 14px -7px #000000;
+  // box-shadow: 0px 10px 14px -7px #000000;
+  -moz-border-radius: 4px;
+  -webkit-border-radius: 4px;
+  border-radius: 4px;
+  border: 1px solid #46b8da;
+  display: inline-block;
+  font-size: 15px; 
+  padding: 6px 10px;
+  text-decoration: none; /*no underline!!*/
+  cursor: pointer;
+}
+.full-results-button:active { /*simulate movement*/
+  position: relative;
+  top: 1px;
+}
+</style>
+
+```{r mine,results="asis",engine="js", echo=FALSE}
+// stumble through the tabset(s)
+setTimeout(function(){
+  document.querySelector('a#FullResultButton1').addEventListener('click', doIt);
+  function doIt(){
+    document.querySelector('a[href*="#domain-architecture-2"]').click(); //Second(-2) Domain Architecture Section (reuse at different heading levels)
+  }
+}, 100); // slow your roll
+
+setTimeout(function(){
+  document.querySelector('a#FullResultButton2').addEventListener('click', doIt);
+  function doIt(){
+    document.querySelector('a[href*="#phylogeny-1"]').click(); //Second Phylogeny(-1) Section (reuse at different heading levels)
+  }
+}, 100); 
+
+setTimeout(function(){
+  document.querySelector('a#FullResultButton3').addEventListener('click', doIt);
+  function doIt(){
+    document.querySelector('a[href*="#homolog-data"]').click(); //Homolog Data Section 
+  }
+}, 100); 
+```
+
+```{r setup, include = FALSE}
+## Globally Suppress warnings and messaging
+knitr::opts_chunk$set(warning = FALSE, message = FALSE)
+## Create Sunburst Report Function
+cpcols <- c(
+    "#AFEEEE", "#DDA0DD", "#EE2C2C", "#CDBE70", "#B0B099",
+    "#8B2323", "#EE7600", "#EEC900", "chartreuse3", "#0000FF",
+    "#FFD900", "#32CD32", "maroon4", "cornflowerblue", "darkslateblue",
+    "#AB82FF", "#CD6889", "#FFA07A", "#FFFF00", "#228B22",
+    "#FFFFE0", "#FFEC8B", "peru", "#668B8B", "honeydew",
+    "#A020F0", "grey", "#8B4513", "#191970", "#00FF7F",
+    "lemonchiffon", "#66CDAA", "#5F9EA0", "#A2CD5A", "#556B2F",
+    "#EEAEEE", "thistle4", "#473C8B", "#FFB6C1", "#8B1C62",
+    "#FFE4B5", "black", "#FF7F50", "#FFB90F", "#FF69B4", "#836FFF",
+    "#757575", "#CD3333", "#EE7600", "#CDAD00", "#556B2F", "#7AC5CD"
+    )
+lineage_sunburst_report <- function(prot, lineage_column = "Lineage",
+    type = "sunburst",
+    levels = 2, colors = NULL, legendOrder = NULL, showLegend = TRUE, maxLevels = 5) {
+    lin_col <- sym(lineage_column)
+
+    # ensure they don't exceed maxLevels, although this should
+    if (!is.null(maxLevels) && levels > maxLevels) {
+        levels <- maxLevels
+    }
+
+    levels_vec <- c()
+    for (i in 1:levels)
+    {
+        levels_vec <- append(levels_vec, paste0("level", i))
+    }
+
+    # Take lineage column and break into the first to levels
+    prot <- prot %>%
+        select({{ lin_col }}) %>%
+        arrange(desc({{ lin_col }})) %>%
+        drop_na({{ lin_col }})
+    protLevels <- prot %>% separate({{ lin_col }}, into = levels_vec, sep = ">")
+    # Count the occurrance of each group of levels
+    protLevels <- protLevels %>%
+        group_by_at(levels_vec) %>%
+        summarise(size = n())
+    protLevels <- protLevels %>% arrange()
+    tree <- d3_nest(protLevels, value_cols = "size")
+
+    # Plot sunburst
+    if (type == "sunburst") {
+        result <- sunburst(tree, legend = list(w = 225, h = 15, r = 5, s = 5), colors = cpcols, legendOrder = legendOrder)
+    } else if (type == "sund2b") {
+        result <- sund2b(tree)
+    }
+
+    if (showLegend) {
+        return(
+            htmlwidgets::onRender(
+                result,
+                "function(el, x) {
+          jQuery('.sunburst-togglelegend', el)
+            .attr('checked', 'true')
+            .attr('data-html2canvas-ignore', 'true');
+          jQuery('.sunburst-legend', el).css('visibility', '');
+
+          // create a button to download the sunburst
+          // (relies on html2canvas being included in the page)
+
+          // FIXME: consider pulling this all out into a js library
+          //  so that we can apply it to other components.
+
+          const downloadBtn = jQuery('<button id=\"download-sunburst\">Download Sunburst</button>')
+            .css({'position': 'absolute', 'right': '5px', 'top': '5px'})
+            .attr('data-html2canvas-ignore', 'true')
+            .appendTo(el);
+
+          saveAs = (blob, fileName) => {
+              const link = document.createElement('a');
+              link.download = fileName
+              link.href = URL.createObjectURL(blob);
+              link.click();
+              URL.revokeObjectURL(link.href);
+          }
+
+          downloadBtn.click(() => {
+            html2canvas(el, { scale: 4, logging: false }).then(canvas => {
+              canvas.toBlob(function(blob) {
+                saveAs(blob, 'sunburst.png');
+              });
+            });
+          });
+        }"
+            )
+        )
+    }
+
+    return(result)
+}
+```
+
+# {.tabset}
+
+## Results Summary
+
+An overview of the protein analysis. To view the full results, explore the additional tabs.
+
+### Domain Architecture 
+
+Visualizations and summaries for protein domains.
+
+<a class="full-results-button" id="FullResultButton1" style="background: #5bc0de; color:#fff; font-face: bold;">View full results</a>
+
+
+#### Interproscan Visualization
+
+```{r, echo=FALSE}
+params$rs_interproscan_visualization
+```
+
+#### Proximity Network
+
+```{r, echo=FALSE}
+params$proximity_network
+```
+
+### Phylogeny
+
+Visualizations for protein evolution.
+
+<a class="full-results-button" id="FullResultButton2" style="background: #5bc0de; color:#fff; font-face: bold;">View full results</a>
+
+#### Sunburst
+
+```{r, echo=FALSE}
+library(dplyr)
+library(tidyr)
+lineage_sunburst_report(params$sunburst, "Lineage", levels = 2)
+```
+
+### Data
+
+Summary table of proteins including domain architectures, phylogeny, and homologs, when applicable.
+
+<a class="full-results-button" id="FullResultButton3" style="background: #5bc0de; color:#fff; font-face: bold;">View full results</a>
+
+```{r, echo=FALSE}
+params$data
+```
+
+
+## Query Data {.tabset}
+
+Input data, additional metadata, and preliminary analyses of query protein(s).
+
+### Data Table
+
+The data table provides a summary of the sequences submitted, or \"queried\", for analysis. The preview shown can be extended by using \"Add/remove column(s)\" to see info about other taxonomic classes as well as domain architecuture codes from databases other than the default (Pfam).
+
+```{r, echo=FALSE}
+params$queryDataTable
+```
+
+### FASTA
+
+Uploaded amino acid FASTA sequence(s).
+
+`r glue::glue(params$fastaDataText)`
+
+### Query Heatmap
+
+A heatmap of submitted sequences and their respective taxonomic lineages.
+
+```{r, echo=FALSE}
+params$heatmap
+```
+
+### Domain Architecture
+
+Visualizations and analyses of all query and homologous protein domains, structural or functional subunits, and their architectures.
+
+```{r, echo=FALSE}
+if(is.null(params$query_iprDatabases)){
+    choices <- params$query_domarch_cols
+    if ("Pfam" %in% choices & "Phobius" %in% choices) {
+      analysis_type <- c("Pfam", "Phobius")
+      } else {
+        default <- choices[1]
+        }
+  } else {
+    analysis_type <- params$query_iprDatabases
+    }
+
+if(is.null(params$query_iprVisType)){
+  analysis_group <- 'Analysis'
+  } else {
+    analysis_group <- params$query_iprVisType
+    }
+
+if(length(params$query_domarch_cols) >= 1 ) {
+  DomArchPlot <- ipr2viz_web(
+    infile_ipr = params$query_data@ipr_path,
+    accessions = params$query_data@df$QueryName,
+    analysis = analysis_type, group_by = analysis_group, name = "Name"
+    )
+    DomArchPlot
+} else {
+  "No domains found in the input sequences."
+  }
+```
+
+## Homolog Data
+
+Full set of homologs of query sequences, including their lineage and domain architecture info.
+
+```{r, echo=FALSE}
+params$mainTable
+```
+
+## Domain Architecture {.tabset}
+
+Summary and visualizations of protein motifs/subunits (domains) and their configurations within the query protein(s) (domain architectures).
+
+### Table
+
+```{r, echo=FALSE}
+params$DALinTable
+```
+
+### Heatmap
+
+```{r, echo=FALSE}
+params$DALinPlot
+```
+
+### Network
+
+```{r, echo=FALSE}
+params$DANetwork
+```
+
+```{r, echo=FALSE}
+if (length(params$domarch_cols) == 0) {
+  choices <- c("None")
+  vals <- "None"
+  selected <- "None"
+  } else {
+    vals <- params$domarch_cols
+    choices <- substring(vals, first = 9)
+    selected <- if (length(choices) >= 2) choices[1:2] else choices[1]
+    }
+
+if (is.null(params$DA_col)) {
+  da_col <- vals[[1]]
+  } else {
+    da_col <- params$DA_Col
+  }
+
+plot_data <- params$DA_Prot
+plot_data[[da_col]] <- str_replace_all(plot_data[[da_col]], " ", "_")
+wordcloud_element(plot_data, colname = da_col, cutoff = params$DACutoff, UsingRowsCutoff = F)
+```
+
+### Interproscan Visualization
+
+```{r, echo=FALSE}
+params$da_interproscan_visualization
+```
+
+### UpSet Plot
+
+```{r, echo=FALSE}
+upset_plot_data <- params$DA_Prot
+upset_plot_data[[da_col]] <- str_replace_all(upset_plot_data[[da_col]], " ", "_")
+final_plot <- upset.plot(upset_plot_data, colname = da_col, cutoff = params$DACutoff)
+domains <- upset_plot_data %>% 
+  dplyr::pull(da_col)
+n_unique_domains <- domains %>%
+    unique() %>% 
+    length()
+
+if (n_unique_domains > 1) {
+  final_plot
+  } else {
+    stringr::str_glue("UpSet plot requires more than 1 unique domain (only {n_unique_domains} present). Try selecting more proteins.")
+    }
+```
+
+## Phylogeny {.tabset}
+
+Visualizations of phyletic patterns, sequence similarity, and evolution of related proteins.
+
+### Sunburst
+
+```{r, echo=FALSE}
+lineage_sunburst_report(params$phylo_sunburst, lineage_column = "Lineage_long_na", type = "sunburst", levels = params$phylo_sunburst_levels)
+```
+
+### Tree
+
+```{r, include=FALSE}
+if (is.null(params$tree_msa_tool)) {
+  tree_msa_tool <- 'ClustalO' ## initialize with default value
+  } else {
+    tree_msa_tool <- params$tree_msa_tool
+    }
+
+if (is.null(params$msa_rep_num)) {
+  msa_rep_num <- 3 ## initialize with default value
+  } else {
+    msa_rep_num <- params$msa_rep_num
+    } 
+```
+
+```{r, include=FALSE}
+if(length(params$rep_accnums) >= 3 ) {
+  rep <- params$rep_accnums[1:msa_rep_num]
+  if (!params$rval_phylo) {
+    seqs <- readAAStringSet(params$app_data@fasta_path)
+    names(seqs) <- sub(" .*", "", names(seqs))
+    query_accession <- params$app_data@df %>% filter(params$app_data@df$QueryName == params$PhyloSelect)
+    query_accession <- unique(query_accession$Query)
+    query <- seqs[query_accession]
+    names(query) <- params$PhyloSelect
+    query <- AAStringSet(query)
+  }
+  # Generate Fasta File
+  rep_fasta_path <- tempfile()
+
+  acc2fa(rep, outpath = rep_fasta_path, "sequential")
+  rename_fasta(rep_fasta_path, rep_fasta_path,
+    replacement_function = map_acc2name,
+    acc2name = params$acc_to_name
+  )
+  if (!params$rval_phylo) {
+    writeXStringSet(query, rep_fasta_path, append = TRUE)
+  }
+
+  rep_msa_path <- tempfile()
+  alignFasta(rep_fasta_path, tree_msa_tool, rep_msa_path)
+}
+```
+
+```{r, echo=FALSE}
+if(length(params$rep_accnums) >= 3 ) {
+  seq_tree(fasta_filepath = rep_msa_path) 
+ } else {"Not enough representative sequences: try changing the 'Reduce By' field."}
+```
+
+### MSA
+
+```{r, include=FALSE}
+if(length(params$rep_accnums) >= 3 ) {
+  if (!params$rval_phylo) {
+    seqs <- readAAStringSet(params$app_data@fasta_path)
+    names(seqs) <- sub(" .*", "", names(seqs))
+    query_accession <- params$app_data@df %>% filter(params$app_data@df$QueryName == params$PhyloSelect)
+    query_accession <- unique(query_accession$Query)
+    query <- seqs[query_accession]
+    names(query) <- params$PhyloSelect
+    query <- AAStringSet(query)
+    }
+
+  # Generate Fasta File
+  rep_fasta_path <- tempfile()
+  acc2fa(rep, outpath = rep_fasta_path, "sequential")
+  rename_fasta(rep_fasta_path, rep_fasta_path,
+    replacement_function = map_acc2name,
+    acc2name = params$acc_to_name
+  )
+  if (!params$rval_phylo) {
+    writeXStringSet(query, rep_fasta_path, append = TRUE)
+  }
+
+  # Call MSA2PDF
+  msa_pdf_path <- tempfile()
+  msa_prefix <- "/data/research/jravilab/molevolvr_app/www/msa_figs/"
+  post_fix <- paste("msa", params$query_pin, params$PhyloSelect, params$msa_reduce_by, ".pdf", sep = "_")
+
+  msa_pdf_path <- paste0(msa_prefix, post_fix)
+
+  msa_pdf(fasta_path = rep_fasta_path, msa_pdf_path)
+  }
+```
+
+```{r msa_pdf, echo = FALSE, out.width = "95%", out.height  = "800px"}
+if(length(params$rep_accnums) >= 3 ) {
+  knitr::include_graphics(msa_pdf_path)
+  } else {"Not enough representative sequences: try changing the 'Reduce By' field."}
+```
diff --git a/inst/report/scripts/MolEvolData_class.R b/inst/report/scripts/MolEvolData_class.R
new file mode 100644
index 00000000..c50e2925
--- /dev/null
+++ b/inst/report/scripts/MolEvolData_class.R
@@ -0,0 +1,751 @@
+# Author(s): Samuel Chen
+# Last modified: 2020
+
+setClass("MolEvolData",
+         slots = list(
+             df = "data.frame",
+             fasta_path = "character",
+             msa_path = "character",
+             ipr_path = "character",
+             queries = "character",
+             cln_path = "character",
+             domainSeqs = "character",
+             domain_ipr_path = "character"
+         )
+)
+setClass("queryData",
+         slots = list(
+             df = "data.frame",
+             fasta_path = "character",
+             msa_path = "character",
+             ipr_path = "character",
+             queries = "character",
+             cln_path = "character"
+         )
+)
+setClass("blastUpload",
+         slots = list(
+             df = "character",
+             seqs = "character"
+         )
+)
+setClass("iprUpload",
+         slots = list(
+             df = "character",
+             seqs = "character"
+         )
+)
+setClass("seqUpload",
+         slots = list(
+             seqs = "character"
+         )
+)
+
+# Group by lineage + DA then take top 20
+top_acc <- function(cln_file, DA_col = "DomArch.Pfam",
+                    lin_col = "Lineage", n = 20) {
+    lin_sym <- sym(lin_col)
+    DA_sym <- sym(DA_col)
+
+    cln <- fread(cln_file, sep = "\t", fill = T)
+
+    grouped <- cln %>%
+        group_by({{ lin_sym }}, {{ DA_sym }}) %>%
+        summarise(count = n()) %>%
+        arrange(-count) %>%
+        filter(!is.na({{ lin_sym }}) & !is.na({{ DA_sym }}))
+
+    top_acc <- character(n)
+    for (r in 1:min(nrow(grouped), n))
+    {
+        l <- (grouped %>% pull({{ lin_sym }}))[r]
+        DA <- (grouped %>% pull({{ DA_sym }}))[r]
+
+        filt <- cln %>% filter({{ lin_sym }} == l & {{ DA_sym }} == DA)
+
+        top <- filt[which(filt$PcPositive == max(filt$PcPositive))[1], ]
+
+        top_acc[r] <- top$AccNum
+    }
+    top_acc <- top_acc[which(top_acc != "")]
+    return(top_acc)
+}
+
+
+combine_files_nopmap <- function(inpath, pattern, outpath,
+                                 delim = "\t", skip = 0,
+                                 col_names) {
+    source_files <- dir(path = inpath, pattern = pattern, recursive = T)
+
+    source_files_path <- paste0(inpath, source_files)
+
+    dt_list <- map(source_files_path, function(x) {
+        if (!grepl("query", x)) {
+            fread(x, sep = delim, skip = skip, fill = T) # , col.names = col_names)
+        }
+    })
+
+    combined <- rbindlist(dt_list, fill = T)
+
+    fwrite(combined, outpath, sep = "\t")
+
+    return(combined)
+}
+
+
+full_analysis_colnames <- c("AccNum", "QueryName", "STitle", "Species.x",
+                            "TaxID.x", "Lineage", "PcPositive", "PcIdentity",
+                            "AlnLength", "SAccNum", "SAllSeqID", "Mismatch",
+                            "GapOpen", "QStart", "QEnd", "QLength", "SStart",
+                            "SEnd", "SLength", "EValue",
+                            "BitScore", "PcPosOrig",
+                            "Name", "ClusterID", "DomArch.Pfam",
+                            "DomArch.SMART",
+                            "DomArch.CDD", "DomArch.TIGRFAM",
+                            "DomArch.Phobius",
+                            "DomArch.Gene3D", "DomArch.TMHMM",
+                            "DomArch.SignalP_EUK",
+                            "DomArch.SignalP_GRAM_NEGATIVE",
+                            "DomArch.SignalP_GRAM_POSITIVE",
+                            "Description", "Length", "TaxID.y",
+                            "Species.y", "SourceDB", "Completeness")
+
+
+process_wrapper_dir <- function(path, pinName, type = "full") {
+    if (type == "full") {
+        if (file.exists(paste0(path, "/cln_combined.tsv")) && file.exists(paste0(path, "/ipr_combined.tsv"))) {
+            query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+            query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+            query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+            query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+            query_data$Query <- query_data$AccNum
+            queries <- unique(query_data$Query)
+            cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+            query_data <- query_data %>%
+                mutate(QueryName = Name) %>%
+                select(QueryName, Species, contains("Lineage"), contains("DomArch"), Query, Name) %>%
+                distinct(QueryName, .keep_all = TRUE)
+            query_wrapper_data <- new("queryData",
+                                      df = query_data, fasta_path = query_sequence_path,
+                                      msa_path = query_msa_path, ipr_path = query_ipr_path, queries = queries, cln_path = cln_path
+            )
+            com_blast_data <- read_tsv(path, "/blast_combined.tsv") %>% arrange(desc(PcPositive))
+            ipr_blast_path <- paste0(path, "/ipr_combined.tsv")
+            queries <- unique(com_blast_data$Query)
+            com_blast_data$Lineage <- as.factor(com_blast_data$Lineage)
+            com_blast_data <- com_blast_data %>% select(QueryName, everything())
+            wrapper_data <- new("MolEvolData",
+                                df = com_blast_data, queries = queries, ipr_path = ipr_blast_path, cln_path = com_blast_path,
+                                fasta_path = query_sequence_path, msa_path = query_msa_path, domainSeqs = "", domain_ipr_path = ""
+            )
+
+            return(list(wrapper_data, query_wrapper_data))
+        } else {
+
+            # Get the query data
+            query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+            query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+            query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+            query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+            query_data$Query <- query_data$AccNum
+            # alignFasta(query_sequence_path, tool = "ClustalO", outpath = query_msa_path)
+            queries <- unique(query_data$Query)
+            cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+            query_data <- query_data %>%
+                mutate(QueryName = Name) %>%
+                select(QueryName, Species, contains("Lineage"), contains("DomArch"), Query, Name) %>%
+                distinct(QueryName, .keep_all = TRUE)
+            query_wrapper_data <- new("queryData",
+                                      df = query_data, fasta_path = query_sequence_path,
+                                      msa_path = query_msa_path, ipr_path = query_ipr_path, queries = queries, cln_path = cln_path
+            )
+
+            query_data <- query_data %>% select(Query, QueryName)
+
+            com_blast_path <- paste0(path, "/blast_combined.tsv")
+            ipr_blast_path <- paste0(path, "/ipr_combined.tsv")
+            com_blast_data <- combine_files_nopmap(paste0(path, "/"),
+                                                   pattern = "*.full_analysis.tsv", skip = 0,
+                                                   col_names = c(), outpath = com_blast_path, delim = "\t"
+            )
+
+            ipr_blast_data <- combine_files_nopmap(paste0(path, "/"),
+                                                   pattern = "*.iprscan_cln.tsv", skip = 0,
+                                                   col_names = ipr_colnames, outpath = ipr_blast_path, delim = "\t"
+            )
+            com_blast_data <- merge(com_blast_data, query_data, by = "Query")
+            com_blast_data <- com_blast_data %>% arrange(desc(PcPositive))
+            fwrite(com_blast_data, com_blast_path, sep = "\t")
+
+            queries <- unique(com_blast_data$Query)
+
+            com_blast_data$Lineage <- as.factor(com_blast_data$Lineage)
+            com_blast_data <- com_blast_data %>% select(QueryName, everything())
+
+            wrapper_data <- new("MolEvolData",
+                                df = com_blast_data, queries = queries, ipr_path = ipr_blast_path, cln_path = com_blast_path,
+                                fasta_path = query_sequence_path, msa_path = query_msa_path, domainSeqs = "", domain_ipr_path = ""
+            )
+
+            return(list(wrapper_data, query_wrapper_data))
+        }
+    } else if (type == "dblast") {
+        query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+        query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+        query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+        query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+        alignFasta(query_sequence_path, tool = "ClustalO", outpath = query_msa_path)
+        cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+        query_data <- query_data %>% mutate(Query = AccNum)
+        query_data <- query_data %>%
+            mutate(QueryName = Name) %>%
+            select(QueryName, Species, contains("Lineage"), contains("DomArch"),
+                   Query, Name, AccNum) %>%
+            distinct(QueryName, .keep_all = TRUE)
+        queries <- unique(query_data$Query)
+        query_wrapper_data <- new("queryData",
+                                  df = query_data,
+                                  fasta_path = query_sequence_path,
+                                  msa_path = query_msa_path,
+                                  ipr_path = query_ipr_path,
+                                  queries = queries, cln_path = cln_path
+        )
+        query_data <- query_data %>% select(Query, QueryName)
+        com_blast_path <- paste0(path, "/blast_combined.tsv")
+        com_blast_data <- combine_files_nopmap(paste0(path, "/"),
+                                               pattern = "*.blast.cln.tsv",
+                                               skip = 0,
+                                               col_names = c(),
+                                               outpath = com_blast_path,
+                                               delim = "\t"
+        )
+        com_blast_data <- merge(com_blast_data, query_data, by = "Query")
+        queries <- unique(com_blast_data$Query)
+        com_blast_data$Lineage <- as.factor(com_blast_data$Lineage)
+        com_blast_data <- com_blast_data %>%
+            select(QueryName, everything()) %>%
+            arrange(desc(PcPositive))
+        wrapper_data <- new("MolEvolData",
+                            df = com_blast_data, queries = queries,
+                            ipr_path = "", cln_path = com_blast_path,
+                            fasta_path = query_sequence_path,
+                            msa_path = query_msa_path, domainSeqs = "",
+                            domain_ipr_path = ""
+        )
+        return(list(wrapper_data, query_wrapper_data))
+    } else if (type == "phylo") {
+        query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+        query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+        query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+        cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+        query_data <- query_data %>% mutate(Query = AccNum)
+        query_data <- query_data %>%
+            mutate(QueryName = Name) %>%
+            select(QueryName, Species, contains("Lineage"), contains("DomArch"),
+                   Query, Name, AccNum) %>%
+            distinct(QueryName, .keep_all = TRUE)
+        queries <- unique(query_data$Query)
+        query_wrapper_data <- new("queryData",
+                                  df = query_data,
+                                  fasta_path = query_sequence_path,
+                                  msa_path = query_msa_path,
+                                  ipr_path = query_ipr_path,
+                                  queries = queries, cln_path = cln_path
+        )
+        wrapper_data <- new("MolEvolData",
+                            df = query_data,
+                            fasta_path = query_sequence_path,
+                            msa_path = query_msa_path,
+                            ipr_path = query_ipr_path,
+                            queries = queries, cln_path = cln_path,
+                            domainSeqs = "", domain_ipr_path = ""
+        )
+        return(list(wrapper_data, query_wrapper_data))
+    } else if (type == "da") {
+        query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+        query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+        query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+        query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+        alignFasta(query_sequence_path, tool = "ClustalO", outpath = query_msa_path)
+        cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+        query_data <- query_data %>%
+            mutate(QueryName = Name) %>%
+            select(QueryName, Species, contains("Lineage"), contains("DomArch"),
+                   Query, Name) %>%
+            distinct(QueryName, .keep_all = TRUE)
+        queries <- unique(query_data$Query)
+        query_wrapper_data <- new("queryData",
+                                  df = query_data,
+                                  fasta_path = query_sequence_path,
+                                  msa_path = query_msa_path,
+                                  ipr_path = query_ipr_path,
+                                  queries = queries, cln_path = cln_path
+        )
+        com_blast_path <- paste0(path, "/blast_combined.tsv")
+        com_blast_data <- combine_files_nopmap(paste0(path, "/"),
+                                               pattern = "*.full_analysis.tsv",
+                                               skip = 0,
+                                               col_names = c(),
+                                               outpath = com_blast_path,
+                                               delim = "\t"
+        )
+        com_blast_data <- merge(com_blast_data, query_data, by = "Query")
+        com_blast_data <- com_blast_data %>%
+            select(QueryName, everything()) %>%
+            arrange(desc(PcPositive))
+        wrapper_data <- new("MolEvolData",
+                            df = com_blast_data, queries = queries,
+                            ipr_path = "", cln_path = com_blast_path,
+                            fasta_path = query_sequence_path,
+                            msa_path = query_msa_path,
+                            domainSeqs = "", domain_ipr_path = ""
+        )
+        return(list(wrapper_data, query_wrapper_data))
+    } else {
+        stop("Unrecognized type. Please use 'full', 'dblast', 'phylo', or 'da'.")
+    }
+}
+
+drop_empty <- function(df) {
+    for (c in colnames(df)) {
+        if (all(is.na(df[, ..c])) || all(df[, ..c] == "")) {
+            df <- df %>% select(-contains(c))
+        }
+    }
+    return(df)
+}
+
+
+clean_fetched <- function(df) {
+    df <- as.data.frame(df)
+    # df = df %>% cleanup_species()
+
+    # Cleanup domarchs
+    cols <- colnames(df)
+    for (c in cols) {
+        if (grepl("^DomArch", c)) {
+            # Repeats
+            old <- c
+            new <- paste0(c, ".repeats")
+            df <- df %>% cleanup_domarch(
+                old = old, new = new,
+                domains_rename = NULL, # domains_rename,
+                domains_keep = NULL, # filter applied to only ClustName for now.
+                domains_ignore = NULL, # !! should check and remove soon!
+                repeat2s = FALSE,
+                remove_tails = F, # new! check below if it works!
+                remove_empty = F
+            )
+
+            new <- c
+            df <- df %>% cleanup_domarch(
+                old = old, new = new,
+                domains_rename = NULL, # domains_rename,
+                domains_keep = NULL, # filter applied to only ClustName for now.
+                domains_ignore = NULL, # !! should check and remove soon!
+                repeat2s = TRUE,
+                remove_tails = F, # new! check below if it works!
+                remove_empty = F
+            )
+        }
+    }
+
+
+    return(df)
+}
+
+# Description
+# validation functions for accession numbers/headers for FASTA (validate_accnum_fasta)
+# & accession number input (validate_accnum) submission types.
+
+library(Biostrings)
+library(httr)
+library(httr2)
+library(rentrez)
+library(shiny)
+
+.get_accnum_from_fasta <- function(biostrings_aa_string_set, verbose = FALSE) {
+    # parsing/cleaning accession numbers using the same methods as
+    # `upstream_scripts/00_submit_full.R`'s `get_sequences()` function
+    accnums <- c()
+    for (header in names(biostrings_aa_string_set)) {
+        # case 1 UnitProtKB formatted FASTA
+        # header: https://www.uniprot.org/help/fasta-headers
+        if (grepl("\\|", header)) {
+            accnum <- unlist(strsplit(header, "\\|"))[2]
+            # case 2 NCBI formatted FASTA
+            # header: https://www.ncbi.nlm.nih.gov/genbank/fastaformat
+        } else if (grepl(" ", header)) {
+            accnum <- unlist(strsplit(header, " "))[1]
+            # case 3 neither delimiter present; use the whole header
+        } else {
+            accnum <- header
+        }
+        # print debug info
+        if (verbose == TRUE) {
+            cat("header:", header, "\n")
+            cat("accnum_parsed:", accnum, "\n")
+        }
+        accnums <- append(accnums, accnum)
+    }
+    print(accnums)
+    return(accnums)
+}
+
+validate_accnum_fasta <- function(text) {
+    # INPUT: text from input object that houses FASTA data
+    # validate the headers/accnums for FASTA submission
+    # Return:
+    #   T: when no duplicate accnums after parsing
+    #   F: when duplicate accnums are present after parsing
+
+    # TRY write tmp file for biostrings parsing
+    path <- tryCatch(
+        expr = {
+            path <- tempfile()
+            write(text, path)
+            path
+        },
+        error = function(e) {
+            NULL
+        }
+    )
+    if (is.null(path)) {
+        cat("failed to write temp fasta during accnum validation\n", file = stderr())
+        return(FALSE) # validation fail
+    }
+
+    # TRY read seq
+    fasta <- tryCatch(
+        expr = {
+            fasta <- Biostrings::readAAStringSet(path)
+            fasta
+        },
+        error = function(e) {
+            NULL
+        },
+        finally = {
+            unlink(path)
+        }
+    )
+    if (is.null(fasta)) {
+        cat("failed to read sequence during accnum validation\n", file = stderr())
+        return(FALSE) # validation fail
+    }
+
+    # parse accnums from fasta
+    accnums <- .get_accnum_from_fasta(fasta)
+    tb_accnum_counts <- tibble("frequencies" = table(accnums))
+    # check for duplicates
+    if (any(tb_accnum_counts$frequencies > 1)) {
+        cat("duplicate headers found during fasta validation\n", file = stderr())
+        return(FALSE) # validation fail
+    } else {
+        return(TRUE)
+    }
+}
+
+
+#' Test whether a single accession returns a valid protein from Entrez
+is_accnum_valid_entrez <- function(accnum, verbose = FALSE) {
+    # empty accnum wil not raise an error from efetch, so test for this first
+    if (nchar(accnum) <= 0) {if (verbose) {warning("empty accnum")}; return(FALSE)}
+
+    # try performing a POST of the accnum, followed by a GET request for a protein
+    # sequence
+    # rentrez::entrez_fetch() returns an error when there's no protein,
+    # so we're using try statements to handle this and return FALSE upon error
+    result <- tryCatch(
+        expr = {
+            result <- rentrez::entrez_fetch(db = "protein", id = accnum, rettype = "fasta")
+            if (verbose) {print(result)}
+            TRUE
+        },
+        error = function(e) {if (verbose) {print(e)}; FALSE}
+    )
+    return(result)
+}
+
+#' Test whether a single accession returns a valid protein from EBI
+is_accnum_valid_ebi <- function(accnum, verbose = FALSE) {
+    # validation: ensure there's some text to parse
+    if (nchar(accnum) <= 0) {if (verbose) {warning("empty accnum")}; return(FALSE)}
+    # construct a httr2 request to POST an accession number, then GET the fasta
+    url_base <- "https://www.ebi.ac.uk/proteins/api/proteins?accession="
+    url_protein <- paste0(url_base, accnum)
+    req <- httr2::request(url_protein) |>
+        httr2::req_headers("Accept" = "text/x-fasta", "Content-type" = "application/x-www-form-urlencoded")
+
+    # wrap in try since a failed HTTP request will raise an R error
+    try(httr2::req_perform(req))
+    # get the HTTP response code
+    resp <- httr2::last_response()
+    # assign result based on HTTP response code ('200' is a success)
+    result <- ifelse(resp$status == 200, TRUE, FALSE)
+    if (verbose) {
+        msg <- stringr::str_glue(
+            "EBI protein query results:\n",
+            "\taccnum: {accnum}\t validation result: {result}\n"
+        ) |> print()
+    }
+    return(result)
+}
+
+#' Perform a series of API reqs using NCBI entrez to validate accession numbers
+perform_entrez_reqs <- function(accnums, verbose = FALSE, track_progress = FALSE) {
+    # API guidelines docs
+    # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html
+    # entrez recommends no more than 3 POSTs per second
+    # simple method, sleep for 1 second after every 3rd POST
+    i <- 0
+    results <- vapply(
+        X = accnums,
+        FUN = function(accnum) {
+            result <- is_accnum_valid_entrez(accnum, verbose = TRUE)
+            i <<- i + 1L
+            if (i >= 3L) {Sys.sleep(1); print('sleeping for entrez API reqs'); i <<- 0L}
+            if (track_progress) {incProgress(1)}
+            result
+        },
+        FUN.VALUE = logical(1)
+    )
+    return(results)
+}
+
+perform_ebi_reqs <- function(accnums, verbose = FALSE, track_progress = FALSE) {
+    # API guidelines docs
+    # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html
+    # EBI allows 200 POSTs per second
+    # simple method, sleep for 1 second after every 200th POST
+    i <- 0
+    results <- vapply(
+        X = accnums,
+        FUN = function(accnum) {
+            result <- is_accnum_valid_ebi(accnum, verbose = TRUE)
+            i <<- i + 1L
+            if (i >= 200L) {Sys.sleep(1); i <<- 0L}
+            if (track_progress) incProgress(1)
+            result
+        },
+        FUN.VALUE = logical(1)
+    )
+    return(results)
+}
+
+#' Validate accession numbers from MolEvolvR user input
+validate_accnum <- function(text, verbose = FALSE, track_progress = FALSE, n_steps = integer()) {
+    # API guidelines docs
+    # entrez https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Usage_Guidelines_and_Requiremen
+    # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html
+
+    # get accnums from three possible delimiters
+    # 1. comma separated values with any amount of space between them
+    # 2. one or more space delimiter
+    # 3. "\n" (newline) delimiter
+    accnums <- unlist(strsplit(text, "\\s*,\\s*|\\s+|\\n|,"))
+
+    warning()
+
+    if (track_progress) {incProgress(1 / n_steps,
+        message = "Testing for duplicate accession numbers . . .")}
+    # fail fast for duplicates
+    if (any(duplicated(accnums))) {
+        warning("duplicate accesion numbers found")
+        if (track_progress) {setProgress(n_steps,
+            message = "Duplicate accession numbers found")}
+        return(FALSE)
+    }
+    if (track_progress) {incProgress(1 / n_steps,
+        message = "Please wait (searching NCBI's protein database) . . .")}
+    # entrez API
+    entrez_results <- perform_entrez_reqs(accnums, verbose = verbose)
+    if (verbose) {
+        stringr::str_glue("rentrez
+                results:\n\t{paste0(entrez_results, collapse=",")}\n") |>
+            print()
+    }
+
+    # EBI API
+    # if all of entrez resulted in success, then skip EBI. Else, try EBI
+    if (!all(entrez_results)) {
+        ebi_results <- perform_ebi_reqs(accnums, verbose = verbose)
+        if (verbose) {
+            stringr::str_glue("ebi results:\n\t{paste0(ebi_results,
+                              collapse=",")}\n") |>
+                print()
+        }
+        if (track_progress) {incProgress(1 / n_steps, message = "Please wait
+            (searching EBI's protein database) . . .")}
+        # OR test each accnum result across both dbs
+        final_results <- ebi_results | entrez_results
+    } else {
+        if (track_progress) {setProgress(n_steps, message = "All accession
+                                         numbers validated")}
+        final_results <- entrez_results
+    }
+    failed_accnums <- accnums[which(!final_results)]
+    if (length(failed_accnums) >= 1) {
+        msg <- stringr::str_glue(
+            "The following accnums failed validation:\n",
+            "\t{paste0(failed_accnums, collapse=",")}\n"
+        )
+        warning(msg)
+        if (track_progress) {setProgress(n_steps, message = "Accession number
+                                         validation failed")}
+    }
+    setProgress(n_steps, message = "Accession number validation finished")
+    return(
+        list(
+            "validation_result" = all(final_results),
+            "failed_accnums" = failed_accnums
+        )
+    )
+}
+
+#===============================================================================
+# Description
+#   validation function for evalue input
+#===============================================================================
+validate_evalue <- function(input_value) {
+    is_valid_evalue <- is.numeric(input_value) && input_value != 0
+    return(is_valid_evalue)
+}
+
+#===============================================================================
+# Author(s): JK
+# Last modified: 2023_06
+#===============================================================================
+
+library("Biostrings")
+library("tidyverse")
+#-------------------------------------------------------------------------------
+.guess_seq_type <- function(single_fasta, dna_guess_cutoff = 0.9, other_guess_cutoff = 0.5) {
+    tb <- as_tibble(alphabetFrequency(single_fasta))
+    n_other <- if ("other" %in% colnames(tb)) sum(unlist(tb["other"])) else 0
+
+    aa_cols <- setdiff(Biostrings::AA_ALPHABET, c("*", "."))
+    tb_dna <- tb %>% select(all_of(Biostrings::DNA_ALPHABET))
+    tb_aa <- tb %>% select(all_of(aa_cols))
+
+    total <- nchar(single_fasta)
+    est_dna_prop <- sum(unlist(tb_dna)) / total
+    est_aa_prop <- sum(unlist(tb_aa)) / total
+    other_prop <- n_other / total
+    cat(
+        names(single_fasta), "\n",
+        "estimated DNA alphabet proportion:", est_dna_prop, "\n",
+        "estimated AA alphabet proportion:", est_aa_prop, "\n",
+        "'other' alphabet proportion:", other_prop, "\n"
+    )
+
+    if (est_dna_prop >= dna_guess_cutoff) {
+        guess <- "DNA"
+    } else if (other_prop >= other_guess_cutoff) {
+        guess <- NA
+    } else {
+        guess <- "AA"
+    }
+    cat("Guess:", guess, "\n")
+    return(guess)
+}
+
+.validate_seq_body <- function(text) {
+    # convert string to single letter character vector
+    individual_chars <- unlist(strsplit(text, ""))
+    # return the characters from input that are NOT in the AA_ALPHABET
+    invalid_chars <- setdiff(individual_chars, Biostrings::AA_ALPHABET)
+    n_invalid_chars <- length(invalid_chars)
+
+    # 0 length character vector means the sequence contains
+    # characters that are all present in the AA_ALPHABET
+    # >0 length character vector means there are some characters
+    # which are NOT found in the AA_ALPHABET
+    if (n_invalid_chars == 0) {
+        return(TRUE)
+    } else {
+        warning("A sequence contains characters not found in the AA_ALPHABET")
+        return(FALSE)
+    }
+}
+
+#-------------------------------------------------------------------------------
+validate_fasta <- function(fasta_path, .type = "AA") {
+    # handle case of fasta being unreadable/unrecognized format
+    fasta <- tryCatch(
+        expr = Biostrings::readAAStringSet(fasta_path),
+        error = function(e) {
+            warning(paste0("error: failed to read sequence from: ", fasta_path))
+            return(NULL)
+        }
+    )
+    # failure to read yields quick return
+    if (is.null(fasta)) {
+        return(FALSE) # validation fail
+    }
+
+    # require at least some characters for the sequence
+    if (!(sum(nchar(unlist(fasta))) >= 1)) {
+        warning(paste0("fasta does not have any sequence characters"))
+        return(FALSE) # validation fail
+    }
+    print(fasta)
+
+    ### validate sequence BODY
+    # iteratively return boolean value for valid seq body (FALSE = invalid)
+    seq_body_validations <- c()
+    for (i in seq_along(1:length(fasta))) {
+        seq_body <- as.character(fasta[i])
+        seq_body_validations <- c(.validate_seq_body(seq_body), seq_body_validations)
+    }
+    # require all sequences to have chars only in AA_ALPHABET
+    if (!(all(seq_body_validations))) {
+        warning("At least one sequence has non-IUPAC AA characters")
+        return(FALSE) # validation fail
+    }
+
+    ### validate sequence TYPE
+    # iteratively guess the type (DNA, AA, or other of each seq in the FASTA file)
+    seq_types <- c()
+    for (idx in seq_along(1:length(fasta))) {
+        seq_types <- c(.guess_seq_type(fasta[idx]), seq_types)
+    }
+
+    switch(.type,
+           "DNA" = {
+               anti_type <- "AA"
+           },
+           "AA" = {
+               anti_type <- "DNA"
+           }
+    )
+
+    # if all seqs are of desired .type, only then TRUE
+    if (sum(is.na(seq_types)) != 0) {
+        return(FALSE) # validation fail
+    } else if (anti_type %in% seq_types) {
+        return(FALSE) # validation fail
+    } else {
+        return(TRUE) # validation success
+    }
+}
+
+# Author(s): Samuel Chen, Lo Sosinski, Joe Burke
+# Last modified: 2021
+
+alpha_numeric <- c(0:9, letters, LETTERS)
+
+rand_string <- function(length, characters = alpha_numeric, post_fix = "", ignorelist = c()) {
+    str <- NULL
+
+    while (is.null(str) || (str %in% ignorelist)) {
+        chars <- sample(characters, size = length, replace = TRUE)
+        str <- paste0(paste(chars, collapse = ""), post_fix)
+    }
+
+    return(str)
+}
+strsplit_vect <- function(strings, pattern = "", pos) {
+    split_strings <- map(strsplit(strings, "_"), function(x) x[1]) %>% unlist()
+
+    return(split_strings)
+}
diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R
new file mode 100644
index 00000000..c8b4301f
--- /dev/null
+++ b/inst/report/scripts/generate_report.R
@@ -0,0 +1,668 @@
+# Author(s): Awa Synthia
+# Last modified: 2024
+
+# get fasta of pathogen and/or drug
+get_card_data <- function(pathogen = NULL, drug = NULL) {
+  destination_dir <- "CARD_data"
+  # Check if CARD data exists
+  if (!dir.exists(destination_dir)) {
+    dir.create(destination_dir)
+  }
+  if (!file.exists("CARD_data/aro_index.tsv")) {
+    # Step 1: Download CARD Data
+    download.file("https://card.mcmaster.ca/latest/data", "card_data.tar.bz2")
+    #unzip("card_data.tar.bz2", exdir = "CARD_data")
+    system(paste("tar -xjf card_data.tar.bz2 -C", destination_dir))
+  }
+
+  # Step 2: Open ARO_index.tsv
+  aro_data <- read.delim("CARD_data/aro_index.tsv", sep = "\t", header = TRUE)
+
+  # Step 3: Map CARD Short Name
+  antibiotics <- read.delim("CARD_data/shortname_antibiotics.tsv",
+                            sep = "\t", header = TRUE)
+  pathogens <- read.delim("CARD_data/shortname_pathogens.tsv",
+                          sep = "\t", header = TRUE)
+
+  aro_data <- aro_data %>%
+    mutate(
+      pathogen = str_extract(CARD.Short.Name, "^[^_]+"),
+      gene = str_extract(CARD.Short.Name, "(?<=_)[^_]+"),
+      drug = str_extract(CARD.Short.Name, "(?<=_)[^_]+$")
+    ) %>%
+    left_join(pathogens, by = c("pathogen" = "Abbreviation")) %>%
+    left_join(antibiotics, by = c("drug" = "AAC.Abbreviation"))
+
+  # Sort names
+  aro_data <- aro_data %>%
+    arrange(Pathogen, Molecule) %>%
+    group_by(Pathogen, Molecule)
+
+  # Filter data based on user input
+  filtered_data <- aro_data
+
+  if (!is.null(pathogen)) {
+    filtered_data <- filtered_data %>% filter(Pathogen == !!pathogen)
+  }
+
+  if (!is.null(drug)) {
+    filtered_data <- filtered_data %>% filter(Molecule == !!drug)
+  }
+
+  # Check if filtered data is empty
+  if (nrow(filtered_data) == 0) {
+    stop("No data found for the specified pathogen and drug.")
+  }
+
+  # Extract protein accessions
+  accessions <- filtered_data$Protein.Accession
+
+  # Function to fetch FASTA sequence from NCBI
+  get_fasta <- function(accession) {
+    entrez_fetch(db = "protein", id = accession, rettype = "fasta")
+  }
+
+  # Download sequences
+  fasta_sequences <- lapply(accessions, get_fasta)
+
+  # Write sequences to a file
+  writeLines(unlist(fasta_sequences), "filtered_proteins.fasta")
+
+  return("FASTA sequences downloaded to 'filtered_proteins.fasta'.")
+}
+
+# Run analysis
+run_analysis <- function(
+        dupload_type = "Fasta",
+        evalue = 0.00001,
+        accnum_fasta_input = "",
+        file_paths = list(accnum = NULL, fasta = tempfile(), msa = tempfile(),
+                          blast = NULL, iprscan = NULL),
+        blast_db = "refseq_protein",
+        blast_hits = 100,
+        blast_eval = 0.0001,
+        acc_homology_analysis = TRUE,
+        acc_da_analysis = TRUE,
+        acc_phylogeny_analysis = FALSE,
+        report_template_path = "/report/report_template.Rmd",
+        output_file = file.path(tempdir(), "report.html"),
+        DASelect = "All",
+        mainSelect = NULL,
+        PhyloSelect = NULL,
+        q_heatmap_select = "All",
+        DACutoff = 95,
+        GCCutoff = 0.5,
+        query_select = NULL,
+        query_iprDatabases = NULL,
+        query_iprVisType = NULL, tree_msa_tool = "ClustalO",
+        levels = 2,
+        DA_Col = "DomArch.Pfam",
+        msa_rep_num = NULL,
+        msa_reduce_by = "Species",
+        rval_phylo = FALSE,
+        ...
+
+) {
+
+    ##### Initialize Variables and Classes #####
+    blast_upload_data <- new("blastUpload", df = "", seqs = "")
+    ipr_upload_data <- new("iprUpload", df = "", seqs = "")
+    sequence_upload_data <- new("seqUpload", seqs = "")
+    data <- new("MolEvolData", msa_path = tempfile(),
+                fasta_path = file_paths$fasta)
+    app_data <- new("MolEvolData", msa_path = tempfile(),
+                    fasta_path = file_paths$fasta)
+    query_data <- new("queryData")
+
+    if (length(dupload_type) != 1) {
+        stop("dupload_type must be a single value.")
+    }
+
+    # Reset default analysis function
+    resetSettings <- function() {
+        acc_homology_analysis <<- FALSE
+        acc_da_analysis <<- FALSE
+        acc_phylogeny_analysis <<- FALSE
+        domain_split <<- FALSE
+    }
+
+    fasta_set <- c("Fasta", "AccNum", "MSA")
+
+    # Update settings based on upload type
+    updateUploadType <- function(dupload_type) {
+        switch(dupload_type,
+               "Fasta" = {
+                   resetSettings()
+                   acc_homology_analysis <<- TRUE
+                   acc_da_analysis <<- TRUE
+               },
+               "AccNum" = {
+                   resetSettings()
+                   acc_homology_analysis <<- TRUE
+                   acc_da_analysis <<- TRUE
+               },
+               "MSA" = {
+                   resetSettings()
+                   acc_homology_analysis <<- TRUE
+                   acc_da_analysis <<- TRUE
+               },
+               "BLAST Output" = {
+                   resetSettings()
+                   acc_phylogeny_analysis <<- TRUE
+               },
+               "InterProScan Output" = {
+                   resetSettings()
+                   acc_da_analysis <<- TRUE
+               }
+        )
+    }
+
+    updateUploadType(dupload_type)
+
+    ####### File Upload Functions ########
+
+    # Update function to return modified object
+    uploadAccNumFile <- function(file_path, seq_obj) {
+        accnum_data <- read_file(file_path)
+        seq_obj@seqs <- accnum_data
+        return(seq_obj)
+    }
+
+    uploadFastaFile <- function(file_path, seq_obj) {
+        fasta_data <- read_file(file_path)
+        seq_obj@seqs <- fasta_data
+        return(seq_obj)
+    }
+
+    uploadMSAFile <- function(file_path, seq_obj) {
+        msa_data <- read_file(file_path)
+        seq_obj@seqs <- msa_data
+        return(seq_obj)
+    }
+
+    uploadBlastFile <- function(file_path, blast_obj) {
+        blast_obj@df <- file_path
+        return(blast_obj)
+    }
+
+    uploadIPRScanFile <- function(file_path, ipr_obj) {
+        ipr_obj@df <- file_path
+        return(ipr_obj)
+    }
+
+    # Now modify the calling section to reassign the modified objects
+    if (!is.null(file_paths$accnum)) {
+        sequence_upload_data <- uploadAccNumFile(file_paths$accnum,
+                                                 sequence_upload_data)
+    }
+    if (!is.null(file_paths$fasta)) {
+        sequence_upload_data <- uploadFastaFile(file_paths$fasta,
+                                                sequence_upload_data)
+    }
+    if (!is.null(file_paths$msa)) {
+        sequence_upload_data <- uploadMSAFile(file_paths$msa,
+                                              sequence_upload_data)
+    }
+    if (!is.null(file_paths$blast)) {
+        blast_upload_data <- uploadBlastFile(file_paths$blast,
+                                             blast_upload_data)
+    }
+    if (!is.null(file_paths$iprscan)) {
+        ipr_upload_data <- uploadIPRScanFile(file_paths$iprscan,
+                                             ipr_upload_data)
+    }
+
+    # Validation of inputs
+    fasta_data <- read_file(file_paths$fasta)
+    validate_and_process_inputs <- function(evalue, fasta_data) {
+        if (!validate_evalue(evalue)) {
+            return("Error: A numeric E-value is required. Please set a valid
+                   value (e.g., 0.0001).")
+        }
+
+
+        if (!validate_accnum_fasta(fasta_data)) {
+            return("Error: Input for AccNum/Fasta cannot be empty or invalid.")
+        }
+
+        sequence_vector <- unlist(strsplit(fasta_data, "\n"))
+        return(list(message = "Inputs are valid!", sequences = sequence_vector))
+    }
+
+    validation_result <- validate_and_process_inputs(evalue, fasta_data)
+    if (is.character(validation_result)) {
+        return(validation_result)
+    }
+
+    # Phylogenetic Analysis Validation
+    phylo <- acc_phylogeny_analysis
+    if (phylo) {
+        # Validate phylogenetic analysis based on upload type
+        is_valid_phylo <- switch(
+            dupload_type,
+            "Fasta" = {
+                str_count(string = sequence_upload_data@seqs, ">") > 1
+            },
+            "AccNum" = {
+                accnums <- sequence_upload_data@seqs |>
+                    strsplit("\\s*,\\s*|\\s+|\\n|,") |>
+                    unlist()
+                length(accnums) > 1
+            },
+            "MSA" = {
+                str_count(string = sequence_upload_data@seqs, ">") > 1
+            },
+            FALSE  # Default to FALSE if no valid type is found
+        )
+
+        if (!is_valid_phylo) {
+            return("Error: At least two sequences/identifiers are required for
+                   phylogenetic analysis.")
+        }
+    }
+    # Fasta-like submissions
+    if (dupload_type %in% fasta_set) {
+        # Can have any combination of select options
+        type <- ""
+        script <- ""
+        postfix <- ""
+
+        if (acc_homology_analysis && !phylo && acc_da_analysis) {
+            type <- "full"
+            postfix <- "full"
+        }
+        # Phylogenetic analysis, do full script but skip blast
+        else if (phylo && acc_da_analysis) {
+            type <- "phylo"
+            postfix <- "phylo"
+        }
+        else if (acc_da_analysis) {
+            type <- "da"
+            postfix <- "da"
+        }
+        else if (acc_homology_analysis) {
+            # Only run BLAST
+            type <- "dblast"
+            postfix <- "dblast"
+        }
+        else {
+            # Something went wrong, throw error
+          stop("Please select one of the above analyses (full, phylo, da,
+               dblast) to run.")
+
+            return()
+        }
+    }
+
+    # After uploading the sequence data, you would check the uploaded data
+    if (sequence_upload_data@seqs == "") {
+        stop("Error: Please upload a protein sequence")
+    }
+    OUT_PATH <- getwd()
+    unavailable_pins <- list.files(OUT_PATH)
+    unavailable_pins <- strsplit_vect(unavailable_pins, pattern = "_")
+    pinName <- rand_string(length = 6, post_fix = "", ignorelist = unavailable_pins)
+    pin_id <- strtrim(pinName, 6)
+
+    dir <- paste0(OUT_PATH, pin_id, "_", postfix)
+    path <- paste0(dir, "/", pin_id, ".fa")
+    # Adding validation logic based on upload type
+    system(paste0("mkdir ", dir), wait = TRUE)
+    switch(dupload_type,
+           "Fasta" = {
+               # Validate sequence limit
+               if (str_count(sequence_upload_data@seqs, ">") > 200) {
+                   stop("Error: Only submissions with less than 200 proteins
+                   are accepted at this time. For analyses with more than 200 p
+                        roteins please contact janani.ravi@cuanschutz.edu.")
+               }
+
+               # Validate accession numbers for FASTA submission
+               if (!(validate_accnum_fasta(sequence_upload_data@seqs))) {
+                   stop("Error: Please adjust the FASTA headers. Ensure a header
+                        line for each sequence, no duplicate header names, and
+                        no duplicate protein accession numbers.")
+               }
+
+               # TRY load fasta for fasta validation
+               is_valid_aa_fasta <- tryCatch(
+                   expr = {
+                       tmp_file <- tempfile()
+                       writeLines(sequence_upload_data@seqs, tmp_file)
+                       validate_fasta(tmp_file)
+                   },
+                   error = function(e) {
+                       warning("Error: Failed to run input FASTA verification")
+                       return(FALSE)  # Return FALSE if an error occurs
+                   },
+                   finally = {
+                       unlink(tmp_file)
+                   }
+               )
+
+               # Validate AA fasta
+               if (!(is_valid_aa_fasta)) {
+                   stop("Error: The FASTA input could not be recognized as valid
+                   Amino Acid (AA) sequence(s). MolEvolvR only accepts FASTA
+                   formatted AA/protein sequences as input (not DNA/RNA).
+                   For a short reference on FASTA format, review this
+                   https://blast.ncbi.nlm.nih.gov/doc/blast-topics/short NCBI guide.")
+               }
+
+               write(sequence_upload_data@seqs, path)
+           },
+           "AccNum" = {
+               # Convert Acc to FASTA
+               accnum_vect <- unlist(strsplit(sequence_upload_data@seqs, "\\s*,\\s*|\\s+|\\n|,"))
+               if (length(accnum_vect) > 200) {
+                   stop("Error: Only submissions with less than 200 proteins are
+                   accepted at this time. For analyses with more than 200
+                        proteins please contact janani.ravi@cuanschutz.edu.")
+               }
+
+               # if an error is raised, validation fails
+               validation_results <- purrr::map_lgl(
+                   accnum_vect,
+                   function(accnum) {
+                       tryCatch(
+                           expr = {
+                               tmp <- tempfile(
+                                   pattern = paste0("molevolvr_acccnum_validation-", accnum, "-", "XXXXX"),
+                                   fileext = ".fa"
+                               )
+                               acc2fa(accnum, tmp)
+                               readAAStringSet(tmp)
+                               TRUE
+                           },
+                           error = function(e) {
+                               FALSE
+                           },
+                           finally = {
+                               suppressWarnings(sink())
+                               unlink(tmp)
+                           }
+                       )
+                   }
+               )
+
+               # If any accession numbers fail, halt submission
+               if (!all(validation_results)) {
+                   stop("Error: MolEvolvR could not locate sequences for the
+                   following accession numbers:
+                   [ {paste0(accnum_vect[which(!validation_results)], collapse = ', ')} ]
+      Please try submitting FASTA sequences instead.")
+               } else {
+                   # Write a multifasta
+                   acc2fa(accnum_vect, outpath = path)
+               }
+           },
+           "MSA" = {
+               # Validate sequence limit
+               if (str_count(sequence_upload_data@seqs, ">") > 200) {
+                   stop("Error: Only submissions with less than 200 proteins
+                   are accepted at this time. For analyses with more than 200
+                        proteins please contact janani.ravi@cuanschutz.edu.")
+               }
+
+               msa_temp <- str_replace_all(sequence_upload_data@seqs, "-", "")
+               write(msa_temp, path)
+           }
+    )
+
+    phylo <- if_else(phylo, "TRUE", "FALSE")
+
+    # Clean FASTA headers
+    fasta <- Biostrings::readAAStringSet(filepath = path)
+    headers_original <- names(fasta)
+    headers_accnum <- names(fasta) |> purrr::map_chr(function(x) extractAccNum(x))
+    fasta <- cleanFAHeaders(fasta)
+    headers_cleaned <- names(fasta)
+
+    # Write a table to map original accnums to their cleaned version
+    readr::write_tsv(
+        x = tibble::tibble(
+            "header_original" = headers_original,
+            "header_accnum" = headers_accnum,
+            "header_clean" = headers_cleaned
+        ),
+        file = file.path(dir, "query-fasta_header-map.tsv")
+    )
+
+    # Remove '*' characters which are incompatible with interproscan
+    fasta <- gsub(pattern = "\\*", replacement = "X",
+                  x = fasta) |> Biostrings::AAStringSet()
+    Biostrings::writeXStringSet(x = fasta, filepath = path)
+
+    if (domain_split) {
+        submit_split_by_domain(
+            dir = dir,
+            sequences = path,
+            DB = blast_db,
+            NHITS = blast_hits,
+            EVAL = blast_eval,
+            phylo = phylo,
+            type = type,
+            job_code = pin_id,
+        )
+    } else {
+        submit_full(
+            dir = dir,
+            sequences = path,
+            DB = blast_db,
+            NHITS = blast_hits,
+            EVAL = blast_eval,
+            phylo = phylo,
+            type = type,
+            job_code = pin_id,
+        )
+    }
+
+    # Additional handling for different upload types
+    if (dupload_type == "BLAST Output") {
+        dir <- paste0(OUT_PATH, pin_id, "_blast")
+        if (blast_upload_data@df == "") {
+            stop("Error: Please upload a BLAST file.")
+        }
+        if (blast_upload_data@seqs == "" && !blast_ncbi_check) {
+            stop("Error: Please provide a file containing sequences or check
+                 the box to use fetch sequences for NCBI accession numbers.")
+        }
+
+        if (tools::file_ext(blast_upload_data@df) == "tsv" || blast_upload_data@df == EX_BLASTOUTPUT) {
+            data <- read_tsv(blast_upload_data@df, col_names = web_blastp_hit_colnames)
+        } else {
+            data <- read_csv(blast_upload_data@df, col_names = web_blastp_hit_colnames)
+        }
+
+        if (nrow(data) > 5005) {
+            stop("Error: BLAST output submissions are limited to 5000 proteins
+            at the moment. For analyses with more than 5000 proteins please
+                 contact janani.ravi@cuanschutz.edu.")
+        }
+
+        system(paste0("mkdir ", dir), wait = TRUE)
+        blast_path <- paste0(dir, "/", pin_id, ".wblast.tsv")
+        write_tsv(data, blast_path, col_names = FALSE)
+
+        queries <- data$Query %>% unique()
+        writeLines(queries, paste0(dir, "/", "accs.txt"))
+
+        if (blast_ncbi_check) {
+            submit_blast(
+                dir = dir,
+                blast = paste0(dir, "/", pin_id, ".wblast.tsv"),
+                seqs = paste0(dir, "/", "seqs.fa"),
+                ncbi = TRUE,
+                job_code = pin_id,
+                submitter_email = notify_email,
+                advanced_options = isolate(rvals_advanced_options |> reactiveValuesToList() |> unlist())
+            )
+        } else {
+            seqs <- read_file(blast_upload_data@seqs)
+            writeLines(seqs, paste0(dir, "/", "seqs.fa"))
+            submit_blast(
+                dir = dir,
+                blast = paste0(dir, "/", pin_id, ".wblast.tsv"),
+                seqs = paste0(dir, "/", "seqs.fa"),
+                ncbi = FALSE,
+                job_code = pin_id,
+                submitter_email = notify_email,
+                advanced_options = isolate(rvals_advanced_options |> reactiveValuesToList() |> unlist())
+            )
+        }
+
+    } else if (dupload_type == "InterProScan Output") {
+        dir <- paste0(OUT_PATH, pin_id, "_ipr")
+        path <- paste0(dir, "/", pin_id, "_ipr.tsv")
+        if (ipr_upload_data()@df == "") {
+            stop("Error: Please upload an interproscan file.")
+        }
+        if (ipr_upload_data()@seqs == "" && !ipr_ncbi_check) {
+            stop("Error: Please provide a file containing sequences or check the
+                 box to use fetch sequences for NCBI accession numbers.")
+        }
+
+        ipr <- read_tsv(ipr_upload_data()@df, col_names = FALSE)
+        if (nrow(ipr) > 200) {
+            stop("Error: Only submissions with less than 200 proteins are
+            accepted at this time. For analyses with more than 200 proteins
+                 please contact janani.ravi@cuanschutz.edu.")
+        }
+
+        system(paste0("mkdir ", dir), wait = TRUE)
+        writeLines("AccNum\tSeqMD5Digest\tSLength\tAnalysis\tDB.ID\tSignDesc\tStartLoc\tStopLoc\tScore\tStatus\tRunDate\tIPRAcc\tIPRDesc\tGOTerms\tExtra\n", path)
+        write_tsv(ipr, path, col_names = FALSE, append = TRUE)
+
+        ncbi <- if_else(ipr_ncbi_check, TRUE, FALSE)
+        blast <- if_else(acc_homology_analysis, TRUE, FALSE)
+
+        if (ncbi) {
+            submit_ipr(
+                dir = dir,
+                ipr = path,
+                blast = acc_homology_analysis,
+                seqs = paste0(dir, "/", "seqs.fa"),
+                ncbi = TRUE,
+                DB = blast_db,
+                NHITS = blast_hits,
+                EVAL = blast_eval,
+            )
+        } else {
+            seqs <- read_file(ipr_upload_data()@seqs)
+            writeLines(seqs, paste0(dir, "/", "seqs.fa"))
+            submit_ipr(
+                dir = dir,
+                ncbi = FALSE,
+                ipr = path,
+                blast = acc_homology_analysis,
+                seqs = paste0(dir, "/", "seqs.fa"),
+                DB = blast_db,
+                NHITS = blast_hits,
+                EVAL = blast_eval,
+
+            )
+        }
+
+    }
+
+    # Process results for results
+    fetched <- process_wrapper_dir(dir, pinName = pinName, type = type)
+
+    # Assign fetched data to objects if the fetched list has the expected length
+    if (length(fetched) == 2) {
+        data <<- fetched[[1]]       # Assign the first element to 'data'
+        app_data <<- fetched[[1]]   # Assign the same to 'app_data'
+        query_data <<- fetched[[2]] # Assign the second element to 'query_data'
+
+        r_nrow_initial(nrow(fetched[[1]]@df))  # Initialize row count
+    }
+
+    domarch_cols_value <- get_domarch_cols(app_data, DASelect)
+
+    query_domarch_cols_value <- get_domarch_columns(query_data)
+
+    mainTable_value <- generate_data_table(data)
+
+    queryDataTable_value <- generate_query_data_table(query_data, query_select)
+
+    fastaDataText_value <- get_fasta_data(query_data@fasta_path)
+
+    domainDataText_value <- get_domain_data()
+
+    msaDataText_value <- get_msa_data(query_data@msa_path)
+
+    rs_IprGenes_value <- generate_ipr_genes_visualization(data, app_data, input_rs_iprDatabases, input_rs_iprVisType)
+
+    rval_rs_network_layout_value <- generate_rs_network_layout(data, app_data, cutoff = 100, layout = "nice")
+
+    rs_data_table_value <- generate_data_table(data)
+
+    da_IprGenes_value <- generate_da_ipr_genes_plot(app_data, da_iprDatabases, da_iprVisType, DASelect)
+
+    query_heatmap_value <- generate_query_heatmap(query_data_df, heatmap_select = "All", heatmap_color = "blue")
+
+    DA_Prot_value <- get_DA_Prot(app_data, validate_da, DASelect)
+
+    DALinPlot_value <- generate_DA_heatmap_plot(DA_col, DACutoff, DA_Prot, DA_lin_color, ipr_path)
+
+    DALinTable_value <- generate_DA_lin_table(DA_col, ipr_path, DAlin_count_table_DT)
+
+    DANetwork_value <- generate_domain_network(DA_col, DACutoff, DA_Prot, networkLayout, ipr_path)
+
+    phylogeny_prot_value <- filter_phylogeny_proteins(app_data, phylo_select)
+
+    acc_to_name_value <- acc_to_name(app_data)
+
+    ####### Report Generation ########
+
+    tryCatch({
+        tempReport <- file.path(tempdir(), "report.Rmd")
+        file.copy(report_template_path, tempReport, overwrite = TRUE)
+
+        # List of graphics to include in report
+        params <- list(
+            rs_interproscan_visualization = rs_IprGenes_value,
+            proximity_network = rval_rs_network_layout_value,
+            sunburst = data@df,
+            data = rs_data_table_value,
+            queryDataTable = queryDataTable_value,
+            fastaDataText = fastaDataText_value,
+            heatmap = query_heatmap_value,
+            query_data = query_data,
+            query_domarch_cols = query_domarch_cols_value,
+            query_iprDatabases = query_iprDatabases,
+            query_iprVisType = query_iprVisType,
+            mainTable = maintable_value,
+            DALinTable = DALinTable_value,
+            DALinPlot = DALinPlot_value,
+            DANetwork = DANetwork_value,
+            DA_Prot = DA_Prot_value,
+            domarch_cols = domarch_cols_value,
+            DA_Col = DA_Col,
+            DACutoff = DACutoff,
+            da_interproscan_visualization = da_IprGenes_value,
+            phylo_sunburst_levels = levels,
+            phylo_sunburst = phylogeny_prot_value,
+            tree_msa_tool = tree_msa_tool,
+            rep_accnums = rep_accnums(),
+            msa_rep_num = 10,
+            app_data = app_data,
+            PhyloSelect = PhyloSelect,
+            acc_to_name = acc_to_name_value,
+            rval_phylo = rval_phylo,
+            msa_reduce_by = msa_reduce_by
+        )
+
+        # Render RMarkdown report
+        rmarkdown::render(tempReport,
+                          output_file = output_file,
+                          params = params,
+                          envir = new.env(parent = globalenv()))
+    }, error = function(e) {
+        return(paste("Error in report generation:", e$message))
+    })
+
+    return("Initialization, upload, and
+           report generation completed successfully.")
+}
+
diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R
new file mode 100644
index 00000000..0ddc2761
--- /dev/null
+++ b/inst/report/scripts/run_molevolvr_pipeline.R
@@ -0,0 +1,1289 @@
+# Author(s): Awa Synthia
+# Last modified: 2024
+
+# Load necessary libraries
+library(httr)
+library(data.table)
+library(readr)
+library(rentrez)
+
+get_sequences <- function(sequences,
+                          acc_file_path = "accs.txt",
+                          dir_path = "~",
+                          separate = TRUE) {
+  seqs <- readAAStringSet(sequences)
+  cln_names <- c()
+  for (accnum in names(seqs)) {
+    if (grepl("\\|", accnum)) {
+      accnum_cln <- strsplit(accnum, "\\|")[[1]][2]
+      accnum_cln <- strsplit(accnum_cln, " ")[[1]]
+    } else {
+      accnum_cln <- strsplit(accnum, " ")[[1]][1]
+    }
+    cln_names <- append(cln_names, accnum_cln)
+    write(accnum_cln, file = acc_file_path, append = TRUE)
+    if (separate) {
+      write(paste0(dir_path, "/", accnum_cln, ".faa"),
+            file = "input.txt", append = TRUE)
+      write(paste0(">", accnum_cln),
+            file = paste0(accnum_cln, ".faa"), append = TRUE)
+      write(toString(seqs[accnum]),
+            file = paste0(accnum_cln, ".faa"), append = TRUE)
+    }
+  }
+  names(seqs) <- cln_names
+  writeXStringSet(seqs, sequences, format = "fasta")
+  return(length(seqs))
+}
+submit_full <- function(
+    dir = "/data/scratch",
+    DB = Sys.getenv("BLAST_DB", unset = "refseq"),
+    NHITS = Sys.getenv("BLAST_HITS", unset = 100),
+    EVAL = Sys.getenv("BLAST_EVALUE", unset = 0.00001),
+    sequences = "~/test.fa",
+    phylo = "FALSE",
+    by_domain = "FALSE",
+    domain_starting = "~/domain_seqs.fa",
+    type = "full",
+    job_code=NULL,
+    submitter_email=NULL,
+    advanced_options=NULL,
+    get_slurm_mails=FALSE
+) {
+  # Set working directory
+  setwd(dir)
+
+  advanced_options_names <- names(advanced_options[advanced_options == TRUE])
+
+  # Write job submission params to file
+  job_args <- list(
+    submission_type = type,
+    database = ifelse(phylo == FALSE, DB, NA),
+    nhits = ifelse(phylo == FALSE, NHITS, NA),
+    evalue = ifelse(phylo == FALSE, EVAL, NA),
+    submitter_email = submitter_email,
+    advanced_options = advanced_options_names,
+    job_code = job_code
+  )
+  yml <- yaml::as.yaml(job_args)
+  write(yml, "job_args.yml")
+
+  # Create a log file
+  write("START_DT\tSTOP_DT\tquery\tdblast\tacc2info\tdblast_cleanup\tacc2fa
+        \tblast_clust\tclust2table\tiprscan\tipr2lineage\tipr2da\tduration",
+        "logfile.tsv")
+
+  # Process sequences (local handling)
+  if (phylo == "FALSE") {
+    # Split the sequences if needed, store them locally
+    num_seqs <- get_sequences(sequences, dir_path = dir, separate = TRUE)
+
+    fasta <- Biostrings::readAAStringSet(sequences)
+    headers_original <- names(fasta)
+    headers_accnum <- names(fasta) |> purrr::map_chr(function(x) extractAccNum(x))
+
+    # Execute BLAST locally (instead of submitting jobs to the cluster)
+    for (i in 1:num_seqs) {
+      # Assume each sequence is saved separately
+      input_file <- paste0(dir, "/", headers_accnum[i], ".faa")
+      # output_file <- paste0(dir, "/blast_output_", i, ".txt")
+
+      # Construct the local BLAST command (make sure 'blastn' is available locally)
+      run_molevolvr_pipeline(input_file, DB, NHITS, EVAL, is_query = F, type, i)
+
+      #cmd <- sprintf(
+      #  "deltablast -query %s -db %s -out %s -num_alignments %d -evalue %f -remote",
+      #  input_file, DB, output_file, NHITS, EVAL
+      #)
+
+      # Execute BLAST locally
+      #system(cmd)
+
+      cat(sprintf("BLAST for sequence %d completed.\n", i), file=stderr())
+    }
+  } else {
+    # Handle phylogenetic analysis if needed
+    cat("Phylogenetic analysis is not supported in the local version yet.\n")
+  }
+
+  # Simulate query run locally
+  run_molevolvr_pipeline(sequences, DB, NHITS, EVAL, is_query = TRUE, type)
+  # cmd_query <- sprintf(
+  #   "deltablast -query %s -db %s -out %s_query.txt -num_alignments %d -evalue %f -remote",
+  #   sequences, DB, paste0(dir, "/query_output"), NHITS, EVAL
+  # )
+  # system(cmd_query)
+
+  cat("Query analysis completed.\n")
+
+  # Status update
+  num_runs <- num_seqs + 1
+  write(paste0("0/", num_runs, " analyses completed"), "status.txt")
+
+  cat("All analyses completed locally.\n")
+}
+
+# Define the main pipeline function
+run_molevolvr_pipeline <- function(input_paths, db, nhits, eval,
+                                   is_query, type, i) {
+
+  # Start time
+  start <- Sys.time()
+  OUTPATH <- getwd()   # Set output directory to the current working directory
+
+  # If IS_QUERY is True, handle query data
+  if (is_query == TRUE) {
+
+    FILE <- input_paths
+    PREFIX <- "query_data"
+    OUTDIR <- file.path(OUTPATH, PREFIX)
+    dir.create(OUTDIR, showWarnings = FALSE)  # Create output directory
+
+    all_accnums_file <- file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa"))
+    file.copy(FILE, all_accnums_file)
+
+    # Parse accession numbers
+    accnums_file <- file.path(OUTPATH, "query-fasta_header-map.tsv")
+    parsed_accnums_file <- file.path(OUTDIR, "parsed_accnums.txt")
+
+    if (file.exists(accnums_file)) {
+      parsed_accnums <- read.table(accnums_file,
+                                   header = FALSE, sep = "\t",
+                                   stringsAsFactors = FALSE)
+      writeLines(parsed_accnums$V2[-1], parsed_accnums_file)  # Skip header
+    }
+
+    # Copy starting_accs.txt if exists, or fallback to accs.txt
+    if (file.exists("starting_accs.txt")) {
+      file.copy("starting_accs.txt",
+                file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt")))
+    } else {
+      file.copy("accs.txt",
+                file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt")))
+      file.copy(parsed_accnums_file,
+                file.path(OUTDIR, paste0(PREFIX, ".parsed_accnums.txt")))
+    }
+
+    # setwd(OUTDIR)
+
+    # Run acc2info
+    run_acc2info(parsed_accnums_file, PREFIX, OUTDIR)
+
+    replace_accession_numbers(file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")),
+                              file.path(OUTPATH, "query-fasta_header-map.tsv"),
+                              file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")))
+
+    file.copy(file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")),
+              file.path(OUTDIR, paste0(PREFIX, ".blast.cln.tsv")))
+
+  } else {
+    # Handle homolog data
+    FILE <- readLines(input_paths)[1]  # Assume single file for local usage
+    F_value <- basename(FILE)
+    PREFIX <- sub("\\.faa$", "", basename(FILE))
+    PREFIX <- gsub(">", "", PREFIX)  # Extract prefix from file name
+    OUTDIR <- file.path(OUTPATH, paste0(PREFIX, "_", type))
+    dir.create(OUTDIR, showWarnings = FALSE)  # Create output directory
+    # setwd(OUTDIR)
+
+    # Run DELTABLAST
+    run_deltablast(input_paths, PREFIX, OUTDIR, db, nhits, eval)
+
+    # Run ACC2FA
+    convert_accnum_to_fasta(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")),
+                            PREFIX, OUTDIR)
+
+    # Run ACC2INFO
+    run_acc2info(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt")),
+                 PREFIX, OUTDIR)
+
+    # Clean up BLAST results
+    cleanup_blast(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")),
+                  file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")),
+                  PREFIX, F)
+
+  }
+
+  # Sys.sleep(30)
+
+  # Run BLASTCLUST
+  run_blastclust(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")),
+                 PREFIX, OUTDIR )
+
+  # Convert clusters to table
+  clust2tbl(file.path(OUTDIR, paste0(PREFIX, ".bclust.L60S80.tsv")),
+            file.path(OUTDIR, paste0(PREFIX, ".blast.cln.tsv")))
+
+  # Run INTERPROSCAN
+  run_interproscan(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")),
+                   PREFIX, OUTDIR)
+  new_header <- c("AccNum", "SeqMD5Digest", "SLength", "Analysis", "DB.ID",
+                  "SignDesc", "StartLoc", "StopLoc", "Score",
+                  "Status", "RunDate", "IPRAcc", "IPRDesc")
+
+  temp_data <- read_tsv(file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")),
+                        col_names = FALSE)
+
+  colnames(temp_data) <- new_header
+
+  write.table(temp_data, file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")),
+              sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
+  # Run IPR2LIN
+  ipr2lin(file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")),
+          file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), PREFIX)
+
+
+  # Optionally run IPR2DA if IS_QUERY is true
+  if (is_query == "T") {
+    ## perform ipr2da on iprscan results
+    da <- ipr2da(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")),
+                 PREFIX, "NA")
+
+    ## if blast results are provided, call append_ipr
+    if (is.null(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv"))) |
+        is.na(PREFIX)) {
+      print("No blast results provided, moving on.")
+    } else {
+      append_ipr(ipr_da = da, blast = "NA", prefix = PREFIX)
+    }
+
+    file.copy(file.path(OUTDIR, paste0(PREFIX, ".ipr_domarch.tsv")),
+              file.path(OUTDIR, PREFIX, paste0(PREFIX, ".full_analysis.tsv")))
+  } else {
+    # perform ipr2da on iprscan results
+    da <- ipr2da(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")),
+                 PREFIX)
+
+    ## if blast results are provided, call append_ipr
+    if (is.null(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv"))) |
+        is.na(PREFIX)) {
+      print("No blast results provided, moving on.")
+    } else {
+      append_ipr(ipr_da = da,
+                 blast =  file.path(OUTDIR, paste0(PREFIX, ".cln.clust.tsv")),
+                 prefix = PREFIX)
+    }
+  }
+
+  # Copy the input fasta file to the output directory
+  # file.copy(input_paths, OUTDIR)
+
+  # Total run time
+  dur <- difftime(Sys.time(), start, units = "secs")
+  cat("\nTotal run time:", dur, "seconds\n")
+
+  # Log the run times
+  logfile_path <- file.path(OUTPATH, "logfile.tsv")
+  log_data <- data.frame(
+    START_DT = format(start, "%d/%m/%Y-%H:%M:%S"),
+    STOP_DT = format(Sys.time(), "%d/%m/%Y-%H:%M:%S"),
+    PREFIX = PREFIX,
+    dur = dur
+  )
+
+  write.table(log_data, file = logfile_path, sep = "\t",
+              row.names = FALSE, col.names = FALSE,
+              append = TRUE, quote = FALSE)
+}
+
+# Define the acc2info function
+acc2info <- function(infile, prefix, outdir) {
+  # Ensure output directory exists
+  if (!dir.exists(outdir)) {
+    dir.create(outdir, recursive = TRUE)
+  }
+
+  outfile <- file.path(outdir, paste0(prefix, ".acc2info.tsv"))
+
+  # Print column names to the output file
+  # Print column names to outfile
+
+  # Read the input file of accession numbers
+  acc_nums <- readLines(infile)
+
+  # Random sleep to avoid overloading the server
+  # Sys.sleep(sample(1:10, 1))
+
+  # EPOST to get WebEnv and QueryKey
+  epost_response <- entrez_post(db = "protein", id = acc_nums)
+  webenv <- epost_response$WebEnv
+  query_key <- epost_response$QueryKey
+  # EFETCH to get the document summaries
+  docsums <- entrez_summary(db = "protein", web_history = epost_response)
+
+  # Check if any atomic values exist in docsums
+  any_atomic <- FALSE
+  # Check if any value in the current docsum is atomic
+  for (docsum in docsums) {
+    # Check if any value in the current docsum is atomic
+    if (is.atomic(docsum)) {
+      any_atomic <- TRUE
+      break  # Exit loop early if an atomic value is found
+    }
+  }
+
+
+  if (any_atomic) {
+    parsed_data <- data.frame(
+      AccNum = docsums$oslt$value,
+      AccNum.noV = docsums$caption,
+      FullAccNum = docsums$extra,
+      Description = docsums$title,
+      Length = docsums$slen,
+      TaxID = docsums$taxid,
+      Species = docsums$organism,
+      SourceDB = ifelse(is.null(docsums$sourcedb), NA,
+                        docsums$sourcedb),
+      Completeness = ifelse(is.null(docsums$completeness), NA,
+                            docsums$completeness),
+      stringsAsFactors = FALSE  # Avoid factors in data frame
+    )
+  } else {
+    # If no atomic values exist, use lapply to parse each docsum
+    parsed_data <- do.call(rbind, lapply(docsums, function(docsum) {
+      tryCatch({
+        data.frame(
+          AccNum = docsum$oslt$value,
+          AccNum.noV = docsum$caption,
+          FullAccNum = docsum$extra,
+          Description = docsum$title,
+          Length = docsum$slen,
+          TaxID = docsum$taxid,
+          Species = docsum$organism,
+          SourceDB = ifelse(is.null(docsum$sourcedb), NA,
+                            docsum$sourcedb),
+          Completeness = ifelse(is.null(docsum$completeness), NA,
+                                docsum$completeness),
+          stringsAsFactors = FALSE  # Avoid factors in data frame
+        )
+      }, error = function(e) {
+        # Return a data frame filled with NA values in case of an error
+        return(data.frame(
+          AccNum = NA,
+          AccNum.noV = NA,
+          FullAccNum = NA,
+          Description = NA,
+          Length = NA,
+          TaxID = NA,
+          Species = NA,
+          SourceDB = NA,
+          Completeness = NA,
+          stringsAsFactors = FALSE
+        ))
+      })
+    }))
+  }
+
+  # Parse the fetched data
+  # Check if we got any data
+  if (nrow(parsed_data) == 0) {
+    cat("No data found for accession numbers in NCBI. Trying UniProt...\n")
+
+    # Split input file into smaller chunks
+    split_files <- split(acc_nums, ceiling(seq_along(acc_nums) / 100))
+
+    for (i in seq_along(split_files)) {
+      accnum <- paste(split_files[[i]], collapse = ",")  # Join with comma
+      url <- paste0("https://www.ebi.ac.uk/proteins/api/proteins?accession=",
+                    accnum)
+
+      response <- GET(url, accept("application/xml"))
+
+      if (status_code(response) == 200) {
+        xml_content <- content(response, as = "text")
+
+        # Parse the XML response
+        parsed_uniprot <- xml2::read_xml(xml_content)
+
+        # Extract required elements (customize based on XML structure)
+        entries <- xml2::xml_find_all(parsed_uniprot, "//entry")
+
+        for (entry in entries) {
+          accession <- xml2::xml_text(xml2::xml_find_first(entry,
+                                                           ".//accession"))
+          full_name <- xml2::xml_text(xml2::xml_find_first(entry,
+                                                           ".//fullName"))
+          length_seq <- xml2::xml_attr(xml2::xml_find_first(entry,
+                                                            ".//sequence"),
+                                                            "length")
+          db_reference <- xml2::xml_attr(xml2::xml_find_first(entry,
+                                                              ".//dbReference"),
+                                                              "id")
+          dataset <- xml2::xml_attr(entry, "dataset")
+          name <- xml2::xml_text(xml2::xml_find_first(entry, ".//name"))
+
+          # Create a row of data
+          new_row <- data.frame(AccNum = accession,
+                                AccNum.noV = gsub("\\|", "", accession),
+                                FullAccNum = accession,
+                                Description = full_name,
+                                Length = as.integer(length_seq),
+                                TaxID = db_reference,
+                                Species = name,
+                                SourceDB = dataset,
+                                Completeness = "NA",  # Adjust as needed
+                                stringsAsFactors = FALSE)
+
+          # Append to the outfile
+          write.table(new_row, file = outfile, sep = "\t", col.names = TRUE,
+                      row.names = FALSE, quote = FALSE, append = TRUE)
+        }
+      }
+    }
+  } else {
+    # Append NCBI data to outfile
+    write.table(parsed_data, file = outfile, sep = "\t", col.names = TRUE,
+                row.names = FALSE, quote = FALSE, append = TRUE)
+  }
+
+  cat("Data saved to:", outfile, "\n")
+}
+
+acc2info_phylo <- function(infile, outdir) {
+  # Ensure output directory exists
+  if (!dir.exists(outdir)) {
+    dir.create(outdir, recursive = TRUE)
+  }
+
+  outfile <- file.path(outdir, "acc2info.tsv")
+
+  # Create a data frame to store results
+  results <- data.frame(
+    Caption = character(),
+    Extra = character(),
+    Title = character(),
+    Length = character(),
+    TaxID = character(),
+    Organism = character(),
+    SourceDB = character(),
+    Completeness = character(),
+    stringsAsFactors = FALSE
+  )
+
+  # Process accession numbers in batches to avoid overloading
+  for (acc in acc_nums) {
+    # Fetch data using epost and efetch
+    response <- GET(sprintf("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&format=xml&id=%s",
+                            acc))
+
+    # Check if the request was successful
+    if (response$status_code == 200) {
+      # Parse the XML response
+      parsed_xml <- read_xml(content(response, "text"))
+
+      # Extract relevant information
+      doc_summary <- xml_find_all(parsed_xml, ".//DocumentSummary")
+
+      for (summary in doc_summary) {
+        results <- rbind(results, data.frame(
+          Caption = xml_text(xml_find_first(summary, ".//Caption")),
+          Extra = xml_text(xml_find_first(summary, ".//Extra")),
+          Title = xml_text(xml_find_first(summary, ".//Title")),
+          Length = xml_text(xml_find_first(summary, ".//Slen")),
+          TaxID = xml_text(xml_find_first(summary, ".//TaxId")),
+          Organism = xml_text(xml_find_first(summary, ".//Organism")),
+          SourceDB = xml_text(xml_find_first(summary, ".//SourceDb")),
+          Completeness = xml_text(xml_find_first(summary, ".//Completeness")),
+          stringsAsFactors = FALSE
+        ))
+      }
+    } else {
+      warning(sprintf("Failed to fetch data for accession %s: %s",
+                      acc, response$status_code))
+    }
+  }
+
+  # Write results to the output file
+  write.table(results, file = outfile, sep = "\t", row.names = FALSE,
+              quote = FALSE)
+}
+
+# Main function to run based on the prefix
+run_acc2info <- function(infile, prefix, outdir) {
+  if (prefix == "NA") {
+    acc2info_phylo(infile, outdir)
+  } else {
+    acc2info(infile, prefix, outdir)
+  }
+}
+
+substitute_accnum_for_acc2info <- function(df_acc2info, df_header_map) {
+  df_result <- df_header_map |>
+    # set column name in header map to match accnum col in acc2info
+    dplyr::rename(AccNum = header_accnum) |>
+    # join onto header map
+    dplyr::left_join(df_acc2info, by = "AccNum") |>
+    # deselect accnum
+    dplyr::select(-AccNum) |>
+    # set the accnum col to the cleaned form
+    dplyr::rename(AccNum = header_clean) |>
+    # rm excess columns from header map file
+    dplyr::select(-header_original)
+  return(df_result)
+}
+
+
+replace_accession_numbers <- function(path_acc2info,
+                                      path_query_header_map, path_out) {
+
+  # Read the input files
+  df_acc2info <- read_tsv(path_acc2info)
+  df_query_header_map <- read_tsv(path_query_header_map)
+
+  # Substitute accession numbers
+  df_acc2info_substituted <- substitute_accnum_for_acc2info(df_acc2info,
+                                                            df_query_header_map)
+
+  # Print the substituted dataframe
+  print("### df_acc2info_substituted")
+  print(df_acc2info_substituted)
+
+  # Write the substituted dataframe to the output file
+  write_tsv(df_acc2info_substituted, file = path_out, col_names = TRUE)
+}
+
+
+run_deltablast <- function(infile, prefix, outdir,
+                           db = "refseq_protein",
+                           nhits = 5000, evalue = 1e-5,
+                           threads = 10) {
+
+  # Prepare output file path
+  # outfile <- file.path(outdir, paste0(prefix, ".dblast.tsv"))
+
+  outfile <- paste0(outdir, "/" , prefix, ".dblast.tsv")
+
+  # Print I/O messages
+  cat("\nNow processing:", infile, "\n")
+  cat("Running against:", db, "\n")
+  cat("E-value: ≤", evalue, "\n")
+  cat("Top", nhits, "hits/alignments\n")
+  cat("Output filepath:", outfile, "\n")
+
+  # Designating database based on user input
+  dblast_db <- switch(db,
+                      nr = "nr",
+                      refseq_protein = "refseq_protein",
+                      stop("Invalid database specified.
+                           Choose either 'nr' or 'refseq'."))
+
+  # Core command for running DELTABLAST
+  command <- sprintf(
+    "deltablast -query %s -db %s -out %s -num_alignments %d -evalue %s -outfmt '6 qacc sacc sseqid sallseqid stitle sscinames staxids pident length mismatch gapopen qstart qend qlen sstart send slen evalue bitscore ppos' -remote",
+    infile, dblast_db, outfile, nhits, evalue
+  )
+  # Run the DELTABLAST command
+  system(command, intern = TRUE)
+
+  cat("DELTABLAST completed.\n")
+}
+
+
+# This script converts AccNum to Fasta using NCBI's EDirect or EBI's API
+
+convert_accnum_to_fasta <- function(infile, prefix, outdir) {
+
+  # Create the output file path
+  outfile <- file.path(outdir, paste0(prefix, ".all_accnums.fa"))
+
+  # Print status
+  cat("\n####################\n")
+  cat("BEGIN EDIRECT SEARCH\n")
+  cat("####################\n")
+  cat("Processing input file:", infile, "\n")
+
+  # Create temporary files based on the number of columns in input file
+  temp_acc_file <- file.path(outdir, paste0(prefix, ".all_accnums.txt"))
+  cols <- length(strsplit(readLines(infile, n = 1), "\t")[[1]])
+
+  if (cols > 1) {
+    # Extract unique homolog accession numbers from the 2nd column
+    cat("Creating temp file with 2nd column unique accessions...\n")
+    system(sprintf("awk -F '\\t' '{ print $2 }' %s | sort -u > %s", infile, temp_acc_file))
+  } else {
+    # Extract unique homolog accession numbers from the 1st column
+    cat("Creating temp file with 1st column unique accessions...\n")
+    system(sprintf("awk -F '\\t' '{ print $1 }' %s | sort -u > %s", infile, temp_acc_file))
+  }
+
+  # Split accessions into chunks of 1000
+  system(sprintf("split -l 1000 -e %s %s/acc", temp_acc_file, outdir))
+
+  # Fetch FASTA sequences
+  cat("\nObtaining FASTA files\n")
+  acc_files <- list.files(path = outdir, pattern = "^acc", full.names = TRUE)
+
+  for (x in acc_files) {
+    # Accession numbers in a comma-separated format
+    accnum <- paste(readLines(x), collapse = ",")
+    fasta_sequence <- entrez_fetch(db = "protein", id = accnum,
+                                   rettype = "fasta")
+    # Write the fetched sequence to the output file
+    writeLines(fasta_sequence, con = outfile)
+  }
+
+  # Check if any sequences were retrieved
+  num_seqs <- as.numeric(system(sprintf("grep '>' %s | wc -l", outfile),
+                                intern = TRUE))
+
+  # If no sequences found, try fetching from EBI API
+  if (num_seqs < 1) {
+    cat("\nNo sequences retrieved. Trying EBI API...\n")
+    for (x in acc_files) {
+      accnum <- paste(readLines(x), collapse = ",")
+      system(sprintf(
+        "curl -X GET --header 'Accept:text/x-fasta' 'https://www.ebi.ac.uk/proteins/api/proteins?accession=%s' >> %s",
+        accnum, outfile
+      ))
+
+    }
+  }
+
+  # Remove temporary files
+  cat("\nRemoving temporary files\n")
+  system(sprintf("rm %s/acc*", outdir))
+
+  # Print end message
+  cat("#####################\n")
+  cat("END OF EDIRECT SEARCH\n")
+  cat("#####################\n")
+}
+
+
+cleanup_blast <- function(infile_blast, acc2info, prefix, wblast = F) {
+
+  outdir <- dirname(infile_blast)
+  # Load and clean acc2info file
+  acc2info_out <- fread(input = acc2info, sep = "\t",
+                        header = TRUE, fill = TRUE) %>%
+    mutate(FullAccNum = gsub("\\|", "", FullAccNum)) %>%
+    mutate(FullAccNum = gsub(".*[a-z]", "", FullAccNum))
+
+  # If using web-blast output
+  if (wblast == "T") {
+    blast_out <- fread(input = infile_blast, sep = "\t", header = FALSE,
+                       col.names = web_blastp_hit_colnames, fill = TRUE)
+    cleanedup_blast <- blast_out %>%
+      mutate(AccNum = gsub("\\|", "", AccNum)) %>%
+      mutate(AccNum = gsub(".*[a-z]", "", AccNum)) %>%
+      mutate(PcIdentity = round(as.double(PcIdentity), 2))
+
+    # Merge cleaned blast output with acc2info
+    cleanedup_blast <- merge(cleanedup_blast, acc2info_out,
+                             by.x = "AccNum",
+                             by.y = "FullAccNum",
+                             all.x = TRUE)
+    names(cleanedup_blast)[names(cleanedup_blast) == "Species.y"] <- "Species"
+
+    # Additional calculations for PcPositive
+    cleanedup_blast <- cleanedup_blast %>%
+      mutate(PcPosOrig = as.numeric(PcPosOrig)) %>%
+      mutate(AlnLength = as.numeric(AlnLength)) %>%
+      mutate(PcPositive = PcPosOrig)
+
+  } else if (wblast == FALSE) {
+    # If using classic-blast output
+    blast_out <- read_tsv(file = infile_blast, col_names = cl_blast_colnames)
+    cleanedup_blast <- blast_out %>%
+      mutate(AccNum = gsub("\\|", "", AccNum)) %>%
+      mutate(AccNum = gsub(".*[a-z]", "", AccNum)) %>%
+      mutate(Species = gsub(";.*$", "", Species)) %>%
+      mutate(PcIdentity = round(PcIdentity, 2)) %>%
+      mutate(PcPositive = round((PcPosOrig * AlnLength / QLength), digits = 2))
+
+    # Merge cleaned blast output with acc2info
+    cleanedup_blast <- merge(cleanedup_blast, acc2info_out,
+                             by.x = "AccNum",
+                             by.y = "FullAccNum",
+                             all.x = TRUE) %>%
+      select(-Species.x, -TaxID.x)
+    names(cleanedup_blast)[names(cleanedup_blast) == "Species.y"] <- "Species"
+  }
+
+  # TaxID to lineage mapping
+  cleanedup_blast$TaxID <- as.integer(cleanedup_blast$TaxID)
+  lineage_map <- fread("~/awasyn/new_trial/lineage_lookup.txt",
+                       header = TRUE, fill = TRUE,
+                       colClasses = lineage_map_cols)
+
+  # Merge with lineage map and clean up columns
+  mergedLins <- merge(cleanedup_blast, lineage_map, by = "TaxID",
+                      all.x = TRUE) %>%
+    mutate(Species = Species.y, Spp.blast = Species.x) %>%
+    select(any_of(cl_blast_postcln_cols))
+
+  # Add names and prepare the output
+  blast_names <- addName(mergedLins)
+
+  # Create output file name
+  file_name <- file.path(outdir, paste0(prefix, ".blast.cln.tsv"))
+
+  # Write cleaned data to file
+  write_tsv(blast_names, file_name, col_names = TRUE)
+}
+
+
+# Function to run BLASTCLUST on given input
+run_blastclust <- function(infile, suffix, outdir) {
+
+  # Prepare output file path
+  outfile <- file.path(outdir, paste0(suffix, ".bclust.L60S80.tsv"))
+
+
+  # Print process messages
+  cat("\n#####################################\n")
+  cat("## Now running BLASTCLUST on file(s):", infile, "\n")
+  cat("#####################################\n")
+
+  # Run BLASTCLUST
+  # blastclust_cmd <- paste("blastclust -i", infile, "-o", outfile, "-p T -L .6 -b T -S 80 -a 8")
+  cdhit_command <- sprintf(
+    "cd-hit -i %s -o %s -c 0.8 -aS 0.6 -T 8",
+    infile, outfile
+  )
+  cat("\nPerforming BLASTCLUST analysis on", infile, "\n")
+  # system(blastclust_cmd)
+
+  system(cdhit_command)
+
+}
+
+# Function to format blastclust output
+clust2tbl <- function(clust, blast) {
+  clust_out <- read_tsv(file = clust, col_names = F)
+  blast_out <- read_tsv(file = blast, col_names = T)
+  ## Count the number of accession numbers in a cluster
+  # Counting number of spaces between acc. no. +1
+  clust_out$NumAccs <- map(.x = clust_out$X1, function(x) {
+    (str_count(string = x, pattern = " ") + 1)
+  })
+  ## Create empty vectors to store information
+  empty_vec <- c("ClusterID")
+  empty_vec2 <- c("RowNum")
+  ## Adding empty vectors to dataframe
+  clust_out[, empty_vec] <- NA
+  clust_out[, empty_vec2] <- NA
+  ## Counting number of rows to add to the RowNum column -- used for creating cluster name
+  rows <- as.numeric(rownames(clust_out)) %>%
+    str_pad(width = 4, pad = 0)
+  ## Add row number info to dataframe
+  clust_out[, "RowNum"] <- rows
+  # Name columns
+  colnam <- list("AccNum", "NoAccs", "ClusterID", "NumRows")
+  ## Create cluster name from row number (num of clusters) and num of accessions
+  clust_out <- clust_out %>%
+    `colnames<-`(colnam) %>%
+    mutate(ClusterID = paste0(NumRows, ".", NoAccs))
+
+  # Store data frame column as vector
+  myvar <- c("AccNum", "ClusterID")
+  ## Add only the 2 colummns wanted to a new varible
+  clusters <- clust_out[myvar]
+  # Initialize empty data frame
+  new_clust <- data.frame(ClusterID = character(0),
+                          AccNum = character(0),
+                          stringsAsFactors = F)
+
+  ## Assigning each sseqid to a ClusterID
+  # Iterate over dataframe
+  for (i in 1:nrow(clusters)) {
+    # Extract the cluster name for this row
+    cname <- clusters$ClusterID[i]
+    # Split accession numbers by a space
+    # vals is a vecto of all accession numbers in a row
+    vals <- clusters$AccNum[i] %>%
+      strsplit(split = " ") %>%
+      unlist()
+
+    # Iterate over each element in vals
+    for (v in vals) {
+      # add each accession num w/ corresponding cluster ID to new df
+      new_clust[nrow(new_clust) + 1, ] <- c(cname, v)
+    }
+  }
+  # blast_out$AccNum <- gsub("^>", "", blast_out$AccNum)
+  blast_out$AccNum <- paste0(">", blast_out$AccNum)
+  blast_clustnames <- merge(blast_out, new_clust, by = "AccNum")
+
+  for (i in 1:nrow(blast_clustnames)) {
+    blast_clustnames$ClusterID[i] <- str_c(blast_clustnames$ClusterID[i])
+  }
+
+  first_prot <- as.data.frame(word(clust_out$AccNum), word(clust_out$ClusterID))
+
+  ## write the new file as a TSV file
+  newarg <- gsub(".bclust.L[0-9][0-9]S[0-9][0-9].tsv", "", clust)
+  # accnum + clusterID
+  write_tsv(new_clust, file = paste0(newarg, ".clustIDs"), append = F)
+  # first protein from every cluster
+  write_tsv(first_prot, file = paste0(newarg, ".clust_reps"),
+            col_names = F, append = F)
+  # cleaned up blast file + clusterIDs
+  write_tsv(blast_clustnames, file = paste0(newarg, ".cln.clust.tsv"),
+            col_names = T, append = F)
+}
+
+# Function to run InterProScan
+run_interproscan <- function(query_file, prefix, outdir) {
+
+  # Start InterProScan run
+  cat("\n######################\n")
+  cat("BEGIN INTERPROSCAN RUN\n")
+  cat("Input file:", query_file, "\n")
+  cat("######################\n")
+
+  # Output file path
+  outfile <- file.path(outdir, paste0(prefix, ".iprscan"))
+
+  # Process the input query file with InterProScan
+  cat("Now processing", query_file, "\n")
+
+  # Run InterProScan command
+  # Construct the command
+  command <- paste(
+    "/home/simple/iprdir/interproscan-5.70-102.0/interproscan.sh -i",
+    shQuote(query_file),
+    "-b", shQuote(outfile),
+    "-f TSV --cpu", Sys.getenv("INTERPROSCAN_CPUS", "4"),
+    "--appl Pfam,MobiDBlite,Phobius,Coils,SignalP_GRAM_POSITIVE,",
+    "SignalP_GRAM_NEGATIVE,Hamap,Gene3D,SignalP_EUK"
+  )
+
+  # Run the command
+  system(command)
+
+  cat("##################\n")
+  cat("END OF IPRSCAN RUN\n")
+  cat("##################\n")
+}
+
+ipr2lin <- function(ipr, acc2info, prefix) {
+  # read in iprscan results
+  # duplicate rows in iprscan file
+  ipr_in <- read_tsv(ipr, col_names = TRUE) %>%
+    mutate(DB.ID = gsub("G3DSA:", "", DB.ID))
+
+  acc2info_out <- fread(input = acc2info, sep = "\t", header = T, fill = T) %>%
+    mutate(FullAccNum = gsub("\\|", "", FullAccNum)) %>%
+    mutate(FullAccNum = gsub(".*[a-z]", "", FullAccNum))
+
+  # merge ipr file with acc2info file
+  ipr_in <- ipr_in %>%
+    # remove version number and any other suffices
+    mutate(AccNum.noV = gsub("\\.[0-9].*", "", AccNum))
+
+  ipr_tax <- left_join(ipr_in, acc2info_out, by = "AccNum")
+
+  # read in lineage map
+  lineage_map <- fread("~/awasyn/new_trial/lineage_lookup.txt",
+                       header = T, fill = T)
+
+  # merge ipr+info w/ lineage
+  # both tables have a species column, but only
+  # the lineage_map (y) species column is kept
+  ipr_tax <- ipr_tax %>%
+    mutate(TaxID = as.numeric(TaxID))
+
+  ipr_lin <- left_join(ipr_tax, lineage_map, by = "TaxID") |>
+    mutate(Species = Species.y) %>%
+    select(-Species.x, -Species.y)
+
+  # add lookup table to iprscan file
+  lookup_tbl <- fread(input = "~/awasyn/new_trial/cln_lookup_tbl.tsv",
+                      sep = "\t", header = T, fill = T) %>%
+    distinct()
+  if ("AccNum.x" %in% names(ipr_lin)) {
+    ipr_lin <- ipr_lin %>%
+      # deselect the AccNum.y from the lineage table (y) and set
+      # the AccNum.x (x) from the ipr/acc2info tables to simply 'AccNum'
+      mutate(AccNum = AccNum.x) %>%
+      select(-AccNum.x, -AccNum.y)
+  }
+  # run add_name f(x) on ipr+lineage dataframe
+  ipr_lin <- ipr_lin %>%
+    addName() %>%
+    mutate(Name = gsub("^_", "", Name))
+
+  # add domarch info to iprscan + lineage df, only keep what's in x
+  ipr_cln <- left_join(ipr_lin, lookup_tbl, by = "DB.ID")
+
+  # populate empty description/short name columns
+  for (i in 1:nrow(ipr_cln)) {
+    if ((is.na(ipr_cln$ShortName[i]) || ipr_cln$ShortName[i] == "") &&
+        (is.na(ipr_cln$SignDesc[i]) || ipr_cln$SignDesc[i] == "-")) {
+
+      ipr_cln$SignDesc[i] <- ipr_cln$IPRDesc[i]
+      if (length(ipr_cln$LookupTblDesc[i]) != 0) {
+        ipr_cln$ShortName[i] <- ipr_cln$LookupTblDesc[i]
+      }
+    }
+    if (is.na(ipr_cln$ShortName[i]) || ipr_cln$ShortName[i] == "") {
+      ipr_cln$ShortName[i] <- ipr_cln$SignDesc[i]
+    }
+    if (is.na(ipr_cln$SignDesc[i]) || ipr_cln$SignDesc[i] == "-") {
+      ipr_cln$SignDesc[i] <- ipr_cln$ShortName[i]
+    }
+  }
+  # rename unclear/duplicated columns
+  names(ipr_cln)[names(ipr_cln) == "Description.x"] <- "ProteinName"
+  names(ipr_cln)[names(ipr_cln) == "Description.y"] <- "LookupTblDesc"
+  # deselect the AccNum.noV from the lineage table (y) and set
+  # the AccNum.noV.x (x) from the ipr/acc2info tables to simply 'AccNum.noV'
+  ipr_cln <- ipr_cln |> dplyr::select(-AccNum.noV.y) |>
+    dplyr::mutate(AccNum.noV = AccNum.noV.x) |>
+    dplyr::select(-AccNum.noV.x)
+  # create label column to use in ipr2viz
+  ipr_cln <- ipr_cln %>%
+    mutate(Label = strtrim(ShortName, 30)) %>%
+    mutate(Label = gsub(", .*", "", Label)) %>%
+    mutate(Label = gsub("C-terminal region of a signal",
+                        "C-term signal peptide", Label)) %>%
+    mutate(Label = gsub("N-terminal region of a signal",
+                        "N-term signal peptide", Label)) %>%
+    mutate(Label = gsub("Twin arginine translocation \\(T",
+                        "Tat signal profile", Label)) %>%
+    mutate(Label = gsub("GLUTATHIONE HYDROLASE PROENZYM",
+                        "GLUTATHIONE HYDROLASE PROENZYME", Label)) %>%
+    mutate(Label = gsub("N-terminal nucleophile aminohy",
+                        "Ntn hydrolases", Label)) %>%
+    mutate(Label = gsub("Region of a membrane-bound pro",
+                        "cytoplasmic reg of mem-bound prot",
+                        Label)) %>%
+    mutate(ShortName = gsub("Region of a membrane-bound protein predicted to be
+    outside the membrane, in the cytoplasm.",
+                            "cytoplasmic reg of mem-bound prot", ShortName))
+    outdir <- dirname(ipr)
+  # write results to file
+  write_tsv(ipr_cln, file.path(paste0(outdir, "/",paste0(prefix,".iprscan_cln.tsv"))))
+}
+
+ipr2da <- function(infile_ipr, prefix,
+                   analysis = c(
+                     "Pfam", "SMART", "Phobius",
+                     "Gene3D", "TMHMM", "SignalP_GRAM_POSITIVE",
+                     "SUPERFAMILY", "MobiDBLite", "TIGRFAM", "PANTHER", "Coils"
+                   )) {
+  # read in cleaned up iprscan results
+  ipr_in <- read_tsv(infile_ipr, col_names = T, col_types = ipr_cln_cols)
+
+  # split dataframe into unique proteins
+  x <- split(x = ipr_in, f = ipr_in$AccNum)
+
+  # plan(strategy = "multicore", .skip = T)
+
+  # within each data.table
+  domarch <- map(x, function(y) {
+    # domarch <- future_map(x, function(y) {
+    acc_row <- data.frame(AccNum = y$AccNum[1], stringsAsFactors = F)
+    DAs <- data.frame(matrix(nrow = 1, ncol = length(analysis)))
+    DA <- y %>%
+      group_by(Analysis) %>%
+      arrange(StartLoc)
+    i <- 1
+    for (a in analysis) {
+      a_da <- DA %>% filter(Analysis == a)
+      if (a == "SignalP_EUK" || a == "SignalP_GRAM_NEGATIVE" ||
+          a == "SignalP_GRAM_POSITIVE") {
+        var_shortname <- "DB.ID"
+      } else {
+        var_shortname <- "ShortName"
+      }
+      var_shortname_sym <- sym(var_shortname)
+      a_da <- a_da %>%
+        ungroup() %>%
+        select({{ var_shortname_sym }}) %>%
+        filter(!is.na({{ var_shortname_sym }})) %>%
+        filter(!is.null({{ var_shortname_sym }})) %>%
+        pull(var_shortname) %>%
+        paste(collapse = "+")
+      DAs[1, i] <- a_da
+      i <- (i + 1)
+    }
+
+    colnames(DAs) <- paste("DomArch", analysis, sep = ".")
+    return(cbind(acc_row, DAs))
+  })
+
+  # select relevant rows from ipr input to add to domarch
+  ipr_select <- ipr_in %>%
+    select(Name, AccNum, Species, TaxID, Lineage, Lineage_long_na,
+           Lineage_long, Lineage_med, Lineage_short, ProteinName,
+           SourceDB, Completeness, AccNum.noV) %>%
+    distinct()
+
+  # combine domarchs to one data frame, merge w/ acc2info
+  domarch2 <- do.call(rbind.data.frame, domarch)
+
+  domarch_lins <- domarch2 %>%
+    merge(ipr_select, by = "AccNum", all.x = T)
+
+  # save domarch_lins file
+  write_tsv(domarch_lins,
+            file = paste0(prefix, ".ipr_domarch.tsv"),
+            append = F, na = "NA"
+  )
+
+  # return domarch2 dataframe to append to blast results if given
+  return(domarch2)
+}
+
+## function to add results from ipr2da to blast results
+append_ipr <- function(ipr_da, blast, prefix) {
+  # ! an 'AccNum' or 'AccNum.noV' column is required in blast table for joining !
+  blast_out <- read_tsv(blast, col_names = T)
+  if ("AccNum.noV" %in% colnames(blast_out)) {
+    ipr_da <- read_tsv(paste0(prefix, ".ipr_domarch.tsv"), col_names = T)
+    blast_ipr <- merge(blast_out, ipr_da, by = "AccNum.noV", all.x = T)
+  } else {
+    blast_ipr <- merge(blast_out, ipr_da, by = "AccNum", all.x = T)
+  }
+
+  write_tsv(blast_ipr, file = paste0(prefix, ".full_analysis.tsv"), na = "NA")
+}
+
+# Web BLAST output
+web_blast_colnames <- c("Query", "AccNum",
+                        "PcIdentity", "AlnLength", "Mismatch", "GapOpen",
+                        "QStart", "QEnd", "SStart", "SEnd",
+                        "EValue", "BitScore", "PcPosOrig",
+                        "QSFrames") # specific to "blastx"
+
+
+# BLAST Command line
+cl_blast_colnames <- c("Query", "SAccNum", "AccNum",
+                       "SAllSeqID", "STitle", "Species", "TaxID",
+                       "PcIdentity", "AlnLength", "Mismatch", "GapOpen",
+                       "QStart", "QEnd", "QLength",
+                       "SStart", "SEnd", "SLength",
+                       "EValue", "BitScore", "PcPosOrig",
+                       "PcPositive", "ClusterID") # post-cleanup
+
+# IPRSCAN (web+command-line)
+ipr_colnames <- c("AccNum", "SeqMD5Digest", "SLength", "Analysis",
+                  "DB.ID", "SignDesc", "StartLoc", "StopLoc", "Score",
+                  "Status", "RunDate", "IPRAcc", "IPRDesc")
+
+# RPSBLAST
+rps_colnames <- c("AccNum", "DBID", "DBSeqID",
+                  "PcIdentity", "PcPosOrig", # Ppos missing
+                  "AlnLength", "Mismatch",
+                  # Q here is Subject; S here is the matching domain; rename!
+                  "SStart", "SEnd", "DStart", "DEnd",
+                  "EValue", "BitScore", "TaxID") # TaxID missing (NA); remove?
+
+# IPG
+ipg_colnames <- c("IPG.ID", "Source", "NucAccNum",
+                  "NucStart", "NucStop", "Strand",
+                  "AccNum", "ProtDesc",
+                  "Species", "SppStrain", "AssemblyID")
+# Final ColNames
+combo_colnames <- c("Query", "AccNum", "Species", "TaxID", "Lineage",
+                    "PcPositive", "ClusterID",
+                    # "Leaf", # MISSING (useful for all dataviz)
+                    # "AssemblyID", "GeneName", "ProtDesc", # MISSING NOW!?!
+                    "DomArch.Pfam", "DomArch.COG", "DomArch.Gene3D",
+                    "DomArch.TMHMM", "DomArch.Phobius", "DomArch.SignalP")
+
+
+## ############ ##
+## COLUMN names ##
+## ############ ##
+## BLAST
+############
+## Web-BLAST
+############
+## Downloaded as HIT-TABLE csv
+# BLASTP and related protein BLASTs
+web_blastp_hit_colnames <- c(
+  "Query", "AccNum",
+  "PcIdentity", "AlnLength", "Mismatch", "GapOpen",
+  "QStart", "QEnd", "SStart", "SEnd",
+  "EValue", "BitScore", "PcPosOrig"
+)
+# BLASTX
+web_blastx_colnames <- c(
+  "Query", "AccNum",
+  "PcIdentity", "AlnLength", "Mismatch", "GapOpen",
+  "QStart", "QEnd", "SStart", "SEnd",
+  "EValue", "BitScore", "PcPosOrig",
+  "QSFrames"
+) # specific to "blastx"
+
+## Downloaded as Descriptions csv
+# BLASTP and related protein BLASTs
+web_blastp_desc_colnames <- c(
+  "Description", "Species", "CommonName", "TaxID",
+  "BitScore", "TotalScore",
+  "PcQCover", "EValue", "PcIdentity",
+  "SLen", "AccNum"
+)
+
+#####################
+## Command line BLAST
+#####################
+
+# pre-cleanup
+cl_blast_colnames <- c(
+  "Query", "SAccNum", "AccNum",
+  "SAllSeqID", "STitle", "Species", "TaxID",
+  "PcIdentity", "AlnLength", "Mismatch", "GapOpen",
+  "QStart", "QEnd", "QLength",
+  "SStart", "SEnd", "SLength",
+  "EValue", "BitScore", "PcPosOrig"
+)
+
+# post-cleanup
+cl_blast_postcln_cols <- c(
+  "Query", "AccNum",
+  "STitle", "Species", "TaxID", "Lineage", "Lineage_long",
+  "Lineage_long_na", "Lineage_med", "Lineage_short",
+  "PcPositive", "PcIdentity", "AlnLength",
+  "SAccNum", "SAllSeqID",
+  "Mismatch", "GapOpen",
+  "QStart", "QEnd", "QLength",
+  "SStart", "SEnd", "SLength",
+  "EValue", "BitScore", "PcPosOrig", "QueryName"
+)
+
+##########
+## IPRSCAN
+##########
+# ipr_colnames_orig <- c("AccNum", "Seq_MD5_digest", "SeqLen", "Analysis",
+#                        "DB_ID", "SignDesc", "StartLoc", "StopLoc", "Score",
+#                        "Status", "RunDate", "IPRAcc", "IPRDesc")
+
+ipr_colnames <- c(
+  "AccNum", "SeqMD5Digest", "SLength", "Analysis",
+  "DB.ID", "SignDesc", "StartLoc", "StopLoc", "Score",
+  "Status", "RunDate", "IPRAcc", "IPRDesc"
+)
+
+# post cleanup
+##########################
+## NEED TO BE REORDERED ##
+##########################
+ipr_cln_colnames <- c(
+  "DB.ID", "TaxID", "AccNum.noV", "AccNum",
+  "SeqMD5Digest", "SLength", "Analysis", "SignDesc",
+  "StartLoc", "StopLoc", "Score", "Status", "RunDate",
+  "IPRAcc", "IPRDesc", "FullAccNum", "ProteinName",
+  "Length", "SourceDB", "Completeness", "Lineage",
+  "Species", "Name", "ShortName", "LookupTblDesc",
+  "ID", "Label"
+)
+
+###########
+## RPSBLAST
+###########
+rps_colnames <- c(
+  "AccNum", "DB.ID", "DBSeqID",
+  "PcIdentity.Dom", "PcPosOrig.Dom", # "PcPos.Dom", # Ppos missing
+  "AlnLength", "Mismatch",
+  "SStart", "SEnd", "DStart", "DEnd",
+  "EValue", "BitScore", "TaxID"
+) # TaxID missing (NA); remove?
+
+#######################
+## IPG and Lineage maps
+#######################
+ipg_colnames <- c(
+  "IPG.ID", "Source", "NucAccNum",
+  "NucStart", "NucStop", "Strand",
+  "AccNum", "Description",
+  "Species", "Spp.Strain", "AssemblyID"
+)
+
+##################
+## Assembly files
+## Genbank, Refseq
+##################
+assembly_colnames <- c(
+  "AssemblyID",
+  "bioproject", "biosample", "wgs_master", # not used
+  "RefseqCategory", "TaxID", "Spp.TaxID",
+  "Species", "Spp.Strain",
+  "isolate", "version_status", # not used
+  "assembly_level", "release_type", # not used
+  "GenomeStatus",
+  "seq_rel_date", "asm_name", "submitter", # not used
+  "AssemblyID.GBRS",
+  "paired_asm_comp", "ftp_path", # not used
+  "excluded_from_refseq", "relation_to_type_material"
+) # not used
+assembly_sub_colnames <- c(
+  "TaxID", "Spp.TaxID", "Species", "Spp.Strain",
+  "RefseqCategory", "GenomeStatus",
+  "AssemblyID", "AssemblyID.GBRS"
+)
+
+#################
+## Lookup tables
+## in common_data
+#################
+lineage_lookup_colnames <- c("TaxID", "Species", "Lineage_long",
+                             "Lineage_long_na", "Lineage_med",
+                             "Lineage_short", "Lineage")
+domarch_lookup_colnames <- c("DB.ID", "ShortName", "Description", "ID")
+# !! SC and LS will fix other piecemeal files based on these
+
+######################
+## FINAL UPLOADED DATA
+######################
+## Combined data frame that is loaded on to the webapp
+combo_colnames <- c(
+  "Query", "UID", "AccNum", "Species", "TaxID", "Lineage",
+  "PcPositive", "ClusterID", "QueryName",
+  # "AssemblyID", "GeneName", "Description", # MISSING NOW!?!
+  "DomArch.Pfam", "DomArch.COG", "DomArch.Gene3D",
+  "DomArch.TMHMM", "DomArch.Phobius", "DomArch.SignalP",
+  "DomArch.SMART", "DomArch.TIGR"
+)
+
+
+################
+## read tsv colnames
+################
+lookup_table_cols <- cols(
+  DB.ID = col_character(),
+  ShortName = col_character(),
+  Description = col_character(),
+  ID = col_character()
+)
+
+iprscan_cols <- cols(
+  .default = col_character(),
+  TaxID = col_double(),
+  SLength = col_double(),
+  SignDesc = col_character(),
+  StartLoc = col_double(),
+  StopLoc = col_double(),
+  Score = col_double(),
+  Status = col_logical(),
+  IPRAcc = col_character(),
+  IPRDesc = col_character(),
+  Length = col_double(),
+  ShortName = col_character(),
+  LookupTblDesc = col_character(),
+  ID = col_character(),
+  Label = col_character()
+)
+
+ipr_cln_cols <- cols(
+  .default = col_character(),
+  TaxID = col_double(),
+  SLength = col_double(),
+  StartLoc = col_double(),
+  StopLoc = col_double(),
+  Score = col_double(),
+  Status = col_logical(),
+  IPRAcc = col_logical(),
+  IPRDesc = col_logical(),
+  Length = col_double(),
+  ID = col_logical()
+)
+
+lineage_map_cols <- c(
+  "double",
+  "character",
+  "character", "character", "character", "character", "character"
+)
diff --git a/inst/report/scripts/viz_utils.R b/inst/report/scripts/viz_utils.R
new file mode 100644
index 00000000..d721855b
--- /dev/null
+++ b/inst/report/scripts/viz_utils.R
@@ -0,0 +1,742 @@
+# Author(s): Awa Synthia
+# Last modified: 2024
+
+# Load necessary packages
+library(dplyr)
+library(stringr)
+library(visNetwork)
+library(DT)
+library(plotly)
+
+# Function to generate the InterProScan Visualization
+generate_ipr_genes_visualization <- function(data, app_data,
+                                             input_rs_iprDatabases,
+                                             input_rs_iprVisType) {
+
+    # Check if analysis is loaded
+    if (nrow(data@df) == 0 || app_data@ipr_path == "") {
+        stop("Analysis data is not loaded properly or ipr_path is missing.")
+    }
+
+    # If there is no cln_path or PcPositive column is NULL
+    if (length(data@cln_path) == 0 || is.null(data@df$PcPositive)) {
+
+        # Set column name for accessing based on the data
+        n <- if ("name" %in% colnames(data)) {
+            "name"
+        } else {
+            "AccNum"
+        }
+        n <- "Name"  # Hardcoded to "Name" based on original code
+
+        # Call the `ipr2viz_web` function
+        ipr_plot <- ipr2viz_web(
+            infile_ipr = data@ipr_path,
+            accessions = data@df$Name,
+            analysis = input_rs_iprDatabases,
+            group_by = input_rs_iprVisType,
+            name = n
+        )
+
+    } else {
+
+        # Call the `ipr2viz` function with additional arguments
+        ipr_plot <- ipr2viz(
+            infile_ipr = data@ipr_path,
+            infile_full = data@df,
+            accessions = unique(data@df$Name),
+            analysis = input_rs_iprDatabases,
+            group_by = input_rs_iprVisType,
+            topn = 20,   # This value is hardcoded in the original code
+            query = "All"
+        )
+    }
+
+    # Return the plot object for further use
+    return(ipr_plot)
+}
+
+# Function to generate the domain network layout visualization
+generate_rs_network_layout <- function(data, app_data,
+                                       cutoff = 100,
+                                       layout = "nice") {
+
+    # Check if analysis is loaded and app data has a valid ipr_path
+    if (nrow(data@df) == 0 || app_data@ipr_path == "") {
+        stop("Analysis not loaded or ipr_path missing.")
+    }
+
+    # Extract column names and find the first matching "DomArch" column
+    cols <- colnames(data@df)
+    if (is.null(cols)) {
+        stop("Dataframe columns are missing.")
+    }
+
+    col <- cols[grepl("DomArch.*", cols)][1]
+    if (is.null(col)) {
+        stop("No domain architecture column found.")
+    }
+
+    # Clean up domain architecture columns in the data
+    df_data <- data@df %>%
+        mutate(across(tidyselect::starts_with("DomArch"), clean_string))
+
+    # Generate the network using the domain_network function
+    res_network <- domain_network(
+        df_data,
+        column = col,
+        domains_of_interest = ".*",
+        cutoff = cutoff,
+        layout = layout
+    )
+
+    # Validate that the result is not an error
+    if (res_network == "error") {
+        stop("Not enough nodes to construct a network.")
+    }
+
+    return(res_network)
+}
+
+# Function to generate the data table
+generate_data_table <- function(data) {
+    if (nrow(data@df) == 0) {
+        stop("No data available.
+             Please ensure you have uploaded your data correctly.")
+    }
+
+    # Define the columns to be shown
+    viewing_cols <- c(
+        "AccNum", "QueryName", "Name", "Lineage", "Species",
+        "Length", "PcPositive", "DomArch.Pfam"
+    )
+
+    # Extract the data
+    d <- data@df
+
+    # Identify columns to hide (those not in viewing_cols)
+    hide_cols <- which(!colnames(d) %in% viewing_cols) - 1
+
+    # Add hyperlinks to the "AccNum" column
+    d$AccNum <- paste0("<a href='https://www.ncbi.nlm.nih.gov/protein/",
+                       d$AccNum, "' target='_blank' >", d$AccNum, "</a>")
+
+    # Create the DataTable
+    dt <- DT::datatable(d,
+          rownames = FALSE,
+          filter = "top",  # Enable filtering
+          extensions = c("Buttons"),  # Enable buttons (e.g., CSV export)
+          escape = FALSE,  # Allow HTML rendering (for the hyperlinks)
+          options = list(
+              dom = "frlBtip",  # Layout for the table (filters, buttons, etc.)
+              pageLength = 10,  # Number of rows per page
+              paging = TRUE,  # Enable pagination
+              # Enable regex search
+              search = list(caseInsensitive = TRUE, regex = TRUE),
+              language = list(
+                  searchPlaceholder = "Regex or Text...",
+                  filterPlaceholder = "Test"
+              ),
+              buttons = list(
+                  list(
+                      extend = "colvis",
+                      text = "Add/remove column(s)"
+                  ),
+                  list(
+                      extend = "csv",
+                      text = "Download",
+                      filename = "molevolvr-data",
+                      exportOptions = list(
+                        # Export all data, not just visible page
+                          modifier = list(page = "all")
+                      )
+                  )
+              ),
+              scrollX = FALSE,  # Disable horizontal scrolling
+              fixedHeader = FALSE,  # Disable fixed header
+              columnDefs = list(
+                # Hide columns not in 'viewing_cols'
+                  list(visible = FALSE, targets = hide_cols)
+              )
+          )
+    )
+
+    return(dt)
+}
+
+# Function to generate query data table
+generate_query_data_table <- function(query_data, query_select = NULL) {
+
+    # Check if analysis is loaded and data is available
+    if (nrow(query_data@df) == 0) {
+        stop("No data available. Please ensure you have uploaded your data
+             correctly.")
+    }
+
+    # Define the columns to be shown
+    viewing_cols <- c("QueryName", "Species", "Lineage", "DomArch.Pfam")
+
+    # If no specific queries are selected, display all data
+    if (is.null(query_select)) {
+        d <- query_data@df
+    } else {
+        # Filter the data for selected queries
+        d <- query_data@df %>% filter(QueryName %in% query_select)
+        d <- droplevels(d)
+    }
+
+    # Identify columns to hide (those not in viewing_cols)
+    hide_cols <- which(!colnames(d) %in% viewing_cols) - 1
+
+    # Add hyperlinks to the "Query" column (replace 'Query' with the actual
+    # column name if different)
+    d$Query <- paste0("<a href='https://www.ncbi.nlm.nih.gov/protein/",
+                      d$Query, "' target='_blank' >", d$Query, "</a>")
+
+    # Create and return the DataTable
+    dt <- DT::datatable(d,
+              rownames = FALSE,
+              filter = "top",  # Enable filtering
+              extensions = c("Buttons"),  # Enable buttons (e.g., CSV export)
+              callback = DT::JS(c(
+                  '$(\'div.has-feedback input[type="search"]\').attr( "placeholder", "Search..." );'
+              )),
+              escape = FALSE,  # Allow HTML rendering (for the hyperlinks)
+              options = list(
+                  dom = "frlBtip",  # Layout for the table (filters, buttons, etc.)
+                  pageLength = 25,  # Number of rows per page
+                  paging = TRUE,  # Enable pagination
+                  # Enable regex search
+                  search = list(caseInsensitive = TRUE, regex = TRUE),
+                  language = list(
+                      searchPlaceholder = "Regex or Text..."
+                  ),
+                  buttons = list(
+                      list(
+                          extend = "colvis",
+                          text = "Add/remove column(s)"
+                      ),
+                      list(
+                          extend = "csv",
+                          text = "Download",
+                          filename = "molevolvr-query",
+                          exportOptions = list(
+                            # Export all data, not just visible page
+                              modifier = list(page = "all")
+                          )
+                      )
+                  ),
+                  scrollX = FALSE,  # Disable horizontal scrolling
+                  fixedHeader = FALSE,  # Disable fixed header
+                  columnDefs = list(
+                    # Hide columns not in 'viewing_cols'
+                      list(visible = FALSE, targets = hide_cols)
+                  )
+              )
+    )
+
+    return(dt)
+}
+
+# Function to read and return the FASTA file contents
+read_fasta_data <- function(fasta_path) {
+
+    # Check if analysis is loaded and the file path is not empty
+    if (fasta_path == "" || !file.exists(fasta_path)) {
+        stop("FASTA file path is invalid or the file does not exist.")
+    }
+
+    # Read the content of the FASTA file
+    fasta_content <- read_file(fasta_path)
+
+    return(fasta_content)
+}
+
+get_fasta_data <- function(fasta_path) {
+    if (is.null(fasta_path) || fasta_path == "") {
+        stop("Error: FASTA path is not provided.")
+    }
+    return(read_file(fasta_path))  # Read and return the FASTA data
+}
+
+# Function to get domain sequences (assumes `data` is a predefined object)
+get_domain_data <- function() {
+    return(data@domainSeqs)  # Return domain sequences
+}
+
+# Function to get MSA data from a given path
+get_msa_data <- function(msa_path) {
+    if (is.null(msa_path) || msa_path == "") {
+        stop("Error: MSA path is not provided.")
+    }
+    return(read_file(msa_path))
+}
+
+# Function to generate a heatmap
+generate_query_heatmap <- function(query_data_df,
+                                   heatmap_select = "All",
+                                   heatmap_color = "blue") {
+
+    # Check if analysis is loaded and query data exists
+    if (nrow(query_data_df) == 0) {
+        stop("No query data available.")
+    }
+
+    # Filter queries based on user selection
+    if (heatmap_select == "All") {
+        queries <- unique(query_data_df$QueryName)
+        prot <- query_data_df %>%
+            filter(Lineage != "") %>%
+            tidyr::drop_na(Lineage)
+    } else {
+        queries <- heatmap_select
+        prot <- query_data_df %>%
+            filter(grepl(heatmap_select, QueryName, ignore.case = TRUE)) %>%
+            filter(Lineage != "") %>%
+            tidyr::drop_na(Lineage)
+    }
+
+    # Validate that the Lineage column has values
+    if (all(unique(prot$Lineage) == "")) {
+        stop("Lineage column not found for selected proteins.
+             See the FAQ for possible reasons/solutions.")
+    }
+
+    # Assuming `lineage.Query.plot` is a custom function for plotting
+    lineage.Query.plot(prot, queries = queries, colname = "QueryName",
+                       cutoff = 100, color = heatmap_color)
+}
+
+# Function to retrieve domain architecture columns
+get_domarch_columns <- function(query_data_df) {
+    # Check if query data exists
+    if (nrow(query_data_df) == 0) {
+        stop("No query data available.")
+    }
+
+    # Get column names
+    cols <- colnames(query_data_df)
+
+    # Filter domain architecture columns, excluding repeats
+    domarch_cols <- cols[grepl("^DomArch", cols) & !grepl("repeats$", cols)]
+
+    # Identify columns that are completely NA
+    na_cols <- names(query_data_df)[apply(query_data_df, 2,
+                                          function(x) all(is.na(x)))]
+
+    # Remove NA columns from domain architecture columns
+    domarch_cols <- setdiff(domarch_cols, na_cols)
+
+    # Include SignalP_GRAM_POSITIVE if present
+    if ("SignalP_GRAM_POSITIVE" %in% domarch_cols) {
+        domarch_cols <- setdiff(domarch_cols, "SignalP_GRAM_POSITIVE")  # Remove first
+        domarch_cols <- append(domarch_cols, "SignalP_GRAM_POSITIVE")    # Append it last
+    }
+
+    # Remove prefix from column names
+    domarch_cols <- substring(domarch_cols, first = 9)
+
+    return(domarch_cols)
+}
+
+# Function to generate main data table
+generate_main_table <- function(data_df, main_select = NULL) {
+    # Validate input data
+    if (nrow(data_df) == 0) {
+        stop("No data available. Please ensure you have uploaded your data
+             correctly. See Help documentation or contact JRaviLab
+             (janani.ravi[AT]cuanschutz[DOT]edu).")
+    }
+
+    # Define columns to view
+    viewing_cols <- c("AccNum", "QueryName", "Name", "Lineage",
+                      "Species", "Length", "PcPositive",
+                      "DomArch.Pfam")
+
+    # Filter data based on selection
+    if (!is.null(main_select)) {
+        d <- data_df %>%
+            filter(QueryName %in% main_select) %>%
+            droplevels()
+    } else {
+        d <- data_df
+    }
+
+    # Identify columns to hide
+    hide_cols <- which(!colnames(d) %in% viewing_cols)
+    hide_cols <- hide_cols - 1
+
+    # Create hyperlinks for AccNum
+    d$AccNum <- paste0("<a href='https://www.ncbi.nlm.nih.gov/protein/",
+                       d$AccNum, "' target='_blank'>", d$AccNum, "</a>")
+
+    # Generate DataTable
+    datatable_output <- DT::datatable(d,
+                        rownames = FALSE,
+                        filter = "top",
+                        callback = DT::JS(c(
+                            '$(\'div.has-feedback input[type="search"]\').attr( "placeholder", "Search..." );'
+                        )),
+                        extensions = c("Buttons"),
+                        escape = FALSE,
+                        options = list(
+                            dom = "frlBtip",
+                            pageLength = 25,
+                            paging = TRUE,
+                            search = list(caseInsensitive = TRUE, regex = TRUE),
+                            language = list(searchPlaceholder = "Regex or Text...",
+                                            filterPlaceholder = "Test"),
+                            buttons = list(
+                                list(extend = "colvis", text = "Add/remove column(s)"),
+                                list(
+                                    extend = "csv",
+                                    text = "Download",
+                                    filename = "molevolvr-homolog",
+                                    exportOptions = list(modifier = list(page = "all"))
+                                )
+                            ),
+                            scrollX = FALSE,
+                            fixedHeader = FALSE,
+                            columnDefs = list(list(visible = FALSE, targets = hide_cols))
+                        )
+    )
+
+    return(datatable_output)
+}
+
+# Function to generate Domain Architecture Linear Table
+generate_DA_lin_table <- function(DA_col, ipr_path, DAlin_count_table_DT) {
+    # Check if ipr_path is valid
+    if (ipr_path == "") {
+        stop("InterPro path is empty.")
+    }
+
+    # Check if DA_col is provided
+    if (is.null(DA_col)) {
+        stop("DA_col input is required.")
+    }
+
+    # Generate the DAlin count table
+    DAlin_table <- DAlin_count_table_DT()
+
+    return(DAlin_table)
+}
+
+# Function to generate the Domain Architecture Lineage Plot
+generate_DA_heatmap_plot <- function(DA_col, DACutoff,
+                                     DA_Prot, DA_lin_color,
+                                     analysis_loaded, ipr_path) {
+    # Check if ipr_path is valid
+    if (ipr_path == "") {
+        stop("InterPro path is empty.")
+    }
+
+    # Filter the protein data for plotting
+    prot <- DA_Prot() %>%
+        filter(Lineage != "") %>%
+        drop_na(Lineage)
+
+    # Create the plot
+    plot <- lineage.DA.plot(
+        prot,
+        colname = DA_col,
+        cutoff = DACutoff,
+        RowsCutoff = FALSE,
+        color = DA_lin_color
+    )
+
+    # Convert ggplot to plotly object
+    plot <- plotly::ggplotly(plot)
+    plot <- plot %>% plotly::layout(xaxis = list(side = "top"))
+
+    return(plot)
+}
+
+# Function to generate the Domain Architecture Network
+generate_domain_network <- function(DA_col, DACutoff, DA_Prot,
+                                    networkLayout, ipr_path) {
+    # Check if ipr_path is valid
+    if (ipr_path == "") {
+        stop("InterPro path is empty.")
+    }
+
+    # Prepare the selected protein data
+    dn_data <- DA_Prot()
+    col <- sym(DA_col)  # Convert DA_col to a symbol for use with dplyr
+    dn_data[[col]] <- str_replace_all(dn_data[[col]], " ", "_")
+
+    # Filter data based on the specified column
+    dn_data <- dn_data %>%
+        drop_na(!!col) %>%
+        filter(!!col != "")
+
+    # Generate the domain network
+    res <- domain_network(
+        prot = dn_data,
+        column = col,
+        domains_of_interest = ".*",
+        cutoff = DACutoff,
+        layout = networkLayout
+    )
+
+    # Validate the result
+    if (res == "error") {
+        stop("Not enough nodes to construct a network.
+             Try increasing 'Total Count Cutoff'.")
+    }
+
+    # Add export functionality
+    visNetwork::addExport(res)
+    res <- res %>% visExport(
+        type = "png",
+        name = "domain_architecture-network",
+        float = "left",
+        label = "Download plot",
+        style = "
+      background-color: #ffffff;
+      color: #333333;
+      border-color: #cccccc;
+      padding: 6px 12px;
+      font-size: 14px;
+      line-height: 1.42857143;
+      border-radius: 4px;
+      cursor: pointer;
+      display: inline-block;
+      margin-bottom: 0;
+      font-weight: normal;
+      text-align: center;
+      vertical-align: middle;
+      touch-action: manipulation;
+      user-select: none;
+      background-image: none;
+      text-decoration: none;
+    "
+    )
+
+    return(res)
+}
+
+# Function to retrieve and clean Domain Architecture data
+get_DA_Prot <- function(app_data, validate_da, DASelect) {
+    # Check if the ipr_path is valid
+    if (app_data@ipr_path == "") {
+        stop("InterPro path is empty.")
+    }
+
+    # Validate domain architecture
+    validate_da()
+
+    # Retrieve app data and clean domain architecture columns
+    df_app_data <- app_data@df
+
+    # Domain architecture column cleanup
+    df_app_data <- df_app_data %>%
+        mutate(across(starts_with("DomArch"), clean_string))
+
+    # Filter based on user selection
+    if (DASelect == "All") {
+        return(df_app_data)
+    } else {
+        return(df_app_data %>% filter(grepl(DASelect, QueryName,
+                                            ignore.case = TRUE)))
+    }
+}
+
+# Function to retrieve domain architecture columns
+get_domarch_cols <- function(app_data, DASelect) {
+    # Check if app data DataFrame is not empty
+    if (nrow(app_data@df) <= 0) {
+        stop("No data available in app data.")
+    }
+
+    # Check if ipr_path is valid
+    if (app_data@ipr_path == "") {
+        stop("InterPro path is empty.")
+    }
+
+    # Get column names
+    cols <- colnames(app_data@df)
+
+    # Filter domain architecture columns
+    domarch_cols <- cols[grepl("^DomArch", cols) & !grepl("repeats$", cols)]
+
+    # Retrieve the data frame
+    domarch_data <- app_data@df
+
+    # Filter data frame based on user selection
+    if (DASelect != "All") {
+        domarch_data <- domarch_data %>% filter(QueryName == DASelect)
+    }
+
+    # Identify columns that are completely NA or empty
+    na_cols <- apply(domarch_data, 2, function(x) {
+        all(is.na(x)) || all(x == "")
+    })
+    na_cols <- names(domarch_data)[na_cols]
+
+    # Remove NA columns from the domain architecture columns
+    domarch_cols <- setdiff(domarch_cols, na_cols)
+
+    # Ensure "SignalP_GRAM_POSITIVE" is included if it exists
+    if ("SignalP_GRAM_POSITIVE" %in% domarch_cols) {
+        domarch_cols <- append(domarch_cols[!domarch_cols %in%
+                                              "SignalP_GRAM_POSITIVE"],
+                               "SignalP_GRAM_POSITIVE")
+    }
+
+    return(domarch_cols)
+}
+
+# Function to generate the IPR genes plot
+generate_da_ipr_genes_plot <- function(app_data, da_iprDatabases,
+                                       da_iprVisType, DASelect) {
+
+    if (app_data@ipr_path == "") {
+        stop("IPR path is not set.")
+    }
+
+    # Validate the data frame
+    df <- app_data@df
+    if (nrow(df) == 0) {
+        stop("No data available. Please ensure you have uploaded
+             your data correctly.")
+    }
+
+    if (is.null(da_iprDatabases) || length(da_iprDatabases) == 0) {
+        stop("Please select an analysis.")
+    }
+
+    # Determine the name to use
+    if (length(app_data@cln_path) == 0 || is.null(df$PcPositive)) {
+        name_column <- ifelse("name" %in% colnames(df), "name", "AccNum")
+        name_column <- "Name"
+
+        # Generate the plot using the web version
+        plot <- ipr2viz_web(
+            infile_ipr = app_data@ipr_path,
+            accessions = df$Name,
+            analysis = da_iprDatabases,
+            group_by = da_iprVisType,
+            name = name_column
+        )
+    } else {
+        # Generate the plot using the local version
+        plot <- ipr2viz(
+            infile_ipr = app_data@ipr_path,
+            infile_full = df,
+            accessions = unique(df$Name),
+            analysis = da_iprDatabases,
+            group_by = da_iprVisType,
+            topn = 20,
+            query = DASelect
+        )
+    }
+
+    return(plot)
+}
+
+# Function to filter proteins for phylogeny
+filter_phylogeny_proteins <- function(app_data, phylo_select) {
+    # Validate the input app_data
+    if (!analysis_loaded()) {
+        stop("Analysis not loaded.")
+    }
+
+    # Get the data frame from app_data
+    df <- app_data@df
+
+    # Check if the data frame is empty
+    if (nrow(df) == 0) {
+        stop("No data available in app_data.
+             Please ensure it has been loaded correctly.")
+    }
+
+    # Filter the data based on user selection
+    if (phylo_select == "All") {
+        filtered_df <- df
+    } else {
+        filtered_df <- df %>%
+            filter(grepl(phylo_select, QueryName, ignore.case = TRUE)) %>%
+            filter(Lineage != "")
+    }
+
+    return(filtered_df)
+}
+
+# Function to retrieve representative accession numbers
+get_representative_accession_numbers <- function(app_data, phylo_select,
+                                                 msa_reduce_by, msa_rep_num,
+                                                 rval_phylo) {
+
+    if (rval_phylo()) {
+        return(app_data@df$AccNum)
+    } else {
+        switch(msa_reduce_by,
+               "Species" = rep_acc_species(),
+               "Lineage" = rep_acc_lineage(),
+               "DomArch" = {
+                   if (app_data@ipr_path == "") {
+                       stop("IPR path is empty. Please ensure it is set correctly.")
+                   }
+                   seqs <- find_top_acc(infile_full = app_data@df,
+                                        n = msa_rep_num,
+                                        query = phylo_select)
+                   if (is.null(seqs)) {
+                       stop("No sequences found.")
+                   }
+                   return(seqs)
+               },
+               "Full" = {
+                   tmp <- app_data@df %>% filter(QueryName == phylo_select)
+                   return(tmp$AccNum)
+               },
+               stop("Invalid selection for msa_reduce_by.")
+        )
+    }
+}
+
+generate_domain_architecture_plot <- function(ipr_path, query_names,
+                                              analysis_type, group_by) {
+    # Check if the input path is provided
+    if (is.null(ipr_path) || ipr_path == "") {
+        stop("Error: Input path for IPR data is not provided.")
+    }
+
+    # Validate that at least one domain is found
+    if (length(query_domarch_cols()) < 1) {
+        stop("Error: No domains found in the input sequences.")
+    }
+
+    n <- "Name"
+    plot <- ipr2viz_web(
+        infile_ipr = ipr_path,
+        accessions = query_names,
+        analysis = analysis_type,
+        group_by = group_by,
+        name = n
+    )
+
+    return(plot)  # Return the generated plot
+}
+
+# Function to convert accessions to names
+acc_to_name <- function(app_data) {
+    # Check if "AccNum" is a column in the data
+    if (!("AccNum" %in% colnames(app_data@df))) {
+        stop("Column 'AccNum' not found in data.")
+    }
+
+    # Check if "Name" column exists and create the output data frame
+    if ("Name" %in% colnames(app_data@df)) {
+        # Select "AccNum" and "Name"
+        df <- select(app_data@df, "AccNum", "Name")
+    } else {
+        # Create a data frame with "AccNum" and assign "AccNum" to "Name"
+        df <- app_data@df %>%
+            select("AccNum") %>%
+            mutate(Name = AccNum)
+    }
+
+    return(df)
+}
+

From 36d0e4f7da73f2f1e71db0d58744555ad85ecc5d Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Sat, 26 Oct 2024 12:17:19 +0300
Subject: [PATCH 02/23] change path to local files

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 inst/report/scripts/run_molevolvr_pipeline.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R
index 0ddc2761..b35cd9f2 100644
--- a/inst/report/scripts/run_molevolvr_pipeline.R
+++ b/inst/report/scripts/run_molevolvr_pipeline.R
@@ -702,7 +702,7 @@ cleanup_blast <- function(infile_blast, acc2info, prefix, wblast = F) {
 
   # TaxID to lineage mapping
   cleanedup_blast$TaxID <- as.integer(cleanedup_blast$TaxID)
-  lineage_map <- fread("~/awasyn/new_trial/lineage_lookup.txt",
+  lineage_map <- fread("~/data/lineage_lookup.txt",
                        header = TRUE, fill = TRUE,
                        colClasses = lineage_map_cols)
 
@@ -841,7 +841,7 @@ run_interproscan <- function(query_file, prefix, outdir) {
   # Run InterProScan command
   # Construct the command
   command <- paste(
-    "/home/simple/iprdir/interproscan-5.70-102.0/interproscan.sh -i",
+    "~/interproscan-5.70-102.0/interproscan.sh -i",
     shQuote(query_file),
     "-b", shQuote(outfile),
     "-f TSV --cpu", Sys.getenv("INTERPROSCAN_CPUS", "4"),
@@ -875,7 +875,7 @@ ipr2lin <- function(ipr, acc2info, prefix) {
   ipr_tax <- left_join(ipr_in, acc2info_out, by = "AccNum")
 
   # read in lineage map
-  lineage_map <- fread("~/awasyn/new_trial/lineage_lookup.txt",
+  lineage_map <- fread("~/data/lineage_lookup.txt",
                        header = T, fill = T)
 
   # merge ipr+info w/ lineage
@@ -889,7 +889,7 @@ ipr2lin <- function(ipr, acc2info, prefix) {
     select(-Species.x, -Species.y)
 
   # add lookup table to iprscan file
-  lookup_tbl <- fread(input = "~/awasyn/new_trial/cln_lookup_tbl.tsv",
+  lookup_tbl <- fread(input = "~/data/cln_lookup_tbl.tsv",
                       sep = "\t", header = T, fill = T) %>%
     distinct()
   if ("AccNum.x" %in% names(ipr_lin)) {

From f404cad1657f9ecd0c919801f450c87a1f3abf2a Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Sat, 26 Oct 2024 12:20:01 +0300
Subject: [PATCH 03/23] chane dupload_type to upload_type:)

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 inst/report/scripts/generate_report.R | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R
index c8b4301f..59c02b76 100644
--- a/inst/report/scripts/generate_report.R
+++ b/inst/report/scripts/generate_report.R
@@ -73,7 +73,7 @@ get_card_data <- function(pathogen = NULL, drug = NULL) {
 
 # Run analysis
 run_analysis <- function(
-        dupload_type = "Fasta",
+        upload_type = "Fasta",
         evalue = 0.00001,
         accnum_fasta_input = "",
         file_paths = list(accnum = NULL, fasta = tempfile(), msa = tempfile(),
@@ -114,8 +114,8 @@ run_analysis <- function(
                     fasta_path = file_paths$fasta)
     query_data <- new("queryData")
 
-    if (length(dupload_type) != 1) {
-        stop("dupload_type must be a single value.")
+    if (length(upload_type) != 1) {
+        stop("upload_type must be a single value.")
     }
 
     # Reset default analysis function
@@ -129,8 +129,8 @@ run_analysis <- function(
     fasta_set <- c("Fasta", "AccNum", "MSA")
 
     # Update settings based on upload type
-    updateUploadType <- function(dupload_type) {
-        switch(dupload_type,
+    updateUploadType <- function(upload_type) {
+        switch(upload_type,
                "Fasta" = {
                    resetSettings()
                    acc_homology_analysis <<- TRUE
@@ -157,7 +157,7 @@ run_analysis <- function(
         )
     }
 
-    updateUploadType(dupload_type)
+    updateUploadType(upload_type)
 
     ####### File Upload Functions ########
 
@@ -239,7 +239,7 @@ run_analysis <- function(
     if (phylo) {
         # Validate phylogenetic analysis based on upload type
         is_valid_phylo <- switch(
-            dupload_type,
+            upload_type,
             "Fasta" = {
                 str_count(string = sequence_upload_data@seqs, ">") > 1
             },
@@ -261,7 +261,7 @@ run_analysis <- function(
         }
     }
     # Fasta-like submissions
-    if (dupload_type %in% fasta_set) {
+    if (upload_type %in% fasta_set) {
         # Can have any combination of select options
         type <- ""
         script <- ""
@@ -308,7 +308,7 @@ run_analysis <- function(
     path <- paste0(dir, "/", pin_id, ".fa")
     # Adding validation logic based on upload type
     system(paste0("mkdir ", dir), wait = TRUE)
-    switch(dupload_type,
+    switch(upload_type,
            "Fasta" = {
                # Validate sequence limit
                if (str_count(sequence_upload_data@seqs, ">") > 200) {
@@ -458,7 +458,7 @@ run_analysis <- function(
     }
 
     # Additional handling for different upload types
-    if (dupload_type == "BLAST Output") {
+    if (upload_type == "BLAST Output") {
         dir <- paste0(OUT_PATH, pin_id, "_blast")
         if (blast_upload_data@df == "") {
             stop("Error: Please upload a BLAST file.")
@@ -511,7 +511,7 @@ run_analysis <- function(
             )
         }
 
-    } else if (dupload_type == "InterProScan Output") {
+    } else if (upload_type == "InterProScan Output") {
         dir <- paste0(OUT_PATH, pin_id, "_ipr")
         path <- paste0(dir, "/", pin_id, "_ipr.tsv")
         if (ipr_upload_data()@df == "") {

From f9d2c654043ff6c308a30e0133a11ae522631fbc Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Fri, 1 Nov 2024 09:06:28 +0200
Subject: [PATCH 04/23] update pipeline

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 inst/report/scripts/generate_report.R        |  67 ++++--
 inst/report/scripts/run_molevolvr_pipeline.R |  28 +--
 inst/report/scripts/viz_utils.R              | 206 ++++++++++++++++---
 3 files changed, 244 insertions(+), 57 deletions(-)

diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R
index 59c02b76..7dd89030 100644
--- a/inst/report/scripts/generate_report.R
+++ b/inst/report/scripts/generate_report.R
@@ -2,6 +2,7 @@
 # Last modified: 2024
 
 # get fasta of pathogen and/or drug
+#' @export
 get_card_data <- function(pathogen = NULL, drug = NULL) {
   destination_dir <- "CARD_data"
   # Check if CARD data exists
@@ -72,6 +73,7 @@ get_card_data <- function(pathogen = NULL, drug = NULL) {
 }
 
 # Run analysis
+#' @export
 run_analysis <- function(
         upload_type = "Fasta",
         evalue = 0.00001,
@@ -94,12 +96,15 @@ run_analysis <- function(
         GCCutoff = 0.5,
         query_select = NULL,
         query_iprDatabases = NULL,
-        query_iprVisType = NULL, tree_msa_tool = "ClustalO",
+        query_iprVisType = "Analysis", tree_msa_tool = "ClustalO",
         levels = 2,
         DA_Col = "DomArch.Pfam",
         msa_rep_num = NULL,
         msa_reduce_by = "Species",
         rval_phylo = FALSE,
+        DA_lin_color = c("default", "viridis", "inferno", "magma", "plasma", "cividis"),
+        networkLayout = c("nice", "grid", "circle", "random"),
+        phylo_select = "All",
         ...
 
 ) {
@@ -570,11 +575,11 @@ run_analysis <- function(
 
     # Assign fetched data to objects if the fetched list has the expected length
     if (length(fetched) == 2) {
-        data <<- fetched[[1]]       # Assign the first element to 'data'
-        app_data <<- fetched[[1]]   # Assign the same to 'app_data'
-        query_data <<- fetched[[2]] # Assign the second element to 'query_data'
+        data <- fetched[[1]]       # Assign the first element to 'data'
+        app_data <- fetched[[1]]   # Assign the same to 'app_data'
+        query_data <- fetched[[2]] # Assign the second element to 'query_data'
 
-        r_nrow_initial(nrow(fetched[[1]]@df))  # Initialize row count
+        r_nrow_initial <- nrow(fetched[[1]]@df)  # Initialize row count
     }
 
     domarch_cols_value <- get_domarch_cols(app_data, DASelect)
@@ -587,27 +592,55 @@ run_analysis <- function(
 
     fastaDataText_value <- get_fasta_data(query_data@fasta_path)
 
-    domainDataText_value <- get_domain_data()
+    domainDataText_value <- get_domain_data(data)
 
     msaDataText_value <- get_msa_data(query_data@msa_path)
 
-    rs_IprGenes_value <- generate_ipr_genes_visualization(data, app_data, input_rs_iprDatabases, input_rs_iprVisType)
+    rs_IprGenes_value <- generate_ipr_genes_visualization(data,
+                                                          app_data,
+                                                          input_rs_iprDatabases,
+                                                          input_rs_iprVisType)
 
-    rval_rs_network_layout_value <- generate_rs_network_layout(data, app_data, cutoff = 100, layout = "nice")
+    rs_network_layout_value <- generate_rs_network_layout(data,
+                                                          app_data,
+                                                          cutoff = 100,
+                                                          layout = "nice")
 
     rs_data_table_value <- generate_data_table(data)
 
-    da_IprGenes_value <- generate_da_ipr_genes_plot(app_data, da_iprDatabases, da_iprVisType, DASelect)
+    da_IprGenes_value <- generate_da_ipr_genes_plot(app_data,
+                                                    da_iprDatabases,
+                                                    da_iprVisType,
+                                                    DASelect)
 
-    query_heatmap_value <- generate_query_heatmap(query_data_df, heatmap_select = "All", heatmap_color = "blue")
+    query_heatmap_value <- generate_query_heatmap(query_data_df,
+                                                  heatmap_select = "All",
+                                                  heatmap_color = "blue")
 
-    DA_Prot_value <- get_DA_Prot(app_data, validate_da, DASelect)
+    DA_Prot_value <- get_DA_Prot(app_data, DASelect)
 
-    DALinPlot_value <- generate_DA_heatmap_plot(DA_col, DACutoff, DA_Prot, DA_lin_color, ipr_path)
+    DALinPlot_value <- generate_DA_heatmap_plot(DA_col = "DomArch.Pfam",
+                                                DACutoff,
+                                                DA_Prot_value,
+                                                DA_lin_color = "viridis",
+                                                app_data@ipr_path)
 
-    DALinTable_value <- generate_DA_lin_table(DA_col, ipr_path, DAlin_count_table_DT)
+    DALin_TotalCounts_value <- DA_TotalCounts(DA_Prot_value,
+                                              DACutoff = 95,
+                                              DA_col = "DomArch.Pfam",
+                                              app_data)
 
-    DANetwork_value <- generate_domain_network(DA_col, DACutoff, DA_Prot, networkLayout, ipr_path)
+    DALinTable_value <- generate_DA_lin_table(DA_col = "DomArch.Pfam",
+                                              app_data@ipr_path,
+                                              DALin_TotalCounts_value)
+
+    DANetwork_value <- generate_domain_network(DA_col = "DomArch.Pfam",
+                                               DACutoff,
+                                               DA_Prot_value,
+                                               networkLayout = "nice",
+                                               app_data@ipr_path)
+
+    rep_accnums_value <-
 
     phylogeny_prot_value <- filter_phylogeny_proteins(app_data, phylo_select)
 
@@ -622,7 +655,7 @@ run_analysis <- function(
         # List of graphics to include in report
         params <- list(
             rs_interproscan_visualization = rs_IprGenes_value,
-            proximity_network = rval_rs_network_layout_value,
+            proximity_network = rs_network_layout_value,
             sunburst = data@df,
             data = rs_data_table_value,
             queryDataTable = queryDataTable_value,
@@ -632,7 +665,7 @@ run_analysis <- function(
             query_domarch_cols = query_domarch_cols_value,
             query_iprDatabases = query_iprDatabases,
             query_iprVisType = query_iprVisType,
-            mainTable = maintable_value,
+            mainTable = mainTable_value,
             DALinTable = DALinTable_value,
             DALinPlot = DALinPlot_value,
             DANetwork = DANetwork_value,
@@ -644,7 +677,7 @@ run_analysis <- function(
             phylo_sunburst_levels = levels,
             phylo_sunburst = phylogeny_prot_value,
             tree_msa_tool = tree_msa_tool,
-            rep_accnums = rep_accnums(),
+            rep_accnums = rep_accnums_value,
             msa_rep_num = 10,
             app_data = app_data,
             PhyloSelect = PhyloSelect,
diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R
index b35cd9f2..d2d0b303 100644
--- a/inst/report/scripts/run_molevolvr_pipeline.R
+++ b/inst/report/scripts/run_molevolvr_pipeline.R
@@ -124,6 +124,7 @@ submit_full <- function(
 }
 
 # Define the main pipeline function
+#' @export
 run_molevolvr_pipeline <- function(input_paths, db, nhits, eval,
                                    is_query, type, i) {
 
@@ -182,7 +183,7 @@ run_molevolvr_pipeline <- function(input_paths, db, nhits, eval,
     F_value <- basename(FILE)
     PREFIX <- sub("\\.faa$", "", basename(FILE))
     PREFIX <- gsub(">", "", PREFIX)  # Extract prefix from file name
-    OUTDIR <- file.path(OUTPATH, paste0(PREFIX, "_", type))
+    OUTDIR <- file.path(OUTPATH, paste0(PREFIX))
     dir.create(OUTDIR, showWarnings = FALSE)  # Create output directory
     # setwd(OUTDIR)
 
@@ -234,7 +235,7 @@ run_molevolvr_pipeline <- function(input_paths, db, nhits, eval,
 
 
   # Optionally run IPR2DA if IS_QUERY is true
-  if (is_query == "T") {
+  if (is_query == TRUE) {
     ## perform ipr2da on iprscan results
     da <- ipr2da(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")),
                  PREFIX, "NA")
@@ -244,11 +245,10 @@ run_molevolvr_pipeline <- function(input_paths, db, nhits, eval,
         is.na(PREFIX)) {
       print("No blast results provided, moving on.")
     } else {
-      append_ipr(ipr_da = da, blast = "NA", prefix = PREFIX)
+      browser()
+      file.copy(file.path(OUTDIR, paste0(PREFIX, ".ipr_domarch.tsv")),
+                file.path(OUTDIR, paste0(PREFIX, ".full_analysis.tsv")))
     }
-
-    file.copy(file.path(OUTDIR, paste0(PREFIX, ".ipr_domarch.tsv")),
-              file.path(OUTDIR, PREFIX, paste0(PREFIX, ".full_analysis.tsv")))
   } else {
     # perform ipr2da on iprscan results
     da <- ipr2da(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")),
@@ -660,7 +660,7 @@ cleanup_blast <- function(infile_blast, acc2info, prefix, wblast = F) {
     mutate(FullAccNum = gsub(".*[a-z]", "", FullAccNum))
 
   # If using web-blast output
-  if (wblast == "T") {
+  if (wblast == TRUE) {
     blast_out <- fread(input = infile_blast, sep = "\t", header = FALSE,
                        col.names = web_blastp_hit_colnames, fill = TRUE)
     cleanedup_blast <- blast_out %>%
@@ -702,7 +702,7 @@ cleanup_blast <- function(infile_blast, acc2info, prefix, wblast = F) {
 
   # TaxID to lineage mapping
   cleanedup_blast$TaxID <- as.integer(cleanedup_blast$TaxID)
-  lineage_map <- fread("~/data/lineage_lookup.txt",
+  lineage_map <- fread("~/awasyn/new_trial/lineage_lookup.txt",
                        header = TRUE, fill = TRUE,
                        colClasses = lineage_map_cols)
 
@@ -841,7 +841,7 @@ run_interproscan <- function(query_file, prefix, outdir) {
   # Run InterProScan command
   # Construct the command
   command <- paste(
-    "~/interproscan-5.70-102.0/interproscan.sh -i",
+    "~/iprdir/interproscan-5.70-102.0/interproscan.sh -i",
     shQuote(query_file),
     "-b", shQuote(outfile),
     "-f TSV --cpu", Sys.getenv("INTERPROSCAN_CPUS", "4"),
@@ -875,7 +875,7 @@ ipr2lin <- function(ipr, acc2info, prefix) {
   ipr_tax <- left_join(ipr_in, acc2info_out, by = "AccNum")
 
   # read in lineage map
-  lineage_map <- fread("~/data/lineage_lookup.txt",
+  lineage_map <- fread("~/awasyn/new_trial/lineage_lookup.txt",
                        header = T, fill = T)
 
   # merge ipr+info w/ lineage
@@ -889,7 +889,7 @@ ipr2lin <- function(ipr, acc2info, prefix) {
     select(-Species.x, -Species.y)
 
   # add lookup table to iprscan file
-  lookup_tbl <- fread(input = "~/data/cln_lookup_tbl.tsv",
+  lookup_tbl <- fread(input = "~/awasyn/new_trial/cln_lookup_tbl.tsv",
                       sep = "\t", header = T, fill = T) %>%
     distinct()
   if ("AccNum.x" %in% names(ipr_lin)) {
@@ -1019,7 +1019,7 @@ ipr2da <- function(infile_ipr, prefix,
 
   # save domarch_lins file
   write_tsv(domarch_lins,
-            file = paste0(prefix, ".ipr_domarch.tsv"),
+            file = paste0(prefix, "/", prefix, ".ipr_domarch.tsv"),
             append = F, na = "NA"
   )
 
@@ -1032,13 +1032,13 @@ append_ipr <- function(ipr_da, blast, prefix) {
   # ! an 'AccNum' or 'AccNum.noV' column is required in blast table for joining !
   blast_out <- read_tsv(blast, col_names = T)
   if ("AccNum.noV" %in% colnames(blast_out)) {
-    ipr_da <- read_tsv(paste0(prefix, ".ipr_domarch.tsv"), col_names = T)
+    ipr_da <- read_tsv(paste0(prefix, "/", prefix, ".ipr_domarch.tsv"), col_names = T)
     blast_ipr <- merge(blast_out, ipr_da, by = "AccNum.noV", all.x = T)
   } else {
     blast_ipr <- merge(blast_out, ipr_da, by = "AccNum", all.x = T)
   }
 
-  write_tsv(blast_ipr, file = paste0(prefix, ".full_analysis.tsv"), na = "NA")
+  write_tsv(blast_ipr, file = paste0(prefix, "/", prefix, ".full_analysis.tsv"), na = "NA")
 }
 
 # Web BLAST output
diff --git a/inst/report/scripts/viz_utils.R b/inst/report/scripts/viz_utils.R
index d721855b..4d9fdfe6 100644
--- a/inst/report/scripts/viz_utils.R
+++ b/inst/report/scripts/viz_utils.R
@@ -10,8 +10,8 @@ library(plotly)
 
 # Function to generate the InterProScan Visualization
 generate_ipr_genes_visualization <- function(data, app_data,
-                                             input_rs_iprDatabases,
-                                             input_rs_iprVisType) {
+                                             input_rs_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"),
+                                             input_rs_iprVisType = "Analysis") {
 
     # Check if analysis is loaded
     if (nrow(data@df) == 0 || app_data@ipr_path == "") {
@@ -30,7 +30,7 @@ generate_ipr_genes_visualization <- function(data, app_data,
         n <- "Name"  # Hardcoded to "Name" based on original code
 
         # Call the `ipr2viz_web` function
-        ipr_plot <- ipr2viz_web(
+        ipr_plot <- plotIPR2VizWeb(
             infile_ipr = data@ipr_path,
             accessions = data@df$Name,
             analysis = input_rs_iprDatabases,
@@ -41,7 +41,7 @@ generate_ipr_genes_visualization <- function(data, app_data,
     } else {
 
         # Call the `ipr2viz` function with additional arguments
-        ipr_plot <- ipr2viz(
+        ipr_plot <- plotIPR2Viz(
             infile_ipr = data@ipr_path,
             infile_full = data@df,
             accessions = unique(data@df$Name),
@@ -79,10 +79,10 @@ generate_rs_network_layout <- function(data, app_data,
 
     # Clean up domain architecture columns in the data
     df_data <- data@df %>%
-        mutate(across(tidyselect::starts_with("DomArch"), clean_string))
+        mutate(across(tidyselect::starts_with("DomArch"), cleanString))
 
     # Generate the network using the domain_network function
-    res_network <- domain_network(
+    res_network <- createDomainNetwork(
         df_data,
         column = col,
         domains_of_interest = ".*",
@@ -260,7 +260,7 @@ get_fasta_data <- function(fasta_path) {
 }
 
 # Function to get domain sequences (assumes `data` is a predefined object)
-get_domain_data <- function() {
+get_domain_data <- function(data) {
     return(data@domainSeqs)  # Return domain sequences
 }
 
@@ -302,8 +302,7 @@ generate_query_heatmap <- function(query_data_df,
              See the FAQ for possible reasons/solutions.")
     }
 
-    # Assuming `lineage.Query.plot` is a custom function for plotting
-    lineage.Query.plot(prot, queries = queries, colname = "QueryName",
+  plotLineageQuery(prot, queries = queries, colname = "QueryName",
                        cutoff = 100, color = heatmap_color)
 }
 
@@ -404,8 +403,89 @@ generate_main_table <- function(data_df, main_select = NULL) {
     return(datatable_output)
 }
 
+total_counts <- function(prot, column = "DomArch", lineage_col = "Lineage",
+                         cutoff = 90, RowsCutoff = FALSE, digits = 2
+                         # type = "GC"
+) {
+  column <- sym(column)
+
+  prot <- select(prot, {{ column }}, {{ lineage_col }}) %>%
+    filter(!is.na({{ column }}) & !is.na({{ lineage_col }})) %>%
+    filter({{ column }} != "")
+
+  prot <- summarizeByLineage(prot, column, by = lineage_col, query = "all")
+  col_count <- prot %>%
+    group_by({{ column }}) %>%
+    summarise(totalcount = sum(count))
+
+  total <- left_join(prot, col_count, by = as_string(column))
+
+  sum_count <- sum(total$count)
+  total <- total %>%
+    mutate("IndividualCountPercent" = totalcount / sum_count * 100) %>%
+    arrange(-totalcount, -count)
+
+  cumm_percent <- total %>%
+    select({{ column }}, totalcount) %>%
+    distinct() %>%
+    mutate("CumulativePercent" = 0)
+  total_counter <- 0
+  for (x in length(cumm_percent$totalcount):1) {
+    total_counter <- total_counter + cumm_percent$totalcount[x]
+    cumm_percent$CumulativePercent[x] <- total_counter / sum_count * 100
+  }
+
+  cumm_percent <- cumm_percent %>% select(CumulativePercent, {{ column }})
+
+  total <- total %>% left_join(cumm_percent, by = as_string(column))
+
+  # Round the percentage columns
+  total$CumulativePercent <- total$CumulativePercent %>% round(digits = digits)
+  total$IndividualCountPercent <- total$IndividualCountPercent %>% round(digits = digits)
+
+  if (RowsCutoff) {
+    # If total counts is being used for plotting based on number of rows,
+    # don't include other observations that fall below the cummulative percent cutoff
+    # , but that have the same 'totalcount' number as the cutoff observation
+    total <- total %>% filter(CumulativePercent >= 100 - cutoff - .0001)
+    return(total)
+  }
+
+  # Include observations that fall below the cummulative percent cutoff,
+  # but that have the same 'totalcount' as the cutoff observation
+  t <- total %>% filter(CumulativePercent >= 100 - cutoff)
+  if (length(t) == 0) {
+    cutoff_count <- 0
+  } else {
+    cutoff_count <- t$totalcount[nrow(t)]
+  }
+
+  total <- total %>%
+    filter(totalcount >= cutoff_count) %>%
+    ungroup()
+
+  return(total)
+}
+
+DA_TotalCounts <- function(DA_Prot, DACutoff, DA_col, app_data) {
+  # Check if ipr_path is not empty
+  if (app_data@ipr_path == "") {
+    stop("ipr_path is missing.")
+  }
+
+  # Calculate total counts with the specified cutoff and column
+  prot_tc <- total_counts(DA_Prot, cutoff = DACutoff, column = DA_col)
+
+  # Replace all instances of ">" in the Lineage column with "_"
+  prot_tc$Lineage <- map(prot_tc$Lineage, ~ str_replace_all(.x, ">", "_")) %>%
+    unlist()
+
+  # Return the processed data
+  prot_tc
+}
+
 # Function to generate Domain Architecture Linear Table
-generate_DA_lin_table <- function(DA_col, ipr_path, DAlin_count_table_DT) {
+generate_DA_lin_table <- function(DA_col, ipr_path, DA_TotalCounts_value) {
     # Check if ipr_path is valid
     if (ipr_path == "") {
         stop("InterPro path is empty.")
@@ -416,8 +496,39 @@ generate_DA_lin_table <- function(DA_col, ipr_path, DAlin_count_table_DT) {
         stop("DA_col input is required.")
     }
 
+    da_col <- sym(DA_col)
+
+    # Perform the data transformation
+    DA_TotalCounts_value %>%
+        group_by({{ da_col }}, totalcount, CumulativePercent) %>%
+        summarize(LineageCount = n()) %>%
+        select({{ da_col }}, LineageCount, totalcount, CumulativePercent) %>%
+        arrange(-totalcount)
+
     # Generate the DAlin count table
-    DAlin_table <- DAlin_count_table_DT()
+    DAlin_table <- DT::datatable(
+      DA_TotalCounts_value,
+      selection = "single",
+      extensions = c("Buttons"),
+      options = list(
+        pageLength = 25,
+        dom = "frlBtip",
+        buttons = list(
+          list(
+            extend = "csv",
+            text = "Download",
+            filename = "MolEvolvR_domarch",
+            exportOptions = list(
+              modifier = list(page = "all")
+            )
+          )
+        ),
+        scrollX = FALSE,
+        paging = TRUE,
+        fixedHeader = FALSE,
+        fixedColumns = list(leftColumns = 0, rightColumns = 0)
+      )
+    )
 
     return(DAlin_table)
 }
@@ -425,19 +536,19 @@ generate_DA_lin_table <- function(DA_col, ipr_path, DAlin_count_table_DT) {
 # Function to generate the Domain Architecture Lineage Plot
 generate_DA_heatmap_plot <- function(DA_col, DACutoff,
                                      DA_Prot, DA_lin_color,
-                                     analysis_loaded, ipr_path) {
+                                     ipr_path) {
     # Check if ipr_path is valid
     if (ipr_path == "") {
         stop("InterPro path is empty.")
     }
 
     # Filter the protein data for plotting
-    prot <- DA_Prot() %>%
+    prot <- DA_Prot %>%
         filter(Lineage != "") %>%
         drop_na(Lineage)
 
     # Create the plot
-    plot <- lineage.DA.plot(
+    plot <- plotLineageDA(
         prot,
         colname = DA_col,
         cutoff = DACutoff,
@@ -461,7 +572,7 @@ generate_domain_network <- function(DA_col, DACutoff, DA_Prot,
     }
 
     # Prepare the selected protein data
-    dn_data <- DA_Prot()
+    dn_data <- DA_Prot
     col <- sym(DA_col)  # Convert DA_col to a symbol for use with dplyr
     dn_data[[col]] <- str_replace_all(dn_data[[col]], " ", "_")
 
@@ -471,7 +582,7 @@ generate_domain_network <- function(DA_col, DACutoff, DA_Prot,
         filter(!!col != "")
 
     # Generate the domain network
-    res <- domain_network(
+    res <- createDomainNetwork(
         prot = dn_data,
         column = col,
         domains_of_interest = ".*",
@@ -517,21 +628,34 @@ generate_domain_network <- function(DA_col, DACutoff, DA_Prot,
 }
 
 # Function to retrieve and clean Domain Architecture data
-get_DA_Prot <- function(app_data, validate_da, DASelect) {
+get_DA_Prot <- function(app_data, DASelect) {
     # Check if the ipr_path is valid
     if (app_data@ipr_path == "") {
         stop("InterPro path is empty.")
     }
 
     # Validate domain architecture
-    validate_da()
+    # Check if ipr_path is not empty
+    if (app_data@ipr_path == "") {
+        stop("ipr_path is missing.")
+    }
+
+    # Check if the data frame has rows
+    if (nrow(app_data@df) <= 0) {
+        stop("No data available. Please ensure you have uploaded your data correctly. See Help documentation or contact JRaviLab (janani.ravi[AT]cuanschutz[DOT]edu).")
+    }
+
+    # Check if there are domain architecture columns
+    # if (length(domarch_cols) == 0) {
+    #    stop("Please ensure uploaded data has domain architecture columns.")
+    # }
 
     # Retrieve app data and clean domain architecture columns
     df_app_data <- app_data@df
 
     # Domain architecture column cleanup
     df_app_data <- df_app_data %>%
-        mutate(across(starts_with("DomArch"), clean_string))
+        mutate(across(starts_with("DomArch"), cleanString))
 
     # Filter based on user selection
     if (DASelect == "All") {
@@ -612,7 +736,7 @@ generate_da_ipr_genes_plot <- function(app_data, da_iprDatabases,
         name_column <- "Name"
 
         # Generate the plot using the web version
-        plot <- ipr2viz_web(
+        plot <- plotIPR2VizWeb(
             infile_ipr = app_data@ipr_path,
             accessions = df$Name,
             analysis = da_iprDatabases,
@@ -621,7 +745,7 @@ generate_da_ipr_genes_plot <- function(app_data, da_iprDatabases,
         )
     } else {
         # Generate the plot using the local version
-        plot <- ipr2viz(
+        plot <- plotIPR2Viz(
             infile_ipr = app_data@ipr_path,
             infile_full = df,
             accessions = unique(df$Name),
@@ -637,11 +761,6 @@ generate_da_ipr_genes_plot <- function(app_data, da_iprDatabases,
 
 # Function to filter proteins for phylogeny
 filter_phylogeny_proteins <- function(app_data, phylo_select) {
-    # Validate the input app_data
-    if (!analysis_loaded()) {
-        stop("Analysis not loaded.")
-    }
-
     # Get the data frame from app_data
     df <- app_data@df
 
@@ -740,3 +859,38 @@ acc_to_name <- function(app_data) {
     return(df)
 }
 
+rep_accnums <- function(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data) {
+  # If `phylo` is true, return all `AccNum` values from `app_data`
+  if (phylo) {
+    return(app_data@df$AccNum)
+  } else {
+    # Switch based on `msa_reduce_by` value
+    tmp <- filter(app_data@df, QueryName == PhyloSelect)
+    rep_acc_species <- RepresentativeAccNums(tmp, reduced = "Species", column = "AccNum")
+
+    # Get representative accession numbers by "Lineage"
+    rep_acc_lineage <- RepresentativeAccNums(tmp, reduced = "Lineage", column = "AccNum")
+    switch(msa_reduce_by,
+           "Species" = rep_acc_species,
+           "Lineage" = rep_acc_lineage,
+           "DomArch" = {
+             # Check if `ipr_path` is not empty
+             if (app_data@ipr_path == "") {
+               stop("ipr_path is missing.")
+             }
+             # Find top sequences by accession number based on criteria
+             seqs <- getTopAccByLinDomArch(infile_full = app_data@df, n = msa_rep_num, query = PhyloSelect)
+             if (is.null(seqs)) {
+               stop("No sequences found.")
+             }
+             return(seqs)
+           },
+           "Full" = {
+             # Filter data for matching `QueryName` and return `AccNum`
+             tmp <- app_data@df %>% filter(QueryName == PhyloSelect)
+             return(tmp$AccNum)
+           }
+    )
+  }
+}
+

From 57f8cc53ade1591d7922eae23da81447950a2c47 Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Mon, 18 Nov 2024 11:02:38 +0200
Subject: [PATCH 05/23] fix bugs

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 R/ipr2viz.R                                  |   8 +-
 R/networks_domarch.R                         |  20 +--
 R/pre-msa-tree.R                             |  70 ++++----
 R/summarize.R                                | 176 +++++++++----------
 inst/report/report_template.Rmd              |  28 +--
 inst/report/scripts/generate_report.R        |  24 +--
 inst/report/scripts/run_molevolvr_pipeline.R |  11 +-
 inst/report/scripts/viz_utils.R              |  46 +++--
 8 files changed, 212 insertions(+), 171 deletions(-)

diff --git a/R/ipr2viz.R b/R/ipr2viz.R
index e582ab09..a4769ed6 100644
--- a/R/ipr2viz.R
+++ b/R/ipr2viz.R
@@ -94,14 +94,14 @@ getTopAccByLinDomArch <- function(infile_full,
     cln_domarch <- cln %>% select(domarch_cols)
     col_counts <- colSums(is.na(cln_domarch))
     DA_sym <- sym(names(which.min(col_counts)))
-    showNotification(paste0("Selecting representatives by unique ", DA_sym, " and lineage combinations"))
+    # showNotification(paste0("Selecting representatives by unique ", DA_sym, " and lineage combinations"))
     ## Group by Lineage, DomArch and reverse sort by group counts
     grouped <- cln %>%
         group_by({{ DA_sym }}, {{ lin_sym }}) %>%
         arrange(desc(PcPositive)) %>%
         summarise(count = n(), AccNum = dplyr::first(AccNum)) %>%
         arrange(-count) %>%
-        filter({{ lin_sym }} != "" && {{ DA_sym }} != "")
+        filter({{ lin_sym }} != "" & {{ DA_sym }} != "")
     top_acc <- grouped$AccNum[1:n]
     top_acc <- na.omit(top_acc)
     return(top_acc)
@@ -180,7 +180,7 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
     ipr_out <- read_tsv(infile_ipr, col_names = T, col_types = MolEvolvR::iprscan_cols)
     ipr_out <- ipr_out %>% filter(.data$Name %in% accessions)
     analysis_cols <- paste0("DomArch.", analysis)
-    infile_full <- infile_full %>% select(.data$analysis_cols, .data$Lineage_short, .data$QueryName, .data$PcPositive, .data$AccNum)
+    infile_full <- infile_full %>% select(analysis_cols, .data$Lineage_short, .data$QueryName, .data$PcPositive, .data$AccNum)
     ## To filter by Analysis
     analysis <- paste(analysis, collapse = "|")
     ## @SAM: This can't be set in stone since the analysis may change!
@@ -212,7 +212,7 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
     analysis_labeler <- analyses %>%
         pivot_wider(names_from = .data$Analysis, values_from = .data$Analysis)
 
-    lookup_tbl_path <- "/data/research/jravilab/common_data/cln_lookup_tbl.tsv"
+    lookup_tbl_path <- "~/awasyn/new_trial/cln_lookup_tbl.tsv"
     lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = MolEvolvR::lookup_table_cols)
 
     lookup_tbl <- lookup_tbl %>% select(-.data$ShortName) # Already has ShortName -- Just needs SignDesc
diff --git a/R/networks_domarch.R b/R/networks_domarch.R
index ae6fe8be..e4da7aca 100755
--- a/R/networks_domarch.R
+++ b/R/networks_domarch.R
@@ -24,16 +24,16 @@
 #' A network of domains is returned based on shared domain architectures.
 #'
 #' @param prot A data frame that contains the column 'DomArch'.
-#' @param column Name of column containing Domain architecture from which nodes 
+#' @param column Name of column containing Domain architecture from which nodes
 #' and edges are generated.
 #' @param domains_of_interest Character vector specifying domains of interest.
-#' @param cutoff Integer. Only use domains that occur at or above the cutoff for 
+#' @param cutoff Integer. Only use domains that occur at or above the cutoff for
 #' total counts if cutoff_type is "Total Count".
-#' Only use domains that appear in cutoff or greater lineages if cutoff_type is 
+#' Only use domains that appear in cutoff or greater lineages if cutoff_type is
 #' Lineage.
 #' @param layout Character. Layout type to be used for the network. Options are:
 #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"}
-#' @param query_color Character. Color to represent the queried domain in the 
+#' @param query_color Character. Color to represent the queried domain in the
 #' network.
 #'
 #' @importFrom dplyr across add_row all_of distinct filter mutate pull select
@@ -211,7 +211,7 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c
                 visOptions(highlightNearest = TRUE)
         },
         error = function(e) {
-            showNotification(toString(e))
+            # showNotification(toString(e))
             vis_g <- "error"
         },
         finally = {
@@ -231,18 +231,18 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c
 #'
 #'
 #' @param prot A data frame that contains the column 'DomArch'.
-#' @param column Name of column containing Domain architecture from which nodes 
+#' @param column Name of column containing Domain architecture from which nodes
 #' and edges are generated.
 #' @param domains_of_interest Character vector specifying the domains of interest.
-#' @param cutoff Integer. Only use domains that occur at or above the cutoff for 
+#' @param cutoff Integer. Only use domains that occur at or above the cutoff for
 #' total counts if cutoff_type is "Total Count".
-#' Only use domains that appear in cutoff or greater lineages if cutoff_type is 
+#' Only use domains that appear in cutoff or greater lineages if cutoff_type is
 #' Lineage.
 #' @param layout Character. Layout type to be used for the network. Options are:
 #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"}
-#' @param query_color Color that the nodes of the domains in the 
+#' @param query_color Color that the nodes of the domains in the
 #' domains_of_interest vector are colored
-#' @param partner_color Color that the nodes that are not part of the 
+#' @param partner_color Color that the nodes that are not part of the
 #' domains_of_interest vector are colored
 #' @param border_color Color for the borders of the nodes.
 #' @param IsDirected Is the network directed? Set to false to eliminate arrows
diff --git a/R/pre-msa-tree.R b/R/pre-msa-tree.R
index 75cc375d..e15e7996 100644
--- a/R/pre-msa-tree.R
+++ b/R/pre-msa-tree.R
@@ -46,7 +46,7 @@ api_key <- Sys.getenv("ENTREZ_API_KEY", unset = "YOUR_KEY_HERE")
 #' @param y Delimitter. Default is space (" ").
 #'
 #' @importFrom rlang abort
-#' 
+#'
 #' @return A character vector in title case.
 #' @export
 #'
@@ -112,21 +112,21 @@ addLeaves2Alignment <- function(aln_file = "",
     lin_file = "data/rawdata_tsv/all_semiclean.txt", # !! finally change to all_clean.txt!!
     # lin_file="data/rawdata_tsv/PspA.txt",
     reduced = FALSE) {
-    
+
     #Check if the alignment file is provided and exists
     if (nchar(aln_file) == 0) {
         abort("Error: Alignment file path must be provided.")
     }
-    
+
     if (!file.exists(aln_file)) {
         abort(paste("Error: The alignment file '", aln_file, "' does not exist."))
     }
-    
+
     # Check if the lineage file exists
     if (!file.exists(lin_file)) {
         abort(paste("Error: The lineage file '", lin_file, "' does not exist."))
     }
-    
+
     # Check that the 'reduced' parameter is logical
     if (!is.logical(reduced) || length(reduced) != 1) {
         abort("Error: 'reduced' must be a single logical value (TRUE or FALSE).")
@@ -249,15 +249,15 @@ addName <- function(data,
     if (!is.data.frame(data)) {
         abort("Error: The input 'data' must be a data frame")
     }
-    
+
     # Check that the specified columns exist in the data
     required_cols <- c(accnum_col, spec_col, lin_col)
     missing_cols <- setdiff(required_cols, names(data))
     if (length(missing_cols) > 0) {
-        abort(paste("Error: The following columns are missing from the data:", 
+        abort(paste("Error: The following columns are missing from the data:",
                    paste(missing_cols, collapse = ", ")))
     }
-    
+
     cols <- c(accnum_col, "Kingdom", "Phylum", "Genus", "Spp")
     split_data <- data %>%
         separate(
@@ -347,16 +347,16 @@ convertAlignment2FA <- function(aln_file = "",
     if (nchar(aln_file) == 0) {
         abort("Error: Alignment file path must be provided.")
     }
-    
+
     if (!file.exists(aln_file)) {
         abort(paste("Error: The alignment file '", aln_file, "' does not exist."))
     }
-    
+
     # Check if the lineage file exists
     if (!file.exists(lin_file)) {
         abort(paste("Error: The lineage file '", lin_file, "' does not exist."))
     }
-    
+
     # Check that the 'reduced' parameter is logical
     if (!is.logical(reduced) || length(reduced) != 1) {
         abort("Error: 'reduced' must be a single logical value (TRUE or FALSE).")
@@ -424,14 +424,14 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") {
     if (!is.data.frame(acc2name)) {
         abort("Error: acc2name must be a data frame.")
     }
-    
+
     # Check if the specified columns exist in the data frame
     if (!(acc_col %in% colnames(acc2name))) {
-        abort("Error: The specified acc_col '", acc_col, "' does not exist in 
+        abort("Error: The specified acc_col '", acc_col, "' does not exist in
              acc2name.")
     }
     if (!(name_col %in% colnames(acc2name))) {
-        abort("Error: The specified name_col '", name_col, "' does not exist in 
+        abort("Error: The specified name_col '", name_col, "' does not exist in
              acc2name.")
     }
 
@@ -475,7 +475,7 @@ rename_fasta <- function(fa_path, outpath,
         abort("Error: The input FASTA file does not exist at the specified
              path: ", fa_path)
     }
-    
+
     # Check if the output path is writable
     outdir <- dirname(outpath)
     if (!dir.exists(outdir)) {
@@ -541,20 +541,20 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"),
     reduced = F) {
     # Check if the alignment path exists
     if (!dir.exists(aln_path)) {
-        abort("Error: The alignment directory does not exist at the specified 
+        abort("Error: The alignment directory does not exist at the specified
              path: ", aln_path)
     }
-    
+
     # Check if the output path exists; if not, attempt to create it
     if (!dir.exists(fa_outpath)) {
         dir.create(fa_outpath, recursive = TRUE)
-        message("Note: The output directory did not exist and has been created: ", 
+        message("Note: The output directory did not exist and has been created: ",
                 fa_outpath)
     }
-    
+
     # Check if the linear file exists
     if (!file.exists(lin_file)) {
-        abort("Error: The linear file does not exist at the specified path: ", 
+        abort("Error: The linear file does not exist at the specified path: ",
              lin_file)
     }
     # library(here)
@@ -626,7 +626,7 @@ acc2FA <- function(accessions, outpath, plan = "sequential") {
     if (!is.character(accessions) || length(accessions) == 0) {
         abort("Error: 'accessions' must be a non-empty character vector.")
     }
-    
+
     if (!dir.exists(dirname(outpath))) {
         abort("Error: The output directory does not exist: ", dirname(outpath))
     }
@@ -676,7 +676,7 @@ acc2FA <- function(accessions, outpath, plan = "sequential") {
                 id = accessions_partitioned[[x]],
                 db = "protein",
                 rettype = "fasta",
-                api_key = Sys.getenv("ENTREZ_API_KEY")
+                #api_key = Sys.getenv("ENTREZ_API_KEY")
             )
         )
     })
@@ -732,21 +732,21 @@ acc2FA <- function(accessions, outpath, plan = "sequential") {
 createRepresentativeAccNum <- function(prot_data,
     reduced = "Lineage",
     accnum_col = "AccNum") {
-    
+
     # Validate input
     if (!is.data.frame(prot_data)) {
         abort("Error: 'prot_data' must be a data frame.")
     }
-    
+
     # Check if the reduced column exists in prot_data
     if (!(reduced %in% colnames(prot_data))) {
-        abort("Error: The specified reduced column '", reduced, "' does not 
+        abort("Error: The specified reduced column '", reduced, "' does not
              exist in the data frame.")
     }
-    
+
     # Check if the accnum_col exists in prot_data
     if (!(accnum_col %in% colnames(prot_data))) {
-        abort("Error: The specified accession number column '", accnum_col, "' 
+        abort("Error: The specified accession number column '", accnum_col, "'
              does not exist in the data frame.")
     }
     # Get Unique reduced column and then bind the AccNums back to get one AccNum per reduced column
@@ -808,10 +808,10 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) {
     if (!file.exists(fasta_file)) {
         abort("Error: The FASTA file does not exist: ", fasta_file)
     }
-    
-    if (file_ext(fasta_file) != "fasta" && file_ext(fasta_file) != "fa") {
-        abort("Error: The specified file is not a valid FASTA file: ", fasta_file)
-    }
+
+    # if (file_ext(fasta_file) != "fasta" && file_ext(fasta_file) != "fa") {
+    #    abort("Error: The specified file is not a valid FASTA file: ", fasta_file)
+    # }
     fasta <- readAAStringSet(fasta_file)
 
     aligned <- switch(tool,
@@ -857,23 +857,23 @@ writeMSA_AA2FA <- function(alignment, outpath) {
     if (!inherits(alignment, "AAMultipleAlignment")) {
         abort("Error: The alignment must be of type 'AAMultipleAlignment'.")
     }
-    
+
     # Check the output path is a character string
     if (!is.character(outpath) || nchar(outpath) == 0) {
         abort("Error: Invalid output path specified.")
     }
-    
+
     # Check if the output directory exists
     outdir <- dirname(outpath)
     if (!dir.exists(outdir)) {
         abort("Error: The output directory does not exist: ", outdir)
     }
 
-    l <- length(rownames(alignment))
+    l <- length(names(unmasked(alignment)))
     fasta <- ""
     for (i in 1:l)
     {
-        fasta <- paste0(fasta, paste(">", rownames(alignment)[i]), "\n")
+        fasta <- paste0(fasta, paste(">", names(unmasked(alignment)[i])), "\n")
         seq <- toString(unmasked(alignment)[[i]])
         fasta <- paste0(fasta, seq, "\n")
     }
diff --git a/R/summarize.R b/R/summarize.R
index e76a86da..350c533d 100644
--- a/R/summarize.R
+++ b/R/summarize.R
@@ -41,18 +41,18 @@ filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remo
     # Any row containing a domain in doms_remove will be removed
 
     # ^word$|(?<=\+)word$|(?<=\+)word(?=\+)|word(?=\+)
-    
+
     # Check if prot is a data frame
     if (!is.data.frame(prot)) {
         abort("Error: 'prot' must be a data frame.")
     }
-    
+
     # Check if the specified column exists in the data frame
     if (!column %in% names(prot)) {
-        abort(paste("Error: The specified column '", column, "' does not exist 
+        abort(paste("Error: The specified column '", column, "' does not exist
                    in the data frame.", sep = ""))
     }
-    
+
     # If doms_keep or doms_remove are not provided, inform the user
     if (length(doms_keep) == 0 && length(doms_remove) == 0) {
         warning("Warning: No domains specified to keep or remove. Returning the
@@ -109,25 +109,25 @@ filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remo
 #' countByColumn
 #' @description
 #' Function to obtain element counts (DA, GC)
-#' 
-#' @param prot A data frame containing the dataset to analyze, typically with 
+#'
+#' @param prot A data frame containing the dataset to analyze, typically with
 #' multiple columns including the one specified by the `column` parameter.
-#' @param column A character string specifying the name of the column to analyze. 
+#' @param column A character string specifying the name of the column to analyze.
 #' The default is "DomArch".
-#' @param min.freq An integer specifying the minimum frequency an element must 
+#' @param min.freq An integer specifying the minimum frequency an element must
 #' have to be included in the output. Default is 1.
 #'
 #' @importFrom dplyr arrange as_tibble filter select
 #'
 #' @return A tibble with two columns:
 #' \describe{
-#'   \item{`column`}{The unique elements from the specified column 
+#'   \item{`column`}{The unique elements from the specified column
 #'   (e.g., "DomArch").}
-#'   \item{`freq`}{The frequency of each element, i.e., the number of times 
+#'   \item{`freq`}{The frequency of each element, i.e., the number of times
 #'   each element appears in the specified column.}
 #' }
-#' The tibble is filtered to only include elements that have a frequency 
-#' greater than or equal to `min.freq` and does not include elements with `NA` 
+#' The tibble is filtered to only include elements that have a frequency
+#' greater than or equal to `min.freq` and does not include elements with `NA`
 #' values or those starting with a hyphen ("-").
 #' @export
 #'
@@ -136,20 +136,20 @@ filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remo
 #' countByColumn(prot = my_data, column = "DomArch", min.freq = 10)
 #' }
 countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) {
-    
+
     # Check if 'prot' is a data frame
     if (!is.data.frame(prot)) {
         abort("Error: 'prot' must be a data frame.")
     }
-    
+
     # Check if the specified column exists in the data frame
     if (!column %in% names(prot)) {
         abort(paste("Error: The specified column '", column, "' does not exist in
                    the data frame.", sep = ""))
     }
-    
+
     # Check if min.freq is a positive integer
-    if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 || 
+    if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 ||
         floor(min.freq) != min.freq) {
         abort("Error: 'min.freq' must be a positive integer.")
     }
@@ -171,14 +171,14 @@ countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) {
 #' Break string ELEMENTS into WORDS for domain architecture (DA) and genomic
 #' context (GC)
 #'
-#' @param prot A dataframe containing the dataset to analyze. The specified 
+#' @param prot A dataframe containing the dataset to analyze. The specified
 #' `column` contains the string elements to be processed.
-#' @param column A character string specifying the name of the column to analyze. 
+#' @param column A character string specifying the name of the column to analyze.
 #' Default is "DomArch".
-#' @param conversion_type A character string specifying the type of conversion. 
+#' @param conversion_type A character string specifying the type of conversion.
 #' Two options are available:
 #' \describe{
-#'   \item{`da2doms`}{Convert domain architectures into individual domains by 
+#'   \item{`da2doms`}{Convert domain architectures into individual domains by
 #'   replacing `+` symbols with spaces.}
 #'   \item{`gc2da`}{Convert genomic context into domain architectures by
 #'    replacing directional symbols (`<-`, `->`, and `|`) with spaces.}
@@ -187,13 +187,13 @@ countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) {
 #' @importFrom dplyr pull
 #' @importFrom stringr str_replace_all
 #'
-#' @return A single string where elements are delimited by spaces. The function 
-#' performs necessary substitutions based on the `conversion_type` and cleans up 
+#' @return A single string where elements are delimited by spaces. The function
+#' performs necessary substitutions based on the `conversion_type` and cleans up
 #' extraneous characters like newlines, tabs, and multiple spaces.
 #'
 #' @examples
 #' \dontrun{
-#' tibble::tibble(DomArch = c("aaa+bbb", 
+#' tibble::tibble(DomArch = c("aaa+bbb",
 #' "a+b", "b+c", "b-c")) |> elements2Words()
 #' }
 #'
@@ -202,20 +202,20 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms"
     if (!is.data.frame(prot)) {
         abort("Error: 'prot' must be a data frame.")
     }
-    
+
     # Check if the specified column exists in the data frame
     if (!column %in% names(prot)) {
-        abort(paste("Error: The specified column '", column, "' does not exist in 
+        abort(paste("Error: The specified column '", column, "' does not exist in
                    the data frame.", sep = ""))
     }
-    
+
     # Check for valid conversion_type values
     valid_types <- c("da2doms", "doms2da")
     if (!conversion_type %in% valid_types) {
-        abort(paste("Error: Invalid 'conversion_type'. Must be one of:", 
+        abort(paste("Error: Invalid 'conversion_type'. Must be one of:",
                    paste(valid_types, collapse = ", ")))
     }
-    
+
     z1 <- prot %>%
         dplyr::pull(column) %>%
         str_replace_all("\\,", " ") %>%
@@ -252,20 +252,20 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms"
 #' @description
 #' Get word counts (wc) [DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)]
 #'
-#' @param string A character string containing the elements (words) to count. 
-#' This would typically be a space-delimited string representing domain 
+#' @param string A character string containing the elements (words) to count.
+#' This would typically be a space-delimited string representing domain
 #' architectures or genomic contexts.
 #'
 #' @importFrom dplyr as_tibble filter arrange
 #' @importFrom stringr str_replace_all
 #'
-#' @return A tibble (tbl_df) with two columns: 
+#' @return A tibble (tbl_df) with two columns:
 #' \describe{
-#'   \item{`words`}{A column containing the individual words 
+#'   \item{`words`}{A column containing the individual words
 #'   (domains or domain architectures).}
 #'   \item{`freq`}{A column containing the frequency counts for each word.}
 #' }
-#' 
+#'
 #'
 #' @examples
 #' \dontrun{
@@ -279,7 +279,7 @@ words2WordCounts <- function(string) {
     if (!is.character(string) || length(string) != 1) {
         abort("Error: 'string' must be a single character vector.")
     }
-    
+
     df_word_count <- string %>%
         # reduce spaces with length 2 or greater to a single space
         str_replace_all("\\s{2,}", " ") %>%
@@ -311,14 +311,14 @@ words2WordCounts <- function(string) {
 #' filterByFrequency
 #' @description
 #' Function to filter based on frequencies
-#' 
-#' @param x A tibble (tbl_df) containing at least two columns: one for 
+#'
+#' @param x A tibble (tbl_df) containing at least two columns: one for
 #' elements (e.g., `words`) and one for their frequency (e.g., `freq`).
-#' @param min.freq A numeric value specifying the minimum frequency threshold. 
-#' Only elements with frequencies greater than or equal to this value will be 
+#' @param min.freq A numeric value specifying the minimum frequency threshold.
+#' Only elements with frequencies greater than or equal to this value will be
 #' retained.
 #'
-#' @return A tibble with the same structure as `x`, but filtered to include 
+#' @return A tibble with the same structure as `x`, but filtered to include
 #' only rows where the frequency is greater than or equal to `min.freq`.
 #'
 #' @export
@@ -328,18 +328,18 @@ words2WordCounts <- function(string) {
 #' filterByFrequency()
 #' }
 filterByFrequency <- function(x, min.freq) {
-    
+
     # Check if 'x' is a data frame
     if (!is.data.frame(x)) {
         abort("Error: 'x' must be a data frame.")
     }
-    
+
     # Check if 'min.freq' is a positive integer
-    if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 || 
+    if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 ||
         floor(min.freq) != min.freq) {
         abort("Error: 'min.freq' must be a positive integer.")
     }
-    
+
     # Check if the 'freq' column exists in the data frame
     if (!"freq" %in% names(x)) {
         abort("Error: The data frame must contain a 'freq' column.")
@@ -355,24 +355,24 @@ filterByFrequency <- function(x, min.freq) {
 #' @name MolEvolvR_summary
 #' @description
 #' A collection of summary functions for the MolEvolvR package.
-#' 
+#'
 NULL
 
 #' summarizeByLineage
 #'
 #' @param prot A dataframe or tibble containing the data.
-#' @param column A string representing the column to be summarized 
+#' @param column A string representing the column to be summarized
 #' (e.g., `DomArch`). Default is "DomArch".
-#' @param by A string representing the grouping column (e.g., `Lineage`). 
+#' @param by A string representing the grouping column (e.g., `Lineage`).
 #' Default is "Lineage".
-#' @param query A string specifying the query pattern for filtering the target 
+#' @param query A string specifying the query pattern for filtering the target
 #' column. Use "all" to skip filtering and include all rows.
 #'
 #' @importFrom dplyr arrange filter group_by summarise
 #' @importFrom rlang sym
 #'
-#' @return A tibble summarizing the counts of occurrences of elements in 
-#' the `column`, grouped by the `by` column. The result includes the number 
+#' @return A tibble summarizing the counts of occurrences of elements in
+#' the `column`, grouped by the `by` column. The result includes the number
 #' of occurrences (`count`) and is arranged in descending order of count.
 #' @rdname MolEvolvR_summary
 #' @export
@@ -390,19 +390,19 @@ summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage"
     if (!is.data.frame(prot)) {
         abort("Error: 'prot' must be a data frame.")
     }
-    
+
     # Check if the specified column exists in the data frame
     if (!column %in% names(prot)) {
-        abort(paste("Error: The specified column '", column, "' does not exist in 
+        abort(paste("Error: The specified column '", column, "' does not exist in
                    the data frame.", sep = ""))
     }
-    
+
     # Check if the 'by' column exists in the data frame
     if (!by %in% names(prot)) {
-        abort(paste("Error: The specified 'by' column '", by, "' does not exist 
+        abort(paste("Error: The specified 'by' column '", by, "' does not exist
                    n the data frame.", sep = ""))
     }
-    
+
     column <- sym(column)
     by <- sym(by)
     if (query == "all") {
@@ -427,15 +427,15 @@ summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage"
 #' Function to summarize and retrieve counts by Domains & Domains+Lineage
 #'
 #'
-#' @param x A dataframe or tibble containing the data. It must have columns 
+#' @param x A dataframe or tibble containing the data. It must have columns
 #' named `DomArch` and `Lineage`.
 #'
 #' @importFrom dplyr arrange count desc filter group_by summarise
 #'
-#' @return A tibble summarizing the counts of unique domain architectures 
-#' (`DomArch`) per lineage (`Lineage`). The resulting table contains three 
-#' columns: `DomArch`, `Lineage`, and `count`, which indicates the frequency 
-#' of each domain architecture for each lineage. The results are arranged in 
+#' @return A tibble summarizing the counts of unique domain architectures
+#' (`DomArch`) per lineage (`Lineage`). The resulting table contains three
+#' columns: `DomArch`, `Lineage`, and `count`, which indicates the frequency
+#' of each domain architecture for each lineage. The results are arranged in
 #' descending order of `count`.
 #' @rdname MolEvolvR_summary
 #'
@@ -450,13 +450,13 @@ summarizeDomArch_ByLineage <- function(x) {
     if (!is.data.frame(x)) {
         abort("Error: 'x' must be a data frame.")
     }
-    
+
     # Check if required columns exist in the data frame
     required_columns <- c("DomArch", "Lineage")
     missing_columns <- setdiff(required_columns, names(x))
-    
+
     if (length(missing_columns) > 0) {
-        abort(paste("Error: The following required columns are 
+        abort(paste("Error: The following required columns are
                    missing:", paste(missing_columns, collapse = ", ")))
     }
     x %>%
@@ -472,16 +472,16 @@ summarizeDomArch_ByLineage <- function(x) {
 #' @description
 #' Function to retrieve counts of how many lineages a DomArch appears in
 #'
-#' @param x A dataframe or tibble containing the data. It must have a column 
-#' named `DomArch` and a count column, such as `count`, which represents the 
+#' @param x A dataframe or tibble containing the data. It must have a column
+#' named `DomArch` and a count column, such as `count`, which represents the
 #' occurrences of each architecture in various lineages.
 #'
 #' @importFrom dplyr arrange group_by filter summarise
 #'
-#' @return A tibble summarizing each unique `DomArch`, along with the following 
+#' @return A tibble summarizing each unique `DomArch`, along with the following
 #' columns:
 #' - `totalcount`: The total occurrences of each `DomArch` across all lineages.
-#' - `totallin`: The total number of unique lineages in which each `DomArch` 
+#' - `totallin`: The total number of unique lineages in which each `DomArch`
 #' appears.
 #' The results are arranged in descending order of `totallin` and `totalcount`.
 #' @rdname MolEvolvR_summary
@@ -506,17 +506,17 @@ summarizeDomArch <- function(x) {
 
 #' summarizeGenContext_ByDomArchLineage
 #'
-#' @param x A dataframe or tibble containing the data. It must have columns 
+#' @param x A dataframe or tibble containing the data. It must have columns
 #' named `GenContext`, `DomArch`, and `Lineage`.
 #'
 #' @importFrom dplyr arrange desc filter group_by n summarise
 #'
-#' @return A tibble summarizing each unique combination of `GenContext`, 
+#' @return A tibble summarizing each unique combination of `GenContext`,
 #' `DomArch`, and `Lineage`, along with the following columns:
 #' - `GenContext`: The genomic context for each entry.
 #' - `DomArch`: The domain architecture for each entry.
 #' - `Lineage`: The lineage associated with each entry.
-#' - `count`: The total number of occurrences for each combination of 
+#' - `count`: The total number of occurrences for each combination of
 #' `GenContext`, `DomArch`, and `Lineage`.
 #'
 #' The results are arranged in descending order of `count`.
@@ -573,12 +573,12 @@ summarizeGenContext_ByLineage <- function(x) {
 
 #' summarizeGenContext
 #'
-#' @param x A dataframe or tibble containing the data. It must have columns 
+#' @param x A dataframe or tibble containing the data. It must have columns
 #' named `GenContext`, `DomArch`, and `Lineage`.
 #'
 #' @importFrom dplyr arrange desc filter group_by n n_distinct summarise
 #'
-#' @return A tibble summarizing each unique combination of `GenContext` and 
+#' @return A tibble summarizing each unique combination of `GenContext` and
 #' `Lineage`, along with the following columns:
 #' - `GenContext`: The genomic context for each entry.
 #' - `Lineage`: The lineage associated with each entry.
@@ -623,13 +623,13 @@ summarizeGenContext <- function(x) {
 #' @param prot  A data frame that must contain columns:
 #' \itemize{\item Either 'GenContext' or 'DomArch.norep' \item count}
 #' @param column Character. The column to summarize, default is "DomArch".
-#' @param lineage_col Character. The name of the lineage column, default is 
+#' @param lineage_col Character. The name of the lineage column, default is
 #' "Lineage".
-#' @param cutoff Numeric. Cutoff for total count. Counts below this cutoff value 
+#' @param cutoff Numeric. Cutoff for total count. Counts below this cutoff value
 #' will not be shown. Default is 0.
-#' @param RowsCutoff Logical. If TRUE, filters based on cumulative percentage 
+#' @param RowsCutoff Logical. If TRUE, filters based on cumulative percentage
 #' cutoff. Default is FALSE.
-#' @param digits Numeric. Number of decimal places for percentage columns. 
+#' @param digits Numeric. Number of decimal places for percentage columns.
 #' Default is 2.
 #'
 #'
@@ -638,9 +638,9 @@ summarizeGenContext <- function(x) {
 #'
 #' @return A data frame with the following columns:
 #' - `{{ column }}`: Unique values from the specified column.
-#' - `totalcount`: The total count of occurrences for each unique value in 
+#' - `totalcount`: The total count of occurrences for each unique value in
 #' the specified column.
-#' - `IndividualCountPercent`: The percentage of each `totalcount` relative to 
+#' - `IndividualCountPercent`: The percentage of each `totalcount` relative to
 #' the overall count.
 #' - `CumulativePercent`: The cumulative percentage of total counts.
 #' @rdname MolEvolvR_summary
@@ -661,28 +661,28 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col
     if (!is.data.frame(prot)) {
         abort("Error: 'prot' must be a data frame.")
     }
-    
+
     # Check if the specified columns exist in the data frame
     required_columns <- c(column, lineage_col)
     missing_columns <- setdiff(required_columns, names(prot))
-    
+
     if (length(missing_columns) > 0) {
-        abort(paste("Error: The following required columns are missing:", 
+        abort(paste("Error: The following required columns are missing:",
                    paste(missing_columns, collapse = ", ")))
     }
-    
+
     # Check that cutoff is a numeric value between 0 and 100
     if (!is.numeric(cutoff) || length(cutoff) != 1 || cutoff < 0 || cutoff > 100) {
         abort("Error: 'cutoff' must be a numeric value between 0 and 100.")
     }
-    
+
     # Check that digits is a non-negative integer
-    if (!is.numeric(digits) || length(digits) != 1 || digits < 0 || 
+    if (!is.numeric(digits) || length(digits) != 1 || digits < 0 ||
         floor(digits) != digits) {
         abort("Error: 'digits' must be a non-negative integer.")
     }
-    
-    column <- sym(column)
+
+    # column <- sym(column)
 
     prot <- select(prot, {{ column }}, {{ lineage_col }}) %>%
         filter(!is.na({{ column }}) & !is.na({{ lineage_col }})) %>%
@@ -690,10 +690,10 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col
 
     prot <- summarizeByLineage(prot, column, by = lineage_col, query = "all")
     col_count <- prot %>%
-        group_by({{ column }}) %>%
+        group_by(!!sym(column)) %>%
         summarise(totalcount = sum(count))
 
-    total <- left_join(prot, col_count, by = as_string(column))
+    total <- left_join(prot, col_count, by = column)
 
     sum_count <- sum(total$count)
     total <- total %>%
@@ -845,7 +845,7 @@ findParalogs <- function(prot) {
     if (!is.data.frame(prot)) {
         abort("Error: 'prot' must be a data frame.")
     }
-    
+
     # Remove eukaryotes
     prot <- prot %>% filter(!grepl("^eukaryota", Lineage))
     paralogTable <- prot %>%
diff --git a/inst/report/report_template.Rmd b/inst/report/report_template.Rmd
index c42aa443..46dcf57e 100644
--- a/inst/report/report_template.Rmd
+++ b/inst/report/report_template.Rmd
@@ -353,7 +353,7 @@ if (is.null(params$DA_col)) {
 
 plot_data <- params$DA_Prot
 plot_data[[da_col]] <- str_replace_all(plot_data[[da_col]], " ", "_")
-wordcloud_element(plot_data, colname = da_col, cutoff = params$DACutoff, UsingRowsCutoff = F)
+createWordCloudElement(plot_data, colname = da_col, cutoff = params$DACutoff, UsingRowsCutoff = F)
 ```
 
 ### Interproscan Visualization
@@ -367,7 +367,7 @@ params$da_interproscan_visualization
 ```{r, echo=FALSE}
 upset_plot_data <- params$DA_Prot
 upset_plot_data[[da_col]] <- str_replace_all(upset_plot_data[[da_col]], " ", "_")
-final_plot <- upset.plot(upset_plot_data, colname = da_col, cutoff = params$DACutoff)
+final_plot <- plotUpSet(upset_plot_data, colname = da_col, cutoff = params$DACutoff)
 domains <- upset_plot_data %>% 
   dplyr::pull(da_col)
 n_unique_domains <- domains %>%
@@ -413,30 +413,33 @@ if(length(params$rep_accnums) >= 3 ) {
   if (!params$rval_phylo) {
     seqs <- readAAStringSet(params$app_data@fasta_path)
     names(seqs) <- sub(" .*", "", names(seqs))
-    query_accession <- params$app_data@df %>% filter(params$app_data@df$QueryName == params$PhyloSelect)
+    query_accession <- params$app_data@df %>% filter(!duplicated(QueryName))
     query_accession <- unique(query_accession$Query)
     query <- seqs[query_accession]
-    names(query) <- params$PhyloSelect
+    names(query) <- unique(params$app_data@df$QueryName)
     query <- AAStringSet(query)
   }
   # Generate Fasta File
   rep_fasta_path <- tempfile()
 
-  acc2fa(rep, outpath = rep_fasta_path, "sequential")
+  acc2FA(rep, outpath = rep_fasta_path, "sequential")
   rename_fasta(rep_fasta_path, rep_fasta_path,
-    replacement_function = map_acc2name,
+    replacement_function = mapAcc2Name,
     acc2name = params$acc_to_name
   )
   if (!params$rval_phylo) {
     writeXStringSet(query, rep_fasta_path, append = TRUE)
   }
-
   rep_msa_path <- tempfile()
   alignFasta(rep_fasta_path, tree_msa_tool, rep_msa_path)
 }
 ```
 
 ```{r, echo=FALSE}
+library(msa)
+library(ape)
+library(tidytree)
+library(ggtree)
 if(length(params$rep_accnums) >= 3 ) {
   seq_tree(fasta_filepath = rep_msa_path) 
  } else {"Not enough representative sequences: try changing the 'Reduce By' field."}
@@ -449,7 +452,7 @@ if(length(params$rep_accnums) >= 3 ) {
   if (!params$rval_phylo) {
     seqs <- readAAStringSet(params$app_data@fasta_path)
     names(seqs) <- sub(" .*", "", names(seqs))
-    query_accession <- params$app_data@df %>% filter(params$app_data@df$QueryName == params$PhyloSelect)
+    query_accession <- params$app_data@df %>% filter(!duplicated(QueryName))
     query_accession <- unique(query_accession$Query)
     query <- seqs[query_accession]
     names(query) <- params$PhyloSelect
@@ -458,9 +461,9 @@ if(length(params$rep_accnums) >= 3 ) {
 
   # Generate Fasta File
   rep_fasta_path <- tempfile()
-  acc2fa(rep, outpath = rep_fasta_path, "sequential")
+  acc2FA(rep, outpath = rep_fasta_path, "sequential")
   rename_fasta(rep_fasta_path, rep_fasta_path,
-    replacement_function = map_acc2name,
+    replacement_function = mapAcc2Name,
     acc2name = params$acc_to_name
   )
   if (!params$rval_phylo) {
@@ -469,16 +472,17 @@ if(length(params$rep_accnums) >= 3 ) {
 
   # Call MSA2PDF
   msa_pdf_path <- tempfile()
-  msa_prefix <- "/data/research/jravilab/molevolvr_app/www/msa_figs/"
+  msa_prefix <- "~/awasyn/MolEvolvRsHzFyk_full/msa_figs/"
   post_fix <- paste("msa", params$query_pin, params$PhyloSelect, params$msa_reduce_by, ".pdf", sep = "_")
 
   msa_pdf_path <- paste0(msa_prefix, post_fix)
 
-  msa_pdf(fasta_path = rep_fasta_path, msa_pdf_path)
+  createMSA_PDF(fasta_path = rep_fasta_path, msa_pdf_path)
   }
 ```
 
 ```{r msa_pdf, echo = FALSE, out.width = "95%", out.height  = "800px"}
+# apt-get install texlive
 if(length(params$rep_accnums) >= 3 ) {
   knitr::include_graphics(msa_pdf_path)
   } else {"Not enough representative sequences: try changing the 'Reduce By' field."}
diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R
index 7dd89030..3ea331d5 100644
--- a/inst/report/scripts/generate_report.R
+++ b/inst/report/scripts/generate_report.R
@@ -90,16 +90,20 @@ run_analysis <- function(
         output_file = file.path(tempdir(), "report.html"),
         DASelect = "All",
         mainSelect = NULL,
-        PhyloSelect = NULL,
+        PhyloSelect = "All",
         q_heatmap_select = "All",
         DACutoff = 95,
         GCCutoff = 0.5,
         query_select = NULL,
-        query_iprDatabases = NULL,
+        query_iprDatabases = c(
+          "Pfam", "SMART", "Phobius",
+          "Gene3D", "TMHMM", "SignalP_GRAM_POSITIVE",
+          "SUPERFAMILY", "MobiDBLite", "TIGRFAM", "PANTHER", "Coils"
+        ),
         query_iprVisType = "Analysis", tree_msa_tool = "ClustalO",
         levels = 2,
         DA_Col = "DomArch.Pfam",
-        msa_rep_num = NULL,
+        msa_rep_num = 10,
         msa_reduce_by = "Species",
         rval_phylo = FALSE,
         DA_lin_color = c("default", "viridis", "inferno", "magma", "plasma", "cividis"),
@@ -375,7 +379,7 @@ run_analysis <- function(
                                    pattern = paste0("molevolvr_acccnum_validation-", accnum, "-", "XXXXX"),
                                    fileext = ".fa"
                                )
-                               acc2fa(accnum, tmp)
+                               acc2FA(accnum, tmp)
                                readAAStringSet(tmp)
                                TRUE
                            },
@@ -398,7 +402,7 @@ run_analysis <- function(
       Please try submitting FASTA sequences instead.")
                } else {
                    # Write a multifasta
-                   acc2fa(accnum_vect, outpath = path)
+                   acc2FA(accnum_vect, outpath = path)
                }
            },
            "MSA" = {
@@ -584,7 +588,7 @@ run_analysis <- function(
 
     domarch_cols_value <- get_domarch_cols(app_data, DASelect)
 
-    query_domarch_cols_value <- get_domarch_columns(query_data)
+    query_domarch_cols_value <- get_domarch_columns(query_data@df)
 
     mainTable_value <- generate_data_table(data)
 
@@ -613,7 +617,7 @@ run_analysis <- function(
                                                     da_iprVisType,
                                                     DASelect)
 
-    query_heatmap_value <- generate_query_heatmap(query_data_df,
+    query_heatmap_value <- generate_query_heatmap(query_data@df,
                                                   heatmap_select = "All",
                                                   heatmap_color = "blue")
 
@@ -626,7 +630,7 @@ run_analysis <- function(
                                                 app_data@ipr_path)
 
     DALin_TotalCounts_value <- DA_TotalCounts(DA_Prot_value,
-                                              DACutoff = 95,
+                                              DACutoff,
                                               DA_col = "DomArch.Pfam",
                                               app_data)
 
@@ -640,7 +644,7 @@ run_analysis <- function(
                                                networkLayout = "nice",
                                                app_data@ipr_path)
 
-    rep_accnums_value <-
+    rep_accnums_value <- rep_accnums(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data)
 
     phylogeny_prot_value <- filter_phylogeny_proteins(app_data, phylo_select)
 
@@ -678,7 +682,7 @@ run_analysis <- function(
             phylo_sunburst = phylogeny_prot_value,
             tree_msa_tool = tree_msa_tool,
             rep_accnums = rep_accnums_value,
-            msa_rep_num = 10,
+            msa_rep_num = 3,
             app_data = app_data,
             PhyloSelect = PhyloSelect,
             acc_to_name = acc_to_name_value,
diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R
index d2d0b303..2fc1fdbe 100644
--- a/inst/report/scripts/run_molevolvr_pipeline.R
+++ b/inst/report/scripts/run_molevolvr_pipeline.R
@@ -245,7 +245,6 @@ run_molevolvr_pipeline <- function(input_paths, db, nhits, eval,
         is.na(PREFIX)) {
       print("No blast results provided, moving on.")
     } else {
-      browser()
       file.copy(file.path(OUTDIR, paste0(PREFIX, ".ipr_domarch.tsv")),
                 file.path(OUTDIR, paste0(PREFIX, ".full_analysis.tsv")))
     }
@@ -728,6 +727,7 @@ run_blastclust <- function(infile, suffix, outdir) {
 
   # Prepare output file path
   outfile <- file.path(outdir, paste0(suffix, ".bclust.L60S80.tsv"))
+  input_file <- file.path(outdir, paste0(suffix, ".bclust.L60S80.tsv.clstr"))
 
 
   # Print process messages
@@ -741,10 +741,17 @@ run_blastclust <- function(infile, suffix, outdir) {
     "cd-hit -i %s -o %s -c 0.8 -aS 0.6 -T 8",
     infile, outfile
   )
+  clean_command <- sprintf(
+    "awk '/^>Cluster/ {if(NR>1)printf \"\\n\"; next} /WP_/ {start=index($0, \"WP_\"); if(start) {end=index(substr($0, start), \"...\"); if (end == 0) end=length($0); printf \"%%s \", substr($0, start, end-1)}}' %s > %s",
+    input_file,
+    outfile
+  )
+
   cat("\nPerforming BLASTCLUST analysis on", infile, "\n")
   # system(blastclust_cmd)
 
   system(cdhit_command)
+  system(clean_command)
 
 }
 
@@ -802,7 +809,7 @@ clust2tbl <- function(clust, blast) {
     }
   }
   # blast_out$AccNum <- gsub("^>", "", blast_out$AccNum)
-  blast_out$AccNum <- paste0(">", blast_out$AccNum)
+  # blast_out$AccNum <- paste0(">", blast_out$AccNum)
   blast_clustnames <- merge(blast_out, new_clust, by = "AccNum")
 
   for (i in 1:nrow(blast_clustnames)) {
diff --git a/inst/report/scripts/viz_utils.R b/inst/report/scripts/viz_utils.R
index 4d9fdfe6..41304360 100644
--- a/inst/report/scripts/viz_utils.R
+++ b/inst/report/scripts/viz_utils.R
@@ -91,8 +91,8 @@ generate_rs_network_layout <- function(data, app_data,
     )
 
     # Validate that the result is not an error
-    if (res_network == "error") {
-        stop("Not enough nodes to construct a network.")
+    if (any(res_network == "error")) {
+      stop("Not enough nodes to construct a network.")
     }
 
     return(res_network)
@@ -407,7 +407,7 @@ total_counts <- function(prot, column = "DomArch", lineage_col = "Lineage",
                          cutoff = 90, RowsCutoff = FALSE, digits = 2
                          # type = "GC"
 ) {
-  column <- sym(column)
+  # column <- sym(column)
 
   prot <- select(prot, {{ column }}, {{ lineage_col }}) %>%
     filter(!is.na({{ column }}) & !is.na({{ lineage_col }})) %>%
@@ -415,10 +415,10 @@ total_counts <- function(prot, column = "DomArch", lineage_col = "Lineage",
 
   prot <- summarizeByLineage(prot, column, by = lineage_col, query = "all")
   col_count <- prot %>%
-    group_by({{ column }}) %>%
+    group_by(!!sym(column)) %>%
     summarise(totalcount = sum(count))
 
-  total <- left_join(prot, col_count, by = as_string(column))
+  total <- left_join(prot, col_count, by = column)
 
   sum_count <- sum(total$count)
   total <- total %>%
@@ -584,14 +584,14 @@ generate_domain_network <- function(DA_col, DACutoff, DA_Prot,
     # Generate the domain network
     res <- createDomainNetwork(
         prot = dn_data,
-        column = col,
+        column = DA_col,
         domains_of_interest = ".*",
         cutoff = DACutoff,
         layout = networkLayout
     )
 
     # Validate the result
-    if (res == "error") {
+    if (any(res == "error")) {
         stop("Not enough nodes to construct a network.
              Try increasing 'Total Count Cutoff'.")
     }
@@ -865,11 +865,11 @@ rep_accnums <- function(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data
     return(app_data@df$AccNum)
   } else {
     # Switch based on `msa_reduce_by` value
-    tmp <- filter(app_data@df, QueryName == PhyloSelect)
-    rep_acc_species <- RepresentativeAccNums(tmp, reduced = "Species", column = "AccNum")
+    tmp <- filter(app_data@df)
+    rep_acc_species <- createRepresentativeAccNum(tmp, reduced = "Species", accnum_col = "AccNum")
 
     # Get representative accession numbers by "Lineage"
-    rep_acc_lineage <- RepresentativeAccNums(tmp, reduced = "Lineage", column = "AccNum")
+    rep_acc_lineage <- createRepresentativeAccNum(tmp, reduced = "Lineage", accnum_col = "AccNum")
     switch(msa_reduce_by,
            "Species" = rep_acc_species,
            "Lineage" = rep_acc_lineage,
@@ -894,3 +894,29 @@ rep_accnums <- function(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data
   }
 }
 
+seq_tree <- function(fasta_filepath){
+  my_seqs <- readAAStringSet(fasta_filepath) #, format="fasta", seek.first.rec=T)
+  my_seqs_msa <- msa(my_seqs)
+  my_seqs_msa_aln <- msaConvert(my_seqs_msa, type="seqinr::alignment")
+
+  #below was commented out, does it need to change as one of the parameters? the bottom keeps
+  d <- dist.alignment(my_seqs_msa_aln, "identity")
+  #as.matrix(d)[2:5, "HBA1_Homo_sapiens", drop=FALSE]
+
+  ## Phylogenetic tree
+  ## using package ape
+  ## build neighbor-joining tree
+  seqTree <- nj(d)
+  #plot(seqTree, main="Phylogenetic Tree of MSA")
+  groupInfo <- split(seqTree$tip.label,
+                     gsub("_\\w+", "", seqTree$tip.label))
+  seqTree <- groupOTU(seqTree, groupInfo)
+  # ggtree(seqTree, aes(color=group),
+  #        layout='circular') +
+  #   geom_tiplab(size=1, aes(angle=angle))
+  #https://yulab-smu.top/treedata-book/chapter4.html
+  #offs <- 0
+  tree <- ggtree(seqTree, branch.length = "dN_vs_dS") + theme_tree2(axis.line.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank())
+  msaplot(tree, fasta=fasta_filepath, offset=0.5, bg_line = TRUE) + geom_tiplab(align=TRUE, linesize=0.5, size=3)
+
+}

From 22e644507665de5e514bed2ba29db5f07e08c2bd Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Wed, 20 Nov 2024 02:59:59 +0200
Subject: [PATCH 06/23] camelcase case study functions

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 inst/report/report_template.Rmd              | 20 ++---
 inst/report/scripts/MolEvolData_class.R      | 52 ++++++------
 inst/report/scripts/generate_report.R        | 62 +++++++--------
 inst/report/scripts/run_molevolvr_pipeline.R | 83 ++++++++++----------
 inst/report/scripts/viz_utils.R              | 52 ++++++------
 5 files changed, 133 insertions(+), 136 deletions(-)

diff --git a/inst/report/report_template.Rmd b/inst/report/report_template.Rmd
index 46dcf57e..dddf357c 100644
--- a/inst/report/report_template.Rmd
+++ b/inst/report/report_template.Rmd
@@ -30,11 +30,11 @@ params:
   ## Tree
   tree_msa_tool: default ##input
   ## MSA
-  rep_accnums: default
+  repAccNums: default
   msa_rep_num: default ##input
   app_data: default
   PhyloSelect: default ##input
-  acc_to_name: default
+  acc2Name: default
   rval_phylo: default
   query_pin: default
   msa_reduce_by: default
@@ -408,8 +408,8 @@ if (is.null(params$msa_rep_num)) {
 ```
 
 ```{r, include=FALSE}
-if(length(params$rep_accnums) >= 3 ) {
-  rep <- params$rep_accnums[1:msa_rep_num]
+if(length(params$repAccNums) >= 3 ) {
+  rep <- params$repAccNums[1:msa_rep_num]
   if (!params$rval_phylo) {
     seqs <- readAAStringSet(params$app_data@fasta_path)
     names(seqs) <- sub(" .*", "", names(seqs))
@@ -425,7 +425,7 @@ if(length(params$rep_accnums) >= 3 ) {
   acc2FA(rep, outpath = rep_fasta_path, "sequential")
   rename_fasta(rep_fasta_path, rep_fasta_path,
     replacement_function = mapAcc2Name,
-    acc2name = params$acc_to_name
+    acc2name = params$acc2Name
   )
   if (!params$rval_phylo) {
     writeXStringSet(query, rep_fasta_path, append = TRUE)
@@ -440,15 +440,15 @@ library(msa)
 library(ape)
 library(tidytree)
 library(ggtree)
-if(length(params$rep_accnums) >= 3 ) {
-  seq_tree(fasta_filepath = rep_msa_path) 
+if(length(params$repAccNums) >= 3 ) {
+  seqTree(fasta_filepath = rep_msa_path) 
  } else {"Not enough representative sequences: try changing the 'Reduce By' field."}
 ```
 
 ### MSA
 
 ```{r, include=FALSE}
-if(length(params$rep_accnums) >= 3 ) {
+if(length(params$repAccNums) >= 3 ) {
   if (!params$rval_phylo) {
     seqs <- readAAStringSet(params$app_data@fasta_path)
     names(seqs) <- sub(" .*", "", names(seqs))
@@ -464,7 +464,7 @@ if(length(params$rep_accnums) >= 3 ) {
   acc2FA(rep, outpath = rep_fasta_path, "sequential")
   rename_fasta(rep_fasta_path, rep_fasta_path,
     replacement_function = mapAcc2Name,
-    acc2name = params$acc_to_name
+    acc2name = params$acc2Name
   )
   if (!params$rval_phylo) {
     writeXStringSet(query, rep_fasta_path, append = TRUE)
@@ -483,7 +483,7 @@ if(length(params$rep_accnums) >= 3 ) {
 
 ```{r msa_pdf, echo = FALSE, out.width = "95%", out.height  = "800px"}
 # apt-get install texlive
-if(length(params$rep_accnums) >= 3 ) {
+if(length(params$repAccNums) >= 3 ) {
   knitr::include_graphics(msa_pdf_path)
   } else {"Not enough representative sequences: try changing the 'Reduce By' field."}
 ```
diff --git a/inst/report/scripts/MolEvolData_class.R b/inst/report/scripts/MolEvolData_class.R
index c50e2925..791f1d6a 100644
--- a/inst/report/scripts/MolEvolData_class.R
+++ b/inst/report/scripts/MolEvolData_class.R
@@ -72,7 +72,7 @@ top_acc <- function(cln_file, DA_col = "DomArch.Pfam",
 }
 
 
-combine_files_nopmap <- function(inpath, pattern, outpath,
+combineFilesNopmap <- function(inpath, pattern, outpath,
                                  delim = "\t", skip = 0,
                                  col_names) {
     source_files <- dir(path = inpath, pattern = pattern, recursive = T)
@@ -111,7 +111,7 @@ full_analysis_colnames <- c("AccNum", "QueryName", "STitle", "Species.x",
                             "Species.y", "SourceDB", "Completeness")
 
 
-process_wrapper_dir <- function(path, pinName, type = "full") {
+processWrapperDir <- function(path, pinName, type = "full") {
     if (type == "full") {
         if (file.exists(paste0(path, "/cln_combined.tsv")) && file.exists(paste0(path, "/ipr_combined.tsv"))) {
             query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
@@ -164,12 +164,12 @@ process_wrapper_dir <- function(path, pinName, type = "full") {
 
             com_blast_path <- paste0(path, "/blast_combined.tsv")
             ipr_blast_path <- paste0(path, "/ipr_combined.tsv")
-            com_blast_data <- combine_files_nopmap(paste0(path, "/"),
+            com_blast_data <- combineFilesNopmap(paste0(path, "/"),
                                                    pattern = "*.full_analysis.tsv", skip = 0,
                                                    col_names = c(), outpath = com_blast_path, delim = "\t"
             )
 
-            ipr_blast_data <- combine_files_nopmap(paste0(path, "/"),
+            ipr_blast_data <- combineFilesNopmap(paste0(path, "/"),
                                                    pattern = "*.iprscan_cln.tsv", skip = 0,
                                                    col_names = ipr_colnames, outpath = ipr_blast_path, delim = "\t"
             )
@@ -212,7 +212,7 @@ process_wrapper_dir <- function(path, pinName, type = "full") {
         )
         query_data <- query_data %>% select(Query, QueryName)
         com_blast_path <- paste0(path, "/blast_combined.tsv")
-        com_blast_data <- combine_files_nopmap(paste0(path, "/"),
+        com_blast_data <- combineFilesNopmap(paste0(path, "/"),
                                                pattern = "*.blast.cln.tsv",
                                                skip = 0,
                                                col_names = c(),
@@ -282,7 +282,7 @@ process_wrapper_dir <- function(path, pinName, type = "full") {
                                   queries = queries, cln_path = cln_path
         )
         com_blast_path <- paste0(path, "/blast_combined.tsv")
-        com_blast_data <- combine_files_nopmap(paste0(path, "/"),
+        com_blast_data <- combineFilesNopmap(paste0(path, "/"),
                                                pattern = "*.full_analysis.tsv",
                                                skip = 0,
                                                col_names = c(),
@@ -355,8 +355,8 @@ clean_fetched <- function(df) {
 }
 
 # Description
-# validation functions for accession numbers/headers for FASTA (validate_accnum_fasta)
-# & accession number input (validate_accnum) submission types.
+# validation functions for accession numbers/headers for FASTA (validateAccNumFasta)
+# & accession number input (validateAccNum) submission types.
 
 library(Biostrings)
 library(httr)
@@ -364,7 +364,7 @@ library(httr2)
 library(rentrez)
 library(shiny)
 
-.get_accnum_from_fasta <- function(biostrings_aa_string_set, verbose = FALSE) {
+getAccNumFromFasta <- function(biostrings_aa_string_set, verbose = FALSE) {
     # parsing/cleaning accession numbers using the same methods as
     # `upstream_scripts/00_submit_full.R`'s `get_sequences()` function
     accnums <- c()
@@ -392,7 +392,7 @@ library(shiny)
     return(accnums)
 }
 
-validate_accnum_fasta <- function(text) {
+validateAccNumFasta <- function(text) {
     # INPUT: text from input object that houses FASTA data
     # validate the headers/accnums for FASTA submission
     # Return:
@@ -434,7 +434,7 @@ validate_accnum_fasta <- function(text) {
     }
 
     # parse accnums from fasta
-    accnums <- .get_accnum_from_fasta(fasta)
+    accnums <- getAccNumFromFasta(fasta)
     tb_accnum_counts <- tibble("frequencies" = table(accnums))
     # check for duplicates
     if (any(tb_accnum_counts$frequencies > 1)) {
@@ -447,7 +447,7 @@ validate_accnum_fasta <- function(text) {
 
 
 #' Test whether a single accession returns a valid protein from Entrez
-is_accnum_valid_entrez <- function(accnum, verbose = FALSE) {
+isAccNumValidEntrez <- function(accnum, verbose = FALSE) {
     # empty accnum wil not raise an error from efetch, so test for this first
     if (nchar(accnum) <= 0) {if (verbose) {warning("empty accnum")}; return(FALSE)}
 
@@ -467,7 +467,7 @@ is_accnum_valid_entrez <- function(accnum, verbose = FALSE) {
 }
 
 #' Test whether a single accession returns a valid protein from EBI
-is_accnum_valid_ebi <- function(accnum, verbose = FALSE) {
+isAccNumValidEbi <- function(accnum, verbose = FALSE) {
     # validation: ensure there's some text to parse
     if (nchar(accnum) <= 0) {if (verbose) {warning("empty accnum")}; return(FALSE)}
     # construct a httr2 request to POST an accession number, then GET the fasta
@@ -492,7 +492,7 @@ is_accnum_valid_ebi <- function(accnum, verbose = FALSE) {
 }
 
 #' Perform a series of API reqs using NCBI entrez to validate accession numbers
-perform_entrez_reqs <- function(accnums, verbose = FALSE, track_progress = FALSE) {
+performEntrezReqs <- function(accnums, verbose = FALSE, track_progress = FALSE) {
     # API guidelines docs
     # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html
     # entrez recommends no more than 3 POSTs per second
@@ -501,7 +501,7 @@ perform_entrez_reqs <- function(accnums, verbose = FALSE, track_progress = FALSE
     results <- vapply(
         X = accnums,
         FUN = function(accnum) {
-            result <- is_accnum_valid_entrez(accnum, verbose = TRUE)
+            result <- isAccNumValidEntrez(accnum, verbose = TRUE)
             i <<- i + 1L
             if (i >= 3L) {Sys.sleep(1); print('sleeping for entrez API reqs'); i <<- 0L}
             if (track_progress) {incProgress(1)}
@@ -512,7 +512,7 @@ perform_entrez_reqs <- function(accnums, verbose = FALSE, track_progress = FALSE
     return(results)
 }
 
-perform_ebi_reqs <- function(accnums, verbose = FALSE, track_progress = FALSE) {
+performEbiReqs <- function(accnums, verbose = FALSE, track_progress = FALSE) {
     # API guidelines docs
     # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html
     # EBI allows 200 POSTs per second
@@ -521,7 +521,7 @@ perform_ebi_reqs <- function(accnums, verbose = FALSE, track_progress = FALSE) {
     results <- vapply(
         X = accnums,
         FUN = function(accnum) {
-            result <- is_accnum_valid_ebi(accnum, verbose = TRUE)
+            result <- isAccNumValidEbi(accnum, verbose = TRUE)
             i <<- i + 1L
             if (i >= 200L) {Sys.sleep(1); i <<- 0L}
             if (track_progress) incProgress(1)
@@ -533,7 +533,7 @@ perform_ebi_reqs <- function(accnums, verbose = FALSE, track_progress = FALSE) {
 }
 
 #' Validate accession numbers from MolEvolvR user input
-validate_accnum <- function(text, verbose = FALSE, track_progress = FALSE, n_steps = integer()) {
+validateAccNum <- function(text, verbose = FALSE, track_progress = FALSE, n_steps = integer()) {
     # API guidelines docs
     # entrez https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Usage_Guidelines_and_Requiremen
     # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html
@@ -558,7 +558,7 @@ validate_accnum <- function(text, verbose = FALSE, track_progress = FALSE, n_ste
     if (track_progress) {incProgress(1 / n_steps,
         message = "Please wait (searching NCBI's protein database) . . .")}
     # entrez API
-    entrez_results <- perform_entrez_reqs(accnums, verbose = verbose)
+    entrez_results <- performEntrezReqs(accnums, verbose = verbose)
     if (verbose) {
         stringr::str_glue("rentrez
                 results:\n\t{paste0(entrez_results, collapse=",")}\n") |>
@@ -568,7 +568,7 @@ validate_accnum <- function(text, verbose = FALSE, track_progress = FALSE, n_ste
     # EBI API
     # if all of entrez resulted in success, then skip EBI. Else, try EBI
     if (!all(entrez_results)) {
-        ebi_results <- perform_ebi_reqs(accnums, verbose = verbose)
+        ebi_results <- performEbiReqs(accnums, verbose = verbose)
         if (verbose) {
             stringr::str_glue("ebi results:\n\t{paste0(ebi_results,
                               collapse=",")}\n") |>
@@ -606,7 +606,7 @@ validate_accnum <- function(text, verbose = FALSE, track_progress = FALSE, n_ste
 # Description
 #   validation function for evalue input
 #===============================================================================
-validate_evalue <- function(input_value) {
+validateEvalue <- function(input_value) {
     is_valid_evalue <- is.numeric(input_value) && input_value != 0
     return(is_valid_evalue)
 }
@@ -619,7 +619,7 @@ validate_evalue <- function(input_value) {
 library("Biostrings")
 library("tidyverse")
 #-------------------------------------------------------------------------------
-.guess_seq_type <- function(single_fasta, dna_guess_cutoff = 0.9, other_guess_cutoff = 0.5) {
+guessSeqType <- function(single_fasta, dna_guess_cutoff = 0.9, other_guess_cutoff = 0.5) {
     tb <- as_tibble(alphabetFrequency(single_fasta))
     n_other <- if ("other" %in% colnames(tb)) sum(unlist(tb["other"])) else 0
 
@@ -649,7 +649,7 @@ library("tidyverse")
     return(guess)
 }
 
-.validate_seq_body <- function(text) {
+validateSeqBody <- function(text) {
     # convert string to single letter character vector
     individual_chars <- unlist(strsplit(text, ""))
     # return the characters from input that are NOT in the AA_ALPHABET
@@ -669,7 +669,7 @@ library("tidyverse")
 }
 
 #-------------------------------------------------------------------------------
-validate_fasta <- function(fasta_path, .type = "AA") {
+validateFasta <- function(fasta_path, .type = "AA") {
     # handle case of fasta being unreadable/unrecognized format
     fasta <- tryCatch(
         expr = Biostrings::readAAStringSet(fasta_path),
@@ -695,7 +695,7 @@ validate_fasta <- function(fasta_path, .type = "AA") {
     seq_body_validations <- c()
     for (i in seq_along(1:length(fasta))) {
         seq_body <- as.character(fasta[i])
-        seq_body_validations <- c(.validate_seq_body(seq_body), seq_body_validations)
+        seq_body_validations <- c(validateSeqBody(seq_body), seq_body_validations)
     }
     # require all sequences to have chars only in AA_ALPHABET
     if (!(all(seq_body_validations))) {
@@ -707,7 +707,7 @@ validate_fasta <- function(fasta_path, .type = "AA") {
     # iteratively guess the type (DNA, AA, or other of each seq in the FASTA file)
     seq_types <- c()
     for (idx in seq_along(1:length(fasta))) {
-        seq_types <- c(.guess_seq_type(fasta[idx]), seq_types)
+        seq_types <- c(guessSeqType(fasta[idx]), seq_types)
     }
 
     switch(.type,
diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R
index 3ea331d5..4ad5b076 100644
--- a/inst/report/scripts/generate_report.R
+++ b/inst/report/scripts/generate_report.R
@@ -2,8 +2,7 @@
 # Last modified: 2024
 
 # get fasta of pathogen and/or drug
-#' @export
-get_card_data <- function(pathogen = NULL, drug = NULL) {
+getCardData <- function(pathogen = NULL, drug = NULL) {
   destination_dir <- "CARD_data"
   # Check if CARD data exists
   if (!dir.exists(destination_dir)) {
@@ -73,8 +72,7 @@ get_card_data <- function(pathogen = NULL, drug = NULL) {
 }
 
 # Run analysis
-#' @export
-run_analysis <- function(
+runAnalysis <- function(
         upload_type = "Fasta",
         evalue = 0.00001,
         accnum_fasta_input = "",
@@ -224,13 +222,13 @@ run_analysis <- function(
     # Validation of inputs
     fasta_data <- read_file(file_paths$fasta)
     validate_and_process_inputs <- function(evalue, fasta_data) {
-        if (!validate_evalue(evalue)) {
+        if (!validateEvalue(evalue)) {
             return("Error: A numeric E-value is required. Please set a valid
                    value (e.g., 0.0001).")
         }
 
 
-        if (!validate_accnum_fasta(fasta_data)) {
+        if (!validateAccNumFasta(fasta_data)) {
             return("Error: Input for AccNum/Fasta cannot be empty or invalid.")
         }
 
@@ -327,7 +325,7 @@ run_analysis <- function(
                }
 
                # Validate accession numbers for FASTA submission
-               if (!(validate_accnum_fasta(sequence_upload_data@seqs))) {
+               if (!(validateAccNumFasta(sequence_upload_data@seqs))) {
                    stop("Error: Please adjust the FASTA headers. Ensure a header
                         line for each sequence, no duplicate header names, and
                         no duplicate protein accession numbers.")
@@ -338,7 +336,7 @@ run_analysis <- function(
                    expr = {
                        tmp_file <- tempfile()
                        writeLines(sequence_upload_data@seqs, tmp_file)
-                       validate_fasta(tmp_file)
+                       validateFasta(tmp_file)
                    },
                    error = function(e) {
                        warning("Error: Failed to run input FASTA verification")
@@ -454,7 +452,7 @@ run_analysis <- function(
             job_code = pin_id,
         )
     } else {
-        submit_full(
+        runFull(
             dir = dir,
             sequences = path,
             DB = blast_db,
@@ -575,7 +573,7 @@ run_analysis <- function(
     }
 
     # Process results for results
-    fetched <- process_wrapper_dir(dir, pinName = pinName, type = type)
+    fetched <- processWrapperDir(dir, pinName = pinName, type = type)
 
     # Assign fetched data to objects if the fetched list has the expected length
     if (length(fetched) == 2) {
@@ -586,69 +584,69 @@ run_analysis <- function(
         r_nrow_initial <- nrow(fetched[[1]]@df)  # Initialize row count
     }
 
-    domarch_cols_value <- get_domarch_cols(app_data, DASelect)
+    domarch_cols_value <- getDomArchCols(app_data, DASelect)
 
-    query_domarch_cols_value <- get_domarch_columns(query_data@df)
+    query_domarch_cols_value <- getDomArchCols(query_data@df)
 
-    mainTable_value <- generate_data_table(data)
+    mainTable_value <- getDataTable(data)
 
-    queryDataTable_value <- generate_query_data_table(query_data, query_select)
+    queryDataTable_value <- getQueryDataTable(query_data, query_select)
 
-    fastaDataText_value <- get_fasta_data(query_data@fasta_path)
+    fastaDataText_value <- getFastaData(query_data@fasta_path)
 
-    domainDataText_value <- get_domain_data(data)
+    domainDataText_value <- getDomData(data)
 
-    msaDataText_value <- get_msa_data(query_data@msa_path)
+    msaDataText_value <- getMSAData(query_data@msa_path)
 
-    rs_IprGenes_value <- generate_ipr_genes_visualization(data,
+    rs_IprGenes_value <- getIPRGenesVisualization(data,
                                                           app_data,
                                                           input_rs_iprDatabases,
                                                           input_rs_iprVisType)
 
-    rs_network_layout_value <- generate_rs_network_layout(data,
+    rs_network_layout_value <- getRSNetworkLayout(data,
                                                           app_data,
                                                           cutoff = 100,
                                                           layout = "nice")
 
-    rs_data_table_value <- generate_data_table(data)
+    rs_data_table_value <- getDataTable(data)
 
-    da_IprGenes_value <- generate_da_ipr_genes_plot(app_data,
+    da_IprGenes_value <- getDomArchIPRGenesPlot(app_data,
                                                     da_iprDatabases,
                                                     da_iprVisType,
                                                     DASelect)
 
-    query_heatmap_value <- generate_query_heatmap(query_data@df,
+    query_heatmap_value <- getQueryHeatmap(query_data@df,
                                                   heatmap_select = "All",
                                                   heatmap_color = "blue")
 
-    DA_Prot_value <- get_DA_Prot(app_data, DASelect)
+    DA_Prot_value <- getDomArchProt(app_data, DASelect)
 
-    DALinPlot_value <- generate_DA_heatmap_plot(DA_col = "DomArch.Pfam",
+    DALinPlot_value <- getDomArchHeatmapPlot(DA_col = "DomArch.Pfam",
                                                 DACutoff,
                                                 DA_Prot_value,
                                                 DA_lin_color = "viridis",
                                                 app_data@ipr_path)
 
-    DALin_TotalCounts_value <- DA_TotalCounts(DA_Prot_value,
+    DALin_TotalCounts_value <- getDomArchTotalCounts(DA_Prot_value,
                                               DACutoff,
                                               DA_col = "DomArch.Pfam",
                                               app_data)
 
-    DALinTable_value <- generate_DA_lin_table(DA_col = "DomArch.Pfam",
+    DALinTable_value <- getDomArchLinearTable(DA_col = "DomArch.Pfam",
                                               app_data@ipr_path,
                                               DALin_TotalCounts_value)
 
-    DANetwork_value <- generate_domain_network(DA_col = "DomArch.Pfam",
+    DANetwork_value <- getDomNetwork(DA_col = "DomArch.Pfam",
                                                DACutoff,
                                                DA_Prot_value,
                                                networkLayout = "nice",
                                                app_data@ipr_path)
 
-    rep_accnums_value <- rep_accnums(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data)
+    rep_accnums_value <- repAccNums(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data)
 
-    phylogeny_prot_value <- filter_phylogeny_proteins(app_data, phylo_select)
+    phylogeny_prot_value <- filterPhylogenyProteins(app_data, phylo_select)
 
-    acc_to_name_value <- acc_to_name(app_data)
+    acc_to_name_value <- acc2Name(app_data)
 
     ####### Report Generation ########
 
@@ -681,11 +679,11 @@ run_analysis <- function(
             phylo_sunburst_levels = levels,
             phylo_sunburst = phylogeny_prot_value,
             tree_msa_tool = tree_msa_tool,
-            rep_accnums = rep_accnums_value,
+            repAccNums = rep_accnums_value,
             msa_rep_num = 3,
             app_data = app_data,
             PhyloSelect = PhyloSelect,
-            acc_to_name = acc_to_name_value,
+            acc2Name = acc_to_name_value,
             rval_phylo = rval_phylo,
             msa_reduce_by = msa_reduce_by
         )
diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R
index 2fc1fdbe..ea52a3b5 100644
--- a/inst/report/scripts/run_molevolvr_pipeline.R
+++ b/inst/report/scripts/run_molevolvr_pipeline.R
@@ -7,7 +7,7 @@ library(data.table)
 library(readr)
 library(rentrez)
 
-get_sequences <- function(sequences,
+getSeqs <- function(sequences,
                           acc_file_path = "accs.txt",
                           dir_path = "~",
                           separate = TRUE) {
@@ -35,7 +35,7 @@ get_sequences <- function(sequences,
   writeXStringSet(seqs, sequences, format = "fasta")
   return(length(seqs))
 }
-submit_full <- function(
+runFull <- function(
     dir = "/data/scratch",
     DB = Sys.getenv("BLAST_DB", unset = "refseq"),
     NHITS = Sys.getenv("BLAST_HITS", unset = 100),
@@ -70,13 +70,13 @@ submit_full <- function(
 
   # Create a log file
   write("START_DT\tSTOP_DT\tquery\tdblast\tacc2info\tdblast_cleanup\tacc2fa
-        \tblast_clust\tclust2table\tiprscan\tipr2lineage\tipr2da\tduration",
+        \tblast_clust\tclust2table\tiprscan\tipr2lineage\tipr2DomArch\tduration",
         "logfile.tsv")
 
   # Process sequences (local handling)
   if (phylo == "FALSE") {
     # Split the sequences if needed, store them locally
-    num_seqs <- get_sequences(sequences, dir_path = dir, separate = TRUE)
+    num_seqs <- getSeqs(sequences, dir_path = dir, separate = TRUE)
 
     fasta <- Biostrings::readAAStringSet(sequences)
     headers_original <- names(fasta)
@@ -89,7 +89,7 @@ submit_full <- function(
       # output_file <- paste0(dir, "/blast_output_", i, ".txt")
 
       # Construct the local BLAST command (make sure 'blastn' is available locally)
-      run_molevolvr_pipeline(input_file, DB, NHITS, EVAL, is_query = F, type, i)
+      runMolevolvrPipeline(input_file, DB, NHITS, EVAL, is_query = F, type, i)
 
       #cmd <- sprintf(
       #  "deltablast -query %s -db %s -out %s -num_alignments %d -evalue %f -remote",
@@ -107,7 +107,7 @@ submit_full <- function(
   }
 
   # Simulate query run locally
-  run_molevolvr_pipeline(sequences, DB, NHITS, EVAL, is_query = TRUE, type)
+  runMolevolvrPipeline(sequences, DB, NHITS, EVAL, is_query = TRUE, type)
   # cmd_query <- sprintf(
   #   "deltablast -query %s -db %s -out %s_query.txt -num_alignments %d -evalue %f -remote",
   #   sequences, DB, paste0(dir, "/query_output"), NHITS, EVAL
@@ -124,8 +124,7 @@ submit_full <- function(
 }
 
 # Define the main pipeline function
-#' @export
-run_molevolvr_pipeline <- function(input_paths, db, nhits, eval,
+runMolevolvrPipeline <- function(input_paths, db, nhits, eval,
                                    is_query, type, i) {
 
   # Start time
@@ -168,9 +167,9 @@ run_molevolvr_pipeline <- function(input_paths, db, nhits, eval,
     # setwd(OUTDIR)
 
     # Run acc2info
-    run_acc2info(parsed_accnums_file, PREFIX, OUTDIR)
+    runAcc2Info(parsed_accnums_file, PREFIX, OUTDIR)
 
-    replace_accession_numbers(file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")),
+    replaceAccNums(file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")),
                               file.path(OUTPATH, "query-fasta_header-map.tsv"),
                               file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")))
 
@@ -188,18 +187,18 @@ run_molevolvr_pipeline <- function(input_paths, db, nhits, eval,
     # setwd(OUTDIR)
 
     # Run DELTABLAST
-    run_deltablast(input_paths, PREFIX, OUTDIR, db, nhits, eval)
+    runDeltablast(input_paths, PREFIX, OUTDIR, db, nhits, eval)
 
     # Run ACC2FA
-    convert_accnum_to_fasta(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")),
+    convertAccNum2Fasta(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")),
                             PREFIX, OUTDIR)
 
     # Run ACC2INFO
-    run_acc2info(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt")),
+    runAcc2Info(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt")),
                  PREFIX, OUTDIR)
 
     # Clean up BLAST results
-    cleanup_blast(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")),
+    cleanupBlast(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")),
                   file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")),
                   PREFIX, F)
 
@@ -208,15 +207,15 @@ run_molevolvr_pipeline <- function(input_paths, db, nhits, eval,
   # Sys.sleep(30)
 
   # Run BLASTCLUST
-  run_blastclust(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")),
+  runBlastclust(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")),
                  PREFIX, OUTDIR )
 
   # Convert clusters to table
-  clust2tbl(file.path(OUTDIR, paste0(PREFIX, ".bclust.L60S80.tsv")),
+  clust2Table(file.path(OUTDIR, paste0(PREFIX, ".bclust.L60S80.tsv")),
             file.path(OUTDIR, paste0(PREFIX, ".blast.cln.tsv")))
 
   # Run INTERPROSCAN
-  run_interproscan(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")),
+  runIPRScan(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")),
                    PREFIX, OUTDIR)
   new_header <- c("AccNum", "SeqMD5Digest", "SLength", "Analysis", "DB.ID",
                   "SignDesc", "StartLoc", "StopLoc", "Score",
@@ -230,17 +229,17 @@ run_molevolvr_pipeline <- function(input_paths, db, nhits, eval,
   write.table(temp_data, file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")),
               sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
   # Run IPR2LIN
-  ipr2lin(file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")),
+  ipr2Linear(file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")),
           file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), PREFIX)
 
 
-  # Optionally run IPR2DA if IS_QUERY is true
+  # Optionally run ipr2DomArch if IS_QUERY is true
   if (is_query == TRUE) {
-    ## perform ipr2da on iprscan results
-    da <- ipr2da(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")),
+    ## perform ipr2DomArch on iprscan results
+    da <- ipr2DomArch(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")),
                  PREFIX, "NA")
 
-    ## if blast results are provided, call append_ipr
+    ## if blast results are provided, call appendIPR
     if (is.null(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv"))) |
         is.na(PREFIX)) {
       print("No blast results provided, moving on.")
@@ -249,16 +248,16 @@ run_molevolvr_pipeline <- function(input_paths, db, nhits, eval,
                 file.path(OUTDIR, paste0(PREFIX, ".full_analysis.tsv")))
     }
   } else {
-    # perform ipr2da on iprscan results
-    da <- ipr2da(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")),
+    # perform ipr2DomArch on iprscan results
+    da <- ipr2DomArch(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")),
                  PREFIX)
 
-    ## if blast results are provided, call append_ipr
+    ## if blast results are provided, call appendIPR
     if (is.null(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv"))) |
         is.na(PREFIX)) {
       print("No blast results provided, moving on.")
     } else {
-      append_ipr(ipr_da = da,
+      appendIPR(ipr_da = da,
                  blast =  file.path(OUTDIR, paste0(PREFIX, ".cln.clust.tsv")),
                  prefix = PREFIX)
     }
@@ -438,7 +437,7 @@ acc2info <- function(infile, prefix, outdir) {
   cat("Data saved to:", outfile, "\n")
 }
 
-acc2info_phylo <- function(infile, outdir) {
+acc2InfoPhylo <- function(infile, outdir) {
   # Ensure output directory exists
   if (!dir.exists(outdir)) {
     dir.create(outdir, recursive = TRUE)
@@ -498,15 +497,15 @@ acc2info_phylo <- function(infile, outdir) {
 }
 
 # Main function to run based on the prefix
-run_acc2info <- function(infile, prefix, outdir) {
+runAcc2Info <- function(infile, prefix, outdir) {
   if (prefix == "NA") {
-    acc2info_phylo(infile, outdir)
+    acc2InfoPhylo(infile, outdir)
   } else {
     acc2info(infile, prefix, outdir)
   }
 }
 
-substitute_accnum_for_acc2info <- function(df_acc2info, df_header_map) {
+subsAccnum4cc2Info <- function(df_acc2info, df_header_map) {
   df_result <- df_header_map |>
     # set column name in header map to match accnum col in acc2info
     dplyr::rename(AccNum = header_accnum) |>
@@ -522,7 +521,7 @@ substitute_accnum_for_acc2info <- function(df_acc2info, df_header_map) {
 }
 
 
-replace_accession_numbers <- function(path_acc2info,
+replaceAccNums <- function(path_acc2info,
                                       path_query_header_map, path_out) {
 
   # Read the input files
@@ -530,7 +529,7 @@ replace_accession_numbers <- function(path_acc2info,
   df_query_header_map <- read_tsv(path_query_header_map)
 
   # Substitute accession numbers
-  df_acc2info_substituted <- substitute_accnum_for_acc2info(df_acc2info,
+  df_acc2info_substituted <- subsAccnum4cc2Info(df_acc2info,
                                                             df_query_header_map)
 
   # Print the substituted dataframe
@@ -542,7 +541,7 @@ replace_accession_numbers <- function(path_acc2info,
 }
 
 
-run_deltablast <- function(infile, prefix, outdir,
+runDeltablast <- function(infile, prefix, outdir,
                            db = "refseq_protein",
                            nhits = 5000, evalue = 1e-5,
                            threads = 10) {
@@ -580,7 +579,7 @@ run_deltablast <- function(infile, prefix, outdir,
 
 # This script converts AccNum to Fasta using NCBI's EDirect or EBI's API
 
-convert_accnum_to_fasta <- function(infile, prefix, outdir) {
+convertAccNum2Fasta <- function(infile, prefix, outdir) {
 
   # Create the output file path
   outfile <- file.path(outdir, paste0(prefix, ".all_accnums.fa"))
@@ -649,7 +648,7 @@ convert_accnum_to_fasta <- function(infile, prefix, outdir) {
 }
 
 
-cleanup_blast <- function(infile_blast, acc2info, prefix, wblast = F) {
+cleanupBlast <- function(infile_blast, acc2info, prefix, wblast = F) {
 
   outdir <- dirname(infile_blast)
   # Load and clean acc2info file
@@ -723,7 +722,7 @@ cleanup_blast <- function(infile_blast, acc2info, prefix, wblast = F) {
 
 
 # Function to run BLASTCLUST on given input
-run_blastclust <- function(infile, suffix, outdir) {
+runBlastclust <- function(infile, suffix, outdir) {
 
   # Prepare output file path
   outfile <- file.path(outdir, paste0(suffix, ".bclust.L60S80.tsv"))
@@ -756,7 +755,7 @@ run_blastclust <- function(infile, suffix, outdir) {
 }
 
 # Function to format blastclust output
-clust2tbl <- function(clust, blast) {
+clust2Table <- function(clust, blast) {
   clust_out <- read_tsv(file = clust, col_names = F)
   blast_out <- read_tsv(file = blast, col_names = T)
   ## Count the number of accession numbers in a cluster
@@ -831,7 +830,7 @@ clust2tbl <- function(clust, blast) {
 }
 
 # Function to run InterProScan
-run_interproscan <- function(query_file, prefix, outdir) {
+runIPRScan <- function(query_file, prefix, outdir) {
 
   # Start InterProScan run
   cat("\n######################\n")
@@ -864,7 +863,7 @@ run_interproscan <- function(query_file, prefix, outdir) {
   cat("##################\n")
 }
 
-ipr2lin <- function(ipr, acc2info, prefix) {
+ipr2Linear <- function(ipr, acc2info, prefix) {
   # read in iprscan results
   # duplicate rows in iprscan file
   ipr_in <- read_tsv(ipr, col_names = TRUE) %>%
@@ -964,7 +963,7 @@ ipr2lin <- function(ipr, acc2info, prefix) {
   write_tsv(ipr_cln, file.path(paste0(outdir, "/",paste0(prefix,".iprscan_cln.tsv"))))
 }
 
-ipr2da <- function(infile_ipr, prefix,
+ipr2DomArch <- function(infile_ipr, prefix,
                    analysis = c(
                      "Pfam", "SMART", "Phobius",
                      "Gene3D", "TMHMM", "SignalP_GRAM_POSITIVE",
@@ -1034,8 +1033,8 @@ ipr2da <- function(infile_ipr, prefix,
   return(domarch2)
 }
 
-## function to add results from ipr2da to blast results
-append_ipr <- function(ipr_da, blast, prefix) {
+## function to add results from ipr2DomArch to blast results
+appendIPR <- function(ipr_da, blast, prefix) {
   # ! an 'AccNum' or 'AccNum.noV' column is required in blast table for joining !
   blast_out <- read_tsv(blast, col_names = T)
   if ("AccNum.noV" %in% colnames(blast_out)) {
diff --git a/inst/report/scripts/viz_utils.R b/inst/report/scripts/viz_utils.R
index 41304360..573e0739 100644
--- a/inst/report/scripts/viz_utils.R
+++ b/inst/report/scripts/viz_utils.R
@@ -9,7 +9,7 @@ library(DT)
 library(plotly)
 
 # Function to generate the InterProScan Visualization
-generate_ipr_genes_visualization <- function(data, app_data,
+getIPRGenesVisualization <- function(data, app_data,
                                              input_rs_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"),
                                              input_rs_iprVisType = "Analysis") {
 
@@ -57,7 +57,7 @@ generate_ipr_genes_visualization <- function(data, app_data,
 }
 
 # Function to generate the domain network layout visualization
-generate_rs_network_layout <- function(data, app_data,
+getRSNetworkLayout <- function(data, app_data,
                                        cutoff = 100,
                                        layout = "nice") {
 
@@ -99,7 +99,7 @@ generate_rs_network_layout <- function(data, app_data,
 }
 
 # Function to generate the data table
-generate_data_table <- function(data) {
+getDataTable <- function(data) {
     if (nrow(data@df) == 0) {
         stop("No data available.
              Please ensure you have uploaded your data correctly.")
@@ -165,7 +165,7 @@ generate_data_table <- function(data) {
 }
 
 # Function to generate query data table
-generate_query_data_table <- function(query_data, query_select = NULL) {
+getQueryDataTable <- function(query_data, query_select = NULL) {
 
     # Check if analysis is loaded and data is available
     if (nrow(query_data@df) == 0) {
@@ -239,7 +239,7 @@ generate_query_data_table <- function(query_data, query_select = NULL) {
 }
 
 # Function to read and return the FASTA file contents
-read_fasta_data <- function(fasta_path) {
+readFastaData <- function(fasta_path) {
 
     # Check if analysis is loaded and the file path is not empty
     if (fasta_path == "" || !file.exists(fasta_path)) {
@@ -252,7 +252,7 @@ read_fasta_data <- function(fasta_path) {
     return(fasta_content)
 }
 
-get_fasta_data <- function(fasta_path) {
+getFastaData <- function(fasta_path) {
     if (is.null(fasta_path) || fasta_path == "") {
         stop("Error: FASTA path is not provided.")
     }
@@ -260,12 +260,12 @@ get_fasta_data <- function(fasta_path) {
 }
 
 # Function to get domain sequences (assumes `data` is a predefined object)
-get_domain_data <- function(data) {
+getDomData <- function(data) {
     return(data@domainSeqs)  # Return domain sequences
 }
 
 # Function to get MSA data from a given path
-get_msa_data <- function(msa_path) {
+getMSAData <- function(msa_path) {
     if (is.null(msa_path) || msa_path == "") {
         stop("Error: MSA path is not provided.")
     }
@@ -273,7 +273,7 @@ get_msa_data <- function(msa_path) {
 }
 
 # Function to generate a heatmap
-generate_query_heatmap <- function(query_data_df,
+getQueryHeatmap <- function(query_data_df,
                                    heatmap_select = "All",
                                    heatmap_color = "blue") {
 
@@ -307,7 +307,7 @@ generate_query_heatmap <- function(query_data_df,
 }
 
 # Function to retrieve domain architecture columns
-get_domarch_columns <- function(query_data_df) {
+getDomArchCols <- function(query_data_df) {
     # Check if query data exists
     if (nrow(query_data_df) == 0) {
         stop("No query data available.")
@@ -339,7 +339,7 @@ get_domarch_columns <- function(query_data_df) {
 }
 
 # Function to generate main data table
-generate_main_table <- function(data_df, main_select = NULL) {
+getMainTable <- function(data_df, main_select = NULL) {
     # Validate input data
     if (nrow(data_df) == 0) {
         stop("No data available. Please ensure you have uploaded your data
@@ -403,7 +403,7 @@ generate_main_table <- function(data_df, main_select = NULL) {
     return(datatable_output)
 }
 
-total_counts <- function(prot, column = "DomArch", lineage_col = "Lineage",
+totalCounts <- function(prot, column = "DomArch", lineage_col = "Lineage",
                          cutoff = 90, RowsCutoff = FALSE, digits = 2
                          # type = "GC"
 ) {
@@ -467,14 +467,14 @@ total_counts <- function(prot, column = "DomArch", lineage_col = "Lineage",
   return(total)
 }
 
-DA_TotalCounts <- function(DA_Prot, DACutoff, DA_col, app_data) {
+getDomArchTotalCounts <- function(DA_Prot, DACutoff, DA_col, app_data) {
   # Check if ipr_path is not empty
   if (app_data@ipr_path == "") {
     stop("ipr_path is missing.")
   }
 
   # Calculate total counts with the specified cutoff and column
-  prot_tc <- total_counts(DA_Prot, cutoff = DACutoff, column = DA_col)
+  prot_tc <- totalCounts(DA_Prot, cutoff = DACutoff, column = DA_col)
 
   # Replace all instances of ">" in the Lineage column with "_"
   prot_tc$Lineage <- map(prot_tc$Lineage, ~ str_replace_all(.x, ">", "_")) %>%
@@ -485,7 +485,7 @@ DA_TotalCounts <- function(DA_Prot, DACutoff, DA_col, app_data) {
 }
 
 # Function to generate Domain Architecture Linear Table
-generate_DA_lin_table <- function(DA_col, ipr_path, DA_TotalCounts_value) {
+getDomArchLinearTable <- function(DA_col, ipr_path, DA_TotalCounts_value) {
     # Check if ipr_path is valid
     if (ipr_path == "") {
         stop("InterPro path is empty.")
@@ -534,7 +534,7 @@ generate_DA_lin_table <- function(DA_col, ipr_path, DA_TotalCounts_value) {
 }
 
 # Function to generate the Domain Architecture Lineage Plot
-generate_DA_heatmap_plot <- function(DA_col, DACutoff,
+getDomArchHeatmapPlot <- function(DA_col, DACutoff,
                                      DA_Prot, DA_lin_color,
                                      ipr_path) {
     # Check if ipr_path is valid
@@ -564,7 +564,7 @@ generate_DA_heatmap_plot <- function(DA_col, DACutoff,
 }
 
 # Function to generate the Domain Architecture Network
-generate_domain_network <- function(DA_col, DACutoff, DA_Prot,
+getDomNetwork <- function(DA_col, DACutoff, DA_Prot,
                                     networkLayout, ipr_path) {
     # Check if ipr_path is valid
     if (ipr_path == "") {
@@ -628,7 +628,7 @@ generate_domain_network <- function(DA_col, DACutoff, DA_Prot,
 }
 
 # Function to retrieve and clean Domain Architecture data
-get_DA_Prot <- function(app_data, DASelect) {
+getDomArchProt <- function(app_data, DASelect) {
     # Check if the ipr_path is valid
     if (app_data@ipr_path == "") {
         stop("InterPro path is empty.")
@@ -667,7 +667,7 @@ get_DA_Prot <- function(app_data, DASelect) {
 }
 
 # Function to retrieve domain architecture columns
-get_domarch_cols <- function(app_data, DASelect) {
+getDomArchCols <- function(app_data, DASelect) {
     # Check if app data DataFrame is not empty
     if (nrow(app_data@df) <= 0) {
         stop("No data available in app data.")
@@ -712,7 +712,7 @@ get_domarch_cols <- function(app_data, DASelect) {
 }
 
 # Function to generate the IPR genes plot
-generate_da_ipr_genes_plot <- function(app_data, da_iprDatabases,
+getDomArchIPRGenesPlot <- function(app_data, da_iprDatabases,
                                        da_iprVisType, DASelect) {
 
     if (app_data@ipr_path == "") {
@@ -760,7 +760,7 @@ generate_da_ipr_genes_plot <- function(app_data, da_iprDatabases,
 }
 
 # Function to filter proteins for phylogeny
-filter_phylogeny_proteins <- function(app_data, phylo_select) {
+filterPhylogenyProteins <- function(app_data, phylo_select) {
     # Get the data frame from app_data
     df <- app_data@df
 
@@ -783,7 +783,7 @@ filter_phylogeny_proteins <- function(app_data, phylo_select) {
 }
 
 # Function to retrieve representative accession numbers
-get_representative_accession_numbers <- function(app_data, phylo_select,
+getRepAccNum <- function(app_data, phylo_select,
                                                  msa_reduce_by, msa_rep_num,
                                                  rval_phylo) {
 
@@ -814,7 +814,7 @@ get_representative_accession_numbers <- function(app_data, phylo_select,
     }
 }
 
-generate_domain_architecture_plot <- function(ipr_path, query_names,
+getDomArchPlot <- function(ipr_path, query_names,
                                               analysis_type, group_by) {
     # Check if the input path is provided
     if (is.null(ipr_path) || ipr_path == "") {
@@ -839,7 +839,7 @@ generate_domain_architecture_plot <- function(ipr_path, query_names,
 }
 
 # Function to convert accessions to names
-acc_to_name <- function(app_data) {
+acc2Name <- function(app_data) {
     # Check if "AccNum" is a column in the data
     if (!("AccNum" %in% colnames(app_data@df))) {
         stop("Column 'AccNum' not found in data.")
@@ -859,7 +859,7 @@ acc_to_name <- function(app_data) {
     return(df)
 }
 
-rep_accnums <- function(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data) {
+repAccNums <- function(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data) {
   # If `phylo` is true, return all `AccNum` values from `app_data`
   if (phylo) {
     return(app_data@df$AccNum)
@@ -894,7 +894,7 @@ rep_accnums <- function(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data
   }
 }
 
-seq_tree <- function(fasta_filepath){
+seqTree <- function(fasta_filepath){
   my_seqs <- readAAStringSet(fasta_filepath) #, format="fasta", seek.first.rec=T)
   my_seqs_msa <- msa(my_seqs)
   my_seqs_msa_aln <- msaConvert(my_seqs_msa, type="seqinr::alignment")

From 5f360b147aaf43bf221c423532b4d175af725eab Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Wed, 20 Nov 2024 03:05:34 +0200
Subject: [PATCH 07/23] trailing spaces in sum

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 R/summarize.R | 138 ++++++++++++++++++++++++++------------------------
 1 file changed, 73 insertions(+), 65 deletions(-)

diff --git a/R/summarize.R b/R/summarize.R
index 350c533d..9ec823de 100644
--- a/R/summarize.R
+++ b/R/summarize.R
@@ -36,7 +36,7 @@
 #' filterByDomains()
 #' }
 filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remove = c(),
-    ignore.case = FALSE) {
+                            ignore.case = FALSE) {
     # Only rows with a domain in doms_keep will be kept
     # Any row containing a domain in doms_remove will be removed
 
@@ -213,7 +213,7 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms"
     valid_types <- c("da2doms", "doms2da")
     if (!conversion_type %in% valid_types) {
         abort(paste("Error: Invalid 'conversion_type'. Must be one of:",
-                   paste(valid_types, collapse = ", ")))
+                    paste(valid_types, collapse = ", ")))
     }
 
     z1 <- prot %>%
@@ -221,16 +221,16 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms"
         str_replace_all("\\,", " ") %>%
         str_replace_all("\"", " ")
     switch(conversion_type,
-        da2doms = {
-            z2 <- z1 %>%
-                str_replace_all("\\+", " ")
-        },
-        gc2da = {
-            z2 <- z1 %>%
-                str_replace_all("\\<-", " ") %>%
-                str_replace_all("-\\>", " ") %>%
-                str_replace_all("\\|", " ")
-        }
+           da2doms = {
+               z2 <- z1 %>%
+                   str_replace_all("\\+", " ")
+           },
+           gc2da = {
+               z2 <- z1 %>%
+                   str_replace_all("\\<-", " ") %>%
+                   str_replace_all("-\\>", " ") %>%
+                   str_replace_all("\\|", " ")
+           }
     )
     # str_replace_all("^c\\($", " ") %>%		# remove "c("
     # str_replace_all("\\)$", " ") %>%			# remove ")"
@@ -385,7 +385,7 @@ NULL
 #' }
 #'
 summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage",
-    query) {
+                               query) {
     # Check if 'prot' is a data frame
     if (!is.data.frame(prot)) {
         abort("Error: 'prot' must be a data frame.")
@@ -431,6 +431,7 @@ summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage"
 #' named `DomArch` and `Lineage`.
 #'
 #' @importFrom dplyr arrange count desc filter group_by summarise
+#' @importFrom rlang .data
 #'
 #' @return A tibble summarizing the counts of unique domain architectures
 #' (`DomArch`) per lineage (`Lineage`). The resulting table contains three
@@ -443,7 +444,7 @@ summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage"
 #'
 #' @examples
 #' \dontrun{
-#' summarizeDomArch_ByLineage()
+#' summarizeDomArch_ByLineage(data1)
 #' }
 summarizeDomArch_ByLineage <- function(x) {
     # Check if 'x' is a data frame
@@ -460,8 +461,8 @@ summarizeDomArch_ByLineage <- function(x) {
                    missing:", paste(missing_columns, collapse = ", ")))
     }
     x %>%
-        filter(!grepl("^-$", DomArch)) %>%
-        group_by(DomArch, Lineage) %>%
+        filter(!grepl("^-$", .data$DomArch)) %>%
+        group_by(.data$DomArch, .data$Lineage) %>%
         summarise(count = n()) %>% # , bin=as.numeric(as.logical(n()))
         arrange(desc(count))
 }
@@ -476,7 +477,8 @@ summarizeDomArch_ByLineage <- function(x) {
 #' named `DomArch` and a count column, such as `count`, which represents the
 #' occurrences of each architecture in various lineages.
 #'
-#' @importFrom dplyr arrange group_by filter summarise
+#' @importFrom dplyr arrange group_by filter summarise desc
+#' @importFrom rlang .data
 #'
 #' @return A tibble summarizing each unique `DomArch`, along with the following
 #' columns:
@@ -489,7 +491,7 @@ summarizeDomArch_ByLineage <- function(x) {
 #'
 #' @examples
 #' \dontrun{
-#' summarizeDomArch()
+#' summarizeDomArch(data1)
 #' }
 summarizeDomArch <- function(x) {
     # Check if 'x' is a data frame
@@ -497,11 +499,14 @@ summarizeDomArch <- function(x) {
         abort("Error: 'x' must be a data frame.")
     }
     x %>%
-        group_by(DomArch) %>%
-        summarise(totalcount = sum(count), totallin = n()) %>% # totallin=n_distinct(Lineage),
-        arrange(desc(totallin), desc(totalcount)) %>%
-        filter(!grepl(" \\{n\\}", DomArch)) %>%
-        filter(!grepl("^-$", DomArch))
+        group_by(.data$DomArch) %>%
+        summarise(
+            totalcount = sum(.data$count),
+            totallin = n()
+        ) %>%
+        arrange(desc(.data$totallin), desc(.data$totalcount)) %>%
+        filter(!grepl(" \\{n\\}", .data$DomArch)) %>%
+        filter(!grepl("^-$", .data$DomArch))
 }
 
 #' summarizeGenContext_ByDomArchLineage
@@ -510,6 +515,7 @@ summarizeDomArch <- function(x) {
 #' named `GenContext`, `DomArch`, and `Lineage`.
 #'
 #' @importFrom dplyr arrange desc filter group_by n summarise
+#' @importFrom rlang .data
 #'
 #' @return A tibble summarizing each unique combination of `GenContext`,
 #' `DomArch`, and `Lineage`, along with the following columns:
@@ -525,7 +531,7 @@ summarizeDomArch <- function(x) {
 #'
 #' @examples
 #' \dontrun{
-#' summarizeGenContext_ByDomArchLineage
+#' summarizeGenContext_ByDomArchLineage(your_data)
 #' }
 summarizeGenContext_ByDomArchLineage <- function(x) {
     # Check if 'x' is a data frame
@@ -533,28 +539,31 @@ summarizeGenContext_ByDomArchLineage <- function(x) {
         abort("Error: 'x' must be a data frame.")
     }
     x %>%
-        filter(!grepl("^-$", GenContext)) %>%
-        filter(!grepl("^-$", DomArch)) %>%
-        filter(!grepl("^-$", Lineage)) %>%
-        filter(!grepl("^NA$", DomArch)) %>%
-        group_by(GenContext, DomArch, Lineage) %>%
-        summarise(count = n()) %>% # , bin=as.numeric(as.logical(n()))
-        arrange(desc(count))
+        filter(!grepl("^-$", .data$GenContext)) %>%
+        filter(!grepl("^-$", .data$DomArch)) %>%
+        filter(!grepl("^-$", .data$Lineage)) %>%
+        filter(!grepl("^NA$", .data$DomArch)) %>%
+        group_by(.data$GenContext, .data$DomArch, .data$Lineage) %>%
+        summarise(count = n()) %>%
+        arrange(desc(.data$count))
 }
 
 #' summarizeGenContext_ByLineage
 #'
-#' @param x A dataframe or tibble containing the data.
+#' @param x A dataframe or tibble containing the data. It must have columns
+#' named `GenContext`, `DomArch`, and `Lineage`.
 #'
 #' @importFrom dplyr arrange desc filter group_by n summarise
+#' @importFrom rlang .data
 #'
-#' @return Describe return, in detail
+#' @return A tibble summarizing each unique combination of `GenContext` and `Lineage`,
+#' along with the count of occurrences. The results are arranged in descending order of count.
 #' @rdname MolEvolvR_summary
 #' @export
 #'
 #' @examples
 #' \dontrun{
-#' summarizeGenContext_ByLineage()
+#' summarizeGenContext_ByLineage(your_data)
 #' }
 summarizeGenContext_ByLineage <- function(x) {
     # Check if 'x' is a data frame
@@ -562,36 +571,35 @@ summarizeGenContext_ByLineage <- function(x) {
         abort("Error: 'x' must be a data frame.")
     }
     x %>%
-        filter(!grepl("^-$", GenContext)) %>%
-        filter(!grepl("^-$", DomArch)) %>%
-        filter(!grepl("^-$", Lineage)) %>%
-        filter(!grepl("^NA$", DomArch)) %>%
-        group_by(GenContext, Lineage) %>% # DomArch.norep,
-        summarise(count = n()) %>% # , bin=as.numeric(as.logical(n()))
-        arrange(desc(count))
+        filter(!grepl("^-$", .data$GenContext)) %>%
+        filter(!grepl("^-$", .data$DomArch)) %>%
+        filter(!grepl("^-$", .data$Lineage)) %>%
+        filter(!grepl("^NA$", .data$DomArch)) %>%
+        group_by(.data$GenContext, .data$Lineage) %>%
+        summarise(count = n()) %>%
+        arrange(desc(.data$count))
 }
 
 #' summarizeGenContext
 #'
 #' @param x A dataframe or tibble containing the data. It must have columns
-#' named `GenContext`, `DomArch`, and `Lineage`.
+#' named `GenContext`, `DomArch`, `Lineage`, and `count`.
 #'
-#' @importFrom dplyr arrange desc filter group_by n n_distinct summarise
+#' @importFrom dplyr arrange desc filter group_by n_distinct summarise
+#' @importFrom rlang .data
 #'
-#' @return A tibble summarizing each unique combination of `GenContext` and
-#' `Lineage`, along with the following columns:
-#' - `GenContext`: The genomic context for each entry.
-#' - `Lineage`: The lineage associated with each entry.
-#' - `count`: The total number of occurrences for each combination of
-#'  `GenContext` and `Lineage`.
+#' @return A tibble summarizing each unique `GenContext`, along with the following columns:
+#' - `totalcount`: The total count for each `GenContext`.
+#' - `totalDA`: The number of distinct `DomArch` for each `GenContext`.
+#' - `totallin`: The number of distinct `Lineage` for each `GenContext`.
 #'
-#' The results are arranged in descending order of `count`.
+#' The results are arranged in descending order of `totalcount`, `totalDA`, and `totallin`.
 #' @rdname MolEvolvR_summary
 #' @export
 #'
 #' @examples
 #' \dontrun{
-#' summarizeGenContext()
+#' summarizeGenContext(data1)
 #' }
 summarizeGenContext <- function(x) {
     # Check if 'x' is a data frame
@@ -599,15 +607,15 @@ summarizeGenContext <- function(x) {
         abort("Error: 'x' must be a data frame.")
     }
     x %>%
-        group_by(GenContext) %>%
+        group_by(.data$GenContext) %>%
         summarise(
-            totalcount = sum(count),
-            totalDA = n_distinct(DomArch),
-            totallin = n_distinct(Lineage)
-        ) %>% # totallin=n_distinct(Lineage),
-        arrange(desc(totalcount), desc(totalDA), desc(totallin)) %>%
-        filter(!grepl(" \\{n\\}", GenContext)) %>%
-        filter(!grepl("^-$", GenContext))
+            totalcount = sum(.data$count),
+            totalDA = n_distinct(.data$DomArch),
+            totallin = n_distinct(.data$Lineage)
+        ) %>%
+        arrange(desc(.data$totalcount), desc(.data$totalDA), desc(.data$totallin)) %>%
+        filter(!grepl(" \\{n\\}", .data$GenContext)) %>%
+        filter(!grepl("^-$", .data$GenContext))
 }
 
 
@@ -654,8 +662,8 @@ summarizeGenContext <- function(x) {
 #' totalGenContextOrDomArchCounts(pspa - gc_lin_counts, 0, "GC")
 #' }
 totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col = "Lineage",
-    cutoff = 90, RowsCutoff = FALSE, digits = 2
-    # type = "GC"
+                                           cutoff = 90, RowsCutoff = FALSE, digits = 2
+                                           # type = "GC"
 ) {
     # Check if 'prot' is a data frame
     if (!is.data.frame(prot)) {
@@ -668,7 +676,7 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col
 
     if (length(missing_columns) > 0) {
         abort(paste("Error: The following required columns are missing:",
-                   paste(missing_columns, collapse = ", ")))
+                    paste(missing_columns, collapse = ", ")))
     }
 
     # Check that cutoff is a numeric value between 0 and 100
@@ -682,7 +690,7 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col
         abort("Error: 'digits' must be a non-negative integer.")
     }
 
-    # column <- sym(column)
+    column <- sym(column)
 
     prot <- select(prot, {{ column }}, {{ lineage_col }}) %>%
         filter(!is.na({{ column }}) & !is.na({{ lineage_col }})) %>%
@@ -690,10 +698,10 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col
 
     prot <- summarizeByLineage(prot, column, by = lineage_col, query = "all")
     col_count <- prot %>%
-        group_by(!!sym(column)) %>%
+        group_by({{ column }}) %>%
         summarise(totalcount = sum(count))
 
-    total <- left_join(prot, col_count, by = column)
+    total <- left_join(prot, col_count, by = as_string(column))
 
     sum_count <- sum(total$count)
     total <- total %>%

From e946d09052349390f28ca150e4c7696edb060ad5 Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Wed, 20 Nov 2024 03:11:00 +0200
Subject: [PATCH 08/23] trailing spaces in sum

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 R/summarize.R | 208 +++++++++++++++++++++++++-------------------------
 1 file changed, 104 insertions(+), 104 deletions(-)

diff --git a/R/summarize.R b/R/summarize.R
index 9ec823de..c9e965bd 100644
--- a/R/summarize.R
+++ b/R/summarize.R
@@ -36,23 +36,23 @@
 #' filterByDomains()
 #' }
 filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remove = c(),
-                            ignore.case = FALSE) {
+    ignore.case = FALSE) {
     # Only rows with a domain in doms_keep will be kept
     # Any row containing a domain in doms_remove will be removed
 
     # ^word$|(?<=\+)word$|(?<=\+)word(?=\+)|word(?=\+)
-
+    
     # Check if prot is a data frame
     if (!is.data.frame(prot)) {
         abort("Error: 'prot' must be a data frame.")
     }
-
+    
     # Check if the specified column exists in the data frame
     if (!column %in% names(prot)) {
-        abort(paste("Error: The specified column '", column, "' does not exist
+        abort(paste("Error: The specified column '", column, "' does not exist 
                    in the data frame.", sep = ""))
     }
-
+    
     # If doms_keep or doms_remove are not provided, inform the user
     if (length(doms_keep) == 0 && length(doms_remove) == 0) {
         warning("Warning: No domains specified to keep or remove. Returning the
@@ -109,25 +109,25 @@ filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remo
 #' countByColumn
 #' @description
 #' Function to obtain element counts (DA, GC)
-#'
-#' @param prot A data frame containing the dataset to analyze, typically with
+#' 
+#' @param prot A data frame containing the dataset to analyze, typically with 
 #' multiple columns including the one specified by the `column` parameter.
-#' @param column A character string specifying the name of the column to analyze.
+#' @param column A character string specifying the name of the column to analyze. 
 #' The default is "DomArch".
-#' @param min.freq An integer specifying the minimum frequency an element must
+#' @param min.freq An integer specifying the minimum frequency an element must 
 #' have to be included in the output. Default is 1.
 #'
 #' @importFrom dplyr arrange as_tibble filter select
 #'
 #' @return A tibble with two columns:
 #' \describe{
-#'   \item{`column`}{The unique elements from the specified column
+#'   \item{`column`}{The unique elements from the specified column 
 #'   (e.g., "DomArch").}
-#'   \item{`freq`}{The frequency of each element, i.e., the number of times
+#'   \item{`freq`}{The frequency of each element, i.e., the number of times 
 #'   each element appears in the specified column.}
 #' }
-#' The tibble is filtered to only include elements that have a frequency
-#' greater than or equal to `min.freq` and does not include elements with `NA`
+#' The tibble is filtered to only include elements that have a frequency 
+#' greater than or equal to `min.freq` and does not include elements with `NA` 
 #' values or those starting with a hyphen ("-").
 #' @export
 #'
@@ -136,20 +136,20 @@ filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remo
 #' countByColumn(prot = my_data, column = "DomArch", min.freq = 10)
 #' }
 countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) {
-
+    
     # Check if 'prot' is a data frame
     if (!is.data.frame(prot)) {
         abort("Error: 'prot' must be a data frame.")
     }
-
+    
     # Check if the specified column exists in the data frame
     if (!column %in% names(prot)) {
         abort(paste("Error: The specified column '", column, "' does not exist in
                    the data frame.", sep = ""))
     }
-
+    
     # Check if min.freq is a positive integer
-    if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 ||
+    if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 || 
         floor(min.freq) != min.freq) {
         abort("Error: 'min.freq' must be a positive integer.")
     }
@@ -171,14 +171,14 @@ countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) {
 #' Break string ELEMENTS into WORDS for domain architecture (DA) and genomic
 #' context (GC)
 #'
-#' @param prot A dataframe containing the dataset to analyze. The specified
+#' @param prot A dataframe containing the dataset to analyze. The specified 
 #' `column` contains the string elements to be processed.
-#' @param column A character string specifying the name of the column to analyze.
+#' @param column A character string specifying the name of the column to analyze. 
 #' Default is "DomArch".
-#' @param conversion_type A character string specifying the type of conversion.
+#' @param conversion_type A character string specifying the type of conversion. 
 #' Two options are available:
 #' \describe{
-#'   \item{`da2doms`}{Convert domain architectures into individual domains by
+#'   \item{`da2doms`}{Convert domain architectures into individual domains by 
 #'   replacing `+` symbols with spaces.}
 #'   \item{`gc2da`}{Convert genomic context into domain architectures by
 #'    replacing directional symbols (`<-`, `->`, and `|`) with spaces.}
@@ -187,13 +187,13 @@ countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) {
 #' @importFrom dplyr pull
 #' @importFrom stringr str_replace_all
 #'
-#' @return A single string where elements are delimited by spaces. The function
-#' performs necessary substitutions based on the `conversion_type` and cleans up
+#' @return A single string where elements are delimited by spaces. The function 
+#' performs necessary substitutions based on the `conversion_type` and cleans up 
 #' extraneous characters like newlines, tabs, and multiple spaces.
 #'
 #' @examples
 #' \dontrun{
-#' tibble::tibble(DomArch = c("aaa+bbb",
+#' tibble::tibble(DomArch = c("aaa+bbb", 
 #' "a+b", "b+c", "b-c")) |> elements2Words()
 #' }
 #'
@@ -202,35 +202,35 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms"
     if (!is.data.frame(prot)) {
         abort("Error: 'prot' must be a data frame.")
     }
-
+    
     # Check if the specified column exists in the data frame
     if (!column %in% names(prot)) {
-        abort(paste("Error: The specified column '", column, "' does not exist in
+        abort(paste("Error: The specified column '", column, "' does not exist in 
                    the data frame.", sep = ""))
     }
-
+    
     # Check for valid conversion_type values
     valid_types <- c("da2doms", "doms2da")
     if (!conversion_type %in% valid_types) {
-        abort(paste("Error: Invalid 'conversion_type'. Must be one of:",
-                    paste(valid_types, collapse = ", ")))
+        abort(paste("Error: Invalid 'conversion_type'. Must be one of:", 
+                   paste(valid_types, collapse = ", ")))
     }
-
+    
     z1 <- prot %>%
         dplyr::pull(column) %>%
         str_replace_all("\\,", " ") %>%
         str_replace_all("\"", " ")
     switch(conversion_type,
-           da2doms = {
-               z2 <- z1 %>%
-                   str_replace_all("\\+", " ")
-           },
-           gc2da = {
-               z2 <- z1 %>%
-                   str_replace_all("\\<-", " ") %>%
-                   str_replace_all("-\\>", " ") %>%
-                   str_replace_all("\\|", " ")
-           }
+        da2doms = {
+            z2 <- z1 %>%
+                str_replace_all("\\+", " ")
+        },
+        gc2da = {
+            z2 <- z1 %>%
+                str_replace_all("\\<-", " ") %>%
+                str_replace_all("-\\>", " ") %>%
+                str_replace_all("\\|", " ")
+        }
     )
     # str_replace_all("^c\\($", " ") %>%		# remove "c("
     # str_replace_all("\\)$", " ") %>%			# remove ")"
@@ -252,20 +252,20 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms"
 #' @description
 #' Get word counts (wc) [DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)]
 #'
-#' @param string A character string containing the elements (words) to count.
-#' This would typically be a space-delimited string representing domain
+#' @param string A character string containing the elements (words) to count. 
+#' This would typically be a space-delimited string representing domain 
 #' architectures or genomic contexts.
 #'
 #' @importFrom dplyr as_tibble filter arrange
 #' @importFrom stringr str_replace_all
 #'
-#' @return A tibble (tbl_df) with two columns:
+#' @return A tibble (tbl_df) with two columns: 
 #' \describe{
-#'   \item{`words`}{A column containing the individual words
+#'   \item{`words`}{A column containing the individual words 
 #'   (domains or domain architectures).}
 #'   \item{`freq`}{A column containing the frequency counts for each word.}
 #' }
-#'
+#' 
 #'
 #' @examples
 #' \dontrun{
@@ -279,7 +279,7 @@ words2WordCounts <- function(string) {
     if (!is.character(string) || length(string) != 1) {
         abort("Error: 'string' must be a single character vector.")
     }
-
+    
     df_word_count <- string %>%
         # reduce spaces with length 2 or greater to a single space
         str_replace_all("\\s{2,}", " ") %>%
@@ -311,14 +311,14 @@ words2WordCounts <- function(string) {
 #' filterByFrequency
 #' @description
 #' Function to filter based on frequencies
-#'
-#' @param x A tibble (tbl_df) containing at least two columns: one for
+#' 
+#' @param x A tibble (tbl_df) containing at least two columns: one for 
 #' elements (e.g., `words`) and one for their frequency (e.g., `freq`).
-#' @param min.freq A numeric value specifying the minimum frequency threshold.
-#' Only elements with frequencies greater than or equal to this value will be
+#' @param min.freq A numeric value specifying the minimum frequency threshold. 
+#' Only elements with frequencies greater than or equal to this value will be 
 #' retained.
 #'
-#' @return A tibble with the same structure as `x`, but filtered to include
+#' @return A tibble with the same structure as `x`, but filtered to include 
 #' only rows where the frequency is greater than or equal to `min.freq`.
 #'
 #' @export
@@ -328,18 +328,18 @@ words2WordCounts <- function(string) {
 #' filterByFrequency()
 #' }
 filterByFrequency <- function(x, min.freq) {
-
+    
     # Check if 'x' is a data frame
     if (!is.data.frame(x)) {
         abort("Error: 'x' must be a data frame.")
     }
-
+    
     # Check if 'min.freq' is a positive integer
-    if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 ||
+    if (!is.numeric(min.freq) || length(min.freq) != 1 || min.freq < 1 || 
         floor(min.freq) != min.freq) {
         abort("Error: 'min.freq' must be a positive integer.")
     }
-
+    
     # Check if the 'freq' column exists in the data frame
     if (!"freq" %in% names(x)) {
         abort("Error: The data frame must contain a 'freq' column.")
@@ -355,24 +355,24 @@ filterByFrequency <- function(x, min.freq) {
 #' @name MolEvolvR_summary
 #' @description
 #' A collection of summary functions for the MolEvolvR package.
-#'
+#' 
 NULL
 
 #' summarizeByLineage
 #'
 #' @param prot A dataframe or tibble containing the data.
-#' @param column A string representing the column to be summarized
+#' @param column A string representing the column to be summarized 
 #' (e.g., `DomArch`). Default is "DomArch".
-#' @param by A string representing the grouping column (e.g., `Lineage`).
+#' @param by A string representing the grouping column (e.g., `Lineage`). 
 #' Default is "Lineage".
-#' @param query A string specifying the query pattern for filtering the target
+#' @param query A string specifying the query pattern for filtering the target 
 #' column. Use "all" to skip filtering and include all rows.
 #'
 #' @importFrom dplyr arrange filter group_by summarise
 #' @importFrom rlang sym
 #'
-#' @return A tibble summarizing the counts of occurrences of elements in
-#' the `column`, grouped by the `by` column. The result includes the number
+#' @return A tibble summarizing the counts of occurrences of elements in 
+#' the `column`, grouped by the `by` column. The result includes the number 
 #' of occurrences (`count`) and is arranged in descending order of count.
 #' @rdname MolEvolvR_summary
 #' @export
@@ -385,24 +385,24 @@ NULL
 #' }
 #'
 summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage",
-                               query) {
+    query) {
     # Check if 'prot' is a data frame
     if (!is.data.frame(prot)) {
         abort("Error: 'prot' must be a data frame.")
     }
-
+    
     # Check if the specified column exists in the data frame
     if (!column %in% names(prot)) {
-        abort(paste("Error: The specified column '", column, "' does not exist in
+        abort(paste("Error: The specified column '", column, "' does not exist in 
                    the data frame.", sep = ""))
     }
-
+    
     # Check if the 'by' column exists in the data frame
     if (!by %in% names(prot)) {
-        abort(paste("Error: The specified 'by' column '", by, "' does not exist
+        abort(paste("Error: The specified 'by' column '", by, "' does not exist 
                    n the data frame.", sep = ""))
     }
-
+    
     column <- sym(column)
     by <- sym(by)
     if (query == "all") {
@@ -427,16 +427,16 @@ summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage"
 #' Function to summarize and retrieve counts by Domains & Domains+Lineage
 #'
 #'
-#' @param x A dataframe or tibble containing the data. It must have columns
+#' @param x A dataframe or tibble containing the data. It must have columns 
 #' named `DomArch` and `Lineage`.
 #'
 #' @importFrom dplyr arrange count desc filter group_by summarise
 #' @importFrom rlang .data
 #'
-#' @return A tibble summarizing the counts of unique domain architectures
-#' (`DomArch`) per lineage (`Lineage`). The resulting table contains three
-#' columns: `DomArch`, `Lineage`, and `count`, which indicates the frequency
-#' of each domain architecture for each lineage. The results are arranged in
+#' @return A tibble summarizing the counts of unique domain architectures 
+#' (`DomArch`) per lineage (`Lineage`). The resulting table contains three 
+#' columns: `DomArch`, `Lineage`, and `count`, which indicates the frequency 
+#' of each domain architecture for each lineage. The results are arranged in 
 #' descending order of `count`.
 #' @rdname MolEvolvR_summary
 #'
@@ -451,13 +451,13 @@ summarizeDomArch_ByLineage <- function(x) {
     if (!is.data.frame(x)) {
         abort("Error: 'x' must be a data frame.")
     }
-
+    
     # Check if required columns exist in the data frame
     required_columns <- c("DomArch", "Lineage")
     missing_columns <- setdiff(required_columns, names(x))
-
+    
     if (length(missing_columns) > 0) {
-        abort(paste("Error: The following required columns are
+        abort(paste("Error: The following required columns are 
                    missing:", paste(missing_columns, collapse = ", ")))
     }
     x %>%
@@ -473,17 +473,17 @@ summarizeDomArch_ByLineage <- function(x) {
 #' @description
 #' Function to retrieve counts of how many lineages a DomArch appears in
 #'
-#' @param x A dataframe or tibble containing the data. It must have a column
-#' named `DomArch` and a count column, such as `count`, which represents the
+#' @param x A dataframe or tibble containing the data. It must have a column 
+#' named `DomArch` and a count column, such as `count`, which represents the 
 #' occurrences of each architecture in various lineages.
 #'
 #' @importFrom dplyr arrange group_by filter summarise desc
 #' @importFrom rlang .data
 #'
-#' @return A tibble summarizing each unique `DomArch`, along with the following
+#' @return A tibble summarizing each unique `DomArch`, along with the following 
 #' columns:
 #' - `totalcount`: The total occurrences of each `DomArch` across all lineages.
-#' - `totallin`: The total number of unique lineages in which each `DomArch`
+#' - `totallin`: The total number of unique lineages in which each `DomArch` 
 #' appears.
 #' The results are arranged in descending order of `totallin` and `totalcount`.
 #' @rdname MolEvolvR_summary
@@ -501,7 +501,7 @@ summarizeDomArch <- function(x) {
     x %>%
         group_by(.data$DomArch) %>%
         summarise(
-            totalcount = sum(.data$count),
+            totalcount = sum(.data$count), 
             totallin = n()
         ) %>%
         arrange(desc(.data$totallin), desc(.data$totalcount)) %>%
@@ -511,18 +511,18 @@ summarizeDomArch <- function(x) {
 
 #' summarizeGenContext_ByDomArchLineage
 #'
-#' @param x A dataframe or tibble containing the data. It must have columns
+#' @param x A dataframe or tibble containing the data. It must have columns 
 #' named `GenContext`, `DomArch`, and `Lineage`.
 #'
 #' @importFrom dplyr arrange desc filter group_by n summarise
 #' @importFrom rlang .data
 #'
-#' @return A tibble summarizing each unique combination of `GenContext`,
+#' @return A tibble summarizing each unique combination of `GenContext`, 
 #' `DomArch`, and `Lineage`, along with the following columns:
 #' - `GenContext`: The genomic context for each entry.
 #' - `DomArch`: The domain architecture for each entry.
 #' - `Lineage`: The lineage associated with each entry.
-#' - `count`: The total number of occurrences for each combination of
+#' - `count`: The total number of occurrences for each combination of 
 #' `GenContext`, `DomArch`, and `Lineage`.
 #'
 #' The results are arranged in descending order of `count`.
@@ -550,13 +550,13 @@ summarizeGenContext_ByDomArchLineage <- function(x) {
 
 #' summarizeGenContext_ByLineage
 #'
-#' @param x A dataframe or tibble containing the data. It must have columns
+#' @param x A dataframe or tibble containing the data. It must have columns 
 #' named `GenContext`, `DomArch`, and `Lineage`.
 #'
 #' @importFrom dplyr arrange desc filter group_by n summarise
 #' @importFrom rlang .data
 #'
-#' @return A tibble summarizing each unique combination of `GenContext` and `Lineage`,
+#' @return A tibble summarizing each unique combination of `GenContext` and `Lineage`, 
 #' along with the count of occurrences. The results are arranged in descending order of count.
 #' @rdname MolEvolvR_summary
 #' @export
@@ -582,7 +582,7 @@ summarizeGenContext_ByLineage <- function(x) {
 
 #' summarizeGenContext
 #'
-#' @param x A dataframe or tibble containing the data. It must have columns
+#' @param x A dataframe or tibble containing the data. It must have columns 
 #' named `GenContext`, `DomArch`, `Lineage`, and `count`.
 #'
 #' @importFrom dplyr arrange desc filter group_by n_distinct summarise
@@ -631,13 +631,13 @@ summarizeGenContext <- function(x) {
 #' @param prot  A data frame that must contain columns:
 #' \itemize{\item Either 'GenContext' or 'DomArch.norep' \item count}
 #' @param column Character. The column to summarize, default is "DomArch".
-#' @param lineage_col Character. The name of the lineage column, default is
+#' @param lineage_col Character. The name of the lineage column, default is 
 #' "Lineage".
-#' @param cutoff Numeric. Cutoff for total count. Counts below this cutoff value
+#' @param cutoff Numeric. Cutoff for total count. Counts below this cutoff value 
 #' will not be shown. Default is 0.
-#' @param RowsCutoff Logical. If TRUE, filters based on cumulative percentage
+#' @param RowsCutoff Logical. If TRUE, filters based on cumulative percentage 
 #' cutoff. Default is FALSE.
-#' @param digits Numeric. Number of decimal places for percentage columns.
+#' @param digits Numeric. Number of decimal places for percentage columns. 
 #' Default is 2.
 #'
 #'
@@ -646,9 +646,9 @@ summarizeGenContext <- function(x) {
 #'
 #' @return A data frame with the following columns:
 #' - `{{ column }}`: Unique values from the specified column.
-#' - `totalcount`: The total count of occurrences for each unique value in
+#' - `totalcount`: The total count of occurrences for each unique value in 
 #' the specified column.
-#' - `IndividualCountPercent`: The percentage of each `totalcount` relative to
+#' - `IndividualCountPercent`: The percentage of each `totalcount` relative to 
 #' the overall count.
 #' - `CumulativePercent`: The cumulative percentage of total counts.
 #' @rdname MolEvolvR_summary
@@ -662,34 +662,34 @@ summarizeGenContext <- function(x) {
 #' totalGenContextOrDomArchCounts(pspa - gc_lin_counts, 0, "GC")
 #' }
 totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col = "Lineage",
-                                           cutoff = 90, RowsCutoff = FALSE, digits = 2
-                                           # type = "GC"
+    cutoff = 90, RowsCutoff = FALSE, digits = 2
+    # type = "GC"
 ) {
     # Check if 'prot' is a data frame
     if (!is.data.frame(prot)) {
         abort("Error: 'prot' must be a data frame.")
     }
-
+    
     # Check if the specified columns exist in the data frame
     required_columns <- c(column, lineage_col)
     missing_columns <- setdiff(required_columns, names(prot))
-
+    
     if (length(missing_columns) > 0) {
-        abort(paste("Error: The following required columns are missing:",
-                    paste(missing_columns, collapse = ", ")))
+        abort(paste("Error: The following required columns are missing:", 
+                   paste(missing_columns, collapse = ", ")))
     }
-
+    
     # Check that cutoff is a numeric value between 0 and 100
     if (!is.numeric(cutoff) || length(cutoff) != 1 || cutoff < 0 || cutoff > 100) {
         abort("Error: 'cutoff' must be a numeric value between 0 and 100.")
     }
-
+    
     # Check that digits is a non-negative integer
-    if (!is.numeric(digits) || length(digits) != 1 || digits < 0 ||
+    if (!is.numeric(digits) || length(digits) != 1 || digits < 0 || 
         floor(digits) != digits) {
         abort("Error: 'digits' must be a non-negative integer.")
     }
-
+    
     column <- sym(column)
 
     prot <- select(prot, {{ column }}, {{ lineage_col }}) %>%
@@ -853,7 +853,7 @@ findParalogs <- function(prot) {
     if (!is.data.frame(prot)) {
         abort("Error: 'prot' must be a data frame.")
     }
-
+    
     # Remove eukaryotes
     prot <- prot %>% filter(!grepl("^eukaryota", Lineage))
     paralogTable <- prot %>%
@@ -901,4 +901,4 @@ findParalogs <- function(prot) {
 # cat("Word counts for broken up domains from DAs and DAs from GCs.
 # \nFor e.g.:
 # DA.doms.wc <- query.sub$DA.doms %>%
-#   words2WordCounts()")
+#   words2WordCounts()")
\ No newline at end of file

From 424b0e8751db2428ee58ae02fcd383f4561eb25d Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Wed, 20 Nov 2024 03:23:32 +0200
Subject: [PATCH 09/23] trailing spaces

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 R/networks_domarch.R | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/R/networks_domarch.R b/R/networks_domarch.R
index e4da7aca..84522943 100755
--- a/R/networks_domarch.R
+++ b/R/networks_domarch.R
@@ -24,16 +24,16 @@
 #' A network of domains is returned based on shared domain architectures.
 #'
 #' @param prot A data frame that contains the column 'DomArch'.
-#' @param column Name of column containing Domain architecture from which nodes
+#' @param column Name of column containing Domain architecture from which nodes 
 #' and edges are generated.
 #' @param domains_of_interest Character vector specifying domains of interest.
-#' @param cutoff Integer. Only use domains that occur at or above the cutoff for
+#' @param cutoff Integer. Only use domains that occur at or above the cutoff for 
 #' total counts if cutoff_type is "Total Count".
-#' Only use domains that appear in cutoff or greater lineages if cutoff_type is
+#' Only use domains that appear in cutoff or greater lineages if cutoff_type is 
 #' Lineage.
 #' @param layout Character. Layout type to be used for the network. Options are:
 #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"}
-#' @param query_color Character. Color to represent the queried domain in the
+#' @param query_color Character. Color to represent the queried domain in the 
 #' network.
 #'
 #' @importFrom dplyr across add_row all_of distinct filter mutate pull select
@@ -231,18 +231,18 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c
 #'
 #'
 #' @param prot A data frame that contains the column 'DomArch'.
-#' @param column Name of column containing Domain architecture from which nodes
+#' @param column Name of column containing Domain architecture from which nodes 
 #' and edges are generated.
 #' @param domains_of_interest Character vector specifying the domains of interest.
-#' @param cutoff Integer. Only use domains that occur at or above the cutoff for
+#' @param cutoff Integer. Only use domains that occur at or above the cutoff for 
 #' total counts if cutoff_type is "Total Count".
-#' Only use domains that appear in cutoff or greater lineages if cutoff_type is
+#' Only use domains that appear in cutoff or greater lineages if cutoff_type is 
 #' Lineage.
 #' @param layout Character. Layout type to be used for the network. Options are:
 #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"}
-#' @param query_color Color that the nodes of the domains in the
+#' @param query_color Color that the nodes of the domains in the 
 #' domains_of_interest vector are colored
-#' @param partner_color Color that the nodes that are not part of the
+#' @param partner_color Color that the nodes that are not part of the 
 #' domains_of_interest vector are colored
 #' @param border_color Color for the borders of the nodes.
 #' @param IsDirected Is the network directed? Set to false to eliminate arrows
@@ -364,4 +364,4 @@ createBinaryDomainNetwork <- function(prot, column = "DomArch", domains_of_inter
         "auto" = visIgraphLayout(vg, "layout.auto")
     )
     vg
-}
+}
\ No newline at end of file

From 9ce780b652fe43bfa8098b35afab05a8f494b4e1 Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Wed, 20 Nov 2024 03:35:03 +0200
Subject: [PATCH 10/23] trailing spaces

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 R/pre-msa-tree.R | 66 ++++++++++++++++++++++++------------------------
 R/summarize.R    |  6 ++---
 2 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/R/pre-msa-tree.R b/R/pre-msa-tree.R
index e15e7996..6e2e8fd7 100644
--- a/R/pre-msa-tree.R
+++ b/R/pre-msa-tree.R
@@ -46,7 +46,7 @@ api_key <- Sys.getenv("ENTREZ_API_KEY", unset = "YOUR_KEY_HERE")
 #' @param y Delimitter. Default is space (" ").
 #'
 #' @importFrom rlang abort
-#'
+#' 
 #' @return A character vector in title case.
 #' @export
 #'
@@ -112,21 +112,21 @@ addLeaves2Alignment <- function(aln_file = "",
     lin_file = "data/rawdata_tsv/all_semiclean.txt", # !! finally change to all_clean.txt!!
     # lin_file="data/rawdata_tsv/PspA.txt",
     reduced = FALSE) {
-
+    
     #Check if the alignment file is provided and exists
     if (nchar(aln_file) == 0) {
         abort("Error: Alignment file path must be provided.")
     }
-
+    
     if (!file.exists(aln_file)) {
         abort(paste("Error: The alignment file '", aln_file, "' does not exist."))
     }
-
+    
     # Check if the lineage file exists
     if (!file.exists(lin_file)) {
         abort(paste("Error: The lineage file '", lin_file, "' does not exist."))
     }
-
+    
     # Check that the 'reduced' parameter is logical
     if (!is.logical(reduced) || length(reduced) != 1) {
         abort("Error: 'reduced' must be a single logical value (TRUE or FALSE).")
@@ -249,15 +249,15 @@ addName <- function(data,
     if (!is.data.frame(data)) {
         abort("Error: The input 'data' must be a data frame")
     }
-
+    
     # Check that the specified columns exist in the data
     required_cols <- c(accnum_col, spec_col, lin_col)
     missing_cols <- setdiff(required_cols, names(data))
     if (length(missing_cols) > 0) {
-        abort(paste("Error: The following columns are missing from the data:",
+        abort(paste("Error: The following columns are missing from the data:", 
                    paste(missing_cols, collapse = ", ")))
     }
-
+    
     cols <- c(accnum_col, "Kingdom", "Phylum", "Genus", "Spp")
     split_data <- data %>%
         separate(
@@ -347,16 +347,16 @@ convertAlignment2FA <- function(aln_file = "",
     if (nchar(aln_file) == 0) {
         abort("Error: Alignment file path must be provided.")
     }
-
+    
     if (!file.exists(aln_file)) {
         abort(paste("Error: The alignment file '", aln_file, "' does not exist."))
     }
-
+    
     # Check if the lineage file exists
     if (!file.exists(lin_file)) {
         abort(paste("Error: The lineage file '", lin_file, "' does not exist."))
     }
-
+    
     # Check that the 'reduced' parameter is logical
     if (!is.logical(reduced) || length(reduced) != 1) {
         abort("Error: 'reduced' must be a single logical value (TRUE or FALSE).")
@@ -424,14 +424,14 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") {
     if (!is.data.frame(acc2name)) {
         abort("Error: acc2name must be a data frame.")
     }
-
+    
     # Check if the specified columns exist in the data frame
     if (!(acc_col %in% colnames(acc2name))) {
-        abort("Error: The specified acc_col '", acc_col, "' does not exist in
+        abort("Error: The specified acc_col '", acc_col, "' does not exist in 
              acc2name.")
     }
     if (!(name_col %in% colnames(acc2name))) {
-        abort("Error: The specified name_col '", name_col, "' does not exist in
+        abort("Error: The specified name_col '", name_col, "' does not exist in 
              acc2name.")
     }
 
@@ -475,7 +475,7 @@ rename_fasta <- function(fa_path, outpath,
         abort("Error: The input FASTA file does not exist at the specified
              path: ", fa_path)
     }
-
+    
     # Check if the output path is writable
     outdir <- dirname(outpath)
     if (!dir.exists(outdir)) {
@@ -541,20 +541,20 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"),
     reduced = F) {
     # Check if the alignment path exists
     if (!dir.exists(aln_path)) {
-        abort("Error: The alignment directory does not exist at the specified
+        abort("Error: The alignment directory does not exist at the specified 
              path: ", aln_path)
     }
-
+    
     # Check if the output path exists; if not, attempt to create it
     if (!dir.exists(fa_outpath)) {
         dir.create(fa_outpath, recursive = TRUE)
-        message("Note: The output directory did not exist and has been created: ",
+        message("Note: The output directory did not exist and has been created: ", 
                 fa_outpath)
     }
-
+    
     # Check if the linear file exists
     if (!file.exists(lin_file)) {
-        abort("Error: The linear file does not exist at the specified path: ",
+        abort("Error: The linear file does not exist at the specified path: ", 
              lin_file)
     }
     # library(here)
@@ -626,7 +626,7 @@ acc2FA <- function(accessions, outpath, plan = "sequential") {
     if (!is.character(accessions) || length(accessions) == 0) {
         abort("Error: 'accessions' must be a non-empty character vector.")
     }
-
+    
     if (!dir.exists(dirname(outpath))) {
         abort("Error: The output directory does not exist: ", dirname(outpath))
     }
@@ -676,7 +676,7 @@ acc2FA <- function(accessions, outpath, plan = "sequential") {
                 id = accessions_partitioned[[x]],
                 db = "protein",
                 rettype = "fasta",
-                #api_key = Sys.getenv("ENTREZ_API_KEY")
+                # api_key = Sys.getenv("ENTREZ_API_KEY")
             )
         )
     })
@@ -732,21 +732,21 @@ acc2FA <- function(accessions, outpath, plan = "sequential") {
 createRepresentativeAccNum <- function(prot_data,
     reduced = "Lineage",
     accnum_col = "AccNum") {
-
+    
     # Validate input
     if (!is.data.frame(prot_data)) {
         abort("Error: 'prot_data' must be a data frame.")
     }
-
+    
     # Check if the reduced column exists in prot_data
     if (!(reduced %in% colnames(prot_data))) {
-        abort("Error: The specified reduced column '", reduced, "' does not
+        abort("Error: The specified reduced column '", reduced, "' does not 
              exist in the data frame.")
     }
-
+    
     # Check if the accnum_col exists in prot_data
     if (!(accnum_col %in% colnames(prot_data))) {
-        abort("Error: The specified accession number column '", accnum_col, "'
+        abort("Error: The specified accession number column '", accnum_col, "' 
              does not exist in the data frame.")
     }
     # Get Unique reduced column and then bind the AccNums back to get one AccNum per reduced column
@@ -808,7 +808,7 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) {
     if (!file.exists(fasta_file)) {
         abort("Error: The FASTA file does not exist: ", fasta_file)
     }
-
+    
     # if (file_ext(fasta_file) != "fasta" && file_ext(fasta_file) != "fa") {
     #    abort("Error: The specified file is not a valid FASTA file: ", fasta_file)
     # }
@@ -857,23 +857,23 @@ writeMSA_AA2FA <- function(alignment, outpath) {
     if (!inherits(alignment, "AAMultipleAlignment")) {
         abort("Error: The alignment must be of type 'AAMultipleAlignment'.")
     }
-
+    
     # Check the output path is a character string
     if (!is.character(outpath) || nchar(outpath) == 0) {
         abort("Error: Invalid output path specified.")
     }
-
+    
     # Check if the output directory exists
     outdir <- dirname(outpath)
     if (!dir.exists(outdir)) {
         abort("Error: The output directory does not exist: ", outdir)
     }
 
-    l <- length(names(unmasked(alignment)))
+    l <- length(names(unmasked((alignment))))
     fasta <- ""
     for (i in 1:l)
     {
-        fasta <- paste0(fasta, paste(">", names(unmasked(alignment)[i])), "\n")
+        fasta <- paste0(fasta, paste(">", names(unmasked((alignment)))[i]), "\n")
         seq <- toString(unmasked(alignment)[[i]])
         fasta <- paste0(fasta, seq, "\n")
     }
@@ -955,4 +955,4 @@ getAccNumFromFA <- function(fasta_file) {
 #   cfile <- read_delim("data/alignments/pspc.gismo.aln", delim=" ")
 #   cfile <- as.data.frame(map(cfile,function(x) gsub("\\s+", "",x)))
 #   colnames(cfile) <- c("AccNum", "Alignment")
-# }
+# }
\ No newline at end of file
diff --git a/R/summarize.R b/R/summarize.R
index c9e965bd..b1fef89d 100644
--- a/R/summarize.R
+++ b/R/summarize.R
@@ -690,7 +690,7 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col
         abort("Error: 'digits' must be a non-negative integer.")
     }
     
-    column <- sym(column)
+    # column <- sym(column)
 
     prot <- select(prot, {{ column }}, {{ lineage_col }}) %>%
         filter(!is.na({{ column }}) & !is.na({{ lineage_col }})) %>%
@@ -698,10 +698,10 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col
 
     prot <- summarizeByLineage(prot, column, by = lineage_col, query = "all")
     col_count <- prot %>%
-        group_by({{ column }}) %>%
+        group_by(!!sym(column)) %>%
         summarise(totalcount = sum(count))
 
-    total <- left_join(prot, col_count, by = as_string(column))
+    total <- left_join(prot, col_count, by = column)
 
     sum_count <- sum(total$count)
     total <- total %>%

From 397c02c028cd429bcf157afbea06c560f12db633 Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Wed, 20 Nov 2024 04:29:41 +0200
Subject: [PATCH 11/23] update paths and address suggestions

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 R/ipr2viz.R                                  |  2 +-
 inst/report/scripts/MolEvolData_class.R      | 31 --------------------
 inst/report/scripts/generate_report.R        |  6 ++--
 inst/report/scripts/run_molevolvr_pipeline.R | 30 ++++++++++---------
 4 files changed, 20 insertions(+), 49 deletions(-)

diff --git a/R/ipr2viz.R b/R/ipr2viz.R
index a4769ed6..e5fed49f 100644
--- a/R/ipr2viz.R
+++ b/R/ipr2viz.R
@@ -212,7 +212,7 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
     analysis_labeler <- analyses %>%
         pivot_wider(names_from = .data$Analysis, values_from = .data$Analysis)
 
-    lookup_tbl_path <- "~/awasyn/new_trial/cln_lookup_tbl.tsv"
+    system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE)
     lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = MolEvolvR::lookup_table_cols)
 
     lookup_tbl <- lookup_tbl %>% select(-.data$ShortName) # Already has ShortName -- Just needs SignDesc
diff --git a/inst/report/scripts/MolEvolData_class.R b/inst/report/scripts/MolEvolData_class.R
index 791f1d6a..f4998ee9 100644
--- a/inst/report/scripts/MolEvolData_class.R
+++ b/inst/report/scripts/MolEvolData_class.R
@@ -41,37 +41,6 @@ setClass("seqUpload",
          )
 )
 
-# Group by lineage + DA then take top 20
-top_acc <- function(cln_file, DA_col = "DomArch.Pfam",
-                    lin_col = "Lineage", n = 20) {
-    lin_sym <- sym(lin_col)
-    DA_sym <- sym(DA_col)
-
-    cln <- fread(cln_file, sep = "\t", fill = T)
-
-    grouped <- cln %>%
-        group_by({{ lin_sym }}, {{ DA_sym }}) %>%
-        summarise(count = n()) %>%
-        arrange(-count) %>%
-        filter(!is.na({{ lin_sym }}) & !is.na({{ DA_sym }}))
-
-    top_acc <- character(n)
-    for (r in 1:min(nrow(grouped), n))
-    {
-        l <- (grouped %>% pull({{ lin_sym }}))[r]
-        DA <- (grouped %>% pull({{ DA_sym }}))[r]
-
-        filt <- cln %>% filter({{ lin_sym }} == l & {{ DA_sym }} == DA)
-
-        top <- filt[which(filt$PcPositive == max(filt$PcPositive))[1], ]
-
-        top_acc[r] <- top$AccNum
-    }
-    top_acc <- top_acc[which(top_acc != "")]
-    return(top_acc)
-}
-
-
 combineFilesNopmap <- function(inpath, pattern, outpath,
                                  delim = "\t", skip = 0,
                                  col_names) {
diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R
index 4ad5b076..e7ab37b3 100644
--- a/inst/report/scripts/generate_report.R
+++ b/inst/report/scripts/generate_report.R
@@ -303,7 +303,7 @@ runAnalysis <- function(
 
     # After uploading the sequence data, you would check the uploaded data
     if (sequence_upload_data@seqs == "") {
-        stop("Error: Please upload a protein sequence")
+        stop("Error: Please upload a protein sequence.")
     }
     OUT_PATH <- getwd()
     unavailable_pins <- list.files(OUT_PATH)
@@ -339,7 +339,7 @@ runAnalysis <- function(
                        validateFasta(tmp_file)
                    },
                    error = function(e) {
-                       warning("Error: Failed to run input FASTA verification")
+                       warning("Error: Failed to run input FASTA verification.")
                        return(FALSE)  # Return FALSE if an error occurs
                    },
                    finally = {
@@ -526,7 +526,7 @@ runAnalysis <- function(
         }
         if (ipr_upload_data()@seqs == "" && !ipr_ncbi_check) {
             stop("Error: Please provide a file containing sequences or check the
-                 box to use fetch sequences for NCBI accession numbers.")
+                box to fetch sequences using NCBI accession numbers.")
         }
 
         ipr <- read_tsv(ipr_upload_data()@df, col_names = FALSE)
diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R
index ea52a3b5..7cf1ebe6 100644
--- a/inst/report/scripts/run_molevolvr_pipeline.R
+++ b/inst/report/scripts/run_molevolvr_pipeline.R
@@ -187,7 +187,7 @@ runMolevolvrPipeline <- function(input_paths, db, nhits, eval,
     # setwd(OUTDIR)
 
     # Run DELTABLAST
-    runDeltablast(input_paths, PREFIX, OUTDIR, db, nhits, eval)
+    runDELTABLAST(input_paths, PREFIX, OUTDIR, db, nhits, eval)
 
     # Run ACC2FA
     convertAccNum2Fasta(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")),
@@ -207,7 +207,7 @@ runMolevolvrPipeline <- function(input_paths, db, nhits, eval,
   # Sys.sleep(30)
 
   # Run BLASTCLUST
-  runBlastclust(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")),
+  runCDHIT(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")),
                  PREFIX, OUTDIR )
 
   # Convert clusters to table
@@ -541,7 +541,7 @@ replaceAccNums <- function(path_acc2info,
 }
 
 
-runDeltablast <- function(infile, prefix, outdir,
+runDELTABLAST <- function(infile, prefix, outdir,
                            db = "refseq_protein",
                            nhits = 5000, evalue = 1e-5,
                            threads = 10) {
@@ -700,9 +700,9 @@ cleanupBlast <- function(infile_blast, acc2info, prefix, wblast = F) {
 
   # TaxID to lineage mapping
   cleanedup_blast$TaxID <- as.integer(cleanedup_blast$TaxID)
-  lineage_map <- fread("~/awasyn/new_trial/lineage_lookup.txt",
-                       header = TRUE, fill = TRUE,
-                       colClasses = lineage_map_cols)
+  lineage_map <- fread(
+    system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE),
+    header = TRUE, fill = TRUE, colClasses = lineage_map_cols)
 
   # Merge with lineage map and clean up columns
   mergedLins <- merge(cleanedup_blast, lineage_map, by = "TaxID",
@@ -722,7 +722,7 @@ cleanupBlast <- function(infile_blast, acc2info, prefix, wblast = F) {
 
 
 # Function to run BLASTCLUST on given input
-runBlastclust <- function(infile, suffix, outdir) {
+runCDHIT <- function(infile, suffix, outdir) {
 
   # Prepare output file path
   outfile <- file.path(outdir, paste0(suffix, ".bclust.L60S80.tsv"))
@@ -734,13 +734,13 @@ runBlastclust <- function(infile, suffix, outdir) {
   cat("## Now running BLASTCLUST on file(s):", infile, "\n")
   cat("#####################################\n")
 
-  # Run BLASTCLUST
+  # Cluster sequences w/ BLASTCLUST/CD-HIT
   # blastclust_cmd <- paste("blastclust -i", infile, "-o", outfile, "-p T -L .6 -b T -S 80 -a 8")
   cdhit_command <- sprintf(
     "cd-hit -i %s -o %s -c 0.8 -aS 0.6 -T 8",
     infile, outfile
   )
-  clean_command <- sprintf(
+  clean_cdhit_format <- sprintf(
     "awk '/^>Cluster/ {if(NR>1)printf \"\\n\"; next} /WP_/ {start=index($0, \"WP_\"); if(start) {end=index(substr($0, start), \"...\"); if (end == 0) end=length($0); printf \"%%s \", substr($0, start, end-1)}}' %s > %s",
     input_file,
     outfile
@@ -750,7 +750,7 @@ runBlastclust <- function(infile, suffix, outdir) {
   # system(blastclust_cmd)
 
   system(cdhit_command)
-  system(clean_command)
+  system(clean_cdhit_format)
 
 }
 
@@ -881,8 +881,9 @@ ipr2Linear <- function(ipr, acc2info, prefix) {
   ipr_tax <- left_join(ipr_in, acc2info_out, by = "AccNum")
 
   # read in lineage map
-  lineage_map <- fread("~/awasyn/new_trial/lineage_lookup.txt",
-                       header = T, fill = T)
+  lineage_map <- fread(
+    system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE),
+    header = TRUE, fill = TRUE)
 
   # merge ipr+info w/ lineage
   # both tables have a species column, but only
@@ -895,8 +896,9 @@ ipr2Linear <- function(ipr, acc2info, prefix) {
     select(-Species.x, -Species.y)
 
   # add lookup table to iprscan file
-  lookup_tbl <- fread(input = "~/awasyn/new_trial/cln_lookup_tbl.tsv",
-                      sep = "\t", header = T, fill = T) %>%
+  lookup_tbl <- fread(
+    system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE),
+                      sep = "\t", header = TRUE, fill = TRUE) %>%
     distinct()
   if ("AccNum.x" %in% names(ipr_lin)) {
     ipr_lin <- ipr_lin %>%

From 16e957159ff950149a2e6ae3621c15ebd67ec0cc Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Wed, 20 Nov 2024 05:26:55 +0200
Subject: [PATCH 12/23] add generate case study function

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 .gitignore                                   |  3 +-
 inst/report/report_template.Rmd              | 10 +++++-
 inst/report/scripts/generate_report.R        | 34 ++++++++++++++++++--
 inst/report/scripts/run_molevolvr_pipeline.R |  7 +++-
 4 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/.gitignore b/.gitignore
index ef11006e..74dbc757 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .Rproj.user
 docs
+common_data
 .Rhistory
-.DS_Store
\ No newline at end of file
+.DS_Store
diff --git a/inst/report/report_template.Rmd b/inst/report/report_template.Rmd
index dddf357c..195e7887 100644
--- a/inst/report/report_template.Rmd
+++ b/inst/report/report_template.Rmd
@@ -472,7 +472,15 @@ if(length(params$repAccNums) >= 3 ) {
 
   # Call MSA2PDF
   msa_pdf_path <- tempfile()
-  msa_prefix <- "~/awasyn/MolEvolvRsHzFyk_full/msa_figs/"
+  msa_parent_dir <- dirname(params$app_data@fasta_path)
+  msa_subdir_path <- file.path(parent_dir, "msa_figs")
+  if (!dir.exists(subdir_path)) {
+    dir.create(subdir_path, recursive = TRUE)
+    message("Subdirectory 'msa_figs' created at: ", subdir_path)
+  } else {
+    message("Subdirectory 'msa_figs' already exists at: ", subdir_path)
+  }
+  msa_prefix <- paste0(msa_subdir_path, "/")
   post_fix <- paste("msa", params$query_pin, params$PhyloSelect, params$msa_reduce_by, ".pdf", sep = "_")
 
   msa_pdf_path <- paste0(msa_prefix, post_fix)
diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R
index e7ab37b3..6f7fcaef 100644
--- a/inst/report/scripts/generate_report.R
+++ b/inst/report/scripts/generate_report.R
@@ -1,6 +1,36 @@
 # Author(s): Awa Synthia
 # Last modified: 2024
 
+getCaseStudyReport <- function(pathogen = NULL, drug = NULL, ...) {
+  cat("\n")
+  cat("\033[1;36m")  # Cyan for the "MolEvolvR" header
+  cat("=========================================\n")
+  cat("            M o l E v o l R\n")
+  cat("=========================================\n")
+  cat("\033[0m")
+
+  cat("\033[1;32m")
+  cat("\n>>>>> Case Study Report Generated <<<<<\n")
+  cat("\033[0m")
+
+  # Step 1: Fetch the FASTA sequences of the given pathogen and drug
+  getCardData(pathogen, drug)
+
+  # Check if the FASTA file exists after fetching
+  fasta_file <- "filtered_proteins.fasta"
+  if (!file.exists(fasta_file)) {
+    stop("Failed to retrieve FASTA sequences. The file
+         'filtered_proteins.fasta' does not exist.")
+  }
+  # Step 2: Once the FASTA file is ready, call runAnalysis to run pipeline
+  message("FASTA file downloaded, running analysis...")
+
+  # Pass the file to runAnalysis function
+  runAnalysis(file_paths = list(fasta = fasta_file))
+
+  message("Case study report completed")
+}
+
 # get fasta of pathogen and/or drug
 getCardData <- function(pathogen = NULL, drug = NULL) {
   destination_dir <- "CARD_data"
@@ -84,8 +114,8 @@ runAnalysis <- function(
         acc_homology_analysis = TRUE,
         acc_da_analysis = TRUE,
         acc_phylogeny_analysis = FALSE,
-        report_template_path = "/report/report_template.Rmd",
-        output_file = file.path(tempdir(), "report.html"),
+        report_template_path = system.file("report", "report_template.Rmd", package = "MolEvolvR", mustWork = TRUE),
+        output_file = file.path(getwd(), "report.html"),
         DASelect = "All",
         mainSelect = NULL,
         PhyloSelect = "All",
diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R
index 7cf1ebe6..c9c0deeb 100644
--- a/inst/report/scripts/run_molevolvr_pipeline.R
+++ b/inst/report/scripts/run_molevolvr_pipeline.R
@@ -846,8 +846,13 @@ runIPRScan <- function(query_file, prefix, outdir) {
 
   # Run InterProScan command
   # Construct the command
+
+  # get the path to the interproscan.sh script from the environment
+  # variable INTERPROSCAN_CMD, or assume it's on the path if unspecified
+  iprscan_cmd <- Sys.getenv("INTERPROSCAN_CMD", unset="interproscan.sh")
+
   command <- paste(
-    "~/iprdir/interproscan-5.70-102.0/interproscan.sh -i",
+    iprscan_cmd, "-i",
     shQuote(query_file),
     "-b", shQuote(outfile),
     "-f TSV --cpu", Sys.getenv("INTERPROSCAN_CPUS", "4"),

From 83ccfb751ad8f71663d637c6d3e82576feea09d6 Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Wed, 20 Nov 2024 05:54:31 +0200
Subject: [PATCH 13/23] update generate function

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 inst/report/scripts/generate_report.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R
index 6f7fcaef..06128453 100644
--- a/inst/report/scripts/generate_report.R
+++ b/inst/report/scripts/generate_report.R
@@ -10,7 +10,7 @@ getCaseStudyReport <- function(pathogen = NULL, drug = NULL, ...) {
   cat("\033[0m")
 
   cat("\033[1;32m")
-  cat("\n>>>>> Case Study Report Generated <<<<<\n")
+  cat("\n>>>>> Case Study Report Generation started... <<<<<\n")
   cat("\033[0m")
 
   # Step 1: Fetch the FASTA sequences of the given pathogen and drug
@@ -26,7 +26,7 @@ getCaseStudyReport <- function(pathogen = NULL, drug = NULL, ...) {
   message("FASTA file downloaded, running analysis...")
 
   # Pass the file to runAnalysis function
-  runAnalysis(file_paths = list(fasta = fasta_file))
+  runAnalysis(file_paths = list(fasta = fasta_file), ...)
 
   message("Case study report completed")
 }

From 4bf5b00e32802c1a42997d510789a590af0ba174 Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Wed, 20 Nov 2024 19:14:00 +0200
Subject: [PATCH 14/23] add getCaseStudyReport function

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 R/ipr2viz.R                                  |  2 +-
 inst/report/scripts/generate_report.R        | 18 +++++---
 inst/report/scripts/run_molevolvr_pipeline.R | 43 ++++++++++++++++----
 inst/report/scripts/viz_utils.R              | 34 +++++++++-------
 4 files changed, 69 insertions(+), 28 deletions(-)

diff --git a/R/ipr2viz.R b/R/ipr2viz.R
index e5fed49f..fb145c67 100644
--- a/R/ipr2viz.R
+++ b/R/ipr2viz.R
@@ -212,7 +212,7 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
     analysis_labeler <- analyses %>%
         pivot_wider(names_from = .data$Analysis, values_from = .data$Analysis)
 
-    system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE)
+    lookup_tbl_path <- system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE)
     lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = MolEvolvR::lookup_table_cols)
 
     lookup_tbl <- lookup_tbl %>% select(-.data$ShortName) # Already has ShortName -- Just needs SignDesc
diff --git a/inst/report/scripts/generate_report.R b/inst/report/scripts/generate_report.R
index 06128453..d1dd178b 100644
--- a/inst/report/scripts/generate_report.R
+++ b/inst/report/scripts/generate_report.R
@@ -1,6 +1,12 @@
 # Author(s): Awa Synthia
 # Last modified: 2024
 
+# load functions to use
+source("MolEvolData_class.R")
+source("run_molevolvr_pipeline.R")
+source("viz_utils.R")
+
+# example usage: getCaseStudyReport("Acinetobacter baumannii", "Beta-lactams")
 getCaseStudyReport <- function(pathogen = NULL, drug = NULL, ...) {
   cat("\n")
   cat("\033[1;36m")  # Cyan for the "MolEvolvR" header
@@ -616,7 +622,7 @@ runAnalysis <- function(
 
     domarch_cols_value <- getDomArchCols(app_data, DASelect)
 
-    query_domarch_cols_value <- getDomArchCols(query_data@df)
+    query_domarch_cols_value <- getQueryDomArchCols(query_data@df)
 
     mainTable_value <- getDataTable(data)
 
@@ -630,8 +636,8 @@ runAnalysis <- function(
 
     rs_IprGenes_value <- getIPRGenesVisualization(data,
                                                           app_data,
-                                                          input_rs_iprDatabases,
-                                                          input_rs_iprVisType)
+                                                          query_iprDatabases,
+                                                          query_iprVisType)
 
     rs_network_layout_value <- getRSNetworkLayout(data,
                                                           app_data,
@@ -641,9 +647,9 @@ runAnalysis <- function(
     rs_data_table_value <- getDataTable(data)
 
     da_IprGenes_value <- getDomArchIPRGenesPlot(app_data,
-                                                    da_iprDatabases,
-                                                    da_iprVisType,
-                                                    DASelect)
+                                                query_iprDatabases,
+                                                query_iprVisType,
+                                                DASelect)
 
     query_heatmap_value <- getQueryHeatmap(query_data@df,
                                                   heatmap_select = "All",
diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/inst/report/scripts/run_molevolvr_pipeline.R
index c9c0deeb..94324599 100644
--- a/inst/report/scripts/run_molevolvr_pipeline.R
+++ b/inst/report/scripts/run_molevolvr_pipeline.R
@@ -740,18 +740,47 @@ runCDHIT <- function(infile, suffix, outdir) {
     "cd-hit -i %s -o %s -c 0.8 -aS 0.6 -T 8",
     infile, outfile
   )
-  clean_cdhit_format <- sprintf(
-    "awk '/^>Cluster/ {if(NR>1)printf \"\\n\"; next} /WP_/ {start=index($0, \"WP_\"); if(start) {end=index(substr($0, start), \"...\"); if (end == 0) end=length($0); printf \"%%s \", substr($0, start, end-1)}}' %s > %s",
-    input_file,
-    outfile
-  )
+  # clean_cdhit_format <- sprintf(
+  #   "awk '/^>Cluster/ {if(NR>1)printf \"\\n\"; next} /WP_/ {start=index($0, \"WP_\"); if(start) {end=index(substr($0, start), \"...\"); if (end == 0) end=length($0); printf \"%%s \", substr($0, start, end-1)}}' %s > %s",
+  #   input_file,
+  #   outfile
+  # )
 
   cat("\nPerforming BLASTCLUST analysis on", infile, "\n")
   # system(blastclust_cmd)
 
   system(cdhit_command)
-  system(clean_cdhit_format)
+  # system(clean_cdhit_format)
+  cleanCDHIT(input_file, outfile)
+  cat("Cleaning CDHIT results completed")
+}
 
+# extract_sequences("input_file.tsv.clstr", "output_file.tsv")
+cleanCDHIT <- function(input_file, output_file) {
+  # Read the input file line by line
+  lines <- readLines(input_file)
+
+  # Initialize an empty vector to store extracted identifiers
+  extracted <- c()
+
+  # Loop through lines and extract content between ">" and "..."
+  for (line in lines) {
+    # Only process lines containing valid sequences (skip cluster headers)
+    if (grepl(">", line) && grepl("\\.\\.\\.", line)) {
+      # Extract the sequence identifier between ">" and "..."
+      match <- regmatches(line, regexpr(">(.*?)\\.\\.\\.", line))
+      if (length(match) > 0) {
+        sequence <- gsub("^>", "", match)  # Remove the leading ">"
+        sequence <- gsub("\\.\\.\\.$", "", sequence)  # Remove trailing dots
+        extracted <- c(extracted, sequence)
+      }
+    }
+  }
+
+  # Write the extracted sequences to the output file
+  writeLines(extracted, output_file)
+
+  cat("Sequences extracted and written to:", output_file, "\n")
 }
 
 # Function to format blastclust output
@@ -902,7 +931,7 @@ ipr2Linear <- function(ipr, acc2info, prefix) {
 
   # add lookup table to iprscan file
   lookup_tbl <- fread(
-    system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE),
+    system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE),
                       sep = "\t", header = TRUE, fill = TRUE) %>%
     distinct()
   if ("AccNum.x" %in% names(ipr_lin)) {
diff --git a/inst/report/scripts/viz_utils.R b/inst/report/scripts/viz_utils.R
index 573e0739..aa62e126 100644
--- a/inst/report/scripts/viz_utils.R
+++ b/inst/report/scripts/viz_utils.R
@@ -10,8 +10,8 @@ library(plotly)
 
 # Function to generate the InterProScan Visualization
 getIPRGenesVisualization <- function(data, app_data,
-                                             input_rs_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"),
-                                             input_rs_iprVisType = "Analysis") {
+                                    query_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"),
+                                    query_iprVisType = "Analysis") {
 
     # Check if analysis is loaded
     if (nrow(data@df) == 0 || app_data@ipr_path == "") {
@@ -33,8 +33,8 @@ getIPRGenesVisualization <- function(data, app_data,
         ipr_plot <- plotIPR2VizWeb(
             infile_ipr = data@ipr_path,
             accessions = data@df$Name,
-            analysis = input_rs_iprDatabases,
-            group_by = input_rs_iprVisType,
+            analysis = query_iprDatabases,
+            group_by = query_iprVisType,
             name = n
         )
 
@@ -45,8 +45,8 @@ getIPRGenesVisualization <- function(data, app_data,
             infile_ipr = data@ipr_path,
             infile_full = data@df,
             accessions = unique(data@df$Name),
-            analysis = input_rs_iprDatabases,
-            group_by = input_rs_iprVisType,
+            analysis = query_iprDatabases,
+            group_by = query_iprVisType,
             topn = 20,   # This value is hardcoded in the original code
             query = "All"
         )
@@ -269,7 +269,13 @@ getMSAData <- function(msa_path) {
     if (is.null(msa_path) || msa_path == "") {
         stop("Error: MSA path is not provided.")
     }
-    return(read_file(msa_path))
+    # Attempt to read the file and handle potential errors
+    if (file.exists(msa_path)) {
+        return(read_file(msa_path))
+    } else {
+        warning(sprintf("Warning: Unable to read the file at path '%s'. Ignoring...", msa_path))
+        return(NULL)  # Return NULL if the file cannot be read
+    }
 }
 
 # Function to generate a heatmap
@@ -307,7 +313,7 @@ getQueryHeatmap <- function(query_data_df,
 }
 
 # Function to retrieve domain architecture columns
-getDomArchCols <- function(query_data_df) {
+getQueryDomArchCols <- function(query_data_df) {
     # Check if query data exists
     if (nrow(query_data_df) == 0) {
         stop("No query data available.")
@@ -712,8 +718,8 @@ getDomArchCols <- function(app_data, DASelect) {
 }
 
 # Function to generate the IPR genes plot
-getDomArchIPRGenesPlot <- function(app_data, da_iprDatabases,
-                                       da_iprVisType, DASelect) {
+getDomArchIPRGenesPlot <- function(app_data, query_iprDatabases,
+                                   query_iprVisType, DASelect) {
 
     if (app_data@ipr_path == "") {
         stop("IPR path is not set.")
@@ -739,8 +745,8 @@ getDomArchIPRGenesPlot <- function(app_data, da_iprDatabases,
         plot <- plotIPR2VizWeb(
             infile_ipr = app_data@ipr_path,
             accessions = df$Name,
-            analysis = da_iprDatabases,
-            group_by = da_iprVisType,
+            analysis = query_iprDatabases,
+            group_by = query_iprVisType,
             name = name_column
         )
     } else {
@@ -749,8 +755,8 @@ getDomArchIPRGenesPlot <- function(app_data, da_iprDatabases,
             infile_ipr = app_data@ipr_path,
             infile_full = df,
             accessions = unique(df$Name),
-            analysis = da_iprDatabases,
-            group_by = da_iprVisType,
+            analysis = query_iprDatabases,
+            group_by = query_iprVisType,
             topn = 20,
             query = DASelect
         )

From e4af641bf998987e23a27b97e046cdf19fdaf91f Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Thu, 21 Nov 2024 00:58:02 +0200
Subject: [PATCH 15/23] move functions to R/

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 NAMESPACE                                              |  1 +
 {inst/report/scripts => R}/MolEvolData_class.R         |  8 ++++----
 {inst/report/scripts => R}/generate_report.R           |  6 ++----
 .../run_molevolvr_pipeline.R => R/run_pipeline.R       |  4 ++--
 {inst/report/scripts => R}/viz_utils.R                 |  2 +-
 inst/report/report_template.Rmd                        | 10 +++++-----
 6 files changed, 15 insertions(+), 16 deletions(-)
 rename {inst/report/scripts => R}/MolEvolData_class.R (99%)
 rename {inst/report/scripts => R}/generate_report.R (99%)
 rename inst/report/scripts/run_molevolvr_pipeline.R => R/run_pipeline.R (99%)
 rename {inst/report/scripts => R}/viz_utils.R (99%)

diff --git a/NAMESPACE b/NAMESPACE
index 6ae464f6..1a2f3a99 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -49,6 +49,7 @@ export(findParalogs)
 export(formatJobArgumentsHTML)
 export(generateAllAlignments2FA)
 export(getAccNumFromFA)
+export(getCaseStudyReport)
 export(getProcessRuntimeWeights)
 export(getTopAccByLinDomArch)
 export(mapAcc2Name)
diff --git a/inst/report/scripts/MolEvolData_class.R b/R/MolEvolData_class.R
similarity index 99%
rename from inst/report/scripts/MolEvolData_class.R
rename to R/MolEvolData_class.R
index f4998ee9..bfdd846c 100644
--- a/inst/report/scripts/MolEvolData_class.R
+++ b/R/MolEvolData_class.R
@@ -415,7 +415,7 @@ validateAccNumFasta <- function(text) {
 }
 
 
-#' Test whether a single accession returns a valid protein from Entrez
+# Test whether a single accession returns a valid protein from Entrez
 isAccNumValidEntrez <- function(accnum, verbose = FALSE) {
     # empty accnum wil not raise an error from efetch, so test for this first
     if (nchar(accnum) <= 0) {if (verbose) {warning("empty accnum")}; return(FALSE)}
@@ -435,7 +435,7 @@ isAccNumValidEntrez <- function(accnum, verbose = FALSE) {
     return(result)
 }
 
-#' Test whether a single accession returns a valid protein from EBI
+# Test whether a single accession returns a valid protein from EBI
 isAccNumValidEbi <- function(accnum, verbose = FALSE) {
     # validation: ensure there's some text to parse
     if (nchar(accnum) <= 0) {if (verbose) {warning("empty accnum")}; return(FALSE)}
@@ -460,7 +460,7 @@ isAccNumValidEbi <- function(accnum, verbose = FALSE) {
     return(result)
 }
 
-#' Perform a series of API reqs using NCBI entrez to validate accession numbers
+# Perform a series of API reqs using NCBI entrez to validate accession numbers
 performEntrezReqs <- function(accnums, verbose = FALSE, track_progress = FALSE) {
     # API guidelines docs
     # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html
@@ -501,7 +501,7 @@ performEbiReqs <- function(accnums, verbose = FALSE, track_progress = FALSE) {
     return(results)
 }
 
-#' Validate accession numbers from MolEvolvR user input
+# Validate accession numbers from MolEvolvR user input
 validateAccNum <- function(text, verbose = FALSE, track_progress = FALSE, n_steps = integer()) {
     # API guidelines docs
     # entrez https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Usage_Guidelines_and_Requiremen
diff --git a/inst/report/scripts/generate_report.R b/R/generate_report.R
similarity index 99%
rename from inst/report/scripts/generate_report.R
rename to R/generate_report.R
index d1dd178b..98377448 100644
--- a/inst/report/scripts/generate_report.R
+++ b/R/generate_report.R
@@ -2,10 +2,8 @@
 # Last modified: 2024
 
 # load functions to use
-source("MolEvolData_class.R")
-source("run_molevolvr_pipeline.R")
-source("viz_utils.R")
 
+#' @export
 # example usage: getCaseStudyReport("Acinetobacter baumannii", "Beta-lactams")
 getCaseStudyReport <- function(pathogen = NULL, drug = NULL, ...) {
   cat("\n")
@@ -687,7 +685,7 @@ runAnalysis <- function(
     ####### Report Generation ########
 
     tryCatch({
-        tempReport <- file.path(tempdir(), "report.Rmd")
+        tempReport <- file.path(dir, "report.Rmd")
         file.copy(report_template_path, tempReport, overwrite = TRUE)
 
         # List of graphics to include in report
diff --git a/inst/report/scripts/run_molevolvr_pipeline.R b/R/run_pipeline.R
similarity index 99%
rename from inst/report/scripts/run_molevolvr_pipeline.R
rename to R/run_pipeline.R
index 94324599..ffcc4f52 100644
--- a/inst/report/scripts/run_molevolvr_pipeline.R
+++ b/R/run_pipeline.R
@@ -215,7 +215,7 @@ runMolevolvrPipeline <- function(input_paths, db, nhits, eval,
             file.path(OUTDIR, paste0(PREFIX, ".blast.cln.tsv")))
 
   # Run INTERPROSCAN
-  runIPRScan(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")),
+  runIPRScan2(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")),
                    PREFIX, OUTDIR)
   new_header <- c("AccNum", "SeqMD5Digest", "SLength", "Analysis", "DB.ID",
                   "SignDesc", "StartLoc", "StopLoc", "Score",
@@ -859,7 +859,7 @@ clust2Table <- function(clust, blast) {
 }
 
 # Function to run InterProScan
-runIPRScan <- function(query_file, prefix, outdir) {
+runIPRScan2 <- function(query_file, prefix, outdir) {
 
   # Start InterProScan run
   cat("\n######################\n")
diff --git a/inst/report/scripts/viz_utils.R b/R/viz_utils.R
similarity index 99%
rename from inst/report/scripts/viz_utils.R
rename to R/viz_utils.R
index aa62e126..590f987e 100644
--- a/inst/report/scripts/viz_utils.R
+++ b/R/viz_utils.R
@@ -732,7 +732,7 @@ getDomArchIPRGenesPlot <- function(app_data, query_iprDatabases,
              your data correctly.")
     }
 
-    if (is.null(da_iprDatabases) || length(da_iprDatabases) == 0) {
+    if (is.null(query_iprDatabases) || length(query_iprVisType) == 0) {
         stop("Please select an analysis.")
     }
 
diff --git a/inst/report/report_template.Rmd b/inst/report/report_template.Rmd
index 195e7887..9d616e9e 100644
--- a/inst/report/report_template.Rmd
+++ b/inst/report/report_template.Rmd
@@ -473,12 +473,12 @@ if(length(params$repAccNums) >= 3 ) {
   # Call MSA2PDF
   msa_pdf_path <- tempfile()
   msa_parent_dir <- dirname(params$app_data@fasta_path)
-  msa_subdir_path <- file.path(parent_dir, "msa_figs")
-  if (!dir.exists(subdir_path)) {
-    dir.create(subdir_path, recursive = TRUE)
-    message("Subdirectory 'msa_figs' created at: ", subdir_path)
+  msa_subdir_path <- file.path(msa_parent_dir, "msa_figs")
+  if (!dir.exists(msa_subdir_path)) {
+    dir.create(msa_subdir_path, recursive = TRUE)
+    message("Subdirectory 'msa_figs' created at: ", msa_subdir_path)
   } else {
-    message("Subdirectory 'msa_figs' already exists at: ", subdir_path)
+    message("Subdirectory 'msa_figs' already exists at: ", msa_subdir_path)
   }
   msa_prefix <- paste0(msa_subdir_path, "/")
   post_fix <- paste("msa", params$query_pin, params$PhyloSelect, params$msa_reduce_by, ".pdf", sep = "_")

From 7b50cbf60345736930a8bfb754d0122422647f9a Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Thu, 21 Nov 2024 23:12:45 +0200
Subject: [PATCH 16/23] add libraries to description

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 DESCRIPTION           |  3 +++
 R/MolEvolData_class.R |  3 ---
 R/combine_files.R     | 16 ++++++++--------
 R/run_pipeline.R      | 14 --------------
 R/viz_utils.R         |  7 -------
 man/combineFiles.Rd   |  2 +-
 6 files changed, 12 insertions(+), 33 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 0fa9a949..fd0ba89a 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -40,6 +40,7 @@ Imports:
     DBI,
     doParallel,
     dplyr,
+    DT,
     forcats,
     fs,
     furrr,
@@ -51,6 +52,7 @@ Imports:
     here,
     htmltools,
     htmlwidgets,
+    httr,
     igraph,
     latexpdf,
     magrittr,
@@ -73,6 +75,7 @@ Imports:
     tibble,
     tidyr,
     tinytex,
+    tidyverse,
     tools,
     UpSetR,
     viridis,
diff --git a/R/MolEvolData_class.R b/R/MolEvolData_class.R
index bfdd846c..427bd85b 100644
--- a/R/MolEvolData_class.R
+++ b/R/MolEvolData_class.R
@@ -584,9 +584,6 @@ validateEvalue <- function(input_value) {
 # Author(s): JK
 # Last modified: 2023_06
 #===============================================================================
-
-library("Biostrings")
-library("tidyverse")
 #-------------------------------------------------------------------------------
 guessSeqType <- function(single_fasta, dna_guess_cutoff = 0.9, other_guess_cutoff = 0.5) {
     tb <- as_tibble(alphabetFrequency(single_fasta))
diff --git a/R/combine_files.R b/R/combine_files.R
index 4f03b1d2..1c5ac6a2 100755
--- a/R/combine_files.R
+++ b/R/combine_files.R
@@ -24,22 +24,22 @@
 #'
 #' @author Janani Ravi
 #'
-#' @param inpath Character. The master directory path where the files reside. 
+#' @param inpath Character. The master directory path where the files reside.
 #' The search is recursive (i.e., it will look in subdirectories as well).
-#' @param pattern Character. A search pattern to identify files to be combined. 
+#' @param pattern Character. A search pattern to identify files to be combined.
 #' Default is "*full_analysis.tsv".
-#' @param delim Character. The delimiter used in the input files. 
-#' Default is tab ("\t").
-#' @param skip Integer. The number of lines to skip at the beginning of each file. 
+#' @param delim Character. The delimiter used in the input files.
+#' Default is tab \code{"\t"}.
+#' @param skip Integer. The number of lines to skip at the beginning of each file.
 #' Default is 0.
-#' @param col_names Logical or character vector. If TRUE, the first row of each file 
-#' is treated as column names. Alternatively, a character vector can 
+#' @param col_names Logical or character vector. If TRUE, the first row of each file
+#' is treated as column names. Alternatively, a character vector can
 #' be provided to specify custom column names.
 #'
 #' @importFrom purrr pmap_dfr
 #' @importFrom readr cols
 #'
-#' @return A data frame containing the combined contents of all matched files. 
+#' @return A data frame containing the combined contents of all matched files.
 #' Each row will include a new column "ByFile" indicating the source file of the data.
 #'
 #' @export
diff --git a/R/run_pipeline.R b/R/run_pipeline.R
index ffcc4f52..c49b5f06 100644
--- a/R/run_pipeline.R
+++ b/R/run_pipeline.R
@@ -1,12 +1,6 @@
 # Author(s): Awa Synthia
 # Last modified: 2024
 
-# Load necessary libraries
-library(httr)
-library(data.table)
-library(readr)
-library(rentrez)
-
 getSeqs <- function(sequences,
                           acc_file_path = "accs.txt",
                           dir_path = "~",
@@ -320,7 +314,6 @@ acc2info <- function(infile, prefix, outdir) {
     }
   }
 
-
   if (any_atomic) {
     parsed_data <- data.frame(
       AccNum = docsums$oslt$value,
@@ -520,7 +513,6 @@ subsAccnum4cc2Info <- function(df_acc2info, df_header_map) {
   return(df_result)
 }
 
-
 replaceAccNums <- function(path_acc2info,
                                       path_query_header_map, path_out) {
 
@@ -540,7 +532,6 @@ replaceAccNums <- function(path_acc2info,
   write_tsv(df_acc2info_substituted, file = path_out, col_names = TRUE)
 }
 
-
 runDELTABLAST <- function(infile, prefix, outdir,
                            db = "refseq_protein",
                            nhits = 5000, evalue = 1e-5,
@@ -576,9 +567,7 @@ runDELTABLAST <- function(infile, prefix, outdir,
   cat("DELTABLAST completed.\n")
 }
 
-
 # This script converts AccNum to Fasta using NCBI's EDirect or EBI's API
-
 convertAccNum2Fasta <- function(infile, prefix, outdir) {
 
   # Create the output file path
@@ -647,7 +636,6 @@ convertAccNum2Fasta <- function(infile, prefix, outdir) {
   cat("#####################\n")
 }
 
-
 cleanupBlast <- function(infile_blast, acc2info, prefix, wblast = F) {
 
   outdir <- dirname(infile_blast)
@@ -720,7 +708,6 @@ cleanupBlast <- function(infile_blast, acc2info, prefix, wblast = F) {
   write_tsv(blast_names, file_name, col_names = TRUE)
 }
 
-
 # Function to run BLASTCLUST on given input
 runCDHIT <- function(infile, suffix, outdir) {
 
@@ -1090,7 +1077,6 @@ web_blast_colnames <- c("Query", "AccNum",
                         "EValue", "BitScore", "PcPosOrig",
                         "QSFrames") # specific to "blastx"
 
-
 # BLAST Command line
 cl_blast_colnames <- c("Query", "SAccNum", "AccNum",
                        "SAllSeqID", "STitle", "Species", "TaxID",
diff --git a/R/viz_utils.R b/R/viz_utils.R
index 590f987e..5549ff97 100644
--- a/R/viz_utils.R
+++ b/R/viz_utils.R
@@ -1,13 +1,6 @@
 # Author(s): Awa Synthia
 # Last modified: 2024
 
-# Load necessary packages
-library(dplyr)
-library(stringr)
-library(visNetwork)
-library(DT)
-library(plotly)
-
 # Function to generate the InterProScan Visualization
 getIPRGenesVisualization <- function(data, app_data,
                                     query_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"),
diff --git a/man/combineFiles.Rd b/man/combineFiles.Rd
index 81464fa6..afbc6c5f 100644
--- a/man/combineFiles.Rd
+++ b/man/combineFiles.Rd
@@ -20,7 +20,7 @@ The search is recursive (i.e., it will look in subdirectories as well).}
 Default is "*full_analysis.tsv".}
 
 \item{delim}{Character. The delimiter used in the input files.
-Default is tab ("\t").}
+Default is tab \code{"\t"}.}
 
 \item{skip}{Integer. The number of lines to skip at the beginning of each file.
 Default is 0.}

From beb01c2699f4d6d71083e2aa591cb8862ec1c594 Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Thu, 21 Nov 2024 23:43:51 +0200
Subject: [PATCH 17/23] short fix

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 DESCRIPTION      | 1 +
 R/run_pipeline.R | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/DESCRIPTION b/DESCRIPTION
index fd0ba89a..e2ff7eca 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -53,6 +53,7 @@ Imports:
     htmltools,
     htmlwidgets,
     httr,
+    httr2,
     igraph,
     latexpdf,
     magrittr,
diff --git a/R/run_pipeline.R b/R/run_pipeline.R
index c49b5f06..b94ba001 100644
--- a/R/run_pipeline.R
+++ b/R/run_pipeline.R
@@ -1,6 +1,9 @@
 # Author(s): Awa Synthia
 # Last modified: 2024
 
+# Temporal fix. should work normally in descr
+library(readr)
+
 getSeqs <- function(sequences,
                           acc_file_path = "accs.txt",
                           dir_path = "~",

From a1a90f9c3854cf39fa6b78e665f25a7e9f678e26 Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Fri, 22 Nov 2024 00:08:46 +0200
Subject: [PATCH 18/23] short fix

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 NAMESPACE           | 9 +++++++++
 R/generate_report.R | 3 ++-
 R/run_pipeline.R    | 4 ++--
 R/viz_utils.R       | 3 +++
 4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 1a2f3a99..b22ec462 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -99,6 +99,15 @@ export(wordcloud3)
 export(writeMSA_AA2FA)
 export(writeProcessRuntime2TSV)
 export(writeProcessRuntime2YML)
+import(DT)
+import(data.table)
+import(dplyr)
+import(httr)
+import(plotly)
+import(readr)
+import(rentrez)
+import(stringr)
+import(visNetwork)
 importFrom(Biostrings,AAStringSet)
 importFrom(Biostrings,readAAStringSet)
 importFrom(Biostrings,toString)
diff --git a/R/generate_report.R b/R/generate_report.R
index 98377448..aa2872c8 100644
--- a/R/generate_report.R
+++ b/R/generate_report.R
@@ -1,7 +1,8 @@
 # Author(s): Awa Synthia
 # Last modified: 2024
 
-# load functions to use
+# import libs
+#' @import readr stringr
 
 #' @export
 # example usage: getCaseStudyReport("Acinetobacter baumannii", "Beta-lactams")
diff --git a/R/run_pipeline.R b/R/run_pipeline.R
index b94ba001..24b08272 100644
--- a/R/run_pipeline.R
+++ b/R/run_pipeline.R
@@ -1,8 +1,8 @@
 # Author(s): Awa Synthia
 # Last modified: 2024
 
-# Temporal fix. should work normally in descr
-library(readr)
+# import libs
+#' @import readr data.table httr rentrez
 
 getSeqs <- function(sequences,
                           acc_file_path = "accs.txt",
diff --git a/R/viz_utils.R b/R/viz_utils.R
index 5549ff97..6f4a2b3e 100644
--- a/R/viz_utils.R
+++ b/R/viz_utils.R
@@ -1,6 +1,9 @@
 # Author(s): Awa Synthia
 # Last modified: 2024
 
+# import libs
+#' @import dplyr stringr visNetwork DT plotly
+#'
 # Function to generate the InterProScan Visualization
 getIPRGenesVisualization <- function(data, app_data,
                                     query_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"),

From 3d0ca3a5079db266a5f3a18d8de7452a3f587927 Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Fri, 22 Nov 2024 08:21:45 +0200
Subject: [PATCH 19/23] add fix

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 NAMESPACE             | 2 ++
 R/MolEvolData_class.R | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/NAMESPACE b/NAMESPACE
index b22ec462..eadec16a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -99,6 +99,7 @@ export(wordcloud3)
 export(writeMSA_AA2FA)
 export(writeProcessRuntime2TSV)
 export(writeProcessRuntime2YML)
+import(Biostrings)
 import(DT)
 import(data.table)
 import(dplyr)
@@ -107,6 +108,7 @@ import(plotly)
 import(readr)
 import(rentrez)
 import(stringr)
+import(tidyverse)
 import(visNetwork)
 importFrom(Biostrings,AAStringSet)
 importFrom(Biostrings,readAAStringSet)
diff --git a/R/MolEvolData_class.R b/R/MolEvolData_class.R
index 427bd85b..ab496048 100644
--- a/R/MolEvolData_class.R
+++ b/R/MolEvolData_class.R
@@ -1,6 +1,8 @@
 # Author(s): Samuel Chen
 # Last modified: 2020
 
+#' @import Biostrings tidyverse
+
 setClass("MolEvolData",
          slots = list(
              df = "data.frame",

From 9e11b4f69732c9b08850d7be5d4e951100679f87 Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Sun, 24 Nov 2024 01:04:32 +0200
Subject: [PATCH 20/23] use iprscanr

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 R/generate_report.R             | 18 +++----
 R/ipr2viz.R                     |  6 +--
 R/run_pipeline.R                | 85 +++++++++++++++++++++------------
 R/viz_utils.R                   |  4 +-
 inst/report/report_template.Rmd |  6 +--
 man/getTopAccByLinDomArch.Rd    |  2 +-
 6 files changed, 73 insertions(+), 48 deletions(-)

diff --git a/R/generate_report.R b/R/generate_report.R
index aa2872c8..65b8364d 100644
--- a/R/generate_report.R
+++ b/R/generate_report.R
@@ -129,13 +129,13 @@ runAnalysis <- function(
         GCCutoff = 0.5,
         query_select = NULL,
         query_iprDatabases = c(
-          "Pfam", "SMART", "Phobius",
-          "Gene3D", "TMHMM", "SignalP_GRAM_POSITIVE",
-          "SUPERFAMILY", "MobiDBLite", "TIGRFAM", "PANTHER", "Coils"
+          "PfamA", "SMART", "Phobius",
+          "Gene3d", "TMHMM", "SignalP_GRAM_POSITIVE",
+          "SuperFamily", "MobiDBLite", "Panther", "Coils"
         ),
         query_iprVisType = "Analysis", tree_msa_tool = "ClustalO",
         levels = 2,
-        DA_Col = "DomArch.Pfam",
+        DA_Col = "DomArch.PfamA",
         msa_rep_num = 10,
         msa_reduce_by = "Species",
         rval_phylo = FALSE,
@@ -496,6 +496,7 @@ runAnalysis <- function(
             phylo = phylo,
             type = type,
             job_code = pin_id,
+            APPLICATION = query_iprDatabases
         )
     }
 
@@ -656,7 +657,7 @@ runAnalysis <- function(
 
     DA_Prot_value <- getDomArchProt(app_data, DASelect)
 
-    DALinPlot_value <- getDomArchHeatmapPlot(DA_col = "DomArch.Pfam",
+    DALinPlot_value <- getDomArchHeatmapPlot(DA_col = "DomArch.PfamA",
                                                 DACutoff,
                                                 DA_Prot_value,
                                                 DA_lin_color = "viridis",
@@ -664,14 +665,14 @@ runAnalysis <- function(
 
     DALin_TotalCounts_value <- getDomArchTotalCounts(DA_Prot_value,
                                               DACutoff,
-                                              DA_col = "DomArch.Pfam",
+                                              DA_col = "DomArch.PfamA",
                                               app_data)
 
-    DALinTable_value <- getDomArchLinearTable(DA_col = "DomArch.Pfam",
+    DALinTable_value <- getDomArchLinearTable(DA_col = "DomArch.PfamA",
                                               app_data@ipr_path,
                                               DALin_TotalCounts_value)
 
-    DANetwork_value <- getDomNetwork(DA_col = "DomArch.Pfam",
+    DANetwork_value <- getDomNetwork(DA_col = "DomArch.PfamA",
                                                DACutoff,
                                                DA_Prot_value,
                                                networkLayout = "nice",
@@ -728,6 +729,7 @@ runAnalysis <- function(
                           output_file = output_file,
                           params = params,
                           envir = new.env(parent = globalenv()))
+        browseURL(output_file)
     }, error = function(e) {
         return(paste("Error in report generation:", e$message))
     })
diff --git a/R/ipr2viz.R b/R/ipr2viz.R
index fb145c67..51650761 100644
--- a/R/ipr2viz.R
+++ b/R/ipr2viz.R
@@ -79,7 +79,7 @@ themeGenes2 <- function() {
 #' n = 20, query = "specific_query_name")
 #' }
 getTopAccByLinDomArch <- function(infile_full,
-    DA_col = "DomArch.Pfam",
+    DA_col = "DomArch.PfamA",
     lin_col = "Lineage_short",
     n = 20,
     query) {
@@ -187,7 +187,7 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
     ## Getting top n accession numbers using getTopAccByLinDomArch()
     top_acc <- getTopAccByLinDomArch(
         infile_full = infile_full,
-        DA_col = "DomArch.Pfam",
+        DA_col = "DomArch.PfamA",
         ## @SAM, you could pick by the Analysis w/ max rows!
         lin_col = "Lineage_short",
         n = topn, query = query
@@ -360,7 +360,7 @@ plotIPR2VizWeb <- function(infile_ipr,
 
     ## @SAM, colnames, merges, everything neeeds to be done now based on the
     ## combined lookup table from "common_data"
-    lookup_tbl_path <- "/data/research/jravilab/common_data/cln_lookup_tbl.tsv"
+    lookup_tbl_path <- system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE)
     lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = MolEvolvR::lookup_table_cols)
 
     ## Read IPR file and subset by Accessions
diff --git a/R/run_pipeline.R b/R/run_pipeline.R
index 24b08272..d3f60e98 100644
--- a/R/run_pipeline.R
+++ b/R/run_pipeline.R
@@ -45,7 +45,8 @@ runFull <- function(
     job_code=NULL,
     submitter_email=NULL,
     advanced_options=NULL,
-    get_slurm_mails=FALSE
+    get_slurm_mails=FALSE,
+    APPLICATION
 ) {
   # Set working directory
   setwd(dir)
@@ -86,7 +87,7 @@ runFull <- function(
       # output_file <- paste0(dir, "/blast_output_", i, ".txt")
 
       # Construct the local BLAST command (make sure 'blastn' is available locally)
-      runMolevolvrPipeline(input_file, DB, NHITS, EVAL, is_query = F, type, i)
+      runMolevolvrPipeline(input_file, DB, NHITS, EVAL, is_query = F, type, i, APPLICATION = APPLICATION)
 
       #cmd <- sprintf(
       #  "deltablast -query %s -db %s -out %s -num_alignments %d -evalue %f -remote",
@@ -104,7 +105,7 @@ runFull <- function(
   }
 
   # Simulate query run locally
-  runMolevolvrPipeline(sequences, DB, NHITS, EVAL, is_query = TRUE, type)
+  runMolevolvrPipeline(sequences, DB, NHITS, EVAL, is_query = TRUE, type, APPLICATION = APPLICATION)
   # cmd_query <- sprintf(
   #   "deltablast -query %s -db %s -out %s_query.txt -num_alignments %d -evalue %f -remote",
   #   sequences, DB, paste0(dir, "/query_output"), NHITS, EVAL
@@ -122,7 +123,7 @@ runFull <- function(
 
 # Define the main pipeline function
 runMolevolvrPipeline <- function(input_paths, db, nhits, eval,
-                                   is_query, type, i) {
+                                   is_query, type, i, APPLICATION) {
 
   # Start time
   start <- Sys.time()
@@ -213,18 +214,8 @@ runMolevolvrPipeline <- function(input_paths, db, nhits, eval,
 
   # Run INTERPROSCAN
   runIPRScan2(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")),
-                   PREFIX, OUTDIR)
-  new_header <- c("AccNum", "SeqMD5Digest", "SLength", "Analysis", "DB.ID",
-                  "SignDesc", "StartLoc", "StopLoc", "Score",
-                  "Status", "RunDate", "IPRAcc", "IPRDesc")
-
-  temp_data <- read_tsv(file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")),
-                        col_names = FALSE)
-
-  colnames(temp_data) <- new_header
+                  prefix =  PREFIX, outdir = OUTDIR, appl = APPLICATION)
 
-  write.table(temp_data, file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")),
-              sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
   # Run IPR2LIN
   ipr2Linear(file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")),
           file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), PREFIX)
@@ -849,7 +840,7 @@ clust2Table <- function(clust, blast) {
 }
 
 # Function to run InterProScan
-runIPRScan2 <- function(query_file, prefix, outdir) {
+runIPRScan2 <- function(query_file, prefix, outdir, appl) {
 
   # Start InterProScan run
   cat("\n######################\n")
@@ -868,19 +859,42 @@ runIPRScan2 <- function(query_file, prefix, outdir) {
 
   # get the path to the interproscan.sh script from the environment
   # variable INTERPROSCAN_CMD, or assume it's on the path if unspecified
-  iprscan_cmd <- Sys.getenv("INTERPROSCAN_CMD", unset="interproscan.sh")
-
-  command <- paste(
-    iprscan_cmd, "-i",
-    shQuote(query_file),
-    "-b", shQuote(outfile),
-    "-f TSV --cpu", Sys.getenv("INTERPROSCAN_CPUS", "4"),
-    "--appl Pfam,MobiDBlite,Phobius,Coils,SignalP_GRAM_POSITIVE,",
-    "SignalP_GRAM_NEGATIVE,Hamap,Gene3D,SignalP_EUK"
-  )
+  # iprscan_cmd <- Sys.getenv("INTERPROSCAN_CMD", unset="interproscan.sh")
+  #
+  # command <- paste(
+  #   iprscan_cmd, "-i",
+  #   shQuote(query_file),
+  #   "-b", shQuote(outfile),
+  #   "-f TSV --cpu", Sys.getenv("INTERPROSCAN_CPUS", "4"),
+  #   "--appl Pfam,MobiDBlite,Phobius,Coils,SignalP_GRAM_POSITIVE,",
+  #   "SignalP_GRAM_NEGATIVE,Hamap,Gene3D,SignalP_EUK"
+  # )
 
   # Run the command
-  system(command)
+  # system(command)
+  submit_ipr(path2seq = query_file,
+             outfolder = outdir,
+             appl = appl,
+             email = "jravilab.msu@gmail.com")
+
+  new_header <- c("AccNum", "SeqMD5Digest", "SLength", "Analysis", "DB.ID",
+                  "SignDesc", "StartLoc", "StopLoc", "Score",
+                  "Status", "RunDate", "IPRAcc", "IPRDesc")
+
+  temp_data <- read_tsv(file.path(outdir, paste0("ipr_joined.tsv")))
+
+  # Drop the last two columns
+  temp_data <- temp_data[, -((ncol(temp_data) - 1):ncol(temp_data))]
+
+  # remove last 10 xters in AccNum
+  temp_data$file_name <- substr(temp_data$file_name, 1,
+                                nchar(temp_data$file_name) - 12)
+
+  colnames(temp_data) <- new_header
+
+  write.table(temp_data, file.path(outdir, paste0(prefix, ".iprscan.tsv")),
+              sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
+
 
   cat("##################\n")
   cat("END OF IPRSCAN RUN\n")
@@ -893,6 +907,10 @@ ipr2Linear <- function(ipr, acc2info, prefix) {
   ipr_in <- read_tsv(ipr, col_names = TRUE) %>%
     mutate(DB.ID = gsub("G3DSA:", "", DB.ID))
 
+  if (prefix == "query_data") {
+    ipr_in$AccNum <- sub("(.*)([0-9])([0-9])$", "\\1.\\2_\\3", ipr_in$AccNum)
+  }
+
   acc2info_out <- fread(input = acc2info, sep = "\t", header = T, fill = T) %>%
     mutate(FullAccNum = gsub("\\|", "", FullAccNum)) %>%
     mutate(FullAccNum = gsub(".*[a-z]", "", FullAccNum))
@@ -991,9 +1009,9 @@ ipr2Linear <- function(ipr, acc2info, prefix) {
 
 ipr2DomArch <- function(infile_ipr, prefix,
                    analysis = c(
-                     "Pfam", "SMART", "Phobius",
-                     "Gene3D", "TMHMM", "SignalP_GRAM_POSITIVE",
-                     "SUPERFAMILY", "MobiDBLite", "TIGRFAM", "PANTHER", "Coils"
+                     "PfamA", "SMART", "Phobius",
+                     "Gene3d", "TMHMM", "SignalP_GRAM_POSITIVE",
+                     "SuperFamily", "MobiDBLite", "Panther", "Coils"
                    )) {
   # read in cleaned up iprscan results
   ipr_in <- read_tsv(infile_ipr, col_names = T, col_types = ipr_cln_cols)
@@ -1012,7 +1030,12 @@ ipr2DomArch <- function(infile_ipr, prefix,
       group_by(Analysis) %>%
       arrange(StartLoc)
     i <- 1
-    for (a in analysis) {
+    analysis_var <- c(
+      "Pfam", "SMART", "Phobius",
+      "Gene3D", "TMHMM", "SignalP_GRAM_POSITIVE",
+      "SUPERFAMILY", "MobiDBLite", "PANTHER", "Coils"
+    )
+    for (a in analysis_var) {
       a_da <- DA %>% filter(Analysis == a)
       if (a == "SignalP_EUK" || a == "SignalP_GRAM_NEGATIVE" ||
           a == "SignalP_GRAM_POSITIVE") {
diff --git a/R/viz_utils.R b/R/viz_utils.R
index 6f4a2b3e..4697204e 100644
--- a/R/viz_utils.R
+++ b/R/viz_utils.R
@@ -6,7 +6,7 @@
 #'
 # Function to generate the InterProScan Visualization
 getIPRGenesVisualization <- function(data, app_data,
-                                    query_iprDatabases = c("Pfam", "Phobius", "TMHMM", "Gene3D"),
+                                    query_iprDatabases = c("PfamA", "Phobius", "TMHMM", "Gene3d"),
                                     query_iprVisType = "Analysis") {
 
     # Check if analysis is loaded
@@ -829,7 +829,7 @@ getDomArchPlot <- function(ipr_path, query_names,
     }
 
     n <- "Name"
-    plot <- ipr2viz_web(
+    plot <- plotIPR2VizWeb(
         infile_ipr = ipr_path,
         accessions = query_names,
         analysis = analysis_type,
diff --git a/inst/report/report_template.Rmd b/inst/report/report_template.Rmd
index 9d616e9e..d2c9acde 100644
--- a/inst/report/report_template.Rmd
+++ b/inst/report/report_template.Rmd
@@ -277,8 +277,8 @@ Visualizations and analyses of all query and homologous protein domains, structu
 ```{r, echo=FALSE}
 if(is.null(params$query_iprDatabases)){
     choices <- params$query_domarch_cols
-    if ("Pfam" %in% choices & "Phobius" %in% choices) {
-      analysis_type <- c("Pfam", "Phobius")
+    if ("PfamA" %in% choices & "Phobius" %in% choices) {
+      analysis_type <- c("PfamA", "Phobius")
       } else {
         default <- choices[1]
         }
@@ -293,7 +293,7 @@ if(is.null(params$query_iprVisType)){
     }
 
 if(length(params$query_domarch_cols) >= 1 ) {
-  DomArchPlot <- ipr2viz_web(
+  DomArchPlot <- plotIPR2VizWeb(
     infile_ipr = params$query_data@ipr_path,
     accessions = params$query_data@df$QueryName,
     analysis = analysis_type, group_by = analysis_group, name = "Name"
diff --git a/man/getTopAccByLinDomArch.Rd b/man/getTopAccByLinDomArch.Rd
index 0eeb0610..9e91d606 100644
--- a/man/getTopAccByLinDomArch.Rd
+++ b/man/getTopAccByLinDomArch.Rd
@@ -6,7 +6,7 @@
 \usage{
 getTopAccByLinDomArch(
   infile_full,
-  DA_col = "DomArch.Pfam",
+  DA_col = "DomArch.PfamA",
   lin_col = "Lineage_short",
   n = 20,
   query

From 53c94dea592522ea655621666b788d30026d7ec8 Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Sun, 24 Nov 2024 12:27:26 +0200
Subject: [PATCH 21/23] make it work

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 R/networks_domarch.R            | 40 ++++++++++++++++-----------------
 inst/report/report_template.Rmd |  2 +-
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/R/networks_domarch.R b/R/networks_domarch.R
index 84522943..b171b3a8 100755
--- a/R/networks_domarch.R
+++ b/R/networks_domarch.R
@@ -24,20 +24,20 @@
 #' A network of domains is returned based on shared domain architectures.
 #'
 #' @param prot A data frame that contains the column 'DomArch'.
-#' @param column Name of column containing Domain architecture from which nodes 
+#' @param column Name of column containing Domain architecture from which nodes
 #' and edges are generated.
 #' @param domains_of_interest Character vector specifying domains of interest.
-#' @param cutoff Integer. Only use domains that occur at or above the cutoff for 
+#' @param cutoff Integer. Only use domains that occur at or above the cutoff for
 #' total counts if cutoff_type is "Total Count".
-#' Only use domains that appear in cutoff or greater lineages if cutoff_type is 
+#' Only use domains that appear in cutoff or greater lineages if cutoff_type is
 #' Lineage.
 #' @param layout Character. Layout type to be used for the network. Options are:
 #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"}
-#' @param query_color Character. Color to represent the queried domain in the 
+#' @param query_color Character. Color to represent the queried domain in the
 #' network.
 #'
 #' @importFrom dplyr across add_row all_of distinct filter mutate pull select
-#' @importFrom igraph delete_vertices graph_from_edgelist vertex
+#' @importFrom igraph delete_vertices graph_from_edgelist vertex V
 #' @importFrom purrr map
 #' @importFrom rlang sym
 #' @importFrom shiny showNotification
@@ -174,21 +174,21 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c
                 if ("X" %in% V(g)$name) {
                     g <- delete_vertices(g, "X")
                 }
-                V(g)$size <- as.numeric(wc[V(g)$name])
+                igraph::V(g)$size <- as.numeric(wc[igraph::V(g)$name])
 
-                V(g)$size <- (V(g)$size - min(V(g)$size)) / (max(V(g)$size) - min(V(g)$size)) * 20 + 10 # scaled by degree
+                igraph::V(g)$size <- (igraph::V(g)$size - min(igraph::V(g)$size)) / (max(igraph::V(g)$size) - min(igraph::V(g)$size)) * 20 + 10 # scaled by degree
 
                 # setting vertex color by size
-                V(g)$color <- rainbow(5, alpha = .5)[round((V(g)$size - min(V(g)$size)) / (max(V(g)$size) - min(V(g)$size)) * 4 + 1)]
-                V(g)$frame.color <- V(g)$color
+                igraph::V(g)$color <- rainbow(5, alpha = .5)[round((igraph::V(g)$size - min(igraph::V(g)$size)) / (max(igraph::V(g)$size) - min(igraph::V(g)$size)) * 4 + 1)]
+                igraph::V(g)$frame.color <- igraph::V(g)$color
                 # scaling edge width
-                E(g)$width <- e.sz
-                E(g)$width <- ifelse(log(E(g)$width) == 0, .3, log(E(g)$width))
+                igraph::E(g)$width <- e.sz
+                igraph::E(g)$width <- ifelse(log(igraph::E(g)$width) == 0, .3, log(igraph::E(g)$width))
                 # coloring edges by width
                 ew <- c(2.7, 4.5)
-                E(g)$color <- sapply(
-                    E(g)$width,
-                    function(x) if (x >= ew[1] && x <= ew[2]) E(g)$color <- adjustcolor("cadetblue", alpha.f = .7) else if (x > ew[2]) E(g)$color <- adjustcolor("maroon", alpha.f = .5) else E(g)$color <- "gray55"
+                igraph::E(g)$color <- sapply(
+                    igraph::E(g)$width,
+                    function(x) if (x >= ew[1] && x <= ew[2]) igraph::E(g)$color <- adjustcolor("cadetblue", alpha.f = .7) else if (x > ew[2]) igraph::E(g)$color <- adjustcolor("maroon", alpha.f = .5) else igraph::E(g)$color <- "gray55"
                 )
                 vis_g <- visIgraph(g, type = "full")
             }
@@ -231,18 +231,18 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c
 #'
 #'
 #' @param prot A data frame that contains the column 'DomArch'.
-#' @param column Name of column containing Domain architecture from which nodes 
+#' @param column Name of column containing Domain architecture from which nodes
 #' and edges are generated.
 #' @param domains_of_interest Character vector specifying the domains of interest.
-#' @param cutoff Integer. Only use domains that occur at or above the cutoff for 
+#' @param cutoff Integer. Only use domains that occur at or above the cutoff for
 #' total counts if cutoff_type is "Total Count".
-#' Only use domains that appear in cutoff or greater lineages if cutoff_type is 
+#' Only use domains that appear in cutoff or greater lineages if cutoff_type is
 #' Lineage.
 #' @param layout Character. Layout type to be used for the network. Options are:
 #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"}
-#' @param query_color Color that the nodes of the domains in the 
+#' @param query_color Color that the nodes of the domains in the
 #' domains_of_interest vector are colored
-#' @param partner_color Color that the nodes that are not part of the 
+#' @param partner_color Color that the nodes that are not part of the
 #' domains_of_interest vector are colored
 #' @param border_color Color for the borders of the nodes.
 #' @param IsDirected Is the network directed? Set to false to eliminate arrows
@@ -364,4 +364,4 @@ createBinaryDomainNetwork <- function(prot, column = "DomArch", domains_of_inter
         "auto" = visIgraphLayout(vg, "layout.auto")
     )
     vg
-}
\ No newline at end of file
+}
diff --git a/inst/report/report_template.Rmd b/inst/report/report_template.Rmd
index d2c9acde..ab02bdeb 100644
--- a/inst/report/report_template.Rmd
+++ b/inst/report/report_template.Rmd
@@ -455,7 +455,7 @@ if(length(params$repAccNums) >= 3 ) {
     query_accession <- params$app_data@df %>% filter(!duplicated(QueryName))
     query_accession <- unique(query_accession$Query)
     query <- seqs[query_accession]
-    names(query) <- params$PhyloSelect
+    # names(query) <- params$PhyloSelect
     query <- AAStringSet(query)
     }
 

From 3c57ec71f247d42efa8074a66f17a2215fba8ac7 Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Sun, 24 Nov 2024 14:12:12 +0200
Subject: [PATCH 22/23] add iprscanr to desc

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 DESCRIPTION      | 1 +
 NAMESPACE        | 1 +
 R/run_pipeline.R | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index e2ff7eca..8933ae4c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -55,6 +55,7 @@ Imports:
     httr,
     httr2,
     igraph,
+    iprscanr,
     latexpdf,
     magrittr,
     msa,
diff --git a/NAMESPACE b/NAMESPACE
index eadec16a..a5a44a79 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -104,6 +104,7 @@ import(DT)
 import(data.table)
 import(dplyr)
 import(httr)
+import(iprscanr)
 import(plotly)
 import(readr)
 import(rentrez)
diff --git a/R/run_pipeline.R b/R/run_pipeline.R
index d3f60e98..fe1fa4c9 100644
--- a/R/run_pipeline.R
+++ b/R/run_pipeline.R
@@ -2,7 +2,7 @@
 # Last modified: 2024
 
 # import libs
-#' @import readr data.table httr rentrez
+#' @import readr data.table httr rentrez iprscanr
 
 getSeqs <- function(sequences,
                           acc_file_path = "accs.txt",

From f6b379116d3db78164d9c7db98e475b6fedc7c02 Mon Sep 17 00:00:00 2001
From: Awa Synthia <ndahili14@gmail.com>
Date: Mon, 25 Nov 2024 14:09:31 +0200
Subject: [PATCH 23/23] add iprscanr to desc suggests

Signed-off-by: Awa Synthia <ndahili14@gmail.com>
---
 DESCRIPTION | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 8933ae4c..a30025de 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -55,7 +55,6 @@ Imports:
     httr,
     httr2,
     igraph,
-    iprscanr,
     latexpdf,
     magrittr,
     msa,
@@ -87,4 +86,5 @@ Imports:
     XVector,
     yaml
 Suggests:
-    knitr
+    knitr,
+    iprscanr