diff --git a/.buildinfo b/.buildinfo new file mode 100644 index 0000000..99ec388 --- /dev/null +++ b/.buildinfo @@ -0,0 +1,4 @@ +# Sphinx build info version 1 +# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. +config: 6ef85c61a07ec8e9f0ed07676e851c59 +tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/.doctrees/cpg.doctree b/.doctrees/cpg.doctree new file mode 100644 index 0000000..d1a7c98 Binary files /dev/null and b/.doctrees/cpg.doctree differ diff --git a/.doctrees/dust.doctree b/.doctrees/dust.doctree new file mode 100644 index 0000000..93a8a69 Binary files /dev/null and b/.doctrees/dust.doctree differ diff --git a/.doctrees/environment.pickle b/.doctrees/environment.pickle new file mode 100644 index 0000000..dbb6b29 Binary files /dev/null and b/.doctrees/environment.pickle differ diff --git a/.doctrees/eponine.doctree b/.doctrees/eponine.doctree new file mode 100644 index 0000000..d1c61bc Binary files /dev/null and b/.doctrees/eponine.doctree differ diff --git a/.doctrees/genblast.doctree b/.doctrees/genblast.doctree new file mode 100644 index 0000000..d86992c Binary files /dev/null and b/.doctrees/genblast.doctree differ diff --git a/.doctrees/index.doctree b/.doctrees/index.doctree new file mode 100644 index 0000000..3141c25 Binary files /dev/null and b/.doctrees/index.doctree differ diff --git a/.doctrees/install.doctree b/.doctrees/install.doctree new file mode 100644 index 0000000..1efdc99 Binary files /dev/null and b/.doctrees/install.doctree differ diff --git a/.doctrees/license.doctree b/.doctrees/license.doctree new file mode 100644 index 0000000..6af76fe Binary files /dev/null and b/.doctrees/license.doctree differ diff --git a/.doctrees/minimap.doctree b/.doctrees/minimap.doctree new file mode 100644 index 0000000..1135099 Binary files /dev/null and b/.doctrees/minimap.doctree differ diff --git a/.doctrees/red.doctree b/.doctrees/red.doctree new file mode 100644 index 0000000..5faa08d Binary files /dev/null and b/.doctrees/red.doctree differ diff --git a/.doctrees/repeatmasker.doctree b/.doctrees/repeatmasker.doctree new file mode 100644 index 0000000..e56e5b0 Binary files /dev/null and b/.doctrees/repeatmasker.doctree differ diff --git a/.doctrees/scallop.doctree b/.doctrees/scallop.doctree new file mode 100644 index 0000000..610e945 Binary files /dev/null and b/.doctrees/scallop.doctree differ diff --git a/.doctrees/star.doctree b/.doctrees/star.doctree new file mode 100644 index 0000000..3cad10e Binary files /dev/null and b/.doctrees/star.doctree differ diff --git a/.doctrees/stringtie.doctree b/.doctrees/stringtie.doctree new file mode 100644 index 0000000..952b4e0 Binary files /dev/null and b/.doctrees/stringtie.doctree differ diff --git a/.doctrees/trf.doctree b/.doctrees/trf.doctree new file mode 100644 index 0000000..61be72e Binary files /dev/null and b/.doctrees/trf.doctree differ diff --git a/.doctrees/trnascan.doctree b/.doctrees/trnascan.doctree new file mode 100644 index 0000000..991c879 Binary files /dev/null and b/.doctrees/trnascan.doctree differ diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/_modules/ensembl/tools/anno/protein_annotation/genblast.html b/_modules/ensembl/tools/anno/protein_annotation/genblast.html new file mode 100644 index 0000000..cb71cf4 --- /dev/null +++ b/_modules/ensembl/tools/anno/protein_annotation/genblast.html @@ -0,0 +1,615 @@ + + + + + + + ensembl.tools.anno.protein_annotation.genblast — ensembl-anno 0.1 
documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +

Source code for ensembl.tools.anno.protein_annotation.genblast

+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+GenBlast identifies homologous gene sequences in genomic databases.
+It remains sensitive even when the sequences have undergone significant
+evolutionary change, which makes it well suited to comparative genomics
+and to studies of gene evolution, gene families, and gene function
+across diverse species. GenBlast is available as a standalone
+command-line tool and as a component of many bioinformatics pipelines.
+
+
+She, R., Chu, J.S., Uyar, B., Wang, J., Wang, K., and Chen, N. (2011).
+genBlastG: using BLAST searches to build homologous gene models.
+Bioinformatics, 27(15): 2141-2143.
+"""
+__all__ = ["run_genblast"]
+
+import logging
+import logging.config
+import multiprocessing
+import os
+from pathlib import Path
+import random
+import re
+import shutil
+import signal
+import subprocess
+from typing import List
+import argschema
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+    check_gtf_content,
+)
+
+logger = logging.getLogger(__name__)
+
+
+
[docs]def run_genblast(
+    masked_genome: Path,
+    output_dir: Path,
+    protein_dataset: Path,
+    max_intron_length: int,
+    genblast_timeout_secs: int = 10800,
+    genblast_bin: Path = Path("genblast"),
+    convert2blastmask_bin: Path = Path("convert2blastmask"),
+    makeblastdb_bin: Path = Path("makeblastdb"),
+    num_threads: int = 1,
+    protein_set: str = "uniprot",
+) -> None:
+    """
+    Executes GenBlast on genomic slices
+
+    :param masked_genome: Masked genome file path.
+    :type masked_genome: Path
+    :param output_dir: Working directory path.
+    :type output_dir: Path
+    :param protein_dataset: Protein dataset (UniProt/OrthoDB) path.
+    :type protein_dataset: Path
+    :param max_intron_length: Maximum intron length.
+    :type max_intron_length: int
+    :param genblast_timeout_secs: Timeout period (sec).
+    :type genblast_timeout_secs: int, default 10800
+    :param genblast_bin: GenBlast executable path.
+    :type genblast_bin: Path, default genblast
+    :param convert2blastmask_bin: convert2blastmask executable path.
+    :type convert2blastmask_bin: Path, default convert2blastmask
+    :param makeblastdb_bin: makeblastdb executable path.
+    :type makeblastdb_bin: Path, default makeblastdb
+    :param num_threads: Number of threads.
+    :type num_threads: int, default 1
+    :param protein_set: Protein dataset source, either "uniprot" or "orthodb".
+    :type protein_set: str, default "uniprot"
+
+    :return: None
+    :rtype: None
+
+    """
+
+    check_exe(genblast_bin)
+    check_exe(convert2blastmask_bin)
+    check_exe(makeblastdb_bin)
+    if protein_set == "uniprot":
+        genblast_dir = create_dir(output_dir, "uniprot_output")
+    elif protein_set == "orthodb":
+        genblast_dir = create_dir(output_dir, "orthodb_output")
+    else:
+        raise ValueError(f"Unsupported protein_set: {protein_set}")
+    output_file = genblast_dir / "annotation.gtf"
+    if output_file.exists():
+        transcript_count = check_gtf_content(output_file, "transcript")
+        if transcript_count > 0:
+            logger.info("Genblast gtf file exists, skipping analysis")
+            return
+    logger.info("Looking for alignscore file: %s", genblast_dir / "alignscore.txt")
+    if not Path(f"{genblast_dir}/alignscore.txt").exists():
+        # Get the repo directory
+        repo_root_dir = Path(__file__).parents[6]
+        shutil.copy(Path(f"{repo_root_dir}/data/alignscore.txt"), genblast_dir)
+
+    if not masked_genome.exists():
+        raise IOError(f"Masked genome file does not exist: {masked_genome}")
+    if not protein_dataset.exists():
+        raise IOError(f"Protein file does not exist: {protein_dataset}")
+    asnb_file = Path(f"{masked_genome}.asnb")
+    if asnb_file.exists():
+        logger.info("Found an existing asnb file, so will skip convert2blastmask")
+    else:
+        _run_convert2blastmask(convert2blastmask_bin, masked_genome, asnb_file)
+    _run_makeblastdb(makeblastdb_bin, masked_genome, asnb_file)
+    batched_protein_files = _split_protein_file(
+        protein_dataset, genblast_dir, num_threads
+    )
+    pool = multiprocessing.Pool(num_threads)  # pylint:disable=consider-using-with
+    for batched_protein_file in batched_protein_files:
+        pool.apply_async(
+            _multiprocess_genblast,
+            args=(
+                batched_protein_file,
+                masked_genome,
+                genblast_bin,
+                genblast_timeout_secs,
+                max_intron_length,
+            ),
+        )
+    pool.close()
+    pool.join()
+    _generate_genblast_gtf(genblast_dir)
+    for i in range(0, 10):
+        shutil.rmtree(genblast_dir / f"bin_{i}")
+    logger.info("Completed running GenBlast")
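+
+# A minimal usage sketch (not part of the module): the paths below are
+# hypothetical, and GenBlast, convert2blastmask and makeblastdb are
+# assumed to be on PATH.
+#
+#     from pathlib import Path
+#     from ensembl.tools.anno.protein_annotation.genblast import run_genblast
+#
+#     run_genblast(
+#         masked_genome=Path("genome.softmasked.fa"),
+#         output_dir=Path("anno_out"),
+#         protein_dataset=Path("orthodb_proteins.fa"),
+#         max_intron_length=100000,
+#         num_threads=4,
+#         protein_set="orthodb",
+#     )
+#
+# On success, anno_out/orthodb_output/annotation.gtf holds the merged models.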
+
+
+def _multiprocess_genblast(
+    protein_file: Path,
+    masked_genome: Path,
+    genblast_bin: Path,
+    genblast_timeout: int,
+    max_intron_length: int,
+):
+    """
+    Executes GenBlast on a genomic slice
+    Args:
+        protein_file: Path of a single batched file.
+        masked_genome : Masked genome file path.
+        genblast_bin : Software path.
+        genblast_timeout: Time for timeout (sec).
+        max_intron_length: Maximum intron length.
+    Command line options:
+        -P Search program used to produce HSPs,
+            can be either "blast" or "wublast", default is "blast",
+            optional
+        -p specifies the program option of genBlast: genblasta or genblastg
+        -q List of query sequences to blast, must be in fasta format,
+            required
+        -t The target database of genomic sequences in fasta format,
+            required
+        -g parameter for blast: Perform gapped alignment (T/F)
+            [default: F], optional
+        -d parameter for genBlast: maximum allowed distance between HSPs
+            within the same gene, a non-negative integer [default: 100000],
+            optional
+        -r parameter for genBlast: number of ranks in the output,
+            a positive integer, optional
+        -e parameter for blast: The e-value, [default: 1e-2],
+            optional
+        -c parameter for genBlast: minimum percentage of query gene
+            coverage in the output, between 0 and 1 (e.g. for 50%
+            gene coverage, use "0.5"), optional
+        -W parameter for blast: Set word size, 0 means using blast default [default: 0],
+            optional
+        -scodon The number of base pairs to search for start codon within the region of HSP
+            group (inside the first HSP). If not specified, default is 15.
+        -i parameter for genBlastG: minimum intron length, optional.
+            If not specified, the default value is 15.
+        -x parameter for genBlastG: minimum internal exon length, optional.
+            If not specified, default is 20.
+        -n parameter for genBlastG: maximum number of splice sites per region, optional.
+            If not specified, default is 20.
+        -gff output options: turn on GFF output
+        -o output filename, optional. If not specified, the output
+            will be the same as the query filename with ".gblast"
+            extension.
+        -pid turn on final alignment PID computation (global alignment between predicted
+            gene and query) in output.
+        -softmask With this option NCBI blast will create a masking library,
+            you need to use it when blasting against a whole genome
+    """
+    logger.info("Running GenBlast on: %s", protein_file)
+
+    genblast_cmd = [
+        str(genblast_bin),
+        "-p",
+        "genblastg",
+        "-q",
+        str(protein_file),
+        "-t",
+        str(masked_genome),
+        "-g",
+        "T",
+        "-pid",
+        "-r",
+        "1",
+        "-P",
+        "blast",
+        "-gff",
+        "-e",
+        "1e-1",
+        "-c",
+        "0.8",
+        "-W",
+        "3",
+        "-softmask",
+        "-scodon",
+        "50",
+        "-i",
+        "30",
+        "-x",
+        "10",
+        "-n",
+        "30",
+        "-d",
+        str(max_intron_length),
+        "-o",
+        str(protein_file),
+    ]
+
+    logger.info(" ".join(genblast_cmd))
+    # Using the child process termination as described here:
+    # https://alexandra-zaharia.github.io/posts/kill-subprocess
+    # -and-its-children-on-timeout-python/
+    try:
+        p = subprocess.Popen(  # pylint:disable=consider-using-with
+            genblast_cmd, start_new_session=True
+        )
+        p.wait(timeout=genblast_timeout)
+    except subprocess.TimeoutExpired:
+        logger.error("Timeout reached for file: %s \n", protein_file)
+        Path(f"{protein_file}.except").touch()
+        os.killpg(os.getpgid(p.pid), signal.SIGTERM)
+
+
+def _generate_genblast_gtf(genblast_dir: Path) -> None:
+    """
+    Collect output from GenBlast and create the final gtf file
+    genblast_dir: Working directory path.
+ """ + logging.info("AAAAA _generate_genblast_gtf") + output_file = genblast_dir / "annotation.gtf" + with open(output_file, "w+", encoding="utf8") as file_out: + genblast_extension = "_1.1c_2.3_s1_0_16_1" + for path in genblast_dir.rglob("*"): + # for root, dirs, files in os.walk(genblast_dir): + # for genblast_file in files: + # genblast_file = os.path.join(root, genblast_file) + if path.is_file() and path.suffix == ".gff": + gtf_string = _convert_genblast_gff_to_gtf(path) + file_out.write(gtf_string) + elif path.is_file() and path.suffix in ( + ".fa.blast", + ".fa.blast.report", + genblast_extension, + ): + path.unlink() + + +def _split_protein_file( + protein_dataset: Path, output_dir: Path, batch_size: int = 20 +) -> List: + """ + The protein dataset file is splitted by a number of sequence equals to the batch_size + in batch files stored in 10 output directories. + protein_dataset : Path for the protein dataset. + output_dir : Output directory path. + batch_size : Size of the batch, it needs to be equals to the number of threads + to parallelise the sequence processing for each file. + """ + batched_protein_files = [] + + for i in range(0, 10): + create_dir(output_dir, (f"bin_{i}")) + with open(protein_dataset,"r", encoding="utf8") as file_in: + seq_count = 0 + batch_count = 0 + current_record = "" + initial_seq = True + for line in file_in: + match = re.search(r">(.+)$", line) + # match header and is not first sequence, if the number of stored sequences in each file equals + # the number of batch_size, a new file will be created and the current_record reset + if match and not initial_seq and seq_count % batch_size == 0: + bin_num = random.randint(0, 9) + batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa" + with batch_file.open("w+") as file_out: + file_out.write(current_record) + batch_count += 1 + seq_count += 1 + current_record = line + batched_protein_files.append(batch_file) + # match header and is the first sequence + elif match: + current_record += line + initial_seq = False + seq_count += 1 + # other lines + else: + current_record += line + + if current_record: + bin_num = random.randint(0, 9) + batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa" + with batch_file.open("w+") as file_out: + file_out.write(current_record) + batched_protein_files.append(batch_file) + return batched_protein_files + + +def _run_convert2blastmask( + convert2blastmask_bin: Path, masked_genome: Path, asnb_file: Path +) -> None: + """ + Convert masking information in lower-case masked FASTA input to file + formats suitable for makeblastdb. + convert2blastmask_bin : Software path. + masked_genome: Path of masked genome file. + asnb_file: Path of assembly file. + """ + logger.info("Running convert2blastmask prior to GenBlast:") + cmd = [ + str(convert2blastmask_bin), + "-in", + str(masked_genome), + "-parse_seqids", + "-masking_algorithm", # mask_program_name + "other", + "-masking_options", # mask_program_options + '"REpeatDetector, default"', + "-outfmt", # output_format + "maskinfo_asn1_bin", + "-out", + str(asnb_file), + ] + logger.info(" ".join(cmd)) + subprocess.run(cmd, check=True) + logger.info("Completed running convert2blastmask") + + +def _run_makeblastdb(makeblastdb_bin: Path, masked_genome: Path, asnb_file: Path) -> None: + """ + Application to create BLAST databases. + makeblastdb_bin : Software path. + masked_genome: Path of masked genome file. + asnb_file: Path of assembly file. 
+ """ + logger.info("Running makeblastdb prior to GenBlast") + subprocess.run( # pylint:disable=subprocess-run-check + [ + str(makeblastdb_bin), + "-in", + str(masked_genome), + "-dbtype", # molecule_type + "nucl", + "-parse_seqids", + "-mask_data", + str(asnb_file), + "-max_file_sz", # number_of_bytes + "10000000000", + ] + ) + logger.info("Completed running makeblastdb") + + +def _convert_genblast_gff_to_gtf(gff_file: Path) -> str: + """ + Convert the content of gtf file in gff format + gff_file: Path for the gff file + """ + gtf_string = "" + with open(gff_file, "r", encoding="utf8") as file_in: + for line in file_in: + results = line.split() + if len(results) == 9: + results[2] = "exon" if results[2] == "coding_exon" else results[2] + attributes = _set_genblast_attributes(str(results[8]), str(results[2])) + results[8] = attributes + converted_line = "\t".join(results) + gtf_string += converted_line + "\n" + return gtf_string + + +def _set_genblast_attributes(attributes: str, feature_type: str) -> str: + """ + Given the list of attributes in the genblast output, + define the new attributes for the gtf file. + attributes: GenBlast attribute list + feature_type: transcript or exon + Example genBlast output #pylint: disable=line-too-long, trailing-whitespace + 1 genBlastG transcript 131128674 131137049 252.729 - . ID=259447-R1-1-A1;Name=259447;PID=84.65;Coverage=94.22;Note=PID:84.65-Cover:94.22 + 1 genBlastG coding_exon 131137031 131137049 . - . ID=259447-R1-1-A1-E1;Parent=259447-R1-1-A1 + 1 genBlastG coding_exon 131136260 131136333 . - . ID=259447-R1-1-A1-E2;Parent=259447-R1-1-A1 + 1 genBlastG coding_exon 131128674 131130245 . - . ID=259447-R1-1-A1-E3;Parent=259447-R1-1-A1 + """ + converted_attributes = "" + split_attributes = attributes.split(";") + if feature_type == "transcript": + match = re.search(r"Name\=(.+)$", split_attributes[1]) + assert match + name = match.group(1) + converted_attributes = f'gene_id "{name}"; transcript_id "{name}";' + elif feature_type == "exon": + match = re.search(r"\-E(\d+);Parent\=(.+)\-R\d+\-\d+\-", attributes) + assert match + exon_rank = match.group(1) + name = match.group(2) + converted_attributes = ( + f'gene_id "{name}"; transcript_id "{name}"; exon_number "{exon_rank}";' + ) + + return converted_attributes + + +class InputSchema(argschema.ArgSchema): + """Input arguments expected to run TRF.""" + + masked_genome_file = argschema.fields.InputFile( + required=True, description="Masked genome file path" + ) + output_dir = argschema.fields.OutputDir( + required=True, description="Output directory path" + ) + protein_file = argschema.fields.String( + required=True, description="Path for the protein dataset" + ) + genblast_timeout_secs = argschema.fields.Integer( + required=False, default=10800, description="Genblast timeout period" + ) + max_intron_length = argschema.fields.Integer( + required=True, description="Maximum intron length" + ) + genblast_bin = argschema.fields.String( + required=False, + default="genblast", + description="Genblast executable path", + ) + convert2blastmask_bin = argschema.fields.String( + required=False, + default="convert2blastmask", + description="convert2blastmask executable path", + ) + makeblastdb_bin = argschema.fields.String( + required=False, default="makeblastdb", description="makeblastdb executable path" + ) + num_threads = argschema.fields.Integer( + required=False, default=1, description="Number of threads" + ) + protein_set = argschema.fields.String( + required=True, + description="Protein set [uniprot,orthodb]", 
+ validate=lambda x: x in ["uniprot", "orthodb"], + ) + + +def main() -> None: + """Genblast's entry-point.""" + mod = argschema.ArgSchemaParser(schema_type=InputSchema) + log_file_path = create_dir(mod.args["output_dir"], "log") / "genblast.log" + loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" + logging.config.fileConfig( + loginipath, + defaults={"logfilename": str(log_file_path)}, + disable_existing_loggers=False, + ) + run_genblast( + Path(mod.args["masked_genome_file"]), + Path(mod.args["output_dir"]), + Path(mod.args["protein_file"]), + mod.args["max_intron_length"], + mod.args["genblast_timeout_secs"], + Path(mod.args["genblast_bin"]), + Path(mod.args["convert2blastmask_bin"]), + Path(mod.args["makeblastdb_bin"]), + mod.args["num_threads"], + mod.args["protein_set"], + ) + + +if __name__ == "__main__": + main() +
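+
+# Worked example (illustrative): feeding the sample GFF attributes from the
+# _set_genblast_attributes docstring through the converter yields standard
+# GTF attributes.
+#
+#     >>> _set_genblast_attributes(
+#     ...     "ID=259447-R1-1-A1;Name=259447;PID=84.65;Coverage=94.22", "transcript"
+#     ... )
+#     'gene_id "259447"; transcript_id "259447";'
+#     >>> _set_genblast_attributes(
+#     ...     "ID=259447-R1-1-A1-E1;Parent=259447-R1-1-A1", "exon"
+#     ... )
+#     'gene_id "259447"; transcript_id "259447"; exon_number "1";'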
+ +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/_modules/ensembl/tools/anno/repeat_annotation/dust.html b/_modules/ensembl/tools/anno/repeat_annotation/dust.html new file mode 100644 index 0000000..cc62410 --- /dev/null +++ b/_modules/ensembl/tools/anno/repeat_annotation/dust.html @@ -0,0 +1,306 @@ + + + + + + + ensembl.tools.anno.repeat_annotation.dust — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +

Source code for ensembl.tools.anno.repeat_annotation.dust

+# See the NOTICE file distributed with this work for additional information #pylint: disable=missing-module-docstring
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+DustMasker is a program that identifies and masks out low complexity
+parts of a genome using a new and improved DUST algorithm.
+
+Morgulis A, Gertz EM, Schaffer AA, Agarwala R. A Fast and Symmetric
+DUST Implementation to Mask Low-Complexity DNA Sequences.
+J Comput Biol. 2006; 13(5): 1028-1040.
+"""
+__all__ = ["run_dust"]
+
+import logging
+import logging.config
+import multiprocessing
+import os
+from os import PathLike
+from pathlib import Path
+import re
+import subprocess
+import tempfile
+from typing import List
+import argschema
+
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+    check_gtf_content,
+    get_seq_region_length,
+    get_slice_id,
+    slice_output_to_gtf,
+    get_sequence,
+)
+
+logger = logging.getLogger(__name__)
+
+
+
[docs]def run_dust(
+    genome_file: PathLike,
+    output_dir: Path,
+    dust_bin: Path = Path("dustmasker"),
+    num_threads: int = 1,
+) -> None:
+    """
+    Run Dust on genomic slices with multiprocessing
+    :param genome_file: Genome file path.
+    :type genome_file: PathLike
+    :param output_dir: Working directory path.
+    :type output_dir: Path
+    :param dust_bin: Dust software path.
+    :type dust_bin: Path, default dustmasker
+    :param num_threads: Number of threads.
+    :type num_threads: int, default 1
+
+    :return: None
+    :rtype: None
+    """
+
+    check_exe(dust_bin)
+    dust_dir = create_dir(output_dir, "dust_output")
+    os.chdir(str(dust_dir))
+    output_file = dust_dir / "annotation.gtf"
+    if output_file.exists():
+        transcript_count = check_gtf_content(output_file, "repeat")
+        if transcript_count > 0:
+            logger.info("Dust gtf file exists, skipping analysis")
+            return
+    logger.info("Creating list of genomic slices")
+    seq_region_to_length = get_seq_region_length(genome_file, 5000)
+    slice_ids_per_region = get_slice_id(
+        seq_region_to_length, slice_size=1000000, overlap=0, min_length=5000
+    )
+    dust_cmd = [str(dust_bin), "-in"]
+    pool = multiprocessing.Pool(num_threads)  # pylint: disable=consider-using-with
+    for slice_id in slice_ids_per_region:
+        pool.apply_async(
+            _multiprocess_dust,
+            args=(
+                dust_cmd,
+                slice_id,
+                dust_dir,
+                genome_file,
+            ),
+        )
+    pool.close()
+    pool.join()
+    slice_output_to_gtf(dust_dir, "repeat_id", "dust", True, ".dust.gtf")
+    for gtf_file in dust_dir.glob("*.dust.gtf"):
+        gtf_file.unlink()
+
+
+def _multiprocess_dust(  # pylint: disable=too-many-locals
+    dust_cmd: List[str],
+    slice_id: List[str],
+    dust_dir: Path,
+    genome_file: Path,
+) -> None:
+    """
+    Run Dust on a genomic slice
+    Args:
+        dust_cmd: Dust command to execute.
+        slice_id: Slice ID to process, as [region_name, start, end].
+        dust_dir : Dust output directory path.
+        genome_file : Genome file.
+    """
+    region_name, start, end = slice_id
+    logger.info(
+        "Processing slice to find low complexity regions with Dust: %s:%s:%s",
+        region_name,
+        start,
+        end,
+    )
+    seq = get_sequence(region_name, int(start), int(end), 1, genome_file, dust_dir)
+    slice_name = f"{region_name}.rs{start}.re{end}"
+    with tempfile.TemporaryDirectory(dir=dust_dir) as tmpdirname:
+        slice_file = dust_dir / tmpdirname / f"{slice_name}.fa"
+        with open(slice_file, "w+", encoding="utf8") as region_out:
+            region_out.write(f">{region_name}\n{seq}\n")
+        region_results = dust_dir / f"{slice_name}.dust.gtf"
+        output_file = Path(f"{slice_file}.dust")
+        # Copy the base command so the shared list is not mutated.
+        dust_cmd = dust_cmd.copy()
+        dust_cmd.append(str(slice_file))
+        logger.info("dust_cmd: %s", dust_cmd)
+        with open(output_file, "w+", encoding="utf8") as dust_out:
+            subprocess.run(dust_cmd, stdout=dust_out, check=True)
+        _create_dust_gtf(output_file, region_results, region_name)
+        slice_file.unlink()
+        output_file.unlink()
+
+
+def _create_dust_gtf(
+    output_file: Path,
+    region_results: Path,
+    region_name: str,
+) -> None:
+    """
+    Read the dustmasker output and save the masked intervals in gtf format
+    All the genomic slices are collected in a single gtf output
+    Args:
+        output_file : Dustmasker output file to read.
+        region_results : GTF file with the results per region.
+        region_name : Coordinates of genomic slice.
+    """
+    with open(output_file, "r", encoding="utf8") as dust_in, open(
+        region_results, "w+", encoding="utf8"
+    ) as dust_out:
+        repeat_count = 1
+        for line in dust_in:
+            result_match = re.search(r"(\d+)\ - (\d+)", line)
+            if result_match:
+                start = int(result_match.group(1)) + 1
+                end = int(result_match.group(2)) + 1
+                gtf_line = (
+                    f"{region_name}\tDust\trepeat\t{start}\t"
+                    f'{end}\t.\t+\t.\trepeat_id "{repeat_count}";\n'
+                )
+                dust_out.write(gtf_line)
+                repeat_count += 1
+
+
+class InputSchema(argschema.ArgSchema):
+    """Input arguments expected to run DustMasker."""
+
+    genome_file = argschema.fields.InputFile(required=True, description="Genome file path")
+    output_dir = argschema.fields.OutputDir(required=True, description="Output directory path")
+    dust_bin = argschema.fields.String(
+        required=False,
+        default="dustmasker",
+        description="Dust executable path",
+    )
+    num_threads = argschema.fields.Integer(required=False, default=1, description="Number of threads")
+
+
+def main() -> None:
+    """Dust's entry-point."""
+    mod = argschema.ArgSchemaParser(schema_type=InputSchema)
+    log_file_path = create_dir(mod.args["output_dir"], "log") / "dust.log"
+    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
+    logging.config.fileConfig(
+        loginipath,
+        defaults={"logfilename": str(log_file_path)},
+        disable_existing_loggers=False,
+    )
+    run_dust(
+        mod.args["genome_file"],
+        mod.args["output_dir"],
+        mod.args["dust_bin"],
+        mod.args["num_threads"],
+    )
+
+
+if __name__ == "__main__":
+    main()
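+
+# Worked example (illustrative): dustmasker interval output such as
+#
+#     >chr1
+#     1174 - 1212
+#
+# is 0-based and inclusive, so _create_dust_gtf shifts both coordinates by
+# one and emits (columns tab-separated)
+#
+#     chr1  Dust  repeat  1175  1213  .  +  .  repeat_id "1";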
+ +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/_modules/ensembl/tools/anno/repeat_annotation/red.html b/_modules/ensembl/tools/anno/repeat_annotation/red.html new file mode 100644 index 0000000..c010a31 --- /dev/null +++ b/_modules/ensembl/tools/anno/repeat_annotation/red.html @@ -0,0 +1,272 @@ + + + + + + + ensembl.tools.anno.repeat_annotation.red — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +

Source code for ensembl.tools.anno.repeat_annotation.red

+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Red is the first repeat-detection tool capable of labeling its training data
+and training itself automatically on an entire genome.
+Girgis, H.Z. Red: an intelligent, rapid, accurate tool for detecting repeats
+de-novo on the genomic scale. BMC Bioinformatics 16, 227 (2015).
+https://doi.org/10.1186/s12859-015-0654-5
+"""
+__all__ = ["run_red"]
+
+import logging
+import logging.config
+from os import PathLike
+from pathlib import Path
+import re
+import subprocess
+import argschema
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+)
+
+logger = logging.getLogger(__name__)
+
+
+
[docs]def run_red(genome_file: Path, output_dir: Path, red_bin: Path = Path("Red")) -> str:
+    """
+    Run Red on genome file
+    :param genome_file: Genome file path.
+    :type genome_file: Path
+    :param output_dir: Working directory path.
+    :type output_dir: Path
+    :param red_bin: Red software path.
+    :type red_bin: Path, default Red
+
+    :return: Masked genome file
+    :rtype: str
+    """
+    check_exe(red_bin)
+    red_dir = create_dir(output_dir, "red_output")
+    red_mask_dir = create_dir(red_dir, "mask_output")
+    red_repeat_dir = create_dir(red_dir, "repeat_output")
+    red_genome_dir = create_dir(red_dir, "genome_dir")
+
+    genome_file_name = genome_file.name
+    red_genome_file = red_genome_dir / genome_file_name
+    genome_file_stem = genome_file.stem
+    masked_genome_file = red_mask_dir / f"{genome_file_stem}.msk"
+    repeat_coords_file = red_repeat_dir / f"{genome_file_stem}.rpt"
+    output_file = red_dir / "annotation.gtf"
+
+    if masked_genome_file.exists():
+        logger.warning(
+            "Masked genome file already found on the path to the Red mask output dir. "
+            "Will not create a new file"
+        )
+        return str(masked_genome_file)
+    if red_genome_file.exists():
+        logger.warning(
+            "Unmasked genome file already found on the path to the Red genome dir, "
+            "will not create a sym link"
+        )
+    else:
+        logger.info(
+            "Sym linking the genome file %s to the Red genome dir %s",
+            genome_file,
+            red_genome_dir,
+        )
+        red_genome_file.symlink_to(genome_file)
+    if not red_genome_file.exists():
+        raise FileNotFoundError(
+            f"Could not find the genome file in the Red genome dir or sym link "
+            f"to the original file. Path expected: {genome_file}"
+        )
+    logger.info("Running Red")
+    subprocess.run(
+        [
+            str(red_bin),
+            "-gnm",
+            str(red_genome_dir),
+            "-msk",
+            str(red_mask_dir),
+            "-rpt",
+            str(red_repeat_dir),
+        ],
+        check=True,
+    )
+    _create_red_gtf(repeat_coords_file, output_file)
+    return str(masked_genome_file)
+
+
+def _create_red_gtf(repeat_coords_file: Path, output_file: Path):
+    """
+    Create the Red gtf file from the Red repeat coordinates file
+
+    Args:
+        repeat_coords_file: Coordinates for repeats.
+        output_file : GTF file with the final results.
+    """
+    with open(repeat_coords_file, "r", encoding="utf8") as red_in, open(
+        output_file, "w+", encoding="utf8"
+    ) as red_out:
+        for repeat_id, line in enumerate(red_in, start=1):
+            result_match = re.search(r"^\>(.+)\:(\d+)\-(\d+)", line)
+            if result_match:
+                region_name = result_match.group(1)
+                # Note that Red is 0-based, so add 1
+                start = int(result_match.group(2)) + 1
+                end = int(result_match.group(3)) + 1
+                gtf_line = (
+                    f"{region_name}\tRed\trepeat\t{start}\t"
+                    f'{end}\t.\t+\t.\trepeat_id "{repeat_id}";\n'
+                )
+                red_out.write(gtf_line)
+
+
+class InputSchema(argschema.ArgSchema):
+    """Input arguments expected to run Red."""
+
+    genome_file = argschema.fields.InputFile(
+        required=True, description="Genome file path"
+    )
+    output_dir = argschema.fields.OutputDir(
+        required=True, description="Output directory path"
+    )
+    red_bin = argschema.fields.String(
+        required=False, default="Red", description="Red executable path",
+    )
+
+
+def main() -> None:
+    """Red's entry-point."""
+    mod = argschema.ArgSchemaParser(schema_type=InputSchema)
+    log_file_path = create_dir(mod.args["output_dir"], "log") / "red.log"
+    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
+    logging.config.fileConfig(
+        loginipath,
+        defaults={"logfilename": str(log_file_path)},
+        disable_existing_loggers=False,
+    )
+    run_red(
+        Path(mod.args["genome_file"]),
+        Path(mod.args["output_dir"]),
+        Path(mod.args["red_bin"]),
+    )
+
+
+if __name__ == "__main__":
+    main()
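+
+# Worked example (illustrative): a Red .rpt header line such as
+#
+#     >chr1:11503-11675
+#
+# matches the regex above; Red coordinates are 0-based, so both positions
+# are shifted by one and the emitted GTF line is (columns tab-separated)
+#
+#     chr1  Red  repeat  11504  11676  .  +  .  repeat_id "1";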
+ +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/_modules/ensembl/tools/anno/repeat_annotation/repeatmasker.html b/_modules/ensembl/tools/anno/repeat_annotation/repeatmasker.html new file mode 100644 index 0000000..f086cc4 --- /dev/null +++ b/_modules/ensembl/tools/anno/repeat_annotation/repeatmasker.html @@ -0,0 +1,378 @@ + + + + + + + ensembl.tools.anno.repeat_annotation.repeatmasker — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +

Source code for ensembl.tools.anno.repeat_annotation.repeatmasker

+# See the NOTICE file distributed with this work for additional information #pylint: disable=missing-module-docstring
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+    RepeatMasker is a program that screens DNA sequences for interspersed
+    repeats and low complexity DNA sequences.
+    Smit, AFA, Hubley, R & Green, P. RepeatMasker Open-4.0
+"""
+
+__all__ = ["run_repeatmasker"]
+
+import json
+import logging
+import logging.config
+import multiprocessing
+import os
+from os import PathLike
+from pathlib import Path
+import re
+import subprocess
+from typing import List
+import argschema
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+    check_gtf_content,
+    get_seq_region_length,
+    get_slice_id,
+    slice_output_to_gtf,
+    get_sequence,
+)
+logger = logging.getLogger(__name__)
+
+
+
[docs]def run_repeatmasker( + genome_file: PathLike, + output_dir: Path, + repeatmasker_bin: Path = Path("RepeatMasker"), + library: str = "", + repeatmasker_engine: str = "rmblast", + species: str = "", + num_threads: int = 1, +) -> None: + + """ + Executes RepeatMasker on the genome slices and stores the final annotation.gtf in repeatmasker_output + + :param genome_file: Genome file path. + :type genome_file: PathLike + :param output_dir: Output directory path. + :type output_dir: Path + :param repeatmasker_bin: RepeatMasker executable path. + :type repeatmasker_bin: Path, default RepeatMasker + :param library: Custom repeat library. + :type library: str + :param repeatmasker_engine: RepeatMasker engine. + :type repeatmasker_engine: str, default rmblast + :param species: Species name. + :type species: str + :param num_threads: Number of threads. + :type num_threads: int, default 1 + + :return: None + :rtype: None + """ + check_exe(repeatmasker_bin) + repeatmasker_dir = create_dir(output_dir, "repeatmasker_output") + + output_file = repeatmasker_dir / "annotation.gtf" + if output_file.exists(): + transcript_count = check_gtf_content(output_file, "repeat") + if transcript_count > 0: + logger.info("Repeatmasker gtf file exists") + return + logger.info("Creating list of genomic slices") + seq_region_to_length = get_seq_region_length(genome_file, 5000) + slice_ids_per_region = get_slice_id( + seq_region_to_length, slice_size=1000000, overlap=0, min_length=5000 + ) + repeatmasker_cmd = [ + str(repeatmasker_bin), + "-nolow",#does not display simple repeats or low_complexity DNA in the annotation + "-engine", + repeatmasker_engine, + "-dir", + str(repeatmasker_dir), + ] + if not library: + if not species: + species = "homo" + repeatmasker_cmd.extend(["-species", species]) + else: + repeatmasker_cmd.extend(["-lib", library]) + logger.info(f"Running RepeatMasker {repeatmasker_cmd}") + pool = multiprocessing.Pool(num_threads) # pylint: disable=consider-using-with + for slice_id in slice_ids_per_region: + pool.apply_async( + _multiprocess_repeatmasker, + args=( + repeatmasker_cmd, + slice_id, + genome_file, + repeatmasker_dir, + ), + ) + pool.close() + pool.join() + slice_output_to_gtf(repeatmasker_dir, "repeat_id", "repeatmask", True, ".rm.gtf") + for gtf_file in repeatmasker_dir.glob("*.rm.gtf"): + gtf_file.unlink()
+ +def _multiprocess_repeatmasker( # pylint: disable=too-many-locals + repeatmasker_cmd: List[str], + slice_id: List[str], + genome_file: Path, + repeatmasker_dir: Path, +) -> None: + """ + Run Repeatmasker on genomic slice + + Args: + repeatmasker_cmd: RepeatMasker command to execute. + slice_id: Slice ID to run RepeatMasker on. + genome_file : Genome file path. + repeatmasker_dir : RepeatMasker output directory path. + """ + + region_name, start, end = slice_id + logger.info( + "Processing slice to find repeats with RepeatMasker: %s:%s:%s", + region_name, + start, + end, + ) + seq = get_sequence( + region_name, int(start), int(end), 1, genome_file, repeatmasker_dir + ) + slice_file_name = f"{region_name}.rs{start}.re{end}" + region_file = repeatmasker_dir / f"{slice_file_name}.fa" + with open(region_file, "w+", encoding="utf8") as region_fasta_out: + region_fasta_out.write(f">{region_name}\n{seq}\n") + region_results_file = Path(f"{region_file}.rm.gtf") + output_file = Path(f"{region_file}.out") + masked_file = Path(f"{region_file}.masked") + tbl_file = Path(f"{region_file}.tbl") + log_file = Path(f"{region_file}.log") + cat_file = Path(f"{region_file}.cat") + repeatmasker_cmd = repeatmasker_cmd.copy() + repeatmasker_cmd.append(str(region_file)) + logger.info(repeatmasker_cmd) + subprocess.run(repeatmasker_cmd, check=True) + _create_repeatmasker_gtf(output_file, region_results_file, region_name) + output_file.unlink() + region_file.unlink() + masked_file.unlink(missing_ok=True) + tbl_file.unlink(missing_ok=True) + log_file.unlink(missing_ok=True) + cat_file.unlink(missing_ok=True) + + +def _create_repeatmasker_gtf( # pylint: disable=too-many-locals + output_file: Path, + region_results_file: Path, + region_name: str, +) -> None: + """ + Read the fasta file and save the content in gtf format + + All the genomic slices are collected in a single gtf output with the following format: + SW perc perc perc query position in query matching repeat position in repeat + score div. del. ins. sequence begin end (left) repeat class/family begin end (left) ID + Args: + output_file : GTF file with final results. + region_results_file_path : GTF file with results per region. + region_name : Coordinates of genomic slice. 
+ """ + with open(output_file, "r", encoding="utf8") as repeatmasker_in, open( + region_results_file, "w+", encoding="utf8" + ) as repeatmasker_out: + repeat_count = 1 + for line in repeatmasker_in: + result_match = re.search(r"^\s*\d+\s+", line) + if result_match: + results = line.split() + if results[-1] == "*": + results.pop() + if len(results) != 15: + continue + score = results[0] + start = results[5] + end = results[6] + strand = results[8] + repeat_name = results[9] + repeat_class = results[10] + if strand == "+": + repeat_start = results[11] + repeat_end = results[12] + else: + repeat_start = results[13] + repeat_end = results[12] + strand = "-" + gtf_line = ( + f"{region_name}\tRepeatMasker\trepeat\t{start}\t{end}\t.\t" + f"{strand}\t.\trepeat_id{repeat_count}; " + f'repeat_name "{repeat_name}"; repeat_class "{repeat_class}"; ' + f'repeat_start "{repeat_start}"; ' + f'repeat_end "{repeat_end}"; score "{score}";\n' + ) + repeatmasker_out.write(gtf_line) + repeat_count += 1 + + +class InputSchema(argschema.ArgSchema): + """Input arguments expected to run RepeatMasker.""" + + genome_file = argschema.fields.InputFile( + required= True, description= "Genome file path" + ) + output_dir = argschema.fields.OutputDir( + required= True, description= "Output directory path" + ) + repeatmasker_bin = argschema.fields.String( + required= False, default= "RepeatMasker", + description = "RepeatMasker executable path", + + ) + library = argschema.fields.String( + required= False, default= "", description= "Custom repeat library" + ) + repeatmasker_engine = argschema.fields.String( + required= False, default= "rmblast", description= "RepeatMasker engine" + ) + species = argschema.fields.String( + required= False, + default="homo", + description="Species name (used if no library is provided)" + ) + num_threads = argschema.fields.Integer( + required= False, default= 1, description= "Number of threads" + ) + + +def main() -> None: + """RepeatMasker's entry-point.""" + mod = argschema.ArgSchemaParser(schema_type=InputSchema) + log_file_path = create_dir(mod.args["output_dir"], "log") /"repeatmasking.log" + loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" + logging.config.fileConfig(loginipath, defaults={"logfilename": str(log_file_path)}, disable_existing_loggers=False,) + run_repeatmasker( + mod.args["genome_file"], + mod.args["output_dir"], + mod.args["repeatmasker_bin"], + mod.args["library"], + mod.args["repeatmasker_engine"], + mod.args["species"], + mod.args["num_threads"], + ) + +if __name__ == "__main__": + main() + +
+ +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/_modules/ensembl/tools/anno/repeat_annotation/trf.html b/_modules/ensembl/tools/anno/repeat_annotation/trf.html new file mode 100644 index 0000000..504bb56 --- /dev/null +++ b/_modules/ensembl/tools/anno/repeat_annotation/trf.html @@ -0,0 +1,410 @@ + + + + + + + ensembl.tools.anno.repeat_annotation.trf — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +

Source code for ensembl.tools.anno.repeat_annotation.trf

+# See the NOTICE file distributed with this work for additional information #pylint: disable=missing-module-docstring
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+    Tandem Repeats Finder is a program to locate and display tandem repeats in DNA sequences.
+    Benson G. Tandem repeats finder: a program to analyze DNA sequences.
+    Nucleic Acids Res. 1999; 27(2):573–580. doi:10.1093/nar/27.2.573
+"""
+__all__ = ["run_trf"]
+
+import logging
+import logging.config
+import multiprocessing
+import os
+from os import PathLike
+from pathlib import Path
+import re
+import subprocess
+import tempfile
+from typing import List
+import argschema
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+    check_gtf_content,
+    get_seq_region_length,
+    get_slice_id,
+    slice_output_to_gtf,
+    get_sequence,
+)
+
+logger = logging.getLogger(__name__)
+
+
+
[docs]def run_trf( + genome_file: PathLike, + output_dir: Path, + num_threads: int = 1, + trf_bin: Path = Path("trf"), + match_score: int = 2, + mismatch_score: int = 5, + delta: int = 7, + pm: int = 80, + pi: int = 10, + minscore: int = 40, + maxperiod: int = 500, +) -> None: + """ + Executes TRF on genomic slices + :param genome_file: Genome file path. + :type genome_file: PathLike + :param output_dir: Working directory path. + :type output_dir: Path + :param num_threads: int, number of threads. + :type num_threads: int, default 1 + :param trf_bin: TRF software path. + :type trf_bin: Path, default trf + :param match_score: Matching weight. + :type match_score: int, default 2 + :param mismatch_score: Mismatching penalty. + :type mismatch_score: int, default 5 + :param delta: Indel penalty. + :type delta: int, default 7 + :param pm: Match probability (whole number). + :type pm: int, default 80 + :param pi: Indel probability (whole number). + :type pi: int, default 10 + :param minscore: Minimum alignment score to report. + :type minscore: int, default 40 + :param maxperiod: Maximum period size to report. + :type maxperiod: int, default 500 + + :return: None + :rtype: None + """ + check_exe(trf_bin) + trf_dir = create_dir(output_dir, "trf_output") + os.chdir(str(trf_dir)) + output_file = trf_dir / "annotation.gtf" + if output_file.exists(): + transcript_count = check_gtf_content(output_file, "repeat") + if transcript_count > 0: + logger.info("Trf gtf file exists, skipping analysis") + return + logger.info("Creating list of genomic slices") + seq_region_to_length = get_seq_region_length(genome_file, 5000) + slice_ids_per_region = get_slice_id( + seq_region_to_length, slice_size=1000000, overlap=0, min_length=5000 + ) + trf_output_extension = ( + f".{match_score}.{mismatch_score}.{delta}." + f"{pm}.{pi}.{minscore}.{maxperiod}.dat" + ) + trf_cmd = [ + trf_bin, + None, + str(match_score), + str(mismatch_score), + str(delta), + str(pm), + str(pi), + str(minscore), + str(maxperiod), + "-d", + "-h", + ] + logger.info("Running TRF") + pool = multiprocessing.Pool(num_threads)#pylint:disable=consider-using-with + for slice_id in slice_ids_per_region: + pool.apply_async( + _multiprocess_trf, + args=( + trf_cmd, + slice_id, + trf_dir, + trf_output_extension, + genome_file, + ), + ) + pool.close() + pool.join() + slice_output_to_gtf(trf_dir, "repeat_id", "trf", True, ".trf.gtf") + for gtf_file in trf_dir.glob("*.trf.gtf"): + gtf_file.unlink()
+
+
+def _multiprocess_trf(
+    trf_cmd: List[str],
+    slice_id: List[str],
+    trf_dir: Path,
+    trf_output_extension: str,
+    genome_file: Path,
+) -> None:
+    """
+    Run TRF on a genomic slice
+    Args:
+        trf_cmd: TRF command to execute.
+        slice_id: Slice Id to run TRF on.
+        trf_dir : TRF output dir.
+        trf_output_extension: TRF file output extension.
+        genome_file : Genome file.
+    """
+    region_name, start, end = slice_id
+    logger.info(
+        "Processing slice to find tandem repeats with TRF:%s:%s:%s",
+        region_name,
+        start,
+        end,
+    )
+    seq = get_sequence(region_name, int(start), int(end), 1, genome_file, trf_dir)
+    slice_name = f"{region_name}.rs{start}.re{end}"
+    with tempfile.TemporaryDirectory(dir=trf_dir) as tmpdirname:
+        slice_file = trf_dir / tmpdirname / f"{slice_name}.fa"
+        with open(slice_file, "w+", encoding="utf8") as region_out:
+            region_out.write(f">{region_name}\n{seq}\n")
+        region_results = trf_dir / f"{slice_name}.trf.gtf"
+        output_file = Path(f"{slice_file}{trf_output_extension}")
+        trf_cmd = trf_cmd.copy()
+        trf_cmd[1] = str(slice_file)
+        logger.info("trf_cmd: %s", trf_cmd)
+        # TRF writes to its working directory, so run it inside the
+        # temporary directory.
+        subprocess.run(trf_cmd, cwd=trf_dir / tmpdirname)  # pylint:disable=subprocess-run-check
+        _create_trf_gtf(output_file, region_results, region_name)
+        slice_file.unlink()
+        output_file.unlink()
+
+
+def _create_trf_gtf(
+    output_file: Path,
+    region_results: Path,
+    region_name: str,
+) -> None:
+    """
+    Read the TRF output and save the selected repeats in gtf format
+
+    TRF output format:
+    cols 1+2: Indices of the repeat relative to the start of the sequence
+    col 3: Period size of the repeat
+    col 4: Number of copies aligned with the consensus pattern
+    col 5: Size of consensus pattern (may differ slightly from the period size)
+    col 6: Percent of matches between adjacent copies overall
+    col 7: Percent of indels between adjacent copies overall
+    col 8: Alignment score
+    cols 9-12: Percent composition for each of the four nucleotides
+    col 13: Entropy measure based on percent composition
+    col 14: Consensus sequence
+    col 15: Repeat sequence
+    Args:
+        output_file : GTF file with final results.
+        region_results : GTF file with results per region.
+        region_name : Coordinates of genomic slice.
+ """ + with open(output_file, "r", encoding="utf8") as trf_in, open( + region_results, "w+", encoding="utf8" + ) as trf_out: + repeat_count = 1 + for line in trf_in: + result_match = re.search(r"^\d+", line) + if result_match: + results = line.split() + if len(results) != 15: + continue + start = results[0] + end = results[1] + period = float(results[2]) + copy_number = float(results[3]) + percent_matches = float(results[5]) + score = float(results[7]) + repeat_consensus = results[13] + if ( # pylint: disable=too-many-boolean-expressions + score < 50 + and percent_matches >= 80 + and copy_number > 2 + and period < 10 + ) or (copy_number >= 2 and percent_matches >= 70 and score >= 50): + gtf_line = ( + f"{region_name}\tTRF\trepeat\t{start}\t{end}\t.\t+\t.\t" + f'repeat_id "{repeat_count}"; score "{score}"; ' + f'repeat_consensus "{repeat_consensus}";\n' + ) + trf_out.write(gtf_line) + repeat_count += 1 + + +class InputSchema(argschema.ArgSchema): + """Input arguments expected to run TRF.""" + + genome_file = argschema.fields.InputFile( + required=True, description="Genome file path" + ) + output_dir = argschema.fields.OutputDir( + required=True, description="Output directory path" + ) + trf_bin = argschema.fields.String( + required=False, + default="trf", + description="TRF executable path", + ) + match_score = argschema.fields.Integer( + required=False, default=2, description="Matching weight" + ) + mismatch_score = argschema.fields.Integer( + required=False, default=5, description="Mismatching penalty" + ) + delta = argschema.fields.Integer( + required=False, default=7, description="Indel penalty" + ) + pm = argschema.fields.Integer( + required=False, default=80, description="Match probability" + ) + pi = argschema.fields.Integer( + required=False, default=10, description="Indel probability" + ) + minscore = argschema.fields.Integer( + required=False, default=40, description="Minimum alignment score to report" + ) + maxperiod = argschema.fields.Integer( + required=False, default=500, description="Maximum period size to report" + ) + num_threads = argschema.fields.Integer( + required=False, default=1, description="Number of threads" + ) + + +def main() -> None: + """TRF's entry-point.""" + mod = argschema.ArgSchemaParser(schema_type=InputSchema) + log_file_path = create_dir(mod.args["output_dir"], "log") / "trf.log" + loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" + logging.config.fileConfig( + loginipath, + defaults={"logfilename": str(log_file_path)}, + disable_existing_loggers=False, + ) + run_trf( + mod.args["genome_file"], + mod.args["output_dir"], + mod.args["num_threads"], + mod.args["trf_bin"], + mod.args["match_score"], + mod.args["mismatch_score"], + mod.args["delta"], + mod.args["pm"], + mod.args["pi"], + mod.args["minscore"], + mod.args["maxperiod"], + ) + + +if __name__ == "__main__": + main() +
+ +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/_modules/ensembl/tools/anno/simple_feature_annotation/cpg.html b/_modules/ensembl/tools/anno/simple_feature_annotation/cpg.html new file mode 100644 index 0000000..28c5c2f --- /dev/null +++ b/_modules/ensembl/tools/anno/simple_feature_annotation/cpg.html @@ -0,0 +1,369 @@ + + + + + + + ensembl.tools.anno.simple_feature_annotation.cpg — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +

Source code for ensembl.tools.anno.simple_feature_annotation.cpg

+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Set of discriminant functions that can recognize structural and compositional features
+such as CpG islands, promoter regions and first splice-donor sites.
+Davuluri RV, Grosse I, Zhang MQ: Computational identification of promoters and
+first exons in the human genome. Nat Genet. 2001, 29(4):412-417. [PMID: 11726928]
+"""
+__all__ = ["run_cpg"]
+import logging
+import logging.config
+import multiprocessing
+from os import PathLike
+from pathlib import Path
+import re
+import subprocess
+from typing import List, Union
+import argschema
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+    check_gtf_content,
+    get_seq_region_length,
+    get_slice_id,
+    slice_output_to_gtf,
+    get_sequence,
+)
+
+logger = logging.getLogger(__name__)
+
+
+
[docs]def run_cpg( + genome_file: PathLike, + output_dir: Path, + cpg_bin: Path = Path("cpg_lh"), + cpg_min_length: int = 400, + cpg_min_gc_content: int = 50, + cpg_min_oe: float = 0.6, + num_threads: int = 1, +) -> None: + """ + Run CpG islands on genomic slices + + :param genome_file: Genome file path. + :type genome_file: PathLike + :param output_dir: Working directory path + :type output_dir: Path + :param cpg_bin: CpG software path. + :type cpg_bin: Path + :param cpg_min_length: Min length of CpG islands + :type cpg_min_length: int + :param cpg_min_gc_content: Min GC frequency percentage + :type cpg_min_gc_content: int + :param cpg_min_oe: Min ratio of the observed to expected number of CpG (CpGo/e) + :type cpg_min_oe: float + :param num_threads: int, number of threads. + :type num_threads: int + + :return: None + :rtype: None + """ + + check_exe(cpg_bin) + cpg_dir = create_dir(output_dir, "cpg_output") + output_file = cpg_dir / "annotation.gtf" + if output_file.exists(): + transcript_count = check_gtf_content(output_file, "simple_feature") + if transcript_count > 0: + logger.info("Cpg gtf file exists") + return + logger.info("Creating list of genomic slices") + seq_region_to_length = get_seq_region_length(genome_file, 5000) + slice_ids_per_region = get_slice_id( + seq_region_to_length, slice_size=1000000, overlap=0, min_length=5000 + ) + logger.info("Running CpG") + pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with + for slice_id in slice_ids_per_region: + pool.apply_async( + _multiprocess_cpg, + args=( + cpg_bin, + slice_id, + genome_file, + cpg_dir, + cpg_min_length, + cpg_min_gc_content, + cpg_min_oe, + ), + ) + + pool.close() + pool.join() + slice_output_to_gtf(cpg_dir, "feature_id", "cpg", True, ".cpg.gtf") + for gtf_file in cpg_dir.glob("*.cpg.gtf"): + gtf_file.unlink()
+
+
+def _multiprocess_cpg(
+    cpg_bin: Path,
+    slice_id: List[str],
+    genome_file: Path,
+    cpg_dir: Path,
+    cpg_min_length: int = 400,
+    cpg_min_gc_content: int = 50,
+    cpg_min_oe: float = 0.6,
+) -> None:
+    """
+    Annotation of CpG islands on a genomic slice
+    Args:
+        cpg_bin: CpG software path.
+        slice_id: Slice id to run CpG on.
+        genome_file : Genome file.
+        cpg_dir : Output dir.
+        cpg_min_length : Min length of CpG islands
+        cpg_min_gc_content : Min GC frequency percentage
+        cpg_min_oe : Min ratio of the observed to expected number of CpG (CpGo/e)
+    """
+    region_name, start, end = slice_id
+    logger.info(
+        "Processing slice to find CpG islands with cpg_lh: %s:%s:%s",
+        region_name,
+        start,
+        end,
+    )
+    seq = get_sequence(region_name, int(start), int(end), 1, genome_file, cpg_dir)
+    slice_name = f"{region_name}.rs{start}.re{end}"
+    slice_file = cpg_dir / f"{slice_name}.fa"
+    with open(slice_file, "w+", encoding="utf8") as region_out:
+        region_out.write(f">{region_name}\n{seq}\n")
+    region_results = cpg_dir / f"{slice_name}.cpg.gtf"
+    output_file = Path(f"{slice_file}.cpg")
+    cpg_cmd = [str(cpg_bin), str(slice_file)]
+    with open(output_file, "w+", encoding="utf8") as cpg_out:
+        subprocess.run(cpg_cmd, stdout=cpg_out, check=True)
+    _create_cpg_gtf(
+        output_file,
+        region_results,
+        region_name,
+        cpg_min_length,
+        cpg_min_gc_content,
+        cpg_min_oe,
+    )
+    slice_file.unlink()
+    output_file.unlink()
+
+
+def _create_cpg_gtf(
+    output_file: Path,
+    region_results: Path,
+    region_name: str,
+    cpg_min_length: int = 400,
+    cpg_min_gc_content: int = 50,
+    cpg_min_oe: float = 0.6,
+) -> None:
+    """
+    Read the cpg_lh output and save the selected islands in gtf format
+    All the genomic slices are collected in a single gtf output
+    Args:
+        output_file : cpg_lh output file to read.
+        region_results : GTF file with the results per region.
+        region_name : Coordinates of genomic slice.
+        cpg_min_length : Min length of CpG islands.
+        cpg_min_gc_content : Min GC frequency percentage.
+        cpg_min_oe : Min ratio of the observed to expected number of CpG (CpGo/e).
+    """
+    with open(output_file, "r", encoding="utf8") as cpg_in, open(
+        region_results, "w+", encoding="utf8"
+    ) as cpg_out:
+        feature_count = 1
+        for line in cpg_in:
+            result_match = re.search(r"^" + region_name, line)
+            if result_match:
+                results = line.split()
+                start = int(results[1])
+                end = int(results[2])
+                length = end - start + 1
+                score = float(results[3])
+                gc_content = float(results[6])
+                oe_score_str = results[7]
+                oe_score: Union[float, int]
+                if oe_score_str in ("-", "inf"):
+                    oe_score = 0
+                else:
+                    oe_score = float(oe_score_str)
+                if (
+                    length >= int(cpg_min_length)
+                    and gc_content >= int(cpg_min_gc_content)
+                    and oe_score >= float(cpg_min_oe)
+                ):
+                    gtf_line = (
+                        f"{region_name}\tCpG\tsimple_feature\t{start}\t"
+                        f'{end}\t.\t+\t.\tfeature_id "{feature_count}"; score "{score}";\n'
+                    )
+                    cpg_out.write(gtf_line)
+                    feature_count += 1
+
+
+class InputSchema(argschema.ArgSchema):
+    """Input arguments expected to run CpG software."""
+
+    genome_file = argschema.fields.InputFile(
+        required=True, description="Genome file path"
+    )
+    output_dir = argschema.fields.OutputDir(
+        required=True, description="Output directory path"
+    )
+    cpg_bin = argschema.fields.String(
+        required=False,
+        default="cpg_lh",
+        description="CpG executable path",
+    )
+    cpg_min_length = argschema.fields.Integer(
+        required=False,
+        default=400,
+        description="Min length of CpG islands",
+    )
+    cpg_min_gc_content = argschema.fields.Integer(
+        required=False,
+        default=50,
+        description="Min GC frequency percentage",
+    )
+    cpg_min_oe = argschema.fields.Float(
+        required=False,
+        default=0.6,
+        description="Min ratio of the observed to expected number of CpG (CpGo/e)",
+    )
+    num_threads = argschema.fields.Integer(
+        required=False, default=1, description="Number of threads"
+    )
+
+
+def main() -> None:
+    """CpG's entry-point."""
+    mod = argschema.ArgSchemaParser(schema_type=InputSchema)
+    log_file_path = create_dir(mod.args["output_dir"], "log") / "cpg.log"
+    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
+    logging.config.fileConfig(
+        loginipath,
+        defaults={"logfilename": str(log_file_path)},
+        disable_existing_loggers=False,
+    )
+    run_cpg(
+        mod.args["genome_file"],
+        mod.args["output_dir"],
+        mod.args["cpg_bin"],
+        mod.args["cpg_min_length"],
+        mod.args["cpg_min_gc_content"],
+        mod.args["cpg_min_oe"],
+        mod.args["num_threads"],
+    )
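The filter in _create_cpg_gtf keeps an island only when all three thresholds pass. A worked illustration using the column indices parsed above (the sample line is invented for illustration, not real cpg_lh output):

    line = "chr1 13500 14100 850.0 CpG island 65.0 0.81"
    results = line.split()
    start, end = int(results[1]), int(results[2])
    length = end - start + 1          # 601 >= cpg_min_length (400)
    gc_content = float(results[6])    # 65.0 >= cpg_min_gc_content (50)
    oe_score = float(results[7])      # 0.81 >= cpg_min_oe (0.6)
    print(length >= 400 and gc_content >= 50 and oe_score >= 0.6)  # True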
\ No newline at end of file
diff --git a/_modules/ensembl/tools/anno/simple_feature_annotation/eponine.html b/_modules/ensembl/tools/anno/simple_feature_annotation/eponine.html
new file mode 100644
index 0000000..4bb18bd
--- /dev/null
+++ b/_modules/ensembl/tools/anno/simple_feature_annotation/eponine.html
@@ -0,0 +1,351 @@

Source code for ensembl.tools.anno.simple_feature_annotation.eponine

+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Eponine is a probabilistic method for detecting transcription start sites (TSS)
+in mammalian genomic sequence, with good specificity and excellent positional accuracy.
+Down TA, Hubbard TJ. Computational detection and location of transcription start sites
+in mammalian genomic DNA. Genome Res. 2002 Mar;12(3):458-61. doi: 10.1101/gr.216102.
+PMID: 11875034; PMCID: PMC155284.
+"""
+__all__ = ["run_eponine"]
+
+import logging
+import logging.config
+import multiprocessing
+from os import PathLike
+from pathlib import Path
+import re
+import subprocess
+from typing import List
+import argschema
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    check_file,
+    create_dir,
+    check_gtf_content,
+    get_sequence,
+    get_seq_region_length,
+    get_slice_id,
+    slice_output_to_gtf,
+)
+
+logger = logging.getLogger(__name__)
+
+
+
+def run_eponine(
+    genome_file: PathLike,
+    output_dir: Path,
+    num_threads: int = 1,
+    java_bin: Path = Path("java"),
+    eponine_bin: Path = Path(
+        "/hps/software/users/ensembl/ensw/C8-MAR21-sandybridge/linuxbrew/opt/eponine/libexec/eponine-scan.jar"
+    ),
+    eponine_threshold: float = 0.999,
+) -> None:
+    """
+    Run Eponine on genomic slices.
+
+    :param genome_file: Genome file path.
+    :type genome_file: PathLike
+    :param output_dir: Working directory path.
+    :type output_dir: Path
+    :param num_threads: Number of threads.
+    :type num_threads: int, default 1
+    :param java_bin: Java path.
+    :type java_bin: Path, default java
+    :param eponine_bin: Eponine software path.
+    :type eponine_bin: Path
+    :param eponine_threshold: Eponine score threshold.
+    :type eponine_threshold: float, default 0.999
+
+    :return: None
+    :rtype: None
+    """
+    check_file(eponine_bin)
+    check_exe(java_bin)
+    eponine_dir = create_dir(output_dir, "eponine_output")
+    output_file = eponine_dir / "annotation.gtf"
+    if output_file.exists():
+        transcript_count = check_gtf_content(output_file, "simple_feature")
+        if transcript_count > 0:
+            logger.info("Eponine gtf file exists, skipping analysis")
+            return
+    logger.info("Creating list of genomic slices")
+    seq_region_to_length = get_seq_region_length(genome_file, 5000)
+    slice_ids_per_region = get_slice_id(
+        seq_region_to_length, slice_size=1000000, overlap=0, min_length=5000
+    )
+
+    eponine_cmd = [
+        str(java_bin),
+        "-jar",
+        str(eponine_bin),
+        "-threshold",
+        str(eponine_threshold),
+        "-seq",
+    ]
+    logger.info("Running Eponine")
+    pool = multiprocessing.Pool(int(num_threads))  # pylint:disable=consider-using-with
+    for slice_id in slice_ids_per_region:
+        pool.apply_async(
+            _multiprocess_eponine,
+            args=(
+                eponine_cmd,
+                slice_id,
+                eponine_dir,
+                Path(genome_file),
+            ),
+        )
+    pool.close()
+    pool.join()
+    slice_output_to_gtf(eponine_dir, "feature_id", "eponine", True, ".epo.gtf")
+    for gtf_file in eponine_dir.glob("*.epo.gtf"):
+        gtf_file.unlink()
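A minimal usage sketch; the jar location is deployment-specific and the paths below are illustrative assumptions:

    from pathlib import Path

    run_eponine(
        genome_file=Path("genome.fa"),
        output_dir=Path("workdir"),            # writes workdir/eponine_output/annotation.gtf
        num_threads=4,
        java_bin=Path("java"),                 # verified with check_exe()
        eponine_bin=Path("eponine-scan.jar"),  # verified with check_file()
    )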
+
+
+def _multiprocess_eponine(
+    eponine_cmd: List[str],
+    slice_id: List[str],
+    eponine_dir: Path,
+    genome_file: Path,
+) -> None:
+    """
+    Run Eponine on a genomic slice (multiprocessing worker).
+
+    Args:
+        eponine_cmd: Eponine command to execute.
+        slice_id: List of slice IDs.
+        eponine_dir : Eponine output directory path.
+        genome_file : Genome file.
+    """
+    region_name, start, end = slice_id
+    logger.info(
+        "Processing slice to find transcription start sites with Eponine: %s:%s:%s",
+        region_name,
+        start,
+        end,
+    )
+    seq = get_sequence(region_name, int(start), int(end), 1, genome_file, eponine_dir)
+    slice_name = f"{region_name}.rs{start}.re{end}"
+    slice_file = eponine_dir / f"{slice_name}.fa"
+    with open(slice_file, "w+", encoding="utf8") as region_out:
+        region_out.write(f">{region_name}\n{seq}\n")
+    region_results = eponine_dir / f"{slice_name}.epo.gtf"
+    output_file = Path(f"{slice_file}.epo")
+    eponine_cmd = eponine_cmd.copy()
+    eponine_cmd.append(str(slice_file))
+    logger.info("Eponine command: %s", " ".join(eponine_cmd))
+    with open(output_file, "w+", encoding="utf8") as eponine_out:
+        subprocess.run(eponine_cmd, stdout=eponine_out, check=True)
+    _create_eponine_gtf(output_file, region_results, region_name)
+    slice_file.unlink()
+    output_file.unlink()
+
+
+def _create_eponine_gtf(
+    output_file: Path,
+    region_results: Path,
+    region_name: str,
+) -> None:
+    """
+    Read the Eponine output and write the detected features in GTF format.
+    All the genomic slices are collected in a single gtf output.
+
+    Args:
+        output_file: Eponine raw output file.
+        region_results: GTF file with the results per region.
+        region_name: Coordinates of genomic slice.
+    """
+    with open(output_file, "r", encoding="utf8") as eponine_in, open(
+        region_results, "w+", encoding="utf8"
+    ) as eponine_out:
+        feature_count = 1
+        for line in eponine_in:
+            result_match = re.search(r"^" + region_name, line)
+            if result_match:
+                results = line.split()
+                start = int(results[3])
+                end = int(results[4])
+                score = float(results[5])
+                strand = results[6]
+                # There's a one base offset on the reverse strand
+                if strand == "-":
+                    start -= 1
+                    end -= 1
+
+                gtf_line = (
+                    f"{region_name}\tEponine\tsimple_feature\t"
+                    f"{start}\t{end}\t.\t{strand}\t.\t"
+                    f'feature_id "{feature_count}"; score "{score}";\n'
+                )
+                eponine_out.write(gtf_line)
+                feature_count += 1
+
+
+class InputSchema(argschema.ArgSchema):
+    """Input arguments expected to run Eponine."""
+
+    genome_file = argschema.fields.InputFile(
+        required=True, description="Genome file path"
+    )
+    output_dir = argschema.fields.OutputDir(
+        required=True, description="Output directory path"
+    )
+    num_threads = argschema.fields.Integer(
+        required=False, default=1, description="Number of threads"
+    )
+    java_bin = argschema.fields.String(
+        required=False,
+        default="java",
+        description="Java executable path",
+    )
+    eponine_bin = argschema.fields.String(
+        required=False,
+        default="/hps/software/users/ensembl/ensw/C8-MAR21-sandybridge/linuxbrew/opt/eponine/libexec/eponine-scan.jar",  # pylint:disable=line-too-long
+        description="Eponine jar path",
+    )
+    eponine_threshold = argschema.fields.Float(
+        required=False, default=0.999, description="Eponine threshold"
+    )
+
+
+def main() -> None:
+    """Eponine's entry-point."""
+    mod = argschema.ArgSchemaParser(schema_type=InputSchema)
+    log_file_path = create_dir(mod.args["output_dir"], "log") / "eponine.log"
+    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
+    logging.config.fileConfig(
+        loginipath,
+        defaults={"logfilename": str(log_file_path)},
+        disable_existing_loggers=False,
+    )
+    run_eponine(
+        mod.args["genome_file"],
+        mod.args["output_dir"],
+        mod.args["num_threads"],
+        Path(mod.args["java_bin"]),
+        Path(mod.args["eponine_bin"]),
+        mod.args["eponine_threshold"],
+    )
+
+
+if __name__ == "__main__":
+    main()
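Because the entry point is argschema-based, the same module can also be driven programmatically by supplying a dictionary instead of command-line flags; a sketch under the assumption that argschema's standard input_data/args keywords behave as documented:

    import argschema

    mod = argschema.ArgSchemaParser(
        input_data={
            "genome_file": "genome.fa",    # illustrative paths
            "output_dir": "workdir",
            "num_threads": 4,
        },
        schema_type=InputSchema,           # the schema defined above
        args=[],                           # do not read sys.argv
    )
    print(mod.args["eponine_threshold"])   # schema default applied -> 0.999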
\ No newline at end of file
diff --git a/_modules/ensembl/tools/anno/snc_rna_annotation/trnascan.html b/_modules/ensembl/tools/anno/snc_rna_annotation/trnascan.html
new file mode 100644
index 0000000..22caf86
--- /dev/null
+++ b/_modules/ensembl/tools/anno/snc_rna_annotation/trnascan.html
@@ -0,0 +1,405 @@

Source code for ensembl.tools.anno.snc_rna_annotation.trnascan

+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+tRNAscan-SE identifies 99-100% of transfer RNA genes in DNA sequence while
+giving less than one false positive per 15 gigabases.
+Lowe TM, Eddy SR: tRNAscan-SE: a program for improved detection of transfer
+RNA genes in genomic sequence.
+Nucleic Acids Res. 1997, 25(5):955-64. [PMID: 9023104]
+"""
+__all__ = ["run_trnascan"]
+
+import logging
+import logging.config
+import multiprocessing
+from os import PathLike
+from pathlib import Path
+import re
+import subprocess
+from typing import List
+import argschema
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    check_file,
+    create_dir,
+    check_gtf_content,
+    get_seq_region_length,
+    get_slice_id,
+    slice_output_to_gtf,
+    get_sequence,
+)
+
+logger = logging.getLogger(__name__)
+
+
+
+def run_trnascan(
+    genome_file: PathLike,
+    output_dir: Path,
+    trnascan_bin: Path = Path("tRNAscan-SE"),
+    trnascan_filter: Path = Path("EukHighConfidenceFilter"),
+    num_threads: int = 1,
+) -> None:
+    """
+    Execute tRNAscan-SE on genomic slices.
+
+    :param genome_file: Genome file path.
+    :type genome_file: PathLike
+    :param output_dir: Working directory path.
+    :type output_dir: Path
+    :param trnascan_bin: tRNAscan-SE software path.
+    :type trnascan_bin: Path, default tRNAscan-SE
+    :param trnascan_filter: tRNAscan-SE filter set path.
+    :type trnascan_filter: Path, default EukHighConfidenceFilter
+    :param num_threads: Number of threads.
+    :type num_threads: int, default 1
+
+    :return: None
+    :rtype: None
+    """
+    check_exe(trnascan_bin)
+    check_file(trnascan_filter)
+    trnascan_dir = create_dir(output_dir, "trnascan_output")
+    output_file = trnascan_dir / "annotation.gtf"
+    if output_file.exists():
+        transcript_count = check_gtf_content(output_file, "transcript")
+        if transcript_count > 0:
+            logger.info("Trnascan gtf file exists, skipping analysis")
+            return
+    logger.info("Creating list of genomic slices")
+    seq_region_to_length = get_seq_region_length(genome_file, 5000)
+    slice_ids_per_region = get_slice_id(seq_region_to_length, 1000000, 0, 5000)
+    # Placeholders at indices 1, 3 and 5 are filled in per slice by
+    # _multiprocess_trnascan (input fasta, output file, structure file).
+    trnascan_cmd = [
+        str(trnascan_bin),
+        None,
+        "-o",
+        None,
+        "-f",
+        None,
+        "-H",  # show both primary and secondary structure components to covariance model bit scores
+        "-q",  # quiet mode
+        "--detail",
+        "-Q",
+    ]
+    logger.info("Running tRNAscan-SE")
+    pool = multiprocessing.Pool(num_threads)  # pylint: disable=consider-using-with
+    for slice_id in slice_ids_per_region:
+        pool.apply_async(
+            _multiprocess_trnascan,
+            args=(
+                trnascan_cmd,
+                slice_id,
+                genome_file,
+                trnascan_filter,
+                trnascan_dir,
+            ),
+        )
+    pool.close()
+    pool.join()
+    slice_output_to_gtf(
+        output_dir=trnascan_dir, unique_ids=True, file_extension=".trna.gtf"
+    )
+    for gtf_file in trnascan_dir.glob("*.trna.gtf"):
+        gtf_file.unlink()
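A usage sketch; the binary and filter locations are illustrative assumptions:

    from pathlib import Path

    run_trnascan(
        genome_file=Path("genome.fa"),
        output_dir=Path("workdir"),                       # writes workdir/trnascan_output/annotation.gtf
        trnascan_bin=Path("tRNAscan-SE"),                 # verified with check_exe()
        trnascan_filter=Path("EukHighConfidenceFilter"),  # verified with check_file()
        num_threads=4,
    )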
+
+
+def _multiprocess_trnascan(
+    trnascan_cmd: List[str],
+    slice_id: List[str],
+    genome_file: Path,
+    trnascan_filter: Path,
+    trnascan_dir: Path,
+) -> None:
+    """
+    Run tRNAscan-SE on a genomic slice (multiprocessing worker).
+
+    Args:
+        trnascan_cmd: tRNAscan-SE command to execute.
+        slice_id: Slice Id to run tRNAscan-SE on.
+        genome_file : Genome file.
+        trnascan_filter: tRNAscan-SE filter set.
+        trnascan_dir : tRNAscan-SE output dir.
+    """
+    region_name, start, end = slice_id
+    logger.info(
+        "Processing slice to find tRNAs using tRNAscan-SE: %s:%s:%s",
+        region_name,
+        start,
+        end,
+    )
+    seq = get_sequence(region_name, int(start), int(end), 1, genome_file, trnascan_dir)
+    slice_name = f"{region_name}.rs{start}.re{end}"
+    slice_file = trnascan_dir / f"{slice_name}.fa"
+    with open(slice_file, "w+", encoding="utf8") as region_out:
+        region_out.write(f">{region_name}\n{seq}\n")
+    # tRNAscan-SE output
+    region_results = trnascan_dir / f"{slice_name}.trna.gtf"
+    output_file = Path(f"{slice_file}.trna")
+    ss_output_file = Path(f"{output_file}.ss")
+    # filtering
+    filter_prefix_file = f"{slice_name}.filt"
+    filter_output_file = trnascan_dir / f"{filter_prefix_file}.out"
+    filter_log_file = trnascan_dir / f"{filter_prefix_file}.log"
+    filter_ss_file = trnascan_dir / f"{filter_prefix_file}.ss"
+    # Copy the generic command so the shared template is not mutated
+    trnascan_cmd = trnascan_cmd.copy()
+    trnascan_cmd[1], trnascan_cmd[3], trnascan_cmd[5] = (
+        str(slice_file),
+        str(output_file),
+        str(ss_output_file),
+    )
+    logger.info("tRNAscan-SE command: %s", " ".join(trnascan_cmd))
+    subprocess.run(trnascan_cmd, check=True)
+    # If the trnascan output is empty there is no need to go on with filtering
+    if output_file.stat().st_size == 0:
+        output_file.unlink()
+        slice_file.unlink()
+        ss_output_file.unlink(missing_ok=True)
+        return
+
+    filter_cmd = [
+        str(trnascan_filter),
+        "--result",  # tRNAscan-SE output file used as input
+        str(output_file),
+        "--ss",  # tRNAscan-SE secondary structure file used as input
+        str(ss_output_file),
+        "--output",
+        str(trnascan_dir),
+        "--prefix",
+        str(filter_prefix_file),
+    ]
+    logger.info(
+        "tRNAscan-SE filter command: %s", " ".join(str(item) for item in filter_cmd)
+    )
+    subprocess.run(filter_cmd)  # pylint:disable=subprocess-run-check
+    _create_trnascan_gtf(region_results, filter_output_file, region_name)
+    output_file.unlink(missing_ok=True)
+    slice_file.unlink(missing_ok=True)
+    ss_output_file.unlink(missing_ok=True)
+    Path(filter_prefix_file).unlink(missing_ok=True)
+    filter_log_file.unlink(missing_ok=True)
+    filter_ss_file.unlink(missing_ok=True)
+    filter_output_file.unlink(missing_ok=True)
+
+
+def _create_trnascan_gtf(
+    region_results: Path, filter_output_file: Path, region_name: str
+) -> None:
+    """
+    Read the filtered tRNAscan-SE output and write the predicted tRNA genes in GTF format.
+    All the genomic slices are collected in a single gtf output.
+
+    Args:
+        region_results : GTF file with the results per region.
+        filter_output_file : Filtered tRNAscan-SE output used as input.
+        region_name : Coordinates of genomic slice.
+
+    tRNAscan-SE output format:
+        col0: GtRNAdb Gene Symbol - gene ID in corresponding genome
+        col1: tRNAscan-SE ID - tRNA ID in tRNAscan-SE prediction results
+        col2-3: Locus - Genomic coordinates of predicted gene
+        col4: Isotype (from Anticodon) - tRNA isotype determined by anticodon
+        col5: Anticodon - anticodon of predicted tRNA gene
+        col6-7: Intron boundaries
+        col8: General tRNA Model Score - covariance model bit score from tRNAscan-SE results
+        col9: Best Isotype Model - best matching (highest scoring) isotype determined
+            by isotype-specific covariance model classification
+        col10-11-12: Anticodon and Isotype Model Agreement - consistency between anticodon
+            from predicted gene sequence and best isotype model
+        col13: Features - special gene features that may include gene set categorization,
+            number of introns, possible pseudogenes, possible truncation, or base-pair mismatches
+    """
+    with open(filter_output_file, "r", encoding="utf8") as trna_in, open(
+        region_results, "w+", encoding="utf8"
+    ) as trna_out:
+        gene_counter = 1
+        for line in trna_in:
+            result_match = re.search(r"^" + region_name, line)
+            if result_match:
+                results = line.split()
+                start = int(results[2])
+                end = int(results[3])
+                strand = "+"
+                if start > end:
+                    strand = "-"
+                    start, end = end, start
+                biotype = (
+                    "tRNA"
+                    if re.search(r"high confidence set", line)
+                    else "tRNA_pseudogene"
+                )
+                transcript_string = (
+                    f"{region_name}\ttRNAscan\ttranscript\t{start}\t{end}\t.\t"
+                    f'{strand}\t.\tgene_id "{gene_counter}"; transcript_id '
+                    f'"{gene_counter}"; biotype "{biotype}";\n'
+                )
+                exon_string = (
+                    f"{region_name}\ttRNAscan\texon\t{start}\t{end}\t.\t"
+                    f'{strand}\t.\tgene_id "{gene_counter}"; transcript_id '
+                    f'"{gene_counter}"; exon_number "1"; biotype "{biotype}";\n'
+                )
+                trna_out.write(transcript_string)
+                trna_out.write(exon_string)
+                trna_out.flush()
+                gene_counter += 1
+
+
+class InputSchema(argschema.ArgSchema):
+    """Input arguments expected to run tRNAscan-SE."""
+
+    genome_file = argschema.fields.InputFile(
+        required=True, description="Genome file path"
+    )
+    trnascan_bin = argschema.fields.String(
+        required=False,
+        default="tRNAscan-SE",
+        description="tRNAscan-SE executable path",
+    )
+    trnascan_filter = argschema.fields.String(
+        required=False,
+        default="/hps/software/users/ensembl/ensw/C8-MAR21-sandybridge/linuxbrew/bin/EukHighConfidenceFilter",
+        description="tRNAscan-SE filter path",
+    )
+    output_dir = argschema.fields.OutputDir(
+        required=True, description="Output directory path"
+    )
+    num_threads = argschema.fields.Integer(
+        required=False, default=1, description="Number of threads"
+    )
+
+
+def main() -> None:
+    """tRNAscan-SE's entry-point."""
+    mod = argschema.ArgSchemaParser(schema_type=InputSchema)
+    log_file_path = create_dir(mod.args["output_dir"], "log") / "trnascan.log"
+    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
+    logging.config.fileConfig(
+        loginipath,
+        defaults={"logfilename": str(log_file_path)},
+        disable_existing_loggers=False,
+    )
+    run_trnascan(
+        mod.args["genome_file"],
+        mod.args["output_dir"],
+        mod.args["trnascan_bin"],
+        Path(mod.args["trnascan_filter"]),
+        mod.args["num_threads"],
+    )
+
+
+if __name__ == "__main__":
+    main()
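The coordinate handling in _create_trnascan_gtf relies on tRNAscan-SE reporting start > end for minus-strand genes; a worked illustration with invented coordinates:

    start, end = 5300, 5210   # as reported for a reverse-strand tRNA
    strand = "+"
    if start > end:
        strand = "-"
        start, end = end, start
    print(start, end, strand)  # 5210 5300 -  (GTF requires start <= end)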
\ No newline at end of file
diff --git a/_modules/ensembl/tools/anno/transcriptomic_annotation/minimap.html b/_modules/ensembl/tools/anno/transcriptomic_annotation/minimap.html
new file mode 100644
index 0000000..4d2c94f
--- /dev/null
+++ b/_modules/ensembl/tools/anno/transcriptomic_annotation/minimap.html
@@ -0,0 +1,370 @@

Source code for ensembl.tools.anno.transcriptomic_annotation.minimap

+# See the NOTICE file distributed with this work for additional information #pylint: disable=missing-module-docstring
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Minimap2 is a pairwise sequence alignment algorithm designed for efficiently comparing nucleotide sequences.
+The algorithm uses a versatile indexing strategy to quickly find approximate matches between sequences, 
+allowing it to efficiently align long sequences against reference genomes or other sequences.
+
+Li, H. (2018). Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics, 34(18), 3094-3100.
+"""
+
+__all__ = ["run_minimap2"]
+import logging
+import logging.config
+from pathlib import Path
+import subprocess
+from typing import List
+import argschema
+
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+    check_gtf_content,
+)
+
+
+
+def run_minimap2(
+    output_dir: Path,
+    long_read_fastq_dir: Path,
+    genome_file: Path,
+    minimap2_bin: Path = Path("minimap2"),
+    paftools_bin: Path = Path("paftools.js"),
+    max_intron_length: int = 100000,
+    num_threads: int = 1,
+) -> None:
+    """
+    Run Minimap2 to align long read data against the genome file.
+    The default Minimap2 settings are tuned for PacBio data.
+
+    :param output_dir: Working directory path.
+    :type output_dir: Path
+    :param long_read_fastq_dir: Long read directory path.
+    :type long_read_fastq_dir: Path
+    :param genome_file: Genome file path.
+    :type genome_file: Path
+    :param minimap2_bin: Software path.
+    :type minimap2_bin: Path, default minimap2
+    :param paftools_bin: Software path.
+    :type paftools_bin: Path, default paftools.js
+    :param max_intron_length: The maximum intron size for alignments. Defaults to 100000.
+    :type max_intron_length: int, default 100000
+    :param num_threads: Number of available threads.
+    :type num_threads: int, default 1
+
+    :return: None
+    :rtype: None
+    """
+    check_exe(minimap2_bin)
+    check_exe(paftools_bin)
+    minimap2_dir = create_dir(output_dir, "minimap2_output")
+
+    logging.info("Skip analysis if the gtf file already exists")
+    output_file = minimap2_dir / "annotation.gtf"
+    if output_file.exists():
+        transcript_count = check_gtf_content(output_file, "transcript")
+        if transcript_count > 0:
+            logging.info("Minimap2 gtf file exists, skipping analysis")
+            return
+    minimap2_index_file = minimap2_dir / f"{Path(genome_file).name}.mmi"
+    file_types = ("*.fastq", "*.fq")
+    fastq_file_list = [
+        path for file_type in file_types for path in Path(long_read_fastq_dir).rglob(file_type)
+    ]
+    if len(fastq_file_list) == 0:
+        raise IndexError(f"The list of fastq files is empty. Fastq dir:\n{long_read_fastq_dir}")
+
+    if not minimap2_index_file.exists():
+        logging.info("Did not find an index file for minimap2. Will create now")
+        try:
+            subprocess.run(
+                [
+                    minimap2_bin,
+                    "-t",
+                    str(num_threads),
+                    "-d",
+                    str(minimap2_index_file),
+                    genome_file,
+                ],
+                check=True,
+            )
+        except subprocess.CalledProcessError as e:
+            logging.error("An error occurred while creating minimap2 index: %s", e)
+        except OSError as e:
+            logging.error("An OS error occurred: %s", e)
+
+    logging.info("Running minimap2 on the files in the long read fastq dir")
+    for fastq_file in fastq_file_list:
+        sam_file = minimap2_dir / f"{fastq_file.name}.sam"
+        bed_file = minimap2_dir / f"{fastq_file.name}.bed"
+        logging.info("Processing %s", fastq_file)
+        with open(bed_file, "w+", encoding="utf8") as bed_file_out:
+            subprocess.run(
+                [
+                    minimap2_bin,
+                    "-G",
+                    str(max_intron_length),
+                    "-t",
+                    str(num_threads),
+                    "--cs",
+                    "--secondary=no",
+                    "-ax",
+                    "splice",
+                    "-u",
+                    "b",
+                    minimap2_index_file,
+                    fastq_file,
+                    "-o",
+                    sam_file,
+                ],
+                check=True,
+            )
+            logging.info("Creating bed file from SAM")
+            subprocess.run([paftools_bin, "splice2bed", sam_file], stdout=bed_file_out, check=True)
+
+    _bed_to_gtf(minimap2_dir)
+
+    logging.info("Completed running minimap2")
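A usage sketch with illustrative paths (paftools.js ships with minimap2):

    from pathlib import Path

    run_minimap2(
        output_dir=Path("workdir"),              # writes workdir/minimap2_output/annotation.gtf
        long_read_fastq_dir=Path("long_reads"),  # must contain *.fastq or *.fq files
        genome_file=Path("genome.fa"),           # indexed to genome.fa.mmi on first run
        max_intron_length=100000,
        num_threads=4,
    )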
+
+
+def _bed_to_gtf(output_dir: Path) -> None:
+    """
+    Convert bed files into a single gtf file.
+
+    Args:
+        output_dir : Working directory path.
+    """
+    gtf_file_path = output_dir / "annotation.gtf"
+    with open(gtf_file_path, "w+", encoding="utf8") as gtf_out:
+        gene_id = 1
+        for bed_file in output_dir.glob("*.bed"):
+            logging.info("Converting bed to GTF: %s", str(bed_file))
+            with open(bed_file, "r", encoding="utf8") as bed_in:
+                for line in bed_in:
+                    elements = line.rstrip().split("\t")
+                    seq_region_name = elements[0]
+                    offset = int(elements[1])
+                    strand = elements[5]
+                    # sizes and starts of the individual exon blocks
+                    block_sizes = [size for size in elements[10].split(",") if size]
+                    block_starts = [start for start in elements[11].split(",") if start]
+                    exons = _bed_block_to_exons(block_sizes, block_starts, offset)
+                    transcript_start = None
+                    transcript_end = None
+                    exon_records = []
+                    for i, exon_coords in enumerate(exons):
+                        if transcript_start is None or exon_coords[0] < transcript_start:
+                            transcript_start = exon_coords[0]
+                        if transcript_end is None or exon_coords[1] > transcript_end:
+                            transcript_end = exon_coords[1]
+                        exon_line = (
+                            f"{seq_region_name}\tminimap\texon\t{exon_coords[0]}\t"
+                            f"{exon_coords[1]}\t.\t{strand}\t.\t"
+                            f'gene_id "minimap_{gene_id}"; transcript_id "minimap_{gene_id}"; '
+                            f'exon_number "{i + 1}";\n'
+                        )
+                        exon_records.append(exon_line)
+                    transcript_line = (
+                        f"{seq_region_name}\tminimap\ttranscript\t{transcript_start}\t"
+                        f"{transcript_end}\t.\t{strand}\t.\t"
+                        f'gene_id "minimap_{gene_id}"; transcript_id "minimap_{gene_id}";\n'
+                    )
+                    gtf_out.write(transcript_line)
+                    for exon_line in exon_records:
+                        gtf_out.write(exon_line)
+                    gene_id += 1
+
+
+def _bed_block_to_exons(block_sizes: List, block_starts: List, offset: int) -> List:
+    """
+    Extract exon size and start from an exon feature block.
+
+    Args:
+        block_sizes : Block feature size.
+        block_starts : Block feature starts.
+        offset : Feature offset.
+
+    Returns:
+        List of exon coordinates
+    """
+    exons = []
+    for i, _ in enumerate(block_sizes):
+        block_start = offset + int(block_starts[i]) + 1
+        block_end = block_start + int(block_sizes[i]) - 1
+        if block_end < block_start:
+            logging.warning("Warning: block end is less than block start, skipping exon")
+            continue
+        exon_coords = [block_start, block_end]
+        exons.append(exon_coords)
+    return exons
+
+
+class InputSchema(argschema.ArgSchema):
+    """Input arguments expected to run Minimap2 software."""
+
+    output_dir = argschema.fields.OutputDir(required=True, description="Output directory path")
+    long_read_fastq_dir = argschema.fields.String(
+        required=True,
+        description="Long read directory path",
+    )
+    genome_file = argschema.fields.InputFile(required=True, description="Genome file path")
+    minimap2_bin = argschema.fields.String(
+        required=False,
+        default="minimap2",
+        description="Minimap2 software path",
+    )
+    paftools_bin = argschema.fields.String(
+        required=False,
+        default="paftools.js",
+        description="Paftools software path",
+    )
+    max_intron_length = argschema.fields.Integer(
+        required=False,
+        default=100000,
+        description="The maximum intron size for alignments.",
+    )
+    num_threads = argschema.fields.Integer(required=False, default=1, description="Number of threads")
+
+
+def main() -> None:
+    """Minimap2's entry-point."""
+    mod = argschema.ArgSchemaParser(schema_type=InputSchema)
+    log_file_path = create_dir(mod.args["output_dir"], "log") / "minimap.log"
+    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
+    logging.config.fileConfig(
+        loginipath,
+        defaults={"logfilename": str(log_file_path)},
+        disable_existing_loggers=False,
+    )
+    run_minimap2(
+        mod.args["output_dir"],
+        mod.args["long_read_fastq_dir"],
+        mod.args["genome_file"],
+        mod.args["minimap2_bin"],
+        mod.args["paftools_bin"],
+        mod.args["max_intron_length"],
+        mod.args["num_threads"],
+    )
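A worked example of the BED-to-GTF coordinate conversion implemented in _bed_block_to_exons: BED blocks are 0-based and relative to the feature start, so the + 1 converts them to 1-based, inclusive GTF coordinates.

    exons = _bed_block_to_exons(
        block_sizes=["100", "200"],   # exon lengths from BED column 11
        block_starts=["0", "500"],    # offsets from BED column 12
        offset=1000,                  # feature start from BED column 2
    )
    print(exons)  # [[1001, 1100], [1501, 1700]]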
\ No newline at end of file
diff --git a/_modules/ensembl/tools/anno/transcriptomic_annotation/scallop.html b/_modules/ensembl/tools/anno/transcriptomic_annotation/scallop.html
new file mode 100644
index 0000000..a483fbf
--- /dev/null
+++ b/_modules/ensembl/tools/anno/transcriptomic_annotation/scallop.html
@@ -0,0 +1,313 @@

Source code for ensembl.tools.anno.transcriptomic_annotation.scallop

+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Scallop is a high-performance tool designed for the accurate and efficient assembly
+of transcripts from RNA-seq alignments.
+It is capable of handling large-scale transcriptomic data while providing precise
+estimates of transcript abundances.
+Scallop's algorithmic approach allows it to efficiently reconstruct transcript structures 
+and quantify their expression levels, making it a valuable resource for studying gene 
+expression and transcriptome analysis.
+
+Shao M, Kingsford C. Accurate assembly of transcripts through phase-preserving graph 
+decomposition. Nat Biotechnol.
+2017 Dec;35(12):1167-1169. doi: 10.1038/nbt.4020. Epub 2017 Nov 13. PMID: 29131147; PMCID: PMC5722698.
+"""
+
+__all__ = ["run_scallop"]
+import logging
+import logging.config
+from pathlib import Path
+import re
+import subprocess
+import argschema
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+    check_gtf_content,
+)
+
+
+
+def run_scallop(
+    output_dir: Path,
+    scallop_bin: Path = Path("scallop"),
+    prlimit_bin: Path = Path("prlimit"),
+    stringtie_bin: Path = Path("stringtie"),
+    memory_limit: int = 40 * 1024**3,
+) -> None:
+    """
+    Run the Scallop assembler on short read data after STAR alignment.
+
+    :param output_dir: Working directory path.
+    :type output_dir: Path
+    :param scallop_bin: Software path.
+    :type scallop_bin: Path, default scallop
+    :param prlimit_bin: Software path.
+    :type prlimit_bin: Path, default prlimit
+    :param stringtie_bin: Software path.
+    :type stringtie_bin: Path, default stringtie
+    :param memory_limit: Memory limit for the Scallop command. Defaults to 40*1024**3.
+    :type memory_limit: int
+
+    :return: None
+    :rtype: None
+    """
+    check_exe(scallop_bin)
+    check_exe(stringtie_bin)
+    scallop_dir = create_dir(output_dir, "scallop_output")
+    logging.info("Skip analysis if the gtf file already exists")
+    output_file = scallop_dir / "annotation.gtf"
+    if output_file.exists():
+        transcript_count = check_gtf_content(output_file, "transcript")
+        if transcript_count > 0:
+            logging.info("Scallop gtf file exists, skipping analysis")
+            return
+
+    star_dir = Path(f"{output_dir}/star_output")
+
+    if star_dir.exists() and len(list(star_dir.glob("*.bam"))) != 0:
+        for sorted_bam_file in star_dir.glob("*.bam"):
+            transcript_file_name = re.sub(".bam", ".scallop.gtf", sorted_bam_file.name)
+            transcript_file = scallop_dir / transcript_file_name
+            if transcript_file.exists():
+                logging.info(
+                    "Found an existing scallop gtf file, will not overwrite. \
+                    File found: %s",
+                    transcript_file,
+                )
+            else:
+                logging.info("Running Scallop on: %s", sorted_bam_file.name)
+                try:
+                    scallop_cmd = [
+                        scallop_bin,
+                        "-i",
+                        sorted_bam_file,
+                        "-o",
+                        transcript_file,
+                        "--min_flank_length",
+                        "10",
+                    ]
+                    if memory_limit is not None:
+                        scallop_cmd = _prlimit_command(prlimit_bin, scallop_cmd, memory_limit)
+                    subprocess.check_output(scallop_cmd, stderr=subprocess.STDOUT, universal_newlines=True)
+                    # This combines the standard output and error streams into a single
+                    # string and ensures that the output is in text mode
+                except subprocess.CalledProcessError as ex:
+                    logging.error("Error occurred while running Scallop:")
+                    logging.error("Command: %s\n", " ".join(str(item) for item in scallop_cmd))
+                    logging.error("Return code: %s\n", str(ex.returncode))
+                    logging.error("Output and error messages: %s\n", ex.output)
+    else:
+        raise IndexError(f"The list of sorted bam files is empty, Star output dir: {star_dir}")
+
+    # Now merge the per-sample assemblies
+    logging.info("Merging Scallop's output.")
+    _scallop_merge(scallop_dir, stringtie_bin)
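A usage sketch; it assumes STAR has already written sorted BAM files under workdir/star_output, which run_scallop discovers by globbing *.bam (paths are illustrative):

    from pathlib import Path

    run_scallop(
        output_dir=Path("workdir"),       # writes workdir/scallop_output/annotation.gtf
        scallop_bin=Path("scallop"),
        prlimit_bin=Path("prlimit"),
        stringtie_bin=Path("stringtie"),  # used only for the final --merge step
        memory_limit=40 * 1024**3,        # 40 GB address-space cap via prlimit
    )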
+
+
+def _scallop_merge(scallop_dir: Path, stringtie_bin: Path = Path("stringtie")) -> None:
+    """
+    Merge the Scallop results in a single gtf file.
+
+    Args:
+        scallop_dir : Input directory's path.
+        stringtie_bin : Software path.
+    """
+    scallop_input_to_file = scallop_dir / "scallop_assemblies.txt"
+    scallop_merge_output_file = scallop_dir / "annotation.gtf"
+    with open(scallop_input_to_file, "w+", encoding="utf8") as gtf_list_out:
+        for gtf_file in scallop_dir.glob("*.scallop.gtf"):
+            transcript_count = check_gtf_content(gtf_file, "transcript")
+            if transcript_count > 0:
+                gtf_list_out.write(str(gtf_file) + "\n")
+            else:
+                logging.warning("Warning, skipping file with no transcripts. Path:%s\n", gtf_file)
+
+    try:
+        subprocess.check_output(
+            [
+                stringtie_bin,
+                "--merge",
+                "-o",
+                scallop_merge_output_file,
+                scallop_input_to_file,
+            ],
+            stderr=subprocess.STDOUT,
+            text=True,
+        )
+    except subprocess.CalledProcessError as e:
+        logging.error("StringTie execution failed with an error: %s", e.output)
+
+
+def _prlimit_command(prlimit_bin, command_list, virtual_memory_limit) -> list:
+    """
+    Prepend memory limiting arguments to a command list to be run with subprocess.
+
+    This method uses the `prlimit` program to set the memory limit.
+
+    The `virtual_memory_limit` size is in bytes.
+
+    prlimit arguments:
+    -v, --as[=limits]
+        Address space limit.
+    """
+    return [str(prlimit_bin), f"-v{virtual_memory_limit}"] + command_list
+
+
+class InputSchema(argschema.ArgSchema):
+    """Input arguments expected to run Scallop software."""
+
+    output_dir = argschema.fields.OutputDir(required=True, description="Output directory path")
+    scallop_bin = argschema.fields.String(
+        required=False,
+        default="scallop",
+        description="Scallop software path",
+    )
+    stringtie_bin = argschema.fields.String(
+        required=False,
+        default="stringtie",
+        description="StringTie software path",
+    )
+    prlimit_bin = argschema.fields.String(
+        required=False,
+        default="prlimit",
+        description="Prlimit software path",
+    )
+    memory_limit = argschema.fields.Integer(
+        required=False, default=40 * 1024**3, description="Memory limit for the Scallop command"
+    )
+
+
+def main() -> None:
+    """Scallop's entry-point. :no-index:"""
+    mod = argschema.ArgSchemaParser(schema_type=InputSchema)
+    log_file_path = create_dir(mod.args["output_dir"], "log") / "scallop.log"
+    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
+    logging.config.fileConfig(
+        loginipath,
+        defaults={"logfilename": str(log_file_path)},
+        disable_existing_loggers=False,
+    )
+    run_scallop(
+        mod.args["output_dir"],
+        mod.args["scallop_bin"],
+        mod.args["prlimit_bin"],
+        mod.args["stringtie_bin"],
+        mod.args["memory_limit"],
+    )
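What _prlimit_command produces, on illustrative inputs:

    cmd = _prlimit_command("prlimit", ["scallop", "-i", "in.bam"], 40 * 1024**3)
    print(cmd)  # ['prlimit', '-v42949672960', 'scallop', '-i', 'in.bam']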
\ No newline at end of file
diff --git a/_modules/ensembl/tools/anno/transcriptomic_annotation/star.html b/_modules/ensembl/tools/anno/transcriptomic_annotation/star.html
new file mode 100644
index 0000000..9783956
--- /dev/null
+++ b/_modules/ensembl/tools/anno/transcriptomic_annotation/star.html
@@ -0,0 +1,732 @@

Source code for ensembl.tools.anno.transcriptomic_annotation.star

+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+The STAR (Spliced Transcripts Alignment to a Reference) alignment tool is widely used
+in genomics research for aligning RNA-seq data to a reference genome.
+Dobin A, Davis CA, Schlesinger F, et al. STAR: ultrafast universal RNA-seq aligner.
+Bioinformatics. 2013;29(1):15-21. doi:10.1093/bioinformatics/bts635
+"""
+
+__all__ = ["run_star"]
+import logging
+import logging.config
+import gzip
+import math
+import multiprocessing
+from pathlib import Path
+import random
+import re
+import shutil
+import subprocess
+from typing import List
+import argschema
+
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+    check_gtf_content,
+    get_seq_region_length,
+)
+
+
+
+def run_star(
+    genome_file: Path,
+    output_dir: Path,
+    short_read_fastq_dir: Path,
+    delete_pre_trim_fastq: bool = False,
+    trim_fastq: bool = False,
+    max_reads_per_sample: int = 0,
+    max_intron_length: int = 100000,
+    num_threads: int = 1,
+    star_bin: Path = Path("star"),
+    samtools_bin: Path = Path("samtools"),
+    trim_galore_bin: Path = Path("trim_galore"),
+) -> None:
+    """
+    Run STAR alignment on a list of short read data.
+
+    :param genome_file: Genome file path.
+    :type genome_file: Path
+    :param output_dir: Working directory path.
+    :type output_dir: Path
+    :param short_read_fastq_dir: Short read directory path.
+    :type short_read_fastq_dir: Path
+    :param delete_pre_trim_fastq: Delete the original fastq files after trimming. Defaults to False.
+    :type delete_pre_trim_fastq: boolean, default False
+    :param trim_fastq: Trim short read files using TrimGalore. Defaults to False.
+    :type trim_fastq: boolean, default False
+    :param max_reads_per_sample: Max number of reads per sample. Defaults to 0 (unlimited).
+    :type max_reads_per_sample: int, default 0
+    :param max_intron_length: The maximum intron size for alignments. Defaults to 100000.
+    :type max_intron_length: int, default 100000
+    :param num_threads: Number of available threads.
+    :type num_threads: int, default 1
+    :param star_bin: Software path.
+    :type star_bin: Path, default star
+    :param samtools_bin: Software path.
+    :type samtools_bin: Path, default samtools
+    :param trim_galore_bin: Software path.
+    :type trim_galore_bin: Path, default trim_galore
+
+    :return: None
+    :rtype: None
+    """
+    check_exe(star_bin)
+    # If trimming has been enabled then switch the path for
+    # short_read_fastq_dir from the original location to the trimmed fastq dir
+    if trim_fastq:
+        run_trimming(output_dir, short_read_fastq_dir, delete_pre_trim_fastq, num_threads, trim_galore_bin)
+        short_read_fastq_dir = output_dir / "trim_galore_output"
+
+    star_dir = create_dir(output_dir, "star_output")
+
+    for output_file in [
+        Path(f"{output_dir}/stringtie_output/annotation.gtf"),
+        Path(f"{output_dir}/scallop_output/annotation.gtf"),
+    ]:
+        if output_file.exists():
+            transcript_count = check_gtf_content(output_file, "transcript")
+            if transcript_count > 0:
+                logging.info("Transcriptomic alignment exists")
+                return
+
+    star_index_file = star_dir / "SAindex"
+    file_types = ("*.fastq", "*.fq", "*.fastq.gz", "*.fq.gz")
+    fastq_file_list = [
+        path for file_type in file_types for path in Path(short_read_fastq_dir).rglob(file_type)
+    ]
+    if len(fastq_file_list) == 0:
+        raise IndexError(f"The list of fastq files is empty. Fastq dir:\n{short_read_fastq_dir}")
+
+    # Get list of paired paths
+    fastq_file_list = _create_paired_paths(fastq_file_list)
+    # Subsample in parallel if there's a value set
+    if max_reads_per_sample:
+        subsample_transcriptomic_data(fastq_file_list, num_threads)
+        # Re-pair the files found on disk after subsampling
+        fastq_file_list = _create_paired_paths(
+            [path for file_type in file_types for path in Path(short_read_fastq_dir).rglob(file_type)]
+        )
+
+    if not star_index_file.exists():
+        logging.info("Did not find an index file for Star. Will create now")
+        seq_region_to_length = get_seq_region_length(genome_file, 0)
+        genome_size = sum(seq_region_to_length.values())
+        # This calculates the base-2 logarithm of the genome_size. The logarithm of the genome size is
+        # a measure of how many bits are needed to represent the genome size in binary.
+        #
+        # The choice of 14 as the maximum value is likely based on empirical observations and optimization
+        # considerations. Too large of a seed length can lead to increased memory usage and potentially
+        # slower indexing, while a seed length that is too small might affect alignment accuracy.
+        index_bases = min(14, math.floor((math.log(genome_size, 2) / 2) - 1))
+        try:
+            subprocess.run(
+                [
+                    str(star_bin),
+                    "--runThreadN",
+                    str(num_threads),
+                    "--runMode",
+                    "genomeGenerate",
+                    "--outFileNamePrefix",
+                    f"{star_dir}/",
+                    "--genomeDir",
+                    str(star_dir),
+                    "--genomeSAindexNbases",
+                    str(index_bases),
+                    "--genomeFastaFiles",
+                    str(genome_file),
+                ],
+                check=True,
+            )
+        except (subprocess.CalledProcessError, OSError) as e:
+            logging.error("An error occurred while creating star index: %s", e)
+
+    logging.info("Running Star on the files in the fastq dir")
+    for fastq_file in (path for paired_files in fastq_file_list for path in paired_files):
+        star_tmp_dir = star_dir / "tmp"
+        if star_tmp_dir.exists():
+            shutil.rmtree(star_tmp_dir)
+        sam_file = Path(f"{star_dir}/{fastq_file.name}.sam")
+        junctions_file = Path(f"{star_dir}/{fastq_file.name}.sj.tab")
+        sam_file_name = sam_file.name
+        sam_temp_file = Path(f"{star_dir}/{sam_file_name}.tmp")
+        bam_file = re.sub(".sam", ".bam", sam_file_name)
+        bam_sort_file = Path(f"{star_dir}/{bam_file}")
+        log_out_file = Path(f"{star_dir}/{fastq_file.name}.Log.final.out")
+        if log_out_file.exists() and bam_sort_file.exists() and bam_sort_file.stat().st_size != 0:
+            logging.info(
+                "Found an existing bam file for the fastq file, \
+                presuming the file has been processed, will skip"
+            )
+            continue
+
+        logging.info("Processing %s", fastq_file)
+        star_command = [
+            str(star_bin),
+            "--outFilterIntronMotifs",
+            "RemoveNoncanonicalUnannotated",
+            "--outSAMstrandField",
+            "intronMotif",
+            "--runThreadN",
+            str(num_threads),
+            "--twopassMode",
+            "Basic",
+            "--runMode",
+            "alignReads",
+            "--genomeDir",
+            str(star_dir),
+            "--readFilesIn",
+            str(fastq_file),
+            "--outFileNamePrefix",
+            f"{star_dir}/",
+            "--outTmpDir",
+            str(star_tmp_dir),
+            "--outSAMtype",
+            "SAM",
+            "--alignIntronMax",
+            str(max_intron_length),
+        ]
+        if fastq_file.suffix == ".gz":
+            star_command.append("--readFilesCommand")
+            star_command.append("gunzip")
+            star_command.append("-c")
+        subprocess.run(star_command, check=True)
+        shutil.move(Path(f"{star_dir}/Aligned.out.sam"), sam_file)
+        shutil.move(Path(f"{star_dir}/SJ.out.tab"), junctions_file)
+        logging.info("Converting samfile into sorted bam file. Bam file: %s", bam_file)
+        subprocess.run(
+            [
+                str(samtools_bin),
+                "sort",
+                "-@",
+                str(num_threads),
+                "-T",
+                str(sam_temp_file),
+                "-o",
+                str(bam_sort_file),
+                str(sam_file),
+            ],
+            check=True,
+        )
+        shutil.move(star_dir / "Log.final.out", log_out_file)
+        sam_file.unlink()
+    logging.info("Completed running STAR")
+
+
+def _create_paired_paths(fastq_file_paths: List) -> List[List[Path]]:
+    """
+    Create list of paired transcriptomic fastq files.
+
+    Args:
+        fastq_file_paths (List): List of transcriptomic file paths.
+
+    Returns:
+        List: List of groups of paired (or single) transcriptomic files.
+    """
+    path_dict = {}
+    for fastq_file in fastq_file_paths:
+        paired_name = re.search(r"(.+)_\d+\.(fastq|fq)", str(fastq_file))
+        if not paired_name:
+            logging.warning(
+                "Could not find _1 or _2 at the end of the prefix \
+                for file. Assuming file is not paired: %s",
+                fastq_file,
+            )
+            path_dict[fastq_file] = [fastq_file]
+            continue
+        run_accession = paired_name.group(1)
+        if run_accession in path_dict:
+            path_dict[run_accession].append(fastq_file)
+        else:
+            path_dict[run_accession] = [fastq_file]
+    logging.info(list(path_dict.values()))
+    return list(path_dict.values())
+
+
+# pylint:disable=pointless-string-statement
+"""
+For an advanced and optimised subsampling we could use
+https://github.com/lh3/seqtk
+"""
+
+
+def _subsample_paired_fastq_files(
+    fastq_files: List[Path],
+    subsample_read_limit: int = 100000000,
+    num_threads: int = 2,
+    compressed: bool = False,
+) -> None:
+    """
+    Perform subsampling on paired (or single) FastQ files in parallel using multiple threads.
+
+    Args:
+        fastq_files : Paths of the paired fastq files.
+        subsample_read_limit : Subsample size, defaults to 100000000.
+        num_threads : Number of threads, defaults to 2.
+        compressed : Whether the files are compressed, defaults to False.
+    """
+    if len(fastq_files) == 2:
+        fastq_file_1, fastq_file_2 = fastq_files
+        output_file_1, output_file_2 = [Path(f"{fastq_file_1}.sub"), Path(f"{fastq_file_2}.sub")]
+    elif len(fastq_files) == 1:
+        fastq_file_1 = fastq_files[0]
+        output_file_1 = Path(f"{fastq_file_1}.sub")
+    else:
+        raise FileNotFoundError("No fastq file found")
+
+    if fastq_file_1.suffix == ".gz":
+        compressed = True
+        num_lines = sum(1 for line in gzip.open(fastq_file_1))  # pylint:disable=consider-using-with
+    else:
+        num_lines = sum(1 for line in open(fastq_file_1))  # pylint:disable=consider-using-with
+
+    range_limit = int(num_lines / 4)
+    if range_limit <= subsample_read_limit:
+        logging.info(
+            "Number of reads (%s) is less than the max allowed read count (%s), \
+            no need to subsample",
+            str(range_limit),
+            str(subsample_read_limit),
+        )
+        return
+
+    rand_list = random.sample(range(0, range_limit - 1), subsample_read_limit)
+    random_indices = {idx * 4: 1 for idx in rand_list}
+    logging.info("Processing paired files in parallel")
+    pool = multiprocessing.Pool(int(num_threads))  # pylint:disable=consider-using-with
+    pool.apply_async(
+        _subsample_fastq_subset,
+        args=(
+            fastq_file_1,
+            output_file_1,
+            random_indices,
+            compressed,
+        ),
+    )
+    if len(fastq_files) == 2:
+        pool.apply_async(
+            _subsample_fastq_subset,
+            args=(
+                fastq_file_2,
+                output_file_2,
+                random_indices,
+                compressed,
+            ),
+        )
+    pool.close()
+    pool.join()
+
+
+def _subsample_fastq_subset(
+    fastq_file: Path, output_file: Path, random_indices: dict, compressed: bool
+) -> None:
+    """
+    Select specific sets of four lines from an input FastQ file and write them to an output file.
+
+    Args:
+        fastq_file : Path for the fastq file.
+        output_file : Path for the output file.
+        random_indices : Set of random indices.
+        compressed : Whether the input file is gzip-compressed.
+    """
+    line_index = 0
+
+    with gzip.open(fastq_file, "rt") if compressed else open(fastq_file) as file_in, open(
+        output_file, "w+"
+    ) as file_out:
+        lines = [file_in.readline() for _ in range(4)]
+        while lines[3]:  # This ensures that the loop continues until the end of the input file.
+            if line_index in random_indices:
+                file_out.writelines(lines)
+            line_index += 4
+            lines = [file_in.readline() for _ in range(4)]
+
+
+def subsample_transcriptomic_data(fastq_file_list: List, num_threads: int = 2) -> None:
+    """
+    Subsample paired fastq files.
+
+    Args:
+        fastq_file_list : List of fastq file groups (one or two files each) to process.
+        num_threads : Number of threads.
+    """
+    for fastq_files in fastq_file_list:
+        if len(fastq_files) == 1:
+            fastq_file_1 = fastq_files[0]
+            if Path(f"{fastq_file_1}.sub").exists():
+                logging.info(
+                    "Found an existing .sub file on the fastq path, will use that instead. File:%s.sub",
+                    fastq_file_1,
+                )
+            else:
+                _subsample_paired_fastq_files(fastq_files, num_threads=num_threads)
+        elif len(fastq_files) == 2:
+            fastq_file_1, fastq_file_2 = fastq_files
+            if Path(f"{fastq_file_1}.sub").exists() and Path(f"{fastq_file_2}.sub").exists():
+                logging.info(
+                    "Found existing .sub files on the fastq path for both members of the pair, will use \
+                    those instead of subsampling again. Files: %s.sub,%s.sub",
+                    fastq_file_1,
+                    fastq_file_2,
+                )
+            else:
+                _subsample_paired_fastq_files(fastq_files, num_threads=num_threads)
+
+
+def run_trimming(
+    output_dir: Path,
+    short_read_fastq_dir: Path,
+    delete_pre_trim_fastq: bool = False,
+    num_threads: int = 1,
+    trim_galore_bin="trim_galore",
+) -> None:
+    """
+    Trim a list of short read fastq files.
+
+    Args:
+        output_dir : Working directory path.
+        short_read_fastq_dir : Short read directory path.
+        delete_pre_trim_fastq : Remove the original fastq files after trimming. Defaults to False.
+        num_threads : Number of threads.
+        trim_galore_bin : Software path.
+ """ + check_exe(trim_galore_bin) + trim_dir = create_dir(output_dir, "trim_galore_output") + + fastq_file_list = [] + file_types = ("*.fastq", "*.fq", "*.fastq.gz", "*.fq.gz") + fastq_file_list = [ + path for file_type in file_types for path in Path(short_read_fastq_dir).rglob(file_type) + ] + fastq_file_list = _create_paired_paths(fastq_file_list) + + trim_galore_cmd = [ + str(trim_galore_bin), + "--illumina", + "--quality", + "20", + "--length", + "50", + "--output_dir", + str(trim_dir), + ] + + pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with + for fastq_file in fastq_file_list: + if delete_pre_trim_fastq: + fastq_file.unlink() + pool.apply_async( + multiprocess_trim_galore, + args=( + trim_galore_cmd, + fastq_file, + trim_dir, + ), + ) + + pool.close() + pool.join() + + trimmed_fastq_list = trim_dir.glob("*.fq.gz") + + for trimmed_fastq_path in trimmed_fastq_list: + logging.info("Trimmed file path: %s", str(trimmed_fastq_path)) + sub_patterns = re.compile(r"|".join(("_val_1.fq", "_val_2.fq", "_trimmed.fq"))) + updated_file_path_name = sub_patterns.sub(".fq", trimmed_fastq_path.name) + updated_file_path = short_read_fastq_dir / updated_file_path_name + logging.info("Updated file path: %s", str(updated_file_path)) + trimmed_fastq_path.rename(updated_file_path) + + files_to_delete_list : List[Path] = [] + for file_type in file_types: + files_to_delete_list.extend(short_read_fastq_dir.glob(file_type)) + + for file_to_delete in files_to_delete_list: + file_to_delete.unlink() + + +def multiprocess_trim_galore(trim_galore_cmd: List, fastq_paired_files: List[Path]) -> None: + """ + Trim short paired or single short read fastq file. + Args: + trim_galore_cmd : Generic command. + fastq_paired_files : List of single or paired fastq files. 
+ """ + + fastq_file = fastq_paired_files[0] + fastq_file_pair = None + + if len(fastq_paired_files) == 2: + fastq_file, fastq_file_pair = fastq_paired_files + trim_galore_cmd.append("--paired") + trim_galore_cmd.append(fastq_file) + trim_galore_cmd.append(fastq_file_pair) + elif len(fastq_paired_files) == 1: + trim_galore_cmd.append(fastq_paired_files) + + logging.info("Running Trim Galore with the following command: %s", {" ".join(trim_galore_cmd)}) + subprocess.run(trim_galore_cmd, check=True) + + +class InputSchema(argschema.ArgSchema): + """Input arguments expected to run STAR software.""" + + genome_file = argschema.fields.InputFile(required=True, description="Genome file path") + output_dir = argschema.fields.OutputDir(required=True, description="Output directory path") + short_read_fastq_dir = argschema.fields.String( + required=True, + description="Short read directory path", + ) + delete_pre_trim_fastq = argschema.fields.Bool( + required=False, + default=False, + description="Delete the original fastq files after trimming", + ) + trim_fastq = argschema.fields.Bool( + required=False, + default=False, + description="Trim the short read files using Trim Galore", + ) + max_reads_per_sample = argschema.fields.Integer( + required=False, + default="0", + description="The maximum number of reads to use per sample.", + ) + max_intron_length = argschema.fields.Integer( + required=False, + default="100000", + description="The maximum intron size for alignments.", + ) + num_threads = argschema.fields.Integer(required=False, default=1, description="Number of threads") + star_bin = argschema.fields.String( + required=False, + default="STAR", + description="Star software path", + ) + samtools_bin = argschema.fields.String( + required=False, + default="samtools", + description="Samtools software path", + ) + trim_galore_bin = argschema.fields.String( + required=False, + default="trim_galore", + description="Trim Galore software path", + ) + + +def main() -> None: + """STAR's entry-point.""" + mod = argschema.ArgSchemaParser(schema_type=InputSchema) + log_file_path = create_dir(mod.args["output_dir"], "log") / "star.log" + loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" + logging.config.fileConfig( + loginipath, + defaults={"logfilename": str(log_file_path)}, + disable_existing_loggers=False, + ) + run_star( + mod.args["genome_file"], + mod.args["output_dir"], + mod.args["short_read_fastq_dir"], + mod.args["delete_pre_trim_fastq"], + mod.args["trim_fastq"], + mod.args["max_reads_per_sample"], + mod.args["max_intron_length"], + mod.args["num_threads"], + mod.args["star_bin"], + mod.args["samtools_bin"], + mod.args["trim_galore_bin"], + ) + + +# pylint:disable=pointless-string-statement +""" +def model_builder(work_dir): + + star_output_dir = os.path.join(work_dir, "star_output") + + all_junctions_file = os.path.join(star_output_dir, "all_junctions.sj") + sjf_out = open(all_junctions_file, "w+") + + for sj_tab_file in glob.glob(input_dir + "/*.sj.tab"): + sjf_in = open(sj_tab_file) + sjf_lines = sjf_in.readlines() + for line in sjf_lines: + elements = line.split("\t") + strand = "+" + + # my $slice_name = $eles[0]; + # my $start = $eles[1]; + # my $end = $eles[2]; + # my $strand = $eles[3]; + + # If the strand is undefined then skip, Augustus expects a strand + if elements[3] == "0": + continue + elif elements[3] == "2": + strand = "-" + + junction_length = int(elements[2]) - int(elements[1]) + 1 + if junction_length < 100: + continue + + if not elements[4] and elements[7] < 10: + 
continue + + # For the moment treat multimapping and single + # mapping things as a combined score + score = float(elements[6]) + float(elements[7]) + score = str(score) + output_line = [ + elements[0], + "RNASEQ", + "intron", + elements[1], + elements[2], + score, + strand, + ".", + ("src=W;mul=" + score + ";"), + ] + sjf_out.write("\t".join(output_line) + "\n") + + sjf_out.close() +""" +
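+# A minimal invocation sketch, kept as a comment so the module itself is
+# unchanged. Paths are hypothetical, and it assumes run_star's parameters
+# follow the names and order of the schema fields and the main() call above:
+#
+# from ensembl.tools.anno.transcriptomic_annotation.star import run_star
+#
+# run_star(
+#     "genome.fa", "work/", "fastq/",
+#     delete_pre_trim_fastq=False, trim_fastq=True,
+#     max_reads_per_sample=0, max_intron_length=100000,
+#     num_threads=8, star_bin="STAR", samtools_bin="samtools",
+#     trim_galore_bin="trim_galore",
+# )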
+ + + + + \ No newline at end of file diff --git a/_modules/ensembl/tools/anno/transcriptomic_annotation/stringtie.html b/_modules/ensembl/tools/anno/transcriptomic_annotation/stringtie.html new file mode 100644 index 0000000..5c4a899 --- /dev/null +++ b/_modules/ensembl/tools/anno/transcriptomic_annotation/stringtie.html @@ -0,0 +1,257 @@ + + + + + + + ensembl.tools.anno.transcriptomic_annotation.stringtie — ensembl-anno 0.1 documentation + + + + + + + + + + + +

Source code for ensembl.tools.anno.transcriptomic_annotation.stringtie

+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+StringTie is a fast and highly efficient assembler of RNA-Seq alignments into potential transcripts.
+It uses a novel network flow algorithm as well as an optional de novo assembly step to assemble and
+quantitate full-length transcripts representing multiple splice variants for each gene locus.
+Pertea M, Pertea GM, Antonescu CM, Chang TC, Mendell JT & Salzberg SL. StringTie enables improved
+reconstruction of a transcriptome from RNA-seq reads. Nature Biotechnology 2015, doi:10.1038/nbt.3122
+"""
+
+__all__ = ["run_stringtie"]
+import logging
+import logging.config
+from pathlib import Path
+import re
+import subprocess
+import argschema
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+    check_gtf_content,
+)
+
+
+
+def run_stringtie(
+    output_dir: Path,
+    stringtie_bin: Path = Path("stringtie"),
+    num_threads: int = 1,
+) -> None:
+    """
+    StringTie assembler of short read data.
+
+    :param output_dir: Working directory path.
+    :type output_dir: Path
+    :param stringtie_bin: Software path.
+    :type stringtie_bin: Path, default stringtie
+    :param num_threads: Number of available threads.
+    :type num_threads: int, default 1
+
+    :return: None
+    :rtype: None
+    """
+    check_exe(stringtie_bin)
+    stringtie_dir = create_dir(output_dir, "stringtie_output")
+    # Skip the analysis if a non-empty gtf file already exists
+    output_file = stringtie_dir / "annotation.gtf"
+    if output_file.exists():
+        transcript_count = check_gtf_content(output_file, "transcript")
+        if transcript_count > 0:
+            logging.info("Stringtie gtf file exists, skipping analysis")
+            return
+
+    stringtie_merge_input_file = stringtie_dir / "stringtie_assemblies.txt"
+    stringtie_merge_output_file = stringtie_dir / "annotation.gtf"
+    star_dir = output_dir / "star_output"
+
+    if star_dir.exists() and len(list(star_dir.glob("*.bam"))) != 0:
+        for sorted_bam_file in star_dir.glob("*.bam"):
+            transcript_file_name = re.sub(r"\.bam$", ".stringtie.gtf", sorted_bam_file.name)
+            transcript_file = stringtie_dir / transcript_file_name
+            if transcript_file.exists():
+                logging.info(
+                    "Found an existing stringtie gtf file, will not overwrite. File found: %s",
+                    transcript_file,
+                )
+            else:
+                logging.info("Running Stringtie on: %s", sorted_bam_file.name)
+                try:
+                    subprocess.check_output(
+                        [
+                            stringtie_bin,
+                            sorted_bam_file,
+                            "-o",
+                            transcript_file,
+                            "-p",
+                            str(num_threads),
+                            "-t",  # disable trimming of predicted transcripts based on coverage
+                            "-a",  # minimum anchor length for junctions
+                            "15",
+                        ]
+                    )
+                except subprocess.CalledProcessError as e:
+                    logging.error("Error running Stringtie command: %s", e)
+                    logging.error("Return code: %s", str(e.returncode))
+                    logging.error("Output and error messages: %s\n", e.output)
+    else:
+        raise IndexError(f"The list of sorted bam files is empty; STAR output dir: {star_dir}")
+
+    logging.info("Creating Stringtie merge input file: %s", stringtie_merge_input_file)
+    with open(stringtie_merge_input_file, "w+", encoding="utf8") as gtf_list_out:
+        for gtf_file in stringtie_dir.glob("*.stringtie.gtf"):
+            transcript_count = check_gtf_content(gtf_file, "transcript")
+            if transcript_count > 0:
+                gtf_list_out.write(f"{gtf_file}\n")
+            else:
+                logging.warning("Warning, skipping file with no transcripts. Path: %s", gtf_file)
+    logging.info("Merging Stringtie results.")
+    try:
+        subprocess.run(
+            [
+                stringtie_bin,
+                "--merge",
+                "-o",
+                stringtie_merge_output_file,
+                stringtie_merge_input_file,
+            ],
+            check=True,
+        )
+    except subprocess.CalledProcessError as e:
+        logging.error("Error running Stringtie merging command: %s", e)
+ + +class InputSchema(argschema.ArgSchema): + """Input arguments expected to run StringTie software.""" + + output_dir = argschema.fields.OutputDir(required=True, description="Output directory path") + stringtie_bin = argschema.fields.String( + required=False, + default="stringtie", + description="StringTie software path", + ) + num_threads = argschema.fields.Integer(required=False, default=1, description="Number of threads") + + +def main() -> None: + """StringTie's entry-point.""" + mod = argschema.ArgSchemaParser(schema_type=InputSchema) + log_file_path = create_dir(mod.args["output_dir"], "log") / "stringtie.log" + loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" + logging.config.fileConfig( + loginipath, + defaults={"logfilename": str(log_file_path)}, + disable_existing_loggers=False, + ) + run_stringtie( + mod.args["output_dir"], + mod.args["stringtie_bin"], + mod.args["num_threads"], + ) +
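+
+# A minimal invocation sketch, kept as a comment so the module itself is
+# unchanged; paths are hypothetical. run_stringtie expects `output_dir` to
+# already contain a `star_output/` directory of sorted BAM files:
+#
+# from pathlib import Path
+# from ensembl.tools.anno.transcriptomic_annotation.stringtie import run_stringtie
+#
+# run_stringtie(Path("/path/to/workdir"), Path("stringtie"), num_threads=4)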
+ + + + + \ No newline at end of file diff --git a/_modules/index.html b/_modules/index.html new file mode 100644 index 0000000..8777d03 --- /dev/null +++ b/_modules/index.html @@ -0,0 +1,117 @@ + + + + + + + Overview: module code — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+ + + + + \ No newline at end of file diff --git a/_sources/cpg.rst.txt b/_sources/cpg.rst.txt new file mode 100644 index 0000000..8603218 --- /dev/null +++ b/_sources/cpg.rst.txt @@ -0,0 +1,8 @@ +CpG Module Documentation +============================== + +.. automodule:: ensembl.tools.anno.simple_feature_annotation.cpg + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/dust.rst.txt b/_sources/dust.rst.txt new file mode 100644 index 0000000..f1a98b4 --- /dev/null +++ b/_sources/dust.rst.txt @@ -0,0 +1,8 @@ +DustMasker Module Documentation +=================================== + +.. automodule:: ensembl.tools.anno.repeat_annotation.dust + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/eponine.rst.txt b/_sources/eponine.rst.txt new file mode 100644 index 0000000..e460382 --- /dev/null +++ b/_sources/eponine.rst.txt @@ -0,0 +1,8 @@ +Eponine Module Documentation +============================== + +.. automodule:: ensembl.tools.anno.simple_feature_annotation.eponine + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/genblast.rst.txt b/_sources/genblast.rst.txt new file mode 100644 index 0000000..401f411 --- /dev/null +++ b/_sources/genblast.rst.txt @@ -0,0 +1,8 @@ +Genblast Module Documentation +============================== + +.. automodule:: ensembl.tools.anno.protein_annotation.genblast + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/index.rst.txt b/_sources/index.rst.txt new file mode 100644 index 0000000..add9008 --- /dev/null +++ b/_sources/index.rst.txt @@ -0,0 +1,58 @@ +.. See the NOTICE file distributed with this work for additional information + regarding copyright ownership. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. ensembl-anno documentation master file, created by + sphinx-quickstart on Fri Sep 1 12:25:36 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +======================================== +Ensembl-anno +======================================== + +Anno tool kit + + +Contents +-------- +Check out :ref:`installation ` section for further information on how +to install the project. + +.. toctree:: + :maxdepth: 1 + :caption: Index + + install + license + + cpg + dust + eponine + genblast + minimap + red + repeatmasker + scallop + star + stringtie + trf + trnascan + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` \ No newline at end of file diff --git a/_sources/install.rst.txt b/_sources/install.rst.txt new file mode 100644 index 0000000..997e6c0 --- /dev/null +++ b/_sources/install.rst.txt @@ -0,0 +1,54 @@ +.. See the NOTICE file distributed with this work for additional information + regarding copyright ownership. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

+API Setup and installation
+===========================
+
+Requirements
+--------------
+
+.. _install:
+
+An Ensembl API checkout including:
+
+- ensembl-production `ensembl-production <https://github.com/Ensembl/ensembl-production>`_.
+- ensembl-analysis `ensembl-analysis <https://github.com/Ensembl/ensembl-analysis>`_ (on dev/hive_master branch).
+- ensembl-taxonomy `ensembl-taxonomy <https://github.com/Ensembl/ensembl-taxonomy>`_.
+- ensembl-orm `ensembl-orm <https://github.com/Ensembl/ensembl-orm>`_.
+
+Software
+^^^^^^^^
+
+#. Python 3.8+
+#. Bioperl 1.6.9+
+
+Python Modules
+^^^^^^^^^^^^^^
+#. argschema
+
+
+Installation
+------------
+Directly from GitHub:
+
+.. code-block:: none
+   :linenos:
+
+   git clone https://github.com/Ensembl/ensembl-analysis -b experimental/gbiab
+   git clone https://github.com/Ensembl/ensembl-production
+   git clone https://github.com/Ensembl/ensembl-hive
+   git clone https://github.com/Ensembl/ensembl-taxonomy
+   git clone https://github.com/Ensembl/ensembl-orm
\ No newline at end of file
diff --git a/_sources/license.rst.txt b/_sources/license.rst.txt
new file mode 100644
index 0000000..9e9b2fe
--- /dev/null
+++ b/_sources/license.rst.txt
@@ -0,0 +1,203 @@
+License
+-------
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/_sources/minimap.rst.txt b/_sources/minimap.rst.txt new file mode 100644 index 0000000..bf3a45c --- /dev/null +++ b/_sources/minimap.rst.txt @@ -0,0 +1,8 @@ +Minimap2 Module Documentation +============================== + +.. automodule:: ensembl.tools.anno.transcriptomic_annotation.minimap + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/red.rst.txt b/_sources/red.rst.txt new file mode 100644 index 0000000..26743cd --- /dev/null +++ b/_sources/red.rst.txt @@ -0,0 +1,8 @@ +Red Module Documentation +============================== + +.. automodule:: ensembl.tools.anno.repeat_annotation.red + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/repeatmasker.rst.txt b/_sources/repeatmasker.rst.txt new file mode 100644 index 0000000..dd07f72 --- /dev/null +++ b/_sources/repeatmasker.rst.txt @@ -0,0 +1,8 @@ +Repeatmasker Module Documentation +==================================== + +.. 
automodule:: ensembl.tools.anno.repeat_annotation.repeatmasker + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/scallop.rst.txt b/_sources/scallop.rst.txt new file mode 100644 index 0000000..744055b --- /dev/null +++ b/_sources/scallop.rst.txt @@ -0,0 +1,8 @@ +Scallop Module Documentation +============================== + +.. automodule:: ensembl.tools.anno.transcriptomic_annotation.scallop + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/star.rst.txt b/_sources/star.rst.txt new file mode 100644 index 0000000..d83c66b --- /dev/null +++ b/_sources/star.rst.txt @@ -0,0 +1,8 @@ +STAR Module Documentation +============================== + +.. automodule:: ensembl.tools.anno.transcriptomic_annotation.star + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/stringtie.rst.txt b/_sources/stringtie.rst.txt new file mode 100644 index 0000000..980dcbd --- /dev/null +++ b/_sources/stringtie.rst.txt @@ -0,0 +1,8 @@ +Stringtie Module Documentation +================================== + +.. automodule:: ensembl.tools.anno.transcriptomic_annotation.stringtie + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/trf.rst.txt b/_sources/trf.rst.txt new file mode 100644 index 0000000..9268f3c --- /dev/null +++ b/_sources/trf.rst.txt @@ -0,0 +1,8 @@ +TRF Module Documentation +============================== + +.. automodule:: ensembl.tools.anno.repeat_annotation.trf + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/trnascan.rst.txt b/_sources/trnascan.rst.txt new file mode 100644 index 0000000..6193545 --- /dev/null +++ b/_sources/trnascan.rst.txt @@ -0,0 +1,8 @@ +tRNAscan-SE Module Documentation +=================================== + +.. automodule:: ensembl.tools.anno.snc_rna_annotation.trnascan + :members: + :undoc-members: + :show-inheritance: + diff --git a/_static/agogo.css b/_static/agogo.css new file mode 100644 index 0000000..401127b --- /dev/null +++ b/_static/agogo.css @@ -0,0 +1,549 @@ +/* + * agogo.css_t + * ~~~~~~~~~~~ + * + * Sphinx stylesheet -- agogo theme. + * + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. 
+ * + */ + +* { + margin: 0px; + padding: 0px; +} + +body { + font-family: Garamond, Arial, serif; + line-height: 1.4em; + color: black; + background-color: #009999; +} + + +/* Page layout */ + +div.header, div.content, div.footer { + width: 70em; + margin-left: auto; + margin-right: auto; +} + +div.header-wrapper { + background: #009999; + border-bottom: 3px solid #2e3436; +} + + +/* Default body styles */ +a { + color: green; +} + +div.bodywrapper a, div.footer a { + text-decoration: underline; +} + +.clearer { + clear: both; +} + +.left { + float: left; +} + +.right { + float: right; +} + +.line-block { + display: block; + margin-top: 1em; + margin-bottom: 1em; +} + +.line-block .line-block { + margin-top: 0; + margin-bottom: 0; + margin-left: 1.5em; +} + +h1, h2, h3, h4 { + font-family: Arial, Helvetica, serif; + font-weight: normal; + color: #3465a4; + margin-bottom: .8em; +} + +h1 { + color: #204a87; +} + +h2 { + padding-bottom: .5em; + border-bottom: 1px solid #3465a4; +} + +a.headerlink { + visibility: hidden; + color: #dddddd; + padding-left: .3em; +} + +h1:hover > a.headerlink, +h2:hover > a.headerlink, +h3:hover > a.headerlink, +h4:hover > a.headerlink, +h5:hover > a.headerlink, +h6:hover > a.headerlink, +dt:hover > a.headerlink, +caption:hover > a.headerlink, +p.caption:hover > a.headerlink, +div.code-block-caption:hover > a.headerlink { + visibility: visible; +} + +img { + border: 0; +} + +div.admonition { + margin-top: 10px; + margin-bottom: 10px; + padding: 2px 7px 1px 7px; + border-left: 0.2em solid black; +} + +p.admonition-title { + margin: 0px 10px 5px 0px; + font-weight: bold; +} + +dt:target, .highlighted { + background-color: #fbe54e; +} + +/* Header */ + +div.header { + padding-top: 10px; + padding-bottom: 10px; +} + +div.header .headertitle { + font-family: Arial, Helvetica, serif; + font-weight: normal; + font-size: 180%; + letter-spacing: .08em; + margin-bottom: .8em; +} + +div.header .headertitle a { + color: white; +} + +div.header div.rel { + margin-top: 1em; +} + +div.header div.rel a { + color: #33d6ff; + letter-spacing: .1em; + text-transform: uppercase; +} + +p.logo { + float: right; +} + +img.logo { + border: 0; +} + + +/* Content */ +div.content-wrapper { + background-color: white; + padding-top: 20px; + padding-bottom: 20px; +} + +div.document { + width: 50em; + float: left; +} + +div.body { + padding-right: 2em; + text-align: justify; +} + +div.document h1 { + line-height: 120%; +} + +div.document ul { + margin: 1.5em; + list-style-type: square; +} + +div.document dd { + margin-left: 1.2em; + margin-top: .4em; + margin-bottom: 1em; +} + +div.document .section { + margin-top: 1.7em; +} +div.document .section:first-child { + margin-top: 0px; +} + +div.document div.highlight { + padding: 3px; + border-top: 2px solid #dddddd; + border-bottom: 2px solid #dddddd; + margin-top: .8em; + margin-bottom: .8em; +} + +div.document div.literal-block-wrapper { + margin-top: .8em; + margin-bottom: .8em; +} + +div.document div.literal-block-wrapper div.highlight { + margin: 0; +} + +div.document div.code-block-caption span.caption-number { + padding: 0.1em 0.3em; + font-style: italic; +} + +div.document div.code-block-caption span.caption-text { +} + +div.document h2 { + margin-top: .7em; +} + +div.document p { + margin-bottom: .5em; +} + +div.document li.toctree-l1 { + margin-bottom: 1em; +} + +div.document .descname { + font-weight: bold; +} + +div.document .sig-paren { + font-size: larger; +} + +div.document .docutils.literal { + background-color: #eeeeec; + padding: 
1px; +} + +div.document .docutils.xref.literal { + background-color: transparent; + padding: 0px; +} + +div.document blockquote { + margin: 1em; +} + +div.document ol { + margin: 1.5em; +} + + +/* Sidebar */ + +div.sidebar, +aside.sidebar { + width: 20em; + float: right; + font-size: .9em; +} + +div.sidebar a, aside.sidebar a, div.header a { + text-decoration: none; +} + +div.sidebar a:hover, aside.sidebar a:hover, div.header a:hover { + text-decoration: underline; +} + +div.sidebar h3, +aside.sidebar h3 { + color: #2e3436; + text-transform: uppercase; + font-size: 130%; + letter-spacing: .1em; +} + +div.sidebar ul, +aside.sidebar ul { + list-style-type: none; +} + +div.sidebar li.toctree-l1 a, +aside.sidebar li.toctree-l1 a { + display: block; + padding: 1px; + border: 1px solid #dddddd; + background-color: #eeeeec; + margin-bottom: .4em; + padding-left: 3px; + color: #2e3436; +} + +div.sidebar li.toctree-l2 a, +aside.sidebar li.toctree-l2 a { + background-color: transparent; + border: none; + margin-left: 1em; + border-bottom: 1px solid #dddddd; +} + +div.sidebar li.toctree-l3 a, +aside.sidebar li.toctree-l3 a { + background-color: transparent; + border: none; + margin-left: 2em; + border-bottom: 1px solid #dddddd; +} + +div.sidebar li.toctree-l2:last-child a, +aside.sidebar li.toctree-l2:last-child a { + border-bottom: none; +} + +div.sidebar li.toctree-l1.current a, +aside.sidebar li.toctree-l1.current a { + border-right: 5px solid #33d6ff; +} + +div.sidebar li.toctree-l1.current li.toctree-l2 a, +aside.sidebar li.toctree-l1.current li.toctree-l2 a { + border-right: none; +} + +div.sidebar input[type="text"], +aside.sidebar input[type="text"] { + width: 170px; +} + +div.sidebar input[type="submit"], +aside.sidebar input[type="submit"] { + width: 30px; +} + + +/* Footer */ + +div.footer-wrapper { + background: #e6fff9; + border-top: 4px solid #babdb6; + padding-top: 10px; + padding-bottom: 10px; + min-height: 80px; +} + +div.footer, div.footer a { + color: #888a85; +} + +div.footer .right { + text-align: right; +} + +div.footer .left { + text-transform: uppercase; +} + + +/* Styles copied from basic theme */ + +img.align-left, figure.align-left, .figure.align-left, object.align-left { + clear: left; + float: left; + margin-right: 1em; +} + +img.align-right, figure.align-right, .figure.align-right, object.align-right { + clear: right; + float: right; + margin-left: 1em; +} + +img.align-center, figure.align-center, .figure.align-center, object.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +img.align-default, figure.align-default, .figure.align-default { + display: block; + margin-left: auto; + margin-right: auto; +} + +.align-left { + text-align: left; +} + +.align-center { + text-align: center; +} + +.align-right { + text-align: right; +} + +table caption span.caption-number { + font-style: italic; +} + +table caption span.caption-text { +} + +div.figure p.caption span.caption-number, +figcaption span.caption-number { + font-style: italic; +} + +div.figure p.caption span.caption-text, +figcaption span.caption-text { +} + +/* -- search page ----------------------------------------------------------- */ + +ul.search { + margin: 10px 0 0 20px; + padding: 0; +} + +ul.search li { + padding: 5px 0 5px 20px; + background-image: url(file.png); + background-repeat: no-repeat; + background-position: 0 7px; +} + +ul.search li a { + font-weight: bold; +} + +ul.search li div.context { + color: #888; + margin: 2px 0 0 30px; + text-align: left; +} + +ul.keywordmatches 
li.goodmatch a { + font-weight: bold; +} + +/* -- index page ------------------------------------------------------------ */ + +table.contentstable { + width: 90%; +} + +table.contentstable p.biglink { + line-height: 150%; +} + +a.biglink { + font-size: 1.3em; +} + +span.linkdescr { + font-style: italic; + padding-top: 5px; + font-size: 90%; +} + +/* -- general index --------------------------------------------------------- */ + +table.indextable td { + text-align: left; + vertical-align: top; +} + +table.indextable ul { + margin-top: 0; + margin-bottom: 0; + list-style-type: none; +} + +table.indextable > tbody > tr > td > ul { + padding-left: 0em; +} + +table.indextable tr.pcap { + height: 10px; +} + +table.indextable tr.cap { + margin-top: 10px; + background-color: #f2f2f2; +} + +img.toggler { + margin-right: 3px; + margin-top: 3px; + cursor: pointer; +} + +/* -- domain module index --------------------------------------------------- */ + +table.modindextable td { + padding: 2px; + border-collapse: collapse; +} + +/* -- viewcode extension ---------------------------------------------------- */ + +.viewcode-link { + float: right; +} + +.viewcode-back { + float: right; + font-family:: Garamond, Arial, serif; +} + +div.viewcode-block:target { + margin: -1px -3px; + padding: 0 3px; + background-color: #f4debf; + border-top: 1px solid #ac9; + border-bottom: 1px solid #ac9; +} + +div.code-block-caption { + background-color: #ddd; + color: #333; + padding: 2px 5px; + font-size: small; +} + +/* -- math display ---------------------------------------------------------- */ + +div.body div.math p { + text-align: center; +} + +span.eqno { + float: right; +} \ No newline at end of file diff --git a/_static/basic.css b/_static/basic.css new file mode 100644 index 0000000..a5cba42 --- /dev/null +++ b/_static/basic.css @@ -0,0 +1,921 @@ +/* + * basic.css + * ~~~~~~~~~ + * + * Sphinx stylesheet -- basic theme. + * + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. 
+ * + */ + +/* -- main layout ----------------------------------------------------------- */ + +div.clearer { + clear: both; +} + +div.section::after { + display: block; + content: ''; + clear: left; +} + +/* -- relbar ---------------------------------------------------------------- */ + +div.related { + width: 100%; + font-size: 90%; +} + +div.related h3 { + display: none; +} + +div.related ul { + margin: 0; + padding: 0 0 0 10px; + list-style: none; +} + +div.related li { + display: inline; +} + +div.related li.right { + float: right; + margin-right: 5px; +} + +/* -- sidebar --------------------------------------------------------------- */ + +div.sphinxsidebarwrapper { + padding: 10px 5px 0 10px; +} + +div.sphinxsidebar { + float: left; + width: 20em; + margin-left: -100%; + font-size: 90%; + word-wrap: break-word; + overflow-wrap : break-word; +} + +div.sphinxsidebar ul { + list-style: none; +} + +div.sphinxsidebar ul ul, +div.sphinxsidebar ul.want-points { + margin-left: 20px; + list-style: square; +} + +div.sphinxsidebar ul ul { + margin-top: 0; + margin-bottom: 0; +} + +div.sphinxsidebar form { + margin-top: 10px; +} + +div.sphinxsidebar input { + border: 1px solid #98dbcc; + font-family: sans-serif; + font-size: 1em; +} + +div.sphinxsidebar #searchbox form.search { + overflow: hidden; +} + +div.sphinxsidebar #searchbox input[type="text"] { + float: left; + width: 80%; + padding: 0.25em; + box-sizing: border-box; +} + +div.sphinxsidebar #searchbox input[type="submit"] { + float: left; + width: 20%; + border-left: none; + padding: 0.25em; + box-sizing: border-box; +} + + +img { + border: 0; + max-width: 100%; +} + +/* -- search page ----------------------------------------------------------- */ + +ul.search { + margin: 10px 0 0 20px; + padding: 0; +} + +ul.search li { + padding: 5px 0 5px 20px; + background-image: url(file.png); + background-repeat: no-repeat; + background-position: 0 7px; +} + +ul.search li a { + font-weight: bold; +} + +ul.search li p.context { + color: #888; + margin: 2px 0 0 30px; + text-align: left; +} + +ul.keywordmatches li.goodmatch a { + font-weight: bold; +} + +/* -- index page ------------------------------------------------------------ */ + +table.contentstable { + width: 90%; + margin-left: auto; + margin-right: auto; +} + +table.contentstable p.biglink { + line-height: 150%; +} + +a.biglink { + font-size: 1.3em; +} + +span.linkdescr { + font-style: italic; + padding-top: 5px; + font-size: 90%; +} + +/* -- general index --------------------------------------------------------- */ + +table.indextable { + width: 100%; +} + +table.indextable td { + text-align: left; + vertical-align: top; +} + +table.indextable ul { + margin-top: 0; + margin-bottom: 0; + list-style-type: none; +} + +table.indextable > tbody > tr > td > ul { + padding-left: 0em; +} + +table.indextable tr.pcap { + height: 10px; +} + +table.indextable tr.cap { + margin-top: 10px; + background-color: #f2f2f2; +} + +img.toggler { + margin-right: 3px; + margin-top: 3px; + cursor: pointer; +} + +div.modindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +div.genindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +/* -- domain module index --------------------------------------------------- */ + +table.modindextable td { + padding: 2px; + border-collapse: collapse; +} + +/* -- general body styles --------------------------------------------------- */ + 
+div.body { + min-width: 360px; + max-width: 800px; +} + +div.body p, div.body dd, div.body li, div.body blockquote { + -moz-hyphens: auto; + -ms-hyphens: auto; + -webkit-hyphens: auto; + hyphens: auto; +} + +a.headerlink { + visibility: hidden; +} + +h1:hover > a.headerlink, +h2:hover > a.headerlink, +h3:hover > a.headerlink, +h4:hover > a.headerlink, +h5:hover > a.headerlink, +h6:hover > a.headerlink, +dt:hover > a.headerlink, +caption:hover > a.headerlink, +p.caption:hover > a.headerlink, +div.code-block-caption:hover > a.headerlink { + visibility: visible; +} + +div.body p.caption { + text-align: inherit; +} + +div.body td { + text-align: left; +} + +.first { + margin-top: 0 !important; +} + +p.rubric { + margin-top: 30px; + font-weight: bold; +} + +img.align-left, figure.align-left, .figure.align-left, object.align-left { + clear: left; + float: left; + margin-right: 1em; +} + +img.align-right, figure.align-right, .figure.align-right, object.align-right { + clear: right; + float: right; + margin-left: 1em; +} + +img.align-center, figure.align-center, .figure.align-center, object.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +img.align-default, figure.align-default, .figure.align-default { + display: block; + margin-left: auto; + margin-right: auto; +} + +.align-left { + text-align: left; +} + +.align-center { + text-align: center; +} + +.align-default { + text-align: center; +} + +.align-right { + text-align: right; +} + +/* -- sidebars -------------------------------------------------------------- */ + +div.sidebar, +aside.sidebar { + margin: 0 0 0.5em 1em; + border: 1px solid #ddb; + padding: 7px; + background-color: #ffe; + width: 40%; + float: right; + clear: right; + overflow-x: auto; +} + +p.sidebar-title { + font-weight: bold; +} + +nav.contents, +aside.topic, +div.admonition, div.topic, blockquote { + clear: left; +} + +/* -- topics ---------------------------------------------------------------- */ + +nav.contents, +aside.topic, +div.topic { + border: 1px solid #ccc; + padding: 7px; + margin: 10px 0 10px 0; +} + +p.topic-title { + font-size: 1.1em; + font-weight: bold; + margin-top: 10px; +} + +/* -- admonitions ----------------------------------------------------------- */ + +div.admonition { + margin-top: 10px; + margin-bottom: 10px; + padding: 7px; +} + +div.admonition dt { + font-weight: bold; +} + +p.admonition-title { + margin: 0px 10px 5px 0px; + font-weight: bold; +} + +div.body p.centered { + text-align: center; + margin-top: 25px; +} + +/* -- content of sidebars/topics/admonitions -------------------------------- */ + +div.sidebar > :last-child, +aside.sidebar > :last-child, +nav.contents > :last-child, +aside.topic > :last-child, +div.topic > :last-child, +div.admonition > :last-child { + margin-bottom: 0; +} + +div.sidebar::after, +aside.sidebar::after, +nav.contents::after, +aside.topic::after, +div.topic::after, +div.admonition::after, +blockquote::after { + display: block; + content: ''; + clear: both; +} + +/* -- tables ---------------------------------------------------------------- */ + +table.docutils { + margin-top: 10px; + margin-bottom: 10px; + border: 0; + border-collapse: collapse; +} + +table.align-center { + margin-left: auto; + margin-right: auto; +} + +table.align-default { + margin-left: auto; + margin-right: auto; +} + +table caption span.caption-number { + font-style: italic; +} + +table caption span.caption-text { +} + +table.docutils td, table.docutils th { + padding: 1px 8px 1px 5px; + border-top: 0; + 
border-left: 0; + border-right: 0; + border-bottom: 1px solid #aaa; +} + +th { + text-align: left; + padding-right: 5px; +} + +table.citation { + border-left: solid 1px gray; + margin-left: 1px; +} + +table.citation td { + border-bottom: none; +} + +th > :first-child, +td > :first-child { + margin-top: 0px; +} + +th > :last-child, +td > :last-child { + margin-bottom: 0px; +} + +/* -- figures --------------------------------------------------------------- */ + +div.figure, figure { + margin: 0.5em; + padding: 0.5em; +} + +div.figure p.caption, figcaption { + padding: 0.3em; +} + +div.figure p.caption span.caption-number, +figcaption span.caption-number { + font-style: italic; +} + +div.figure p.caption span.caption-text, +figcaption span.caption-text { +} + +/* -- field list styles ----------------------------------------------------- */ + +table.field-list td, table.field-list th { + border: 0 !important; +} + +.field-list ul { + margin: 0; + padding-left: 1em; +} + +.field-list p { + margin: 0; +} + +.field-name { + -moz-hyphens: manual; + -ms-hyphens: manual; + -webkit-hyphens: manual; + hyphens: manual; +} + +/* -- hlist styles ---------------------------------------------------------- */ + +table.hlist { + margin: 1em 0; +} + +table.hlist td { + vertical-align: top; +} + +/* -- object description styles --------------------------------------------- */ + +.sig { + font-family: 'Consolas', 'Menlo', 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', monospace; +} + +.sig-name, code.descname { + background-color: transparent; + font-weight: bold; +} + +.sig-name { + font-size: 1.1em; +} + +code.descname { + font-size: 1.2em; +} + +.sig-prename, code.descclassname { + background-color: transparent; +} + +.optional { + font-size: 1.3em; +} + +.sig-paren { + font-size: larger; +} + +.sig-param.n { + font-style: italic; +} + +/* C++ specific styling */ + +.sig-inline.c-texpr, +.sig-inline.cpp-texpr { + font-family: unset; +} + +.sig.c .k, .sig.c .kt, +.sig.cpp .k, .sig.cpp .kt { + color: #0033B3; +} + +.sig.c .m, +.sig.cpp .m { + color: #1750EB; +} + +.sig.c .s, .sig.c .sc, +.sig.cpp .s, .sig.cpp .sc { + color: #067D17; +} + + +/* -- other body styles ----------------------------------------------------- */ + +ol.arabic { + list-style: decimal; +} + +ol.loweralpha { + list-style: lower-alpha; +} + +ol.upperalpha { + list-style: upper-alpha; +} + +ol.lowerroman { + list-style: lower-roman; +} + +ol.upperroman { + list-style: upper-roman; +} + +:not(li) > ol > li:first-child > :first-child, +:not(li) > ul > li:first-child > :first-child { + margin-top: 0px; +} + +:not(li) > ol > li:last-child > :last-child, +:not(li) > ul > li:last-child > :last-child { + margin-bottom: 0px; +} + +ol.simple ol p, +ol.simple ul p, +ul.simple ol p, +ul.simple ul p { + margin-top: 0; +} + +ol.simple > li:not(:first-child) > p, +ul.simple > li:not(:first-child) > p { + margin-top: 0; +} + +ol.simple p, +ul.simple p { + margin-bottom: 0; +} + +aside.footnote > span, +div.citation > span { + float: left; +} +aside.footnote > span:last-of-type, +div.citation > span:last-of-type { + padding-right: 0.5em; +} +aside.footnote > p { + margin-left: 2em; +} +div.citation > p { + margin-left: 4em; +} +aside.footnote > p:last-of-type, +div.citation > p:last-of-type { + margin-bottom: 0em; +} +aside.footnote > p:last-of-type:after, +div.citation > p:last-of-type:after { + content: ""; + clear: both; +} + +dl.field-list { + display: grid; + grid-template-columns: fit-content(30%) auto; +} + +dl.field-list > dt { + font-weight: bold; 
+ word-break: break-word; + padding-left: 0.5em; + padding-right: 5px; +} + +dl.field-list > dd { + padding-left: 0.5em; + margin-top: 0em; + margin-left: 0em; + margin-bottom: 0em; +} + +dl { + margin-bottom: 15px; +} + +dd > :first-child { + margin-top: 0px; +} + +dd ul, dd table { + margin-bottom: 10px; +} + +dd { + margin-top: 3px; + margin-bottom: 10px; + margin-left: 30px; +} + +.sig dd { + margin-top: 0px; + margin-bottom: 0px; +} + +.sig dl { + margin-top: 0px; + margin-bottom: 0px; +} + +dl > dd:last-child, +dl > dd:last-child > :last-child { + margin-bottom: 0; +} + +dt:target, span.highlighted { + background-color: #fbe54e; +} + +rect.highlighted { + fill: #fbe54e; +} + +dl.glossary dt { + font-weight: bold; + font-size: 1.1em; +} + +.versionmodified { + font-style: italic; +} + +.system-message { + background-color: #fda; + padding: 5px; + border: 3px solid red; +} + +.footnote:target { + background-color: #ffa; +} + +.line-block { + display: block; + margin-top: 1em; + margin-bottom: 1em; +} + +.line-block .line-block { + margin-top: 0; + margin-bottom: 0; + margin-left: 1.5em; +} + +.guilabel, .menuselection { + font-family: sans-serif; +} + +.accelerator { + text-decoration: underline; +} + +.classifier { + font-style: oblique; +} + +.classifier:before { + font-style: normal; + margin: 0 0.5em; + content: ":"; + display: inline-block; +} + +abbr, acronym { + border-bottom: dotted 1px; + cursor: help; +} + +.translated { + background-color: rgba(207, 255, 207, 0.2) +} + +.untranslated { + background-color: rgba(255, 207, 207, 0.2) +} + +/* -- code displays --------------------------------------------------------- */ + +pre { + overflow: auto; + overflow-y: hidden; /* fixes display issues on Chrome browsers */ +} + +pre, div[class*="highlight-"] { + clear: both; +} + +span.pre { + -moz-hyphens: none; + -ms-hyphens: none; + -webkit-hyphens: none; + hyphens: none; + white-space: nowrap; +} + +div[class*="highlight-"] { + margin: 1em 0; +} + +td.linenos pre { + border: 0; + background-color: transparent; + color: #aaa; +} + +table.highlighttable { + display: block; +} + +table.highlighttable tbody { + display: block; +} + +table.highlighttable tr { + display: flex; +} + +table.highlighttable td { + margin: 0; + padding: 0; +} + +table.highlighttable td.linenos { + padding-right: 0.5em; +} + +table.highlighttable td.code { + flex: 1; + overflow: hidden; +} + +.highlight .hll { + display: block; +} + +div.highlight pre, +table.highlighttable pre { + margin: 0; +} + +div.code-block-caption + div { + margin-top: 0; +} + +div.code-block-caption { + margin-top: 1em; + padding: 2px 5px; + font-size: small; +} + +div.code-block-caption code { + background-color: transparent; +} + +table.highlighttable td.linenos, +span.linenos, +div.highlight span.gp { /* gp: Generic.Prompt */ + user-select: none; + -webkit-user-select: text; /* Safari fallback only */ + -webkit-user-select: none; /* Chrome/Safari */ + -moz-user-select: none; /* Firefox */ + -ms-user-select: none; /* IE10+ */ +} + +div.code-block-caption span.caption-number { + padding: 0.1em 0.3em; + font-style: italic; +} + +div.code-block-caption span.caption-text { +} + +div.literal-block-wrapper { + margin: 1em 0; +} + +code.xref, a code { + background-color: transparent; + font-weight: bold; +} + +h1 code, h2 code, h3 code, h4 code, h5 code, h6 code { + background-color: transparent; +} + +.viewcode-link { + float: right; +} + +.viewcode-back { + float: right; + font-family: sans-serif; +} + +div.viewcode-block:target { + margin: 
-1px -10px; + padding: 0 10px; +} + +/* -- math display ---------------------------------------------------------- */ + +img.math { + vertical-align: middle; +} + +div.body div.math p { + text-align: center; +} + +span.eqno { + float: right; +} + +span.eqno a.headerlink { + position: absolute; + z-index: 1; +} + +div.math:hover a.headerlink { + visibility: visible; +} + +/* -- printout stylesheet --------------------------------------------------- */ + +@media print { + div.document, + div.documentwrapper, + div.bodywrapper { + margin: 0 !important; + width: 100%; + } + + div.sphinxsidebar, + div.related, + div.footer, + #top-link { + display: none; + } +} \ No newline at end of file diff --git a/_static/bgfooter.png b/_static/bgfooter.png new file mode 100644 index 0000000..b7c7cad Binary files /dev/null and b/_static/bgfooter.png differ diff --git a/_static/bgtop.png b/_static/bgtop.png new file mode 100644 index 0000000..0574088 Binary files /dev/null and b/_static/bgtop.png differ diff --git a/_static/doctools.js b/_static/doctools.js new file mode 100644 index 0000000..d06a71d --- /dev/null +++ b/_static/doctools.js @@ -0,0 +1,156 @@ +/* + * doctools.js + * ~~~~~~~~~~~ + * + * Base JavaScript utilities for all Sphinx HTML documentation. + * + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ +"use strict"; + +const BLACKLISTED_KEY_CONTROL_ELEMENTS = new Set([ + "TEXTAREA", + "INPUT", + "SELECT", + "BUTTON", +]); + +const _ready = (callback) => { + if (document.readyState !== "loading") { + callback(); + } else { + document.addEventListener("DOMContentLoaded", callback); + } +}; + +/** + * Small JavaScript module for the documentation. + */ +const Documentation = { + init: () => { + Documentation.initDomainIndexTable(); + Documentation.initOnKeyListeners(); + }, + + /** + * i18n support + */ + TRANSLATIONS: {}, + PLURAL_EXPR: (n) => (n === 1 ? 0 : 1), + LOCALE: "unknown", + + // gettext and ngettext don't access this so that the functions + // can safely bound to a different name (_ = Documentation.gettext) + gettext: (string) => { + const translated = Documentation.TRANSLATIONS[string]; + switch (typeof translated) { + case "undefined": + return string; // no translation + case "string": + return translated; // translation exists + default: + return translated[0]; // (singular, plural) translation tuple exists + } + }, + + ngettext: (singular, plural, n) => { + const translated = Documentation.TRANSLATIONS[singular]; + if (typeof translated !== "undefined") + return translated[Documentation.PLURAL_EXPR(n)]; + return n === 1 ? 
singular : plural; + }, + + addTranslations: (catalog) => { + Object.assign(Documentation.TRANSLATIONS, catalog.messages); + Documentation.PLURAL_EXPR = new Function( + "n", + `return (${catalog.plural_expr})` + ); + Documentation.LOCALE = catalog.locale; + }, + + /** + * helper function to focus on search bar + */ + focusSearchBar: () => { + document.querySelectorAll("input[name=q]")[0]?.focus(); + }, + + /** + * Initialise the domain index toggle buttons + */ + initDomainIndexTable: () => { + const toggler = (el) => { + const idNumber = el.id.substr(7); + const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`); + if (el.src.substr(-9) === "minus.png") { + el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`; + toggledRows.forEach((el) => (el.style.display = "none")); + } else { + el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`; + toggledRows.forEach((el) => (el.style.display = "")); + } + }; + + const togglerElements = document.querySelectorAll("img.toggler"); + togglerElements.forEach((el) => + el.addEventListener("click", (event) => toggler(event.currentTarget)) + ); + togglerElements.forEach((el) => (el.style.display = "")); + if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler); + }, + + initOnKeyListeners: () => { + // only install a listener if it is really needed + if ( + !DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS && + !DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS + ) + return; + + document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.altKey || event.ctrlKey || event.metaKey) return; + + if (!event.shiftKey) { + switch (event.key) { + case "ArrowLeft": + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; + + const prevLink = document.querySelector('link[rel="prev"]'); + if (prevLink && prevLink.href) { + window.location.href = prevLink.href; + event.preventDefault(); + } + break; + case "ArrowRight": + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; + + const nextLink = document.querySelector('link[rel="next"]'); + if (nextLink && nextLink.href) { + window.location.href = nextLink.href; + event.preventDefault(); + } + break; + } + } + + // some keyboard layouts may need Shift to get / + switch (event.key) { + case "/": + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; + Documentation.focusSearchBar(); + event.preventDefault(); + } + }); + }, +}; + +// quick alias for translations +const _ = Documentation.gettext; + +_ready(Documentation.init); diff --git a/_static/documentation_options.js b/_static/documentation_options.js new file mode 100644 index 0000000..cf359c0 --- /dev/null +++ b/_static/documentation_options.js @@ -0,0 +1,14 @@ +var DOCUMENTATION_OPTIONS = { + URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), + VERSION: '0.1', + LANGUAGE: 'en', + COLLAPSE_INDEX: false, + BUILDER: 'html', + FILE_SUFFIX: '.html', + LINK_SUFFIX: '.html', + HAS_SOURCE: true, + SOURCELINK_SUFFIX: '.txt', + NAVIGATION_WITH_KEYS: false, + SHOW_SEARCH_SUMMARY: true, + ENABLE_SEARCH_SHORTCUTS: true, +}; \ No newline at end of file diff --git a/_static/file.png b/_static/file.png new file mode 100644 index 0000000..a858a41 Binary files /dev/null and b/_static/file.png differ diff --git a/_static/language_data.js b/_static/language_data.js new file mode 100644 index 0000000..250f566 --- /dev/null +++ b/_static/language_data.js @@ -0,0 +1,199 
@@ +/* + * language_data.js + * ~~~~~~~~~~~~~~~~ + * + * This script contains the language-specific data used by searchtools.js, + * namely the list of stopwords, stemmer, scorer and splitter. + * + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ + +var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; + + +/* Non-minified version is copied as a separate JS file, is available */ + +/** + * Porter Stemmer + */ +var Stemmer = function() { + + var step2list = { + ational: 'ate', + tional: 'tion', + enci: 'ence', + anci: 'ance', + izer: 'ize', + bli: 'ble', + alli: 'al', + entli: 'ent', + eli: 'e', + ousli: 'ous', + ization: 'ize', + ation: 'ate', + ator: 'ate', + alism: 'al', + iveness: 'ive', + fulness: 'ful', + ousness: 'ous', + aliti: 'al', + iviti: 'ive', + biliti: 'ble', + logi: 'log' + }; + + var step3list = { + icate: 'ic', + ative: '', + alize: 'al', + iciti: 'ic', + ical: 'ic', + ful: '', + ness: '' + }; + + var c = "[^aeiou]"; // consonant + var v = "[aeiouy]"; // vowel + var C = c + "[^aeiouy]*"; // consonant sequence + var V = v + "[aeiou]*"; // vowel sequence + + var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0 + var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1 + var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1 + var s_v = "^(" + C + ")?" + v; // vowel in stem + + this.stemWord = function (w) { + var stem; + var suffix; + var firstch; + var origword = w; + + if (w.length < 3) + return w; + + var re; + var re2; + var re3; + var re4; + + firstch = w.substr(0,1); + if (firstch == "y") + w = firstch.toUpperCase() + w.substr(1); + + // Step 1a + re = /^(.+?)(ss|i)es$/; + re2 = /^(.+?)([^s])s$/; + + if (re.test(w)) + w = w.replace(re,"$1$2"); + else if (re2.test(w)) + w = w.replace(re2,"$1$2"); + + // Step 1b + re = /^(.+?)eed$/; + re2 = /^(.+?)(ed|ing)$/; + if (re.test(w)) { + var fp = re.exec(w); + re = new RegExp(mgr0); + if (re.test(fp[1])) { + re = /.$/; + w = w.replace(re,""); + } + } + else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1]; + re2 = new RegExp(s_v); + if (re2.test(stem)) { + w = stem; + re2 = /(at|bl|iz)$/; + re3 = new RegExp("([^aeiouylsz])\\1$"); + re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re2.test(w)) + w = w + "e"; + else if (re3.test(w)) { + re = /.$/; + w = w.replace(re,""); + } + else if (re4.test(w)) + w = w + "e"; + } + } + + // Step 1c + re = /^(.+?)y$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(s_v); + if (re.test(stem)) + w = stem + "i"; + } + + // Step 2 + re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) + w = stem + step2list[suffix]; + } + + // Step 3 + re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) + w = stem + step3list[suffix]; + } + + // Step 4 + re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; + re2 = /^(.+?)(s|t)(ion)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new 
RegExp(mgr1); + if (re.test(stem)) + w = stem; + } + else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1] + fp[2]; + re2 = new RegExp(mgr1); + if (re2.test(stem)) + w = stem; + } + + // Step 5 + re = /^(.+?)e$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + re2 = new RegExp(meq1); + re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) + w = stem; + } + re = /ll$/; + re2 = new RegExp(mgr1); + if (re.test(w) && re2.test(w)) { + re = /.$/; + w = w.replace(re,""); + } + + // and turn initial Y back to y + if (firstch == "y") + w = firstch.toLowerCase() + w.substr(1); + return w; + } +} + diff --git a/_static/minus.png b/_static/minus.png new file mode 100644 index 0000000..d96755f Binary files /dev/null and b/_static/minus.png differ diff --git a/_static/plus.png b/_static/plus.png new file mode 100644 index 0000000..7107cec Binary files /dev/null and b/_static/plus.png differ diff --git a/_static/pygments.css b/_static/pygments.css new file mode 100644 index 0000000..6110e9f --- /dev/null +++ b/_static/pygments.css @@ -0,0 +1,84 @@ +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +.highlight .hll { background-color: #ffffcc } +.highlight { background: #f8f8f8; } +.highlight .c { color: #8f5902; font-style: italic } /* Comment */ +.highlight .err { color: #a40000; border: 1px solid #ef2929 } /* Error */ +.highlight .g { color: #000000 } /* Generic */ +.highlight .k { color: #204a87; font-weight: bold } /* Keyword */ +.highlight .l { color: #000000 } /* Literal */ +.highlight .n { color: #000000 } /* Name */ +.highlight .o { color: #ce5c00; font-weight: bold } /* Operator */ +.highlight .x { color: #000000 } /* Other */ +.highlight .p { color: #000000; font-weight: bold } /* Punctuation */ +.highlight .ch { color: #8f5902; font-style: italic } /* Comment.Hashbang */ +.highlight .cm { color: #8f5902; font-style: italic } /* Comment.Multiline */ +.highlight .cp { color: #8f5902; font-style: italic } /* Comment.Preproc */ +.highlight .cpf { color: #8f5902; font-style: italic } /* Comment.PreprocFile */ +.highlight .c1 { color: #8f5902; font-style: italic } /* Comment.Single */ +.highlight .cs { color: #8f5902; font-style: italic } /* Comment.Special */ +.highlight .gd { color: #a40000 } /* Generic.Deleted */ +.highlight .ge { color: #000000; font-style: italic } /* Generic.Emph */ +.highlight .ges { color: #000000; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +.highlight .gr { color: #ef2929 } /* Generic.Error */ +.highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +.highlight .gi { color: #00A000 } /* Generic.Inserted */ +.highlight .go { color: #000000; font-style: italic } /* Generic.Output */ +.highlight .gp { color: #8f5902 } /* Generic.Prompt */ +.highlight .gs { color: #000000; font-weight: bold } /* Generic.Strong */ +.highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +.highlight .gt { color: #a40000; font-weight: bold } /* Generic.Traceback */ +.highlight .kc { color: #204a87; font-weight: bold } /* 
Keyword.Constant */ +.highlight .kd { color: #204a87; font-weight: bold } /* Keyword.Declaration */ +.highlight .kn { color: #204a87; font-weight: bold } /* Keyword.Namespace */ +.highlight .kp { color: #204a87; font-weight: bold } /* Keyword.Pseudo */ +.highlight .kr { color: #204a87; font-weight: bold } /* Keyword.Reserved */ +.highlight .kt { color: #204a87; font-weight: bold } /* Keyword.Type */ +.highlight .ld { color: #000000 } /* Literal.Date */ +.highlight .m { color: #0000cf; font-weight: bold } /* Literal.Number */ +.highlight .s { color: #4e9a06 } /* Literal.String */ +.highlight .na { color: #c4a000 } /* Name.Attribute */ +.highlight .nb { color: #204a87 } /* Name.Builtin */ +.highlight .nc { color: #000000 } /* Name.Class */ +.highlight .no { color: #000000 } /* Name.Constant */ +.highlight .nd { color: #5c35cc; font-weight: bold } /* Name.Decorator */ +.highlight .ni { color: #ce5c00 } /* Name.Entity */ +.highlight .ne { color: #cc0000; font-weight: bold } /* Name.Exception */ +.highlight .nf { color: #000000 } /* Name.Function */ +.highlight .nl { color: #f57900 } /* Name.Label */ +.highlight .nn { color: #000000 } /* Name.Namespace */ +.highlight .nx { color: #000000 } /* Name.Other */ +.highlight .py { color: #000000 } /* Name.Property */ +.highlight .nt { color: #204a87; font-weight: bold } /* Name.Tag */ +.highlight .nv { color: #000000 } /* Name.Variable */ +.highlight .ow { color: #204a87; font-weight: bold } /* Operator.Word */ +.highlight .pm { color: #000000; font-weight: bold } /* Punctuation.Marker */ +.highlight .w { color: #f8f8f8 } /* Text.Whitespace */ +.highlight .mb { color: #0000cf; font-weight: bold } /* Literal.Number.Bin */ +.highlight .mf { color: #0000cf; font-weight: bold } /* Literal.Number.Float */ +.highlight .mh { color: #0000cf; font-weight: bold } /* Literal.Number.Hex */ +.highlight .mi { color: #0000cf; font-weight: bold } /* Literal.Number.Integer */ +.highlight .mo { color: #0000cf; font-weight: bold } /* Literal.Number.Oct */ +.highlight .sa { color: #4e9a06 } /* Literal.String.Affix */ +.highlight .sb { color: #4e9a06 } /* Literal.String.Backtick */ +.highlight .sc { color: #4e9a06 } /* Literal.String.Char */ +.highlight .dl { color: #4e9a06 } /* Literal.String.Delimiter */ +.highlight .sd { color: #8f5902; font-style: italic } /* Literal.String.Doc */ +.highlight .s2 { color: #4e9a06 } /* Literal.String.Double */ +.highlight .se { color: #4e9a06 } /* Literal.String.Escape */ +.highlight .sh { color: #4e9a06 } /* Literal.String.Heredoc */ +.highlight .si { color: #4e9a06 } /* Literal.String.Interpol */ +.highlight .sx { color: #4e9a06 } /* Literal.String.Other */ +.highlight .sr { color: #4e9a06 } /* Literal.String.Regex */ +.highlight .s1 { color: #4e9a06 } /* Literal.String.Single */ +.highlight .ss { color: #4e9a06 } /* Literal.String.Symbol */ +.highlight .bp { color: #3465a4 } /* Name.Builtin.Pseudo */ +.highlight .fm { color: #000000 } /* Name.Function.Magic */ +.highlight .vc { color: #000000 } /* Name.Variable.Class */ +.highlight .vg { color: #000000 } /* Name.Variable.Global */ +.highlight .vi { color: #000000 } /* Name.Variable.Instance */ +.highlight .vm { color: #000000 } /* Name.Variable.Magic */ +.highlight .il { color: #0000cf; font-weight: bold } /* Literal.Number.Integer.Long */ \ No newline at end of file diff --git a/_static/searchtools.js b/_static/searchtools.js new file mode 100644 index 0000000..97d56a7 --- /dev/null +++ b/_static/searchtools.js @@ -0,0 +1,566 @@ +/* + * searchtools.js + * ~~~~~~~~~~~~~~~~ + * + * 
Sphinx JavaScript utilities for the full-text search. + * + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ +"use strict"; + +/** + * Simple result scoring code. + */ +if (typeof Scorer === "undefined") { + var Scorer = { + // Implement the following function to further tweak the score for each result + // The function takes a result array [docname, title, anchor, descr, score, filename] + // and returns the new score. + /* + score: result => { + const [docname, title, anchor, descr, score, filename] = result + return score + }, + */ + + // query matches the full name of an object + objNameMatch: 11, + // or matches in the last dotted part of the object name + objPartialMatch: 6, + // Additive scores depending on the priority of the object + objPrio: { + 0: 15, // used to be importantResults + 1: 5, // used to be objectResults + 2: -5, // used to be unimportantResults + }, + // Used when the priority is not in the mapping. + objPrioDefault: 0, + + // query found in title + title: 15, + partialTitle: 7, + // query found in terms + term: 5, + partialTerm: 2, + }; +} + +const _removeChildren = (element) => { + while (element && element.lastChild) element.removeChild(element.lastChild); +}; + +/** + * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#escaping + */ +const _escapeRegExp = (string) => + string.replace(/[.*+\-?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string + +const _displayItem = (item, searchTerms) => { + const docBuilder = DOCUMENTATION_OPTIONS.BUILDER; + const docUrlRoot = DOCUMENTATION_OPTIONS.URL_ROOT; + const docFileSuffix = DOCUMENTATION_OPTIONS.FILE_SUFFIX; + const docLinkSuffix = DOCUMENTATION_OPTIONS.LINK_SUFFIX; + const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; + + const [docName, title, anchor, descr, score, _filename] = item; + + let listItem = document.createElement("li"); + let requestUrl; + let linkUrl; + if (docBuilder === "dirhtml") { + // dirhtml builder + let dirname = docName + "/"; + if (dirname.match(/\/index\/$/)) + dirname = dirname.substring(0, dirname.length - 6); + else if (dirname === "index/") dirname = ""; + requestUrl = docUrlRoot + dirname; + linkUrl = requestUrl; + } else { + // normal html builders + requestUrl = docUrlRoot + docName + docFileSuffix; + linkUrl = docName + docLinkSuffix; + } + let linkEl = listItem.appendChild(document.createElement("a")); + linkEl.href = linkUrl + anchor; + linkEl.dataset.score = score; + linkEl.innerHTML = title; + if (descr) + listItem.appendChild(document.createElement("span")).innerHTML = + " (" + descr + ")"; + else if (showSearchSummary) + fetch(requestUrl) + .then((responseData) => responseData.text()) + .then((data) => { + if (data) + listItem.appendChild( + Search.makeSearchSummary(data, searchTerms) + ); + }); + Search.output.appendChild(listItem); +}; +const _finishSearch = (resultCount) => { + Search.stopPulse(); + Search.title.innerText = _("Search Results"); + if (!resultCount) + Search.status.innerText = Documentation.gettext( + "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." 
+ ); + else + Search.status.innerText = _( + `Search finished, found ${resultCount} page(s) matching the search query.` + ); +}; +const _displayNextItem = ( + results, + resultCount, + searchTerms +) => { + // results left, load the summary and display it + // this is intended to be dynamic (don't sub resultsCount) + if (results.length) { + _displayItem(results.pop(), searchTerms); + setTimeout( + () => _displayNextItem(results, resultCount, searchTerms), + 5 + ); + } + // search finished, update title and status message + else _finishSearch(resultCount); +}; + +/** + * Default splitQuery function. Can be overridden in ``sphinx.search`` with a + * custom function per language. + * + * The regular expression works by splitting the string on consecutive characters + * that are not Unicode letters, numbers, underscores, or emoji characters. + * This is the same as ``\W+`` in Python, preserving the surrogate pair area. + */ +if (typeof splitQuery === "undefined") { + var splitQuery = (query) => query + .split(/[^\p{Letter}\p{Number}_\p{Emoji_Presentation}]+/gu) + .filter(term => term) // remove remaining empty strings +} + +/** + * Search Module + */ +const Search = { + _index: null, + _queued_query: null, + _pulse_status: -1, + + htmlToText: (htmlString) => { + const htmlElement = new DOMParser().parseFromString(htmlString, 'text/html'); + htmlElement.querySelectorAll(".headerlink").forEach((el) => { el.remove() }); + const docContent = htmlElement.querySelector('[role="main"]'); + if (docContent !== undefined) return docContent.textContent; + console.warn( + "Content block not found. Sphinx search tries to obtain it via '[role=main]'. Could you check your theme or template." + ); + return ""; + }, + + init: () => { + const query = new URLSearchParams(window.location.search).get("q"); + document + .querySelectorAll('input[name="q"]') + .forEach((el) => (el.value = query)); + if (query) Search.performSearch(query); + }, + + loadIndex: (url) => + (document.body.appendChild(document.createElement("script")).src = url), + + setIndex: (index) => { + Search._index = index; + if (Search._queued_query !== null) { + const query = Search._queued_query; + Search._queued_query = null; + Search.query(query); + } + }, + + hasIndex: () => Search._index !== null, + + deferQuery: (query) => (Search._queued_query = query), + + stopPulse: () => (Search._pulse_status = -1), + + startPulse: () => { + if (Search._pulse_status >= 0) return; + + const pulse = () => { + Search._pulse_status = (Search._pulse_status + 1) % 4; + Search.dots.innerText = ".".repeat(Search._pulse_status); + if (Search._pulse_status >= 0) window.setTimeout(pulse, 500); + }; + pulse(); + }, + + /** + * perform a search for something (or wait until index is loaded) + */ + performSearch: (query) => { + // create the required interface elements + const searchText = document.createElement("h2"); + searchText.textContent = _("Searching"); + const searchSummary = document.createElement("p"); + searchSummary.classList.add("search-summary"); + searchSummary.innerText = ""; + const searchList = document.createElement("ul"); + searchList.classList.add("search"); + + const out = document.getElementById("search-results"); + Search.title = out.appendChild(searchText); + Search.dots = Search.title.appendChild(document.createElement("span")); + Search.status = out.appendChild(searchSummary); + Search.output = out.appendChild(searchList); + + const searchProgress = document.getElementById("search-progress"); + // Some themes don't use the search progress 
node + if (searchProgress) { + searchProgress.innerText = _("Preparing search..."); + } + Search.startPulse(); + + // index already loaded, the browser was quick! + if (Search.hasIndex()) Search.query(query); + else Search.deferQuery(query); + }, + + /** + * execute search (requires search index to be loaded) + */ + query: (query) => { + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const titles = Search._index.titles; + const allTitles = Search._index.alltitles; + const indexEntries = Search._index.indexentries; + + // stem the search terms and add them to the correct list + const stemmer = new Stemmer(); + const searchTerms = new Set(); + const excludedTerms = new Set(); + const highlightTerms = new Set(); + const objectTerms = new Set(splitQuery(query.toLowerCase().trim())); + splitQuery(query.trim()).forEach((queryTerm) => { + const queryTermLower = queryTerm.toLowerCase(); + + // maybe skip this "word" + // stopwords array is from language_data.js + if ( + stopwords.indexOf(queryTermLower) !== -1 || + queryTerm.match(/^\d+$/) + ) + return; + + // stem the word + let word = stemmer.stemWord(queryTermLower); + // select the correct list + if (word[0] === "-") excludedTerms.add(word.substr(1)); + else { + searchTerms.add(word); + highlightTerms.add(queryTermLower); + } + }); + + if (SPHINX_HIGHLIGHT_ENABLED) { // set in sphinx_highlight.js + localStorage.setItem("sphinx_highlight_terms", [...highlightTerms].join(" ")) + } + + // console.debug("SEARCH: searching for:"); + // console.info("required: ", [...searchTerms]); + // console.info("excluded: ", [...excludedTerms]); + + // array of [docname, title, anchor, descr, score, filename] + let results = []; + _removeChildren(document.getElementById("search-progress")); + + const queryLower = query.toLowerCase(); + for (const [title, foundTitles] of Object.entries(allTitles)) { + if (title.toLowerCase().includes(queryLower) && (queryLower.length >= title.length/2)) { + for (const [file, id] of foundTitles) { + let score = Math.round(100 * queryLower.length / title.length) + results.push([ + docNames[file], + titles[file] !== title ? `${titles[file]} > ${title}` : title, + id !== null ? "#" + id : "", + null, + score, + filenames[file], + ]); + } + } + } + + // search for explicit entries in index directives + for (const [entry, foundEntries] of Object.entries(indexEntries)) { + if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) { + for (const [file, id] of foundEntries) { + let score = Math.round(100 * queryLower.length / entry.length) + results.push([ + docNames[file], + titles[file], + id ? 
"#" + id : "", + null, + score, + filenames[file], + ]); + } + } + } + + // lookup as object + objectTerms.forEach((term) => + results.push(...Search.performObjectSearch(term, objectTerms)) + ); + + // lookup as search terms in fulltext + results.push(...Search.performTermsSearch(searchTerms, excludedTerms)); + + // let the scorer override scores with a custom scoring function + if (Scorer.score) results.forEach((item) => (item[4] = Scorer.score(item))); + + // now sort the results by score (in opposite order of appearance, since the + // display function below uses pop() to retrieve items) and then + // alphabetically + results.sort((a, b) => { + const leftScore = a[4]; + const rightScore = b[4]; + if (leftScore === rightScore) { + // same score: sort alphabetically + const leftTitle = a[1].toLowerCase(); + const rightTitle = b[1].toLowerCase(); + if (leftTitle === rightTitle) return 0; + return leftTitle > rightTitle ? -1 : 1; // inverted is intentional + } + return leftScore > rightScore ? 1 : -1; + }); + + // remove duplicate search results + // note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept + let seen = new Set(); + results = results.reverse().reduce((acc, result) => { + let resultStr = result.slice(0, 4).concat([result[5]]).map(v => String(v)).join(','); + if (!seen.has(resultStr)) { + acc.push(result); + seen.add(resultStr); + } + return acc; + }, []); + + results = results.reverse(); + + // for debugging + //Search.lastresults = results.slice(); // a copy + // console.info("search results:", Search.lastresults); + + // print the results + _displayNextItem(results, results.length, searchTerms); + }, + + /** + * search for object names + */ + performObjectSearch: (object, objectTerms) => { + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const objects = Search._index.objects; + const objNames = Search._index.objnames; + const titles = Search._index.titles; + + const results = []; + + const objectSearchCallback = (prefix, match) => { + const name = match[4] + const fullname = (prefix ? prefix + "." : "") + name; + const fullnameLower = fullname.toLowerCase(); + if (fullnameLower.indexOf(object) < 0) return; + + let score = 0; + const parts = fullnameLower.split("."); + + // check for different match types: exact matches of full name or + // "last name" (i.e. 
last dotted part) + if (fullnameLower === object || parts.slice(-1)[0] === object) + score += Scorer.objNameMatch; + else if (parts.slice(-1)[0].indexOf(object) > -1) + score += Scorer.objPartialMatch; // matches in last name + + const objName = objNames[match[1]][2]; + const title = titles[match[0]]; + + // If more than one term searched for, we require other words to be + // found in the name/title/description + const otherTerms = new Set(objectTerms); + otherTerms.delete(object); + if (otherTerms.size > 0) { + const haystack = `${prefix} ${name} ${objName} ${title}`.toLowerCase(); + if ( + [...otherTerms].some((otherTerm) => haystack.indexOf(otherTerm) < 0) + ) + return; + } + + let anchor = match[3]; + if (anchor === "") anchor = fullname; + else if (anchor === "-") anchor = objNames[match[1]][1] + "-" + fullname; + + const descr = objName + _(", in ") + title; + + // add custom score for some objects according to scorer + if (Scorer.objPrio.hasOwnProperty(match[2])) + score += Scorer.objPrio[match[2]]; + else score += Scorer.objPrioDefault; + + results.push([ + docNames[match[0]], + fullname, + "#" + anchor, + descr, + score, + filenames[match[0]], + ]); + }; + Object.keys(objects).forEach((prefix) => + objects[prefix].forEach((array) => + objectSearchCallback(prefix, array) + ) + ); + return results; + }, + + /** + * search for full-text terms in the index + */ + performTermsSearch: (searchTerms, excludedTerms) => { + // prepare search + const terms = Search._index.terms; + const titleTerms = Search._index.titleterms; + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const titles = Search._index.titles; + + const scoreMap = new Map(); + const fileMap = new Map(); + + // perform the search on the required terms + searchTerms.forEach((word) => { + const files = []; + const arr = [ + { files: terms[word], score: Scorer.term }, + { files: titleTerms[word], score: Scorer.title }, + ]; + // add support for partial matches + if (word.length > 2) { + const escapedWord = _escapeRegExp(word); + Object.keys(terms).forEach((term) => { + if (term.match(escapedWord) && !terms[word]) + arr.push({ files: terms[term], score: Scorer.partialTerm }); + }); + Object.keys(titleTerms).forEach((term) => { + if (term.match(escapedWord) && !titleTerms[word]) + arr.push({ files: titleTerms[word], score: Scorer.partialTitle }); + }); + } + + // no match but word was a required one + if (arr.every((record) => record.files === undefined)) return; + + // found search word in contents + arr.forEach((record) => { + if (record.files === undefined) return; + + let recordFiles = record.files; + if (recordFiles.length === undefined) recordFiles = [recordFiles]; + files.push(...recordFiles); + + // set score for the word in each file + recordFiles.forEach((file) => { + if (!scoreMap.has(file)) scoreMap.set(file, {}); + scoreMap.get(file)[word] = record.score; + }); + }); + + // create the mapping + files.forEach((file) => { + if (fileMap.has(file) && fileMap.get(file).indexOf(word) === -1) + fileMap.get(file).push(word); + else fileMap.set(file, [word]); + }); + }); + + // now check if the files don't contain excluded terms + const results = []; + for (const [file, wordList] of fileMap) { + // check if all requirements are matched + + // as search terms with length < 3 are discarded + const filteredTermCount = [...searchTerms].filter( + (term) => term.length > 2 + ).length; + if ( + wordList.length !== searchTerms.size && + wordList.length !== filteredTermCount + ) + continue; + + 
// ensure that none of the excluded terms is in the search result + if ( + [...excludedTerms].some( + (term) => + terms[term] === file || + titleTerms[term] === file || + (terms[term] || []).includes(file) || + (titleTerms[term] || []).includes(file) + ) + ) + break; + + // select one (max) score for the file. + const score = Math.max(...wordList.map((w) => scoreMap.get(file)[w])); + // add result to the result list + results.push([ + docNames[file], + titles[file], + "", + null, + score, + filenames[file], + ]); + } + return results; + }, + + /** + * helper function to return a node containing the + * search summary for a given text. keywords is a list + * of stemmed words. + */ + makeSearchSummary: (htmlText, keywords) => { + const text = Search.htmlToText(htmlText); + if (text === "") return null; + + const textLower = text.toLowerCase(); + const actualStartPosition = [...keywords] + .map((k) => textLower.indexOf(k.toLowerCase())) + .filter((i) => i > -1) + .slice(-1)[0]; + const startWithContext = Math.max(actualStartPosition - 120, 0); + + const top = startWithContext === 0 ? "" : "..."; + const tail = startWithContext + 240 < text.length ? "..." : ""; + + let summary = document.createElement("p"); + summary.classList.add("context"); + summary.textContent = top + text.substr(startWithContext, 240).trim() + tail; + + return summary; + }, +}; + +_ready(Search.init); diff --git a/_static/sphinx_highlight.js b/_static/sphinx_highlight.js new file mode 100644 index 0000000..aae669d --- /dev/null +++ b/_static/sphinx_highlight.js @@ -0,0 +1,144 @@ +/* Highlighting utilities for Sphinx HTML documentation. */ +"use strict"; + +const SPHINX_HIGHLIGHT_ENABLED = true + +/** + * highlight a given string on a node by wrapping it in + * span elements with the given class name. + */ +const _highlight = (node, addItems, text, className) => { + if (node.nodeType === Node.TEXT_NODE) { + const val = node.nodeValue; + const parent = node.parentNode; + const pos = val.toLowerCase().indexOf(text); + if ( + pos >= 0 && + !parent.classList.contains(className) && + !parent.classList.contains("nohighlight") + ) { + let span; + + const closestNode = parent.closest("body, svg, foreignObject"); + const isInSVG = closestNode && closestNode.matches("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.classList.add(className); + } + + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + parent.insertBefore( + span, + parent.insertBefore( + document.createTextNode(val.substr(pos + text.length)), + node.nextSibling + ) + ); + node.nodeValue = val.substr(0, pos); + + if (isInSVG) { + const rect = document.createElementNS( + "http://www.w3.org/2000/svg", + "rect" + ); + const bbox = parent.getBBox(); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute("class", className); + addItems.push({ parent: parent, target: rect }); + } + } + } else if (node.matches && !node.matches("button, select, textarea")) { + node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); + } +}; +const _highlightText = (thisNode, text, className) => { + let addItems = []; + _highlight(thisNode, addItems, text, className); + addItems.forEach((obj) => + obj.parent.insertAdjacentElement("beforebegin", obj.target) + ); +}; + +/** + * Small JavaScript module for the documentation. 
+ */ +const SphinxHighlight = { + + /** + * highlight the search words provided in localstorage in the text + */ + highlightSearchWords: () => { + if (!SPHINX_HIGHLIGHT_ENABLED) return; // bail if no highlight + + // get and clear terms from localstorage + const url = new URL(window.location); + const highlight = + localStorage.getItem("sphinx_highlight_terms") + || url.searchParams.get("highlight") + || ""; + localStorage.removeItem("sphinx_highlight_terms") + url.searchParams.delete("highlight"); + window.history.replaceState({}, "", url); + + // get individual terms from highlight string + const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); + if (terms.length === 0) return; // nothing to do + + // There should never be more than one element matching "div.body" + const divBody = document.querySelectorAll("div.body"); + const body = divBody.length ? divBody[0] : document.querySelector("body"); + window.setTimeout(() => { + terms.forEach((term) => _highlightText(body, term, "highlighted")); + }, 10); + + const searchBox = document.getElementById("searchbox"); + if (searchBox === null) return; + searchBox.appendChild( + document + .createRange() + .createContextualFragment( + '" + ) + ); + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords: () => { + document + .querySelectorAll("#searchbox .highlight-link") + .forEach((el) => el.remove()); + document + .querySelectorAll("span.highlighted") + .forEach((el) => el.classList.remove("highlighted")); + localStorage.removeItem("sphinx_highlight_terms") + }, + + initEscapeListener: () => { + // only install a listener if it is really needed + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; + + document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; + if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { + SphinxHighlight.hideSearchWords(); + event.preventDefault(); + } + }); + }, +}; + +_ready(SphinxHighlight.highlightSearchWords); +_ready(SphinxHighlight.initEscapeListener); diff --git a/cpg.html b/cpg.html new file mode 100644 index 0000000..58b7389 --- /dev/null +++ b/cpg.html @@ -0,0 +1,183 @@ + + + + + + + + CpG Module Documentation — ensembl-anno 0.1 documentation + + + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

CpG Module Documentation

+

Set of discriminant functions that can recognize structural and compositional features +such as CpG islands, promoter regions and first splice-donor sites. +Davuluri RV, Grosse I, Zhang MQ: Computational identification of promoters and +first exons in the human genome. Nat Genet. 2001, 29(4):412-417. [PMID: 11726928]

+
+
+ensembl.tools.anno.simple_feature_annotation.cpg.run_cpg(genome_file: PathLike, output_dir: Path, cpg_bin: Path = PosixPath('cpg_lh'), cpg_min_length: int = 400, cpg_min_gc_content: int = 50, cpg_min_oe: float = 0.6, num_threads: int = 1) None[source]
+

Run CpG island detection on genomic slices

+
+
+
param genome_file:
+

Genome file path.

+
+
type genome_file:
+

PathLike

+
+
param output_dir:
+

Working directory path

+
+
type output_dir:
+

Path

+
+
param cpg_bin:
+

CpG software path.

+
+
type cpg_bin:
+

Path

+
+
param cpg_min_length:
+

Min length of CpG islands

+
+
type cpg_min_length:
+

int

+
+
param cpg_min_gc_content:
+

Min GC frequency percentage

+
+
type cpg_min_gc_content:
+

int

+
+
param cpg_min_oe:
+

Min ratio of the observed to expected number of CpG (CpGo/e)

+
+
type cpg_min_oe:
+

float

+
+
param num_threads:
+

Number of threads.

+
+
type num_threads:
+

int

+
+
return:
+

None

+
+
rtype:
+

None

+
+
+
+
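A minimal, hedged usage sketch based on the signature above; the genome and output paths are hypothetical, and the cpg_lh binary is assumed to be on PATH:

from pathlib import Path

from ensembl.tools.anno.simple_feature_annotation.cpg import run_cpg

# Hypothetical inputs; adjust the paths to your data layout.
run_cpg(
    genome_file=Path("genome.fa"),      # genome FASTA
    output_dir=Path("anno_output"),     # working directory
    cpg_min_length=400,                 # minimum island length (bp)
    cpg_min_gc_content=50,              # minimum GC content (%)
    cpg_min_oe=0.6,                     # minimum observed/expected CpG ratio
    num_threads=4,
)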
+ +
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/doctrees/cpg.doctree b/doctrees/cpg.doctree new file mode 100644 index 0000000..d1a7c98 Binary files /dev/null and b/doctrees/cpg.doctree differ diff --git a/doctrees/dust.doctree b/doctrees/dust.doctree new file mode 100644 index 0000000..93a8a69 Binary files /dev/null and b/doctrees/dust.doctree differ diff --git a/doctrees/environment.pickle b/doctrees/environment.pickle new file mode 100644 index 0000000..51fc432 Binary files /dev/null and b/doctrees/environment.pickle differ diff --git a/doctrees/eponine.doctree b/doctrees/eponine.doctree new file mode 100644 index 0000000..d1c61bc Binary files /dev/null and b/doctrees/eponine.doctree differ diff --git a/doctrees/genblast.doctree b/doctrees/genblast.doctree new file mode 100644 index 0000000..d86992c Binary files /dev/null and b/doctrees/genblast.doctree differ diff --git a/doctrees/index.doctree b/doctrees/index.doctree new file mode 100644 index 0000000..3141c25 Binary files /dev/null and b/doctrees/index.doctree differ diff --git a/doctrees/install.doctree b/doctrees/install.doctree new file mode 100644 index 0000000..1efdc99 Binary files /dev/null and b/doctrees/install.doctree differ diff --git a/doctrees/license.doctree b/doctrees/license.doctree new file mode 100644 index 0000000..6af76fe Binary files /dev/null and b/doctrees/license.doctree differ diff --git a/doctrees/minimap.doctree b/doctrees/minimap.doctree new file mode 100644 index 0000000..1135099 Binary files /dev/null and b/doctrees/minimap.doctree differ diff --git a/doctrees/red.doctree b/doctrees/red.doctree new file mode 100644 index 0000000..5faa08d Binary files /dev/null and b/doctrees/red.doctree differ diff --git a/doctrees/repeatmasker.doctree b/doctrees/repeatmasker.doctree new file mode 100644 index 0000000..e56e5b0 Binary files /dev/null and b/doctrees/repeatmasker.doctree differ diff --git a/doctrees/scallop.doctree b/doctrees/scallop.doctree new file mode 100644 index 0000000..610e945 Binary files /dev/null and b/doctrees/scallop.doctree differ diff --git a/doctrees/star.doctree b/doctrees/star.doctree new file mode 100644 index 0000000..3cad10e Binary files /dev/null and b/doctrees/star.doctree differ diff --git a/doctrees/stringtie.doctree b/doctrees/stringtie.doctree new file mode 100644 index 0000000..952b4e0 Binary files /dev/null and b/doctrees/stringtie.doctree differ diff --git a/doctrees/trf.doctree b/doctrees/trf.doctree new file mode 100644 index 0000000..61be72e Binary files /dev/null and b/doctrees/trf.doctree differ diff --git a/doctrees/trnascan.doctree b/doctrees/trnascan.doctree new file mode 100644 index 0000000..991c879 Binary files /dev/null and b/doctrees/trnascan.doctree differ diff --git a/dust.html b/dust.html new file mode 100644 index 0000000..125fe2d --- /dev/null +++ b/dust.html @@ -0,0 +1,165 @@ + + + + + + + + DustMasker Module Documentation — ensembl-anno 0.1 documentation + + + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

DustMasker Module Documentation

+

DustMasker is a program that identifies and masks out low complexity +parts of a genome using a new and improved DUST algorithm.

+

Morgulis A, Gertz EM, Schaffer AA, Agarwala R. A Fast and Symmetric +DUST Implementation to Mask Low-Complexity DNA Sequences.

+
+
+ensembl.tools.anno.repeat_annotation.dust.run_dust(genome_file: PathLike, output_dir: Path, dust_bin: Path = PosixPath('dustmasker'), num_threads: int = 1) None[source]
+
+
Run Dust on genomic slices with multiprocessing
+
param genome_file:
+

Genome file path.

+
+
type genome_file:
+

PathLike

+
+
param output_dir:
+

Working directory path.

+
+
type output_dir:
+

Path

+
+
param dust_bin:
+

Dust software path.

+
+
type dust_bin:
+

Path, default dustmasker

+
+
param num_threads:
+

Number of threads.

+
+
type num_threads:
+

int, default 1

+
+
return:
+

None

+
+
rtype:
+

None

+
+
+
+
+
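A minimal, hedged usage sketch; the paths are hypothetical and dustmasker is assumed to be on PATH:

from pathlib import Path

from ensembl.tools.anno.repeat_annotation.dust import run_dust

# Hypothetical inputs; adjust the paths to your data layout.
run_dust(
    genome_file=Path("genome.fa"),
    output_dir=Path("anno_output"),
    num_threads=4,  # genomic slices are masked in parallel
)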
+ +
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/eponine.html b/eponine.html new file mode 100644 index 0000000..cd0198a --- /dev/null +++ b/eponine.html @@ -0,0 +1,172 @@ + + + + + + + + Eponine Module Documentation — ensembl-anno 0.1 documentation + + + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

Eponine Module Documentation

+

Eponine is a probabilistic method for detecting transcription start sites (TSS) +in mammalian genomic sequence, with good specificity and excellent positional accuracy. +Down TA, Hubbard TJ. Computational detection and location of transcription start sites +in mammalian genomic DNA. Genome Res. 2002 Mar;12(3):458-61. doi: 10.1101/gr.216102. +PMID: 11875034; PMCID: PMC155284.

+
+
+ensembl.tools.anno.simple_feature_annotation.eponine.run_eponine(genome_file: PathLike, output_dir: Path, num_threads: int = 1, java_bin: Path = PosixPath('java'), eponine_bin: Path = PosixPath('/hps/software/users/ensembl/ensw/C8-MAR21-sandybridge/linuxbrew/opt/eponine/libexec/eponine-scan.jar'), eponine_threshold: float = 0.999) None[source]
+
+
Run Eponine on genomic slices
+
param genome_file:
+

Genome file path.

+
+
type genome_file:
+

PathLike

+
+
param output_dir:
+

Working directory path.

+
+
type output_dir:
+

Path

+
+
param java_bin:
+

Java path.

+
+
type java_bin:
+

Path, default java

+
+
param eponine_bin:
+

Eponine software path

+
+
type eponine_bin:
+

Path

+
+
param num_threads:
+

Number of threads.

+
+
type num_threads:
+

int, default 1

+
+
param eponine_threshold:
+

Eponine score threshold.

+
+
type eponine_threshold:
+

float, default 0.999

+
+
return:
+

None

+
+
rtype:
+

None

+
+
+
+
+
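A minimal, hedged usage sketch; all paths are hypothetical, and a Java runtime plus the eponine-scan.jar bundle are assumed to be installed:

from pathlib import Path

from ensembl.tools.anno.simple_feature_annotation.eponine import run_eponine

# Hypothetical inputs; adjust the paths to your environment.
run_eponine(
    genome_file=Path("genome.fa"),
    output_dir=Path("anno_output"),
    java_bin=Path("java"),
    eponine_bin=Path("/opt/eponine/eponine-scan.jar"),  # hypothetical install location
    eponine_threshold=0.999,  # keep only high-confidence TSS predictions
    num_threads=4,
)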
+ +
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/genblast.html b/genblast.html new file mode 100644 index 0000000..b6493b4 --- /dev/null +++ b/genblast.html @@ -0,0 +1,218 @@ + + + + + + + + Genblast Module Documentation — ensembl-anno 0.1 documentation + + + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

Genblast Module Documentation

+

GenBlast identifies homologous gene sequences in genomic databases.
+A key strength of GenBlast is its flexibility in handling comparative
+genomics tasks and in accurately identifying homologs even when the
+sequences have undergone significant evolutionary change, which makes it
+a valuable resource for studying gene evolution, gene families, and gene
+function across diverse species.
+GenBlast is widely used in genomic analyses and is available as a
+standalone command-line tool or as part of larger bioinformatics pipelines.

+

She, R., Chu, J.S., Uyar, B., Wang, J., Wang, K., and Chen, N. (2011). +GenBlastA: enabling BLAST to identify homologous gene sequences. +Genome Res., 21(5): 936-949.

+
+
+ensembl.tools.anno.protein_annotation.genblast.run_genblast(masked_genome: Path, output_dir: Path, protein_dataset: Path, max_intron_length: int, genblast_timeout_secs: int = 10800, genblast_bin: Path = PosixPath('genblast'), convert2blastmask_bin: Path = PosixPath('convert2blastmask'), makeblastdb_bin: Path = PosixPath('makeblastdb'), num_threads: int = 1, protein_set: str = ['uniprot', 'orthodb']) None[source]
+

Executes GenBlast on genomic slices

+
+
+
param masked_genome:
+

Masked genome file path.

+
+
type masked_genome:
+

Path

+
+
param output_dir:
+

Working directory path.

+
+
type output_dir:
+

Path

+
+
param protein_dataset:
+

Protein dataset (Uniprot/OrthoDb) path.

+
+
type protein_dataset:
+

Path

+
+
param genblast_timeout_secs:
+

Timeout in seconds.

+
+
type genblast_timeout_secs:
+

int, default 10800

+
+
param max_intron_length:
+

Maximum intron length.

+
+
type max_intron_length:
+

int

+
+
param genblast_bin:
+

Software path.

+
+
type genblast_bin:
+

Path, default genblast

+
+
param convert2blastmask_bin:
+

Software path.

+
+
type convert2blastmask_bin:
+

Path, default convert2blastmask

+
+
param makeblastdb_bin:
+

Software path.

+
+
type makeblastdb_bin:
+

Path, default makeblastdb

+
+
param num_threads:
+

Number of threads.

+
+
type num_threads:
+

int, default 1

+
+
param protein_set:
+

Protein dataset source.

+
+
type protein_set:
+

str, one of “uniprot” or “orthodb”

+
+
return:
+

None

+
+
rtype:
+

None

+
+
+
+
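A minimal, hedged usage sketch based on the signature above; the input files are hypothetical placeholders, and genblast, convert2blastmask, and makeblastdb are assumed to be on PATH. The masked genome can come from run_red, which returns the masked genome path:

from pathlib import Path

from ensembl.tools.anno.protein_annotation.genblast import run_genblast

# Hypothetical inputs; adjust the paths to your data layout.
run_genblast(
    masked_genome=Path("genome.msk.fa"),       # e.g. the file returned by run_red
    output_dir=Path("anno_output"),
    protein_dataset=Path("uniprot_proteins.fa"),
    max_intron_length=100000,
    genblast_timeout_secs=10800,               # stop slices running longer than 3 h
    num_threads=4,
    protein_set="uniprot",                     # "uniprot" or "orthodb"
)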
+ +
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/genindex.html b/genindex.html new file mode 100644 index 0000000..b3acfbe --- /dev/null +++ b/genindex.html @@ -0,0 +1,272 @@ + + + + + + + Index — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ + +

Index

+ +
+ E + | M + | R + +
+

E

+ + + +
    +
  • + ensembl.tools.anno.protein_annotation.genblast + +
  • +
  • + ensembl.tools.anno.repeat_annotation.dust + +
  • +
  • + ensembl.tools.anno.repeat_annotation.red + +
  • +
  • + ensembl.tools.anno.repeat_annotation.repeatmasker + +
  • +
  • + ensembl.tools.anno.repeat_annotation.trf + +
  • +
  • + ensembl.tools.anno.simple_feature_annotation.cpg + +
  • +
    +
  • + ensembl.tools.anno.simple_feature_annotation.eponine + +
  • +
  • + ensembl.tools.anno.snc_rna_annotation.trnascan + +
  • +
  • + ensembl.tools.anno.transcriptomic_annotation.minimap + +
  • +
  • + ensembl.tools.anno.transcriptomic_annotation.scallop + +
  • +
  • + ensembl.tools.anno.transcriptomic_annotation.star + +
  • +
  • + ensembl.tools.anno.transcriptomic_annotation.stringtie + +
  • +
+ +

M

+ + +
+ +

R

+ + + +
+ + + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/html/output.json b/html/output.json new file mode 100644 index 0000000..371ed0a --- /dev/null +++ b/html/output.json @@ -0,0 +1,9 @@ +{"filename": "index.rst", "lineno": 56, "status": "unchecked", "code": 0, "uri": "", "info": ""} +{"filename": "index.rst", "lineno": 30, "status": "unchecked", "code": 0, "uri": "#install", "info": ""} +{"filename": "license.rst", "lineno": 197, "status": "redirected", "code": 301, "uri": "http://www.apache.org/licenses/LICENSE-2.0", "info": "https://www.apache.org/licenses/LICENSE-2.0"} +{"filename": "license.rst", "lineno": 4, "status": "redirected", "code": 301, "uri": "http://www.apache.org/licenses/", "info": "https://www.apache.org/licenses/"} +{"filename": "install.rst", "lineno": 29, "status": "working", "code": 0, "uri": "https://github.com/Ensembl/ensembl-orm", "info": ""} +{"filename": "install.rst", "lineno": 27, "status": "working", "code": 0, "uri": "https://github.com/Ensembl/ensembl-analysis/tree/dev/hive_master", "info": ""} +{"filename": "install.rst", "lineno": 26, "status": "working", "code": 0, "uri": "https://github.com/Ensembl/ensembl-production", "info": ""} +{"filename": "install.rst", "lineno": 28, "status": "working", "code": 0, "uri": "https://github.com/Ensembl/ensembl-taxonomy", "info": ""} +{"filename": "red.rst", "lineno": 1, "status": "redirected", "code": 302, "uri": "https://doi.org/10.1186/s12859-015-0654-5", "info": "https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-015-0654-5"} diff --git a/html/output.txt b/html/output.txt new file mode 100644 index 0000000..3ee86ee --- /dev/null +++ b/html/output.txt @@ -0,0 +1,3 @@ +license.rst:197: [redirected permanently] http://www.apache.org/licenses/LICENSE-2.0 to https://www.apache.org/licenses/LICENSE-2.0 +license.rst:4: [redirected permanently] http://www.apache.org/licenses/ to https://www.apache.org/licenses/ +red.rst:1: [redirected with Found] https://doi.org/10.1186/s12859-015-0654-5 to https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-015-0654-5 diff --git a/index.html b/index.html new file mode 100644 index 0000000..013d0e4 --- /dev/null +++ b/index.html @@ -0,0 +1,149 @@ + + + + + + + + Ensembl-anno — ensembl-anno 0.1 documentation + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/install.html b/install.html new file mode 100644 index 0000000..19ce844 --- /dev/null +++ b/install.html @@ -0,0 +1,155 @@ + + + + + + + + API Setup and installation — ensembl-anno 0.1 documentation + + + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

API Setup and installation

+
+

Requirements

+

An Ensembl API checkout including:

+ +
+

Software

+
    +
  1. Python 3.8+

  2. +
  3. Bioperl 1.6.9+

  4. +
+
+
+

Python Modules

+
    +
  1. argschema

  2. +
+
+
+
+

Installation

+

Directly from GitHub:

+
git clone https://github.com/Ensembl/ensembl-analysis -b experimental/gbiab
+git clone https://github.com/Ensembl/ensembl-production
+git clone https://github.com/Ensembl/ensembl-hive
+git clone https://github.com/Ensembl/ensembl-taxonomy
+git clone https://github.com/Ensembl/ensembl-orm
+
+
+
+
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/license.html b/license.html new file mode 100644 index 0000000..857023e --- /dev/null +++ b/license.html @@ -0,0 +1,301 @@ + + + + + + + + License — ensembl-anno 0.1 documentation + + + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

License

+
+

Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/

+
    +
  1. Definitions.

    +

    “License” shall mean the terms and conditions for use, reproduction, +and distribution as defined by Sections 1 through 9 of this document.

    +

    “Licensor” shall mean the copyright owner or entity authorized by +the copyright owner that is granting the License.

    +

    “Legal Entity” shall mean the union of the acting entity and all +other entities that control, are controlled by, or are under common +control with that entity. For the purposes of this definition, +“control” means (i) the power, direct or indirect, to cause the +direction or management of such entity, whether by contract or +otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity.

    +

    “You” (or “Your”) shall mean an individual or Legal Entity +exercising permissions granted by this License.

    +

    “Source” form shall mean the preferred form for making modifications, +including but not limited to software source code, documentation +source, and configuration files.

    +

    “Object” form shall mean any form resulting from mechanical +transformation or translation of a Source form, including but +not limited to compiled object code, generated documentation, +and conversions to other media types.

    +

    “Work” shall mean the work of authorship, whether in Source or +Object form, made available under the License, as indicated by a +copyright notice that is included in or attached to the work +(an example is provided in the Appendix below).

    +

    “Derivative Works” shall mean any work, whether in Source or Object +form, that is based on (or derived from) the Work and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. For the purposes +of this License, Derivative Works shall not include works that remain +separable from, or merely link (or bind by name) to the interfaces of, +the Work and Derivative Works thereof.

    +

    “Contribution” shall mean any work of authorship, including +the original version of the Work and any modifications or additions +to that Work or Derivative Works thereof, that is intentionally +submitted to Licensor for inclusion in the Work by the copyright owner +or by an individual or Legal Entity authorized to submit on behalf of +the copyright owner. For the purposes of this definition, “submitted” +means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, +and issue tracking systems that are managed by, or on behalf of, the +Licensor for the purpose of discussing and improving the Work, but +excluding communication that is conspicuously marked or otherwise +designated in writing by the copyright owner as “Not a Contribution.”

    +

    “Contributor” shall mean Licensor and any individual or Legal Entity +on behalf of whom a Contribution has been received by Licensor and +subsequently incorporated within the Work.

    +
  2. +
  3. Grant of Copyright License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the +Work and such Derivative Works in Source or Object form.

  4. +
  5. Grant of Patent License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except as stated in this section) patent license to make, have made, +use, offer to sell, sell, import, and otherwise transfer the Work, +where such license applies only to those patent claims licensable +by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) +with the Work to which such Contribution(s) was submitted. If You +institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work +or a Contribution incorporated within the Work constitutes direct +or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate +as of the date such litigation is filed.

  6. +
  7. Redistribution. You may reproduce and distribute copies of the +Work or Derivative Works thereof in any medium, with or without +modifications, and in Source or Object form, provided that You +meet the following conditions:

    +
      +
    1. You must give any other recipients of the Work or +Derivative Works a copy of this License; and

    2. +
    3. You must cause any modified files to carry prominent notices +stating that You changed the files; and

    4. +
    5. You must retain, in the Source form of any Derivative Works +that You distribute, all copyright, patent, trademark, and +attribution notices from the Source form of the Work, +excluding those notices that do not pertain to any part of +the Derivative Works; and

    6. +
    7. If the Work includes a “NOTICE” text file as part of its +distribution, then any Derivative Works that You distribute must +include a readable copy of the attribution notices contained +within such NOTICE file, excluding those notices that do not +pertain to any part of the Derivative Works, in at least one +of the following places: within a NOTICE text file distributed +as part of the Derivative Works; within the Source form or +documentation, if provided along with the Derivative Works; or, +within a display generated by the Derivative Works, if and +wherever such third-party notices normally appear. The contents +of the NOTICE file are for informational purposes only and +do not modify the License. You may add Your own attribution +notices within Derivative Works that You distribute, alongside +or as an addendum to the NOTICE text from the Work, provided +that such additional attribution notices cannot be construed +as modifying the License.

    8. +
    +

    You may add Your own copyright statement to Your modifications and +may provide additional or different license terms and conditions +for use, reproduction, or distribution of Your modifications, or +for any such Derivative Works as a whole, provided Your use, +reproduction, and distribution of the Work otherwise complies with +the conditions stated in this License.

    +
  8. +
  9. Submission of Contributions. Unless You explicitly state otherwise, +any Contribution intentionally submitted for inclusion in the Work +by You to the Licensor shall be under the terms and conditions of +this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify +the terms of any separate license agreement you may have executed +with Licensor regarding such Contributions.

  10. +
  11. Trademarks. This License does not grant permission to use the trade +names, trademarks, service marks, or product names of the Licensor, +except as required for reasonable and customary use in describing the +origin of the Work and reproducing the content of the NOTICE file.

  12. +
  13. Disclaimer of Warranty. Unless required by applicable law or +agreed to in writing, Licensor provides the Work (and each +Contributor provides its Contributions) on an “AS IS” BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied, including, without limitation, any warranties or conditions +of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any +risks associated with Your exercise of permissions under this License.

  14. +
  15. Limitation of Liability. In no event and under no legal theory, +whether in tort (including negligence), contract, or otherwise, +unless required by applicable law (such as deliberate and grossly +negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a +result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses), even if such Contributor +has been advised of the possibility of such damages.

  16. +
  17. Accepting Warranty or Additional Liability. While redistributing +the Work or Derivative Works thereof, You may choose to offer, +and charge a fee for, acceptance of support, warranty, indemnity, +or other liability obligations and/or rights consistent with this +License. However, in accepting such obligations, You may act only +on Your own behalf and on Your sole responsibility, not on behalf +of any other Contributor, and only if You agree to indemnify, +defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason +of your accepting any such warranty or additional liability.

  18. +
+

END OF TERMS AND CONDITIONS

+

APPENDIX: How to apply the Apache License to your work.

+
+

To apply the Apache License to your work, attach the following +boilerplate notice, with the fields enclosed by brackets “{}” +replaced with your own identifying information. (Don’t include +the brackets!) The text should be enclosed in the appropriate +comment syntax for the file format. We also recommend that a +file or class name and description of purpose be included on the +same “printed page” as the copyright notice for easier +identification within third-party archives.

+
+

Copyright [yyyy] [name of copyright owner]

+

Licensed under the Apache License, Version 2.0 (the “License”); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at

+
+
+

Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an “AS IS” BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.

+
+
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/minimap.html b/minimap.html new file mode 100644 index 0000000..6965c6d --- /dev/null +++ b/minimap.html @@ -0,0 +1,184 @@ + + + + + + + + Minimap2 Module Documentation — ensembl-anno 0.1 documentation + + + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

Minimap2 Module Documentation

+

Minimap2 is a pairwise sequence alignment algorithm designed for efficiently comparing nucleotide sequences. +The algorithm uses a versatile indexing strategy to quickly find approximate matches between sequences, +allowing it to efficiently align long sequences against reference genomes or other sequences.

+

Li, H. (2018). Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics, 34(18), 3094-3100.

+
+
+ensembl.tools.anno.transcriptomic_annotation.minimap.run_minimap2(output_dir: Path, long_read_fastq_dir: Path, genome_file: Path, minimap2_bin: Path = PosixPath('minimap2'), paftools_bin: Path = PosixPath('paftools.js'), max_intron_length: int = 100000, num_threads: int = 1) None[source]
+

Run Minimap2 to align long-read data against the genome file.
+The default Minimap2 settings are tuned for PacBio data.

+
+
+
param output_dir:
+

Working directory path.

+
+
type output_dir:
+

Path

+
+
param long_read_fastq_dir:
+

Long read directory path.

+
+
type long_read_fastq_dir:
+

Path

+
+
param genome_file:
+

Genome file path.

+
+
type genome_file:
+

Path

+
+
param minimap2_bin:
+

Software path.

+
+
type minimap2_bin:
+

Path, default minimap2

+
+
param paftools_bin:
+

Software path.

+
+
type paftools_bin:
+

Path, default paftools.js

+
+
param max_intron_length:
+

The maximum intron size for alignments. Defaults to 100000.

+
+
type max_intron_length:
+

int, default 100000

+
+
param num_threads:
+

Number of available threads.

+
+
type num_threads:
+

int, default 1

+
+
return:
+

None

+
+
rtype:
+

None

+
+
+
+
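A minimal, hedged usage sketch; the paths are hypothetical, and minimap2 plus paftools.js are assumed to be on PATH:

from pathlib import Path

from ensembl.tools.anno.transcriptomic_annotation.minimap import run_minimap2

# Hypothetical inputs; adjust the paths to your data layout.
run_minimap2(
    output_dir=Path("anno_output"),
    long_read_fastq_dir=Path("long_reads"),  # directory holding the FASTQ files
    genome_file=Path("genome.fa"),
    max_intron_length=100000,                # cap intron size in the alignments
    num_threads=8,
)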
+ +
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/objects.inv b/objects.inv new file mode 100644 index 0000000..3e22f49 Binary files /dev/null and b/objects.inv differ diff --git a/py-modindex.html b/py-modindex.html new file mode 100644 index 0000000..6668401 --- /dev/null +++ b/py-modindex.html @@ -0,0 +1,185 @@ + + + + + + + Python Module Index — ensembl-anno 0.1 documentation + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/red.html b/red.html new file mode 100644 index 0000000..bddc245 --- /dev/null +++ b/red.html @@ -0,0 +1,160 @@ + + + + + + + + Red Module Documentation — ensembl-anno 0.1 documentation + + + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

Red Module Documentation

+

Red is the first repeat-detection tool capable of labeling its training data +and training itself automatically on an entire genome. +Girgis, H.Z. Red: an intelligent, rapid, accurate tool for detecting repeats +de-novo on the genomic scale. BMC Bioinformatics 16, 227 (2015). +https://doi.org/10.1186/s12859-015-0654-5

+
+
+ensembl.tools.anno.repeat_annotation.red.run_red(genome_file: Path, output_dir: Path, red_bin: Path = PosixPath('Red')) str[source]
+
+
Run Red on the genome file
+
param genome_file:
+

Genome file path.

+
+
type genome_file:
+

Path

+
+
param output_dir:
+

Working directory path.

+
+
type output_dir:
+

Path

+
+
param red_bin:
+

Red software path.

+
+
type red_bin:
+

Path, default Red

+
+
return:
+

Masked genome file

+
+
rtype:
+

str
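
A minimal usage sketch (the paths below are hypothetical placeholders)::

    from pathlib import Path

    from ensembl.tools.anno.repeat_annotation.red import run_red

    # run_red returns the path to the masked genome file as a string.
    masked_genome = run_red(
        genome_file=Path("/data/genome.fa"),
        output_dir=Path("/tmp/anno/red"),
    )
    print(masked_genome)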
diff --git a/repeatmasker.html b/repeatmasker.html
new file mode 100644
index 0000000..8c8c3c2
--- /dev/null
+++ b/repeatmasker.html
Repeatmasker Module Documentation
RepeatMasker is a program that screens DNA sequences for interspersed repeats and low complexity DNA sequences.
Smit, AFA, Hubley, R & Green, P. RepeatMasker Open-4.0
ensembl.tools.anno.repeat_annotation.repeatmasker.run_repeatmasker(genome_file: PathLike, output_dir: Path, repeatmasker_bin: Path = PosixPath('RepeatMasker'), library: str = '', repeatmasker_engine: str = 'rmblast', species: str = '', num_threads: int = 1) → None
Executes RepeatMasker on the genome slices and stores the final annotation.gtf in the repeatmasker_output directory.

:param genome_file: Genome file path.
:type genome_file: PathLike
:param output_dir: Output directory path.
:type output_dir: Path
:param repeatmasker_bin: RepeatMasker executable path.
:type repeatmasker_bin: Path, default RepeatMasker
:param library: Custom repeat library.
:type library: str
:param repeatmasker_engine: RepeatMasker search engine.
:type repeatmasker_engine: str, default rmblast
:param species: Species name.
:type species: str
:param num_threads: Number of threads.
:type num_threads: int, default 1
:return: None
:rtype: None
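
A minimal usage sketch (the paths and the species label are hypothetical placeholders)::

    from pathlib import Path

    from ensembl.tools.anno.repeat_annotation.repeatmasker import run_repeatmasker

    # Screen the genome with the default rmblast engine, selecting repeats
    # by species name rather than supplying a custom library.
    run_repeatmasker(
        genome_file=Path("/data/genome.fa"),
        output_dir=Path("/tmp/anno/repeatmasker"),
        species="human",  # hypothetical species label
        num_threads=4,
    )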
diff --git a/scallop.html b/scallop.html
new file mode 100644
index 0000000..b947e84
--- /dev/null
+++ b/scallop.html
Scallop Module Documentation
Scallop is a high-performance tool designed for accurate and efficient transcriptome assembly. It is capable of handling large-scale transcriptomic data while providing precise estimates of transcript abundances. Scallop’s algorithmic approach allows it to efficiently reconstruct transcript structures and quantify their expression levels, making it a valuable resource for studying gene expression and transcriptome analysis.
Shao M, Kingsford C. Accurate assembly of transcripts through phase-preserving graph decomposition. Nat Biotechnol. 2017 Dec;35(12):1167-1169. doi: 10.1038/nbt.4020. Epub 2017 Nov 13. PMID: 29131147; PMCID: PMC5722698.
ensembl.tools.anno.transcriptomic_annotation.scallop.run_scallop(output_dir: Path, scallop_bin: Path = PosixPath('scallop'), prlimit_bin: Path = PosixPath('prlimit'), stringtie_bin: Path = PosixPath('stringtie'), memory_limit: int = 42949672960) → None
Run the Scallop assembler on short-read data after STAR alignment.

:param output_dir: Working directory path.
:type output_dir: Path
:param scallop_bin: Scallop software path.
:type scallop_bin: Path, default scallop
:param prlimit_bin: prlimit software path.
:type prlimit_bin: Path, default prlimit
:param stringtie_bin: StringTie software path.
:type stringtie_bin: Path, default stringtie
:param memory_limit: Memory limit for the Scallop command. Defaults to 42949672960 (40*1024**3).
:type memory_limit: int
:return: None
:rtype: None
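
A minimal usage sketch (the path below is a hypothetical placeholder; STAR alignments are assumed to already exist under the working directory)::

    from pathlib import Path

    from ensembl.tools.anno.transcriptomic_annotation.scallop import run_scallop

    # Assemble transcripts from existing STAR alignments, capping the
    # Scallop command at the default 40*1024**3 memory limit.
    run_scallop(
        output_dir=Path("/tmp/anno"),
        memory_limit=40 * 1024**3,
    )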
diff --git a/search.html b/search.html
new file mode 100644
index 0000000..85eee41
--- /dev/null
+++ b/search.html
diff --git a/searchindex.js b/searchindex.js
new file mode 100644
index 0000000..577d095
--- /dev/null
+++ b/searchindex.js
diff --git a/star.html b/star.html
new file mode 100644
index 0000000..1186a69
--- /dev/null
+++ b/star.html
STAR Module Documentation
The STAR (Spliced Transcripts Alignment to a Reference) alignment tool is widely used in genomics research for aligning RNA-seq data to a reference genome.
Dobin A, Davis CA, Schlesinger F, et al. STAR: ultrafast universal RNA-seq aligner. Bioinformatics. 2013;29(1):15-21. doi:10.1093/bioinformatics/bts635
ensembl.tools.anno.transcriptomic_annotation.star.run_star(genome_file: Path, output_dir: Path, short_read_fastq_dir: Path, delete_pre_trim_fastq: bool = False, trim_fastq: bool = False, max_reads_per_sample: int = 0, max_intron_length: int = 100000, num_threads: int = 1, star_bin: Path = PosixPath('star'), samtools_bin: Path = PosixPath('samtools'), trim_galore_bin: Path = PosixPath('trim_galore')) → None
Run STAR alignment on a list of short-read data.

:param genome_file: Genome file path.
:type genome_file: Path
:param output_dir: Working directory path.
:type output_dir: Path
:param short_read_fastq_dir: Short-read directory path.
:type short_read_fastq_dir: Path
:param delete_pre_trim_fastq: Delete the original FASTQ files after trimming. Defaults to False.
:type delete_pre_trim_fastq: bool, default False
:param trim_fastq: Trim short-read files using TrimGalore. Defaults to False.
:type trim_fastq: bool, default False
:param max_reads_per_sample: Maximum number of reads per sample. Defaults to 0 (unlimited).
:type max_reads_per_sample: int, default 0
:param max_intron_length: The maximum intron size for alignments. Defaults to 100000.
:type max_intron_length: int, default 100000
:param num_threads: Number of available threads.
:type num_threads: int, default 1
:param star_bin: STAR software path.
:type star_bin: Path, default star
:param samtools_bin: samtools software path.
:type samtools_bin: Path, default samtools
:param trim_galore_bin: TrimGalore software path.
:type trim_galore_bin: Path, default trim_galore
:return: None
:rtype: None
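
A minimal usage sketch (the paths below are hypothetical placeholders)::

    from pathlib import Path

    from ensembl.tools.anno.transcriptomic_annotation.star import run_star

    # Trim the FASTQ files with TrimGalore first, then align them
    # against the genome with STAR.
    run_star(
        genome_file=Path("/data/genome.fa"),
        output_dir=Path("/tmp/anno"),
        short_read_fastq_dir=Path("/data/short_reads"),
        trim_fastq=True,
        num_threads=8,
    )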
diff --git a/stringtie.html b/stringtie.html
new file mode 100644
index 0000000..edc6e83
--- /dev/null
+++ b/stringtie.html
Stringtie Module Documentation
StringTie is a fast and highly efficient assembler of RNA-Seq alignments into potential transcripts. It uses a novel network flow algorithm as well as an optional de novo assembly step to assemble and quantitate full-length transcripts representing multiple splice variants for each gene locus.
Pertea M, Pertea GM, Antonescu CM, Chang TC, Mendell JT & Salzberg SL. StringTie enables improved reconstruction of a transcriptome from RNA-seq reads. Nature Biotechnology 2015, doi:10.1038/nbt.3122
ensembl.tools.anno.transcriptomic_annotation.stringtie.run_stringtie(output_dir: Path, stringtie_bin: Path = PosixPath('stringtie'), num_threads: int = 1) → None
Run the StringTie assembler on short-read data.

:param output_dir: Working directory path.
:type output_dir: Path
:param stringtie_bin: StringTie software path.
:type stringtie_bin: Path, default stringtie
:param num_threads: Number of available threads.
:type num_threads: int, default 1
:return: None
:rtype: None
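
A minimal usage sketch (the path below is a hypothetical placeholder; as with Scallop, short-read alignments are assumed to already exist under the working directory)::

    from pathlib import Path

    from ensembl.tools.anno.transcriptomic_annotation.stringtie import run_stringtie

    # Assemble transcripts from existing short-read alignments.
    run_stringtie(
        output_dir=Path("/tmp/anno"),
        num_threads=4,
    )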
diff --git a/trf.html b/trf.html
new file mode 100644
index 0000000..820f45c
--- /dev/null
+++ b/trf.html
TRF Module Documentation
Tandem Repeats Finder is a program to locate and display tandem repeats in DNA sequences.
Benson G. Tandem repeats finder: a program to analyze DNA sequences. Nucleic Acids Res. 1999; 27(2):573–580. doi:10.1093/nar/27.2.573
ensembl.tools.anno.repeat_annotation.trf.run_trf(genome_file: PathLike, output_dir: Path, num_threads: int = 1, trf_bin: Path = PosixPath('trf'), match_score: int = 2, mismatch_score: int = 5, delta: int = 7, pm: int = 80, pi: int = 10, minscore: int = 40, maxperiod: int = 500) → None
Executes TRF on genomic slices.

:param genome_file: Genome file path.
:type genome_file: PathLike
:param output_dir: Working directory path.
:type output_dir: Path
:param num_threads: Number of threads.
:type num_threads: int, default 1
:param trf_bin: TRF software path.
:type trf_bin: Path, default trf
:param match_score: Matching weight.
:type match_score: int, default 2
:param mismatch_score: Mismatching penalty.
:type mismatch_score: int, default 5
:param delta: Indel penalty.
:type delta: int, default 7
:param pm: Match probability (whole number).
:type pm: int, default 80
:param pi: Indel probability (whole number).
:type pi: int, default 10
:param minscore: Minimum alignment score to report.
:type minscore: int, default 40
:param maxperiod: Maximum period size to report.
:type maxperiod: int, default 500
:return: None
:rtype: None
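
A minimal usage sketch (the paths below are hypothetical placeholders)::

    from pathlib import Path

    from ensembl.tools.anno.repeat_annotation.trf import run_trf

    # Annotate tandem repeats using the documented default scoring
    # parameters (match 2, mismatch 5, delta 7, pm 80, pi 10).
    run_trf(
        genome_file=Path("/data/genome.fa"),
        output_dir=Path("/tmp/anno/trf"),
        num_threads=4,
    )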
diff --git a/trnascan.html b/trnascan.html
new file mode 100644
index 0000000..b4c7bc7
--- /dev/null
+++ b/trnascan.html
tRNAscan-SE Module Documentation
tRNAscan-SE identifies 99-100% of transfer RNA genes in DNA sequence while giving less than one false positive per 15 gigabases.
Lowe TM, Eddy SR: tRNAscan-SE: a program for improved detection of transfer RNA genes in genomic sequence. Nucleic Acids Res. 1997, 25(5):955-64. [PMID: 9023104]
ensembl.tools.anno.snc_rna_annotation.trnascan.run_trnascan(genome_file: PathLike, output_dir: Path, trnascan_bin: Path = PosixPath('tRNAscan-SE'), trnascan_filter: Path = PosixPath('EukHighConfidenceFilter'), num_threads: int = 1) → None
Executes tRNAscan-SE on genomic slices.

:param genome_file: Genome file path.
:type genome_file: PathLike
:param output_dir: Working directory path.
:type output_dir: Path
:param trnascan_bin: tRNAscan-SE software path.
:type trnascan_bin: Path, default tRNAscan-SE
:param trnascan_filter: tRNAscan-SE filter set path.
:type trnascan_filter: Path, default EukHighConfidenceFilter
:param num_threads: Number of threads.
:type num_threads: int, default 1
:return: None
:rtype: None
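
A minimal usage sketch (the paths below are hypothetical placeholders)::

    from pathlib import Path

    from ensembl.tools.anno.snc_rna_annotation.trnascan import run_trnascan

    # Scan the genome for tRNA genes; candidates are filtered with the
    # default EukHighConfidenceFilter.
    run_trnascan(
        genome_file=Path("/data/genome.fa"),
        output_dir=Path("/tmp/anno/trnascan"),
        num_threads=4,
    )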