Merge pull request #139 from labgem/gff_output

Add proksee and gff output
labgem · Oct 27, 2023 · faf2e50 · faf2e50
2 parents 6f7e806 + d58a253
commit faf2e50
Show file tree

Hide file tree

Showing 23 changed files with 1,200 additions and 333 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -67,7 +67,7 @@ jobs:
         ppanggolin spot -p stepbystep/pangenome.h5 --spot_graph --overlapping_match 2 --set_size 3 --exact_match_size 1
         ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots -o stepbystep -f
         ppanggolin module -p stepbystep/pangenome.h5 --transitive 4 --size 3 --jaccard 0.86 --dup_margin 0.05
-        ppanggolin write -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --projection --stats --partitions --compress --json --regions --spots --borders --families_tsv --cpu 1
+        ppanggolin write -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --projection --stats --partitions --compress --json --regions --spots --borders --families_tsv --cpu 1 --fasta organisms.fasta.list --gff --proksee
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta organisms.fasta.list
         ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
         ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --info_modules --no_print_info -f --log metrics.log
@@ -134,10 +134,10 @@ jobs:
       run: |
         cd testingDataset
         head organisms.gbff.list | sed 's/^/input_org_/g' > organisms.gbff.head.list
-        ppanggolin projection --pangenome stepbystep/pangenome.h5  -o projection_from_lisy_of_gbff --anno organisms.gbff.head.list 
+        ppanggolin projection --pangenome stepbystep/pangenome.h5  -o projection_from_list_of_gbff --anno organisms.gbff.head.list --gff --proksee
 
 
         ppanggolin projection --pangenome mybasicpangenome/pangenome.h5  -o projection_from_single_fasta \
                               --organism_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \
-                              --spot_graph --graph_formats graphml --fast --keep_tmp -f
+                              --spot_graph --graph_formats graphml --fast --keep_tmp -f --add_sequences --gff --proksee
 
diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py
@@ -278,10 +278,10 @@ def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq
 
     partition_proj = output.absolute() / "sequences_partition_projection.tsv"
     with open(partition_proj, "w") as partProjFile:
-        for input_seq, pangFam in seqid_to_gene_family.items():
-            partProjFile.write(input_seq + "\t" + pangFam.named_partition + "\n")
-        for remainingSeq in seq_set - seqid_to_gene_family.keys():
-            partProjFile.write(remainingSeq + "\tcloud\n")  # if there is no hit, it's going to be cloud genes.
+        for input_seq, gene_fam in seqid_to_gene_family.items():
+            partProjFile.write(input_seq + "\t" + gene_fam.named_partition + "\n")
+        for remaining_seq in seq_set - seqid_to_gene_family.keys():
+            partProjFile.write(remaining_seq + "\tcloud\n")  # if there is no hit, it's going to be cloud genes.
     return partition_proj
 
 
@@ -298,11 +298,11 @@ def write_gene_to_gene_family(seqid_to_gene_family: Dict[str, GeneFamily], seq_s
 
     gene_fam_map_file = output.absolute() / "gene_to_gene_family.tsv"
     with open(gene_fam_map_file, "w") as partProjFile:
-        for input_seq, pangFam in seqid_to_gene_family.items():
-            partProjFile.write(f"{input_seq}\t{pangFam.name}\n")
+        for input_seq, gene_fam in seqid_to_gene_family.items():
+            partProjFile.write(f"{input_seq}\t{gene_fam.name}\n")
 
-        for remainingSeq in seq_set - seqid_to_gene_family.keys():
-            partProjFile.write(f"{remainingSeq}\t{remainingSeq}\n")  # if there is no hit, gene family is itself.
+        for remaining_seq in seq_set - seqid_to_gene_family.keys():
+            partProjFile.write(f"{remaining_seq}\t{remaining_seq}\n")  # if there is no hit, gene family is itself.
 
     return gene_fam_map_file
 
@@ -517,9 +517,8 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Itera
 
 def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: float = 0.8,
           coverage: float = 0.8, no_defrag: bool = False, cpu: int = 1, getinfo: bool = False,
-          use_representatives: bool = False,
-          draw_related: bool = False, translation_table: int = 11, tmpdir: Path = None,
-          disable_bar: bool = False, keep_tmp=False):
+          use_representatives: bool = False, draw_related: bool = False, translation_table: int = 11, 
+          tmpdir: Path = None, disable_bar: bool = False, keep_tmp=False):
     """
     Aligns pangenome sequences with sequences in a FASTA file using MMSeqs2.
 

diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py
@@ -238,6 +238,9 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li
         while not line.startswith('//'):
             sequence += line[10:].replace(" ", "").strip().upper()
             line = lines.pop()
+
+        if contig.length != len(sequence):
+            raise ValueError("The contig lenght defined is different than the sequence length")
         # get each gene's sequence.
         for gene in contig.genes:
             gene.add_sequence(get_dna_sequence(sequence, gene))
@@ -315,7 +318,7 @@ def get_id_attribute(attributes_dict: dict) -> str:
                                         True if fields[1] in circular_contigs else False)
                         contig_counter.value += 1
                     org.add(contig)
-                    contig.length = int(fields[-1]) - int(fields[3]) + 1
+                    contig.length = int(fields[-1]) - int(fields[2]) + 1
 
                 continue
             elif line.startswith('#'):  # comment lines to be ignores by parsers
@@ -368,6 +371,9 @@ def get_id_attribute(attributes_dict: dict) -> str:
     if has_fasta and fasta_string != "":
         contig_sequences = read_fasta(org, fasta_string.split('\n'))  # _ is total contig length
         for contig in org.contigs:
+            if contig.length != len(contig_sequences[contig.name]):
+                raise ValueError("The contig lenght defined is different than the sequence length")
+
             for gene in contig.genes:
                 gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene))
             for rna in contig.RNAs:

diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py
@@ -169,7 +169,6 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str,
     :return: Dictionnary with contig_name as keys and contig sequence in values
     """
     global contig_counter
-
     try:
         contigs = {}
         contig_seq = ""

diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py
@@ -309,8 +309,6 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool =
         newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir)
         tmp_path = Path(newtmpdir.name)
 
-    # newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir)
-    # tmp_path = Path(newtmpdir.name)
     with open(tmp_path/'nucleotid_sequences', "w") as sequence_file:
         check_pangenome_for_clustering(pangenome, sequence_file, force, disable_bar=disable_bar)
         logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...")

diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py
@@ -9,7 +9,7 @@
 import logging
 import os
 from typing import List, Dict, Tuple, Iterable, Hashable, Iterator, Set
-from itertools import zip_longest, chain
+from itertools import chain
 from collections import defaultdict
 from pathlib import Path
 
@@ -21,7 +21,7 @@
 # local libraries
 from ppanggolin.formats import check_pangenome_info
 from ppanggolin.genome import Gene, Contig
-from ppanggolin.utils import mk_outdir, restricted_float, create_tmpdir, read_compressed_or_not
+from ppanggolin.utils import mk_outdir, restricted_float, create_tmpdir, read_compressed_or_not, extract_contig_window
 from ppanggolin.pangenome import Pangenome
 from ppanggolin.align.alignOnPang import project_and_write_partition, get_input_seq_to_family_with_rep, \
     get_input_seq_to_family_with_all, get_seq_ids
@@ -445,63 +445,6 @@ def get_n_next_genes_index(current_index: int, next_genes_count: int,
         yield next_gene_index
 
 
-def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int,
-                          is_circular: bool = False):
-    """
-    Extracts contiguous windows around positions of interest within a contig.
-
-    :param contig_size: Number of genes in contig.
-    :param positions_of_interest: An iterable containing the positions of interest.
-    :param window_size: The size of the window to extract around each position of interest.
-    :param is_circular: Indicates if the contig is circular.
-    :return: Yields tuples representing the start and end positions of each contiguous window.
-    """
-    windows_coordinates = []
-
-    # Sort the positions of interest
-    sorted_positions = sorted(positions_of_interest)
-
-    # Check if any position of interest is out of range
-    if sorted_positions[0] < 0 or sorted_positions[-1] >= contig_size:
-        raise IndexError(f'Positions of interest are out of range. '
-                         f"Contig has {contig_size} genes while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions")
-
-    if is_circular:
-        first_position = sorted_positions[0]
-        last_position = sorted_positions[-1]
-        # in a circular contig, if the window of a gene of interest overlaps the end/start of the contig
-        # an out of scope position is added to the sorted positions to take into account those positions
-        # the returned window are always checked that its positions are not out of range... 
-        # so there's no chance to find an out of scope position in final list
-        if first_position - window_size < 0:
-            out_of_scope_position = contig_size + first_position
-            sorted_positions.append(out_of_scope_position)
-
-        if last_position + window_size >= contig_size:
-            out_of_scope_position = last_position - contig_size
-            sorted_positions.insert(0, out_of_scope_position)
-
-    start_po = max(sorted_positions[0] - window_size, 0)
-
-    for position, next_po in zip_longest(sorted_positions, sorted_positions[1:]):
-
-        if next_po is None:
-            # If there are no more positions, add the final window
-            end_po = min(position + window_size, contig_size - 1)
-            windows_coordinates.append((start_po, end_po))
-
-        elif position + window_size + 1 < next_po - window_size:
-            # If there is a gap between positions, add the current window 
-            # and update the start position for the next window
-            end_po = min(position + window_size, contig_size - 1)
-
-            windows_coordinates.append((start_po, end_po))
-
-            start_po = max(next_po - window_size, 0)
-
-    return windows_coordinates
-
-
 def get_contig_to_genes(gene_families: Iterable[GeneFamily]) -> Dict[Contig, Set[Gene]]:
     """
     Group genes from specified gene families by contig.

diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py
@@ -647,10 +647,10 @@ def draw_spots(pangenome: Pangenome, output: Path, spot_list: str, disable_bar:
                          need_rgp=True, need_spots=True, need_modules=need_mod, disable_bar=disable_bar)
 
     if spot_list == 'all' or any(x == 'all' for x in spot_list):
-        logging.getLogger("PPanGGOLiN").debug(f"'all' value is found in spot list, all spots are drawn.")
+        logging.getLogger("PPanGGOLiN").debug("'all' value is found in spot list, all spots are drawn.")
         selected_spots = list(pangenome.spots)
     elif spot_list == "synteny" or any(x == 'synteny' for x in spot_list):
-        logging.getLogger().debug(f"'synteny' value is found in spot list, all spots with more than 1 conserved synteny are drawn.")
+        logging.getLogger().debug("'synteny' value is found in spot list, all spots with more than 1 conserved synteny are drawn.")
         selected_spots = [s for s in pangenome.spots if len(s.get_uniq_ordered_set()) > 1]
     else:
         curated_spot_list = {'spot_' + str(s) if not s.startswith("spot_") else str(s) for s in spot_list}

diff --git a/ppanggolin/figures/tile_plot.py b/ppanggolin/figures/tile_plot.py
@@ -173,7 +173,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di
                        shapes=shapes,
                        plot_bgcolor='#ffffff')
     logging.getLogger("PPanGGOLiN").info("Drawing the figure itself...")
-    #fig = go.Figure(data=[heatmap], layout=layout)
+
     fig = go.Figure(data=[heatmap])
 
     fig.add_trace(go.Scatter(x=dendro_org['icoord'],