Skip to content

Commit

Permalink
Merge pull request #139 from labgem/gff_output
Browse files Browse the repository at this point in the history
Add proksee and gff output
  • Loading branch information
jpjarnoux authored Oct 27, 2023
2 parents 6f7e806 + d58a253 commit faf2e50
Show file tree
Hide file tree
Showing 23 changed files with 1,200 additions and 333 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ jobs:
ppanggolin spot -p stepbystep/pangenome.h5 --spot_graph --overlapping_match 2 --set_size 3 --exact_match_size 1
ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots -o stepbystep -f
ppanggolin module -p stepbystep/pangenome.h5 --transitive 4 --size 3 --jaccard 0.86 --dup_margin 0.05
ppanggolin write -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --projection --stats --partitions --compress --json --regions --spots --borders --families_tsv --cpu 1
ppanggolin write -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --projection --stats --partitions --compress --json --regions --spots --borders --families_tsv --cpu 1 --fasta organisms.fasta.list --gff --proksee
ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta organisms.fasta.list
ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --info_modules --no_print_info -f --log metrics.log
Expand Down Expand Up @@ -134,10 +134,10 @@ jobs:
run: |
cd testingDataset
head organisms.gbff.list | sed 's/^/input_org_/g' > organisms.gbff.head.list
ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_lisy_of_gbff --anno organisms.gbff.head.list
ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_list_of_gbff --anno organisms.gbff.head.list --gff --proksee
ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_single_fasta \
--organism_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \
--spot_graph --graph_formats graphml --fast --keep_tmp -f
--spot_graph --graph_formats graphml --fast --keep_tmp -f --add_sequences --gff --proksee
21 changes: 10 additions & 11 deletions ppanggolin/align/alignOnPang.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,10 +278,10 @@ def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq

partition_proj = output.absolute() / "sequences_partition_projection.tsv"
with open(partition_proj, "w") as partProjFile:
for input_seq, pangFam in seqid_to_gene_family.items():
partProjFile.write(input_seq + "\t" + pangFam.named_partition + "\n")
for remainingSeq in seq_set - seqid_to_gene_family.keys():
partProjFile.write(remainingSeq + "\tcloud\n") # if there is no hit, it's going to be cloud genes.
for input_seq, gene_fam in seqid_to_gene_family.items():
partProjFile.write(input_seq + "\t" + gene_fam.named_partition + "\n")
for remaining_seq in seq_set - seqid_to_gene_family.keys():
partProjFile.write(remaining_seq + "\tcloud\n") # if there is no hit, it's going to be cloud genes.
return partition_proj


Expand All @@ -298,11 +298,11 @@ def write_gene_to_gene_family(seqid_to_gene_family: Dict[str, GeneFamily], seq_s

gene_fam_map_file = output.absolute() / "gene_to_gene_family.tsv"
with open(gene_fam_map_file, "w") as partProjFile:
for input_seq, pangFam in seqid_to_gene_family.items():
partProjFile.write(f"{input_seq}\t{pangFam.name}\n")
for input_seq, gene_fam in seqid_to_gene_family.items():
partProjFile.write(f"{input_seq}\t{gene_fam.name}\n")

for remainingSeq in seq_set - seqid_to_gene_family.keys():
partProjFile.write(f"{remainingSeq}\t{remainingSeq}\n") # if there is no hit, gene family is itself.
for remaining_seq in seq_set - seqid_to_gene_family.keys():
partProjFile.write(f"{remaining_seq}\t{remaining_seq}\n") # if there is no hit, gene family is itself.

return gene_fam_map_file

Expand Down Expand Up @@ -517,9 +517,8 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Itera

def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: float = 0.8,
coverage: float = 0.8, no_defrag: bool = False, cpu: int = 1, getinfo: bool = False,
use_representatives: bool = False,
draw_related: bool = False, translation_table: int = 11, tmpdir: Path = None,
disable_bar: bool = False, keep_tmp=False):
use_representatives: bool = False, draw_related: bool = False, translation_table: int = 11,
tmpdir: Path = None, disable_bar: bool = False, keep_tmp=False):
"""
Aligns pangenome sequences with sequences in a FASTA file using MMSeqs2.
Expand Down
8 changes: 7 additions & 1 deletion ppanggolin/annotate/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,9 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li
while not line.startswith('//'):
sequence += line[10:].replace(" ", "").strip().upper()
line = lines.pop()

if contig.length != len(sequence):
raise ValueError("The contig lenght defined is different than the sequence length")
# get each gene's sequence.
for gene in contig.genes:
gene.add_sequence(get_dna_sequence(sequence, gene))
Expand Down Expand Up @@ -315,7 +318,7 @@ def get_id_attribute(attributes_dict: dict) -> str:
True if fields[1] in circular_contigs else False)
contig_counter.value += 1
org.add(contig)
contig.length = int(fields[-1]) - int(fields[3]) + 1
contig.length = int(fields[-1]) - int(fields[2]) + 1

continue
elif line.startswith('#'): # comment lines to be ignores by parsers
Expand Down Expand Up @@ -368,6 +371,9 @@ def get_id_attribute(attributes_dict: dict) -> str:
if has_fasta and fasta_string != "":
contig_sequences = read_fasta(org, fasta_string.split('\n')) # _ is total contig length
for contig in org.contigs:
if contig.length != len(contig_sequences[contig.name]):
raise ValueError("The contig lenght defined is different than the sequence length")

for gene in contig.genes:
gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene))
for rna in contig.RNAs:
Expand Down
1 change: 0 additions & 1 deletion ppanggolin/annotate/synta.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,6 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str,
:return: Dictionnary with contig_name as keys and contig sequence in values
"""
global contig_counter

try:
contigs = {}
contig_seq = ""
Expand Down
2 changes: 0 additions & 2 deletions ppanggolin/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,8 +309,6 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool =
newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir)
tmp_path = Path(newtmpdir.name)

# newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir)
# tmp_path = Path(newtmpdir.name)
with open(tmp_path/'nucleotid_sequences', "w") as sequence_file:
check_pangenome_for_clustering(pangenome, sequence_file, force, disable_bar=disable_bar)
logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...")
Expand Down
61 changes: 2 additions & 59 deletions ppanggolin/context/searchGeneContext.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import logging
import os
from typing import List, Dict, Tuple, Iterable, Hashable, Iterator, Set
from itertools import zip_longest, chain
from itertools import chain
from collections import defaultdict
from pathlib import Path

Expand All @@ -21,7 +21,7 @@
# local libraries
from ppanggolin.formats import check_pangenome_info
from ppanggolin.genome import Gene, Contig
from ppanggolin.utils import mk_outdir, restricted_float, create_tmpdir, read_compressed_or_not
from ppanggolin.utils import mk_outdir, restricted_float, create_tmpdir, read_compressed_or_not, extract_contig_window
from ppanggolin.pangenome import Pangenome
from ppanggolin.align.alignOnPang import project_and_write_partition, get_input_seq_to_family_with_rep, \
get_input_seq_to_family_with_all, get_seq_ids
Expand Down Expand Up @@ -445,63 +445,6 @@ def get_n_next_genes_index(current_index: int, next_genes_count: int,
yield next_gene_index


def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int,
is_circular: bool = False):
"""
Extracts contiguous windows around positions of interest within a contig.
:param contig_size: Number of genes in contig.
:param positions_of_interest: An iterable containing the positions of interest.
:param window_size: The size of the window to extract around each position of interest.
:param is_circular: Indicates if the contig is circular.
:return: Yields tuples representing the start and end positions of each contiguous window.
"""
windows_coordinates = []

# Sort the positions of interest
sorted_positions = sorted(positions_of_interest)

# Check if any position of interest is out of range
if sorted_positions[0] < 0 or sorted_positions[-1] >= contig_size:
raise IndexError(f'Positions of interest are out of range. '
f"Contig has {contig_size} genes while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions")

if is_circular:
first_position = sorted_positions[0]
last_position = sorted_positions[-1]
# in a circular contig, if the window of a gene of interest overlaps the end/start of the contig
# an out of scope position is added to the sorted positions to take into account those positions
# the returned window are always checked that its positions are not out of range...
# so there's no chance to find an out of scope position in final list
if first_position - window_size < 0:
out_of_scope_position = contig_size + first_position
sorted_positions.append(out_of_scope_position)

if last_position + window_size >= contig_size:
out_of_scope_position = last_position - contig_size
sorted_positions.insert(0, out_of_scope_position)

start_po = max(sorted_positions[0] - window_size, 0)

for position, next_po in zip_longest(sorted_positions, sorted_positions[1:]):

if next_po is None:
# If there are no more positions, add the final window
end_po = min(position + window_size, contig_size - 1)
windows_coordinates.append((start_po, end_po))

elif position + window_size + 1 < next_po - window_size:
# If there is a gap between positions, add the current window
# and update the start position for the next window
end_po = min(position + window_size, contig_size - 1)

windows_coordinates.append((start_po, end_po))

start_po = max(next_po - window_size, 0)

return windows_coordinates


def get_contig_to_genes(gene_families: Iterable[GeneFamily]) -> Dict[Contig, Set[Gene]]:
"""
Group genes from specified gene families by contig.
Expand Down
4 changes: 2 additions & 2 deletions ppanggolin/figures/draw_spot.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,10 +647,10 @@ def draw_spots(pangenome: Pangenome, output: Path, spot_list: str, disable_bar:
need_rgp=True, need_spots=True, need_modules=need_mod, disable_bar=disable_bar)

if spot_list == 'all' or any(x == 'all' for x in spot_list):
logging.getLogger("PPanGGOLiN").debug(f"'all' value is found in spot list, all spots are drawn.")
logging.getLogger("PPanGGOLiN").debug("'all' value is found in spot list, all spots are drawn.")
selected_spots = list(pangenome.spots)
elif spot_list == "synteny" or any(x == 'synteny' for x in spot_list):
logging.getLogger().debug(f"'synteny' value is found in spot list, all spots with more than 1 conserved synteny are drawn.")
logging.getLogger().debug("'synteny' value is found in spot list, all spots with more than 1 conserved synteny are drawn.")
selected_spots = [s for s in pangenome.spots if len(s.get_uniq_ordered_set()) > 1]
else:
curated_spot_list = {'spot_' + str(s) if not s.startswith("spot_") else str(s) for s in spot_list}
Expand Down
2 changes: 1 addition & 1 deletion ppanggolin/figures/tile_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di
shapes=shapes,
plot_bgcolor='#ffffff')
logging.getLogger("PPanGGOLiN").info("Drawing the figure itself...")
#fig = go.Figure(data=[heatmap], layout=layout)

fig = go.Figure(data=[heatmap])

fig.add_trace(go.Scatter(x=dendro_org['icoord'],
Expand Down
Loading

0 comments on commit faf2e50

Please sign in to comment.