Skip to content

Commit

Permalink
Merge pull request #151 from marbl/add-maf-option
Browse files Browse the repository at this point in the history
Add maf option
  • Loading branch information
bkille authored Apr 12, 2024
2 parents e6402b1 + 5d438e0 commit 8e0d41c
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 8 deletions.
2 changes: 1 addition & 1 deletion logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
COLOR_SEQ = "\033[1;%dm"
BOLD_SEQ = "\033[1m"

MIN_TQDM_INTERVAL=30
MIN_TQDM_INTERVAL=5


# Logging redirect copied from https://stackoverflow.com/questions/14897756/python-progress-bar-through-logging-module
Expand Down
71 changes: 64 additions & 7 deletions parsnp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
'''

import os, sys, string, random, subprocess, time, operator, math, datetime, numpy #pysam
from collections import defaultdict
from collections import defaultdict, namedtuple
import shutil
import multiprocessing
import shlex
Expand All @@ -17,15 +17,15 @@ from logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL
import argparse
import signal
from multiprocessing import Pool
from Bio import SeqIO
from Bio import SeqIO, AlignIO
from glob import glob
from pathlib import Path


import extend as ext
from tqdm import tqdm

__version__ = "2.0.4"
__version__ = "2.0.5"
reroot_tree = True #use --midpoint-reroot
random_seeded = random.Random(42)

Expand Down Expand Up @@ -141,6 +141,54 @@ def handler(signum, frame):

signal.signal(signal.SIGINT, handler)

def xmfa_to_maf(xmfa_path, maf_path, all_input_paths):
    """Convert a parsnp XMFA alignment into MAF format.

    First reads the XMFA header to map each sequence index to its input
    filename (detecting the ``.ref``-suffixed reference copy), then collects
    per-contig (id, length) info from the original FASTA inputs, and finally
    re-parses the alignment with ``Bio.AlignIO`` and writes one MAF ``a``
    block per XMFA cluster to ``maf_path``.

    Args:
        xmfa_path: Path to the parsnp XMFA output file.
        maf_path: Destination path for the MAF file (overwritten).
        all_input_paths: Paths of every FASTA input, including the
            ``.ref``-suffixed copy of the reference.

    Returns:
        None. Output is written to ``maf_path``.
    """
    sample_delim = '#'
    SeqInfo = namedtuple("SeqInfo", "cid seq_length")
    # Raw strings: \d and \s in plain string literals are invalid escape
    # sequences (SyntaxWarning on modern Python).
    hdr_block_pattern = re.compile(
        r"##SequenceIndex (\d+)\n"
        r"##SequenceFile (.+)\n"
        r"##SequenceHeader >\s*(\S+).*\n"
        r"##SequenceLength (\d+)bp")
    idx_to_fname = {}
    ref_fname = ""
    with open(xmfa_path) as xmfa_in:
        next(xmfa_in)  # skip format/version line
        line = next(xmfa_in)
        seq_count = int(re.match(r"#SequenceCount (\d+)\n", line).groups()[0])
        for _ in range(seq_count):
            # Each sequence is described by a fixed 4-line header block.
            info_block = "".join(next(xmfa_in) for _ in range(4))
            parsed_info = hdr_block_pattern.match(info_block).groups()
            fname = parsed_info[1]
            # Always strip the ".ref" suffix so this map uses the same keys
            # as fname_to_info below; record only the first such file as the
            # reference.
            if fname.endswith(".ref"):
                fname = fname[:-4]
                if ref_fname == "":
                    ref_fname = fname
            idx_to_fname[parsed_info[0]] = fname

    # Per input file: contig records (first token of the FASTA id, length),
    # in the order the contigs appear in the file.
    fname_to_info = defaultdict(list)
    for f in all_input_paths:
        fname = Path(f).name
        if fname.endswith(".ref"):
            fname = fname[:-4]
        fname_to_info[fname] = [
            SeqInfo(rec.id.split(" ")[0], len(rec.seq))
            for rec in SeqIO.parse(f, "fasta")
        ]

    # parsnp record ids look like "cluster<N> s<contig>:p<pos>".
    seqid_pattern = re.compile(r'^cluster(\d+) s(\d+):p(\d+)')
    with open(maf_path, 'w') as maf_out:
        for aln in AlignIO.parse(xmfa_path, "mauve"):
            for rec_idx, rec in enumerate(aln):
                aln_len = rec.annotations["end"] - rec.annotations["start"]
                cluster_idx, contig_idx, startpos = [
                    int(x) for x in seqid_pattern.match(rec.id).groups()]
                fname = idx_to_fname[rec.name]
                if rec_idx == 0:
                    maf_out.write(f"a cluster={cluster_idx}\n")
                elif fname == ref_fname:
                    # The reference belongs only at the head of each block;
                    # skip any later occurrence.
                    continue
                # contig_idx is 1-based in the XMFA ids.
                rec_info = fname_to_info[fname][contig_idx - 1]
                sample = '.'.join(fname.split('.')[:-1])  # drop file extension
                strand = '+' if rec.annotations['strand'] == 1 else '-'
                maf_out.write(f"s\t{sample}{sample_delim}{rec_info.cid}")
                maf_out.write(f"\t{startpos}")
                maf_out.write(f"\t{aln_len}")
                maf_out.write(f"\t{strand}")
                maf_out.write(f"\t{rec_info.seq_length}")
                maf_out.write(f"\t{rec.seq}\n")
            # Blank line terminates each MAF alignment block.
            maf_out.write("\n")
    return

#TODO Merge run fns
def run_phipack(query,seqlen,workingdir):
Expand Down Expand Up @@ -487,6 +535,10 @@ def parse_args():
"--vcf",
action = "store_true",
help = "Generate VCF file.")
misc_args.add_argument(
"--no-maf",
action = "store_true",
help = "Do not generate MAF file (XMFA only)")
misc_args.add_argument(
"-p",
"--threads",
Expand Down Expand Up @@ -893,6 +945,7 @@ def parse_input_files(input_files, curated, validate_input):
'''
# global ff, hdr, seq
fnafiles = []
fname_to_info = defaultdict(list)
for input_file in input_files[:]:
# If biopython cannot parse the input as a fasta, kick it out of the list and move on:
if validate_input:
Expand Down Expand Up @@ -1242,9 +1295,7 @@ SETTINGS:
# fnafiles.remove(ref)

#sort reference by largest replicon to smallest
if ref in fnafiles:
fnafiles = [f for f in fnafiles if f != ref]
elif sortem and os.path.exists(ref) and not autopick_ref:
if sortem and os.path.exists(ref) and not autopick_ref:
sequences = SeqIO.parse(ref, "fasta")
new_ref = os.path.join(outputDir, os.path.basename(ref)+".ref")
SeqIO.write(sequences, new_ref, "fasta")
Expand Down Expand Up @@ -1453,7 +1504,7 @@ SETTINGS:

finalfiles = sorted(finalfiles)
random_seeded.shuffle(finalfiles)
totseqs = len(finalfiles)
totseqs = len(finalfiles) + 1 # Add one to include reference

#initiate parallelPhiPack tasks
run_recomb_filter = 0
Expand Down Expand Up @@ -1705,6 +1756,12 @@ SETTINGS:
for lcb in new_lcbs:
partition.write_aln_to_xmfa(lcb, out_f)


# Generate MAF file
if not args.no_maf:
logger.debug(f"Writing MAF file to {Path(parsnp_output).with_suffix('.maf')}")
xmfa_to_maf(parsnp_output, Path(parsnp_output).with_suffix(".maf"), finalfiles + [ref])

#add genbank here, if present
if len(genbank_ref) != 0:
rnc = f"harvesttools -q -o {outputDir}/parsnp.ggr -x {parsnp_output}"
Expand Down

0 comments on commit 8e0d41c

Please sign in to comment.