Skip to content

Commit

Permalink
pff -> psf file
Browse files Browse the repository at this point in the history
  • Loading branch information
lrauschning committed Aug 23, 2024
1 parent cfc1445 commit ded6760
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 57 deletions.
42 changes: 21 additions & 21 deletions msyd/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def main():
# help="Filter a VCF file to only contain annotations in multisyntenic regions",
# description="""
# Used for filtering VCF files to only contain calls in multisyntenic regions.
# Can be run on pff files processed with msyd view.
# Can be run on psf files processed with msyd view.
# """)
#filter_parser.set_defaults(func=filter)
#filter_parser.add_argument("--vcf", dest='invcf', required=True, type=argparse.FileType('r'), help="The .vcf file to filter and write to -o.")
Expand All @@ -73,11 +73,11 @@ def main():
description="""
Call Multisynteny in a set of genomes that have been aligned to a reference and processed with syri.\n
Requires a tab-separated file listing for each organism the name that should be used, the path to the alignment and syri output files.\n
Output can be saved either in Population Synteny Format (PSF) or VCF. VCF output does not preserve alignment information and cannot be used for some of the further processing!\n
Output can be saved either in Population Synteny File Format (.psf) or VCF. VCF output does not preserve alignment information and cannot be used for some of the further processing!\n
""")
call_parser.set_defaults(func=call)
call_parser.add_argument("-i", dest='infile', required=True, type=argparse.FileType('r'), help="The .tsv file to read SyRI output, alignment and VCF files in from. For more details, see the Readme.")
call_parser.add_argument("-o", dest='pff', required=True, type=argparse.FileType('wt'), help="Where to save the output PFF file (see format.md)")
call_parser.add_argument("-o", dest='psf', required=True, type=argparse.FileType('wt'), help="Where to save the output PFF file (see format.md)")
call_parser.add_argument("-m", "--merge-vcf", dest='vcf', type=argparse.FileType('wt'), help="Merge the VCFs specified in the input table, store the merged VCF at the path specified. Does not currently work with --realign, as non-ref haplotypes do not have coordinates on the reference that VCF records can be fetched from.")
call_parser.add_argument("-a", "--all", dest='all', action='store_true', default=False, help="Merge all VCF records instead of only records annotated in multisyntenic regions.")
call_parser.add_argument("-x", "--complex", dest='no_complex', action='store_const', const=False, default=True, help="Do not filter the input VCFs to only contain SNPs and INDELs")
Expand All @@ -86,7 +86,7 @@ def main():
call_parser.add_argument("-c", dest="cores", help="Number of cores to use for parallel computation. Multisyn cannot make effective use of more cores than the number of input organisms divided by two. Defaults to 1.", type=int, default=1)
call_parser.add_argument("--core", dest='core', action='store_true', default=False, help="Call only core synteny. Improves runtime significantly, particularly on larger datasets.")
call_parser.add_argument("--syn", "-s", dest='SYNAL', action='store_const', const=False, default=True, help="Use SYN instead of SYNAL SyRI annotations. Yields more contiguous regions and faster runtime, but calls may not be exact to the base level.")
call_parser.add_argument("--no-cigars", dest='cigars', action='store_const', const=False, default=True, help="Don't store CIGAR strings in the saved .pff file. Has no effect when --syn is specified.")
call_parser.add_argument("--no-cigars", dest='cigars', action='store_const', const=False, default=True, help="Don't store CIGAR strings in the saved .psf file. Has no effect when --syn is specified.")
call_parser.add_argument("--realign", "-ali", dest='realign', action='store_true', default=False, help="After calling core and reference cross synteny, realign missing regions to identify non-reference synteny.")
call_parser.add_argument("--pairwise", dest='pairwise', required=False, type=argparse.FileType('r'), help="Path to a TSV containing paths to full pairwise alignments that msyd will read in from disk during realignment if this parameter is passed. Otherwise, individual regions will be realigned on the fly with minimap2/mappy. This is useful if you already have pairwise alignments, or want to use a different aligner.")
call_parser.add_argument("-p", "--print", dest='print', action='store_true', default=False, help="print a subset of the output to stdout, for debugging.")
Expand All @@ -113,8 +113,8 @@ def main():
view_parser.add_argument("--intersect", dest='intersect', type=argparse.FileType('r'), help="VCF File to intersect with the PFF file given with -i. Will only keep annotations within multisyntenic regions")
view_parser.add_argument("--impute", dest='impute', action='store_true', default=False, help="When processing small variants in a VCF, interpret the lack of a variant as identical to the reference genotype for that haplotype.")

view_parser.add_argument("--opff", dest='filetype', action='store_const', const='pff', help="store output in PFF format")
view_parser.add_argument("--opff-nocg", dest='filetype', action='store_const', const='pff-nocg', help="store output in PFF format, discarding cigar strings")
view_parser.add_argument("--opsf", dest='filetype', action='store_const', const='psf', help="store output in PFF format")
view_parser.add_argument("--opsf-nocg", dest='filetype', action='store_const', const='psf-nocg', help="store output in PFF format, discarding cigar strings")
view_parser.add_argument("--ovcf", dest='filetype', action='store_const', const='vcf', help="store output in VCF format, discarding cigar strings")

merge_parser = subparsers.add_parser("merge",
Expand All @@ -139,7 +139,7 @@ def main():
realign_parser.add_argument("-t", dest='tsvfile', required=True, type=argparse.FileType('r'), help="TSV containing the sample names and path to genome fastas.")
realign_parser.add_argument("-p", "--pairwise", dest='pairwise', required=False, type=argparse.FileType('r'), help="Path to a TSV containing paths to full pairwise alignments that msyd will read in from disk if this parameter is passed. Otherwise, individual regions will be realigned on the fly with minimap2/mappy. This is useful if you already have pairwise alignments, or want to use a different aligner.")
realign_parser.add_argument("--workdir", "-w", dest='tmp', required=False, type=str, help="Path to a working directory to be used for storing temporary files. If the path does not exist, it will be created!")
realign_parser.add_argument("--no-cigars", dest='cigars', action='store_const', const=False, default=True, help="Don't store CIGAR strings in the saved .pff file. Has no effect when --syn is specified.")
realign_parser.add_argument("--no-cigars", dest='cigars', action='store_const', const=False, default=True, help="Don't store CIGAR strings in the saved .psf file. Has no effect when --syn is specified.")
realign_parser.add_argument("--min-realign", dest="min_realign", help="Minimum region size to realign, in bp. Default 100 bp.", type=int, default=-1)
realign_parser.add_argument("--min-syn-id", dest="min_syn_id", help="% Identity required for a region to be called as syntenic during the realignment step. Default 80%.", type=int, default=80)
realign_parser.add_argument("--max-realign", dest="max_realign", help="Maximum number of realignment steps to perform. Default 0 (unlimited).", type=int, default=-1)
Expand Down Expand Up @@ -184,7 +184,7 @@ def call(args):
# from msyd.script.io import find_multisyn
import msyd.io as io
import msyd.imputation as imputation
from msyd.multisyn import find_multisyn
from msyd.intersection import find_multisyn
import msyd.realignment as realignment
from msyd.coords import Range
import msyd.ordering as ordering
Expand Down Expand Up @@ -223,8 +223,8 @@ def call(args):
print(util.get_stats(df))

# save output
logger.info(f"Saving msyd calls to PFF at {args.pff.name}")
io.save_to_pff(df, args.pff, save_cigars=args.cigars)
logger.info(f"Saving msyd calls to PFF at {args.psf.name}")
io.save_to_psf(df, args.psf, save_cigars=args.cigars)

# if specified, merge the VCFs
if args.vcf:
Expand Down Expand Up @@ -254,7 +254,7 @@ def call(args):
logger.info(f"Adding multisynteny annotations, saving to {args.vcf.name}")
vcf.add_syn_anns_to_vcf(df, tmpfile, args.vcf.name, ref=ref)

logger.info(f"Finished running msyd call, output saved to {args.pff.name}.")
logger.info(f"Finished running msyd call, output saved to {args.psf.name}.")

def merge(args):
import msyd.io as io
Expand All @@ -275,7 +275,7 @@ def order(args):
import msyd.util as util
logger = util.CustomFormatter.getlogger("order")

df = io.read_pff(args.infile)
df = io.read_psf(args.infile)
print(ordering.order_hierarchical(df, orgs=None, score_fn=ordering.syn_score))
logger.info("Finished running msyd order")

Expand All @@ -284,7 +284,7 @@ def view(args):
import msyd.util as util
logger = util.CustomFormatter.getlogger("view")
logger.info(f"reading multisynteny from {args.infile.name}")
df = io.read_pff(args.infile)
df = io.read_psf(args.infile)
if not args.filetype: # determine filetype if not present
args.filetype = args.outfile.name.split(".")[-1]
logger.info(f"No output filetype specified - guessing from OUTFILE extension")
Expand All @@ -305,12 +305,12 @@ def view(args):

# save
logger.info(f"Writing to {args.outfile.name} in {args.filetype} format")
if args.filetype == 'pff':
io.save_to_pff(df, args.outfile)
if args.filetype == 'psf':
io.save_to_psf(df, args.outfile)
elif args.filetype == 'vcf':
io.save_to_vcf(df, args.outfile, args.ref.name if args.ref else None)
elif args.filetype == 'pff-nocg' or args.filetype == 'pff-nocigar':
io.save_to_pff(df, args.outfile, save_cigars=False)
elif args.filetype == 'psf-nocg' or args.filetype == 'psf-nocigar':
io.save_to_psf(df, args.outfile, save_cigars=False)
else:
logger.error(f"Couldn't determine filetype for {args.filetype}")
return
Expand All @@ -330,12 +330,12 @@ def realign(args):

logger.info(f"realigning from {args.infile.name}, taking genome files from {args.tsvfile.name}")
qrynames, syris, alns, vcfs, fastas = util.parse_input_tsv(args.tsvfile)
syns = io.read_pff(args.infile)
syns = io.read_psf(args.infile)
resyns = realignment.realign(syns, qrynames, fastas, MIN_REALIGN_LEN=args.min_realign, MIN_SYN_ID=args.min_syn_id, MAX_REALIGN=args.max_realign, pairwise=alndict if args.pairwise else None)
print(util.get_stats(resyns))

logger.info(f"Saving to {args.outfile.name} in PFF format.")
io.save_to_pff(resyns, args.outfile, save_cigars=args.cigars)
io.save_to_psf(resyns, args.outfile, save_cigars=args.cigars)
logger.info(f"Finished running msyd realign, output saved to {args.outfile.name}.")

def stats(args):
Expand All @@ -344,7 +344,7 @@ def stats(args):
logger = util.CustomFormatter.getlogger("stats")

logger.info(f"Reading from {args.infile.name}.")
syns = io.read_pff(args.infile)
syns = io.read_psf(args.infile)
#print(util.get_stats(resyns), file=args.outfile)
if args.agg:
print(util.get_stats(syns), file=args.outfile)
Expand All @@ -371,7 +371,7 @@ def plot(args):
"""Deprecated/internal, DO NOT USE
"""
import msyd.io as io
df = io.read_pff(args.infile)
df = io.read_psf(args.infile)
cols = ['ref', 'chr'] + list(util.get_orgs_from_df(df))

def pstolendf(x):
Expand Down
18 changes: 9 additions & 9 deletions msyd/pyxfiles/coords.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class Position:
#return f"Position({self.org}, {self.chr}, {self.haplo}, {self.pos})"
return f"Position({self.org}, {self.chr}, {self.pos})"

def to_pff(self):
def to_psf(self):
"""Transform this `Position` into population synteny file format
"""
#return f"{self.chr}:{self.haplo}:{self.pos}"
Expand Down Expand Up @@ -70,22 +70,22 @@ class Range:
#return f"Range({self.org}, {self.chr}, {self.haplo}, {self.start}, {self.end})"
return f"Range({self.org}, {self.chr}, {self.start}, {self.end})"

def to_pff(self):
"""Transform this `Range` into the form specified by PFF
def to_psf(self):
"""Transform this `Range` into the form specified by PSF
"""
#return f"{self.chr}:{self.haplo}:{self.start}-{self.end}"
return f"{self.chr}:{self.start}-{self.end}"

def to_pff_org(self):
"""Transform this `Range` into the form specified by PFF, with the sample name being prepended as specified for the realigned reference haplotype
def to_psf_org(self):
"""Transform this `Range` into the form specified by PSF, with the sample name being prepended as specified for the realigned reference haplotype
"""
#return f"{self.org}:{self.chr}:{self.haplo}:{self.start}-{self.end}"
return f"{self.org}:{self.chr}:{self.start}-{self.end}"


def read_pff(org:str, cell: str):
"""Parse a Range in PFF format
PFF format is :-separated and must contain the chromosome first, then followed by a haplotype (optional) and a start and end position separated by -.
def read_psf(org:str, cell: str):
"""Parse a Range in PSF format
PSF format is :-separated and must contain the chromosome first, then followed by a haplotype (optional) and a start and end position separated by -.
If the end position is before the start position, the range is treated as inverted.
`org` specifies the organism the returned range should have. If this range is just used for filtering, None may be passed.
Examples: Chr1:mat:1000-2000, Chr3:10000-50000
Expand All @@ -94,7 +94,7 @@ class Range:
#print(cell)
cellarr = cell.split(':')
if len(cellarr) < 2 or len(cellarr) > 4:
raise ValueError(f"Invalid PFF Range string: {cell}")
raise ValueError(f"Invalid PSF Range string: {cell}")
if len(cellarr) == 4: # if a ref name is specified in the cell, that overrides the argument
org = cellarr[0]
cellarr = cellarr[1:]
Expand Down
4 changes: 2 additions & 2 deletions msyd/pyxfiles/intersection.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -340,8 +340,8 @@ def find_multisyn(qrynames, syris, alns, base=None, sort=False, ref='a', cores=1

# shouldn't need any overlap removal
if base:
logger.info("reading in PFF for incremental calling")
syns.append(io.read_pff(base))
logger.info("reading in PSF for incremental calling")
syns.append(io.read_psf(base))

return reduce_find_overlaps(syns, cores, **kwargs)
# END
Expand Down
Loading

0 comments on commit ded6760

Please sign in to comment.