Skip to content

Commit

Permalink
fix pysam issues, TODO cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
lrauschning committed May 16, 2024
1 parent cb985c2 commit 3b62ddc
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 9 deletions.
4 changes: 3 additions & 1 deletion msyd/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,9 @@ def call(args):
io.reduce_vcfs(vcfs, tmpfile)

logger.info(f"Adding pansynteny annotations, saving to {args.vcf.name}")
io.add_syn_anns_to_vcf(df, tmpfile, args.vcf.name, ref=ref)
#io.add_syn_anns_to_vcf(df, tmpfile, args.vcf.name, ref=ref)
print(args.impute)
io.extract_syntenic_from_vcf(df, tmpfile, args.vcf.name, no_complex=args.no_complex, add_syn_anns=True, impute_ref=args.impute)

logger.info(f"Finished running msyd call, output saved to {args.pff.name}.")

Expand Down
34 changes: 26 additions & 8 deletions msyd/pyxfiles/io.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -490,11 +490,23 @@ cpdef void extract_syntenic_from_vcf(syns, inpath:Union[str, os.PathLike], outpa
if any([re.fullmatch(r'N|[ACGT]*', allele) == None for allele in rec.alleles]):
continue # skip this variant

# get the samples for which we already have a genotype
# to use for either imputation or transfering nonsyn genotypes
contained_samples = set()
for sample in rec.samples:
# based on the ipmlementation in
# https://github.com/pysam-developers/pysam/blob/43c10664834cf3914000a744e6057826c6a6fa65/pysam/libcbcf.pyx#L3443
# this stuff is all undocumented (see pysam#407)
if not rec.samples[sample].alleles[0] is None:
contained_samples.add(sample)

new_rec = vcfout.new_record()
new_rec.pos = rec.pos
new_rec.chrom = rec.chrom
new_rec.id = rec.id
new_rec.alleles = rec.alleles
new_rec.ref = rec.ref # unnecessary, as this is covered by alleles

# discard old INFO information if reading it in as coords
if not coords_in_info:
for key in rec.info:
Expand All @@ -507,19 +519,21 @@ cpdef void extract_syntenic_from_vcf(syns, inpath:Union[str, os.PathLike], outpa
# a syntenic haplotype with no call probably has the ref haplotype
if impute_ref:
for org in syn.ranges_dict:
logger.info("checking", org, "against", rec.samples)
if org in orgsvcf and not org in rec.samples:
print("checking", org)
if org in orgsvcf and not org in contained_samples:
# unless it has a record, assume this sample has the same gt as the reference of the haplotype
logger.info("imputing", org)
logger.info(rec.samples[syn.ref.org])
logger.info(rec.samples)
new_rec.samples[org].update(rec.samples[syn.ref.org])
new_rec.samples[org].update({'GT': 0})
logger.info(f"imputing {org}")
new_rec.samples[org].allele_indices = 0
#new_rec.allele =
#new_rec.samples[org].update(rec.samples[syn.ref.org])
#new_rec.samples[org].update({'GT': 0})


# write existing genotype if there is one
for sample in rec.samples:
if keep_nonsyn_calls or sample in orgsvcf:
if keep_nonsyn_calls or sample in contained_samples:
#TODO should this copy over before impute?
logger.info(f"overwriting {sample}")
new_rec.samples[sample].update(rec.samples[sample])

# read in coords from INFO column, add to single sample
Expand Down Expand Up @@ -554,6 +568,10 @@ cpdef void reduce_vcfs(vcfs: List[Union[str, os.PathLike]], opath: Union[str, os
# incorporate the last vcf, save directly to output
merge_vcfs(vcfs[-1], tmpfiles[-1], opath)

#cpdef void impute_syn_in_vcf(syns, inpath: Union[str, os.PathLike], outpath: Union[str, os.PathLike], force_index=True, synorg='ref', ref=None, keep_nonsyn_calls=False, add_syn_anns=True, add_cigar=False, add_identity=True, no_complex=False, coords_in_info=False, impute_ref=False):



cpdef add_syn_anns_to_vcf(syns, vcfin: Union[str, os.PathLike], vcfout: Union[str, os.PathLike], ref=None):
"""Takes a VCF file, overwrites it adding annotations for core/cross-syn region. Other records are preserved as-is."""
cdef:
Expand Down

0 comments on commit 3b62ddc

Please sign in to comment.