-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
formatting and change error to warning
- the minimal variant fraction threshold was limited to >0.5 -> changed to the error to a warning
- Loading branch information
1 parent
4459fed
commit ab4e856
Showing
2 changed files
with
107 additions
and
55 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,9 @@ | ||
work/ | ||
work*/ | ||
results*/ | ||
conda/ | ||
.nextflow/ | ||
.nextflow.log* | ||
nextflow-autodownload-databases/ | ||
*.nextflow.log* | ||
nextflow-autodownload-databases/ | ||
tests_*/ | ||
tmp/ | ||
settings.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
#author: Stephan Fuchs (Robert Koch Institute, MF-1, [email protected]) | ||
# author: Stephan Fuchs (Robert Koch Institute, MF-1, [email protected]) | ||
|
||
VERSION = "0.0.9" | ||
import os | ||
|
@@ -9,111 +9,160 @@ | |
import re | ||
import sys | ||
import gzip | ||
import warnings | ||
|
||
|
||
def parse_args(CMD=None): | ||
parser = argparse.ArgumentParser(prog="rename_in_gff3.py", description="changes genotype in VCFs", ) | ||
parser.add_argument('vcf', metavar="FILE", help="vcf file", type=str) | ||
parser.add_argument('--ao', metavar="STR", help="tag for read count supporting the respective variant (default: AO)", type=str, default="AO") | ||
parser.add_argument('--dp', metavar="STR", help="tag for total read count at the repsective position (default: DP)", type=str, default="DP") | ||
parser.add_argument('--gt', metavar="STR", help="tag for genotype (default: GT)", type=str, default="GT") | ||
parser.add_argument('-o', help="output file (will be overwritten!)", type=str, required=True) | ||
parser.add_argument('--gz', help="bgzip compressed input", action="store_true") | ||
parser.add_argument('--vf', metavar="FLOAT", help="minimal variant fraction to set a homogeneous genotype (default: 0.9)", type=float, default=0.9) | ||
parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION) | ||
parser = argparse.ArgumentParser( | ||
prog="rename_in_gff3.py", | ||
description="changes genotype in VCFs", | ||
) | ||
parser.add_argument("vcf", metavar="FILE", help="vcf file", type=str) | ||
parser.add_argument( | ||
"--ao", | ||
metavar="STR", | ||
help="tag for read count supporting the respective variant (default: AO)", | ||
type=str, | ||
default="AO", | ||
) | ||
parser.add_argument( | ||
"--dp", | ||
metavar="STR", | ||
help="tag for total read count at the repsective position (default: DP)", | ||
type=str, | ||
default="DP", | ||
) | ||
parser.add_argument( | ||
"--gt", | ||
metavar="STR", | ||
help="tag for genotype (default: GT)", | ||
type=str, | ||
default="GT", | ||
) | ||
parser.add_argument( | ||
"-o", help="output file (will be overwritten!)", type=str, required=True | ||
) | ||
parser.add_argument("--gz", help="bgzip compressed input", action="store_true") | ||
parser.add_argument( | ||
"--vf", | ||
metavar="FLOAT", | ||
help="minimal variant fraction to set a homogeneous genotype (default: 0.9)", | ||
type=float, | ||
default=0.9, | ||
) | ||
parser.add_argument("--version", action="version", version="%(prog)s " + VERSION) | ||
return parser.parse_args(CMD) | ||
|
||
|
||
# open file handles considering compression state | ||
def get_filehandle(in_fname, gz): | ||
if not gz: | ||
inhandle = open(in_fname, "r") | ||
else: | ||
inhandle = gzip.open(in_fname, "rt") | ||
return inhandle | ||
|
||
def process(in_fname, out_fname, min_vf, ao_tag="AO", dp_tag="DP", gt_tag="GT", gz=False): | ||
#sanity checks | ||
|
||
|
||
def process( | ||
in_fname, out_fname, min_vf, ao_tag="AO", dp_tag="DP", gt_tag="GT", gz=False | ||
): | ||
# sanity checks | ||
if min_vf <= 0.5: | ||
sys.exit("error: min_vf has to be greater than 0.5") | ||
warnings.warn( | ||
f"[WARNING] Minimal variant fraction to set a homogeneous genotype (--vf) is below 0.5 ({min_vf}). Assuming you know what you are doing." | ||
) | ||
out_gz = out_fname.endswith(".gz") | ||
intermediate = re.sub("\.gz$","", out_fname) | ||
intermediate = re.sub("\.gz$", "", out_fname) | ||
|
||
#regex generation | ||
# regex generation | ||
ao_pattern = re.compile(r"(?:^|\t|;)" + re.escape(ao_tag) + "=([0-9,]+)(?:$|\t|;)") | ||
dp_pattern = re.compile(r"(?:^|\t|;)" + re.escape(dp_tag) + "=([0-9]+)(?:$|\t|;)") | ||
gt_pattern = re.compile(r"(?:^|\t|;)" + re.escape(gt_tag) + "=([0-9]+/[0-9]+)(?:$|\t|;)") | ||
gt_pattern = re.compile( | ||
r"(?:^|\t|;)" + re.escape(gt_tag) + "=([0-9]+/[0-9]+)(?:$|\t|;)" | ||
) | ||
|
||
with get_filehandle(in_fname, gz) as inhandle: | ||
with open(intermediate, "w") as outhandle: | ||
with get_filehandle(in_fname, gz) as inhandle: | ||
with open(intermediate, "w") as outhandle: | ||
for l, line in enumerate(inhandle): | ||
|
||
#skip empty or comment lines | ||
# skip empty or comment lines | ||
if len(line.strip()) == 0 or line.startswith("#"): | ||
outhandle.write(line) | ||
continue | ||
|
||
fields = line.split("\t") | ||
#find GT position | ||
|
||
# find GT position | ||
gt_pos = fields[8].split(":").index(gt_tag) | ||
#replacing GT info (considering line eventual breaks at end) | ||
|
||
# replacing GT info (considering line eventual breaks at end) | ||
cols = fields[9].split(":") | ||
|
||
# check line for homo/herterozygot | ||
gt1, gt2 = cols[gt_pos].split('/') | ||
gt1, gt2 = cols[gt_pos].split("/") | ||
if gt1 == gt2: | ||
# homozygot | ||
outhandle.write(line) | ||
continue | ||
#find ao and dp | ||
|
||
# find ao and dp | ||
ao = ao_pattern.findall(fields[7]) | ||
dp = dp_pattern.findall(fields[7]) | ||
|
||
if len(ao) > 1: | ||
sys.exit("error: multiple occurences of " + ao_tag + " tag in line " + str(l+1)) | ||
sys.exit( | ||
"error: multiple occurences of " | ||
+ ao_tag | ||
+ " tag in line " | ||
+ str(l + 1) | ||
) | ||
if len(dp) > 1: | ||
sys.exit("error: multiple occurences of " + dp_tag + " tag in line " + str(l+1)) | ||
|
||
#calc fractions and check threshold | ||
fracs = [int(x)/int(dp[0]) for x in ao[0].split(",")] | ||
sys.exit( | ||
"error: multiple occurences of " | ||
+ dp_tag | ||
+ " tag in line " | ||
+ str(l + 1) | ||
) | ||
|
||
# calc fractions and check threshold | ||
fracs = [int(x) / int(dp[0]) for x in ao[0].split(",")] | ||
m = max(fracs) | ||
|
||
if m < min_vf: | ||
outhandle.write(line) | ||
continue | ||
#generate new GT | ||
gt = str(fracs.index(m) + 1) # REF == 0 -> ++1 | ||
|
||
# generate new GT | ||
gt = str(fracs.index(m) + 1) # REF == 0 -> ++1 | ||
gt = gt + "/" + gt | ||
#find GT position | ||
|
||
# find GT position | ||
gt_pos = fields[8].split(":").index(gt_tag) | ||
#replacing GT info (considering line eventual breaks at end) | ||
if gt_pos == len(cols)-1: | ||
|
||
# replacing GT info (considering line eventual breaks at end) | ||
if gt_pos == len(cols) - 1: | ||
cols[gt_pos] += "\n" | ||
fields[9] = ":".join(cols) | ||
outhandle.write("\t".join(fields)) | ||
if out_gz: | ||
bgzip_outname(intermediate, out_fname) | ||
|
||
|
||
|
||
def bgzip_outname(_file, outfile=None): | ||
if outfile is not None: | ||
with open(outfile, "wb") as out_fh: | ||
with subprocess.Popen(["bgzip", _file, "-c"], | ||
stdout=out_fh) as bg_proc: | ||
ret = bg_proc.wait() | ||
os.remove(_file) | ||
with subprocess.Popen(["bgzip", _file, "-c"], stdout=out_fh) as bg_proc: | ||
ret = bg_proc.wait() | ||
os.remove(_file) | ||
else: | ||
with subprocess.Popen(["bgzip", _file], | ||
stderr=subprocess.PIPE) as bg_proc: | ||
with subprocess.Popen(["bgzip", _file], stderr=subprocess.PIPE) as bg_proc: | ||
out, err = bg_proc.communicate() | ||
ret = bg_proc.wait() | ||
|
||
ret = bg_proc.wait() | ||
|
||
|
||
def main(CMD=None): | ||
args = parse_args(CMD) | ||
process(args.vcf, args.o, args.vf, args.ao, args.dp, args.gt, args.gz) | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |