-
Notifications
You must be signed in to change notification settings - Fork 0
/
list_derived_alleles.py
92 lines (81 loc) · 4.52 KB
/
list_derived_alleles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import argparse
import random
import subprocess
from colorama import Fore, Style
### FUNCTIONS ###
# identify derived alleles on one individual #
def list_derived_ind(FID_IID, dataset, outdir = "", het_rndlow = 0):
sample = FID_IID.split()[1]
print(Fore.LIGHTYELLOW_EX + Style.BRIGHT + "\nIdentifying sites with derived allele for sample " + sample + " ...", end = " ")
# write file with FID and IID for plink --keep #
with open(outdir + sample + ".txt", "w") as outfile:
outfile.write(FID_IID)
# run plink to get stats for each snp #
subprocess.run(["plink", "--bfile", dataset, "--keep-allele-order", "--freqx",
"--keep", outdir + sample + ".txt", "--out", outdir + sample], stdout = subprocess.DEVNULL)
# identify sites with derived allele #
list_derived = []
with open(outdir + sample + ".frqx") as sites:
for site in sites:
# if hom for derived allele (= 1 in column 7) add to list #
if site.split()[6] == "1":
list_derived.append(site.split()[1])
# if het (= 1 in column 6) add to list with 0.5 probability #
# if you want to include all derived alleles found at heterozygous genotypes set het_rndlow to 1 #
elif site.split()[5] == "1" and random.randint(het_rndlow, 1) == 1:
list_derived.append(site.split()[1])
# write list of sites with derived allele to file #
print(sample, "has", len(list_derived), "sites with derived allele", Style.RESET_ALL)
if het_rndlow == 1: het_status = "allhet_"
else: het_status = ""
with open(outdir + "derived_alleles_" + het_status + sample + ".snps", "w") as outfile:
outfile.write("\n".join(list_derived))
# delete intermediate files #
subprocess.run(["rm", outdir + sample + ".log", outdir + sample + ".frqx", outdir + sample + ".nosex", outdir + sample + ".txt"])
#
###
# initialize the parser #
parser = argparse.ArgumentParser(
description = "Identify sites with derived allele - You must first create a plink bynary file where the ancestral allele is set to A1 (e.g. by using plink --reference-allele)"
)
# add arguments #
parser.add_argument("-d", "--dataset", required = True, metavar = "[file prefix]",
help = "prefix of input plink files")
group = parser.add_mutually_exclusive_group(required = True)
group.add_argument("-i", "--indlist", metavar = "[file]",
help = "list of individuals for which derived alleles have to be identified, "
"one per line with FID and IID of plink .fam file "
"(tip: add Ancestor.REF Ancestor.REF as a safe check)")
group.add_argument("-p", "--poplist", metavar = "[file]",
help = "list of populations for which mean number of derived alleles needs to be computed, "
"one per line and equal to FID of plink .fam file "
"(tip: add Ancestor.REF as a safe check)")
parser.add_argument("-a", "--all", metavar = "", action = "store_const", const = 1, default = 0, dest = "rndlow",
help = "keep all derived sites in diploid individuals"
"[default behaviour: keep derived sites at heterozygous sites with 0.5 probability]")
# parse the arguments #
args = parser.parse_args()
###
# if poplist is provided indentify derived alleles for every individual of each of the populations provided #
# create a directory for each population to store individual results #
if args.poplist != None:
with open(args.poplist) as poplist:
for pop in poplist:
pop = pop.splitlines()[0]
print(Fore.RED + Style.BRIGHT + "\n\nAnalysing ", pop, ":", Style.RESET_ALL)
subprocess.run(["mkdir", pop])
# get list of individuals for each pop #
out = subprocess.run(["grep", "-w", "^" + pop, args.dataset + ".fam"], capture_output = True, text = True)
# write indlist for list_derived_ind() function #
with open(pop + "/indlist.txt", "w") as outfile:
outfile.write(out.stdout)
# get list of derived allele for each ind in the population #
with open(pop + "/indlist.txt") as indlist:
for ind in indlist:
list_derived_ind(ind, args.dataset, pop + "/", args.rndlow)
# if indlist is provided identify derived allele for each individual #
else:
with open(args.indlist) as indlist:
for ind in indlist:
list_derived_ind(ind, args.dataset, het_rndlow = args.rndlow)
#