generateRandReads.py
# Generates MinHash signatures for randomly generated calibration reads and stores
# each signature in dataset/randReads/randMinHashes_<i>.txt
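# Example invocation (path is illustrative; flags as defined below):
#   python generateRandReads.py --dataset /data/MAB_alignment/ecoli_simulated2 \
#       --num_reads 5 --num_hashes 1000 --num_jobs 32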
from Bio.SeqIO.FastaIO import SimpleFastaParser
import mmh3
import itertools
import multiprocessing as mp
import numpy as np
import os
import argparse
from tqdm import tqdm
def runSim(args):
    # Build one MinHash signature for a synthetic read drawn from the
    # dataset's empirical k-mer distribution.
    i, dataset, numHashes = args
    symLength = 7  # k in k-mer; must match k used to build kmer_dict below
    np.random.seed(i)  # seed with the read index so each synthetic read is reproducible
    # Sample one k-mer per window of a read of average length, weighted by
    # the dataset's k-mer frequencies.
    rand_kmers = np.random.choice(list(kmer_dict.keys()), size=avgReadLen - symLength + 1, replace=True, p=ps)
    hashOut = []
    for j in range(numHashes):
        # The j-th hash function is murmur3 with seed j; the signature entry
        # is the minimum hash over all sampled k-mers.
        hashLst = [mmh3.hash(sym, j, signed=False) for sym in rand_kmers]
        hashOut.append(min(hashLst))
    minHashes = np.array(hashOut)
    minHashes.dump(dataset + "randReads/randMinHashes_{}.txt".format(i))
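# Note: ndarray.dump writes a pickle despite the .txt extension; a stored
# signature can be reloaded with, e.g., np.load(path, allow_pickle=True).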
ap = argparse.ArgumentParser(description="Generate random calibration reads and store their MinHash signatures")
ap.add_argument("--dataset", help="Path to the dataset folder, e.g. /data/MAB_alignment/ecoli_simulated2")
ap.add_argument("--num_jobs", help="Number of parallel worker processes", type=int, default=32)
ap.add_argument("--num_reads", help="Number of random calibration reads to generate", type=int, default=5)
ap.add_argument("--num_hashes", help="Number of hash functions per MinHash signature", type=int, default=1000)
args = ap.parse_args()
dataset = args.dataset
numReads = args.num_reads
numHashes = args.num_hashes
num_jobs = min(args.num_jobs, numReads)  # never spawn more workers than reads
if dataset[-1] != '/':
    dataset += '/'
# Load the raw reads; the average read length sets the length of each
# synthetic calibration read generated in runSim.
reads_lst = []
with open(dataset + "reads.fasta") as handle:
    for _, seq in SimpleFastaParser(handle):
        reads_lst.append(seq)
avgReadLen = int(np.mean([len(x) for x in reads_lst]))
if not os.path.isdir(dataset + "randReads"):
    print("Creating randReads folder")
    os.mkdir(dataset + "randReads")
## generate kmer frequencies for selecting kmers from dataset dist
print("Generating kmer frequencies")
kmer_dict = {}
k = 7  # must match symLength in runSim
for ref_read in tqdm(reads_lst):
    # Every k-mer window of the read: len(ref_read) - k + 1 of them.
    sub_strings = [ref_read[i:i + k] for i in range(len(ref_read) - k + 1)]
    for sub in sub_strings:
        kmer_dict[sub] = kmer_dict.get(sub, 0) + 1
ps = np.array(list(kmer_dict.values()))
ps = ps/ps.sum()
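# ps lines up with list(kmer_dict.keys()) because dicts preserve insertion
# order (Python 3.7+), so np.random.choice in runSim draws k-mers with their
# empirical dataset frequencies.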
print("parallelizing randRead generation")
pool = mp.Pool(processes=num_jobs)
arg_tuple = itertools.product(range(numReads), [dataset], [numHashes])
for _ in tqdm(pool.imap_unordered(runSim, arg_tuple), total=numReads):
pass