Skip to content

Commit

Permalink
Merge pull request #9 from remiolsen/dev
Browse files Browse the repository at this point in the history
0.3.0 release
  • Loading branch information
remiolsen authored Apr 27, 2020
2 parents 8d30040 + 5ec43ec commit e101131
Show file tree
Hide file tree
Showing 7 changed files with 378 additions and 165 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/anglerfish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:

steps:
# Checkout code and install miniconda + environment
- uses: actions/checkout@v1
- uses: actions/checkout@v2
- uses: goanpeca/setup-miniconda@v1
with:
activate-environment: anglerfish
Expand All @@ -32,4 +32,4 @@ jobs:
# Run anglerfish using test data
- shell: bash -l {0}
name: Run anglerfish.py with test data
run: anglerfish.py -i test/BC18_P14351_1001.fastq.gz -s test/samples.csv
run: anglerfish.py -s test/samples.csv
25 changes: 14 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
[![Anglerfish CI Status](https://github.com/remiolsen/anglerfish/workflows/Anglerfish/badge.svg)](https://github.com/remiolsen/anglerfish/actions)
[![Docker Container available](https://img.shields.io/docker/automated/remiolsen/anglerfish.svg)](https://hub.docker.com/r/remiolsen/anglerfish/)

![Anglerfish](docs/Anglerfish_logo.svg)

## Introduction

Expand Down Expand Up @@ -65,26 +64,26 @@ Anglerfish requires two files to run.
Example of a samplesheet file:

```
P12864_201,truseq_dual,TAATGCGC,CAGGACGT
P12864_202,truseq_dual,TAATGCGC,GTACTGAC
P9712_101, truseq_dual,ATTACTCG,TATAGCCT
P9712_102, truseq_dual,ATTACTCG,ATAGAGGC
P9712_103, truseq_dual,ATTACTCG,CCTATCCT
P9712_104, truseq_dual,ATTACTCG,GGCTCTGA
P9712_105, truseq_dual,ATTACTCG,AGGCGAAG
P9712_106, truseq_dual,ATTACTCG,TAATCTTA
P12864_201,truseq_dual,TAATGCGC-CAGGACGT,/path/to/ONTreads.fastq.gz
P12864_202,truseq_dual,TAATGCGC-GTACTGAC,/path/to/ONTreads.fastq.gz
P9712_101, truseq_dual,ATTACTCG-TATAGCCT,/path/to/ONTreads.fastq.gz
P9712_102, truseq_dual,ATTACTCG-ATAGAGGC,/path/to/ONTreads.fastq.gz
P9712_103, truseq_dual,ATTACTCG-CCTATCCT,/path/to/ONTreads.fastq.gz
P9712_104, truseq_dual,ATTACTCG-GGCTCTGA,/path/to/ONTreads.fastq.gz
P9712_105, truseq_dual,ATTACTCG-AGGCGAAG,/path/to/ONTreads.fastq.gz
P9712_106, truseq_dual,ATTACTCG-TAATCTTA,/path/to/ONTreads.fastq.gz
```

Or using single index:

```
P12345_101,truseq,CAGGACGT
P12345_101,truseq,CAGGACGT,/path/to/ONTreads.fastq.gz
```

Then run:

```
anglerfish.py -i /path/to/ONTreads.fastq.gz -o /path/to/samples.csv
anglerfish.py -o /path/to/samples.csv
```

### Optional
Expand Down Expand Up @@ -115,3 +114,7 @@ In folder `anglerfish_????_??_??_?????/`

The Anglerfish code was written by [@remiolsen](https://github.com/remiolsen) but it would not exist without the contributions of [@FranBonath](https://github.com/FranBonath), [@taborsak](https://github.com/taborsak), [@ssjunnebo](https://github.com/ssjunnebo) and Carl Rubin.
Also, the [Anglerfish logo](docs/Anglerfish_logo.svg) was designed by [@FranBonath](https://github.com/FranBonath).

<p align="center">
<img src="docs/Anglerfish_logo.svg">
</p>
95 changes: 53 additions & 42 deletions anglerfish.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@

def run_demux(args):

adaptor_path = os.path.join(args.out_fastq,"adaptors.fasta")
aln_path = os.path.join(args.out_fastq,"out.paf")
os.mkdir(args.out_fastq)
ss = SampleSheet(args.samplesheet)
version = pkg_resources.get_distribution("anglerfish").version
Expand All @@ -29,67 +27,82 @@ def run_demux(args):
exit()
log.debug("Samplesheet bc_dist == {}".format(bc_dist))

# TODO: split into several adaptor files and run minimap more than once
with open(adaptor_path, "w") as f:
f.write(ss.get_fastastring())
retcode = run_minimap2(args.in_fastq, adaptor_path, aln_path, args.threads)

# Easy line count in input fastqfile
fq_entries = 0
with gzip.open(args.in_fastq, 'rb') as f:
for i in f:
fq_entries += 1
fq_entries = int(fq_entries / 4)

# Sort the adaptors by type and size
adaptors_t = [adaptor.name for sample, adaptor in ss]
adaptors_t = [(adaptor.name, os.path.abspath(fastq)) for sample, adaptor, fastq in ss]
adaptor_set = set(adaptors_t)
adaptors_sorted = dict([(i, []) for i in adaptor_set])
for sample, adaptor in ss:
adaptors_sorted[adaptor.name].append((sample, adaptor))
for sample, adaptor, fastq in ss:
adaptors_sorted[(adaptor.name, os.path.abspath(fastq))].append((sample, adaptor))

paf_entries = parse_paf_lines(aln_path)
paf_stats = {}
sample_stats = []
out_fastqs = []
stats = ["Anglerfish v. "+version, "===================",""]
for adaptor in adaptor_set:
fragments, singletons, concats, unknowns = layout_matches(adaptor+"_i5",adaptor+"_i7",paf_entries)
all_samples = []

for key, sample in adaptors_sorted.items():

adaptor_name, fastq_path = key
sample0_name, adaptor0_object = sample[0]

aln_name = "{}_{}".format(adaptor_name,os.path.basename(fastq_path).split('.')[0])
assert aln_name not in paf_stats, "Input fastq files can not have the same filename"
aln_path = os.path.join(args.out_fastq, "{}.paf".format(aln_name))
adaptor_path = os.path.join(args.out_fastq,"{}.fasta".format(adaptor_name))
with open(adaptor_path, "w") as f:
f.write(ss.get_fastastring(adaptor_name))
retcode = run_minimap2(fastq_path, adaptor_path, aln_path, args.threads)

# Easy line count in input fastqfile
fq_entries = 0
with gzip.open(fastq_path, 'rb') as f:
for i in f:
fq_entries += 1
fq_entries = int(fq_entries / 4)
paf_entries = parse_paf_lines(aln_path)

fragments, singletons, concats, unknowns = layout_matches(adaptor_name+"_i5",adaptor_name+"_i7",paf_entries)
total = len(fragments)+len(singletons)+len(concats)+len(unknowns)
stats.append(adaptor+":")
stats.append("{}\tinput reads".format(fq_entries))
stats.append("{}\treads aligning to adaptor sequences ({:.2f}%)".format(total, (total/float(fq_entries)*100)))
stats.append("{}\taligned reads matching both I7 and I5 adaptor ({:.2f}%)".format(len(fragments), (len(fragments)/float(total)*100)))
stats.append("{}\taligned reads matching only I7 or I5 adaptor ({:.2f}%)".format(len(singletons), (len(singletons)/float(total)*100)))
stats.append("{}\taligned reads matching multiple I7/I5 adaptor pairs ({:.2f}%)".format(len(concats), (len(concats)/float(total)*100)))
stats.append("{}\taligned reads with uncategorized alignments ({:.2f}%)".format(len(unknowns), (len(unknowns)/float(total)*100)))
matches = cluster_matches(adaptors_sorted[adaptor], adaptor, fragments, args.max_distance)
stats.append("")
stats.append("sample_name\t#reads")
paf_stats[aln_name] = []
paf_stats[aln_name].append(aln_name+":")
paf_stats[aln_name].append("{}\tinput reads".format(fq_entries))
paf_stats[aln_name].append("{}\treads aligning to adaptor sequences ({:.2f}%)".format(total, (total/float(fq_entries)*100)))
paf_stats[aln_name].append("{}\taligned reads matching both I7 and I5 adaptor ({:.2f}%)".format(len(fragments), (len(fragments)/float(total)*100)))
paf_stats[aln_name].append("{}\taligned reads matching only I7 or I5 adaptor ({:.2f}%)".format(len(singletons), (len(singletons)/float(total)*100)))
paf_stats[aln_name].append("{}\taligned reads matching multiple I7/I5 adaptor pairs ({:.2f}%)".format(len(concats), (len(concats)/float(total)*100)))
paf_stats[aln_name].append("{}\taligned reads with uncategorized alignments ({:.2f}%)".format(len(unknowns), (len(unknowns)/float(total)*100)))
matches = cluster_matches(adaptors_sorted[key], adaptor_name, fragments, args.max_distance)

aligned_samples = []
for k, v in groupby(sorted(matches,key=lambda x: x[3]), key=lambda y: y[3]):
aligned_samples.append(k)
fq_name = os.path.join(args.out_fastq, k+".fastq.gz")
out_fastqs.append(fq_name)
sample_dict = {i[0]: [i] for i in v}
stats.append("{}\t{}".format(k, len(sample_dict.keys())))
sample_stats.append("{}\t{}".format(k, len(sample_dict.keys())))
if not args.skip_demux:
write_demuxedfastq(sample_dict, args.in_fastq, fq_name)
write_demuxedfastq(sample_dict, fastq_path, fq_name)

all_samples.extend(aligned_samples)
# Check if there were samples in the samplesheet without adaptor alignments
for ss_sample, ss_adaptor in ss:
for ss_sample, ss_adaptor, ss_path in ss:
if ss_adaptor.name == adaptor and ss_sample not in aligned_samples:
stats.append("{}\t0".format(ss_sample))
sample_stats.append("{}\t0".format(ss_sample))

with open(os.path.join(args.out_fastq,"anglerfish_stats.txt"), "w") as f:
for i in stats:
f.write(i+"\n")
log.info(i)
f.write("Anglerfish v. "+version+"\n===================\n")
for key, line in paf_stats.items():
f.write("\n".join(line)+"\n")
f.write("\nsample_name\t#reads\n")
for sample in sample_stats:
f.write(sample+"\n")
if not args.skip_fastqc and not args.skip_demux:
fastqc = run_fastqc(out_fastqs, args.out_fastq, args.threads)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Tools to demux I7 and I5 barcodes when sequenced by single-molecules')
parser.add_argument('--in_fastq', '-i', required=True, help='Input ONT fastq file')
parser.add_argument('--out_fastq', '-o', default='.', help='Analysis output folder (default: Current dir)')
parser.add_argument('--samplesheet', '-s', required=True, help='CSV formatted list of samples and barcodes')
parser.add_argument('--out_fastq', '-o', default='.', help='Analysis output folder (default: Current dir)')
parser.add_argument('--threads', '-t', default=4, help='Number of threads to use (default: 4)')
parser.add_argument('--skip_demux', '-c', action='store_true', help='Only do BC counting and not demuxing')
parser.add_argument('--skip_fastqc', '-f', action='store_true', help='After demuxing, skip running FastQC+MultiQC')
Expand All @@ -100,9 +113,7 @@ def run_demux(args):
runname = utcnow.strftime("anglerfish_%Y_%m_%d_%H%M%S")

assert os.path.exists(args.out_fastq)
assert os.path.exists(args.in_fastq)
assert os.path.exists(args.samplesheet)
args.out_fastq = os.path.join(os.path.abspath(args.out_fastq),runname)
args.in_fastq = os.path.abspath(args.in_fastq)
args.samplesheet = os.path.abspath(args.samplesheet)
run_demux(args)
70 changes: 48 additions & 22 deletions demux/samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import print_function
import csv
import Levenshtein as lev
import os
from itertools import combinations

adaptors = {
Expand Down Expand Up @@ -57,49 +58,74 @@ class SampleSheet(object):
def __init__(self, input_csv):

# Read samplesheet in format:
# sample_name, adaptors, i7_index, i5_index
# sample_name, adaptors, i7_index(-i5_index), fastq_path

self.samplesheet = []
try:
csvfile = open(input_csv, "r")
dialect = csv.Sniffer().sniff(csvfile.readline(), [',',';','\t'])
csvfile.seek(0)
data = csv.DictReader(csvfile,
fieldnames=['sample_name', 'adaptors', 'i7_index', 'i5_index'], dialect=dialect)
fieldnames=['sample_name', 'adaptors', 'index', 'fastq_path'], dialect=dialect)
rn = 1
for row in data:
if row['adaptors'] not in adaptors:
raise UserWarning("'{}' not in the list of valid adaptors: {}".format(row['adaptors'],adaptors.keys()))
self.samplesheet.append((row['sample_name'], Adaptor(row['adaptors'], row['i7_index'], row['i5_index'])))
i7i5 = row["index"].split("-")
i7 = i7i5[0]; i5 = None
if len(i7i5) > 1:
i5 = i7i5[1]

sample_name = row['sample_name']
# TODO: find a more clever way of resolving duplicate names
if row['sample_name'] in [sn[0] for sn in self.samplesheet]:
sample_name = sample_name+"_row"+str(rn)
assert os.path.exists(row['fastq_path']) == 1
self.samplesheet.append((sample_name, Adaptor(row['adaptors'], i7, i5),row['fastq_path']))
rn += 1
except:
raise
finally:
csvfile.close()

def minimum_bc_distance(self):

testset = []
for sample, adaptor in self.samplesheet:
if adaptor.i5_index is not None:
testset.append(adaptor.i5_index+adaptor.i7_index)
ss_by_fastq = {}
testset = {}
for _, adaptor, fastq in self.samplesheet:
if fastq in ss_by_fastq:
ss_by_fastq[fastq].append(adaptor)
else:
testset.append(adaptor.i7_index)


distances=[]
if len(testset) == 1:
distances = [len(testset[0])]
else:
for a, b in [i for i in combinations(testset, 2)]:
distances.append(lev.distance(a,b))

return min(distances)
ss_by_fastq[fastq] = [adaptor]

for fastq, adaptors in ss_by_fastq.items():
testset[fastq] = []
for adaptor in adaptors:
if adaptor.i5_index is not None:
testset[fastq].append(adaptor.i5_index+adaptor.i7_index)
else:
testset[fastq].append(adaptor.i7_index)

fq_distances=[]
for fastq, adaptors in testset.items():
distances = []
if len(adaptors) == 1:
distances = [len(adaptors[0])]
else:
for a, b in [i for i in combinations(adaptors, 2)]:
distances.append(lev.distance(a,b))
fq_distances.append(min(distances))
return min(fq_distances)

def get_fastastring(self):
def get_fastastring(self, adaptor_name=None):

fastas = {}
for sample, adaptor in self.samplesheet:
fastas[adaptor.name+"_i7"] = adaptor.get_i7_mask()
fastas[adaptor.name+"_i5"] = adaptor.get_i5_mask()
for sample, adaptor, fastq in self.samplesheet:
if adaptor.name == adaptor_name or adaptor_name is None:
fastas[adaptor.name+"_i7"] = adaptor.get_i7_mask()
fastas[adaptor.name+"_i5"] = adaptor.get_i5_mask()

assert len(fastas) > 0

outstr = ""
for key, seq in fastas.items():
Expand Down
Loading

0 comments on commit e101131

Please sign in to comment.