Commit c8f7d66

Merge pull request #14 from remiolsen/dev
0.4.0 Changes
remiolsen authored Jul 27, 2021
2 parents e101131 + 4ac04ec commit c8f7d66
Showing 6 changed files with 65 additions and 23 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/anglerfish.yml
@@ -12,11 +12,12 @@ jobs:
     steps:
       # Checkout code and install miniconda + environment
       - uses: actions/checkout@v2
-      - uses: goanpeca/setup-miniconda@v1
+      - uses: conda-incubator/setup-miniconda@v2
         with:
           activate-environment: anglerfish
           environment-file: environment.yml
           python-version: ${{ matrix.python-version }}
           auto-update-conda: true
+          auto-activate-base: false

       # Install Anglerfish
3 changes: 2 additions & 1 deletion README.md
@@ -22,6 +22,7 @@ Python modules:

 * biopython v. 1.70
 * python-levenshtein v. 0.12.0
+* numpy v. 1.19.2

 Software:

@@ -98,7 +99,7 @@ anglerfish.py -o /path/to/samples.csv
   --skip_demux, -c      Only do BC counting and not demuxing
   --skip_fastqc, -f     After demuxing, skip running FastQC+MultiQC
   --max-distance MAX_DISTANCE, -m MAX_DISTANCE
-                        Maximum edit distance for BC matching (default: 2)
+                        Manually adjust maximum edit distance for BC matching
 ```

 ### Output files
48 changes: 43 additions & 5 deletions anglerfish.py
@@ -5,9 +5,12 @@
 import logging
 import sys
 import os
+import json
 import pkg_resources
+import numpy as np
 from datetime import datetime as dt
 from itertools import groupby
+from collections import Counter
 from demux.demux import run_minimap2, parse_paf_lines, layout_matches, cluster_matches, write_demuxedfastq, run_fastqc
 from demux.samplesheet import SampleSheet
 import gzip
@@ -20,8 +23,15 @@ def run_demux(args):
     os.mkdir(args.out_fastq)
     ss = SampleSheet(args.samplesheet)
     version = pkg_resources.get_distribution("anglerfish").version
-
     log.info(" version {}".format(version))
     log.info(" arguments {}".format(vars(args)))
     bc_dist = ss.minimum_bc_distance()
+    if bc_dist == 0:
+        log.error("There is one or more identical barcodes in the input samplesheet. Aborting!")
+        exit()
+    if args.max_distance == None:
+        args.max_distance = bc_dist - 1
+        log.info("Using maximum edit distance of {}".format(args.max_distance))
     if args.max_distance >= bc_dist:
         log.error(" Edit distance of barcodes in samplesheet are less than the minimum specified {}>={}".format(args.max_distance, bc_dist))
         exit()
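
The new block above changes how the default is chosen: when --max-distance is not given on the command line, it is now derived from the smallest pairwise edit distance between the barcodes in the samplesheet instead of a fixed 2. A minimal sketch of that logic, assuming the samplesheet boils down to a plain list of index strings (values hypothetical; the real check lives in SampleSheet.minimum_bc_distance):

# Sketch only, not the project's code
import Levenshtein as lev
from itertools import combinations

def minimum_bc_distance(barcodes):
    # Smallest pairwise edit distance among all samplesheet indexes
    return min(lev.distance(a, b) for a, b in combinations(barcodes, 2))

barcodes = ["ACGTACGT", "ACGTTCGT", "TTGCAAGC"]  # hypothetical i7 indexes
bc_dist = minimum_bc_distance(barcodes)          # 1 for the list above
if bc_dist == 0:
    raise SystemExit("identical barcodes in samplesheet")
max_distance = bc_dist - 1  # default stays strictly below the closest pair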
@@ -36,6 +46,7 @@ def run_demux(args):

     paf_stats = {}
     sample_stats = []
+    unmatched_stats = []
     out_fastqs = []
     all_samples = []

@@ -62,6 +73,7 @@

         fragments, singletons, concats, unknowns = layout_matches(adaptor_name+"_i5",adaptor_name+"_i7",paf_entries)
         total = len(fragments)+len(singletons)+len(concats)+len(unknowns)
+
         paf_stats[aln_name] = []
         paf_stats[aln_name].append(aln_name+":")
         paf_stats[aln_name].append("{}\tinput reads".format(fq_entries))
@@ -70,15 +82,24 @@ def run_demux(args):
         paf_stats[aln_name].append("{}\taligned reads matching only I7 or I5 adaptor ({:.2f}%)".format(len(singletons), (len(singletons)/float(total)*100)))
         paf_stats[aln_name].append("{}\taligned reads matching multiple I7/I5 adaptor pairs ({:.2f}%)".format(len(concats), (len(concats)/float(total)*100)))
         paf_stats[aln_name].append("{}\taligned reads with uncategorized alignments ({:.2f}%)".format(len(unknowns), (len(unknowns)/float(total)*100)))
-        matches = cluster_matches(adaptors_sorted[key], adaptor_name, fragments, args.max_distance)
+        no_matches, matches = cluster_matches(adaptors_sorted[key], adaptor_name, fragments, args.max_distance)

         aligned_samples = []
         for k, v in groupby(sorted(matches,key=lambda x: x[3]), key=lambda y: y[3]):
             aligned_samples.append(k)
             fq_name = os.path.join(args.out_fastq, k+".fastq.gz")
             out_fastqs.append(fq_name)
             sample_dict = {i[0]: [i] for i in v}
-            sample_stats.append("{}\t{}".format(k, len(sample_dict.keys())))
+
+            # Find read lengths
+            rlens = np.array([])
+            for l,w in sample_dict.items():
+                for i in w:
+                    rlens = np.append(rlens, i[2]-i[1])
+            rmean = np.round(np.mean(rlens),2)
+            rstd = np.round(np.std(rlens),2)
+
+            sample_stats.append("{}\t{}\t{}\t{}".format(k, len(sample_dict.keys()), rmean, rstd))
             if not args.skip_demux:
                 write_demuxedfastq(sample_dict, fastq_path, fq_name)
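
The two new columns in sample_stats are the mean and standard deviation of the adaptor-to-adaptor insert lengths (i[2]-i[1]). An equivalent one-pass sketch, assuming sample_dict maps read names to lists of (read, insert_start, insert_end, sample) tuples as in the hunk above:

import numpy as np

# Hypothetical sample_dict with two single-hit reads
sample_dict = {"read1": [("read1", 120, 430, "P1_101")],
               "read2": [("read2", 110, 415, "P1_101")]}
rlens = np.array([hit[2] - hit[1] for hits in sample_dict.values() for hit in hits])
rmean, rstd = np.round(np.mean(rlens), 2), np.round(np.std(rlens), 2)  # 307.5, 2.5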

@@ -88,13 +109,30 @@ def run_demux(args):
             if ss_adaptor.name == adaptor and ss_sample not in aligned_samples:
                 sample_stats.append("{}\t0".format(ss_sample))

+        # Top unmatched indexes
+        nomatch_count = Counter([x[3] for x in no_matches])
+        unmatched_stats.append(nomatch_count.most_common(10))
+
+    header1 = ["sample_name","#reads","mean_read_len","std_read_len"]
+    header2 = ["undetermined_index","count"]
+    json_out = {"angerfish_version":version,"paf_stats": [], "sample_stats": [], "undetermined": []}
     with open(os.path.join(args.out_fastq,"anglerfish_stats.txt"), "w") as f:
         f.write("Anglerfish v. "+version+"\n===================\n")
         for key, line in paf_stats.items():
             f.write("\n".join(line)+"\n")
-        f.write("\nsample_name\t#reads\n")
+            json_out["paf_stats"].append(line)
+        f.write("\n{}\n".format("\t".join(header1)))
         for sample in sample_stats:
             f.write(sample+"\n")
+            json_out["sample_stats"].append(dict(zip(header1,sample.split("\t"))))
+        f.write("\n{}\n".format("\t".join(header2)))
+        for unmatch in unmatched_stats:
+            for idx, mnum in unmatch:
+                f.write("{}\t{}\n".format(idx, mnum))
+                json_out["undetermined"].append(dict(zip(header2,[idx, mnum])))
+
+    with open(os.path.join(args.out_fastq,"anglerfish_stats.json"), "w") as f:
+        f.write(json.dumps(json_out,indent=2, sort_keys=True))
     if not args.skip_fastqc and not args.skip_demux:
         fastqc = run_fastqc(out_fastqs, args.out_fastq, args.threads)
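
Besides the text report, the run now also writes anglerfish_stats.json with the same content in machine-readable form. Given header1 and header2 above, the JSON has roughly this shape (all values hypothetical; note the version key is spelled "angerfish_version" in the code):

# Hypothetical example of the anglerfish_stats.json structure
json_out = {
    "angerfish_version": "0.4.0",
    "paf_stats": [["truseq:", "100000\tinput reads", "..."]],
    "sample_stats": [
        {"sample_name": "P1_101", "#reads": "9000",
         "mean_read_len": "307.5", "std_read_len": "2.5"}
    ],
    "undetermined": [{"undetermined_index": "acgtacgt+ttgcaagc", "count": 42}],
}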

@@ -106,7 +144,7 @@
     parser.add_argument('--threads', '-t', default=4, help='Number of threads to use (default: 4)')
     parser.add_argument('--skip_demux', '-c', action='store_true', help='Only do BC counting and not demuxing')
     parser.add_argument('--skip_fastqc', '-f', action='store_true', help='After demuxing, skip running FastQC+MultiQC')
-    parser.add_argument('--max-distance', '-m', default=2, type=int, help='Maximum edit distance for BC matching (default: 2)')
+    parser.add_argument('--max-distance', '-m', type=int, help='Manually adjust maximum edit distance for BC matching')
     parser.add_argument('--debug', '-d', action='store_true', help='Extra commandline output')
     args = parser.parse_args()
     utcnow = dt.utcnow()
26 changes: 13 additions & 13 deletions demux/demux.py
@@ -25,7 +25,7 @@ def parse_cs(cs_string, index, max_distance):
     nts = "".join(re.findall(nt, cs_string))

     # Allow for mismatches
-    return lev.distance(index.lower(), nts)
+    return nts, lev.distance(index.lower(), nts)

 def run_fastqc(fastqs, out_path, threads):
     """
@@ -157,7 +157,7 @@ def layout_matches(i5_name, i7_name, paf_entries):
 def cluster_matches(sample_adaptor, adaptor_name, matches, max_distance):

     # Only illumina fragments
-    matched = {}; matched_bed = []; unmatched = {}
+    matched = {}; matched_bed = []; unmatched_bed = []
     for read, alignments in matches.items():

         i5 = False
@@ -173,35 +173,35 @@ def cluster_matches(sample_adaptor, adaptor_name, matches, max_distance):
             continue

         dists = []
+        fi5 = ""; fi7 = ""
         for sample, adaptor in sample_adaptor:
             try:
-                d1 = parse_cs(i5['cs'], adaptor.i5_index, max_distance)
+                fi5, d1 = parse_cs(i5['cs'], adaptor.i5_index, max_distance)
             except AttributeError:
                 d1 = 0 # presumably there it's single index
-            d2 = parse_cs(i7['cs'], adaptor.i7_index, max_distance)
+            fi7, d2 = parse_cs(i7['cs'], adaptor.i7_index, max_distance)
             dists.append(d1+d2)

         index_min = min(range(len(dists)), key=dists.__getitem__)
         # Test if two samples in the sheet is equidistant to the i5/i7
         if len([i for i, j in enumerate(dists) if j==dists[index_min]]) > 1:
             log.debug(" Ambiguous alignment, skipping")
-            unmatched[read] = alignments
             continue
-        if dists[index_min] > max_distance:
-            log.debug(" No match")
-            unmatched[read] = alignments
-            continue

         start_insert = min(i5['rend'],i7['rend'])
         end_insert = max(i7['rstart'],i5['rstart'])
         if end_insert - start_insert < 10:
             log.debug(" Erroneous / overlapping adaptor matches")
-            unmatched[read] = alignments
             continue

+        if dists[index_min] > max_distance:
+            log.debug(" No match")
+            # Find only full length i7(+i5) adaptor combos. Basically a list of "known unknowns"
+            if len(fi7) + len(fi5) == len(adaptor.i7_index or "") + len(adaptor.i5_index or ""):
+                fi75 = "+".join([i for i in [fi7, fi5] if not i == ""])
+                unmatched_bed.append([read, start_insert, end_insert, fi75, "999", "."])
+            continue
         matched[read] = alignments
         matched_bed.append([read, start_insert, end_insert, sample_adaptor[index_min][0], "999", "."])
-    return matched_bed
+    return unmatched_bed, matched_bed
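
The "No match" branch now runs after the insert coordinates are computed, so near-miss reads can be recorded as undetermined instead of silently dropped. Only hits whose matched bases cover the full index lengths make the list, which keeps partial alignments out of the "known unknowns". A sketch of that check with hypothetical values:

# Hypothetical matched bases from parse_cs plus one samplesheet adaptor
fi7, fi5 = "acgtacgt", "ttgcaagc"
i7_index, i5_index = "ACGTACGT", "TTGCAAGC"  # i5_index may be None on single-index runs

# Full-length i7(+i5) combos only, hence the `or ""` guards
if len(fi7) + len(fi5) == len(i7_index or "") + len(i5_index or ""):
    fi75 = "+".join(i for i in (fi7, fi5) if i)  # "acgtacgt+ttgcaagc"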



3 changes: 2 additions & 1 deletion environment.yml
@@ -5,6 +5,7 @@ channels:
 dependencies:
   - conda-forge::biopython=1.70
   - conda-forge::python-levenshtein=0.12.0
+  - conda-forge::numpy=1.19.2
   - bioconda::minimap2=2.17
   - bioconda::fastqc=0.11.9
-  - bioconda::multiqc=1.8
+  - bioconda::multiqc=1.9
5 changes: 3 additions & 2 deletions setup.py
@@ -4,7 +4,7 @@

 setup(
     name='anglerfish',
-    version='0.3.0',
+    version='0.4.0',
     description='Anglerfish, a tool to demultiplex Illumina libraries from ONT data',
     author='Remi-Andre Olsen',
     author_email='[email protected]',
@@ -13,7 +13,8 @@
     packages = find_packages(),
     install_requires=[
         'python-levenshtein',
-        'biopython'
+        'biopython',
+        'numpy'
     ],
     scripts=['./anglerfish.py'],
     zip_safe=False,