Commit c8f7d66

Merge pull request #14 from remiolsen/dev
0.4.0 Changes
remiolsen authored Jul 27, 2021
2 parents e101131 + 4ac04ec commit c8f7d66
Showing 6 changed files with 65 additions and 23 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/anglerfish.yml
@@ -12,11 +12,12 @@ jobs:
     steps:
       # Checkout code and install miniconda + environment
       - uses: actions/checkout@v2
-      - uses: goanpeca/setup-miniconda@v1
+      - uses: conda-incubator/setup-miniconda@v2
         with:
           activate-environment: anglerfish
           environment-file: environment.yml
           python-version: ${{ matrix.python-version }}
           auto-update-conda: true
+          auto-activate-base: false

       # Install Anglerfish
3 changes: 2 additions & 1 deletion README.md
@@ -22,6 +22,7 @@ Python modules:

 * biopython v. 1.70
 * python-levenshtein v. 0.12.0
+* numpy v. 1.19.2

 Software:

@@ -98,7 +99,7 @@ anglerfish.py -o /path/to/samples.csv
   --skip_demux, -c      Only do BC counting and not demuxing
   --skip_fastqc, -f     After demuxing, skip running FastQC+MultiQC
   --max-distance MAX_DISTANCE, -m MAX_DISTANCE
-                        Maximum edit distance for BC matching (default: 2)
+                        Manually adjust maximum edit distance for BC matching
 ```

 ### Output files
48 changes: 43 additions & 5 deletions anglerfish.py
@@ -5,9 +5,12 @@
 import logging
 import sys
 import os
+import json
 import pkg_resources
+import numpy as np
 from datetime import datetime as dt
 from itertools import groupby
+from collections import Counter
 from demux.demux import run_minimap2, parse_paf_lines, layout_matches, cluster_matches, write_demuxedfastq, run_fastqc
 from demux.samplesheet import SampleSheet
 import gzip
@@ -20,8 +23,15 @@ def run_demux(args):
     os.mkdir(args.out_fastq)
     ss = SampleSheet(args.samplesheet)
     version = pkg_resources.get_distribution("anglerfish").version
-
     log.info(" version {}".format(version))
     log.info(" arguments {}".format(vars(args)))
     bc_dist = ss.minimum_bc_distance()
+    if bc_dist == 0:
+        log.error("There is one or more identical barcodes in the input samplesheet. Aborting!")
+        exit()
+    if args.max_distance == None:
+        args.max_distance = bc_dist - 1
+        log.info("Using maximum edit distance of {}".format(args.max_distance))
     if args.max_distance >= bc_dist:
         log.error(" Edit distance of barcodes in samplesheet are less than the minimum specified {}>={}".format(args.max_distance, bc_dist))
         exit()
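
The new block above changes how the default is chosen: when --max-distance is not given on the command line, it is now derived from the smallest pairwise edit distance between the barcodes in the samplesheet instead of a fixed 2. A minimal sketch of that logic, assuming the samplesheet boils down to a plain list of index strings (values hypothetical; the real check lives in SampleSheet.minimum_bc_distance):

# Sketch only, not the project's code
import Levenshtein as lev
from itertools import combinations

def minimum_bc_distance(barcodes):
    # Smallest pairwise edit distance among all samplesheet indexes
    return min(lev.distance(a, b) for a, b in combinations(barcodes, 2))

barcodes = ["ACGTACGT", "ACGTTCGT", "TTGCAAGC"]  # hypothetical i7 indexes
bc_dist = minimum_bc_distance(barcodes)          # 1 for the list above
if bc_dist == 0:
    raise SystemExit("identical barcodes in samplesheet")
max_distance = bc_dist - 1  # default stays strictly below the closest pair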
@@ -36,6 +46,7 @@ def run_demux(args):

     paf_stats = {}
     sample_stats = []
+    unmatched_stats = []
     out_fastqs = []
     all_samples = []

@@ -62,6 +73,7 @@

         fragments, singletons, concats, unknowns = layout_matches(adaptor_name+"_i5",adaptor_name+"_i7",paf_entries)
         total = len(fragments)+len(singletons)+len(concats)+len(unknowns)
+
         paf_stats[aln_name] = []
         paf_stats[aln_name].append(aln_name+":")
         paf_stats[aln_name].append("{}\tinput reads".format(fq_entries))
@@ -70,15 +82,24 @@ def run_demux(args):
         paf_stats[aln_name].append("{}\taligned reads matching only I7 or I5 adaptor ({:.2f}%)".format(len(singletons), (len(singletons)/float(total)*100)))
         paf_stats[aln_name].append("{}\taligned reads matching multiple I7/I5 adaptor pairs ({:.2f}%)".format(len(concats), (len(concats)/float(total)*100)))
         paf_stats[aln_name].append("{}\taligned reads with uncategorized alignments ({:.2f}%)".format(len(unknowns), (len(unknowns)/float(total)*100)))
-        matches = cluster_matches(adaptors_sorted[key], adaptor_name, fragments, args.max_distance)
+        no_matches, matches = cluster_matches(adaptors_sorted[key], adaptor_name, fragments, args.max_distance)

         aligned_samples = []
         for k, v in groupby(sorted(matches,key=lambda x: x[3]), key=lambda y: y[3]):
             aligned_samples.append(k)
             fq_name = os.path.join(args.out_fastq, k+".fastq.gz")
             out_fastqs.append(fq_name)
             sample_dict = {i[0]: [i] for i in v}
-            sample_stats.append("{}\t{}".format(k, len(sample_dict.keys())))
+
+            # Find read lengths
+            rlens = np.array([])
+            for l,w in sample_dict.items():
+                for i in w:
+                    rlens = np.append(rlens, i[2]-i[1])
+            rmean = np.round(np.mean(rlens),2)
+            rstd = np.round(np.std(rlens),2)
+
+            sample_stats.append("{}\t{}\t{}\t{}".format(k, len(sample_dict.keys()), rmean, rstd))
             if not args.skip_demux:
                 write_demuxedfastq(sample_dict, fastq_path, fq_name)
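
The two new columns in sample_stats are the mean and standard deviation of the adaptor-to-adaptor insert lengths (i[2]-i[1]). An equivalent one-pass sketch, assuming sample_dict maps read names to lists of (read, insert_start, insert_end, sample) tuples as in the hunk above:

import numpy as np

# Hypothetical sample_dict with two single-hit reads
sample_dict = {"read1": [("read1", 120, 430, "P1_101")],
               "read2": [("read2", 110, 415, "P1_101")]}
rlens = np.array([hit[2] - hit[1] for hits in sample_dict.values() for hit in hits])
rmean, rstd = np.round(np.mean(rlens), 2), np.round(np.std(rlens), 2)  # 307.5, 2.5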

@@ -88,13 +109,30 @@ def run_demux(args):
             if ss_adaptor.name == adaptor and ss_sample not in aligned_samples:
                 sample_stats.append("{}\t0".format(ss_sample))

+        # Top unmatched indexes
+        nomatch_count = Counter([x[3] for x in no_matches])
+        unmatched_stats.append(nomatch_count.most_common(10))
+
+    header1 = ["sample_name","#reads","mean_read_len","std_read_len"]
+    header2 = ["undetermined_index","count"]
+    json_out = {"angerfish_version":version,"paf_stats": [], "sample_stats": [], "undetermined": []}
     with open(os.path.join(args.out_fastq,"anglerfish_stats.txt"), "w") as f:
         f.write("Anglerfish v. "+version+"\n===================\n")
         for key, line in paf_stats.items():
             f.write("\n".join(line)+"\n")
-        f.write("\nsample_name\t#reads\n")
+            json_out["paf_stats"].append(line)
+        f.write("\n{}\n".format("\t".join(header1)))
         for sample in sample_stats:
             f.write(sample+"\n")
+            json_out["sample_stats"].append(dict(zip(header1,sample.split("\t"))))
+        f.write("\n{}\n".format("\t".join(header2)))
+        for unmatch in unmatched_stats:
+            for idx, mnum in unmatch:
+                f.write("{}\t{}\n".format(idx, mnum))
+                json_out["undetermined"].append(dict(zip(header2,[idx, mnum])))
+
+    with open(os.path.join(args.out_fastq,"anglerfish_stats.json"), "w") as f:
+        f.write(json.dumps(json_out,indent=2, sort_keys=True))
     if not args.skip_fastqc and not args.skip_demux:
         fastqc = run_fastqc(out_fastqs, args.out_fastq, args.threads)
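
Besides the text report, the run now also writes anglerfish_stats.json with the same content in machine-readable form. Given header1 and header2 above, the JSON has roughly this shape (all values hypothetical; note the version key is spelled "angerfish_version" in the code):

# Hypothetical example of the anglerfish_stats.json structure
json_out = {
    "angerfish_version": "0.4.0",
    "paf_stats": [["truseq:", "100000\tinput reads", "..."]],
    "sample_stats": [
        {"sample_name": "P1_101", "#reads": "9000",
         "mean_read_len": "307.5", "std_read_len": "2.5"}
    ],
    "undetermined": [{"undetermined_index": "acgtacgt+ttgcaagc", "count": 42}],
}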

@@ -106,7 +144,7 @@
     parser.add_argument('--threads', '-t', default=4, help='Number of threads to use (default: 4)')
     parser.add_argument('--skip_demux', '-c', action='store_true', help='Only do BC counting and not demuxing')
     parser.add_argument('--skip_fastqc', '-f', action='store_true', help='After demuxing, skip running FastQC+MultiQC')
-    parser.add_argument('--max-distance', '-m', default=2, type=int, help='Maximum edit distance for BC matching (default: 2)')
+    parser.add_argument('--max-distance', '-m', type=int, help='Manually adjust maximum edit distance for BC matching')
     parser.add_argument('--debug', '-d', action='store_true', help='Extra commandline output')
     args = parser.parse_args()
     utcnow = dt.utcnow()
26 changes: 13 additions & 13 deletions demux/demux.py
@@ -25,7 +25,7 @@ def parse_cs(cs_string, index, max_distance):
     nts = "".join(re.findall(nt, cs_string))

     # Allow for mismatches
-    return lev.distance(index.lower(), nts)
+    return nts, lev.distance(index.lower(), nts)

 def run_fastqc(fastqs, out_path, threads):
     """
@@ -157,7 +157,7 @@ def layout_matches(i5_name, i7_name, paf_entries):
 def cluster_matches(sample_adaptor, adaptor_name, matches, max_distance):

     # Only illumina fragments
-    matched = {}; matched_bed = []; unmatched = {}
+    matched = {}; matched_bed = []; unmatched_bed = []
     for read, alignments in matches.items():

         i5 = False
@@ -173,35 +173,35 @@ def cluster_matches(sample_adaptor, adaptor_name, matches, max_distance):
             continue

         dists = []
+        fi5 = ""; fi7 = ""
         for sample, adaptor in sample_adaptor:
             try:
-                d1 = parse_cs(i5['cs'], adaptor.i5_index, max_distance)
+                fi5, d1 = parse_cs(i5['cs'], adaptor.i5_index, max_distance)
             except AttributeError:
                 d1 = 0 # presumably there it's single index
-            d2 = parse_cs(i7['cs'], adaptor.i7_index, max_distance)
+            fi7, d2 = parse_cs(i7['cs'], adaptor.i7_index, max_distance)
             dists.append(d1+d2)

         index_min = min(range(len(dists)), key=dists.__getitem__)
         # Test if two samples in the sheet is equidistant to the i5/i7
         if len([i for i, j in enumerate(dists) if j==dists[index_min]]) > 1:
             log.debug(" Ambiguous alignment, skipping")
-            unmatched[read] = alignments
             continue
-        if dists[index_min] > max_distance:
-            log.debug(" No match")
-            unmatched[read] = alignments
-            continue

         start_insert = min(i5['rend'],i7['rend'])
         end_insert = max(i7['rstart'],i5['rstart'])
         if end_insert - start_insert < 10:
             log.debug(" Erroneous / overlapping adaptor matches")
-            unmatched[read] = alignments
             continue

+        if dists[index_min] > max_distance:
+            log.debug(" No match")
+            # Find only full length i7(+i5) adaptor combos. Basically a list of "known unknowns"
+            if len(fi7) + len(fi5) == len(adaptor.i7_index or "") + len(adaptor.i5_index or ""):
+                fi75 = "+".join([i for i in [fi7, fi5] if not i == ""])
+                unmatched_bed.append([read, start_insert, end_insert, fi75, "999", "."])
+            continue
         matched[read] = alignments
         matched_bed.append([read, start_insert, end_insert, sample_adaptor[index_min][0], "999", "."])
-    return matched_bed
+    return unmatched_bed, matched_bed
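
The "No match" branch now runs after the insert coordinates are computed, so near-miss reads can be recorded as undetermined instead of silently dropped. Only hits whose matched bases cover the full index lengths make the list, which keeps partial alignments out of the "known unknowns". A sketch of that check with hypothetical values:

# Hypothetical matched bases from parse_cs plus one samplesheet adaptor
fi7, fi5 = "acgtacgt", "ttgcaagc"
i7_index, i5_index = "ACGTACGT", "TTGCAAGC"  # i5_index may be None on single-index runs

# Full-length i7(+i5) combos only, hence the `or ""` guards
if len(fi7) + len(fi5) == len(i7_index or "") + len(i5_index or ""):
    fi75 = "+".join(i for i in (fi7, fi5) if i)  # "acgtacgt+ttgcaagc"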



3 changes: 2 additions & 1 deletion environment.yml
@@ -5,6 +5,7 @@ channels:
 dependencies:
   - conda-forge::biopython=1.70
   - conda-forge::python-levenshtein=0.12.0
+  - conda-forge::numpy=1.19.2
   - bioconda::minimap2=2.17
   - bioconda::fastqc=0.11.9
-  - bioconda::multiqc=1.8
+  - bioconda::multiqc=1.9
5 changes: 3 additions & 2 deletions setup.py
@@ -4,7 +4,7 @@

 setup(
     name='anglerfish',
-    version='0.3.0',
+    version='0.4.0',
     description='Anglerfish, a tool to demultiplex Illumina libraries from ONT data',
     author='Remi-Andre Olsen',
     author_email='[email protected]',
@@ -13,7 +13,8 @@
     packages = find_packages(),
     install_requires=[
         'python-levenshtein',
-        'biopython'
+        'biopython',
+        'numpy'
     ],
     scripts=['./anglerfish.py'],
     zip_safe=False,