Merge pull request #222 from golobor/master

stats: calculate the divergence point and corresponding read fractions
open2c · Apr 7, 2024 · 0c1a0c7 · 0c1a0c7
2 parents cff8226 + 00ef8e6
commit 0c1a0c7
Show file tree

Hide file tree

Showing 5 changed files with 488 additions and 179 deletions.
diff --git a/doc/protocols_pipelines.rst b/doc/protocols_pipelines.rst
@@ -9,7 +9,7 @@ Typical Hi-C Workflow
 ----------------------
 
 A typical pairtools workflow for processing standard Hi-C data is outlined below. 
-Please, note that this is a shorter version; you can find a more detailed and reproducible example in chapter :ref:`examples/pairtools_walkthrough`.
+Please, note that this is a shorter version. For a detailed reproducible example, please, check the Jupyter notebook "Pairtools Walkthrough".
 
 1. Align sequences to the reference genome with ``bwa mem``:
 
@@ -103,6 +103,7 @@ Technical tips
         bwa mem -SP index input.R1.fastq input.R2.fastq | \
         pairtools parse -c chromsizes.txt | \
         pairtools sort | \
+        pairtools dedup | \
             --output output.nodups.pairs.gz \
             --output-dups output.dups.pairs.gz \
             --output-unmapped output.unmapped.pairs.gz 
@@ -116,8 +117,9 @@ Technical tips
   Each pairtool has the CLI flags --nproc-in and --nproc-out to control the number of cores dedicated 
   to input decompression and output compression. Additionally, `pairtools sort` parallelizes sorting with `--nproc`.ß
 
-Example Workflows
+Advanced Workflows
 ------------------
+
 For more advanced workflows, please check the following projects:
 
 - `Distiller-nf <https://github.com/open2c/distiller-nf>`_ is a feature-rich Open2C Hi-C processing pipeline for the Nextflow workflow manager.

diff --git a/doc/stats.rst b/doc/stats.rst
@@ -14,7 +14,7 @@ output file.
 
 - **Global statistics** include:
     - number of pairs (total, unmapped, single-side mapped, etc.),
-    - total number of different pair types (UU, NN, NU, and others, see ` Pair types in pairtools docs <https://pairtools.readthedocs.io/en/latest/formats.html#pair-types>`_),
+    - total number of different pair types (UU, NN, NU, and others, see `Pair types in pairtools docs <https://pairtools.readthedocs.io/en/latest/formats.html#pair-types>`_),
     - number of contacts between all chromosome pairs
 
 - **Summary statistics** include:
@@ -59,17 +59,23 @@ replacement from a finite pool of fragments in DNA library [1]_ [2]_.
 With each new sequenced molecule, the expected number of observed unique molecules
 increases according to a simple equation:
 
-$$ U(N+1) = U(N) + (1 - {U(N) \\over C}), $$
+.. math::
 
-where $N$ is the number of sequenced molecules, $U(N)$ is the expected number
-of observed unique molecules after sequencing $N$ molecules, and C is the library complexity.
+    U(N+1) = U(N) + \left(1 - \frac{U(N)}{C} \right),
+
+where :math:`N` is the number of sequenced molecules, :math:`U(N)` is the expected number
+of observed unique molecules after sequencing :math:`N` molecules, and :math:`C` is the library complexity.
 This differential equation yields [1, 2]:
 
-$$ {U(N) \\over C} = 1 - exp( - {N \\over C}), $$
+.. math::
+    
+    {U(N) \over C} = 1 - exp\left( - \frac{N}{C} \right),
 
 which can be solved as
 
-$$ C = \Re(lambert W( - { \exp( - {1 \\over u} ) \\over u} ) ) + {1 \\over u} $$
+.. math::
+
+    C = \Re \left( W_{Lambert} \left( - \frac{ \exp\left( - \frac{1}{U} \right) } {U} \right) \right) + \frac{1}{U}
 
 Library complexity can guide in the choice of sequencing depth of the library
 and provide an estimate of library quality.

diff --git a/pairtools/cli/stats.py b/pairtools/cli/stats.py
@@ -28,6 +28,14 @@
     " all overlapping statistics. Non-overlapping statistics are appended to"
     " the end of the file. Supported for tsv stats with single filter.",
 )
+@click.option(
+    "--n-dist-bins-decade",
+    type=int,
+    default=PairCounter.N_DIST_BINS_DECADE_DEFAULT,
+    show_default=True,
+    required=False,
+    help="Number of bins to split the distance range in log10-space, specified per a factor of 10 difference.",
+)
 @click.option(
     "--with-chromsizes/--no-chromsizes",
     is_flag=True,
@@ -107,7 +115,7 @@
 )
 @common_io_options
 def stats(
-    input_path, output, merge, bytile_dups, output_bytile_stats, filter, **kwargs
+    input_path, output, merge, n_dist_bins_decade, bytile_dups, output_bytile_stats, filter, **kwargs
 ):
     """Calculate pairs statistics.
 
@@ -123,6 +131,7 @@ def stats(
         input_path,
         output,
         merge,
+        n_dist_bins_decade,
         bytile_dups,
         output_bytile_stats,
         filter,
@@ -131,10 +140,10 @@ def stats(
 
 
 def stats_py(
-    input_path, output, merge, bytile_dups, output_bytile_stats, filter, **kwargs
+    input_path, output, merge, n_dist_bins_decade, bytile_dups, output_bytile_stats, filter, **kwargs
 ):
     if merge:
-        do_merge(output, input_path, **kwargs)
+        do_merge(output, input_path, n_dist_bins_decade=n_dist_bins_decade, **kwargs)
         return
 
     if len(input_path) == 0:
@@ -181,6 +190,7 @@ def stats_py(
         filter = None
 
     stats = PairCounter(
+        n_dist_bins_decade=n_dist_bins_decade,
         bytile_dups=bytile_dups,
         filters=filter,
         startup_code=kwargs.get("startup_code", ""),  # for evaluation of filters