Skip to content

Commit

Permalink
Merge pull request #1104 from cfe-lab/stitcher-read-csv
Browse files Browse the repository at this point in the history
Implement CSV reading for contig stitcher
  • Loading branch information
Donaim authored May 23, 2024
2 parents 56cbb48 + a701f28 commit 2a022da
Show file tree
Hide file tree
Showing 7 changed files with 346 additions and 230 deletions.
26 changes: 12 additions & 14 deletions docs/design/stitcher.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,11 @@ PYTHONPATH="/path/to/micall/repository" python3 -m micall.core.contig_stitcher -
Stitching is initiated either as a pipeline step in MiCall, or as a
command line call given above. In each case:

**Input:** The Stitcher receives a single input file in FASTA
**Input:** The Stitcher receives a single input file in CSV
format. This file contains 1 or more contigs that are the outcomes of
the previous assembly step. These contigs are essentially segments of
DNA sequences. They can vary significantly in length.
the previous assembly step, together with associated reference genome
information. These contigs are essentially segments of DNA
sequences. They can vary significantly in length.

**Output:** The sole output from the Stitcher is a CSV
file. This file holds the stitched sequences -- longer or fully
Expand Down Expand Up @@ -173,25 +174,22 @@ The setup process for the Stitcher ensures that each contig is
properly aligned and prepared for the stitching process. The steps are
as follows:

1. **Determine Reference Genome**: Identify a the best maching
reference genome for each contig based on its sequence data.

2. **Align Contigs**: Align each contig to its corresponding reference
1. **Align Contigs**: Align each contig to its corresponding reference
genome to approximate their positions within a global reference
framework, allowing for spatial comparison between different contigs.

3. **Split Multi-Alignment Contigs**: Split contigs that align to
2. **Split Multi-Alignment Contigs**: Split contigs that align to
multiple distinct parts of the reference genome into separate
segments.

4. **Handle Reverse Complement**: Reverse complement contigs that
3. **Handle Reverse Complement**: Reverse complement contigs that
align to the reverse strand of the reference genome to ensure all
sequences are oriented in the same direction.

5. **Sort Contigs**: Arrange the contigs based on their starting
4. **Sort Contigs**: Arrange the contigs based on their starting
positions along the reference genome.

6. **Group by Reference**: Group contigs such that all contigs
5. **Group by Reference**: Group contigs such that all contigs
associated with the same reference genome are processed together.

These setup steps perform minimal alteration to the original contigs
Expand Down Expand Up @@ -577,13 +575,13 @@ specifying the path to the output plot file. Here's an example of how
to stitch contigs and retrieve a visualizer plot:

```sh
PYTHONPATH="/path/to/micall/repository" python3 -m micall.core.contig_stitcher "contigs.fasta" "stitched_contigs.csv" --plot "visualized.svg"
PYTHONPATH="/path/to/micall/repository" python3 -m micall.core.contig_stitcher "contigs.csv" "stitched_contigs.csv" --plot "visualized.svg"
```

**Command Line Arguments:**

- `contigs.fasta`: Input file in FASTA format containing assembled
contigs.
- `contigs.csv`: Input file in CSV format containing assembled
contigs and related information.
- `stitched_contigs.csv`: Output CSV file that will contain the
stitched contigs.
- `--plot visualized.svg`: The optional argument to generate a visual
Expand Down
81 changes: 68 additions & 13 deletions micall/core/contig_stitcher.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from typing import Iterable, Optional, Tuple, List, Dict, Literal, TypeVar
from typing import Iterable, Optional, Tuple, List, Dict, Literal, TypeVar, TextIO, Sequence
from collections import defaultdict
import csv
import os
from dataclasses import replace
from math import ceil
from mappy import Aligner
Expand All @@ -13,6 +15,8 @@
from operator import itemgetter
from aligntools import Cigar, connect_cigar_hits, CigarHit

from micall.core.project_config import ProjectConfig
from micall.core.plot_contigs import plot_stitcher_coverage
from micall.utils.contig_stitcher_context import context, StitcherContext
from micall.utils.contig_stitcher_contigs import GenotypedContig, AlignedContig
import micall.utils.contig_stitcher_events as events
Expand Down Expand Up @@ -575,7 +579,7 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedCont
yield from combine_overlaps(aligned)


GroupRef = str
GroupRef = Optional[str]


def stitch_consensus(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedContig]:
Expand All @@ -597,12 +601,66 @@ def combine(group_ref):
yield from map(combine, consensus_parts)


def main(args):
def write_contigs(output_csv: TextIO, contigs: Iterable[GenotypedContig]):
writer = csv.DictWriter(output_csv,
['ref', 'match', 'group_ref', 'contig'],
lineterminator=os.linesep)
writer.writeheader()
for contig in contigs:
writer.writerow(dict(ref=contig.ref_name,
match=contig.match_fraction,
group_ref=contig.group_ref,
contig=contig.seq))

output_csv.flush()


def read_contigs(input_csv: TextIO) -> Iterable[GenotypedContig]:
projects = ProjectConfig.loadDefault()

for row in csv.DictReader(input_csv):
seq = row['contig']
ref_name = row['ref']
group_ref = row['group_ref']
match_fraction = float(row['match'])

try:
ref_seq = projects.getGenotypeReference(group_ref)
except KeyError:
try:
ref_seq = projects.getReference(group_ref)
except KeyError:
ref_seq = None

yield GenotypedContig(name=None,
seq=seq,
ref_name=ref_name,
group_ref=group_ref,
ref_seq=str(ref_seq) if ref_seq is not None else None,
match_fraction=match_fraction)


def run(input_csv: TextIO, output_csv: TextIO, stitcher_plot_path: Optional[str]) -> int:
with StitcherContext.fresh() as ctx:
contigs = list(read_contigs(input_csv))

if output_csv is not None or stitcher_plot_path is not None:
contigs = list(stitch_consensus(contigs))

if output_csv is not None:
write_contigs(output_csv, contigs)

if stitcher_plot_path is not None:
plot_stitcher_coverage(ctx.events, stitcher_plot_path)

return len(contigs)


def main(argv: Sequence[str]):
import argparse
from micall.core.denovo import write_contig_refs # TODO(vitalik): move denovo stuff here.

parser = argparse.ArgumentParser()
parser.add_argument('contigs', type=argparse.FileType('r'), help="Input FASTA file with assembled contigs.")
parser.add_argument('contigs', type=argparse.FileType('r'), help="Input CSV file with assembled contigs.")
parser.add_argument('stitched_contigs', type=argparse.FileType('w'),
help="Output CSV file with stitched contigs.")
parser.add_argument('--plot', type=argparse.FileType('w'),
Expand All @@ -612,7 +670,8 @@ def main(args):
verbosity_group.add_argument('--no-verbose', action='store_true', help='Normal output verbosity.', default=True)
verbosity_group.add_argument('--debug', action='store_true', help='Maximum output verbosity.')
verbosity_group.add_argument('--quiet', action='store_true', help='Minimize output verbosity.')
args = parser.parse_args(args)

args = parser.parse_args(argv)

if args.quiet:
logger.setLevel(logging.ERROR)
Expand All @@ -624,13 +683,9 @@ def main(args):
logger.setLevel(logging.WARN)

logging.basicConfig(level=logger.level)
with StitcherContext.fresh():
plot_path = args.plot.name if args.plot is not None else None
write_contig_refs(args.contigs.name, None, args.stitched_contigs, stitcher_plot_path=plot_path)
args.contigs.close()
args.stitched_contigs.close()
if args.plot is not None:
args.plot.close()

plot_path = args.plot.name if args.plot is not None else None
run(args.contigs, args.stitched_contigs, plot_path)


if __name__ == '__main__':
Expand Down
Loading

0 comments on commit 2a022da

Please sign in to comment.