Skip to content

Commit

Permalink
Merge pull request #63 from iqbal-lab-org/dev
Browse files Browse the repository at this point in the history
v0.5.0 release
  • Loading branch information
leoisl authored Jul 20, 2023
2 parents 80aca37 + 55ca0ec commit b9bab76
Show file tree
Hide file tree
Showing 70 changed files with 5,313 additions and 606 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -122,3 +122,4 @@ tests/data/prg_builder/write_prg/sample.bin
tests/data/prg_builder/write_prg/sample.prg.fa
tests/integration_tests/data/output/
tests/integration_tests/data/output_update/
debugging
16 changes: 14 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.4.0] - 2022-22-11
## [0.5.0] - 2023-07-18

### Fixed

- Properly handling Ns in the MSA, and in the denovo sequences (see [PR #60](https://github.com/iqbal-lab-org/make_prg/pull/60)
and [PR #61](https://github.com/iqbal-lab-org/make_prg/pull/61) for more details);

### Changed
- `scikit-learn`, `numpy` and `pytest` dependencies updated;
- The KMeans algorithm used is now `elkan`;

## [0.4.0] - 2022-11-22

### Added
- `make_prg update` command, that updates PRGs without requiring to rebuild MSAs and the PRG itself from scratch;
Expand Down Expand Up @@ -103,8 +114,9 @@ operations.
source project CHANGELOG.


[Unreleased]: https://github.com/iqbal-lab-org/make_prg/compare/0.4.0...HEAD
[Unreleased]: https://github.com/iqbal-lab-org/make_prg/compare/0.5.0...HEAD

[0.5.0]: https://github.com/iqbal-lab-org/make_prg/compare/0.4.0...0.5.0
[0.4.0]: https://github.com/iqbal-lab-org/make_prg/compare/0.2.0...0.4.0
[0.2.0]: https://github.com/iqbal-lab-org/make_prg/compare/0.1.1...0.2.0
[0.1.1]: https://github.com/iqbal-lab-org/make_prg/compare/0.1.0...0.1.1
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# To build: docker build . -t make_prg:0.4.0
# To build: docker build . -t make_prg:0.5.0
# Tagged as such, it can be used in scripts/build_precompiled_binary/build_precompiled_binary.sh to build the precompiled binary
FROM python:3.10-slim

Expand Down
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,13 @@ In this binary, all libraries are linked statically. Compilation is done using [

#### Download
```
wget https://github.com/iqbal-lab-org/make_prg/releases/download/0.4.0/make_prg_0.4.0
wget https://github.com/iqbal-lab-org/make_prg/releases/download/0.5.0/make_prg_0.5.0
```

#### Run
```
chmod +x make_prg_0.4.0
./make_prg_0.4.0 -h
chmod +x make_prg_0.5.0
./make_prg_0.5.0 -h
```

### pip
Expand Down Expand Up @@ -77,7 +77,7 @@ The above will use the latest version. If you want to specify a version then use
[tag][quay.io] (or commit) like so.

```sh
VERSION="0.4.0"
VERSION="0.5.0"
URI="docker://quay.io/iqballab/make_prg:${VERSION}"
```

Expand Down
4 changes: 3 additions & 1 deletion make_prg/from_msa/cluster_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,9 @@ def kmeans_cluster_seqs(
break
if num_clusters == num_sequences:
break
kmeans = KMeans(n_clusters=num_clusters, random_state=2).fit(count_matrix)
kmeans = KMeans(n_clusters=num_clusters, random_state=2, algorithm="elkan").fit(
count_matrix
)
prev_cluster_assignment = cluster_assignment
cluster_assignment = list(kmeans.predict(count_matrix))
num_fitted_clusters = len(set(cluster_assignment))
Expand Down
8 changes: 6 additions & 2 deletions make_prg/update/denovo_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ class DenovoError(Exception):
pass


class NonACGTError(Exception):
pass


class TooLongDeletion(Exception):
pass

Expand Down Expand Up @@ -82,7 +86,7 @@ def _param_checking(
def _check_sequence_is_composed_of_ACGT_only(seq: str):
sequence_is_composed_of_ACGT_only = all([base in "ACGT" for base in seq])
if not sequence_is_composed_of_ACGT_only:
raise DenovoError(f"Found a non-ACGT seq ({seq}) in a denovo variant")
raise NonACGTError(f"Found a non-ACGT seq ({seq}) in a denovo variant")

def __eq__(self, other):
if isinstance(other, self.__class__):
Expand Down Expand Up @@ -496,7 +500,7 @@ def _read_variants(
filehandler, long_deletion_threshold
)
variants.append(denovo_variant)
except TooLongDeletion as error:
except (TooLongDeletion, NonACGTError) as error:
logger.warning(f"Ignoring variant: {error}")
return variants

Expand Down
39 changes: 32 additions & 7 deletions make_prg/utils/io_utils.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,51 @@
import gzip
import os
import tempfile
from io import StringIO
from pathlib import Path
from typing import Dict, Union
from zipfile import ZipFile

from Bio import AlignIO
from Bio.Seq import Seq

from make_prg import MSA
from make_prg.subcommands.output_type import OutputType
from make_prg.utils.seq_utils import get_majority_consensus_from_MSA


def load_alignment_file(msa_file: Union[str, Path], alignment_format: str) -> MSA:
msa_file = str(msa_file)
if msa_file.endswith(".gz"):
handle = gzip.open(msa_file, "rt")
alignment = AlignIO.read(handle, alignment_format)
handle.close()
else:
def load_alignment_file(
msa_file: Union[str, Path, StringIO], alignment_format: str
) -> MSA:
if isinstance(msa_file, StringIO):
alignment = AlignIO.read(msa_file, alignment_format)
else:
msa_file = str(msa_file)
if msa_file.endswith(".gz"):
with gzip.open(msa_file, "rt") as handle:
alignment = AlignIO.read(handle, alignment_format)
else:
with open(msa_file, "r") as handle:
alignment = AlignIO.read(handle, alignment_format)

# upper case seqs
for record in alignment:
record.seq = record.seq.upper()

# Compute the consensus sequence
consensus = get_majority_consensus_from_MSA(alignment)

# Replace 'N' with the consensus sequence in each record
for record in alignment:
record.seq = Seq(
"".join(
[
consensus[i] if nucleotide == "N" else nucleotide
for i, nucleotide in enumerate(str(record.seq))
]
)
)

return alignment


Expand Down
54 changes: 54 additions & 0 deletions make_prg/utils/seq_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import copy
import hashlib
import itertools
import random
from collections import Counter
from typing import Generator, List, Tuple

import numpy as np
Expand Down Expand Up @@ -234,3 +237,54 @@ def get_consensus_from_MSA(alignment: MSA) -> str:
consensus_string_as_list.append(column.pop())
consensus_string = "".join(consensus_string_as_list)
return consensus_string


def convert_to_upper(sequences: Generator) -> Generator:
return (s.upper() for s in sequences)


def generate_random_seed(sequences: List[str]) -> bytes:
return hashlib.sha256("".join(sequences).encode()).digest()


def get_consensus_residue(
position: int, sequences: List[str], local_random: random.Random
) -> str:
# Count the residues at this position, ignoring gaps and Ns
pos_counts = Counter(
seq[position]
for seq in sequences
if seq[position] != GAP and seq[position] != "N"
)

# If there are no residues other than gaps and Ns at this position, use a random base
if len(pos_counts) == 0:
return local_random.choice("ACGT")

# Find the residue(s) with the highest count
max_count = pos_counts.most_common(1)[0][1]
max_residues = [res for res, count in pos_counts.items() if count == max_count]

# Randomly select a residue from the residues with the highest count
return local_random.choice(max_residues)


def get_majority_consensus_from_MSA(alignment: MSA) -> str:
"""
Produces a consensus string (composed only of ACGT) just based on the major base for each column.
"""
all_seqs = get_alignment_seqs(alignment)
all_seqs = list(convert_to_upper(all_seqs))
random_seed_for_this_alignment = generate_random_seed(all_seqs)
local_random = random.Random()
local_random.seed(random_seed_for_this_alignment)

# Initialize the consensus sequence as an empty string
consensus = ""

# Loop over the positions in the alignment
for i in range(alignment.get_alignment_length()):
# Add the residue to the consensus sequence
consensus += get_consensus_residue(i, all_seqs, local_random)

return consensus
Loading

0 comments on commit b9bab76

Please sign in to comment.