-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* variant combinator * variant combinator vcf * bug fix upper case * sample from vcf * format fields bug fix * sort_intervals * update on testcase Co-authored-by: M. Hasan Celik <[email protected]>
- Loading branch information
1 parent
8893483
commit e234baf
Showing
8 changed files
with
263 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
from typing import Iterable | ||
from itertools import product | ||
from kipoiseq import Interval, Variant | ||
from kipoiseq.utils import alphabets | ||
from kipoiseq.extractors import FastaStringExtractor | ||
from kipoiseq.extractors.vcf_matching import pyranges_to_intervals | ||
|
||
|
||
class VariantCombinator:
    """Enumerate candidate variants over genomic intervals.

    For every position of an interval the combinator can yield single
    nucleotide variants, insertions and/or deletions, using the reference
    sequence read from a fasta file. Intervals are either passed explicitly
    to the ``combination_variants*`` methods or read from a bed file when
    the object is iterated.
    """

    def __init__(self, fasta_file: str, bed_file: str = None,
                 variant_type='snv', alphabet='DNA'):
        """
        fasta_file: path of the fasta file holding the reference sequence
        bed_file: path of the bed file with the regions to enumerate;
          only needed for iteration and `to_vcf`
        variant_type: which variants iteration yields; one of
          'all', 'snv', 'in', 'del', 'indel'
        alphabet: key into `kipoiseq.utils.alphabets` selecting the letters
          used to build alternative alleles
        """
        # 'indel' is accepted here because `combination_variants` handles it.
        if variant_type not in {'all', 'snv', 'in', 'del', 'indel'}:
            raise ValueError("variant_type should be one of "
                             "{'all', 'snv', 'in', 'del', 'indel'}")

        self.bed_file = bed_file
        # force_upper=True: presumably upper-cases the extracted sequence so
        # reference letters compare against the alphabet - TODO confirm.
        self.fasta = FastaStringExtractor(fasta_file, force_upper=True)
        self.variant_type = variant_type
        self.alphabet = alphabets[alphabet]

    def combination_variants_snv(self, interval: Interval) -> Iterable[Variant]:
        """Returns all the possible single nucleotide variants in the region.

        interval: interval of variants
        """
        seq = self.fasta.extract(interval)
        # interval coordinates are 0-based, variant positions are 1-based
        positions = range(interval.start + 1, interval.end + 1)
        for pos, ref in zip(positions, seq):
            for alt in self.alphabet:
                if ref != alt:
                    yield Variant(interval.chrom, pos, ref, alt)

    def combination_variants_insertion(self, interval, length=2) -> Iterable[Variant]:
        """Returns all the possible insertion variants in the region.

        For each position, every string over the alphabet with 2 to `length`
        letters is used as the alternative allele.

        interval: interval of variants
        length: insertions up to length

        Raises ValueError if `length` is smaller than 2.
        """
        if length < 2:
            raise ValueError('length argument should be larger than 1')

        seq = self.fasta.extract(interval)
        # interval coordinates are 0-based, variant positions are 1-based
        positions = range(interval.start + 1, interval.end + 1)
        for pos, ref in zip(positions, seq):
            for alt_length in range(2, length + 1):
                for alt in product(self.alphabet, repeat=alt_length):
                    yield Variant(interval.chrom, pos, ref, ''.join(alt))

    def combination_variants_deletion(self, interval, length=1) -> Iterable[Variant]:
        """Returns all the possible deletion variants in the region.

        For each position, the next 1 to `length` reference letters are
        deleted (empty alternative allele). Deletions that would run past
        the end of the extracted sequence are skipped, so `length` may
        exceed the interval width.

        interval: interval of variants
        length: deletions up to length

        Raises ValueError if `length` is smaller than 1.
        """
        if length < 1:
            raise ValueError('length argument should be larger than 0')

        seq = self.fasta.extract(interval)
        # interval coordinates are 0-based, variant positions are 1-based
        positions = range(interval.start + 1, interval.end + 1)
        for i, pos in enumerate(positions):
            # clip so that the deleted span never exceeds the sequence end
            longest = min(length, len(seq) - i)
            for j in range(1, longest + 1):
                yield Variant(interval.chrom, pos, seq[i:i + j], '')

    def combination_variants(self, interval, variant_type='snv',
                             in_length=2, del_length=2) -> Iterable[Variant]:
        """Returns the variants of the requested type(s) in the region.

        Variants are yielded in the order snv, insertions, deletions.
        An unrecognized `variant_type` yields nothing.

        interval: interval of variants
        variant_type: 'snv', 'in', 'del', 'indel' (insertions and
          deletions) or 'all'
        in_length: insertions up to length
        del_length: deletions up to length
        """
        if variant_type in {'snv', 'all'}:
            yield from self.combination_variants_snv(interval)
        if variant_type in {'indel', 'in', 'all'}:
            yield from self.combination_variants_insertion(
                interval, length=in_length)
        if variant_type in {'indel', 'del', 'all'}:
            yield from self.combination_variants_deletion(
                interval, length=del_length)

    def __iter__(self) -> Iterable[Variant]:
        """Yields variants of `self.variant_type` for the bed file regions.

        The bed regions are merged (ignoring strand) and sorted first.
        Insertion and deletion lengths are the defaults of
        `combination_variants`.

        Raises ValueError if no bed file was given.
        """
        if self.bed_file is None:
            raise ValueError('bed_file is required to iterate over variants')

        import pyranges as pr

        gr = pr.read_bed(self.bed_file)
        gr = gr.merge(strand=False).sort()

        for interval in pyranges_to_intervals(gr):
            yield from self.combination_variants(interval, self.variant_type)

    def to_vcf(self, path):
        """Writes the variants obtained by iterating this object as a vcf.

        path: path of the output vcf file
        """
        from cyvcf2 import Writer

        # Column names are joined with explicit tabs: vcf is tab-delimited.
        columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT',
                   'QUAL', 'FILTER', 'INFO']
        header = '##fileformat=VCFv4.2\n#' + '\t'.join(columns) + '\n'
        writer = Writer.from_string(path, header)

        # NOTE(review): deletions carry an empty ALT, which does not look
        # like a valid vcf field - confirm before writing 'del'/'all'.
        try:
            for v in self:
                variant = writer.variant_from_string('\t'.join([
                    v.chrom, str(v.pos), '.', v.ref, v.alt, '.', '.', '.'
                ]))
                writer.write_record(variant)
        finally:
            # close even on failure so written records are flushed to disk
            writer.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import pytest | ||
from conftest import example_intervals_bed, sample_5kb_fasta_file | ||
import pyranges as pr | ||
from kipoiseq import Interval | ||
from kipoiseq.extractors import VariantCombinator, MultiSampleVCF | ||
|
||
|
||
@pytest.fixture
def variant_combinator():
    """VariantCombinator over the sample fasta and the example bed file."""
    combinator = VariantCombinator(
        fasta_file=sample_5kb_fasta_file,
        bed_file=example_intervals_bed,
    )
    return combinator
|
||
|
||
def test_VariantCombinator_combination_variants(variant_combinator):
    """Check snv, insertion, deletion and combined variant enumeration."""
    # snv: 10 positions with 3 alternative letters each
    interval = Interval('chr1', 20, 30)
    variants = list(variant_combinator.combination_variants(interval, 'snv'))
    assert len(variants) == 30

    interval = Interval('chr1', 20, 22)
    variants = list(variant_combinator.combination_variants(interval, 'snv'))
    assert variants[0].chrom == 'chr1'
    assert variants[0].ref == 'A'
    assert variants[0].alt == 'C'
    assert variants[1].alt == 'G'
    assert variants[2].alt == 'T'

    assert variants[3].ref == 'C'
    assert variants[3].alt == 'A'
    assert variants[4].alt == 'G'
    assert variants[5].alt == 'T'

    # insertions: 2 positions with 4 ** 2 two-letter alternatives each
    interval = Interval('chr1', 20, 22)
    variants = list(variant_combinator.combination_variants(interval, 'in'))
    # this comparison used to be a bare expression and was never checked
    assert len(variants) == 32
    assert variants[0].ref == 'A'
    assert variants[0].alt == 'AA'
    assert variants[15].alt == 'TT'

    assert variants[16].ref == 'C'
    assert variants[16].alt == 'AA'
    assert variants[31].alt == 'TT'

    # deletions: 'A' and 'AC' at the first position, only 'C' at the second
    interval = Interval('chr1', 20, 22)
    variants = list(variant_combinator.combination_variants(
        interval, 'del', del_length=2))
    assert len(variants) == 3
    assert variants[0].ref == 'A'
    assert variants[0].alt == ''
    assert variants[1].ref == 'AC'
    assert variants[1].alt == ''
    assert variants[2].ref == 'C'
    assert variants[2].alt == ''

    # all: snv + insertions + deletions of the same interval
    variants = list(variant_combinator.combination_variants(
        interval, 'all', in_length=2, del_length=2))
    assert len(variants) == 6 + 32 + 3
|
||
|
||
def test_VariantCombinator_iter(variant_combinator):
    """Iteration yields 3 unique variants per base of the merged bed."""
    merged = pr.read_bed(example_intervals_bed).merge(strand=False).df
    widths = merged['End'] - merged['Start']
    expected_count = widths.sum() * 3

    observed = list(variant_combinator)

    assert len(observed) == expected_count
    # no variant may be produced twice
    assert len(observed) == len(set(observed))
|
||
|
||
def test_VariantCombinator_to_vcf(tmpdir, variant_combinator):
    """The written vcf holds 3 records per base of the merged bed."""
    vcf_path = str(tmpdir / 'output.vcf')
    variant_combinator.to_vcf(vcf_path)

    records = list(MultiSampleVCF(vcf_path))

    merged = pr.read_bed(example_intervals_bed).merge(strand=False).df
    widths = merged['End'] - merged['Start']
    expected_count = widths.sum() * 3

    assert len(records) == expected_count
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters