From 45b9f212bedf90f9418a4059474cc13445e37947 Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 28 Jul 2021 11:28:49 +0100 Subject: [PATCH] Truncate genotype calls that exceed max alt alleles --- sgkit/io/vcf/vcf_reader.py | 14 +++++++++++--- sgkit/tests/io/vcf/test_vcf_reader.py | 8 +++++++- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/sgkit/io/vcf/vcf_reader.py b/sgkit/io/vcf/vcf_reader.py index c4582a128..e5368357a 100644 --- a/sgkit/io/vcf/vcf_reader.py +++ b/sgkit/io/vcf/vcf_reader.py @@ -152,7 +152,7 @@ def for_field( ) -> "VcfFieldHandler": if field == "FORMAT/GT": return GenotypeFieldHandler( - vcf, chunk_length, ploidy, mixed_ploidy, truncate_calls + vcf, chunk_length, ploidy, mixed_ploidy, truncate_calls, max_alt_alleles ) category = field.split("/")[0] vcf_field_defs = _get_vcf_field_defs(vcf, category) @@ -277,6 +277,7 @@ def __init__( ploidy: int, mixed_ploidy: bool, truncate_calls: bool, + max_alt_alleles: int, ) -> None: n_sample = len(vcf.samples) self.call_genotype = np.empty((chunk_length, n_sample, ploidy), dtype="i1") @@ -284,6 +285,7 @@ def __init__( self.ploidy = ploidy self.mixed_ploidy = mixed_ploidy self.truncate_calls = truncate_calls + self.max_alt_alleles = max_alt_alleles def add_variant(self, i: int, variant: Any) -> None: fill = -2 if self.mixed_ploidy else -1 @@ -296,6 +298,10 @@ def add_variant(self, i: int, variant: Any) -> None: self.call_genotype[i, ..., 0:n] = gt[..., 0:n] self.call_genotype[i, ..., n:] = fill self.call_genotype_phased[i] = gt[..., -1] + + # set any calls that exceed maximum number of alt alleles as missing + self.call_genotype[i][self.call_genotype[i] > self.max_alt_alleles] = -1 + else: self.call_genotype[i] = fill self.call_genotype_phased[i] = 0 @@ -583,7 +589,8 @@ def vcf_to_zarrs( max_alt_alleles The (maximum) number of alternate alleles in the VCF file. Any records with more than this number of alternate alleles will have the extra alleles dropped (the `variant_allele` - variable will be truncated). Call genotype fields will however be unaffected. + variable will be truncated). Any call genotype fields with the extra alleles will + be changed to the missing-allele sentinel value of -1. fields Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes. Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``. @@ -739,7 +746,8 @@ def vcf_to_zarr( max_alt_alleles The (maximum) number of alternate alleles in the VCF file. Any records with more than this number of alternate alleles will have the extra alleles dropped (the `variant_allele` - variable will be truncated). Call genotype fields will however be unaffected. + variable will be truncated). Any call genotype fields with the extra alleles will + be changed to the missing-allele sentinel value of -1. fields Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes. Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``. diff --git a/sgkit/tests/io/vcf/test_vcf_reader.py b/sgkit/tests/io/vcf/test_vcf_reader.py index 974212a82..637925cb7 100644 --- a/sgkit/tests/io/vcf/test_vcf_reader.py +++ b/sgkit/tests/io/vcf/test_vcf_reader.py @@ -101,7 +101,10 @@ def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path): output = tmp_path.joinpath("vcf.zarr").as_posix() with pytest.warns(MaxAltAllelesExceededWarning): - vcf_to_zarr(path, output, chunk_length=5, chunk_width=2, max_alt_alleles=1) + max_alt_alleles = 1 + vcf_to_zarr( + path, output, chunk_length=5, chunk_width=2, max_alt_alleles=max_alt_alleles + ) ds = xr.open_zarr(output) # extra alt alleles are dropped @@ -120,6 +123,9 @@ def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path): ], ) + # genotype calls are truncated + assert np.all(ds["call_genotype"].values <= max_alt_alleles) + # the maximum number of alt alleles actually seen is stored as an attribute assert ds.attrs["max_alt_alleles_seen"] == 3