Skip to content

Commit

Permalink
Truncate genotype calls that exceed max alt alleles
Browse files Browse the repository at this point in the history
  • Loading branch information
tomwhite committed Jul 28, 2021
1 parent 7c17bac commit 45b9f21
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 4 deletions.
14 changes: 11 additions & 3 deletions sgkit/io/vcf/vcf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def for_field(
) -> "VcfFieldHandler":
if field == "FORMAT/GT":
return GenotypeFieldHandler(
vcf, chunk_length, ploidy, mixed_ploidy, truncate_calls
vcf, chunk_length, ploidy, mixed_ploidy, truncate_calls, max_alt_alleles
)
category = field.split("/")[0]
vcf_field_defs = _get_vcf_field_defs(vcf, category)
Expand Down Expand Up @@ -277,13 +277,15 @@ def __init__(
ploidy: int,
mixed_ploidy: bool,
truncate_calls: bool,
max_alt_alleles: int,
) -> None:
n_sample = len(vcf.samples)
self.call_genotype = np.empty((chunk_length, n_sample, ploidy), dtype="i1")
self.call_genotype_phased = np.empty((chunk_length, n_sample), dtype=bool)
self.ploidy = ploidy
self.mixed_ploidy = mixed_ploidy
self.truncate_calls = truncate_calls
self.max_alt_alleles = max_alt_alleles

def add_variant(self, i: int, variant: Any) -> None:
fill = -2 if self.mixed_ploidy else -1
Expand All @@ -296,6 +298,10 @@ def add_variant(self, i: int, variant: Any) -> None:
self.call_genotype[i, ..., 0:n] = gt[..., 0:n]
self.call_genotype[i, ..., n:] = fill
self.call_genotype_phased[i] = gt[..., -1]

# set any calls that exceed maximum number of alt alleles as missing
self.call_genotype[i][self.call_genotype[i] > self.max_alt_alleles] = -1

else:
self.call_genotype[i] = fill
self.call_genotype_phased[i] = 0
Expand Down Expand Up @@ -583,7 +589,8 @@ def vcf_to_zarrs(
max_alt_alleles
The (maximum) number of alternate alleles in the VCF file. Any records with more than
this number of alternate alleles will have the extra alleles dropped (the `variant_allele`
variable will be truncated). Call genotype fields will however be unaffected.
variable will be truncated). Any call genotype fields with the extra alleles will
be changed to the missing-allele sentinel value of -1.
fields
Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes.
Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``.
Expand Down Expand Up @@ -739,7 +746,8 @@ def vcf_to_zarr(
max_alt_alleles
The (maximum) number of alternate alleles in the VCF file. Any records with more than
this number of alternate alleles will have the extra alleles dropped (the `variant_allele`
variable will be truncated). Call genotype fields will however be unaffected.
variable will be truncated). Any call genotype fields with the extra alleles will
be changed to the missing-allele sentinel value of -1.
fields
Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes.
Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``.
Expand Down
8 changes: 7 additions & 1 deletion sgkit/tests/io/vcf/test_vcf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,10 @@ def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path):
output = tmp_path.joinpath("vcf.zarr").as_posix()

with pytest.warns(MaxAltAllelesExceededWarning):
vcf_to_zarr(path, output, chunk_length=5, chunk_width=2, max_alt_alleles=1)
max_alt_alleles = 1
vcf_to_zarr(
path, output, chunk_length=5, chunk_width=2, max_alt_alleles=max_alt_alleles
)
ds = xr.open_zarr(output)

# extra alt alleles are dropped
Expand All @@ -120,6 +123,9 @@ def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path):
],
)

# genotype calls are truncated
assert np.all(ds["call_genotype"].values <= max_alt_alleles)

# the maximum number of alt alleles actually seen is stored as an attribute
assert ds.attrs["max_alt_alleles_seen"] == 3

Expand Down

0 comments on commit 45b9f21

Please sign in to comment.