From ec823202285e6213e02577e3a0b5fe11b683ce33 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 23 Aug 2024 09:59:19 -0700 Subject: [PATCH 1/2] Use exact fractional sequences per group The previous `_calculate_fractional_sequences_per_group()` was an approximation of this exact value. The approximation could return a fractional value above 1, which would fail the assertion in `get_probabilistic_group_sizes()`. --- augur/filter/subsample.py | 54 +------------------ ...nces-with-probabilistic-sampling-warning.t | 4 +- .../subsample-probabilistic-sampling-output.t | 2 +- .../cram/subsample-skip-ambiguous-dates.t | 2 +- 4 files changed, 5 insertions(+), 57 deletions(-) diff --git a/augur/filter/subsample.py b/augur/filter/subsample.py index 11f2eef86..277398915 100644 --- a/augur/filter/subsample.py +++ b/augur/filter/subsample.py @@ -548,10 +548,7 @@ def calculate_sequences_per_group(target_max_value, group_sizes, allow_probabili except TooManyGroupsError as error: if allow_probabilistic: print_err(f"WARNING: {error}") - sequences_per_group = _calculate_fractional_sequences_per_group( - target_max_value, - group_sizes, - ) + sequences_per_group = target_max_value / len(group_sizes) probabilistic_used = True else: raise error @@ -634,52 +631,3 @@ def _calculate_sequences_per_group( return int(hi) else: return int(lo) - - -def _calculate_fractional_sequences_per_group( - target_max_value: int, - group_sizes: Collection[int] -) -> float: - """Returns the fractional sequences per group for the given list of group - sequences such that the total doesn't exceed the requested number of - samples. - - Parameters - ---------- - target_max_value : int - the total number of sequences allowed across all groups - group_sizes : Collection[int] - the number of sequences in each group - - Returns - ------- - float - fractional maximum number of sequences allowed per group to meet the - required maximum total sequences allowed - - Examples - -------- - >>> np.around(_calculate_fractional_sequences_per_group(4, [4, 2]), 4) - 1.9375 - >>> np.around(_calculate_fractional_sequences_per_group(2, [4, 2]), 4) - 0.9688 - - Unlike the integer-based version of this function, the fractional version - can accept a maximum number of sequences that exceeds the number of groups. - In this case, the function returns a fraction that can be used downstream, - for example with Poisson sampling. - - >>> np.around(_calculate_fractional_sequences_per_group(1, [4, 2]), 4) - 0.4844 - """ - lo = 1e-5 - hi = float(target_max_value) - - while (hi / lo) > 1.1: - mid = (lo + hi) / 2 - if _calculate_total_sequences(mid, group_sizes) <= target_max_value: - lo = mid - else: - hi = mid - - return (lo + hi) / 2 diff --git a/tests/functional/filter/cram/subsample-max-sequences-with-probabilistic-sampling-warning.t b/tests/functional/filter/cram/subsample-max-sequences-with-probabilistic-sampling-warning.t index 72ed589f2..7939bbc0c 100644 --- a/tests/functional/filter/cram/subsample-max-sequences-with-probabilistic-sampling-warning.t +++ b/tests/functional/filter/cram/subsample-max-sequences-with-probabilistic-sampling-warning.t @@ -15,7 +15,7 @@ Explicitly use probabilistic subsampling to handle the case when there are more > --probabilistic-sampling \ > --output-strains filtered_strains_probabilistic.txt > /dev/null WARNING: Asked to provide at most 5 sequences, but there are 8 groups. - Sampling probabilistically at 0.6055 sequences per group, meaning it is possible to have more than the requested maximum of 5 sequences after filtering. + Sampling probabilistically at 0.6250 sequences per group, meaning it is possible to have more than the requested maximum of 5 sequences after filtering. 10 strains were dropped during filtering 1 had no metadata 1 had no sequence data @@ -36,7 +36,7 @@ Using the default probabilistic subsampling, should work the same as the previou > --subsample-seed 314159 \ > --output-strains filtered_strains_default.txt > /dev/null WARNING: Asked to provide at most 5 sequences, but there are 8 groups. - Sampling probabilistically at 0.6055 sequences per group, meaning it is possible to have more than the requested maximum of 5 sequences after filtering. + Sampling probabilistically at 0.6250 sequences per group, meaning it is possible to have more than the requested maximum of 5 sequences after filtering. 10 strains were dropped during filtering 1 had no metadata 1 had no sequence data diff --git a/tests/functional/filter/cram/subsample-probabilistic-sampling-output.t b/tests/functional/filter/cram/subsample-probabilistic-sampling-output.t index c908988fa..865204107 100644 --- a/tests/functional/filter/cram/subsample-probabilistic-sampling-output.t +++ b/tests/functional/filter/cram/subsample-probabilistic-sampling-output.t @@ -12,7 +12,7 @@ Check output of probabilistic sampling. > --subsample-seed 314159 \ > --output-metadata filtered_metadata.tsv WARNING: Asked to provide at most 3 sequences, but there are 8 groups. - Sampling probabilistically at 0.3633 sequences per group, meaning it is possible to have more than the requested maximum of 3 sequences after filtering. + Sampling probabilistically at 0.3750 sequences per group, meaning it is possible to have more than the requested maximum of 3 sequences after filtering. 10 strains were dropped during filtering 1 was dropped during grouping due to ambiguous year information 1 was dropped during grouping due to ambiguous month information diff --git a/tests/functional/filter/cram/subsample-skip-ambiguous-dates.t b/tests/functional/filter/cram/subsample-skip-ambiguous-dates.t index b638bdd59..dec6bedad 100644 --- a/tests/functional/filter/cram/subsample-skip-ambiguous-dates.t +++ b/tests/functional/filter/cram/subsample-skip-ambiguous-dates.t @@ -13,7 +13,7 @@ Strains with ambiguous years or months should be dropped and logged. > --output-strains filtered_strains.txt \ > --output-log filtered_log.tsv > /dev/null WARNING: Asked to provide at most 5 sequences, but there are 6 groups. - Sampling probabilistically at 0.8203 sequences per group, meaning it is possible to have more than the requested maximum of 5 sequences after filtering. + Sampling probabilistically at 0.8333 sequences per group, meaning it is possible to have more than the requested maximum of 5 sequences after filtering. 8 strains were dropped during filtering 1 was dropped during grouping due to ambiguous year information 1 was dropped during grouping due to ambiguous month information From f1e09e19ff8d430a8aced8f22530b753c8cea973 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 23 Aug 2024 11:12:48 -0700 Subject: [PATCH 2/2] Update changelog --- CHANGES.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 7b26e0f21..7218ea169 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -8,13 +8,16 @@ ### Bug Fixes +* filter: Previously, when `--subsample-max-sequences` was slightly lower than the number of groups, it was possible to fail with an uncaught `AssertionError`. Internal calculations have been adjusted to prevent this from happening. [#1588][] [#1598][] (@victorlin) * filter: Improved warning and error messages in the case of missing columns. [#1604] (@victorlin) * merge: Any user-customized `~/.sqliterc` file is now ignored so it doesn't break `augur merge`'s internal use of SQLite. [#1608][] (@tsibley) * merge: Non-id columns in metadata inputs that would conflict with the output id column are now forbidden and will cause an error if present. Previously they would overwrite values in the output id column, causing incorrect output. [#1593][] (@tsibley) * import: Spaces in BEAST MCC tree annotations (for example, from a discrete state reconstruction) no longer break `augur import beast`'s parsing. [#1610][] (@watronfire) +[#1588]: https://github.com/nextstrain/augur/issues/1588 [#1593]: https://github.com/nextstrain/augur/pull/1593 [#1594]: https://github.com/nextstrain/augur/pull/1594 +[#1598]: https://github.com/nextstrain/augur/issues/1598 [#1604]: https://github.com/nextstrain/augur/pull/1604 [#1608]: https://github.com/nextstrain/augur/pull/1608 [#1610]: https://github.com/nextstrain/augur/pull/1610