Skip to content

Commit

Permalink
Add min to FrequencyBin (#529)
Browse files Browse the repository at this point in the history
  • Loading branch information
dvadym authored Oct 17, 2024
1 parent 3a7a0ff commit f654204
Show file tree
Hide file tree
Showing 8 changed files with 387 additions and 213 deletions.
1 change: 1 addition & 0 deletions analysis/tests/parameter_tuning_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def _frequency_bin(max_value: float = 0.0,
lower: float = 0.0) -> histograms.FrequencyBin:
return histograms.FrequencyBin(max=max_value,
lower=lower,
min=None,
upper=None,
count=None,
sum=None)
Expand Down
3 changes: 2 additions & 1 deletion pipeline_dp/dataset_histograms/computing_histograms.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ def _map_to_frequency_bin(value: int,
upper=bin_upper,
count=frequency,
sum=frequency * value,
max=value)
max=value,
min=value)

col = backend.map_tuple(col, _map_to_frequency_bin, "To FrequencyBin")
# (lower_bin_value, hist.FrequencyBin)
Expand Down
7 changes: 5 additions & 2 deletions pipeline_dp/dataset_histograms/histograms.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,18 @@ class FrequencyBin:
count: int
sum: Union[int, float]
max: Union[int, float]
min: Union[int, float]

def __add__(self, other: 'FrequencyBin') -> 'FrequencyBin':
self._check_same_bin(other)
return FrequencyBin(self.lower, self.upper, self.count + other.count,
self.sum + other.sum, max(self.max, other.max))
self.sum + other.sum, max(self.max, other.max),
min(self.min, other.min))

def __eq__(self, other):
    """Return True when both bins carry the same lower bound and statistics.

    The comparison covers `lower`, `count`, `sum`, `max` and `min`; `upper`
    is not part of the comparison.

    Args:
        other: the object to compare with; it must expose the same
          attributes as this bin.

    Returns:
        True if all compared attributes are equal, False otherwise.
    """
    # `min` has to be checked against `other.min`: comparing `self.min` with
    # itself is always true and would hide bins that differ only in `min`.
    return (self.lower == other.lower and self.count == other.count and
            self.sum == other.sum and self.max == other.max and
            self.min == other.min)

def _check_same_bin(self, other: 'FrequencyBin'):
assert self.lower == other.lower
Expand Down
3 changes: 2 additions & 1 deletion pipeline_dp/dataset_histograms/sum_histogram_computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,8 @@ def _map_to_frequency_bin(
upper=bin_upper,
count=1,
sum=value,
max=value)
max=value,
min=value)
return (index, bin_lower), bucket

col = backend.map_with_side_inputs(col, _map_to_frequency_bin,
Expand Down
243 changes: 163 additions & 80 deletions tests/dataset_histograms/computing_histograms_test.py

Large diffs are not rendered by default.

102 changes: 87 additions & 15 deletions tests/dataset_histograms/histograms_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

def frequency_bin(lower: Union[int, float],
upper: Union[int, float]) -> FrequencyBin:
return FrequencyBin(lower, upper, count=0, sum=0, max=0)
return FrequencyBin(lower, upper, count=0, sum=0, min=lower, max=lower)


class HistogramTest(parameterized.TestCase):
Expand All @@ -35,19 +35,55 @@ class HistogramTest(parameterized.TestCase):
upper=1010,
count=10,
sum=10100,
min=1000,
max=1009),
],
q=[0.05, 0.1, 0.5, 0.8, 0.9],
expected_quantiles=[1000, 1000, 1000, 1000, 1000]),
dict(testcase_name='6 bins histogram',
bins=[
hist.FrequencyBin(lower=1, upper=2, count=2, sum=2, max=1),
hist.FrequencyBin(lower=2, upper=3, count=1, sum=2, max=2),
hist.FrequencyBin(lower=3, upper=4, count=1, sum=3, max=3),
hist.FrequencyBin(lower=4, upper=5, count=2, sum=8, max=4),
hist.FrequencyBin(lower=5, upper=6, count=2, sum=10, max=5),
hist.FrequencyBin(lower=6, upper=7, count=1, sum=6, max=6),
hist.FrequencyBin(lower=10, upper=12, count=1, sum=11, max=11)
hist.FrequencyBin(lower=1,
upper=2,
count=2,
sum=2,
min=1,
max=1),
hist.FrequencyBin(lower=2,
upper=3,
count=1,
sum=2,
min=2,
max=2),
hist.FrequencyBin(lower=3,
upper=4,
count=1,
sum=3,
min=3,
max=3),
hist.FrequencyBin(lower=4,
upper=5,
count=2,
sum=8,
min=4,
max=4),
hist.FrequencyBin(lower=5,
upper=6,
count=2,
sum=10,
min=5,
max=5),
hist.FrequencyBin(lower=6,
upper=7,
count=1,
sum=6,
min=6,
max=6),
hist.FrequencyBin(lower=10,
upper=12,
count=1,
sum=11,
min=10,
max=11)
],
q=[0.001, 0.05, 0.1, 0.5, 0.8, 0.9],
expected_quantiles=[1, 1, 1, 4, 6, 10]))
Expand All @@ -64,18 +100,54 @@ def test_quantile_contributions(self, bins, q, expected_quantiles):
upper=1021,
count=10,
sum=10100,
min=1,
max=1020),
],
expected_ratios=[(0, 1), (1000, 100 / 10100), (1020, 0.0)]),
dict(testcase_name='7 bins histogram',
bins=[
hist.FrequencyBin(lower=1, upper=2, count=8, sum=8, max=1),
hist.FrequencyBin(lower=2, upper=3, count=2, sum=4, max=2),
hist.FrequencyBin(lower=3, upper=4, count=1, sum=3, max=3),
hist.FrequencyBin(lower=4, upper=5, count=2, sum=8, max=4),
hist.FrequencyBin(lower=5, upper=6, count=2, sum=10, max=5),
hist.FrequencyBin(lower=6, upper=7, count=1, sum=6, max=6),
hist.FrequencyBin(lower=11, upper=12, count=1, sum=11, max=11),
hist.FrequencyBin(lower=1,
upper=2,
count=8,
sum=8,
min=1,
max=1),
hist.FrequencyBin(lower=2,
upper=3,
count=2,
sum=4,
min=2,
max=2),
hist.FrequencyBin(lower=3,
upper=4,
count=1,
sum=3,
min=3,
max=3),
hist.FrequencyBin(lower=4,
upper=5,
count=2,
sum=8,
min=4,
max=4),
hist.FrequencyBin(lower=5,
upper=6,
count=2,
sum=10,
min=5,
max=5),
hist.FrequencyBin(lower=6,
upper=7,
count=1,
sum=6,
min=6,
max=6),
hist.FrequencyBin(lower=11,
upper=12,
count=1,
sum=11,
min=11,
max=11),
],
expected_ratios=[(0, 1), (1, 0.66), (2, 0.48), (3, 0.34),
(4, 0.22), (5, 0.14), (6, 0.1), (11, 0.0)]))
Expand Down
Loading

0 comments on commit f654204

Please sign in to comment.