From f6542045fec11fcc08a83a5cf2b807f2e4c9c288 Mon Sep 17 00:00:00 2001 From: Vadym Doroshenko <53558779+dvadym@users.noreply.github.com> Date: Thu, 17 Oct 2024 11:44:03 +0200 Subject: [PATCH] Add min to FrequencyBin (#529) --- analysis/tests/parameter_tuning_test.py | 1 + .../computing_histograms.py | 3 +- pipeline_dp/dataset_histograms/histograms.py | 7 +- .../sum_histogram_computation.py | 3 +- .../computing_histograms_test.py | 243 ++++++++++++------ tests/dataset_histograms/histograms_test.py | 102 ++++++-- .../sum_histogram_computation_test.py | 166 ++++++++---- tests/private_contribution_bounds_test.py | 75 +----- 8 files changed, 387 insertions(+), 213 deletions(-) diff --git a/analysis/tests/parameter_tuning_test.py b/analysis/tests/parameter_tuning_test.py index d63fc2f1..e0855f63 100644 --- a/analysis/tests/parameter_tuning_test.py +++ b/analysis/tests/parameter_tuning_test.py @@ -56,6 +56,7 @@ def _frequency_bin(max_value: float = 0.0, lower: float = 0.0) -> histograms.FrequencyBin: return histograms.FrequencyBin(max=max_value, lower=lower, + min=None, upper=None, count=None, sum=None) diff --git a/pipeline_dp/dataset_histograms/computing_histograms.py b/pipeline_dp/dataset_histograms/computing_histograms.py index 5b26e056..c751a7aa 100644 --- a/pipeline_dp/dataset_histograms/computing_histograms.py +++ b/pipeline_dp/dataset_histograms/computing_histograms.py @@ -112,7 +112,8 @@ def _map_to_frequency_bin(value: int, upper=bin_upper, count=frequency, sum=frequency * value, - max=value) + max=value, + min=value) col = backend.map_tuple(col, _map_to_frequency_bin, "To FrequencyBin") # (lower_bin_value, hist.FrequencyBin) diff --git a/pipeline_dp/dataset_histograms/histograms.py b/pipeline_dp/dataset_histograms/histograms.py index 8b897a7f..3f8455b6 100644 --- a/pipeline_dp/dataset_histograms/histograms.py +++ b/pipeline_dp/dataset_histograms/histograms.py @@ -42,15 +42,18 @@ class FrequencyBin: count: int sum: Union[int, float] max: Union[int, float] + min: 
Union[int, float] def __add__(self, other: 'FrequencyBin') -> 'FrequencyBin': self._check_same_bin(other) return FrequencyBin(self.lower, self.upper, self.count + other.count, - self.sum + other.sum, max(self.max, other.max)) + self.sum + other.sum, max(self.max, other.max), + min(self.min, other.min)) def __eq__(self, other): return (self.lower == other.lower and self.count == other.count and - self.sum == other.sum and self.max == other.max) + self.sum == other.sum and self.max == other.max and + self.min == other.min) def _check_same_bin(self, other: 'FrequencyBin'): assert self.lower == other.lower diff --git a/pipeline_dp/dataset_histograms/sum_histogram_computation.py b/pipeline_dp/dataset_histograms/sum_histogram_computation.py index baa43136..997f02df 100644 --- a/pipeline_dp/dataset_histograms/sum_histogram_computation.py +++ b/pipeline_dp/dataset_histograms/sum_histogram_computation.py @@ -122,7 +122,8 @@ def _map_to_frequency_bin( upper=bin_upper, count=1, sum=value, - max=value) + max=value, + min=value) return (index, bin_lower), bucket col = backend.map_with_side_inputs(col, _map_to_frequency_bin, diff --git a/tests/dataset_histograms/computing_histograms_test.py b/tests/dataset_histograms/computing_histograms_test.py index 0a6b28c7..05f46f39 100644 --- a/tests/dataset_histograms/computing_histograms_test.py +++ b/tests/dataset_histograms/computing_histograms_test.py @@ -41,24 +41,51 @@ def test_to_bin_lower_upper_logarithmic(self): dict(testcase_name='small_histogram', input=[3, 3, 1, 1, 2, 10], expected=[ - hist.FrequencyBin(lower=1, upper=2, count=2, sum=2, max=1), - hist.FrequencyBin(lower=2, upper=3, count=1, sum=2, max=2), - hist.FrequencyBin(lower=3, upper=4, count=2, sum=6, max=3), - hist.FrequencyBin(lower=10, upper=11, count=1, sum=10, max=10) + hist.FrequencyBin(lower=1, + upper=2, + count=2, + sum=2, + min=1, + max=1), + hist.FrequencyBin(lower=2, + upper=3, + count=1, + sum=2, + min=1, + max=2), + hist.FrequencyBin(lower=3, + upper=4, + 
count=2, + sum=6, + min=1, + max=3), + hist.FrequencyBin(lower=10, + upper=11, + count=1, + sum=10, + min=1, + max=10) ]), dict(testcase_name='histogram_with_bins_wider_1', input=[1005, 3, 12345, 12346], expected=[ - hist.FrequencyBin(lower=3, upper=4, count=1, sum=3, max=3), + hist.FrequencyBin(lower=3, + upper=4, + count=1, + sum=3, + min=1, + max=3), hist.FrequencyBin(lower=1000, upper=1010, count=1, sum=1005, + min=1, max=1005), hist.FrequencyBin(lower=12300, upper=12400, count=2, sum=24691, + min=1, max=12346) ]), ) @@ -101,29 +128,36 @@ def test_list_to_contribution_histograms(self): testcase_name='small_histogram', input=[(1, 1), (1, 2), (2, 1)], # (privacy_id, partition) expected=[ - hist.FrequencyBin(lower=1, upper=2, count=1, sum=1, max=1), - hist.FrequencyBin(lower=2, upper=3, count=1, sum=2, max=2) + hist.FrequencyBin( + lower=1, upper=2, count=1, sum=1, min=1, max=1), + hist.FrequencyBin( + lower=2, upper=3, count=1, sum=2, min=1, max=2) ]), dict( testcase_name='Each privacy id, 1 contribution', input=[(i, i) for i in range(100)], # (privacy_id, partition) expected=[ hist.FrequencyBin( - lower=1, upper=2, count=100, sum=100, max=1), + lower=1, upper=2, count=100, sum=100, min=1, max=1), ]), dict( testcase_name='1 privacy id many contributions to 1 partition', input=[(0, 0)], # (privacy_id, partition) expected=[ - hist.FrequencyBin(lower=1, upper=2, count=1, sum=1, max=1), + hist.FrequencyBin( + lower=1, upper=2, count=1, sum=1, min=1, max=1), ]), dict( testcase_name= '1 privacy id many contributions to many partition', input=[(0, i) for i in range(1234)], # (privacy_id, partition) expected=[ - hist.FrequencyBin( - lower=1230, upper=1240, count=1, sum=1234, max=1234), + hist.FrequencyBin(lower=1230, + upper=1240, + count=1, + sum=1234, + min=1, + max=1234), ]), dict( testcase_name='2 privacy ids, same partitions contributed', @@ -131,7 +165,7 @@ def test_list_to_contribution_histograms(self): [(1, i) for i in range(10, 25)], # (privacy_id, partition) 
expected=[ hist.FrequencyBin( - lower=15, upper=16, count=2, sum=30, max=15), + lower=15, upper=16, count=2, sum=30, min=1, max=15), ]), ), pre_aggregated=(False, True)) @@ -160,22 +194,24 @@ def test_compute_l0_contributions_histogram(self, testcase_name, input, testcase_name='small_histogram', input=[(1, 1), (1, 2), (2, 1)], # (privacy_id, partition) expected=[ - hist.FrequencyBin(lower=1, upper=2, count=1, sum=1, max=1), - hist.FrequencyBin(lower=2, upper=3, count=1, sum=2, max=2) + hist.FrequencyBin( + lower=1, upper=2, count=1, sum=1, min=1, max=1), + hist.FrequencyBin( + lower=2, upper=3, count=1, sum=2, min=1, max=2) ]), dict( testcase_name='Each privacy id, 1 contribution', input=[(i, i) for i in range(100)], # (privacy_id, partition) expected=[ hist.FrequencyBin( - lower=1, upper=2, count=100, sum=100, max=1), + lower=1, upper=2, count=100, sum=100, min=1, max=1), ]), dict( testcase_name='1 privacy id many contributions to 1 partition', input=[(0, 0)] * 100, # (privacy_id, partition) expected=[ hist.FrequencyBin( - lower=100, upper=101, count=1, sum=100, max=100), + lower=100, upper=101, count=1, sum=100, min=1, max=100), ]), dict( testcase_name= @@ -183,8 +219,12 @@ def test_compute_l0_contributions_histogram(self, testcase_name, input, input=[(0, i // 2) for i in range(1235) ], # (privacy_id, partition) expected=[ - hist.FrequencyBin( - lower=1230, upper=1240, count=1, sum=1235, max=1235), + hist.FrequencyBin(lower=1230, + upper=1240, + count=1, + sum=1235, + min=1, + max=1235), ]), dict( testcase_name='2 privacy ids, same partitions contributed', @@ -192,7 +232,7 @@ def test_compute_l0_contributions_histogram(self, testcase_name, input, [(1, i) for i in range(10, 25)], # (privacy_id, partition) expected=[ hist.FrequencyBin( - lower=15, upper=16, count=2, sum=30, max=15), + lower=15, upper=16, count=2, sum=30, min=1, max=15), ]), dict( testcase_name='3 privacy ids', @@ -201,9 +241,9 @@ def test_compute_l0_contributions_histogram(self, testcase_name, 
input, [(2, i) for i in range(11)], # (privacy_id, partition) expected=[ hist.FrequencyBin( - lower=11, upper=12, count=1, sum=11, max=11), + lower=11, upper=12, count=1, sum=11, min=1, max=11), hist.FrequencyBin( - lower=15, upper=16, count=2, sum=30, max=15), + lower=15, upper=16, count=2, sum=30, min=1, max=15), ]), ), pre_aggregated=(False, True)) @@ -233,22 +273,24 @@ def test_compute_l1_contributions_histogram(self, testcase_name, input, input=[(1, 1), (1, 2), (2, 1), (1, 1)], # (privacy_id, partition) expected=[ - hist.FrequencyBin(lower=1, upper=2, count=2, sum=2, max=1), - hist.FrequencyBin(lower=2, upper=3, count=1, sum=2, max=2) + hist.FrequencyBin( + lower=1, upper=2, count=2, sum=2, min=1, max=1), + hist.FrequencyBin( + lower=2, upper=3, count=1, sum=2, min=1, max=2) ]), dict( testcase_name='Each privacy id, 1 contribution', input=[(i, i) for i in range(100)], # (privacy_id, partition) expected=[ hist.FrequencyBin( - lower=1, upper=2, count=100, sum=100, max=1), + lower=1, upper=2, count=100, sum=100, min=1, max=1), ]), dict( testcase_name='1 privacy id many contributions to 1 partition', input=[(0, 0)] * 100, # (privacy_id, partition) expected=[ hist.FrequencyBin( - lower=100, upper=101, count=1, sum=100, max=100), + lower=100, upper=101, count=1, sum=100, min=1, max=100), ]), dict( testcase_name= @@ -256,24 +298,27 @@ def test_compute_l1_contributions_histogram(self, testcase_name, input, input=[(0, i) for i in range(1234)], # (privacy_id, partition) expected=[ hist.FrequencyBin( - lower=1, upper=2, count=1234, sum=1234, max=1), + lower=1, upper=2, count=1234, sum=1234, min=1, max=1), ]), dict( testcase_name='2 privacy ids, same partitions contributed', input=[(0, i) for i in range(15)] + [(1, i) for i in range(10, 25)], # (privacy_id, partition) expected=[ - hist.FrequencyBin(lower=1, upper=2, count=30, sum=30, - max=1), + hist.FrequencyBin( + lower=1, upper=2, count=30, sum=30, min=1, max=1), ]), dict( testcase_name='2 privacy ids', input=[(0, 0), 
(0, 0), (0, 1), (1, 0), (1, 0), (1, 0), (1, 2)], # (privacy_id, partition) expected=[ - hist.FrequencyBin(lower=1, upper=2, count=2, sum=2, max=1), - hist.FrequencyBin(lower=2, upper=3, count=1, sum=2, max=2), - hist.FrequencyBin(lower=3, upper=4, count=1, sum=3, max=3), + hist.FrequencyBin( + lower=1, upper=2, count=2, sum=2, min=1, max=1), + hist.FrequencyBin( + lower=2, upper=3, count=1, sum=2, min=1, max=2), + hist.FrequencyBin( + lower=3, upper=4, count=1, sum=3, min=1, max=3), ]), ), pre_aggregated=(False, True)) @@ -305,22 +350,24 @@ def test_compute_linf_contributions_histogram(self, testcase_name, input, input=[(1, 1), (1, 2), (2, 1), (1, 1)], # (privacy_id, partition) expected=[ - hist.FrequencyBin(lower=1, upper=2, count=1, sum=1, max=1), - hist.FrequencyBin(lower=3, upper=3, count=1, sum=3, max=3) + hist.FrequencyBin( + lower=1, upper=2, count=1, sum=1, min=1, max=1), + hist.FrequencyBin( + lower=3, upper=3, count=1, sum=3, min=1, max=3) ]), dict( testcase_name='Each privacy id, 1 contribution', input=[(i, i) for i in range(100)], # (privacy_id, partition) expected=[ hist.FrequencyBin( - lower=1, upper=2, count=100, sum=100, max=1), + lower=1, upper=2, count=100, sum=100, min=1, max=1), ]), dict( testcase_name='1 privacy id many contributions to 1 partition', input=[(0, 0)] * 100, # (privacy_id, partition) expected=[ hist.FrequencyBin( - lower=100, upper=101, count=1, sum=100, max=100), + lower=100, upper=101, count=1, sum=100, min=1, max=100), ]), dict( testcase_name= @@ -328,16 +375,17 @@ def test_compute_linf_contributions_histogram(self, testcase_name, input, input=[(0, i) for i in range(1234)], # (privacy_id, partition) expected=[ hist.FrequencyBin( - lower=1, upper=2, count=1234, sum=1234, max=1), + lower=1, upper=2, count=1234, sum=1234, min=1, max=1), ]), dict( testcase_name='2 privacy ids, same partitions contributed', input=[(0, i) for i in range(15)] + [(1, i) for i in range(10, 25)], # (privacy_id, partition) expected=[ - 
hist.FrequencyBin(lower=1, upper=2, count=20, sum=20, - max=1), - hist.FrequencyBin(lower=2, upper=3, count=5, sum=10, max=2), + hist.FrequencyBin( + lower=1, upper=2, count=20, sum=20, min=1, max=1), + hist.FrequencyBin( + lower=2, upper=3, count=5, sum=10, min=1, max=2), ]), ), pre_aggregated=(False, True)) @@ -366,21 +414,24 @@ def test_compute_partitions_count_histogram(self, testcase_name, input, testcase_name='small_histogram', input=[(1, 1), (1, 2), (2, 1)], # (privacy_id, partition) expected=[ - hist.FrequencyBin(lower=1, upper=2, count=1, sum=1, max=1), - hist.FrequencyBin(lower=2, upper=3, count=1, sum=2, max=2) + hist.FrequencyBin( + lower=1, upper=2, count=1, sum=1, min=1, max=1), + hist.FrequencyBin( + lower=2, upper=3, count=1, sum=2, min=1, max=2) ]), dict( testcase_name='Each privacy id, 1 contribution', input=[(i, i) for i in range(100)], # (privacy_id, partition) expected=[ hist.FrequencyBin( - lower=1, upper=2, count=100, sum=100, max=1), + lower=1, upper=2, count=100, sum=100, min=1, max=1), ]), dict( testcase_name='1 privacy id many contributions to 1 partition', input=[(0, 0)], # (privacy_id, partition) expected=[ - hist.FrequencyBin(lower=1, upper=2, count=1, sum=1, max=1), + hist.FrequencyBin( + lower=1, upper=2, count=1, sum=1, min=1, max=1), ]), dict( testcase_name= @@ -388,16 +439,17 @@ def test_compute_partitions_count_histogram(self, testcase_name, input, input=[(0, i) for i in range(1234)], # (privacy_id, partition) expected=[ hist.FrequencyBin( - lower=1, upper=2, count=1234, sum=1234, max=1), + lower=1, upper=2, count=1234, sum=1234, min=1, max=1), ]), dict( testcase_name='2 privacy ids, same partitions contributed', input=[(0, i) for i in range(15)] + [(1, i) for i in range(10, 25)], # (privacy_id, partition) expected=[ - hist.FrequencyBin(lower=1, upper=2, count=20, sum=20, - max=1), - hist.FrequencyBin(lower=2, upper=3, count=5, sum=10, max=2), + hist.FrequencyBin( + lower=1, upper=2, count=20, sum=20, min=1, max=1), + 
hist.FrequencyBin( + lower=2, upper=3, count=5, sum=10, min=1, max=2), ]), ), pre_aggregated=(False, True)) @@ -435,23 +487,38 @@ def test_compute_partitions_privacy_id_count_histogram( input=lambda: [(1, 1, 0.5), (1, 2, 1.5), (2, 1, -2.5), (1, 1, 0.5)], # (privacy_id, partition, value) expected_cross_partition=lambda: [ - hist.FrequencyBin(lower=1, upper=2, count=1, sum=1, max=1), - hist.FrequencyBin(lower=2, upper=3, count=1, sum=2, max=2) + hist.FrequencyBin( + lower=1, upper=2, count=1, sum=1, min=1, max=1), + hist.FrequencyBin( + lower=2, upper=3, count=1, sum=2, min=1, max=2) ], expected_per_partition=lambda: [ - hist.FrequencyBin(lower=1, upper=2, count=2, sum=2, max=1), - hist.FrequencyBin(lower=2, upper=3, count=1, sum=2, max=2) + hist.FrequencyBin( + lower=1, upper=2, count=2, sum=2, min=1, max=1), + hist.FrequencyBin( + lower=2, upper=3, count=1, sum=2, min=1, max=2) ], expected_sum_per_partition=lambda: [ # see for explanation why these values # test_compute_linf_sum_contributions_histogram. 
- hist.FrequencyBin( - lower=-2.5, upper=-2.5004, count=1, sum=-2.5, max=-2.5), - hist.FrequencyBin( - lower=1.0, upper=-1.0004, count=1, sum=1.0, max=1.0), - hist.FrequencyBin( - lower=1.4996, - upper=1.5, count=1, sum=1.5, max=1.5), + hist.FrequencyBin(lower=-2.5, + upper=-2.5004, + count=1, + sum=-2.5, + min=1, + max=-2.5), + hist.FrequencyBin(lower=1.0, + upper=-1.0004, + count=1, + sum=1.0, + min=1, + max=1.0), + hist.FrequencyBin(lower=1.4996, + upper=1.5, + count=1, + sum=1.5, + min=1, + max=1.5), ]), dict( testcase_name='Each privacy id, 1 contribution', @@ -459,15 +526,15 @@ def test_compute_partitions_privacy_id_count_histogram( ], # (privacy_id, partition, value) expected_cross_partition=lambda: [ hist.FrequencyBin( - lower=1, upper=2, count=100, sum=100, max=1), + lower=1, upper=2, count=100, sum=100, min=1, max=1), ], expected_per_partition=lambda: [ hist.FrequencyBin( - lower=1, upper=2, count=100, sum=100, max=1), + lower=1, upper=2, count=100, sum=100, min=1, max=1), ], expected_sum_per_partition=lambda: [ hist.FrequencyBin( - lower=1, upper=1, count=100, sum=100, max=1), + lower=1, upper=1, count=100, sum=100, min=1, max=1), ], ), dict( @@ -475,16 +542,20 @@ def test_compute_partitions_privacy_id_count_histogram( input=lambda: [(0, 0, 1.0)] * 100, # (privacy_id, partition, value) expected_cross_partition=lambda: [ - hist.FrequencyBin(lower=1, upper=2, count=1, sum=1, max=1), + hist.FrequencyBin( + lower=1, upper=2, count=1, sum=1, min=1, max=1), ], expected_per_partition=lambda: [ hist.FrequencyBin( - lower=100, upper=101, count=1, sum=100, max=100), + lower=100, upper=101, count=1, sum=100, min=1, max=100), ], expected_sum_per_partition=lambda: [ - hist.FrequencyBin( - lower=100.0, upper=100.0, count=1, sum=100.0, max=100.0 - ), + hist.FrequencyBin(lower=100.0, + upper=100.0, + count=1, + sum=100.0, + min=1, + max=100.0), ], ), dict( @@ -493,16 +564,24 @@ def test_compute_partitions_privacy_id_count_histogram( input=lambda: [(0, i, 1.0) for i in 
range(1234) ], # (privacy_id, partition, value) expected_cross_partition=lambda: [ - hist.FrequencyBin( - lower=1230, upper=1240, count=1, sum=1234, max=1234), + hist.FrequencyBin(lower=1230, + upper=1240, + count=1, + sum=1234, + min=1, + max=1234), ], expected_per_partition=lambda: [ hist.FrequencyBin( - lower=1, upper=2, count=1234, sum=1234, max=1), + lower=1, upper=2, count=1234, sum=1234, min=1, max=1), ], expected_sum_per_partition=lambda: [ - hist.FrequencyBin( - lower=1.0, upper=1.0, count=1234, sum=1234.0, max=1), + hist.FrequencyBin(lower=1.0, + upper=1.0, + count=1234, + sum=1234.0, + min=1, + max=1), ], ), dict( @@ -512,15 +591,15 @@ def test_compute_partitions_privacy_id_count_histogram( ], # (privacy_id, partition, value) expected_cross_partition=lambda: [ hist.FrequencyBin( - lower=15, upper=16, count=2, sum=30, max=15), + lower=15, upper=16, count=2, sum=30, min=1, max=15), ], expected_per_partition=lambda: [ - hist.FrequencyBin(lower=1, upper=2, count=30, sum=30, max=1 - ), + hist.FrequencyBin( + lower=1, upper=2, count=30, sum=30, min=1, max=1), ], expected_sum_per_partition=lambda: [ hist.FrequencyBin( - lower=1.0, upper=1.0, count=30, sum=30, max=1), + lower=1.0, upper=1.0, count=30, sum=30, min=1, max=1), ], ), dict( @@ -529,16 +608,20 @@ def test_compute_partitions_privacy_id_count_histogram( (1, 0, 0.0), (1, 0, 1.3), (1, 0, 0.7), (1, 2, 2.0)], # (privacy_id, partition, value) expected_cross_partition=lambda: [ - hist.FrequencyBin(lower=2, upper=3, count=2, sum=4, max=2), + hist.FrequencyBin( + lower=2, upper=3, count=2, sum=4, min=1, max=2), ], expected_per_partition=lambda: [ - hist.FrequencyBin(lower=1, upper=2, count=2, sum=2, max=1), - hist.FrequencyBin(lower=2, upper=3, count=1, sum=2, max=2), - hist.FrequencyBin(lower=3, upper=4, count=1, sum=3, max=3), + hist.FrequencyBin( + lower=1, upper=2, count=2, sum=2, min=1, max=1), + hist.FrequencyBin( + lower=2, upper=3, count=1, sum=2, min=1, max=2), + hist.FrequencyBin( + lower=3, upper=4, 
count=1, sum=3, min=1, max=3), ], expected_sum_per_partition=lambda: [ hist.FrequencyBin( - lower=2.0, upper=2.0, count=4, sum=8, max=2), + lower=2.0, upper=2.0, count=4, sum=8, min=1, max=2), ], )), pre_aggregated=(False, True)) diff --git a/tests/dataset_histograms/histograms_test.py b/tests/dataset_histograms/histograms_test.py index d0e2f257..959d1678 100644 --- a/tests/dataset_histograms/histograms_test.py +++ b/tests/dataset_histograms/histograms_test.py @@ -23,7 +23,7 @@ def frequency_bin(lower: Union[int, float], upper: Union[int, float]) -> FrequencyBin: - return FrequencyBin(lower, upper, count=0, sum=0, max=0) + return FrequencyBin(lower, upper, count=0, sum=0, min=lower, max=lower) class HistogramTest(parameterized.TestCase): @@ -35,19 +35,55 @@ class HistogramTest(parameterized.TestCase): upper=1010, count=10, sum=10100, + min=1000, max=1009), ], q=[0.05, 0.1, 0.5, 0.8, 0.9], expected_quantiles=[1000, 1000, 1000, 1000, 1000]), dict(testcase_name='6 bins histogram', bins=[ - hist.FrequencyBin(lower=1, upper=2, count=2, sum=2, max=1), - hist.FrequencyBin(lower=2, upper=3, count=1, sum=2, max=2), - hist.FrequencyBin(lower=3, upper=4, count=1, sum=3, max=3), - hist.FrequencyBin(lower=4, upper=5, count=2, sum=8, max=4), - hist.FrequencyBin(lower=5, upper=6, count=2, sum=10, max=5), - hist.FrequencyBin(lower=6, upper=7, count=1, sum=6, max=6), - hist.FrequencyBin(lower=10, upper=12, count=1, sum=11, max=11) + hist.FrequencyBin(lower=1, + upper=2, + count=2, + sum=2, + min=1, + max=1), + hist.FrequencyBin(lower=2, + upper=3, + count=1, + sum=2, + min=2, + max=2), + hist.FrequencyBin(lower=3, + upper=4, + count=1, + sum=3, + min=3, + max=3), + hist.FrequencyBin(lower=4, + upper=5, + count=2, + sum=8, + min=4, + max=4), + hist.FrequencyBin(lower=5, + upper=6, + count=2, + sum=10, + min=5, + max=5), + hist.FrequencyBin(lower=6, + upper=7, + count=1, + sum=6, + min=6, + max=6), + hist.FrequencyBin(lower=10, + upper=12, + count=1, + sum=11, + min=10, + max=11) ], 
q=[0.001, 0.05, 0.1, 0.5, 0.8, 0.9], expected_quantiles=[1, 1, 1, 4, 6, 10])) @@ -64,18 +100,54 @@ def test_quantile_contributions(self, bins, q, expected_quantiles): upper=1021, count=10, sum=10100, + min=1, max=1020), ], expected_ratios=[(0, 1), (1000, 100 / 10100), (1020, 0.0)]), dict(testcase_name='7 bins histogram', bins=[ - hist.FrequencyBin(lower=1, upper=2, count=8, sum=8, max=1), - hist.FrequencyBin(lower=2, upper=3, count=2, sum=4, max=2), - hist.FrequencyBin(lower=3, upper=4, count=1, sum=3, max=3), - hist.FrequencyBin(lower=4, upper=5, count=2, sum=8, max=4), - hist.FrequencyBin(lower=5, upper=6, count=2, sum=10, max=5), - hist.FrequencyBin(lower=6, upper=7, count=1, sum=6, max=6), - hist.FrequencyBin(lower=11, upper=12, count=1, sum=11, max=11), + hist.FrequencyBin(lower=1, + upper=2, + count=8, + sum=8, + min=1, + max=1), + hist.FrequencyBin(lower=2, + upper=3, + count=2, + sum=4, + min=2, + max=2), + hist.FrequencyBin(lower=3, + upper=4, + count=1, + sum=3, + min=3, + max=3), + hist.FrequencyBin(lower=4, + upper=5, + count=2, + sum=8, + min=4, + max=4), + hist.FrequencyBin(lower=5, + upper=6, + count=2, + sum=10, + min=5, + max=5), + hist.FrequencyBin(lower=6, + upper=7, + count=1, + sum=6, + min=6, + max=6), + hist.FrequencyBin(lower=11, + upper=12, + count=1, + sum=11, + min=11, + max=11), ], expected_ratios=[(0, 1), (1, 0.66), (2, 0.48), (3, 0.34), (4, 0.22), (5, 0.14), (6, 0.1), (11, 0.0)])) diff --git a/tests/dataset_histograms/sum_histogram_computation_test.py b/tests/dataset_histograms/sum_histogram_computation_test.py index 8a013c8b..43b4ce2a 100644 --- a/tests/dataset_histograms/sum_histogram_computation_test.py +++ b/tests/dataset_histograms/sum_histogram_computation_test.py @@ -48,17 +48,28 @@ class SumHistogramComputationTest(parameterized.TestCase): expected=lambda: [ # step is (1.5 - (-2.5)) / 1e4 = 0.0004, # ((2, 1), -2.5) - hist.FrequencyBin( - lower=-2.5, upper=-2.5004, count=1, sum=-2.5, max=-2.5), + hist.FrequencyBin(lower=-2.5, + 
upper=-2.5004, + count=1, + sum=-2.5, + min=-2.5, + max=-2.5), # 2 times ((1, 1), 0.5), they are summed up and put into a # bin as one. - hist.FrequencyBin( - lower=1.0, upper=-1.0004, count=1, sum=1.0, max=1.0), + hist.FrequencyBin(lower=1.0, + upper=-1.0004, + count=1, + sum=1.0, + min=1.0, + max=1.0), # ((1, 1, 1.5), 1.5 is max and not included, # therefore 1.5 - 0.0004 = 1.4996. - hist.FrequencyBin( - lower=1.4996, - upper=1.5, count=1, sum=1.5, max=1.5), + hist.FrequencyBin(lower=1.4996, + upper=1.5, + count=1, + sum=1.5, + min=1.5, + max=1.5), ]), dict( testcase_name='Different privacy ids, 1 equal contribution', @@ -66,7 +77,7 @@ class SumHistogramComputationTest(parameterized.TestCase): # ((privacy_id, partition), value) expected=lambda: [ hist.FrequencyBin( - lower=1, upper=1, count=100, sum=100, max=1), + lower=1, upper=1, count=100, sum=100, min=1, max=1), ]), dict( testcase_name='Different privacy ids, 1 different contribution', @@ -78,10 +89,15 @@ class SumHistogramComputationTest(parameterized.TestCase): upper=float(i + 1), count=1, sum=i, + min=i, max=i) for i in range(9999) ] + [ - hist.FrequencyBin( - lower=9999, upper=1000, count=2, sum=19999, max=10000) + hist.FrequencyBin(lower=9999, + upper=1000, + count=2, + sum=19999, + min=9999, + max=10000) ]), dict( testcase_name='1 privacy id many contributions to 1 ' @@ -89,9 +105,12 @@ class SumHistogramComputationTest(parameterized.TestCase): input=lambda: [( (0, 0), 1.0)] * 100, # ((privacy_id, partition), value) expected=lambda: [ - hist.FrequencyBin( - lower=100.0, upper=100.0, count=1, sum=100.0, max=100.0 - ), + hist.FrequencyBin(lower=100.0, + upper=100.0, + count=1, + sum=100.0, + min=100.0, + max=100.0), ]), dict( testcase_name= @@ -99,8 +118,12 @@ class SumHistogramComputationTest(parameterized.TestCase): input=lambda: [((0, i), 1.0) for i in range(1234)], # ((privacy_id, partition), value) expected=lambda: [ - hist.FrequencyBin( - lower=1.0, upper=1.0, count=1234, sum=1234.0, max=1), + 
hist.FrequencyBin(lower=1.0, + upper=1.0, + count=1234, + sum=1234.0, + min=1.0, + max=1), ]), dict( testcase_name= @@ -113,10 +136,15 @@ class SumHistogramComputationTest(parameterized.TestCase): upper=float(i + 1), count=1, sum=i, + min=i, max=i) for i in range(9999) ] + [ - hist.FrequencyBin( - lower=9999, upper=1000, count=2, sum=19999, max=10000) + hist.FrequencyBin(lower=9999, + upper=1000, + count=2, + sum=19999, + min=9999, + max=10000) ]), dict( testcase_name= @@ -126,7 +154,7 @@ class SumHistogramComputationTest(parameterized.TestCase): # ((privacy_id, partition), value) expected=lambda: [ hist.FrequencyBin( - lower=1.0, upper=1.0, count=30, sum=30, max=1), + lower=1.0, upper=1.0, count=30, sum=30, min=1, max=1), ]), dict( testcase_name='2 privacy ids, same partitions differently ' @@ -137,11 +165,15 @@ class SumHistogramComputationTest(parameterized.TestCase): # step = (1 - (-1)) / 1e4 = 0.0002, # therefore last lower is 1 - 0.0002 = 0.9998. expected=lambda: [ + hist.FrequencyBin(lower=-1.0, + upper=-1.0002, + count=15, + sum=-15, + min=-1, + max=-1), hist.FrequencyBin( - lower=-1.0, upper=-1.0002, count=15, sum=-15, max=-1), - hist. 
- FrequencyBin(lower=0.9998, upper=1, count=15, sum=15, max=1 - ), + lower=0.9998, + upper=1, count=15, sum=15, min=1, max=1), ]), ), pre_aggregated=(False, True)) @@ -186,15 +218,15 @@ def test_compute_linf_sum_contributions_histogram_2_columns( expected = [ hist.Histogram(hist.HistogramType.LINF_SUM_CONTRIBUTIONS, [ hist.FrequencyBin( - lower=1.0, upper=1.0004, count=1, sum=1, max=1), + lower=1.0, upper=1.0004, count=1, sum=1, min=1, max=1), hist.FrequencyBin( - lower=4.9996, upper=5.0, count=2, sum=10, max=5) + lower=4.9996, upper=5.0, count=2, sum=10, min=5, max=5) ]), hist.Histogram(hist.HistogramType.LINF_SUM_CONTRIBUTIONS, [ hist.FrequencyBin( - lower=10.0, upper=10.004, count=1, sum=10, max=10), + lower=10.0, upper=10.004, count=1, sum=10, min=10, max=10), hist.FrequencyBin( - lower=49.996, upper=50.0, count=2, sum=100, max=50) + lower=49.996, upper=50.0, count=2, sum=100, min=50, max=50) ]) ] if pre_aggregated: @@ -227,12 +259,17 @@ def test_compute_linf_sum_contributions_histogram_2_columns( (1, 1), 0.5)], # ((privacy_id, partition), value) expected=lambda: [ # Bucket step = 3/10**4 = 0.0003 - hist.FrequencyBin( - lower=-1.5, upper=-1.4997, count=1, sum=-1.5, max=-1.5), + hist.FrequencyBin(lower=-1.5, + upper=-1.4997, + count=1, + sum=-1.5, + min=-1.5, + max=-1.5), hist.FrequencyBin(lower=1.4996999999999998, upper=1.5, count=1, sum=1.5, + min=1.5, max=1.5) ]), dict( @@ -242,7 +279,7 @@ def test_compute_linf_sum_contributions_histogram_2_columns( # ((privacy_id, partition), value) expected=lambda: [ hist.FrequencyBin( - lower=1, upper=1, count=100, sum=100, max=1), + lower=1, upper=1, count=100, sum=100, min=1, max=1), ]), dict( testcase_name='Different privacy ids, 1 different contribution', @@ -254,10 +291,15 @@ def test_compute_linf_sum_contributions_histogram_2_columns( upper=float(i + 1), count=1, sum=i, + min=i, max=i) for i in range(9999) ] + [ - hist.FrequencyBin( - lower=9999, upper=1000, count=2, sum=19999, max=10000) + 
hist.FrequencyBin(lower=9999, + upper=1000, + count=2, + sum=19999, + min=9999, + max=10000) ]), dict( testcase_name='1 privacy id many contributions to 1 ' @@ -265,9 +307,12 @@ def test_compute_linf_sum_contributions_histogram_2_columns( input=lambda: [( (0, 0), 1.0)] * 100, # ((privacy_id, partition), value) expected=lambda: [ - hist.FrequencyBin( - lower=100.0, upper=100.0, count=1, sum=100.0, max=100.0 - ), + hist.FrequencyBin(lower=100.0, + upper=100.0, + count=1, + sum=100.0, + min=100.0, + max=100.0), ]), dict( testcase_name= @@ -275,8 +320,12 @@ def test_compute_linf_sum_contributions_histogram_2_columns( input=lambda: [((0, i), 1.0) for i in range(1234)], # ((privacy_id, partition), value) expected=lambda: [ - hist.FrequencyBin( - lower=1.0, upper=1.0, count=1234, sum=1234.0, max=1), + hist.FrequencyBin(lower=1.0, + upper=1.0, + count=1234, + sum=1234.0, + min=1, + max=1), ]), dict( testcase_name= @@ -289,10 +338,15 @@ def test_compute_linf_sum_contributions_histogram_2_columns( upper=float(i + 1), count=1, sum=i, + min=i, max=i) for i in range(9999) ] + [ - hist.FrequencyBin( - lower=9999, upper=1000, count=2, sum=19999, max=10000) + hist.FrequencyBin(lower=9999, + upper=1000, + count=2, + sum=19999, + min=9999, + max=10000) ]), dict( testcase_name= @@ -301,11 +355,18 @@ def test_compute_linf_sum_contributions_histogram_2_columns( (1, i), 1.0) for i in range(10, 25)], # ((privacy_id, partition), value) expected=lambda: [ - hist.FrequencyBin( - lower=1.0, upper=1.0001, count=20, sum=20.0, max=1.0), - hist.FrequencyBin( - lower=1.9999, - upper=2.0, count=5, sum=10.0, max=2.0) + hist.FrequencyBin(lower=1.0, + upper=1.0001, + count=20, + sum=20.0, + min=1.0, + max=1.0), + hist.FrequencyBin(lower=1.9999, + upper=2.0, + count=5, + sum=10.0, + min=2.0, + max=2.0) ]), dict( testcase_name='2 privacy ids, same partitions differently ' @@ -320,15 +381,20 @@ def test_compute_linf_sum_contributions_histogram_2_columns( upper=-0.9998, count=10, sum=-10.0, + min=-1.0, 
max=-1.0), hist.FrequencyBin(lower=0.0, upper=0.00019999999999997797, count=5, sum=0.0, + min=0, max=0.0), - hist.FrequencyBin( - lower=0.9998, - upper=1.0, count=10, sum=10.0, max=1.0) + hist.FrequencyBin(lower=0.9998, + upper=1.0, + count=10, + sum=10.0, + min=1.0, + max=1.0) ]), ), pre_aggregated=(False, True)) @@ -369,15 +435,15 @@ def test_compute_partition_sum_histogram_2_columns(self, expected = [ hist.Histogram(hist.HistogramType.SUM_PER_PARTITION, [ hist.FrequencyBin( - lower=5.0, upper=5.0001, count=1, sum=5, max=5), + lower=5.0, upper=5.0001, count=1, sum=5, min=5, max=5), hist.FrequencyBin( - lower=5.9999, upper=6.0, count=1, sum=6, max=6) + lower=5.9999, upper=6.0, count=1, sum=6, min=6, max=6) ]), hist.Histogram(hist.HistogramType.SUM_PER_PARTITION, [ hist.FrequencyBin( - lower=50.0, upper=50.001, count=1, sum=50, max=50), + lower=50.0, upper=50.001, count=1, sum=50, min=50, max=50), hist.FrequencyBin( - lower=59.999, upper=60.0, count=1, sum=60, max=60) + lower=59.999, upper=60.0, count=1, sum=60, min=60, max=60) ]) ] if pre_aggregated: diff --git a/tests/private_contribution_bounds_test.py b/tests/private_contribution_bounds_test.py index 768c1f85..6aeab1ac 100644 --- a/tests/private_contribution_bounds_test.py +++ b/tests/private_contribution_bounds_test.py @@ -54,16 +54,19 @@ def test_score_laplace_noise_valid_values_calculates_score_correctly(self): upper=2, count=100, sum=100, + min=1, max=1), hist.FrequencyBin(lower=2, upper=6, count=10, sum=10, + min=2, max=5), hist.FrequencyBin(lower=6, upper=100, count=20, sum=20, + min=6, max=60) ]) l0_scoring_function = private_contribution_bounds.L0ScoringFunction( @@ -95,39 +98,6 @@ def test_score_laplace_noise_valid_values_calculates_score_correctly(self): # -0.5 * 200 * 200 / 0.9 * sqrt(2) - 0.5 * 0 self.assertAlmostEqual(-31427.0, score_200, places=1) - def test_score_laplace_noise_invalid_values_throws_exception(self): - params = construct_params( - 
aggregation_noise_kind=pipeline_dp.NoiseKind.LAPLACE, - aggregation_eps=0.9, - max_partitions_contributed_upper_bound=100) - number_of_partitions = 200 - l0_histogram = hist.Histogram(name=hist.HistogramType.L0_CONTRIBUTIONS, - bins=[ - hist.FrequencyBin(lower=1, - upper=2, - count=100, - sum=100, - max=1), - hist.FrequencyBin(lower=2, - upper=6, - count=10, - sum=10, - max=5), - hist.FrequencyBin(lower=6, - upper=100, - count=20, - sum=20, - max=60) - ]) - l0_scoring_function = private_contribution_bounds.L0ScoringFunction( - params, number_of_partitions, l0_histogram) - - with self.assertRaises(RuntimeError): - l0_scoring_function.score(-1) - - with self.assertRaises(RuntimeError): - l0_scoring_function.score(0) - def test_score_gaussian_noise_valid_values_calculates_score_correctly(self): params = construct_params( aggregation_noise_kind=pipeline_dp.NoiseKind.GAUSSIAN, @@ -141,16 +111,19 @@ def test_score_gaussian_noise_valid_values_calculates_score_correctly(self): upper=2, count=100, sum=100, + min=1, max=1), hist.FrequencyBin(lower=2, upper=6, count=10, sum=10, + min=2, max=5), hist.FrequencyBin(lower=6, upper=100, count=20, sum=20, + min=6, max=60) ]) l0_scoring_function = private_contribution_bounds.L0ScoringFunction( @@ -183,37 +156,6 @@ def test_score_gaussian_noise_valid_values_calculates_score_correctly(self): # -0.5 * 200 * sqrt(200) * sigma - 0.5 * 0 self.assertAlmostEqual(-3977, score_200, delta=10) - def test_score_gaussian_noise_invalid_values_throws_exception(self): - params = construct_params( - aggregation_noise_kind=pipeline_dp.NoiseKind.GAUSSIAN, - aggregation_eps=0.9, - aggregation_delta=0.001, - max_partitions_contributed_upper_bound=100) - number_of_partitions = 200 - l0_histogram = hist.Histogram(name=hist.HistogramType.L0_CONTRIBUTIONS, - bins=[ - hist.FrequencyBin(lower=1, - upper=2, - count=100, - sum=100, - max=1), - hist.FrequencyBin(lower=2, - upper=6, - count=10, - sum=10, - max=5), - hist.FrequencyBin(lower=6, - upper=100, - 
count=20, - sum=20, - max=60) - ]) - l0_scoring_function = private_contribution_bounds.L0ScoringFunction( - params, number_of_partitions, l0_histogram) - - with self.assertRaises(RuntimeError): - l0_scoring_function.score(0) - class PrivateL0CalculatorTest(unittest.TestCase): @@ -232,16 +174,19 @@ def test_calculate_returns_one_of_the_lower_bounds(self): upper=2, count=100, sum=100, + min=1, max=1), hist.FrequencyBin(lower=2, upper=6, count=10, sum=10, + min=2, max=5), hist.FrequencyBin(lower=6, upper=100, count=20, sum=20, + min=6, max=60) ]) histograms = [ @@ -278,11 +223,13 @@ def test_calculate_one_bound_has_much_higher_score_returns_it(self): upper=2, count=1, sum=1, + min=1, max=1), hist.FrequencyBin(lower=2, upper=3, count=10000, sum=10000, + min=2, max=2) ]) histograms = [