OpenMined · dvadym · Aug 25, 2023 · Aug 24, 2023
diff --git a/analysis/parameter_tuning.py b/analysis/parameter_tuning.py
@@ -35,18 +35,6 @@ class MinimizingFunction(Enum):
     RELATIVE_ERROR = 'relative_error'
 
 
-class ParametersSearchStrategy(Enum):
-    """Strategy types for selecting candidate parameters."""
-
-    # Picks up candidates that correspond tp a predefined list of quantiles.
-    QUANTILES = 1
-    # Candidates are a sequence starting from 1 where relative difference
-    # between two neighbouring elements is the same. Mathematically it means
-    # that candidates are a sequence a_i, where
-    # a_i = max_value^(i / (max_candidates - 1)), i in [0..(max_candidates - 1)]
-    CONSTANT_RELATIVE_STEP = 2
-
-
 @dataclass
 class ParametersToTune:
     """Contains parameters to tune."""
@@ -82,8 +70,6 @@ class TuneOptions:
         pre_aggregated_data: when True the input data is already pre-aggregated,
           otherwise the input data are raw. Preaggregated data also can be
           sampled.
-        parameters_search_strategy: specifies how to select candidates for
-          parameters.
         number_of_parameter_candidates: how many candidates to generate for
           parameter tuning. This is an upper bound, there can be fewer
           candidates generated.
@@ -96,7 +82,6 @@ class TuneOptions:
     parameters_to_tune: ParametersToTune
     partitions_sampling_prob: float = 1
     pre_aggregated_data: bool = False
-    parameters_search_strategy: ParametersSearchStrategy = ParametersSearchStrategy.CONSTANT_RELATIVE_STEP
     number_of_parameter_candidates: int = 100
 
     def __post_init__(self):
@@ -131,7 +116,6 @@ def _find_candidate_parameters(
         hist: histograms.DatasetHistograms,
         parameters_to_tune: ParametersToTune,
         metric: Optional[pipeline_dp.Metric],
-        strategy: ParametersSearchStrategy,
         max_candidates: int) -> analysis.MultiParameterConfiguration:
     """Finds candidates for l0 and/or l_inf parameters.
 
@@ -140,19 +124,10 @@ def _find_candidate_parameters(
         parameters_to_tune: which parameters to tune.
         metric: dp aggregation for which candidates are computed. If metric is
           None, it means no metrics to compute, i.e. only select partitions.
-        strategy: determines the strategy how to select candidates, see comments
-          to enum values for full description of the respective strategies.
         max_candidates: how many candidates ((l0, linf) pairs) can be in the
           output. Note that output can contain fewer candidates. 100 is default
           heuristically chosen value, better to adjust it for your use-case.
     """
-    if strategy == ParametersSearchStrategy.QUANTILES:
-        find_candidates_func = _find_candidates_quantiles
-    elif strategy == ParametersSearchStrategy.CONSTANT_RELATIVE_STEP:
-        find_candidates_func = _find_candidates_constant_relative_step
-    else:
-        raise ValueError("Unknown strategy for candidate parameters search.")
-
     calculate_l0_param = parameters_to_tune.max_partitions_contributed
     generate_linf = metric == pipeline_dp.Metrics.COUNT
     calculate_linf_param = (parameters_to_tune.max_contributions_per_partition
@@ -161,22 +136,22 @@ def _find_candidate_parameters(
 
     if calculate_l0_param and calculate_linf_param:
         max_candidates_per_parameter = int(math.sqrt(max_candidates))
-        l0_candidates = find_candidates_func(hist.l0_contributions_histogram,
-                                             max_candidates_per_parameter)
-        linf_candidates = find_candidates_func(
+        l0_candidates = _find_candidates_constant_relative_step(
+            hist.l0_contributions_histogram, max_candidates_per_parameter)
+        linf_candidates = _find_candidates_constant_relative_step(
             hist.linf_contributions_histogram, max_candidates_per_parameter)
         l0_bounds, linf_bounds = [], []
 
         # if linf or l0 has fewer candidates than requested then we can add more
         # candidates for the other parameter.
         if (len(linf_candidates) < max_candidates_per_parameter and
                 len(l0_candidates) == max_candidates_per_parameter):
-            l0_candidates = find_candidates_func(
+            l0_candidates = _find_candidates_constant_relative_step(
                 hist.l0_contributions_histogram,
                 int(max_candidates / len(linf_candidates)))
         elif (len(l0_candidates) < max_candidates_per_parameter and
               len(linf_candidates) == max_candidates_per_parameter):
-            linf_candidates = find_candidates_func(
+            linf_candidates = _find_candidates_constant_relative_step(
                 hist.linf_contributions_histogram,
                 int(max_candidates / len(l0_candidates)))
 
@@ -185,11 +160,11 @@ def _find_candidate_parameters(
                 l0_bounds.append(l0)
                 linf_bounds.append(linf)
     elif calculate_l0_param:
-        l0_bounds = find_candidates_func(hist.l0_contributions_histogram,
-                                         max_candidates)
+        l0_bounds = _find_candidates_constant_relative_step(
+            hist.l0_contributions_histogram, max_candidates)
     elif calculate_linf_param:
-        linf_bounds = find_candidates_func(hist.linf_contributions_histogram,
-                                           max_candidates)
+        linf_bounds = _find_candidates_constant_relative_step(
+            hist.linf_contributions_histogram, max_candidates)
     else:
         assert False, "Nothing to tune."
 
@@ -198,20 +173,15 @@ def _find_candidate_parameters(
         max_contributions_per_partition=linf_bounds)
 
 
-def _find_candidates_quantiles(histogram: histograms.Histogram,
-                               max_candidates: int) -> List[int]:
-    """Implementation of QUANTILES strategy."""
-    quantiles_to_use = [0.9, 0.95, 0.98, 0.99, 0.995]
-    candidates = histogram.quantiles(quantiles_to_use)
-    candidates.append(histogram.max_value())
-    candidates = list(set(candidates))  # remove duplicates
-    candidates.sort()
-    return candidates[:max_candidates]
-
-
 def _find_candidates_constant_relative_step(histogram: histograms.Histogram,
                                             max_candidates: int) -> List[int]:
-    """Implementation of CONSTANT_RELATIVE_STEP strategy."""
+    """Finds candidates with constant relative step.
+
+    Candidates are integers starting from 1 with (up to rounding) the same
+    ratio between neighbouring elements. Mathematically, the candidates are
+    a sequence a_i = ceil(max_value^(i / (max_candidates - 1))),
+    for i in [0..(max_candidates - 1)].
+    """
     max_value = histogram.max_value()
     assert max_value >= 1, "max_value has to be >= 1."
     max_candidates = min(max_candidates, max_value)
@@ -244,8 +214,7 @@ def tune(col,
     """Tunes parameters.
 
     It works in the following way:
-        1. Candidates for contribution bounding parameters chosen based on
-          options.parameters_search_strategy strategy.
+        1. Find candidates for contribution bounding parameters.
         2. Utility analysis run for those parameters.
         3. The best parameter set is chosen according to
           options.minimizing_function.
@@ -282,7 +251,6 @@ def tune(col,
 
     candidates = _find_candidate_parameters(
         contribution_histograms, options.parameters_to_tune, metric,
-        options.parameters_search_strategy,
         options.number_of_parameter_candidates)
 
     utility_analysis_options = analysis.UtilityAnalysisOptions(

diff --git a/analysis/tests/parameter_tuning_test.py b/analysis/tests/parameter_tuning_test.py
@@ -22,7 +22,6 @@
 import pipeline_dp
 from analysis import metrics
 from analysis import parameter_tuning
-from analysis.parameter_tuning import ParametersSearchStrategy
 from pipeline_dp.dataset_histograms import histograms
 from pipeline_dp.dataset_histograms import computing_histograms
 
@@ -48,55 +47,12 @@ def _get_tune_options():
 
 class ParameterTuning(parameterized.TestCase):
 
-    @parameterized.parameters(
-        (True, True, pipeline_dp.Metrics.COUNT, [1, 1, 2, 2, 6, 6
-                                                ], [3, 6, 3, 6, 3, 6]),
-        (False, True, pipeline_dp.Metrics.COUNT, None, [3, 6]),
-        (True, False, pipeline_dp.Metrics.COUNT, [1, 2, 6], None),
-        (True, True, pipeline_dp.Metrics.PRIVACY_ID_COUNT, [1, 2, 6], None),
-    )
-    def test_find_candidate_parameters_quantiles_strategy(
-        self,
-        tune_max_partitions_contributed: bool,
-        tune_max_contributions_per_partition: bool,
-        metric: pipeline_dp.Metrics,
-        expected_max_partitions_contributed: List,
-        expected_max_contributions_per_partition: List,
-    ):
-        mock_l0_histogram = histograms.Histogram(None, None)
-        mock_l0_histogram.quantiles = mock.Mock(return_value=[1, 1, 2])
-        mock_l0_histogram.max_value = mock.Mock(return_value=6)
-        mock_linf_histogram = histograms.Histogram(None, None)
-        mock_linf_histogram.quantiles = mock.Mock(return_value=[3, 6, 6])
-        mock_linf_histogram.max_value = mock.Mock(return_value=6)
-
-        mock_histograms = histograms.DatasetHistograms(mock_l0_histogram, None,
-                                                       mock_linf_histogram,
-                                                       None, None, None)
-        parameters_to_tune = parameter_tuning.ParametersToTune(
-            max_partitions_contributed=tune_max_partitions_contributed,
-            max_contributions_per_partition=tune_max_contributions_per_partition
-        )
-
-        candidates = parameter_tuning._find_candidate_parameters(
-            mock_histograms,
-            parameters_to_tune,
-            metric,
-            ParametersSearchStrategy.QUANTILES,
-            max_candidates=100)
-        self.assertEqual(expected_max_partitions_contributed,
-                         candidates.max_partitions_contributed)
-        self.assertEqual(expected_max_contributions_per_partition,
-                         candidates.max_contributions_per_partition)
-
     def test_find_candidate_parameters_maximum_number_of_candidates_is_respected_when_both_parameters_needs_to_be_tuned(
             self):
         mock_l0_histogram = histograms.Histogram(None, None)
-        mock_l0_histogram.quantiles = mock.Mock(return_value=[1, 2, 3])
         mock_l0_histogram.max_value = mock.Mock(return_value=6)
         mock_linf_histogram = histograms.Histogram(None, None)
-        mock_linf_histogram.quantiles = mock.Mock(return_value=[4, 5, 6])
-        mock_linf_histogram.max_value = mock.Mock(return_value=6)
+        mock_linf_histogram.max_value = mock.Mock(return_value=3)
 
         mock_histograms = histograms.DatasetHistograms(mock_l0_histogram, None,
                                                        mock_linf_histogram,
@@ -109,20 +65,17 @@ def test_find_candidate_parameters_maximum_number_of_candidates_is_respected_whe
             mock_histograms,
             parameters_to_tune,
             pipeline_dp.Metrics.COUNT,
-            ParametersSearchStrategy.QUANTILES,
             max_candidates=5)
-        self.assertEqual([1, 1, 2, 2], candidates.max_partitions_contributed)
-        self.assertEqual([4, 5, 4, 5],
+        self.assertEqual([1, 1, 6, 6], candidates.max_partitions_contributed)
+        self.assertEqual([1, 3, 1, 3],
                          candidates.max_contributions_per_partition)
 
     def test_find_candidate_parameters_more_candidates_for_l_0_when_not_so_many_l_inf_candidates(
             self):
         mock_l0_histogram = histograms.Histogram(None, None)
-        mock_l0_histogram.quantiles = mock.Mock(return_value=[1, 2, 3, 4, 5])
-        mock_l0_histogram.max_value = mock.Mock(return_value=6)
+        mock_l0_histogram.max_value = mock.Mock(return_value=4)
         mock_linf_histogram = histograms.Histogram(None, None)
-        mock_linf_histogram.quantiles = mock.Mock(return_value=[6, 7])
-        mock_linf_histogram.max_value = mock.Mock(return_value=6)
+        mock_linf_histogram.max_value = mock.Mock(return_value=2)
 
         mock_histograms = histograms.DatasetHistograms(mock_l0_histogram, None,
                                                        mock_linf_histogram,
@@ -135,25 +88,21 @@ def test_find_candidate_parameters_more_candidates_for_l_0_when_not_so_many_l_in
             mock_histograms,
             parameters_to_tune,
             pipeline_dp.Metrics.COUNT,
-            ParametersSearchStrategy.QUANTILES,
             max_candidates=9)
-        # sqrt(9) = 3, but l_inf has only 2 quantiles, therefore for l_0 we can
-        # take 9 / 2 = 4 quantiles, we take first 4 quantiles (1, 2, 3, 4).
-        # Addition of max_value (6) to l_inf does not change anything because
-        # l_inf set already contains 6.
+        # sqrt(9) = 3, but l_inf has only 2 possible values (1 and 2), so
+        # l_0 can take int(9 / 2) = 4 values. The l_0 max_value is 4, so all
+        # 4 possible values (1, 2, 3, 4) are taken.
         self.assertEqual([1, 1, 2, 2, 3, 3, 4, 4],
                          candidates.max_partitions_contributed)
-        self.assertEqual([6, 7, 6, 7, 6, 7, 6, 7],
+        self.assertEqual([1, 2, 1, 2, 1, 2, 1, 2],
                          candidates.max_contributions_per_partition)
 
     def test_find_candidate_parameters_more_candidates_for_l_inf_when_not_so_many_l_0_candidates(
             self):
         mock_l0_histogram = histograms.Histogram(None, None)
-        mock_l0_histogram.quantiles = mock.Mock(return_value=[1])
-        mock_l0_histogram.max_value = mock.Mock(return_value=8)
+        mock_l0_histogram.max_value = mock.Mock(return_value=2)
         mock_linf_histogram = histograms.Histogram(None, None)
-        mock_linf_histogram.quantiles = mock.Mock(return_value=[3, 4, 5, 6, 7])
-        mock_linf_histogram.max_value = mock.Mock(return_value=8)
+        mock_linf_histogram.max_value = mock.Mock(return_value=4)
 
         mock_histograms = histograms.DatasetHistograms(mock_l0_histogram, None,
                                                        mock_linf_histogram,
@@ -166,13 +115,13 @@ def test_find_candidate_parameters_more_candidates_for_l_inf_when_not_so_many_l_
             mock_histograms,
             parameters_to_tune,
             pipeline_dp.Metrics.COUNT,
-            ParametersSearchStrategy.QUANTILES,
-            max_candidates=10)
-        # sqrt(10) = 3, but l_0 has only 2 quantiles (1 and 8 -- max_value),
-        # therefore for l_inf we can take 10 / 2 = 5 quantiles.
-        self.assertEqual([1, 1, 1, 1, 1, 8, 8, 8, 8, 8],
+            max_candidates=9)
+        # sqrt(9) = 3, but l_0 has only 2 possible values (1 and 2), so
+        # l_inf can take int(9 / 2) = 4 values. The l_inf max_value is 4, so
+        # all 4 possible values (1, 2, 3, 4) are taken.
+        self.assertEqual([1, 1, 1, 1, 2, 2, 2, 2],
                          candidates.max_partitions_contributed)
-        self.assertEqual([3, 4, 5, 6, 7, 3, 4, 5, 6, 7],
+        self.assertEqual([1, 2, 3, 4, 1, 2, 3, 4],
                          candidates.max_contributions_per_partition)
 
     @parameterized.named_parameters(
@@ -207,8 +156,8 @@ def test_find_candidate_parameters_more_candidates_for_l_inf_when_not_so_many_l_
             max_candidates=5,
             # ceil(1000^(i / 4)), where i in [0, 1, 2, 3, 4]
             expected_candidates=[1, 6, 32, 178, 1000]))
-    def test_find_candidate_parameters_constant_relative_step_strategy(
-            self, max_value, max_candidates, expected_candidates):
+    def test_find_candidate_parameters(self, max_value, max_candidates,
+                                       expected_candidates):
         mock_l0_histogram = histograms.Histogram(None, None)
         mock_l0_histogram.max_value = mock.Mock(return_value=max_value)
 
@@ -222,7 +171,6 @@ def test_find_candidate_parameters_constant_relative_step_strategy(
             mock_histograms,
             parameters_to_tune,
             pipeline_dp.Metrics.COUNT,
-            ParametersSearchStrategy.CONSTANT_RELATIVE_STEP,
             max_candidates=max_candidates)
 
         self.assertEqual(expected_candidates,
@@ -263,11 +211,7 @@ def test_find_candidate_parameters_generate_linf(
             max_contributions_per_partition=True)
 
         candidates = parameter_tuning._find_candidate_parameters(
-            mock_histograms,
-            parameters_to_tune,
-            metric,
-            ParametersSearchStrategy.CONSTANT_RELATIVE_STEP,
-            max_candidates=100)
+            mock_histograms, parameters_to_tune, metric, max_candidates=100)
 
         mock_find_candidate_from_histogram.assert_any_call(
             mock_l0_histogram, mock.ANY)