Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove quantiles strategy #483

Merged
merged 1 commit into from
Aug 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 17 additions & 49 deletions analysis/parameter_tuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,6 @@ class MinimizingFunction(Enum):
RELATIVE_ERROR = 'relative_error'


class ParametersSearchStrategy(Enum):
"""Strategy types for selecting candidate parameters."""

# Picks up candidates that correspond to a predefined list of quantiles.
QUANTILES = 1
# Candidates are a sequence starting from 1 where relative difference
# between two neighbouring elements is the same. Mathematically it means
# that candidates are a sequence a_i, where
# a_i = max_value^(i / (max_candidates - 1)), i in [0..(max_candidates - 1)]
CONSTANT_RELATIVE_STEP = 2


@dataclass
class ParametersToTune:
"""Contains parameters to tune."""
Expand Down Expand Up @@ -82,8 +70,6 @@ class TuneOptions:
pre_aggregated_data: when True the input data is already pre-aggregated,
otherwise the input data is raw. Pre-aggregated data can also be
sampled.
parameters_search_strategy: specifies how to select candidates for
parameters.
number_of_parameter_candidates: how many candidates to generate for
parameter tuning. This is an upper bound, there can be fewer
candidates generated.
Expand All @@ -96,7 +82,6 @@ class TuneOptions:
parameters_to_tune: ParametersToTune
partitions_sampling_prob: float = 1
pre_aggregated_data: bool = False
parameters_search_strategy: ParametersSearchStrategy = ParametersSearchStrategy.CONSTANT_RELATIVE_STEP
number_of_parameter_candidates: int = 100

def __post_init__(self):
Expand Down Expand Up @@ -131,7 +116,6 @@ def _find_candidate_parameters(
hist: histograms.DatasetHistograms,
parameters_to_tune: ParametersToTune,
metric: Optional[pipeline_dp.Metric],
strategy: ParametersSearchStrategy,
max_candidates: int) -> analysis.MultiParameterConfiguration:
"""Finds candidates for l0 and/or l_inf parameters.

Expand All @@ -140,19 +124,10 @@ def _find_candidate_parameters(
parameters_to_tune: which parameters to tune.
metric: dp aggregation for which candidates are computed. If metric is
None, it means no metrics to compute, i.e. only select partitions.
strategy: determines the strategy how to select candidates, see comments
to enum values for full description of the respective strategies.
max_candidates: how many candidates ((l0, linf) pairs) can be in the
output. Note that the output can contain fewer candidates. 100 is the
default, a heuristically chosen value; adjust it for your use case.
"""
if strategy == ParametersSearchStrategy.QUANTILES:
find_candidates_func = _find_candidates_quantiles
elif strategy == ParametersSearchStrategy.CONSTANT_RELATIVE_STEP:
find_candidates_func = _find_candidates_constant_relative_step
else:
raise ValueError("Unknown strategy for candidate parameters search.")

calculate_l0_param = parameters_to_tune.max_partitions_contributed
generate_linf = metric == pipeline_dp.Metrics.COUNT
calculate_linf_param = (parameters_to_tune.max_contributions_per_partition
Expand All @@ -161,22 +136,22 @@ def _find_candidate_parameters(

if calculate_l0_param and calculate_linf_param:
max_candidates_per_parameter = int(math.sqrt(max_candidates))
l0_candidates = find_candidates_func(hist.l0_contributions_histogram,
max_candidates_per_parameter)
linf_candidates = find_candidates_func(
l0_candidates = _find_candidates_constant_relative_step(
hist.l0_contributions_histogram, max_candidates_per_parameter)
linf_candidates = _find_candidates_constant_relative_step(
hist.linf_contributions_histogram, max_candidates_per_parameter)
l0_bounds, linf_bounds = [], []

# if linf or l0 has fewer candidates than requested then we can add more
# candidates for the other parameter.
if (len(linf_candidates) < max_candidates_per_parameter and
len(l0_candidates) == max_candidates_per_parameter):
l0_candidates = find_candidates_func(
l0_candidates = _find_candidates_constant_relative_step(
hist.l0_contributions_histogram,
int(max_candidates / len(linf_candidates)))
elif (len(l0_candidates) < max_candidates_per_parameter and
len(linf_candidates) == max_candidates_per_parameter):
linf_candidates = find_candidates_func(
linf_candidates = _find_candidates_constant_relative_step(
hist.linf_contributions_histogram,
int(max_candidates / len(l0_candidates)))

Expand All @@ -185,11 +160,11 @@ def _find_candidate_parameters(
l0_bounds.append(l0)
linf_bounds.append(linf)
elif calculate_l0_param:
l0_bounds = find_candidates_func(hist.l0_contributions_histogram,
max_candidates)
l0_bounds = _find_candidates_constant_relative_step(
hist.l0_contributions_histogram, max_candidates)
elif calculate_linf_param:
linf_bounds = find_candidates_func(hist.linf_contributions_histogram,
max_candidates)
linf_bounds = _find_candidates_constant_relative_step(
hist.linf_contributions_histogram, max_candidates)
else:
assert False, "Nothing to tune."

Expand All @@ -198,20 +173,15 @@ def _find_candidate_parameters(
max_contributions_per_partition=linf_bounds)


def _find_candidates_quantiles(histogram: histograms.Histogram,
max_candidates: int) -> List[int]:
"""Implementation of QUANTILES strategy."""
quantiles_to_use = [0.9, 0.95, 0.98, 0.99, 0.995]
candidates = histogram.quantiles(quantiles_to_use)
candidates.append(histogram.max_value())
candidates = list(set(candidates)) # remove duplicates
candidates.sort()
return candidates[:max_candidates]


def _find_candidates_constant_relative_step(histogram: histograms.Histogram,
max_candidates: int) -> List[int]:
"""Implementation of CONSTANT_RELATIVE_STEP strategy."""
"""Finds candidates with constant relative step.

Candidates are a sequence starting from 1 where relative difference
between two neighbouring elements is the same. Mathematically it means
that candidates are a sequence a_i, where
a_i = max_value^(i / (max_candidates - 1)), i in [0..(max_candidates - 1)]
"""
max_value = histogram.max_value()
assert max_value >= 1, "max_value has to be >= 1."
max_candidates = min(max_candidates, max_value)
Expand Down Expand Up @@ -244,8 +214,7 @@ def tune(col,
"""Tunes parameters.

It works in the following way:
1. Candidates for contribution bounding parameters chosen based on
options.parameters_search_strategy strategy.
1. Find candidates for contribution bounding parameters.
2. Utility analysis run for those parameters.
3. The best parameter set is chosen according to
options.minimizing_function.
Expand Down Expand Up @@ -282,7 +251,6 @@ def tune(col,

candidates = _find_candidate_parameters(
contribution_histograms, options.parameters_to_tune, metric,
options.parameters_search_strategy,
options.number_of_parameter_candidates)

utility_analysis_options = analysis.UtilityAnalysisOptions(
Expand Down
96 changes: 20 additions & 76 deletions analysis/tests/parameter_tuning_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import pipeline_dp
from analysis import metrics
from analysis import parameter_tuning
from analysis.parameter_tuning import ParametersSearchStrategy
from pipeline_dp.dataset_histograms import histograms
from pipeline_dp.dataset_histograms import computing_histograms

Expand All @@ -48,55 +47,12 @@ def _get_tune_options():

class ParameterTuning(parameterized.TestCase):

@parameterized.parameters(
(True, True, pipeline_dp.Metrics.COUNT, [1, 1, 2, 2, 6, 6
], [3, 6, 3, 6, 3, 6]),
(False, True, pipeline_dp.Metrics.COUNT, None, [3, 6]),
(True, False, pipeline_dp.Metrics.COUNT, [1, 2, 6], None),
(True, True, pipeline_dp.Metrics.PRIVACY_ID_COUNT, [1, 2, 6], None),
)
def test_find_candidate_parameters_quantiles_strategy(
self,
tune_max_partitions_contributed: bool,
tune_max_contributions_per_partition: bool,
metric: pipeline_dp.Metrics,
expected_max_partitions_contributed: List,
expected_max_contributions_per_partition: List,
):
mock_l0_histogram = histograms.Histogram(None, None)
mock_l0_histogram.quantiles = mock.Mock(return_value=[1, 1, 2])
mock_l0_histogram.max_value = mock.Mock(return_value=6)
mock_linf_histogram = histograms.Histogram(None, None)
mock_linf_histogram.quantiles = mock.Mock(return_value=[3, 6, 6])
mock_linf_histogram.max_value = mock.Mock(return_value=6)

mock_histograms = histograms.DatasetHistograms(mock_l0_histogram, None,
mock_linf_histogram,
None, None, None)
parameters_to_tune = parameter_tuning.ParametersToTune(
max_partitions_contributed=tune_max_partitions_contributed,
max_contributions_per_partition=tune_max_contributions_per_partition
)

candidates = parameter_tuning._find_candidate_parameters(
mock_histograms,
parameters_to_tune,
metric,
ParametersSearchStrategy.QUANTILES,
max_candidates=100)
self.assertEqual(expected_max_partitions_contributed,
candidates.max_partitions_contributed)
self.assertEqual(expected_max_contributions_per_partition,
candidates.max_contributions_per_partition)

def test_find_candidate_parameters_maximum_number_of_candidates_is_respected_when_both_parameters_needs_to_be_tuned(
self):
mock_l0_histogram = histograms.Histogram(None, None)
mock_l0_histogram.quantiles = mock.Mock(return_value=[1, 2, 3])
mock_l0_histogram.max_value = mock.Mock(return_value=6)
mock_linf_histogram = histograms.Histogram(None, None)
mock_linf_histogram.quantiles = mock.Mock(return_value=[4, 5, 6])
mock_linf_histogram.max_value = mock.Mock(return_value=6)
mock_linf_histogram.max_value = mock.Mock(return_value=3)

mock_histograms = histograms.DatasetHistograms(mock_l0_histogram, None,
mock_linf_histogram,
Expand All @@ -109,20 +65,17 @@ def test_find_candidate_parameters_maximum_number_of_candidates_is_respected_whe
mock_histograms,
parameters_to_tune,
pipeline_dp.Metrics.COUNT,
ParametersSearchStrategy.QUANTILES,
max_candidates=5)
self.assertEqual([1, 1, 2, 2], candidates.max_partitions_contributed)
self.assertEqual([4, 5, 4, 5],
self.assertEqual([1, 1, 6, 6], candidates.max_partitions_contributed)
self.assertEqual([1, 3, 1, 3],
candidates.max_contributions_per_partition)

def test_find_candidate_parameters_more_candidates_for_l_0_when_not_so_many_l_inf_candidates(
self):
mock_l0_histogram = histograms.Histogram(None, None)
mock_l0_histogram.quantiles = mock.Mock(return_value=[1, 2, 3, 4, 5])
mock_l0_histogram.max_value = mock.Mock(return_value=6)
mock_l0_histogram.max_value = mock.Mock(return_value=4)
mock_linf_histogram = histograms.Histogram(None, None)
mock_linf_histogram.quantiles = mock.Mock(return_value=[6, 7])
mock_linf_histogram.max_value = mock.Mock(return_value=6)
mock_linf_histogram.max_value = mock.Mock(return_value=2)

mock_histograms = histograms.DatasetHistograms(mock_l0_histogram, None,
mock_linf_histogram,
Expand All @@ -135,25 +88,21 @@ def test_find_candidate_parameters_more_candidates_for_l_0_when_not_so_many_l_in
mock_histograms,
parameters_to_tune,
pipeline_dp.Metrics.COUNT,
ParametersSearchStrategy.QUANTILES,
max_candidates=9)
# sqrt(9) = 3, but l_inf has only 2 quantiles, therefore for l_0 we can
# take 9 / 2 = 4 quantiles, we take first 4 quantiles (1, 2, 3, 4).
# Addition of max_value (6) to l_inf does not change anything because
# l_inf set already contains 6.
# sqrt(9) = 3, but l_inf has only 2 possible values,
# therefore for l_0 we can take 9 / 2 = 4 values,
# we take all 4 possible values (1, 2, 3, 4).
self.assertEqual([1, 1, 2, 2, 3, 3, 4, 4],
candidates.max_partitions_contributed)
self.assertEqual([6, 7, 6, 7, 6, 7, 6, 7],
self.assertEqual([1, 2, 1, 2, 1, 2, 1, 2],
candidates.max_contributions_per_partition)

def test_find_candidate_parameters_more_candidates_for_l_inf_when_not_so_many_l_0_candidates(
self):
mock_l0_histogram = histograms.Histogram(None, None)
mock_l0_histogram.quantiles = mock.Mock(return_value=[1])
mock_l0_histogram.max_value = mock.Mock(return_value=8)
mock_l0_histogram.max_value = mock.Mock(return_value=2)
mock_linf_histogram = histograms.Histogram(None, None)
mock_linf_histogram.quantiles = mock.Mock(return_value=[3, 4, 5, 6, 7])
mock_linf_histogram.max_value = mock.Mock(return_value=8)
mock_linf_histogram.max_value = mock.Mock(return_value=4)

mock_histograms = histograms.DatasetHistograms(mock_l0_histogram, None,
mock_linf_histogram,
Expand All @@ -166,13 +115,13 @@ def test_find_candidate_parameters_more_candidates_for_l_inf_when_not_so_many_l_
mock_histograms,
parameters_to_tune,
pipeline_dp.Metrics.COUNT,
ParametersSearchStrategy.QUANTILES,
max_candidates=10)
# sqrt(10) = 3, but l_0 has only 2 quantiles (1 and 8 -- max_value),
# therefore for l_inf we can take 10 / 2 = 5 quantiles.
self.assertEqual([1, 1, 1, 1, 1, 8, 8, 8, 8, 8],
max_candidates=9)
# sqrt(9) = 3, but l_0 has only 2 possible values,
# therefore for l_inf we can take 9 / 2 = 4 values,
# we take all 4 possible values (1, 2, 3, 4).
self.assertEqual([1, 1, 1, 1, 2, 2, 2, 2],
candidates.max_partitions_contributed)
self.assertEqual([3, 4, 5, 6, 7, 3, 4, 5, 6, 7],
self.assertEqual([1, 2, 3, 4, 1, 2, 3, 4],
candidates.max_contributions_per_partition)

@parameterized.named_parameters(
Expand Down Expand Up @@ -207,8 +156,8 @@ def test_find_candidate_parameters_more_candidates_for_l_inf_when_not_so_many_l_
max_candidates=5,
# ceil(1000^(i / 4)), where i in [0, 1, 2, 3, 4]
expected_candidates=[1, 6, 32, 178, 1000]))
def test_find_candidate_parameters_constant_relative_step_strategy(
self, max_value, max_candidates, expected_candidates):
def test_find_candidate_parameters(self, max_value, max_candidates,
expected_candidates):
mock_l0_histogram = histograms.Histogram(None, None)
mock_l0_histogram.max_value = mock.Mock(return_value=max_value)

Expand All @@ -222,7 +171,6 @@ def test_find_candidate_parameters_constant_relative_step_strategy(
mock_histograms,
parameters_to_tune,
pipeline_dp.Metrics.COUNT,
ParametersSearchStrategy.CONSTANT_RELATIVE_STEP,
max_candidates=max_candidates)

self.assertEqual(expected_candidates,
Expand Down Expand Up @@ -263,11 +211,7 @@ def test_find_candidate_parameters_generate_linf(
max_contributions_per_partition=True)

candidates = parameter_tuning._find_candidate_parameters(
mock_histograms,
parameters_to_tune,
metric,
ParametersSearchStrategy.CONSTANT_RELATIVE_STEP,
max_candidates=100)
mock_histograms, parameters_to_tune, metric, max_candidates=100)

mock_find_candidate_from_histogram.assert_any_call(
mock_l0_histogram, mock.ANY)
Expand Down
Loading