Skip to content

Commit

Permalink
Utility analysis for pre-thresholding (#487)
Browse files Browse the repository at this point in the history
  • Loading branch information
dvadym authored Sep 13, 2023
1 parent 09b7aad commit 4f32de8
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 15 deletions.
7 changes: 4 additions & 3 deletions analysis/cross_partition_combiners.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,13 +262,14 @@ def _average_utility_report(report: metrics.UtilityReport, sums_actual: Tuple,
return

for sum_actual, metric_error in zip(sums_actual, report.metric_errors):
scaling_factor = 0 if total_weight == 0 else 1.0 / total_weight
_multiply_float_dataclasses_field(
metric_error,
1.0 / total_weight,
scaling_factor,
fields_to_ignore=["noise_std", "ratio_data_dropped"])
scaling_factor = 1 if sum_actual == 0 else 1.0 / sum_actual
dropped_scaling_factor = 1 if sum_actual == 0 else 1.0 / sum_actual
_multiply_float_dataclasses_field(metric_error.ratio_data_dropped,
scaling_factor)
dropped_scaling_factor)


def partition_size_weight_fn(
Expand Down
11 changes: 7 additions & 4 deletions analysis/per_partition_combiners.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,8 @@ def compute_probability_to_keep(self,
partition_selection_strategy: pipeline_dp.
PartitionSelectionStrategy, eps: float,
delta: float,
max_partitions_contributed: int) -> float:
max_partitions_contributed: int,
pre_threshold: Optional[int]) -> float:
"""Computes the probability that this partition is kept.
If self.probabilities is set, then the computed probability is exact,
Expand All @@ -131,7 +132,7 @@ def compute_probability_to_keep(self,
pmf = self._compute_pmf()
ps_strategy = partition_selection.create_partition_selection_strategy(
partition_selection_strategy, eps, delta,
max_partitions_contributed)
max_partitions_contributed, pre_threshold)
probability = 0
for i, prob in enumerate(pmf.probabilities, pmf.start):
probability += prob * ps_strategy.probability_of_keep(i)
Expand Down Expand Up @@ -217,9 +218,11 @@ def compute_metrics(self, acc: PartitionSelectionAccumulator) -> float:
probs, moments = acc
params = self._params
calculator = PartitionSelectionCalculator(probs, moments)
aggregate_params = params.aggregate_params
return calculator.compute_probability_to_keep(
params.aggregate_params.partition_selection_strategy, params.eps,
params.delta, params.aggregate_params.max_partitions_contributed)
aggregate_params.partition_selection_strategy, params.eps,
params.delta, aggregate_params.max_partitions_contributed,
aggregate_params.pre_threshold)


class SumCombiner(UtilityAnalysisCombiner):
Expand Down
23 changes: 17 additions & 6 deletions analysis/tests/per_partition_combiners_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,26 +202,37 @@ def test_add_accumulators_moments(self):
eps=100,
delta=0.5,
probabilities=[1.0] * 100,
expected_probability_to_keep=1.0),
expected_probability_to_keep=1.0,
pre_threshold=None),
dict(testcase_name='Small eps delta',
eps=1,
delta=1e-5,
probabilities=[0.1] * 100,
expected_probability_to_keep=0.3321336253750503),
expected_probability_to_keep=0.3321336253750503,
pre_threshold=None),
dict(testcase_name='All probabilities = 1',
eps=1,
delta=1e-5,
probabilities=[1] * 10,
expected_probability_to_keep=0.12818308050524607),
expected_probability_to_keep=0.12818308050524607,
pre_threshold=None),
dict(testcase_name='All probabilities = 1 with pre_threshold',
eps=1,
delta=1e-5,
probabilities=[1] * 12,
expected_probability_to_keep=0.12818308050524607,
pre_threshold=3),
)
def test_partition_selection_accumulator_compute_probability(
self, eps, delta, probabilities, expected_probability_to_keep):
self, eps, delta, probabilities, expected_probability_to_keep,
pre_threshold):
acc = combiners.PartitionSelectionCalculator(probabilities)
prob_to_keep = acc.compute_probability_to_keep(
pipeline_dp.PartitionSelectionStrategy.TRUNCATED_GEOMETRIC,
eps,
delta,
max_partitions_contributed=1)
max_partitions_contributed=1,
pre_threshold=pre_threshold)
self.assertAlmostEqual(expected_probability_to_keep,
prob_to_keep,
delta=1e-10)
Expand All @@ -242,7 +253,7 @@ def test_partition_selection_combiner(self,
combiner.compute_metrics(acc)
mock_compute_probability_to_keep.assert_called_with(
pipeline_dp.PartitionSelectionStrategy.TRUNCATED_GEOMETRIC,
params.eps, params.delta, 1)
params.eps, params.delta, 1, None)


def _create_combiner_params_for_sum(
Expand Down
43 changes: 43 additions & 0 deletions analysis/tests/utility_analysis_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from analysis import metrics
from analysis import utility_analysis
from analysis.tests import common
from typing import Optional


class UtilityAnalysis(parameterized.TestCase):
Expand Down Expand Up @@ -322,6 +323,48 @@ def test_unnest_metrics(self):
self.assertEqual(output[2], ((1, None), input_data[1]))
self.assertEqual(output[3], ((1, 100), input_data[1]))

# Checks the partition selection keep-probability reported by
# perform_utility_analysis for one partition, once without a pre-threshold
# and once with pre_threshold=3. Each named case supplies the pre_threshold
# to put into AggregateParams and the probability expected for it.
@parameterized.named_parameters(
dict(testcase_name="without pre-threshold",
pre_threshold=None,
expected_prob=0.612579511),
dict(testcase_name="with pre-threshold",
pre_threshold=3,
expected_prob=0.0644512636),
)
def test_select_partition(self, pre_threshold: Optional[int],
expected_prob: float):
# Arrange
# metrics=[]: no aggregation metrics are requested; the assertion below
# only reads partition_selection_probability_to_keep.
aggregate_params = pipeline_dp.AggregateParams(
noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
metrics=[],
max_partitions_contributed=1,
max_contributions_per_partition=2,
pre_threshold=pre_threshold)

# Input collection has 10 privacy ids where each privacy id
# contributes to the same 10 partitions, three times in each partition.
# In each (i, j) tuple, i is the privacy id and j selects the partition
# (see the extractors below).
col = [(i, j) for i in range(10) for j in range(10)] * 3

data_extractors = pipeline_dp.DataExtractors(
privacy_id_extractor=lambda x: x[0],
partition_extractor=lambda x: f"pk{x[1]}",
value_extractor=lambda x: 1)

# Act: only the second element of the returned pair (the per-partition
# result) is used; the first element is discarded.
_, per_partition_result = analysis.perform_utility_analysis(
col=col,
backend=pipeline_dp.LocalBackend(),
options=analysis.UtilityAnalysisOptions(
epsilon=3, delta=0.9, aggregate_params=aggregate_params),
data_extractors=data_extractors)

# Materialize the result so that it can be indexed below.
per_partition_result = list(per_partition_result)

# Assert
# Only the first returned element is checked and `key` is not used.
# NOTE(review): every partition receives identical input here, so
# presumably the probability does not depend on which partition comes
# first - confirm.
key, per_partition_metrics = per_partition_result[0]
# assertAlmostEqual is called without places/delta, so unittest's default
# tolerance applies.
self.assertAlmostEqual(
per_partition_metrics.partition_selection_probability_to_keep,
expected_prob)


if __name__ == '__main__':
absltest.main()
6 changes: 4 additions & 2 deletions examples/restaurant_visits/run_without_frameworks_tuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
from absl import flags
import pipeline_dp
import pandas as pd
import collections

import analysis
from analysis import parameter_tuning
Expand All @@ -39,6 +38,8 @@
'Whether public partitions are used')
flags.DEFINE_boolean('run_on_preaggregated_data', False,
'If true, the data is preaggregated before tuning')
flags.DEFINE_integer('pre_threshold', None,
'Pre threshold which is used in the DP aggregation')


def write_to_file(col, filename):
Expand Down Expand Up @@ -68,7 +69,8 @@ def get_aggregate_params():
noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
metrics=[pipeline_dp.Metrics.COUNT],
max_partitions_contributed=1,
max_contributions_per_partition=1)
max_contributions_per_partition=1,
pre_threshold=FLAGS.pre_threshold)


def get_data_extractors():
Expand Down

0 comments on commit 4f32de8

Please sign in to comment.