Skip to content

Commit

Permalink
Sparsity Preserving DP-SGD in TF Privacy
Browse files Browse the repository at this point in the history
Move get_registry_generator_fn from clip_grads.py to gradient_clipping_utils.py and change return type to dataclass.

See https://research.google/blog/sparsity-preserving-differentially-private-training/ for more details on the algorithm.

PiperOrigin-RevId: 660490424
  • Loading branch information
tensorflower-gardener committed Aug 7, 2024
1 parent fc6f1dc commit 753d130
Show file tree
Hide file tree
Showing 8 changed files with 280 additions and 201 deletions.
16 changes: 12 additions & 4 deletions tensorflow_privacy/privacy/fast_gradient_clipping/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,7 @@ py_test(
python_version = "PY3",
shard_count = 8,
srcs_version = "PY3",
deps = [
":gradient_clipping_utils",
":type_aliases",
],
deps = [":gradient_clipping_utils"],
)

py_library(
Expand All @@ -83,6 +80,11 @@ py_library(
],
)

py_library(
name = "noise_utils",
srcs = ["noise_utils.py"],
)

py_test(
name = "clip_grads_test",
srcs = ["clip_grads_test.py"],
Expand All @@ -96,3 +98,9 @@ py_test(
":type_aliases",
],
)

py_test(
name = "noise_utils_test",
srcs = ["noise_utils_test.py"],
deps = [":noise_utils"],
)
52 changes: 10 additions & 42 deletions tensorflow_privacy/privacy/fast_gradient_clipping/clip_grads.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,43 +32,6 @@
from tensorflow_privacy.privacy.fast_gradient_clipping import type_aliases


def get_registry_generator_fn(
tape: tf.GradientTape,
layer_registry: lr.LayerRegistry,
num_microbatches: Optional[type_aliases.BatchSize] = None,
):
"""Creates the generator function for `compute_gradient_norms()`."""
if layer_registry is None:
# Needed for backwards compatibility.
registry_generator_fn = None
else:

def registry_generator_fn(layer_instance, args, kwargs):
if layer_instance.trainable_variables:
# Only trainable variables factor into the gradient.
if not layer_registry.is_elem(layer_instance):
raise NotImplementedError(
'Layer %s is not in the registry of known layers that can '
'be used for efficient gradient clipping.'
% layer_instance.__class__.__name__
)
registry_fn = layer_registry.lookup(layer_instance)
(layer_vars, layer_outputs, layer_sqr_norm_fn) = registry_fn(
layer_instance, args, kwargs, tape, num_microbatches
)
return layer_outputs, (
str(id(layer_instance)),
layer_vars,
layer_sqr_norm_fn,
layer_instance.trainable_weights,
)
else:
# Non-trainable layer.
return layer_instance(*args, **kwargs), None

return registry_generator_fn


def _infer_per_example_loss_fn(model: tf.keras.Model):
"""Infer the per-example loss from model config."""

Expand Down Expand Up @@ -190,7 +153,7 @@ def compute_gradient_norms(
are applied prior to clipping.
"""
tape = tf.GradientTape(persistent=True, watch_accessed_variables=False)
registry_generator_fn = get_registry_generator_fn(
registry_generator_fn = gradient_clipping_utils.get_registry_generator_fn(
tape, layer_registry, num_microbatches
)
# First loop computes the model outputs, summed loss, and generator outputs.
Expand Down Expand Up @@ -241,12 +204,17 @@ def compute_gradient_norms(
# $sqrt(k * \sum_i c_i^2)$ where $c_i$ is the norm estimate of its i-th
# occurrence. This is an over-estimate of the actual norm. For more details,
# see the explanation in go/dp-sgd-shared-weights.
for layer_id, v, f, weights_list in filtered_outputs:
for registry_fn_output in filtered_outputs:
if trainable_vars is None or any(
w.ref() in trainable_vars for w in weights_list
w.ref() in trainable_vars
for w in registry_fn_output.layer_trainable_weights
):
layer_vars[layer_id].append(v)
layer_sqr_norm_fns[layer_id].append(f)
layer_vars[registry_fn_output.layer_id].append(
registry_fn_output.layer_vars
)
layer_sqr_norm_fns[registry_fn_output.layer_id].append(
registry_fn_output.layer_sqr_norm_fn
)
# Second loop evaluates the squared L2 norm functions and appends the results.
layer_grad_vars = tape.gradient(
summed_loss,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,23 @@
# limitations under the License.
"""Utility functions that help in the computation of per-example gradient norms."""

from collections.abc import Sequence, Set
from typing import Any, Literal, Optional
from collections.abc import Callable, Sequence, Set
import dataclasses
from typing import Any, Optional, Tuple

from absl import logging
import tensorflow as tf
from tensorflow_privacy.privacy.fast_gradient_clipping import layer_registry as lr
from tensorflow_privacy.privacy.fast_gradient_clipping import type_aliases


@dataclasses.dataclass(frozen=True)
class RegistryGeneratorFunctionOutput:
layer_id: str
layer_vars: Optional[Sequence[tf.Variable]]
layer_sqr_norm_fn: Optional[type_aliases.SquareNormFunction]
layer_trainable_weights: Optional[Sequence[tf.Variable]]


def has_internal_compute_graph(input_object: Any):
"""Checks if input is a TF model and has a TF internal compute graph."""
return (
Expand All @@ -33,6 +41,63 @@ def has_internal_compute_graph(input_object: Any):
)


def get_registry_generator_fn(
tape: tf.GradientTape,
layer_registry: lr.LayerRegistry,
num_microbatches: Optional[type_aliases.BatchSize] = None,
) -> Optional[Callable[..., Tuple[tf.Tensor, RegistryGeneratorFunctionOutput]]]:
"""Creates the generator function for `model_forward_backward_pass()`.
Args:
tape: The `tf.GradientTape` to use for the gradient computation.
layer_registry: A `dict` of layers that support "fast" gradient norm
computations. The key is the class of the layer and the value is a
function that returns a `tuple` `(output, sqr_grad_norms, vars)`, where
`output` is the pre-activator tensor, `sqr_grad_norms` is related to the
squared norms of a layer's pre-activation tensor, and `vars` are relevant
trainable
num_microbatches: An optional number or scalar `tf.Tensor` for the number of
microbatches. If not None, indicates that the loss is grouped into
num_microbatches (in this case, the batch dimension needs to be a multiple
of num_microbatches).
Returns:
A function that returns a `tuple` `(output, sqr_grad_norms, vars)`, where
`output` is the pre-activator tensor, `sqr_grad_norms` is related to the
squared norms of a layer's pre-activation tensor, and `vars` are relevant
trainable variables.
"""
if layer_registry is None:
# Needed for backwards compatibility.
registry_generator_fn = None
else:

def registry_generator_fn(layer_instance, args, kwargs):
if layer_instance.trainable_variables:
# Only trainable variables factor into the gradient.
if not layer_registry.is_elem(layer_instance):
raise NotImplementedError(
'Layer %s is not in the registry of known layers that can '
'be used for efficient gradient clipping.'
% layer_instance.__class__.__name__
)
registry_fn = layer_registry.lookup(layer_instance)
(layer_vars, layer_outputs, layer_sqr_norm_fn) = registry_fn(
layer_instance, args, kwargs, tape, num_microbatches
)
return layer_outputs, RegistryGeneratorFunctionOutput(
layer_id=str(id(layer_instance)),
layer_vars=layer_vars,
layer_sqr_norm_fn=layer_sqr_norm_fn,
layer_trainable_weights=layer_instance.trainable_weights,
)
else:
# Non-trainable layer.
return layer_instance(*args, **kwargs), None

return registry_generator_fn


def model_forward_pass(
input_model: tf.keras.Model,
inputs: type_aliases.PackedTensors,
Expand Down Expand Up @@ -144,101 +209,6 @@ def all_trainable_layers_are_registered(
return True


def _infer_loss_reduction_type(model: tf.keras.Model):
"""Infers what type of loss reduction is being performed."""
model_loss = model.loss
if isinstance(model_loss, tf.keras.losses.Loss):
return model_loss.reduction
elif isinstance(model.loss, dict):
reductions = set()
compiled_loss = model.compiled_loss
if compiled_loss is None:
raise ValueError('Model must be compiled for adding noise')
new_config_list = compiled_loss.get_config()['losses']
for loss_config in new_config_list:
reductions.add(loss_config['config']['reduction'])
if len(reductions) > 1:
raise ValueError(
'Reductions in models with multiple losses must all be the same'
)
return reductions.pop()
else:
raise ValueError(
'Unsupported type for adding noise: {}'.format(type(model_loss))
)


def add_aggregate_noise(
clipped_grads: list[tf.Tensor],
batch_size: tf.Tensor,
l2_norm_clip: float,
noise_multiplier: float,
loss_reduction: Optional[Literal['mean', 'sum']] = None,
loss_model: Optional[tf.keras.Model] = None,
) -> Sequence[tf.Tensor]:
"""Adds noise to a collection of clipped gradients.
The magnitude of the noise depends on the aggregation strategy of the
input model's loss function.
Args:
clipped_grads: A list of `tf.Tensor`s representing the clipped gradients.
batch_size: The batch size. Used for normalizing the noise when
`loss_reduction` is 'sum'.
l2_norm_clip: Clipping norm (max L2 norm of each gradient).
noise_multiplier: Ratio of the standard deviation to the clipping norm.
loss_reduction: An string description of how the loss is reduced over
examples. Currently supports 'mean' and 'sum'. If `None`, then the
aggregation type must be inferred from `input_model.loss`.
loss_model: An optional `tf.keras.Model` used to infer the loss reduction
strategy from if `loss_reduction` is `None`.
Returns:
A list of tensors containing the clipped gradients, but with the right
amount of Gaussian noise added to them (depending on the reduction
strategy of the loss function).
Raises:
ValueError: If both `loss_model` and `loss_reduction` are `None` or if
they are both not `None`.
"""
if loss_reduction is None and loss_model is None:
raise ValueError(
'Exactly one of `loss_reduction` and `loss_model` must be populated.'
' Instead, both arguments were `None`.'
)
if loss_reduction is not None and loss_model is not None:
raise ValueError(
'Exactly one of `loss_reduction` and `loss_model` must be populated.'
' Instead, both arguments were not `None`.'
)

if loss_reduction is None and loss_model is not None:
implicit_mean_reductions = [
tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE,
tf.keras.losses.Reduction.AUTO,
]
model_reduction = _infer_loss_reduction_type(loss_model)
loss_reduction = (
'mean' if model_reduction in implicit_mean_reductions else 'sum'
)
if model_reduction == tf.keras.losses.Reduction.AUTO:
logging.info(
'Assuming that the model loss reduction is `SUM_OVER_BATCH_SIZE`.'
)

scale = l2_norm_clip
if loss_reduction == 'mean':
scale /= tf.cast(batch_size, tf.float32)

def add_noise(g):
return g + tf.random.normal(
tf.shape(g), mean=0.0, stddev=noise_multiplier * scale
)

return tf.nest.map_structure(add_noise, clipped_grads)


def generate_model_outputs_using_core_keras_layers(
input_model: tf.keras.Model,
custom_layer_set: Optional[Set[type]] = None, # pylint: disable=g-bare-generic
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from typing import Any

from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow_privacy.privacy.fast_gradient_clipping import gradient_clipping_utils

Expand Down Expand Up @@ -135,60 +134,6 @@ def test_outputs_are_consistent(
self.assertAllClose(computed_outputs, true_outputs)


class AddAggregateNoise(tf.test.TestCase, parameterized.TestCase):

@parameterized.product(
l2_norm_clip=[3.0, 5.0],
noise_multiplier=[2.0, 4.0],
batch_size=[1, 2, 10],
model_fn_reduction=[None, 'auto', 'sum_over_batch_size', 'sum'],
noise_fn_reduction=[None, 'mean', 'sum'],
)
def test_noise_is_computed_correctly(
self,
l2_norm_clip,
noise_multiplier,
batch_size,
model_fn_reduction,
noise_fn_reduction,
):
# Skip invalid combinations.
if model_fn_reduction is None and noise_fn_reduction is None:
return
if model_fn_reduction is not None and noise_fn_reduction is not None:
return
# Make an simple model container for storing the loss.
if model_fn_reduction is not None:
linear_model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
linear_model.compile(
loss=tf.keras.losses.MeanSquaredError(reduction=model_fn_reduction)
)
else:
linear_model = None
# The main computation is done on a deterministic dummy vector.
num_units = 100
clipped_grads = [
tf.expand_dims(np.arange(num_units, dtype=np.float32), axis=-1)
]
noised_grads = gradient_clipping_utils.add_aggregate_noise(
clipped_grads,
batch_size,
l2_norm_clip,
noise_multiplier,
noise_fn_reduction,
linear_model,
)
# The only measure that varies is the standard deviation of the variation.
scale = (
1.0
if noise_fn_reduction == 'sum' or model_fn_reduction == 'sum'
else 1.0 / batch_size
)
computed_std = np.std(noised_grads[0] - clipped_grads[0])
expected_std = l2_norm_clip * noise_multiplier * scale
self.assertNear(computed_std, expected_std, 0.1 * expected_std)


class GenerateOutputsUsingCoreKerasLayers(
tf.test.TestCase, parameterized.TestCase
):
Expand Down
Loading

0 comments on commit 753d130

Please sign in to comment.