ContrastiveLosses.py (forked from ttsse/ContrastiveLosses)
import tensorflow as tf


def gumbel_max(logits, K):
    # Gumbel-max trick: adding Gumbel noise to the logits and taking the top K yields K
    # indices per row, sampled without replacement with probabilities proportional to exp(logits).
    # Inspired by (blatantly taken from) https://github.com/tensorflow/tensorflow/issues/9260
    if tf.shape(logits)[0] == 0:
        return tf.convert_to_tensor([[]], dtype=tf.int32)
    else:
        z = -tf.math.log(-tf.math.log(tf.random.uniform(tf.shape(logits), 0, 1)))
        _, indices = tf.nn.top_k(logits + z, K)
        return indices
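# A minimal usage sketch on toy data: draw K = 2 of 4 indices per row, where indices with
# larger logits are more likely to come out on top.
if __name__ == "__main__":
    example_logits = tf.math.log(tf.constant([[0.6, 0.2, 0.1, 0.1]]))
    print(gumbel_max(example_logits, K=2))  # e.g. [[0 1]]; index 0 is the most probable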
@tf.function
def lp_distance(x, y, p):
    """
    Compute the L_p distance, raised to the p-th power, between corresponding rows of x and y.
    Assumes that x and y are ordered as [samples, embedding_dimension].
    """
    return tf.reduce_sum(tf.abs(x - y) ** p, axis=1)
@tf.function
def distance_matrix(x, y):
    """
    Compute the pairwise L_2 distance between all pairs in x and y.
    Assumes that x and y are ordered as [samples, embedding_dimension].
    """
    return tf.sqrt(tf.reduce_sum((x[:, :, tf.newaxis] - tf.transpose(y[:, :, tf.newaxis])) ** 2, axis=1))
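# A minimal shape check on toy data: 3 anchors against a pool of 5 points in 2-D yield a
# [3, 5] matrix of pairwise distances.
if __name__ == "__main__":
    example_x = tf.random.normal([3, 2])
    example_y = tf.random.normal([5, 2])
    print(distance_matrix(example_x, example_y).shape)  # (3, 5)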
@tf.function
def random_negatives(anchors, negative_pool, n_pairs):
    """
    Draws n_pairs negative samples uniformly at random from the negative pool for each sample in anchors.
    """
    n = tf.shape(negative_pool)[0]
    n_pairs = tf.minimum(n_pairs, n - 1)
    indices = tf.random.uniform(shape=[tf.shape(anchors)[0], n_pairs], minval=0, maxval=n,
                                dtype=tf.dtypes.int32)
    N = tf.gather(params=negative_pool, indices=indices)
    return N
@tf.function
def negatives_by_distance_random(anchors, negative_pool, n_pairs, alpha=2):
    """
    Draws n_pairs negative samples from the negative_pool for each sample in anchors.
    The choice is made by drawing at random, with weights corresponding to the inverse
    distance raised to the power alpha.
    """
    n = tf.shape(negative_pool)[0]
    n_pairs = tf.minimum(n_pairs, n - 1)
    distances = distance_matrix(anchors, negative_pool) ** alpha
    # The identity trick gives each diagonal entry a weight of 1/(0 + 1) - 1 = 0, i.e. a
    # log-weight of -inf, so an anchor is never drawn as its own negative. This assumes
    # that anchor i is row i of the negative pool, so that distances[i, i] == 0.
    I = tf.eye(tf.shape(distances)[0], tf.shape(distances)[1])
    logodds = tf.math.log((distances + I) ** -1 - I)
    inds = gumbel_max(logodds, n_pairs)
    N = tf.gather(negative_pool, inds)
    return N
@tf.function
def negatives_by_distance(anchors, negative_pool, n_pairs):
    """
    Draws n_pairs negative samples from the negative_pool for each sample in anchors.
    The choice is made by taking only the n_pairs closest ones.
    """
    n = tf.math.minimum(tf.shape(anchors)[0], tf.shape(negative_pool)[0])
    n_pairs = tf.minimum(n_pairs, n - 1)
    distances = distance_matrix(anchors, negative_pool)
    argsorted_indices = tf.argsort(distances, axis=1, direction="ASCENDING")
    # Skip column 0, the closest point, which is assumed to be the anchor itself.
    indices = argsorted_indices[:, 1:n_pairs + 1]
    N = tf.gather(params=negative_pool, indices=indices)
    return N
def generate_negatives(mode, n_pairs):
    # Returns a sampler with signature (anchors, negative_pool) -> negatives for the given mode.
    if mode == "random":
        generate_negatives_fun = lambda anchors, negative_pool: random_negatives(anchors, negative_pool, n_pairs)
    elif mode == "closest":
        generate_negatives_fun = lambda anchors, negative_pool: negatives_by_distance(anchors, negative_pool, n_pairs)
    elif mode == "distance_weighted_random":
        generate_negatives_fun = lambda anchors, negative_pool: negatives_by_distance_random(anchors, negative_pool, n_pairs)
    else:
        raise ValueError(f"Incorrect mode for the choice of negatives in the loss function. Mode \"{mode}\" is not supported: only \"random\", \"closest\", and \"distance_weighted_random\" are implemented.")
    return generate_negatives_fun
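# A minimal usage sketch: every mode yields a sampler with the same (anchors, negative_pool)
# signature, returning a [n_anchors, n_pairs, embedding_dimension] tensor of negatives.
if __name__ == "__main__":
    example_sampler = generate_negatives(mode="closest", n_pairs=2)
    example_pool = tf.random.normal([10, 2])
    print(example_sampler(example_pool[:4], example_pool).shape)  # (4, 2, 2)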
class ContrastiveLoss():
    def __init__(self):
        raise NotImplementedError("This is a base class for the implementations; instantiate one of its subclasses instead.")

    def __call__(self, anchors, positives):
        if tf.distribute.has_strategy():
            # Under a distribution strategy, the samples gathered from all replicas serve as the
            # negative pool. The first term treats the global anchors as a fixed pool; the second
            # lets gradients flow through the local anchors when they act as negatives.
            global_anchors, global_positives = self.gather_samples(anchors, positives)
            loss = (self.compute_loss(anchors, positives, tf.stop_gradient(global_anchors))
                    + self.compute_loss(tf.stop_gradient(global_anchors), tf.stop_gradient(global_positives), anchors)) / 2
        else:
            loss = self.compute_loss(anchors, positives, anchors)
        return loss

    def gather_samples(self, anchors, positives):
        # Gather the anchors and positives from all replicas in the current strategy.
        global_anchors = tf.distribute.get_replica_context().all_gather(anchors, axis=0)
        global_positives = tf.distribute.get_replica_context().all_gather(positives, axis=0)
        return global_anchors, global_positives

    def compute_loss(self, anchors, positives, negative_pool):
        raise NotImplementedError("compute_loss is not implemented for the base class.")
class Triplet_loss(ContrastiveLoss):
    def __init__(self, alpha=1., mode='random', distance="L2"):
        self.alpha = alpha
        self.mode = mode
        if distance == "L2":
            self.distance = lambda A, P: lp_distance(A, P, 2)
        elif distance == "L1":
            self.distance = lambda A, P: lp_distance(A, P, 1)
        else:
            raise ValueError("Currently only the L1 and L2 distances are supported.")
        self.generate_negatives = generate_negatives(mode=mode, n_pairs=1)

    @tf.function
    def compute_loss(self, anchors, positives, negative_pool):
        negatives = tf.squeeze(self.generate_negatives(anchors, negative_pool), axis=1)
        anchor_pos = self.distance(anchors, positives)
        anchor_neg = self.distance(anchors, negatives)
        # Hinge on the margin alpha: positives should be closer than negatives by at least alpha.
        loss = tf.reduce_sum(tf.math.maximum(anchor_pos - anchor_neg + self.alpha, 0))
        return loss
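# A minimal smoke test on toy data: positives are noisy copies of their anchors, and without
# a distribution strategy the batch itself serves as the negative pool.
if __name__ == "__main__":
    example_triplet = Triplet_loss(alpha=1., mode="random")
    example_anchors = tf.random.normal([8, 2])
    example_positives = example_anchors + 0.05 * tf.random.normal([8, 2])
    print(example_triplet(example_anchors, example_positives))  # scalar loss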
class n_pair(ContrastiveLoss):
    def __init__(self, n_pairs=20, alpha=0., mode='distance_weighted_random', distance="L2"):
        self.mode = mode
        self.alpha = alpha
        if distance == "L2":
            self.distance = lambda A, P: lp_distance(A, P, 2)
        elif distance == "L1":
            self.distance = lambda A, P: lp_distance(A, P, 1)
        else:
            raise ValueError("Currently only the L1 and L2 distances are supported.")
        self.generate_negatives = generate_negatives(mode=mode, n_pairs=n_pairs)

    @tf.function
    def compute_loss(self, anchors, positives, negative_pool):
        """
        Implementation of the N-pair loss from "Improved Deep Metric Learning with Multi-class
        N-pair Loss Objective" by Sohn, 2016.
        """
        # Skip empty batches, which may come up in distributed training.
        if tf.shape(anchors)[0] == 0 or tf.shape(negative_pool)[0] == 0:
            return 0.
        A = anchors
        P = positives
        N = self.generate_negatives(anchors, negative_pool)
        N_pairs = tf.shape(N)[1]
        dot_pos = tf.reduce_sum(A * P, axis=1)
        dot_neg = tf.reduce_sum(tf.tile(A[:, tf.newaxis, :], [1, N_pairs, 1]) * N, axis=2)
        # Subtract the per-sample maximum before exponentiating, for numerical stability.
        m = tf.math.maximum(dot_pos, tf.reduce_max(dot_neg, axis=-1))
        pos = tf.math.exp(dot_pos - m)
        neg = tf.reduce_sum(tf.math.exp(dot_neg - m[:, tf.newaxis]), axis=1)
        loss = -tf.reduce_sum(tf.math.log(pos / (neg + pos)))
        return loss
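# A minimal smoke test on toy data: the N-pair loss contrasts each anchor-positive dot
# product against the dot products with several negatives at once.
if __name__ == "__main__":
    example_npair = n_pair(n_pairs=5)
    example_a = tf.math.l2_normalize(tf.random.normal([8, 4]), axis=1)
    example_p = tf.math.l2_normalize(example_a + 0.1 * tf.random.normal([8, 4]), axis=1)
    print(example_npair(example_a, example_p))  # scalar loss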
class centroid(ContrastiveLoss):
    def __init__(self, n_pairs=20, mode='distance_weighted_random', distance="L2"):
        self.mode = mode
        if distance == "L2":
            self.distance = lambda A, P: lp_distance(A, P, 2)
        elif distance == "L1":
            self.distance = lambda A, P: lp_distance(A, P, 1)
        else:
            raise ValueError("Currently only the L1 and L2 distances are supported.")
        self.generate_negatives = generate_negatives(mode=mode, n_pairs=n_pairs)

    @tf.function
    def compute_loss(self, anchors, positives, negative_pool):
        """
        This loss expects a 2-D output with no L2 normalization applied to it.
        In the single-GPU case, the input shapes are [batch size, embedding dimension (usually 2)].
        The idea is the "regular" N-pair loss, but instead of computing vectors from the origin,
        they are computed with respect to some other point. Using the mean coordinate of all
        considered samples was discussed first, but for large n that would on average amount to
        using approximately the origin. Carl had the idea of using a different centroid for each
        considered negative of each sample: C = (A + P + 2*N)/4, weighting N twice since A and P
        will most likely lie close together.
        The loss function is sum_samples( log(1 + sum_negatives( exp( (A-C)'*(N-C) - (A-C)'*(P-C) ))) ).
        """
        n = tf.shape(anchors)[0]
        # Skip empty batches, which may come up in distributed training.
        if tf.shape(anchors)[0] == 0 or tf.shape(negative_pool)[0] == 0:
            return 0.
        A = anchors
        P = positives
        N = self.generate_negatives(anchors, negative_pool)
        A_full = tf.tile(A[:, tf.newaxis, :], [1, tf.shape(N)[1], 1])
        P_full = tf.tile(P[:, tf.newaxis, :], [1, tf.shape(N)[1], 1])
        C = (A_full + P_full + 2 * N) / 4
        AC = A_full - C
        NC = N - C
        PC = P_full - C
        # Normalize by the largest of the three squared norms, for numerical stability.
        max_vec = tf.tile(tf.reduce_max(tf.stack(
            [tf.reduce_sum(AC ** 2, axis=-1), tf.reduce_sum(NC ** 2, axis=-1), tf.reduce_sum(PC ** 2, axis=-1)],
            axis=-1), axis=-1)[:, :, tf.newaxis], [1, 1, tf.shape(anchors)[1]])
        eps = 1e-12
        num = tf.reduce_sum(AC * NC / (max_vec + eps), axis=2)
        denom = tf.reduce_sum(AC * PC / (max_vec + eps), axis=2)
        m = tf.reduce_max(tf.stack([num, denom], axis=-1), axis=-1)
        loss = 1 / tf.cast(n, tf.float32) * tf.reduce_sum(tf.math.log(
            1 + tf.reduce_sum(-tf.math.exp(-2.) + tf.math.exp(num - m) / (tf.math.exp(denom - m) + eps), axis=1)))
        return loss
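# A minimal smoke test on toy data: the centroid loss is intended for unnormalized 2-D
# embeddings, with positives lying close to their anchors.
if __name__ == "__main__":
    example_centroid = centroid(n_pairs=5)
    example_a = tf.random.normal([8, 2])
    print(example_centroid(example_a, example_a + 0.05 * tf.random.normal([8, 2])))  # scalar loss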
class debiased_contrastive_loss(ContrastiveLoss):
    def __init__(self, n_pairs=20, mode='distance_weighted_random', distance="L2"):
        self.mode = mode
        if distance == "L2":
            self.distance = lambda A, P: lp_distance(A, P, 2)
        elif distance == "L1":
            self.distance = lambda A, P: lp_distance(A, P, 1)
        else:
            raise ValueError("Currently only the L1 and L2 distances are supported.")
        self.generate_negatives = generate_negatives(mode=mode, n_pairs=n_pairs)

    @tf.function
    def compute_loss(self, anchors, positives, negative_pool):
        """
        This implements the debiased contrastive loss, as presented in "Debiased Contrastive
        Learning", Chuang et al., 2020. It essentially assumes that the output is L2 normalized,
        that is, that the output domain is the d-dimensional hypersphere.
        """
        # Skip empty batches, which may come up in distributed training.
        if tf.shape(anchors)[0] == 0 or tf.shape(negative_pool)[0] == 0:
            return 0.
        A = anchors
        P = positives
        N = self.generate_negatives(anchors, negative_pool)
        N_pairs = tf.shape(N)[1]
        dot_pos = tf.reduce_sum(A * P, axis=1)
        dot_neg = tf.reduce_sum(tf.tile(A[:, tf.newaxis, :], [1, N_pairs, 1]) * N, axis=2)
        # Subtract the global maximum before exponentiating, for numerical stability.
        m = tf.math.reduce_max([tf.reduce_max(dot_neg), tf.reduce_max(dot_pos)])
        neg = tf.reduce_sum(tf.math.exp(dot_neg - m), axis=1)
        pos = tf.math.exp(dot_pos - m)
        tau_plus = 0.01  # Class-prior probability used by the debiasing correction.
        t = 1.           # Temperature.
        eps = 1e-12
        N_pairs2 = tf.cast(N_pairs, tf.float32)
        # Debiased negative term, clipped from below at N * exp(-1/t) (shifted by m for stability).
        Ng = tf.math.maximum((-N_pairs2 * tau_plus * pos + neg) / (1 - tau_plus), N_pairs2 * tf.math.exp(-1 / t - m))
        debiased_loss = -tf.reduce_sum(tf.math.log(pos / (pos + Ng + eps)))
        return debiased_loss
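# A minimal smoke test on toy data: the debiased loss expects L2-normalized embeddings,
# i.e. points on the unit circle in the 2-D case.
if __name__ == "__main__":
    example_debiased = debiased_contrastive_loss(n_pairs=5)
    example_a = tf.math.l2_normalize(tf.random.normal([8, 2]), axis=1)
    example_p = tf.math.l2_normalize(example_a + 0.05 * tf.random.normal([8, 2]), axis=1)
    print(example_debiased(example_a, example_p))  # scalar loss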