fix: explicitly using mask kwarg to use MultiHeadDotProductAttention …

…and also using sacrebleu
mlcommons · Dec 2, 2024 · 86029a7 · 86029a7
1 parent abbdc82
commit 86029a7
Show file tree

Hide file tree

Showing 3 changed files with 6 additions and 4 deletions.
diff --git a/algorithmic_efficiency/workloads/imagenet_resnet/imagenet_jax/randaugment.py b/algorithmic_efficiency/workloads/imagenet_resnet/imagenet_jax/randaugment.py
@@ -8,6 +8,7 @@
 import math
 
 import tensorflow as tf
+
 #from tensorflow_addons import image as contrib_image
 
 # This signifies the max integer that the controller RNN could predict for the

diff --git a/algorithmic_efficiency/workloads/wmt/wmt_jax/models.py b/algorithmic_efficiency/workloads/wmt/wmt_jax/models.py
@@ -224,7 +224,7 @@ def __call__(self, inputs, encoder_mask=None):
         dropout_rate=attention_dropout_rate,
         deterministic=cfg.deterministic)(cfg.attention_temp * x,
                                          x,
-                                         encoder_mask)
+                                         mask=encoder_mask)
 
     if cfg.dropout_rate is None:
       dropout_rate = 0.1
@@ -288,7 +288,7 @@ def __call__(self,
         broadcast_dropout=False,
         dropout_rate=attention_dropout_rate,
         deterministic=cfg.deterministic,
-        decode=cfg.decode)(cfg.attention_temp * x, x, decoder_mask)
+        decode=cfg.decode)(cfg.attention_temp * x, x, mask=decoder_mask)
     if cfg.dropout_rate is None:
       dropout_rate = 0.1
     else:
@@ -311,7 +311,7 @@ def __call__(self,
         dropout_rate=attention_dropout_rate,
         deterministic=cfg.deterministic)(cfg.attention_temp * y,
                                          encoded,
-                                         encoder_decoder_mask)
+                                         mask=encoder_decoder_mask)
 
     y = nn.Dropout(rate=dropout_rate)(y, deterministic=cfg.deterministic)
     y = y + x

diff --git a/algorithmic_efficiency/workloads/wmt/wmt_jax/workload.py b/algorithmic_efficiency/workloads/wmt/wmt_jax/workload.py
@@ -12,6 +12,7 @@
 import jax.numpy as jnp
 import numpy as np
 import optax
+import sacrebleu
 
 from algorithmic_efficiency import param_utils
 from algorithmic_efficiency import spec
@@ -203,7 +204,7 @@ def translate_and_calculate_bleu(self,
         predictions.append(self._decode_tokens(predicted[idx]))
 
     # Calculate BLEU score for translated eval corpus against reference.
-    bleu_score = bleu.corpus_bleu(predictions, [references]).score
+    bleu_score = sacrebleu.corpus_bleu(predictions, [references]).score
     return bleu_score
 
   def init_model_fn(