diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py
index aca1b9e4d9dccf..bab8e70d99a5b5 100644
--- a/src/transformers/modeling_tf_pytorch_utils.py
+++ b/src/transformers/modeling_tf_pytorch_utils.py
@@ -57,7 +57,7 @@ def convert_tf_weight_name_to_pt_weight_name(
           transposed with regards to each other
     """
     if name_scope is not None:
-        if not tf_name.startswith(name_scope):
+        if not tf_name.startswith(name_scope) and "final_logits_bias" not in tf_name:
             raise ValueError(
                 f"Weight name {tf_name} does not start with name_scope {name_scope}. This is an internal error "
                 "in Transformers, so (unless you were doing something really evil) please open an issue to report it!"
diff --git a/src/transformers/models/deberta/modeling_tf_deberta.py b/src/transformers/models/deberta/modeling_tf_deberta.py
index e4709268721517..0509403bb0a4ac 100644
--- a/src/transformers/models/deberta/modeling_tf_deberta.py
+++ b/src/transformers/models/deberta/modeling_tf_deberta.py
@@ -638,10 +638,10 @@ def build(self, input_shape=None):
                 self.pos_dropout.build(None)
         if getattr(self, "pos_proj", None) is not None:
             with tf.name_scope(self.pos_proj.name):
-                self.pos_proj.build(None)
+                self.pos_proj.build([self.config.hidden_size])
         if getattr(self, "pos_q_proj", None) is not None:
             with tf.name_scope(self.pos_q_proj.name):
-                self.pos_q_proj.build(None)
+                self.pos_q_proj.build([self.config.hidden_size])
 
     def transpose_for_scores(self, tensor: tf.Tensor) -> tf.Tensor:
         shape = shape_list(tensor)[:-1] + [self.num_attention_heads, -1]
diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
index b0afdcc298a241..60ef671e1e89b7 100644
--- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
@@ -362,6 +362,9 @@ def __init__(self, config: DebertaV2Config, **kwargs):
         self.config = config
 
     def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
         with tf.name_scope("conv"):
             self.conv_kernel = self.add_weight(
                 name="kernel",
@@ -371,13 +374,9 @@ def build(self, input_shape=None):
             self.conv_bias = self.add_weight(
                 name="bias", shape=[self.config.hidden_size], initializer=tf.zeros_initializer()
             )
-        return
-        if self.built:
-            return
-        self.built = True
         if getattr(self, "LayerNorm", None) is not None:
             with tf.name_scope(self.LayerNorm.name):
-                self.LayerNorm.build(None)
+                self.LayerNorm.build([None, None, self.config.hidden_size])
         if getattr(self, "dropout", None) is not None:
             with tf.name_scope(self.dropout.name):
                 self.dropout.build(None)
@@ -453,7 +452,7 @@ def build(self, input_shape=None):
                 self.conv.build(None)
         if getattr(self, "LayerNorm", None) is not None:
             with tf.name_scope(self.LayerNorm.name):
-                self.LayerNorm.build([None, None, self.config.hidden_size])
+                self.LayerNorm.build([None, self.config.hidden_size])
         if getattr(self, "layer", None) is not None:
             for layer in self.layer:
                 with tf.name_scope(layer.name):
diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py
index 142616b2b09269..fc8e99e0578736 100644
--- a/src/transformers/models/hubert/modeling_tf_hubert.py
+++ b/src/transformers/models/hubert/modeling_tf_hubert.py
@@ -401,7 +401,6 @@ def __init__(self, filters, kernel_size, groups, explicit_padding, **kwargs):
         )
         self.explicit_padding = explicit_padding
         self.filter_axis = 2
-        self.initialized = False
         self.kernel_norm_axes = tf.constant([0, 1])
 
     def _init_norm(self):
@@ -428,13 +427,13 @@ def build(self, input_shape):
             dtype=self.weight_v.dtype,
             trainable=True,
         )
+        self._init_norm()
         self.bias = self.add_weight(name="bias", shape=(self.filters,), initializer="zeros", trainable=True)
 
     def call(self, inputs):
-        if not self.initialized:
-            self._init_norm()
-            self.initialized = True
-
+        # TODO Matt: Assigning to attributes in call() is deeply sinful in TensorFlow, as it should be idempotent.
+        # This whole layer should be replaced by a layer that doesn't inherit from Conv1D, but instead calls
+        # a functional 1d convolution with normalized weights that it generates (but does not store!)
         self._normalize_kernel()
 
         padded_inputs = tf.pad(inputs, ((0, 0), (self.explicit_padding, self.explicit_padding), (0, 0)))
diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py
index 002fcffbccf307..5a4bd20173568f 100644
--- a/src/transformers/models/rag/modeling_tf_rag.py
+++ b/src/transformers/models/rag/modeling_tf_rag.py
@@ -720,6 +720,15 @@ def call(
             generator_dec_attentions=gen_outputs.decoder_attentions,
         )
 
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        with tf.name_scope(self.generator.name):
+            self.generator.build(None)
+        with tf.name_scope(self.question_encoder.name):
+            self.question_encoder.build(None)
+
     @add_start_docstrings_to_model_forward(
         """
diff --git a/src/transformers/models/sam/modeling_tf_sam.py b/src/transformers/models/sam/modeling_tf_sam.py
index 565a646b117882..ded4ed5f4b4589 100644
--- a/src/transformers/models/sam/modeling_tf_sam.py
+++ b/src/transformers/models/sam/modeling_tf_sam.py
@@ -604,6 +604,9 @@ def build(self, input_shape=None):
         if getattr(self, "iou_prediction_head", None) is not None:
             with tf.name_scope(self.iou_prediction_head.name):
                 self.iou_prediction_head.build(None)
+        for mlp in self.output_hypernetworks_mlps:
+            with tf.name_scope(mlp.name):
+                mlp.build(None)
 
     def call(
         self,
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
index f5379f06d053d0..165b309fb57d00 100644
--- a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
@@ -247,16 +247,16 @@ def tf_to_pt_weight_rename(self, tf_weight):
         # However, the name of that extra layer is the name of the MainLayer in the base model.
         if "vision_model" in tf_weight:
             if tf_weight.count("vision_model") == 1:
-                return re.sub(r"vision_model\..*?\.", "vision_model.", tf_weight)
+                return (re.sub(r"vision_model\..*?\.", "vision_model.", tf_weight),)
             elif tf_weight.count("vision_model") == 2:
-                return re.sub(r"vision_model\..*?\.vision_model", "vision_model.vision_model", tf_weight)
+                return (re.sub(r"vision_model\..*?\.vision_model", "vision_model.vision_model", tf_weight),)
             else:
                 raise ValueError(
                     f"Unexpected weight name {tf_weight}. Please file an issue on the"
                     " Transformers repo to let us know about this error!"
                 )
         elif "text_model" in tf_weight:
-            return re.sub(r"text_model\..*?\.", "text_model.", tf_weight)
+            return (re.sub(r"text_model\..*?\.", "text_model.", tf_weight),)
         else:
             return (tf_weight,)
 
@@ -598,7 +598,7 @@ def from_vision_text_pretrained(
         if text_model.name != "text_model":
             raise ValueError("text model must be created with the name `text_model`.")
 
-        model.build()  # Ensure model is fully built
+        model.build_in_name_scope()  # Ensure model is fully built
 
         return model
 
diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
index 3251dd00aa52d3..9f2f5ab86f52b7 100644
--- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
@@ -435,7 +435,6 @@ def __init__(self, filters, kernel_size, groups, explicit_padding, **kwargs):
         )
         self.explicit_padding = explicit_padding
         self.filter_axis = 2
-        self.initialized = False
         self.kernel_norm_axes = tf.constant([0, 1])
 
     def _init_norm(self):
@@ -462,13 +461,13 @@ def build(self, input_shape):
             dtype=self.weight_v.dtype,
             trainable=True,
         )
+        self._init_norm()
         self.bias = self.add_weight(name="bias", shape=(self.filters,), initializer="zeros", trainable=True)
 
     def call(self, inputs):
-        if not self.initialized:
-            self._init_norm()
-            self.initialized = True
-
+        # TODO Matt: Assigning to attributes in call() is deeply sinful in TensorFlow, as it should be idempotent.
+        # This whole layer should be replaced by a layer that doesn't inherit from Conv1D, but instead calls
+        # a functional 1d convolution with normalized weights that it generates (but does not store!)
         self._normalize_kernel()
 
         padded_inputs = tf.pad(inputs, ((0, 0), (self.explicit_padding, self.explicit_padding), (0, 0)))
@@ -1208,13 +1207,13 @@ def __init__(self, config: Wav2Vec2Config, **kwargs):
         self.encoder = TFWav2Vec2Encoder(config, name="encoder")
 
     def build(self, input_shape=None):
-        self.masked_spec_embed = self.add_weight(
-            shape=(self.config.hidden_size,), initializer="uniform", trainable=True, name="masked_spec_embed"
-        )
-
         if self.built:
             return
         self.built = True
+        if self.config.mask_time_prob > 0.0 or self.config.mask_feature_prob > 0.0:
+            self.masked_spec_embed = self.add_weight(
+                shape=(self.config.hidden_size,), initializer="uniform", trainable=True, name="masked_spec_embed"
+            )
         if getattr(self, "feature_extractor", None) is not None:
             with tf.name_scope(self.feature_extractor.name):
                 self.feature_extractor.build(None)
diff --git a/tests/models/bert/test_tokenization_bert_tf.py b/tests/models/bert/test_tokenization_bert_tf.py
index 14a1c12fb9a1cc..e5f736ede71f8c 100644
--- a/tests/models/bert/test_tokenization_bert_tf.py
+++ b/tests/models/bert/test_tokenization_bert_tf.py
@@ -28,7 +28,7 @@ def __init__(self, tokenizer):
 
     def call(self, inputs):
         tokenized = self.tokenizer(inputs)
-        out = self.bert(**tokenized)
+        out = self.bert(tokenized)
         return out["pooler_output"]
 
 
@@ -41,13 +41,8 @@ class BertTokenizationTest(unittest.TestCase):
 
     def setUp(self):
         super().setUp()
-        self.tokenizers = [
-            BertTokenizer.from_pretrained(checkpoint) for checkpoint in (TOKENIZER_CHECKPOINTS * 2)
-        ]  # repeat for when fast_bert_tokenizer=false
-        self.tf_tokenizers = [TFBertTokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS] + [
-            TFBertTokenizer.from_pretrained(checkpoint, use_fast_bert_tokenizer=False)
-            for checkpoint in TOKENIZER_CHECKPOINTS
-        ]
+        self.tokenizers = [BertTokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS]
+        self.tf_tokenizers = [TFBertTokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS]
         assert len(self.tokenizers) == len(self.tf_tokenizers)
 
         self.test_sentences = [
@@ -94,15 +89,15 @@ def test_graph_mode(self):
             self.assertTrue(tf.reduce_all(eager_outputs[key] == compiled_outputs[key]))
 
     @slow
-    def test_saved_model(self):
+    def test_export_for_inference(self):
         for tf_tokenizer in self.tf_tokenizers:
             model = ModelToSave(tokenizer=tf_tokenizer)
             test_inputs = tf.convert_to_tensor(self.test_sentences)
             out = model(test_inputs)  # Build model with some sample inputs
             with TemporaryDirectory() as tempdir:
                 save_path = Path(tempdir) / "saved.model"
-                model.save(save_path)
-                loaded_model = tf.keras.models.load_model(save_path)
-                loaded_output = loaded_model(test_inputs)
+                model.export(save_path)
+                loaded_model = tf.saved_model.load(save_path)
+                loaded_output = loaded_model.serve(test_inputs)
             # We may see small differences because the loaded model is compiled, so we need an epsilon for the test
             self.assertLessEqual(tf.reduce_max(tf.abs(out - loaded_output)), 1e-5)
diff --git a/tests/models/rag/test_modeling_tf_rag.py b/tests/models/rag/test_modeling_tf_rag.py
index 0041b0e6c49ac4..a484017e60148b 100644
--- a/tests/models/rag/test_modeling_tf_rag.py
+++ b/tests/models/rag/test_modeling_tf_rag.py
@@ -1005,6 +1005,7 @@ def test_rag_sequence_from_pretrained(self):
                 retriever=rag_retriever,
                 config=rag_config,
             )
+            rag_sequence.build_in_name_scope()
             # check that the from pretrained methods work
             rag_sequence.save_pretrained(tmp_dirname)
             rag_sequence.from_pretrained(tmp_dirname, retriever=rag_retriever)
@@ -1056,6 +1057,7 @@ def test_rag_token_from_pretrained(self):
                 retriever=rag_retriever,
                 config=rag_config,
             )
+            rag_token.build_in_name_scope()
             # check that the from pretrained methods work
             rag_token.save_pretrained(tmp_dirname)
             rag_token.from_pretrained(tmp_dirname, retriever=rag_retriever)
diff --git a/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py
index 9d81a476531e0c..4a1e0bfdd040fa 100644
--- a/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py
+++ b/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py
@@ -858,6 +858,7 @@ def test_encoder_decoder_from_pretrained(self):
                 pretrained_encoder_dir,
                 pretrained_decoder_dir,
             )
+            enc_dec_model.build_in_name_scope()
             # check that the from pretrained methods work
             enc_dec_model.save_pretrained(tmp_dirname)
             enc_dec_model = TFVisionEncoderDecoderModel.from_pretrained(tmp_dirname)