Phaze-A: Add MobileNetV3 encoder
torzdf committed May 4, 2022
1 parent 332394e commit 0189029
Showing 2 changed files with 61 additions and 63 deletions.
83 changes: 32 additions & 51 deletions plugins/train/model/phaze_a.py
@@ -2,15 +2,14 @@
""" Phaze-A Model by TorzDF with thanks to BirbFakes and the myriad of testers. """

import numpy as np
import tensorflow as tf

from lib.model.nn_blocks import (
Conv2D, Conv2DBlock, Conv2DOutput, ResidualBlock, UpscaleBlock, Upscale2xBlock,
UpscaleResizeImagesBlock)
from lib.model.normalization import (
AdaInstanceNormalization, GroupNormalization, InstanceNormalization, LayerNormalization,
RMSNormalization)
from lib.utils import get_backend, FaceswapError
from lib.utils import get_backend, get_tf_version, FaceswapError

from ._base import KerasModel, ModelBase, logger, _get_all_sub_models

@@ -62,6 +61,10 @@
keras_name="MobileNet", scaling=(-1, 1), default_size=224),
mobilenet_v2=dict(
keras_name="MobileNetV2", scaling=(-1, 1), default_size=224),
mobilenet_v3_large=dict(
keras_name="MobileNetV3Large", no_amd=True, tf_min=2.4, scaling=(-1, 1), default_size=224),
mobilenet_v3_small=dict(
keras_name="MobileNetV3Small", no_amd=True, tf_min=2.4, scaling=(-1, 1), default_size=224),
nasnet_large=dict(
keras_name="NASNetLarge", scaling=(-1, 1), default_size=331, enforce_for_weights=True),
nasnet_mobile=dict(
@@ -208,16 +211,32 @@ def _get_input_shape(self):
Input shape is calculated from the selected Encoder's input size, scaled by the
user-selected Input Scaling, then rounded down to the nearest 16 pixels.
Notes
-----
Some models (NasNet) require the input size to be of a certain dimension when loading
imagenet weights. In these instances the enforced size is used and a warning is raised
Returns
-------
tuple
The shape tuple for the input size to the Phaze-A model
"""
size = _MODEL_MAPPING[self.config["enc_architecture"]]["default_size"]
min_size = _MODEL_MAPPING[self.config["enc_architecture"]].get("min_size", 32)
arch = self.config["enc_architecture"]
enforce_size = _MODEL_MAPPING[arch].get("enforce_for_weights", False)
default_size = _MODEL_MAPPING[arch]["default_size"]
scaling = self.config["enc_scaling"] / 100
size = int(max(min_size, min(size, ((size * scaling) // 16) * 16)))
retval = (size, size, 3)

min_size = _MODEL_MAPPING[arch].get("min_size", 32)
size = int(max(min_size, min(default_size, ((default_size * scaling) // 16) * 16)))

if self.config["enc_load_weights"] and enforce_size and scaling != 1.0:
logger.warning("%s requires input size to be %spx when loading imagenet weights. "
"Adjusting input size from %spx to %spx",
arch, default_size, size, default_size)
retval = (default_size, default_size, 3)
else:
retval = (size, size, 3)

logger.debug("Encoder input set to: %s", retval)
return retval
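
A rough standalone sketch of the sizing rule above (the helper name and example values are hypothetical, not part of the faceswap API):

def scaled_input_size(default_size, scaling, min_size=32):
    """ Scale the encoder's default input size, floor to a multiple of 16
    and clamp between min_size and default_size. """
    return int(max(min_size, min(default_size, ((default_size * scaling) // 16) * 16)))

print(scaled_input_size(224, 0.6))  # 128 (224 * 0.6 = 134.4, floored to a multiple of 16)
print(scaled_input_size(224, 1.0))  # 224
print(scaled_input_size(224, 0.1))  # 32 (clamped up to min_size)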

Expand All @@ -238,7 +257,7 @@ def _validate_encoder_architecture(self):
raise FaceswapError(f"'{arch}' is not compatible with the AMD backend. Choose one of "
f"{valid}.")

tf_ver = float(".".join(tf.__version__.split(".")[:2])) # pylint:disable=no-member
tf_ver = get_tf_version()
tf_min = model.get("tf_min", 2.0)
if get_backend() != "amd" and tf_ver < tf_min:
raise FaceswapError(f"'{arch}' is not compatible with your version of Tensorflow. The "
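
The gate above reduces to a simple float comparison. A minimal sketch, assuming get_tf_version() returns the major.minor Tensorflow version as a float (matching the inline parsing it replaces):

import tensorflow as tf

def get_tf_version():
    # Mirrors the replaced inline parsing: "2.4.1" -> 2.4. Note this float
    # scheme would sort 2.10 before 2.4; a limitation of the approach.
    return float(".".join(tf.__version__.split(".")[:2]))

TF_MIN = 2.4  # e.g. the mobilenet_v3_* entries above
if get_tf_version() < TF_MIN:
    raise RuntimeError(f"This encoder requires Tensorflow >= {TF_MIN}")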
@@ -549,7 +568,10 @@ def _model_kwargs(self):
return dict(mobilenet=dict(alpha=self._config["mobilenet_width"],
depth_multiplier=self._config["mobilenet_depth"],
dropout=self._config["mobilenet_dropout"]),
mobilenet_v2=dict(alpha=self._config["mobilenet_width"]))
mobilenet_v2=dict(alpha=self._config["mobilenet_width"]),
mobilenet_v3=dict(alpha=self._config["mobilenet_width"],
minimalistic=self._config["mobilenet_minimalistic"],
include_preprocessing=False))
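
With hypothetical config values, the new mobilenet_v3 entry resolves to a plain keyword dictionary that is later handed to keras.applications (a sketch; the option names mirror the phaze_a_defaults.py entries below):

config = {"mobilenet_width": 1.0, "mobilenet_minimalistic": False}  # hypothetical values
kwargs = dict(alpha=config["mobilenet_width"],                 # width multiplier
              minimalistic=config["mobilenet_minimalistic"],   # strip SE/hard-swish/5x5 blocks
              include_preprocessing=False)                     # input scaling is done in __call__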

@property
def _selected_model(self):
@@ -559,22 +581,6 @@ def _selected_model(self):
model["kwargs"] = self._model_kwargs.get(arch, {})
return model

@property
def _model_input_shape(self):
""" tuple: The required input shape for the encoder model.
Notes
-----
NasNet does not allow custom input sizes when loading pre-trained weights, so we need to
resize the input for this model
"""
default_size = self._selected_model.get("default_size")
if self._config["enc_load_weights"] and self._selected_model.get("enforce_for_weights"):
retval = (default_size, default_size, 3)
else:
retval = self._input_shape
return retval

def __call__(self):
""" Create the Phaze-A Encoder Model.
@@ -583,12 +589,9 @@ def __call__(self):
:class:`keras.models.Model`
The selected Encoder Model
"""
input_ = Input(shape=self._model_input_shape)
input_ = Input(shape=self._input_shape)
var_x = input_

if self._input_shape != self._model_input_shape:
var_x = self._resize_inputs(var_x)

scaling = self._selected_model.get("scaling")
if scaling:
# Some models expect different scaling.
@@ -611,28 +614,6 @@

return KerasModel(input_, var_x, name="encoder")
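
The scaling branch above remaps inputs before they reach the backbone. A minimal sketch of that remapping, assuming inputs arrive in the 0.0 to 1.0 range (the exact expression is hidden in the collapsed hunk):

def rescale(batch, scaling):
    """ Remap a [0., 1.] batch to the encoder's expected range,
    e.g. scaling=(-1, 1) for the MobileNet family above. """
    low, high = scaling
    return batch * (high - low) + low  # (-1, 1) is equivalent to batch * 2.0 - 1.0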

def _resize_inputs(self, inputs):
""" Some models (specifically NasNet) need a specific input size when loading trained
weights. This is slightly hacky, but arbitrarily resize the input for these instances.
Parameters
----------
inputs: tensor
The input tensor to be resized
Returns
-------
tensor
The resized input tensor
"""
input_size = self._input_shape[0]
new_size = self._model_input_shape[0]
logger.debug("Resizing input for encoder: '%s' from %s to %s due to trained weights usage",
self._config["enc_architecture"], input_size, new_size)
scale = new_size / input_size
interp = "bilinear" if scale > 1 else "nearest"
return K.resize_images(size=scale, interpolation=interp)(inputs)

def _get_encoder_model(self):
""" Return the model defined by the selected architecture.
@@ -648,7 +629,7 @@
"""
if self._selected_model.get("keras_name"):
kwargs = self._selected_model["kwargs"]
kwargs["input_shape"] = self._model_input_shape
kwargs["input_shape"] = self._input_shape
kwargs["include_top"] = False
kwargs["weights"] = "imagenet" if self._config["enc_load_weights"] else None
retval = getattr(kapp, self._selected_model["keras_name"])(**kwargs)
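
For the MobileNetV3 encoders this getattr lookup amounts to something like the call below (a sketch; kapp is keras.applications as imported in this module, and include_preprocessing is only accepted by newer Tensorflow releases):

from tensorflow.keras import applications as kapp

encoder = getattr(kapp, "MobileNetV3Large")(input_shape=(224, 224, 3),
                                            include_top=False,  # headless feature extractor
                                            weights=None,       # or "imagenet" if enc_load_weights
                                            alpha=1.0,
                                            minimalistic=False,
                                            include_preprocessing=False)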
41 changes: 29 additions & 12 deletions plugins/train/model/phaze_a_defaults.py
@@ -52,7 +52,8 @@
if get_backend() != "amd":
_ENCODERS.extend(["efficientnet_b0", "efficientnet_b1", "efficientnet_b2", "efficientnet_b3",
"efficientnet_b4", "efficientnet_b5", "efficientnet_b6", "efficientnet_b7",
"resnet50_v2", "resnet101", "resnet101_v2", "resnet152", "resnet152_v2"])
"mobilenet_v3_large", "mobilenet_v3_small", "resnet50_v2", "resnet101",
"resnet101_v2", "resnet152", "resnet152_v2"])
_ENCODERS = sorted(_ENCODERS)


@@ -157,6 +158,9 @@
"\n\tmobilenet_v2: (32px - 224px). Additional MobileNet parameters can be set with "
"the 'mobilenet' options. Ref: MobileNetV2: Inverted Residuals and Linear "
"Bottlenecks (2018): https://arxiv.org/abs/1801.04381"
"\n\tmobilenet_v3: (32px - 224px). Additional MobileNet parameters can be set with "
"the 'mobilenet' options. Ref: Searching for MobileNetV3 (2019): "
"https://arxiv.org/pdf/1905.02244.pdf"
"\n\tnasnet: (32px - 331px (large) or 224px (mobile)). Ref: Learning Transferable "
"Architectures for Scalable Image Recognition (2017): "
"https://arxiv.org/abs/1707.07012"
@@ -569,20 +573,21 @@
"each layer. Values greater than 1.0 proportionally increase the number of filters "
"within each layer. 1.0 is the default number of layers used within the paper.\n"
"NB: This option is ignored for any non-mobilenet encoders.\n"
"NB: If loading ImageNet weights, then for mobilenet v1 only values of '0.25', "
"'0.5', '0.75' or '1.0 can be selected. For mobilenet v2 only values of '0.35', "
"'0.50', '0.75', '1.0', '1.3' or '1.4' can be selected",
"NB: If loading ImageNet weights, then for MobilenetV1 only values of '0.25', "
"'0.5', '0.75' or '1.0 can be selected. For MobilenetV2 only values of '0.35', "
"'0.50', '0.75', '1.0', '1.3' or '1.4' can be selected. For mobilenet_v3 only values "
"of '0.75' or '1.0' can be selected",
datatype=float,
min_max=(0.1, 2.0),
rounding=2,
group="mobilenet encoder configuration",
fixed=True),
mobilenet_depth=dict(
default=1,
info="The depth multiplier for mobilenet v1 encoder. This is the depth multiplier "
info="The depth multiplier for MobilenetV1 encoder. This is the depth multiplier "
"for depthwise convolution (known as the resolution multiplier within the original "
"paper).\n"
"NB: This option is only used for mobilenet v1 and is ignored for all other "
"NB: This option is only used for MobilenetV1 and is ignored for all other "
"encoders.\n"
"NB: If loading ImageNet weights, this must be set to 1.",
datatype=int,
@@ -592,13 +597,25 @@
fixed=True),
mobilenet_dropout=dict(
default=0.001,
info="The dropout rate for for mobilenet v1 encoder.\n"
"NB: This option is only used for mobilenet v1 and is ignored for all other "
"encoders.\n"
"NB: If loading ImageNet weights, this must be set to 1.0.",
info="The dropout rate for the MobilenetV1 encoder.\n"
"NB: This option is only used for MobilenetV1 and is ignored for all other "
"encoders.",
datatype=float,
min_max=(0.1, 2.0),
rounding=2,
min_max=(0.001, 2.0),
rounding=3,
group="mobilenet encoder configuration",
fixed=True),
mobilenet_minimalistic=dict(
default=False,
info="Use a minimalist version of MobilenetV3.\n"
"In addition to large and small models, MobilenetV3 also contains so-called "
"minimalistic models. These models have the same per-layer dimensions as "
"MobilenetV3; however, they do not utilize any of the advanced blocks "
"(squeeze-and-excite units, hard-swish, and 5x5 convolutions). While these models "
"are less efficient on CPU, they are much more performant on GPU/DSP.\n"
"NB: This option is only used for MobilenetV3 and is ignored for all other "
"encoders.",
datatype=bool,
group="mobilenet encoder configuration",
fixed=True),
)
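
As a sketch of what the new minimalistic option toggles at instantiation (standalone code, assuming Tensorflow >= 2.4):

from tensorflow.keras.applications import MobileNetV3Small

# Standard small model: squeeze-and-excite units, hard-swish and 5x5 convolutions.
standard = MobileNetV3Small(input_shape=(224, 224, 3), include_top=False,
                            weights=None, minimalistic=False)
# Minimalist variant: same per-layer dimensions, but plain ReLU and 3x3 convolutions.
minimal = MobileNetV3Small(input_shape=(224, 224, 3), include_top=False,
                           weights=None, minimalistic=True)
print(standard.count_params(), minimal.count_params())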
