diff --git a/src/anemoi/models/layers/attention.py b/src/anemoi/models/layers/attention.py
index 6c333dd..cbbf412 100644
--- a/src/anemoi/models/layers/attention.py
+++ b/src/anemoi/models/layers/attention.py
@@ -37,8 +37,8 @@ def __init__(
         window_size: Optional[int] = None,
         dropout_p: float = 0.0,
         use_flash_attention: bool = False,
-        softcap: float | None = None,
-        use_alibi_slopes: bool | None = None,
+        softcap: float = None,
+        use_alibi_slopes: bool = None,
     ):
         """Initialize MultiHeadSelfAttention.
diff --git a/src/anemoi/models/layers/block.py b/src/anemoi/models/layers/block.py
index 8be2a56..2c60125 100644
--- a/src/anemoi/models/layers/block.py
+++ b/src/anemoi/models/layers/block.py
@@ -64,8 +64,8 @@ def __init__(
         window_size: int,
         dropout_p: float = 0.0,
         use_flash_attention: bool = False,
-        softcap: float | None = None,
-        use_alibi_slopes: bool | None = None,
+        softcap: float = None,
+        use_alibi_slopes: bool = None,
     ):
         super().__init__()
diff --git a/src/anemoi/models/layers/chunk.py b/src/anemoi/models/layers/chunk.py
index 0e84234..dd039b2 100644
--- a/src/anemoi/models/layers/chunk.py
+++ b/src/anemoi/models/layers/chunk.py
@@ -75,8 +75,8 @@ def __init__(
         activation: str = "GELU",
         dropout_p: float = 0.0,
         use_flash_attention: bool = False,
-        softcap: float | None = None,
-        use_alibi_slopes: bool | None = None,
+        softcap: float = None,
+        use_alibi_slopes: bool = None,
     ) -> None:
         """Initialize TransformerProcessor.
diff --git a/src/anemoi/models/layers/processor.py b/src/anemoi/models/layers/processor.py
index 8cda610..6f6ab88 100644
--- a/src/anemoi/models/layers/processor.py
+++ b/src/anemoi/models/layers/processor.py
@@ -97,8 +97,8 @@ def __init__(
         mlp_hidden_ratio: int = 4,
         dropout_p: float = 0.1,
         use_flash_attention: bool = False,
-        softcap: float | None = 0.0,
-        use_alibi_slopes: Tensor | None = None,
+        softcap: float = 0.0,
+        use_alibi_slopes: Tensor = None,
         **kwargs,
     ) -> None:
         """Initialize TransformerProcessor.
@@ -120,7 +120,7 @@ def __init__(
         dropout_p: float, optional
             Dropout probability used for multi-head self attention, default 0.0
         softcap : float, optional
-            Anything > 0 activates softcapping flash attention, by default 0.0
+            Anything > 0 activates softcapping flash attention, by default None
         use_alibi_slopes : bool, optional
             Use aLiBI option, only used for flash attention, by default None
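
For context on the `softcap` argument touched above, here is a minimal, self-contained sketch of what softcapping attention logits means. This is an illustration only, not the flash-attention kernel that anemoi-models dispatches to; the function name `softcapped_attention` and the toy tensor shapes are invented for the example, and it assumes the common `softcap * tanh(logits / softcap)` formulation that values of `softcap > 0` are expected to enable.

```python
# Illustrative sketch only: tanh-based softcapping of attention logits.
# Not the fused flash-attention path used by anemoi-models.
import torch


def softcapped_attention(q, k, v, softcap=None):
    # q, k, v: (batch, heads, seq, head_dim)
    scores = q @ k.transpose(-2, -1) / q.shape[-1] ** 0.5
    if softcap is not None and softcap > 0:
        # Squash logits smoothly into (-softcap, softcap) before the softmax.
        scores = softcap * torch.tanh(scores / softcap)
    return torch.softmax(scores, dim=-1) @ v


q = k = v = torch.randn(1, 2, 8, 16)
out = softcapped_attention(q, k, v, softcap=30.0)
print(out.shape)  # torch.Size([1, 2, 8, 16])
```

With `softcap=None` or `0.0` the capping branch is skipped, which matches the docstring convention above that only values greater than 0 activate softcapping.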