From bddc2df3c2e403e84d0efa3eafad895d6d1d5c91 Mon Sep 17 00:00:00 2001 From: Kye Date: Wed, 20 Dec 2023 12:02:08 -0500 Subject: [PATCH] [DOCS][FusedDenseGELUDense] --- docs/zeta/nn/modules/fused_gelu_dense.md | 140 ++++++++++++++++++++++ mkdocs.yml | 1 + tests/nn/modules/test_fused_gelu_dense.py | 70 +++++++++++ zeta/cloud/main.py | 2 +- zeta/nn/modules/fused_gelu_dense.py | 98 +++++++++++++++ zeta/optim/__init__.py | 2 +- 6 files changed, 311 insertions(+), 2 deletions(-) create mode 100644 docs/zeta/nn/modules/fused_gelu_dense.md create mode 100644 tests/nn/modules/test_fused_gelu_dense.py create mode 100644 zeta/nn/modules/fused_gelu_dense.py diff --git a/docs/zeta/nn/modules/fused_gelu_dense.md b/docs/zeta/nn/modules/fused_gelu_dense.md new file mode 100644 index 00000000..77868b86 --- /dev/null +++ b/docs/zeta/nn/modules/fused_gelu_dense.md @@ -0,0 +1,140 @@ +# `FusedDenseGELUDense` + +## Overview + +The `FusedDenseGELUDense` module is a versatile neural network layer designed for efficient computation of dense layers with GELU (Gaussian Error Linear Unit) activations. This documentation will provide an in-depth understanding of the module's architecture, purpose, parameters, and usage examples. + +## Table of Contents + +1. [Introduction](#introduction) +2. [Architecture](#architecture) +3. [Purpose](#purpose) +4. [Class Definition](#class-definition) + - [Parameters](#parameters) + - [Internal Layers](#internal-layers) +5. [Functionality and Usage](#functionality-and-usage) + - [Forward Pass](#forward-pass) +6. [Examples](#examples) + - [Basic Usage](#basic-usage) + - [Custom Configuration](#custom-configuration) + - [Quantization with bitsandbytes](#quantization-with-bitsandbytes) +7. [Additional Information](#additional-information) +8. [References](#references) + +--- + +## 1. Introduction + +The `FusedDenseGELUDense` module combines dense layers with GELU activations in a single neural network layer. 
This fusion improves computational efficiency and is particularly useful in various deep learning applications. + +## 2. Architecture + +The `FusedDenseGELUDense` layer consists of two dense sub-layers with a single GELU activation function applied between them. It takes an input tensor, projects it from `dim` to `dim_out`, applies GELU, and projects it back to `dim` to produce the final output. + +## 3. Purpose + +The primary purpose of the `FusedDenseGELUDense` layer is to efficiently compute dense transformations with GELU activations. It is designed for use in neural networks, providing a convenient way to incorporate these operations into deep learning models. + +## 4. Class Definition + +### Parameters + +- `dim` (int): Input dimension. The output of the layer has this dimension as well. +- `dim_out` (int): Output dimension of the first dense layer (the hidden width); the second dense layer projects back to `dim`. +- `bias` (bool, optional): Whether to include bias terms. Defaults to True. +- `has_fp16_weights` (bool, optional): Whether to use fp16 weights. Defaults to False. +- `threshold` (float, optional): Threshold for quantization. Defaults to 6.0. + +### Internal Layers + +The `FusedDenseGELUDense` layer consists of the following internal layers: + +1. `dense1`: The first dense layer. +2. `act`: The GELU activation function. +3. `dense2`: The second dense layer. + +## 5. Functionality and Usage + +### Forward Pass + +The `forward` method of the `FusedDenseGELUDense` layer performs the following operations: + +1. Applies the first dense layer (`dense1`) to the input tensor. +2. Applies the GELU activation function (`act`) to the result. +3. Applies the second dense layer (`dense2`) to the GELU-activated output. + +## 6. 
Examples + +### Basic Usage + +Here's a basic example of using the `FusedDenseGELUDense` layer: + +```python +import torch +from zeta.nn import FusedDenseGELUDense + +# Create an instance of FusedDenseGELUDense +model = FusedDenseGELUDense(dim=512, dim_out=1024) + +# Generate random input tensor +x = torch.randn(1, 512) + +# Forward pass +out = model(x) + +# Check the output shape +print(out.shape) # torch.Size([1, 512]) +``` + +### Custom Configuration + +You can customize the layer by specifying different parameters: + +```python +# Create a custom FusedDenseGELUDense layer +custom_model = FusedDenseGELUDense( + dim=256, dim_out=512, bias=False, has_fp16_weights=True, threshold=4.0 +) + +# Generate random input tensor +x = torch.randn(1, 256) + +# Forward pass with the custom configuration +out = custom_model(x) +``` + +### Quantization with bitsandbytes + +When the `bitsandbytes` library is installed, the dense layers are automatically created as 8-bit `bnb.nn.Linear8bitLt` layers (otherwise the module falls back to `torch.nn.Linear`), so no extra code is needed to enable quantization: + +```python +# Install bitsandbytes if not already installed +# pip install bitsandbytes + +import torch +from zeta.nn import FusedDenseGELUDense + +# Create an instance of FusedDenseGELUDense with quantization +quantized_model = FusedDenseGELUDense( + dim=512, dim_out=1024, has_fp16_weights=True, threshold=4.0 +) + +# Generate random input tensor +x = torch.randn(1, 512) + +# Forward pass with quantization +out = quantized_model(x) +``` + +## 7. Additional Information + +- The `FusedDenseGELUDense` layer efficiently combines dense and GELU activation operations. +- Custom configurations for bias, weight precision, and threshold are supported. +- Quantization can be enabled using the `bitsandbytes` library for further efficiency. + +## 8. 
References + +For more information on GELU activations and dense layers in PyTorch, refer to the official PyTorch documentation: + +- [GELU Activation Function](https://pytorch.org/docs/stable/generated/torch.nn.GELU.html) +- [Dense Layer](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) diff --git a/mkdocs.yml b/mkdocs.yml index 30720331..cc239ae2 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -109,6 +109,7 @@ nav: - MultiModalAdapterDenseNetwork: "zeta/nn/modules/mm_adapter.md" - CustomMLP: "zeta/nn/modules/custom_mlp.md" - PolymorphicNeuronLayer: "zeta/nn/modules/polymorphic_activation.md" + - FusedDenseGELUDense: "zeta/nn/modules/fused_gelu_dense.md" - zeta.nn.attention: - FlashAttention: "zeta/nn/attention/flash_attention.md" - MultiQueryAttention: "zeta/nn/attention/multiquery.md" diff --git a/tests/nn/modules/test_fused_gelu_dense.py b/tests/nn/modules/test_fused_gelu_dense.py new file mode 100644 index 00000000..5ea5ce5a --- /dev/null +++ b/tests/nn/modules/test_fused_gelu_dense.py @@ -0,0 +1,70 @@ +import pytest +import torch +from zeta.nn.modules.fused_gelu_dense import FusedDenseGELUDense + +def test_class_init(): + model = FusedDenseGELUDense(512, 1024) + + assert model.dim == 512 + assert model.dim_out == 1024 + assert model.bias == True + assert model.has_fp16_weights == False + assert model.threshold == 6.0 + +def test_class_init_with_args(): + model = FusedDenseGELUDense(512, 1024, bias=False, has_fp16_weights=True, threshold=5.0) + + assert model.dim == 512 + assert model.dim_out == 1024 + assert model.bias == False + assert model.has_fp16_weights == True + assert model.threshold == 5.0 + +def test_forward(): + model = FusedDenseGELUDense(512, 1024) + x = torch.randn(1, 512) + out = model(x) + + assert out.shape == torch.Size([1, 512]) + +def test_forward_with_different_input(): + model = FusedDenseGELUDense(512, 1024) + x = torch.randn(2, 512) + out = model(x) + + assert out.shape == torch.Size([2, 512]) + +def 
class FusedDenseGELUDense(nn.Module):
    """Two dense layers with a GELU activation applied between them.

    The input is projected from ``dim`` to ``dim_out`` by ``dense1``, passed
    through GELU, and projected back from ``dim_out`` to ``dim`` by ``dense2``,
    so the output keeps the trailing dimension of the input.

    When ``bitsandbytes`` can be imported, both dense layers are built as
    ``bnb.nn.Linear8bitLt``; otherwise they fall back to ``torch.nn.Linear``.

    Args:
        dim (int): Input feature dimension (also the output dimension).
        dim_out (int): Feature dimension produced by the first dense layer.
        bias (bool, optional): Whether the dense layers use a bias term.
            Defaults to True.
        has_fp16_weights (bool, optional): Forwarded to ``Linear8bitLt``;
            not used by the ``nn.Linear`` fallback. Defaults to False.
        threshold (float, optional): Forwarded to ``Linear8bitLt``; not used
            by the ``nn.Linear`` fallback. Defaults to 6.0.
        *args: Extra positional arguments forwarded to both dense-layer
            constructors, placed right after the two feature sizes.
        **kwargs: Extra keyword arguments forwarded to both dense-layer
            constructors.

    Examples:
        >>> x = torch.randn(1, 512)
        >>> model = FusedDenseGELUDense(512, 1024)
        >>> out = model(x)
        >>> out.shape
        torch.Size([1, 512])
    """

    def __init__(
        self,
        dim: int,
        dim_out: int,
        bias: bool = True,
        has_fp16_weights: bool = False,
        threshold: float = 6.0,
        *args,
        **kwargs,
    ):
        super().__init__()
        self.dim = dim
        self.dim_out = dim_out
        self.bias = bias
        self.has_fp16_weights = has_fp16_weights
        self.threshold = threshold

        # Guard only the optional import, so that errors raised while
        # constructing the layers are never mistaken for a missing dependency.
        try:
            import bitsandbytes as bnb
        except ImportError:
            bnb = None

        if bnb is not None:
            # Quantized path: 8-bit linear layers from bitsandbytes.
            self.dense1 = bnb.nn.Linear8bitLt(
                dim,
                dim_out,
                *args,
                bias=bias,
                has_fp16_weights=has_fp16_weights,
                threshold=threshold,
                **kwargs,
            )
            # Reverse projection back to the input width.
            self.dense2 = bnb.nn.Linear8bitLt(
                dim_out,
                dim,
                *args,
                bias=bias,
                has_fp16_weights=has_fp16_weights,
                threshold=threshold,
                **kwargs,
            )
        else:
            # Fallback path: plain torch linear layers. `bias` is passed as
            # its own keyword argument; a missing comma previously turned
            # this into `bias * args`, which silently disabled the bias.
            self.dense1 = nn.Linear(dim, dim_out, *args, bias=bias, **kwargs)
            self.dense2 = nn.Linear(dim_out, dim, *args, bias=bias, **kwargs)

        # Activation applied between the two dense layers.
        self.act = nn.GELU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply ``dense1``, then GELU, then ``dense2``.

        Args:
            x (torch.Tensor): Input tensor whose last dimension is ``dim``.

        Returns:
            torch.Tensor: Tensor with the same last dimension as ``x``.
        """
        x = self.dense1(x)
        x = self.act(x)
        x = self.dense2(x)
        return x