Skip to content

Commit

Permalink
[CLEANUP]
Browse files Browse the repository at this point in the history
  • Loading branch information
Kye Gomez authored and Kye Gomez committed Sep 1, 2024
1 parent bc785bc commit be17f4b
Show file tree
Hide file tree
Showing 41 changed files with 217 additions and 196 deletions.
25 changes: 15 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,21 +34,26 @@ $ pip3 install -U zetascale

## Starting Your Journey

Creating a model empowered with the aforementioned breakthrough research features is a breeze. Here's how to quickly materialize the renowned Flash Attention
Creating a model empowered with the aforementioned breakthrough research features is a breeze. Here's how to quickly materialize the renowned Multi-Query Attention:

```python
import torch
from zeta import MultiQueryAttention

from zeta.nn import FlashAttention

q = torch.randn(2, 4, 6, 8)
k = torch.randn(2, 4, 10, 8)
v = torch.randn(2, 4, 10, 8)
# Model
model = MultiQueryAttention(
dim=512,
heads=8,
)

attention = FlashAttention(causal=False, dropout=0.1, flash=True)
output = attention(q, k, v)
# Input
text = torch.randn(2, 4, 512)

# Output
output, _, _ = model(text)
print(output.shape)
print(output)

```


Expand Down Expand Up @@ -352,8 +357,8 @@ model = YourModelClass()
# Quantize the model dynamically, specifying layers to quantize
niva(
model=model,
model_path="path_to_pretrained_model_weights.pt",
output_path="quantized_model.pt",
model_path="path_to_pretrainedim_weights.pt",
output_path="quantizedim.pt",
quant_type="dynamic",
quantize_layers=[nn.Linear, nn.Conv2d],
dtype=torch.qint8,
Expand Down
2 changes: 1 addition & 1 deletion audio_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def __init__(
)

transformer_encoder_layer = TransformerEncoderLayer(
d_model=cnn_channels * 8,
dim=cnn_channels * 8,
nhead=nhead,
dim_feedforward=dim_feedforward,
)
Expand Down
4 changes: 2 additions & 2 deletions docs/zeta/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -327,8 +327,8 @@ model = YourModelClass()
# Quantize the model dynamically, specifying layers to quantize
niva(
model=model,
model_path="path_to_pretrained_model_weights.pt",
output_path="quantized_model.pt",
model_path="path_to_pretrainedim_weights.pt",
output_path="quantizedim.pt",
quant_type="dynamic",
quantize_layers=[nn.Linear, nn.Conv2d],
dtype=torch.qint8,
Expand Down
4 changes: 2 additions & 2 deletions docs/zeta/nn/attention/multiquery.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class MultiQueryAttention(nn.Module):
```

### Parameters:
- `d_model` (int): Dimension of the model.
- `dim` (int): Dimension of the model.
- `heads` (int): Number of parallel attention heads.
- `attn_impl` (str, optional): Attention implementation type, can be either 'triton', 'flash', or 'torch'. Default is 'triton'.
- `clip_qkv` (Optional[float]): Clipping value for query, key, and value. If specified, qkv is clamped within the range [-clip_qkv, clip_qkv].
Expand Down Expand Up @@ -68,7 +68,7 @@ import torch
from zeta.nn import MultiQueryAttention

# Initialize the attention module
attention_layer = MultiQueryAttention(d_model=512, heads=8, attn_impl="torch")
attention_layer = MultiQueryAttention(dim=512, heads=8, attn_impl="torch")

# Random input tensor
x = torch.rand(16, 10, 512) # Batch of 16, sequence length 10, embedding size 512
Expand Down
16 changes: 8 additions & 8 deletions docs/zeta/nn/modules/averagemodelmerger.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ class AverageModelMerger:
model2 = nn.Linear(in_features=10, out_features=10)
model3 = nn.Linear(in_features=10, out_features=10)
merge = AverageModelMerger([model1, model2, model3])
merged_model = merge.merge_models()
print(merged_model)
mergedim = merge.merge_models()
print(mergedim)
"""
```

Expand Down Expand Up @@ -80,10 +80,10 @@ model3 = nn.Linear(in_features=10, out_features=10)
merger = AverageModelMerger([model1, model2, model3])

# Merge models
merged_model = merger.merge_models()
mergedim = merger.merge_models()

# Print merged model
print(merged_model)
print(mergedim)
```

### Example 2
Expand All @@ -101,10 +101,10 @@ model3 = nn.Conv2d(3, 6, 5)
merger = AverageModelMerger([model1, model2, model3])

# Merge models
merged_model = merger.merge_models()
mergedim = merger.merge_models()

# Print merged model
print(merged_model)
print(mergedim)
```

### Example 3
Expand All @@ -122,10 +122,10 @@ model3 = nn.CrossEntropyLoss()
merger = AverageModelMerger([model1, model2, model3])

# Merge models
merged_model = merger.merge_models()
mergedim = merger.merge_models()

# Print merged model
print(merged_model)
print(mergedim)
```

All the examples above demonstrate the basic usage of this class. In cases where you have multiple trained models (e.g., models resulting from k-fold cross-validation or models trained on different datasets), you can use this class to merge or average their weights. The resulting model carries the averaged weights, giving a balanced representation of all the models.
4 changes: 2 additions & 2 deletions docs/zeta/nn/modules/fused_gelu_dense.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,15 +117,15 @@ import torch
from zeta.nn import FusedDenseGELUDense

# Create an instance of FusedDenseGELUDense with quantization
quantized_model = FusedDenseGELUDense(
quantizedim = FusedDenseGELUDense(
dim=512, dim_out=1024, has_fp16_weights=True, threshold=4.0
)

# Generate random input tensor
x = torch.randn(1, 512)

# Forward pass with quantization
out = quantized_model(x)
out = quantizedim(x)
```

## 7. Additional Information <a name="additional-information"></a>
Expand Down
4 changes: 2 additions & 2 deletions docs/zeta/nn/modules/slerpmodelmerger.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,10 @@ model1 = nn.Linear(10, 10)
model2 = nn.Linear(10, 10)

merger = SLERPModelMerger(model1, model2, 0.5)
merged_model = merger.merge()
mergedim = merger.merge()

# This will output the merged state_dict
print(merged_model.state_dict())
print(mergedim.state_dict())
```

The print statement will output the state_dict of the merged model. The state_dict is a Python dictionary that maps each layer to its corresponding parameters (tensors).
Expand Down
8 changes: 4 additions & 4 deletions docs/zeta/quant/niva.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,8 @@ model = YourModelClass()
# Quantize the model dynamically, specifying layers to quantize
niva(
model=model,
model_path="path_to_pretrained_model_weights.pt",
output_path="quantized_model.pt",
model_path="path_to_pretrainedim_weights.pt",
output_path="quantizedim.pt",
quant_type="dynamic",
quantize_layers=[nn.Linear, nn.Conv2d],
dtype=torch.qint8,
Expand All @@ -93,8 +93,8 @@ model = YourModelClass()
# Quantize the entire model statically
niva(
model=model,
model_path="path_to_pretrained_model_weights.pt",
output_path="quantized_model.pt",
model_path="path_to_pretrainedim_weights.pt",
output_path="quantizedim.pt",
quant_type="static",
dtype=torch.qint8,
)
Expand Down
4 changes: 2 additions & 2 deletions docs/zeta/structs/autoregressivewrapper.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ This method is particularly useful for generating multiple forecasted sequence p
The `evaluate_and_select_best_solution()` method evaluates the solutions based on a reward model and returns the best one.

```python
def evaluate_and_select_best_solution(self, solutions, reward_model)
def evaluate_and_select_best_solution(self, solutions, rewardim)
```


Expand Down Expand Up @@ -113,7 +113,7 @@ The third example shows generating multiple solutions and selecting the best one
```python
solutions = net.generate_n_solutions(start_tokens, n=5, seqlen=10)
best_solution = net.evaluate_and_select_best_solution(
solutions, reward_model=lambda x: -x.sum()
solutions, rewardim=lambda x: -x.sum()
)
```
In the example above, the reward model simply returns the negative sum of the sequence, so the solution with the lowest sum is selected as the best solution.
Expand Down
12 changes: 6 additions & 6 deletions docs/zeta/structs/encoder.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ from zeta.structs import AttentionLayers


class MyEncoder(AttentionLayers):
def __init__(self, d_model, nhead, num_layers):
super().__init__(d_model=d_model, nhead=nhead, num_layers=num_layers)
self.linear = nn.Linear(d_model, d_model)
def __init__(self, dim, nhead, num_layers):
super().__init__(dim=dim, nhead=nhead, num_layers=num_layers)
self.linear = nn.Linear(dim, dim)

def forward(self, x):
x = super().forward(x)
Expand All @@ -47,16 +47,16 @@ We built a custom encoder by extending the AttentionLayers, added a linear layer

Firstly, let's initialize the model:
```python
model = MyEncoder(d_model=512, nhead=8, num_layers=6)
model = MyEncoder(dim=512, nhead=8, num_layers=6)
```
The model is initialized with the dimensions of model `d_model=512`, number of heads `nhead=8`, and the number of layers `num_layers=6`.
The model is initialized with model dimension `dim=512`, number of attention heads `nhead=8`, and number of layers `num_layers=6`.

Now, let's define some dummy input data and pass it through the model:

```python
import torch

x = torch.randn(10, 32, 512) # (sequence_length, batch_size, d_model)
x = torch.randn(10, 32, 512) # (sequence_length, batch_size, dim)
output = model(x) # forward pass
print(output.shape) # torch.Size([10, 32, 512])
```
Expand Down
6 changes: 3 additions & 3 deletions docs/zeta/utils/main.md
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,7 @@ output = resnet_block(x, time_emb=time_emb)
print(output.shape)
```

## Function: load_model(path)
## Function: loadim(path)
Load a model from a file.

### Parameters:
Expand All @@ -610,9 +610,9 @@ Load a model from a file.

### Example:
```python
from zeta.utils.main import load_model
from zeta.utils.main import loadim

model = load_model("model_checkpoint.pth")
model = loadim("model_checkpoint.pth")
print(model)
```

Expand Down
2 changes: 1 addition & 1 deletion docs/zeta/utils/save_load.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ model = MyModel(32, 10)
model.save("model.pt")

# Load your model
loaded_model = MyModel.load("model.pt")
loadedim = MyModel.load("model.pt")
```

### Example 2: Using the `save_load` with non-default arguments
Expand Down
8 changes: 4 additions & 4 deletions docs/zeta/utils/save_load_wrapper.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ my_model = MyModel()
my_model.save("my_model.pth")

# Load the model checkpoint
loaded_model = MyModel.load("my_model.pth")
loadedim = MyModel.load("my_model.pth")
```

#### Custom Methods and Hooks <a name="custom-methods-and-hooks"></a>
Expand Down Expand Up @@ -171,13 +171,13 @@ class VersionedModel(Module):


# Create an instance of VersionedModel
versioned_model = VersionedModel()
versionedim = VersionedModel()

# Save the model checkpoint
versioned_model.save("versioned_model.pth")
versionedim.save("versionedim.pth")

# Load the model checkpoint with version compatibility check
loaded_versioned_model = VersionedModel.load("versioned_model.pth")
loaded_versionedim = VersionedModel.load("versionedim.pth")
```

## 5. Additional Information <a name="additional-information"></a>
Expand Down
16 changes: 16 additions & 0 deletions multi_query_attention.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import torch
from zeta import MultiQueryAttention

# Model: build a MultiQueryAttention layer with model dimension 512 and 8 heads.
model = MultiQueryAttention(
dim=512,
heads=8,
)

# Input: random tensor of shape (2, 4, 512); the last axis matches dim=512.
# NOTE(review): presumably laid out as (batch, sequence_length, dim) — confirm
# against the layer's forward signature.
text = torch.randn(2, 4, 512)

# Output: the call returns three values; only the first is used here.
# NOTE(review): the two discarded values look like attention weights and a
# past key/value cache — confirm before relying on them.
output, _, _ = model(text)
print(output.shape)
print(output)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "zetascale"
version = "2.6.7"
version = "2.6.9"
description = "Rapidly Build, Optimize, and Train SOTA AI Models"
authors = ["Zeta Team <[email protected]>"]
license = "MIT"
Expand Down
8 changes: 4 additions & 4 deletions tests/nn/attentions/test_mhaa.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def test_attention_distribution(self):
)

def setUp(self):
self.d_model = 128
self.dim = 128
self.num_heads = 4
self.dilation_rate = 2
self.segment_size = 32
Expand All @@ -129,10 +129,10 @@ def setUp(self):
self.batch_size = 10
self.seq_len = 100

self.x = torch.rand(self.batch_size, self.seq_len, self.d_model)
self.x = torch.rand(self.batch_size, self.seq_len, self.dim)

self.sparse_dilated_attention = MultiheadAttention(
self.d_model,
self.dim,
self.num_heads,
self.dilation_rate,
self.segment_size,
Expand All @@ -145,7 +145,7 @@ def setUp(self):
def test_forward_pass(self):
output = self.sparse_dilated_attention(self.x)
self.assertEqual(
output.size(), (self.batch_size, self.seq_len, self.d_model)
output.size(), (self.batch_size, self.seq_len, self.dim)
)

def test_attention_outputs(self):
Expand Down
10 changes: 5 additions & 5 deletions tests/nn/attentions/test_mqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@


def test_multiqueryattention_initialization():
model = MultiQueryAttention(d_model=512, heads=8)
model = MultiQueryAttention(dim=512, heads=8)
assert isinstance(model, MultiQueryAttention)
assert model.d_model == 512
assert model.dim == 512
assert model.heads == 8
assert model.head_dim == 64
assert model.softmax_scale == 1 / 8


def test_multiqueryattention_forward():
model = MultiQueryAttention(d_model=512, heads=8)
model = MultiQueryAttention(dim=512, heads=8)
x = torch.randn(1, 10, 512)
output, attn_weights, past_key_value = model(x)
assert output.shape == (1, 10, 512)
Expand All @@ -24,14 +24,14 @@ def test_multiqueryattention_forward():

@pytest.mark.parametrize("x_len", [0])
def test_multiqueryattention_forward_edge_cases(x_len):
model = MultiQueryAttention(d_model=512, heads=8)
model = MultiQueryAttention(dim=512, heads=8)
x = torch.randn(1, x_len, 512)
with pytest.raises(Exception):
model(x)


def test_multiqueryattention_forward_invalid_dimensions():
model = MultiQueryAttention(d_model=512, heads=8)
model = MultiQueryAttention(dim=512, heads=8)
x = torch.randn(1, 10, 256)
with pytest.raises(Exception):
model(x)
Loading

0 comments on commit be17f4b

Please sign in to comment.