[CLEANUP]

kyegomez · Sep 1, 2024 · be17f4b · be17f4b
1 parent bc785bc
commit be17f4b
Show file tree

Hide file tree

Showing 41 changed files with 217 additions and 196 deletions.
diff --git a/README.md b/README.md
@@ -34,21 +34,26 @@ $ pip3 install -U zetascale
 
 ## Starting Your Journey
 
-Creating a model empowered with the aforementioned breakthrough research features is a breeze. Here's how to quickly materialize the renowned Flash Attention
+Creating a model empowered with the aforementioned breakthrough research features is a breeze. Here's how to quickly materialize the renowned Multi-Query Attention:
 
 ```python
 import torch
+from zeta import MultiQueryAttention
 
-from zeta.nn import FlashAttention
-
-q = torch.randn(2, 4, 6, 8)
-k = torch.randn(2, 4, 10, 8)
-v = torch.randn(2, 4, 10, 8)
+# Model
+model = MultiQueryAttention(
+ dim=512,
+ heads=8,
+)
 
-attention = FlashAttention(causal=False, dropout=0.1, flash=True)
-output = attention(q, k, v)
+# Input
+text = torch.randn(2, 4, 512)
 
+# Output
+output, _, _ = model(text)
 print(output.shape)
+print(output)
+
 ```
 
 
@@ -352,8 +357,8 @@ model = YourModelClass()
 # Quantize the model dynamically, specifying layers to quantize
 niva(
  model=model,
- model_path="path_to_pretrained_model_weights.pt",
- output_path="quantized_model.pt",
+ model_path="path_to_pretrained_model_weights.pt",
+ output_path="quantized_model.pt",
  quant_type="dynamic",
  quantize_layers=[nn.Linear, nn.Conv2d],
  dtype=torch.qint8,

diff --git a/audio_encoder.py b/audio_encoder.py
@@ -66,7 +66,7 @@ def __init__(
  )
 
  transformer_encoder_layer = TransformerEncoderLayer(
- d_model=cnn_channels * 8,
+ d_model=cnn_channels * 8,
  nhead=nhead,
  dim_feedforward=dim_feedforward,
  )

diff --git a/docs/zeta/index.md b/docs/zeta/index.md
@@ -327,8 +327,8 @@ model = YourModelClass()
 # Quantize the model dynamically, specifying layers to quantize
 niva(
  model=model,
- model_path="path_to_pretrained_model_weights.pt",
- output_path="quantized_model.pt",
+ model_path="path_to_pretrained_model_weights.pt",
+ output_path="quantized_model.pt",
  quant_type="dynamic",
  quantize_layers=[nn.Linear, nn.Conv2d],
  dtype=torch.qint8,

diff --git a/docs/zeta/nn/attention/multiquery.md b/docs/zeta/nn/attention/multiquery.md
@@ -15,7 +15,7 @@ class MultiQueryAttention(nn.Module):
 ```
 
 ### Parameters:
-- `d_model` (int): Dimension of the model.
+- `dim` (int): Dimension of the model.
 - `heads` (int): Number of parallel attention heads.
 - `attn_impl` (str, optional): Attention implementation type, can be either 'triton', 'flash', or 'torch'. Default is 'triton'.
 - `clip_qkv` (Optional[float]): Clipping value for query, key, and value. If specified, qkv is clamped within the range [-clip_qkv, clip_qkv].
@@ -68,7 +68,7 @@ import torch
 from zeta.nn import MultiQueryAttention
 
 # Initialize the attention module
-attention_layer = MultiQueryAttention(d_model=512, heads=8, attn_impl="torch")
+attention_layer = MultiQueryAttention(dim=512, heads=8, attn_impl="torch")
 
 # Random input tensor
 x = torch.rand(16, 10, 512) # Batch of 16, sequence length 10, embedding size 512

diff --git a/docs/zeta/nn/modules/averagemodelmerger.md b/docs/zeta/nn/modules/averagemodelmerger.md
@@ -19,8 +19,8 @@ class AverageModelMerger:
  model2 = nn.Linear(in_features=10, out_features=10)
  model3 = nn.Linear(in_features=10, out_features=10)
  merge = AverageModelMerger([model1, model2, model3])
- merged_model = merge.merge_models()
- print(merged_model)
+ merged_model = merge.merge_models()
+ print(merged_model)
  """
 ```
 
@@ -80,10 +80,10 @@ model3 = nn.Linear(in_features=10, out_features=10)
 merger = AverageModelMerger([model1, model2, model3])
 
 # Merge models
-merged_model = merger.merge_models()
+merged_model = merger.merge_models()
 
 # Print merged model
-print(merged_model)
+print(merged_model)
 ```
 
 ### Example 2
@@ -101,10 +101,10 @@ model3 = nn.Conv2d(3, 6, 5)
 merger = AverageModelMerger([model1, model2, model3])
 
 # Merge models
-merged_model = merger.merge_models()
+merged_model = merger.merge_models()
 
 # Print merged model
-print(merged_model)
+print(merged_model)
 ```
 
 ### Example 3
@@ -122,10 +122,10 @@ model3 = nn.CrossEntropyLoss()
 merger = AverageModelMerger([model1, model2, model3])
 
 # Merge models
-merged_model = merger.merge_models()
+merged_model = merger.merge_models()
 
 # Print merged model
-print(merged_model)
+print(merged_model)
 ```
 
 All the examples above demonstrate the basic usage of this class. When you have multiple trained models (e.g., models resulting from k-fold cross-validation or models trained on different datasets), you can use this class to merge or average their weights. The resulting model will carry averaged weights, giving a balanced representation of all the models.
diff --git a/docs/zeta/nn/modules/fused_gelu_dense.md b/docs/zeta/nn/modules/fused_gelu_dense.md
@@ -117,15 +117,15 @@ import torch
 from zeta.nn import FusedDenseGELUDense
 
 # Create an instance of FusedDenseGELUDense with quantization
-quantized_model = FusedDenseGELUDense(
+quantized_model = FusedDenseGELUDense(
  dim=512, dim_out=1024, has_fp16_weights=True, threshold=4.0
 )
 
 # Generate random input tensor
 x = torch.randn(1, 512)
 
 # Forward pass with quantization
-out = quantized_model(x)
+out = quantized_model(x)
 ```
 
 ## 7. Additional Information <a name="additional-information"></a>

diff --git a/docs/zeta/nn/modules/slerpmodelmerger.md b/docs/zeta/nn/modules/slerpmodelmerger.md
@@ -49,10 +49,10 @@ model1 = nn.Linear(10, 10)
 model2 = nn.Linear(10, 10)
 
 merger = SLERPModelMerger(model1, model2, 0.5)
-merged_model = merger.merge()
+merged_model = merger.merge()
 
 # This will output the merged state_dict
-print(merged_model.state_dict())
+print(merged_model.state_dict())
 ```
 
 The print statement will output the state_dict of the merged model. The state_dict is a Python dictionary that maps each layer to its corresponding parameters (tensors).

diff --git a/docs/zeta/quant/niva.md b/docs/zeta/quant/niva.md
@@ -70,8 +70,8 @@ model = YourModelClass()
 # Quantize the model dynamically, specifying layers to quantize
 niva(
  model=model,
- model_path="path_to_pretrained_model_weights.pt",
- output_path="quantized_model.pt",
+ model_path="path_to_pretrained_model_weights.pt",
+ output_path="quantized_model.pt",
  quant_type="dynamic",
  quantize_layers=[nn.Linear, nn.Conv2d],
  dtype=torch.qint8,
@@ -93,8 +93,8 @@ model = YourModelClass()
 # Quantize the entire model statically
 niva(
  model=model,
- model_path="path_to_pretrained_model_weights.pt",
- output_path="quantized_model.pt",
+ model_path="path_to_pretrained_model_weights.pt",
+ output_path="quantized_model.pt",
  quant_type="static",
  dtype=torch.qint8,
 )

diff --git a/docs/zeta/structs/autoregressivewrapper.md b/docs/zeta/structs/autoregressivewrapper.md
@@ -76,7 +76,7 @@ This method is particularly useful for generating multiple forecasted sequence p
 The `evaluate_and_select_best_solution()` method evaluates the solutions based on a reward model and returns the best one.
 
 ```python
-def evaluate_and_select_best_solution(self, solutions, reward_model)
+def evaluate_and_select_best_solution(self, solutions, rewardim)
 ```
 
 
@@ -113,7 +113,7 @@ The third example shows generating multiple solutions and selecting the best one
 ```python
 solutions = net.generate_n_solutions(start_tokens, n=5, seqlen=10)
 best_solution = net.evaluate_and_select_best_solution(
- solutions, reward_model=lambda x: -x.sum()
+ solutions, rewardim=lambda x: -x.sum()
 )
 ```
 In the example above, the reward model simply returns the negative sum of the sequence, so the solution with the lowest sum is selected as the best solution.

diff --git a/docs/zeta/structs/encoder.md b/docs/zeta/structs/encoder.md
@@ -33,9 +33,9 @@ from zeta.structs import AttentionLayers
 
 
 class MyEncoder(AttentionLayers):
- def __init__(self, d_model, nhead, num_layers):
- super().__init__(d_model=d_model, nhead=nhead, num_layers=num_layers)
- self.linear = nn.Linear(d_model, d_model)
+ def __init__(self, dim, nhead, num_layers):
+ super().__init__(dim=dim, nhead=nhead, num_layers=num_layers)
+ self.linear = nn.Linear(dim, dim)
 
  def forward(self, x):
  x = super().forward(x)
@@ -47,16 +47,16 @@ We built a custom encoder by extending the AttentionLayers, added a linear layer
 
 Firstly, let's initialize the model:
 ```python
-model = MyEncoder(d_model=512, nhead=8, num_layers=6)
+model = MyEncoder(dim=512, nhead=8, num_layers=6)
 ```
-The model is initialized with the dimensions of model `d_model=512`, number of heads `nhead=8`, and the number of layers `num_layers=6`.
+The model is initialized with a model dimension of `dim=512`, a number of heads `nhead=8`, and a number of layers `num_layers=6`.
 
 Now, let's define some dummy input data and pass it through the model:
 
 ```python
 import torch
 
-x = torch.randn(10, 32, 512) # (sequence_length, batch_size, d_model)
+x = torch.randn(10, 32, 512) # (sequence_length, batch_size, dim)
 output = model(x) # forward pass
 print(output.shape) # torch.Size([10, 32, 512])
 ```

diff --git a/docs/zeta/utils/main.md b/docs/zeta/utils/main.md
@@ -599,7 +599,7 @@ output = resnet_block(x, time_emb=time_emb)
 print(output.shape)
 ```
 
-## Function: load_model(path)
+## Function: loadim(path)
 Load a model from a file.
 
 ### Parameters:
@@ -610,9 +610,9 @@ Load a model from a file.
 
 ### Example:
 ```python
-from zeta.utils.main import load_model
+from zeta.utils.main import loadim
 
-model = load_model("model_checkpoint.pth")
+model = loadim("model_checkpoint.pth")
 print(model)
 ```
 

diff --git a/docs/zeta/utils/save_load.md b/docs/zeta/utils/save_load.md
@@ -82,7 +82,7 @@ model = MyModel(32, 10)
 model.save("model.pt")
 
 # Load your model
-loaded_model = MyModel.load("model.pt")
+loaded_model = MyModel.load("model.pt")
 ```
 
 ### Example 2: Using the `save_load` with non-default arguments

diff --git a/docs/zeta/utils/save_load_wrapper.md b/docs/zeta/utils/save_load_wrapper.md
@@ -90,7 +90,7 @@ my_model = MyModel()
 my_model.save("my_model.pth")
 
 # Load the model checkpoint
-loaded_model = MyModel.load("my_model.pth")
+loaded_model = MyModel.load("my_model.pth")
 ```
 
 #### Custom Methods and Hooks <a name="custom-methods-and-hooks"></a>
@@ -171,13 +171,13 @@ class VersionedModel(Module):
 
 
 # Create an instance of VersionedModel
-versioned_model = VersionedModel()
+versioned_model = VersionedModel()
 
 # Save the model checkpoint
-versioned_model.save("versioned_model.pth")
+versioned_model.save("versioned_model.pth")
 
 # Load the model checkpoint with version compatibility check
-loaded_versioned_model = VersionedModel.load("versioned_model.pth")
+loaded_versioned_model = VersionedModel.load("versioned_model.pth")
 ```
 
 ## 5. Additional Information <a name="additional-information"></a>

diff --git a/multi_query_attention.py b/multi_query_attention.py
@@ -0,0 +1,16 @@
+import torch
+from zeta import MultiQueryAttention
+
+# Model
+model = MultiQueryAttention(
+ dim=512,
+ heads=8,
+)
+
+# Input
+text = torch.randn(2, 4, 512)
+
+# Output
+output, _, _ = model(text)
+print(output.shape)
+print(output)
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "zetascale"
-version = "2.6.7"
+version = "2.6.9"
 description = "Rapidly Build, Optimize, and Train SOTA AI Models"
 authors = ["Zeta Team <[email protected]>"]
 license = "MIT"

diff --git a/tests/nn/attentions/test_mhaa.py b/tests/nn/attentions/test_mhaa.py
@@ -117,7 +117,7 @@ def test_attention_distribution(self):
  )
 
  def setUp(self):
- self.d_model = 128
+ self.dim = 128
  self.num_heads = 4
  self.dilation_rate = 2
  self.segment_size = 32
@@ -129,10 +129,10 @@ def setUp(self):
  self.batch_size = 10
  self.seq_len = 100
 
- self.x = torch.rand(self.batch_size, self.seq_len, self.d_model)
+ self.x = torch.rand(self.batch_size, self.seq_len, self.dim)
 
  self.sparse_dilated_attention = MultiheadAttention(
- self.d_model,
+ self.dim,
  self.num_heads,
  self.dilation_rate,
  self.segment_size,
@@ -145,7 +145,7 @@ def setUp(self):
  def test_forward_pass(self):
  output = self.sparse_dilated_attention(self.x)
  self.assertEqual(
- output.size(), (self.batch_size, self.seq_len, self.d_model)
+ output.size(), (self.batch_size, self.seq_len, self.dim)
  )
 
  def test_attention_outputs(self):

diff --git a/tests/nn/attentions/test_mqa.py b/tests/nn/attentions/test_mqa.py
@@ -5,16 +5,16 @@
 
 
 def test_multiqueryattention_initialization():
- model = MultiQueryAttention(d_model=512, heads=8)
+ model = MultiQueryAttention(dim=512, heads=8)
  assert isinstance(model, MultiQueryAttention)
- assert model.d_model == 512
+ assert model.dim == 512
  assert model.heads == 8
  assert model.head_dim == 64
  assert model.softmax_scale == 1 / 8
 
 
 def test_multiqueryattention_forward():
- model = MultiQueryAttention(d_model=512, heads=8)
+ model = MultiQueryAttention(dim=512, heads=8)
  x = torch.randn(1, 10, 512)
  output, attn_weights, past_key_value = model(x)
  assert output.shape == (1, 10, 512)
@@ -24,14 +24,14 @@ def test_multiqueryattention_forward():
 
 @pytest.mark.parametrize("x_len", [0])
 def test_multiqueryattention_forward_edge_cases(x_len):
- model = MultiQueryAttention(d_model=512, heads=8)
+ model = MultiQueryAttention(dim=512, heads=8)
  x = torch.randn(1, x_len, 512)
  with pytest.raises(Exception):
  model(x)
 
 
 def test_multiqueryattention_forward_invalid_dimensions():
- model = MultiQueryAttention(d_model=512, heads=8)
+ model = MultiQueryAttention(dim=512, heads=8)
  x = torch.randn(1, 10, 256)
  with pytest.raises(Exception):
  model(x)