Automated update from source repository
github-actions[bot] committed Oct 29, 2024
1 parent 3a04cdf commit 01b9d21
Showing 184 changed files with 432,090 additions and 1,426 deletions.
6 changes: 1 addition & 5 deletions .gitignore
@@ -10,9 +10,6 @@ __pycache__/
*.jpg
*.png

# Determined
.detignore

# Distribution / packaging
.Python
build/
@@ -151,8 +148,7 @@ Todo.json
cython_debug/

# Temporary test files
tests/luminous/files/
tests/aleph_alpha_scaling/files/
tests/.tmp/
debug_logs/
envs/
.data/
14 changes: 5 additions & 9 deletions .pre-commit-config.yaml
@@ -1,13 +1,9 @@
exclude: '^src/scaling/core/data/proto/'

repos:
# pip-compile-multi
- repo: https://github.com/peterdemin/pip-compile-multi
rev: v2.4.5
hooks:
- id: pip-compile-multi-verify
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.4.0
hooks:
# - id: check-yaml
- id: check-json
- id: pretty-format-json
args:
@@ -22,8 +18,8 @@ repos:
rev: v0.6.1
hooks:
- id: ruff
args: [--select, "I,E,F", --fix]
files: ^src|^tests
args: [--select, "I,E,F,TID252", --fix]
files: ^src|^tests|^examples
- id: ruff-format
exclude: ^src/scaling/core/trainer/warnings.txt
files: ^src|^tests
files: ^src|^tests|^examples
32 changes: 30 additions & 2 deletions CHANGELOG.md
@@ -4,10 +4,38 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
## [0.2.0] - 2024-10-29

## [0.1.0] - 2024-08-22
### Changed
- Use poetry with pip instead of conda with pip for env management
- Upgraded to PyTorch 2.4
- Renamed `allowed_missing_keys_in_optimizer_groups` to `allow_missing_params_in_optimizer`

### Removed
- Removed `finetune` from training config. This field is replaced by optimizer groups
- Removed `finetunable_parameters` from training config. This field is replaced by optimizer groups
- Removed `parameters_exclude` from training config. This field is replaced by optimizer groups
- Removed `use_separate_lr_on_embeddings` from training config. This field is replaced by optimizer groups

### Added
- Implemented U-MUP method
- Implemented FP8 linear layers for training and inference (naive casting, no per-tensor scaling)
- Tensor Statistics Recorder for monitoring activation and parameter distributions
- Configurable Loss Functions
- Configurable Language Modeling Head
- Added Cross Entropy Loss as Configurable Loss
- Added Contrastive Loss as Configurable Loss
- Added Memory Map Dataset based on Protobuf serialization
- Semantic Embedding Inference
- Semantic Embedding Inference Example
- Added `training_groups` for configurable optimizer groups (see the config sketch after this list)
- Added tests for Transformer example and MLP example
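
The removed fine-tuning fields above are all subsumed by the new optimizer groups. A minimal sketch of what a `training_groups` entry might look like in a training config (only `training_groups` and `allow_missing_params_in_optimizer` come from this changelog; the group-level keys are illustrative assumptions):

```python
# Minimal sketch only: "name", "learning_rate", and "parameters" are assumed
# field names for illustration; the actual schema is defined by the library's
# training config, not by this changelog.
training_config = {
    "training_groups": [
        {
            "name": "embeddings",           # assumed: label for the optimizer group
            "learning_rate": 1.0e-5,        # assumed: per-group learning rate
            "parameters": ["*embedding*"],  # assumed: parameter selection pattern
        },
        {
            "name": "default",
            "learning_rate": 1.0e-4,
        },
    ],
    # Renamed option from the "Changed" section above.
    "allow_missing_params_in_optimizer": True,
}
```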

### Fixed
- Fixed Pydantic warning on startup


## [0.1.0] - 2024-08-22

### Added
- Added core and transformer modules
16 changes: 12 additions & 4 deletions README.md
@@ -36,20 +36,22 @@ Among the featured architecture options we support:

The installation requires Linux with Python 3.10 and PyTorch 2.1.1.
You will also need the appropriate CUDA dependencies and version installed on your system for GPU support.
Clone this repository and install:
Clone this repository and install via [poetry](https://python-poetry.org/docs/):

```bash
pip install .
poetry install
```

See also the "Development" section below for additional, optional steps.

### Flash Attention

To install Flash Attention, you need to make sure you have PyTorch installed already.
Simply install the base dependencies with `pip install .` before installing Flash Attention.
Then install Flash Attention with:

```bash
pip install .[gpu_optimization]
poetry run pip install --no-build-isolation flash-attn==2.4.2
```

Ensure that your environment variables are set correctly.
@@ -114,6 +116,12 @@ The MLP example is the best way to start if you want to learn about how to use t

# Development

Please install pre-commit hooks:

```bash
pre-commit install
```

Additional dependencies are required if you want to run tests or type checks.
Install them as follows:

Empty file.
30 changes: 30 additions & 0 deletions examples/inference_example/embedding.py
@@ -0,0 +1,30 @@
import argparse
from pathlib import Path

from scaling.core.logging import logger
from scaling.transformer.inference import TransformerInferenceModule


def main(checkpoint_dir: str, vocab_file: str | Path | None) -> None:
if vocab_file is not None:
vocab_file = Path(vocab_file)
model = TransformerInferenceModule.from_checkpoint(
checkpoint_dir=Path(checkpoint_dir),
devices=[0],
vocab_file=vocab_file,
)
try:
while True:
text = input("Please enter the text to encode (Press Ctrl+C to exit):\n")
output = model.encode([text])
logger.info(output)
except KeyboardInterrupt:
exit(0)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_dir", type=str)
parser.add_argument("--vocab_file", default=None, type=str)
args = parser.parse_args()
main(args.checkpoint_dir, args.vocab_file)
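
The script above can also be used programmatically. A minimal usage sketch, mirroring the calls in the example (the checkpoint and vocab paths are placeholders, not files from this repository):

```python
from pathlib import Path

from scaling.transformer.inference import TransformerInferenceModule

# Placeholder paths: point these at a real checkpoint directory and vocab file.
model = TransformerInferenceModule.from_checkpoint(
    checkpoint_dir=Path("checkpoints/my_run"),
    devices=[0],
    vocab_file=Path("vocab.json"),
)

# encode() takes a list of strings, as in the interactive loop above.
output = model.encode(["An example sentence to embed."])
print(output)
```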
14 changes: 2 additions & 12 deletions examples/mlp_example/config.py
@@ -9,24 +9,14 @@
TopologyConfig,
TrainerConfig,
)

from scaling.core.logging import LoggerConfig


class MLPArchitectureConfig(BaseConfig):
n_hidden_layers: int = Field(
default=0,
ge=0, description=(
"The number of layers in the network, excluding input and "
"output layers."
)
)
hidden_dim: int = Field(
default=64,
gt=0, description=(
"The number of hidden units in each hidden layer."
)
default=0, ge=0, description=("The number of layers in the network, excluding input and " "output layers.")
)
hidden_dim: int = Field(default=64, gt=0, description=("The number of hidden units in each hidden layer."))


class TrainingConfig(BaseConfig):
3 changes: 1 addition & 2 deletions examples/mlp_example/context.py
@@ -1,8 +1,7 @@
from examples.mlp_example.config import MLPConfig
from scaling.core import BaseContext
from scaling.core.topology import Topology

from examples.mlp_example.config import MLPConfig


class MLPContext(BaseContext):
config: MLPConfig
41 changes: 16 additions & 25 deletions examples/mlp_example/data.py
@@ -1,26 +1,21 @@
from pathlib import Path
from typing import Any

import torch
import torchvision

from torchvision import transforms

from scaling.core import broadcast_data
from scaling.core import BaseDataset
from scaling.core import BaseDatasetBatch
from scaling.core import BaseDatasetItem
from scaling.core import BaseDataset, BaseDatasetBatch, BaseDatasetItem, broadcast_data
from scaling.core.topology import Topology


class MNISTDatasetItem(BaseDatasetItem):

def __init__(self, input_, target):
def __init__(self, input_: Any, target: Any) -> None:
self.input = torch.tensor(input_, dtype=torch.float16)
self.target = torch.tensor(target, dtype=torch.float16)


class MNISTDatasetBatch(BaseDatasetBatch):

def __init__(
self,
inputs: torch.Tensor | None = None,
@@ -29,23 +24,21 @@ def __init__(
self.inputs = inputs
self.targets = targets

def only_inputs(self):
def only_inputs(self) -> "MNISTDatasetBatch":
return MNISTDatasetBatch(inputs=self.inputs)

def only_targets(self):
def only_targets(self) -> "MNISTDatasetBatch":
return MNISTDatasetBatch(targets=self.targets)


class MNISTDataset(BaseDataset[
MNISTDatasetItem,
MNISTDatasetBatch,
MNISTDatasetBatch
]):

class MNISTDataset(BaseDataset[MNISTDatasetItem, MNISTDatasetBatch, MNISTDatasetBatch]):
def __init__(self, root: Path = Path("./.data"), train: bool = True):
transform = transforms.Compose([
transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,)),
])
transform = transforms.Compose(
[
transforms.ToTensor(),
transforms.Normalize((0.5,), (0.5,)),
]
)

self.dataset = torchvision.datasets.MNIST(
root=root,
@@ -54,7 +47,7 @@ def __init__(self, root: Path = Path("./.data"), train: bool = True):
download=True,
)

def __len__(self):
def __len__(self) -> int:
return len(self.dataset)

def __getitem__(self, index: int) -> MNISTDatasetItem:
@@ -63,10 +56,10 @@ def __getitem__(self, index: int) -> MNISTDatasetItem:
target=self.dataset[index][1],
)

def ident(self):
def ident(self) -> str:
return "MNIST"

def set_seed(self, seed: int, shuffle: bool = True):
def set_seed(self, seed: int, shuffle: bool = True) -> None:
return

def collate(self, batch: list[MNISTDatasetItem]) -> MNISTDatasetBatch:
@@ -86,9 +79,7 @@ def sync_batch_to_model_parallel(
assert batch is None
tensors = [None, None]

broadcast_tensors = broadcast_data(
tensors=tensors, dtype=torch.float16, topology=topology
)
broadcast_tensors = broadcast_data(tensors=tensors, dtype=torch.float16, topology=topology)

return MNISTDatasetBatch(
inputs=broadcast_tensors[0],