Fix memory estimation (#25)
* add loss_bwd_memory

* update log info

* format

* update bwd prefetch memory calc

* fix test

* added hsdp through rdp_size

* fix mlp flops calc

* Update coverage.yml
cli99 authored May 28, 2024
1 parent 6bacd4f commit a026ecd
Showing 10 changed files with 277 additions and 161 deletions.
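A note on the memory-estimation theme before the per-file diffs: the "update bwd prefetch memory calc" and "added hsdp through rdp_size" changes both hinge on the fact that only the sharded data-parallel dimension reduces per-GPU memory. A minimal sketch of that distinction, assuming illustrative byte counts and function names (this is not the repository's code):

def model_states_bytes_per_gpu(num_params: float,
                               weight_bytes: int = 2,      # fp16/bf16 weights
                               grad_bytes: int = 2,        # fp16/bf16 gradients
                               optimizer_bytes: int = 12,  # Adam: fp32 master + two moments
                               dp_size: int = 1,           # sharded data parallelism (FSDP/ZeRO)
                               rdp_size: int = 1) -> float:  # replicated data parallelism (HSDP)
    total = num_params * (weight_bytes + grad_bytes + optimizer_bytes)
    # Only the sharded dimension divides the footprint; rdp_size replicates
    # shards across groups and changes communication, not per-GPU capacity.
    return total / dp_size


# 70e9 params, fully sharded over 8 GPUs, replicated across 4 groups:
# ~130.4 GiB per GPU, unchanged by rdp_size.
print(model_states_bytes_per_gpu(70e9, dp_size=8, rdp_size=4) / 1024**3)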
5 changes: 3 additions & 2 deletions .github/workflows/coverage.yml
@@ -7,7 +7,8 @@ jobs:
       matrix:
         python-version: ['3.8']
     steps:
-      - uses: actions/checkout@v3
+      - name: Checkout repository
+        uses: actions/checkout@v4
 
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v4
@@ -25,7 +26,7 @@ jobs:
           poetry run pytest --cov=./ --cov-report=xml
       - name: Upload Coverage to Codecov
-        uses: codecov/codecov-action@v3
+        uses: codecov/codecov-action@v4
         with:
           token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
          fail_ci_if_error: true # optional (default = false)
15 changes: 13 additions & 2 deletions .pre-commit-config.yaml
@@ -18,15 +18,26 @@ repos:
       - id: fix-encoding-pragma
         args: [--remove]
   - repo: https://github.com/pycqa/flake8
-    rev: 4.0.1
+    rev: 7.0.0
     hooks:
       - id: flake8
         args: ["--config=.flake8"]
   - repo: https://github.com/google/yapf
     rev: v0.32.0
     hooks:
       - id: yapf
-        additional_dependencies: [toml]
+        name: yapf
+        description: "A formatter for Python files."
+        entry: yapf
+        args: [-i, -vv, -p] # inplace
+        language: python
+        types: [python]
+        additional_dependencies:
+          - "toml"
+  - repo: https://github.com/pycqa/isort
+    hooks:
+      - id: isort
+    rev: 5.12.0
   - repo: https://github.com/codespell-project/codespell
     rev: v2.1.0
     hooks:
12 changes: 5 additions & 7 deletions examples/llama2/run_infer_cursor.py
@@ -1,12 +1,10 @@
-from llm_analysis.config import (
-    ParallelismConfig,
-    get_dtype_config_by_name,
-    get_gpu_config_by_name,
-    get_model_config_by_name,
-)
-from llm_analysis.analysis import LLMAnalysis
 import csv
+
+from llm_analysis.analysis import LLMAnalysis
+from llm_analysis.config import (ParallelismConfig, get_dtype_config_by_name,
+                                 get_gpu_config_by_name,
+                                 get_model_config_by_name)
 
 gpu_name = "a100-sxm-80gb"
 dtype_name = "w16a16e16"
 model_name = "upstage/Llama-2-70b-instruct-v2"
338 changes: 227 additions & 111 deletions llm_analysis/analysis.py

Large diffs are not rendered by default.
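The analysis.py diff carries the headline fix but is not rendered above. Based only on the "add loss_bwd_memory" bullet in the commit message, here is a hedged sketch of what a loss-backward memory term generally accounts for: during the backward pass of the cross-entropy loss, the gradient of the logits is materialized alongside the logits themselves, each of shape (batch, seq_len, vocab_size). Every name below is hypothetical, not the function actually added in this commit.

def loss_bwd_memory_bytes(batch_size: int,
                          seq_len: int,
                          vocab_size: int,
                          act_bytes: int = 2,   # fp16/bf16 activations
                          tp_size: int = 1) -> int:
    # Logits and their gradient coexist during the loss backward; the vocab
    # dimension is sharded under tensor parallelism.
    logits_bytes = batch_size * seq_len * vocab_size * act_bytes
    return 2 * logits_bytes // tp_size


# batch 4, seq 4096, 32k vocab, bf16: ~1.95 GiB
print(loss_bwd_memory_bytes(4, 4096, 32000) / 1024**3)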

14 changes: 6 additions & 8 deletions llm_analysis/config.py
@@ -21,11 +21,8 @@
 
 import fire
 
-from llm_analysis.constant import (
-    DTYPE_CONFIG_DIR_NAME,
-    GPU_CONFIG_DIR_NAME,
-    MODEL_CONFIG_DIR_NAME,
-)
+from llm_analysis.constant import (DTYPE_CONFIG_DIR_NAME, GPU_CONFIG_DIR_NAME,
+                                   MODEL_CONFIG_DIR_NAME)
 from llm_analysis.logger import logger
 
 try:
@@ -116,8 +113,9 @@ class ParallelismConfig:
     tp_size: int = 1  # tensor parallelism size, Megatron-LM tensor parallelism implementation
     pp_size: int = 1  # pipeline parallelism size, Megatron-LM pipeline parallelism implementation
     dp_size: int = (
-        1  # data parallelism size, DeepSpeed Zero parallelism implementation
+        1  # sharded data parallelism size, PyTorch FSDP or DeepSpeed Zero parallelism implementation
     )
+    rdp_size: int = 1  # replicated data parallelism size, PyTorch HSDP implementation
     ep_size: int = 1  # expert parallelism size
     sp_size: int = None  # sequence parallelism size, Megatron-LM sequence parallelism implementation

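Given the field comments above, a plausible consistency check, stated as an assumption rather than the repository's actual formula, is that the product of all parallelism dimensions equals the total GPU count, with rdp_size multiplying in as the HSDP replication factor:

def total_num_gpus(tp_size: int = 1, pp_size: int = 1,
                   dp_size: int = 1, rdp_size: int = 1) -> int:
    # Sharded DP times replicated DP gives the full data-parallel width.
    return tp_size * pp_size * dp_size * rdp_size


# e.g. TP=8 within a node, PP=2, FSDP over 4 GPUs, replicated across 2 groups:
assert total_num_gpus(tp_size=8, pp_size=2, dp_size=4, rdp_size=2) == 128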
Expand Down Expand Up @@ -357,10 +355,10 @@ def get_model_config_by_name(name_or_path: str) -> ModelConfig:
model_configs[config.name] = config
return config
except Exception as e:
raise ValueError(f"unknown gpu config name: {e}")
raise ValueError(f"unknown model config name: {e}")
model_config = get_model_config_from_hf(name_or_path)
if model_config is None:
raise (
raise ValueError(
f"unknown model config name: {name_or_path}, and none is found on HuggingFace Hub"
)
return model_config
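The second fix above matters because Python only allows raising BaseException subclasses: the old "raise (f-string)" would itself fail with a TypeError instead of surfacing the message. With ValueError, a caller can handle an unknown name cleanly; a small usage sketch (the model name is deliberately bogus):

from llm_analysis.config import get_model_config_by_name

try:
    config = get_model_config_by_name("no-such-org/no-such-model")
except ValueError as e:
    print(f"model config lookup failed: {e}")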
2 changes: 2 additions & 0 deletions llm_analysis/utils.py
@@ -14,6 +14,8 @@
 
 
 def _num_to_string(num, precision=2, divisor=1024):
+    if num is None:
+        return None
     if num < 0:
         sign = '-'
         num = -num
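The new guard lets _num_to_string pass None through instead of failing on the num < 0 comparison (a TypeError on Python 3). A self-contained sketch of the helper's overall shape, with illustrative unit suffixes rather than the repository's exact ones:

def num_to_string(num, precision=2, divisor=1024):
    if num is None:
        return None  # propagate "no value" instead of raising on comparison
    sign = ''
    if num < 0:
        sign, num = '-', -num
    for unit in ('', ' K', ' M', ' G', ' T'):
        if num < divisor:
            return f"{sign}{round(num, precision)}{unit}"
        num /= divisor
    return f"{sign}{round(num, precision)} P"


assert num_to_string(None) is None
assert num_to_string(3 * 1024**3) == "3.0 G"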
16 changes: 8 additions & 8 deletions pyproject.toml
@@ -8,17 +8,17 @@ readme = "README.md"
 packages = [{ include = "llm_analysis" }]
 
 [tool.poetry.dependencies]
-python = "^3.8"
+python = ">=3.8"
 fire = "^0.5.0"
-huggingface-hub = "^0.14.1"
-transformers = "^4.28.1"
+# huggingface-hub = "^0.14.1"
+# transformers = "^4.28.1"
 
 [tool.poetry.group.dev.dependencies]
-pytest = "^7.3.1"
-coverage = { extras = ["toml"], version = "^7.2.5" }
-sphinx = "^7.0.0"
-sphinx-autodoc-typehints = "^1.23.0"
-pytest-cov = "^4.0.0"
+pytest = ">=7.3.1"
+coverage = { extras = ["toml"], version = ">=7.2.5" }
+sphinx = ">=7.0.0"
+sphinx-autodoc-typehints = ">=1.23.0"
+pytest-cov = ">=4.0.0"
 
 [tool.coverage.run]
 omit = [".*", "*/site-packages/*"]
14 changes: 5 additions & 9 deletions tests/test_config.py
@@ -12,18 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from llm_analysis.config import (
-    ModelConfig,
-    GPUConfig,
-    DtypeConfig,
-    get_dtype_config_by_name,
-    get_gpu_config_by_name,
-    get_model_config_by_name,
-)
+from llm_analysis.config import (DtypeConfig, GPUConfig, ModelConfig,
+                                 get_dtype_config_by_name,
+                                 get_gpu_config_by_name,
+                                 get_model_config_by_name)
 
 
 def test_get_model_config_by_name():
-    model_name = "facebook/opt-125m"
+    model_name = "facebook_opt-125m"
     model_config = get_model_config_by_name(model_name)
     assert isinstance(model_config, ModelConfig)
     assert model_config.num_layers == 12
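The renamed test input suggests local model configs are now looked up by a file-safe key with the "/" in a HuggingFace name replaced by "_", matching JSON files shipped under the model config directory. A hedged sketch of that convention, not the repository's actual lookup code:

def to_local_config_key(name_or_path: str) -> str:
    # "/" is not valid in a filename, so HF-style names are flattened.
    return name_or_path.replace("/", "_")


assert to_local_config_key("facebook/opt-125m") == "facebook_opt-125m"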
13 changes: 5 additions & 8 deletions tests/test_inference.py
@@ -12,14 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from llm_analysis.utils import within_range
 from llm_analysis.analysis import LLMAnalysis
-from llm_analysis.config import (
-    ParallelismConfig,
-    get_dtype_config_by_name,
-    get_gpu_config_by_name,
-    get_model_config_by_name,
-)
+from llm_analysis.config import (ParallelismConfig, get_dtype_config_by_name,
+                                 get_gpu_config_by_name,
+                                 get_model_config_by_name)
+from llm_analysis.utils import within_range
 
 TOLERANCE = 0.1
 
@@ -55,7 +52,7 @@ def test_fastertransformer_13b_tp1():
 
 
 def test_llama2_70b():
-    model_name = "upstage/Llama-2-70b-instruct-v2"
+    model_name = "upstage_Llama-2-70b-instruct-v2"
     dtype_name = "w16a16e16"
     gpu_name = "a100-sxm-80gb"

9 changes: 3 additions & 6 deletions tests/test_training.py
@@ -13,12 +13,9 @@
 # limitations under the License.
 
 from llm_analysis.analysis import ActivationRecomputation, DSZeRO, LLMAnalysis
-from llm_analysis.config import (
-    ParallelismConfig,
-    get_dtype_config_by_name,
-    get_gpu_config_by_name,
-    get_model_config_by_name,
-)
+from llm_analysis.config import (ParallelismConfig, get_dtype_config_by_name,
+                                 get_gpu_config_by_name,
+                                 get_model_config_by_name)
 from llm_analysis.utils import _latency_to_string, _num_to_string, within_range
 
 TOLERANCE = 0.05
