Fix memory estimation (#25)
* add loss_bwd_memory

* update log info

* format

* update bwd prefetch memory calc

* fix test

* added hsdp through rdp_size

* fix mlp flops calc

* Update coverage.yml
cli99 authored May 28, 2024
1 parent 6bacd4f commit a026ecd
Showing 10 changed files with 277 additions and 161 deletions.
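A note on the memory-estimation theme before the per-file diffs: the "update bwd prefetch memory calc" and "added hsdp through rdp_size" changes both hinge on the fact that only the sharded data-parallel dimension reduces per-GPU memory. A minimal sketch of that distinction, assuming illustrative byte counts and function names (this is not the repository's code):

def model_states_bytes_per_gpu(num_params: float,
                               weight_bytes: int = 2,      # fp16/bf16 weights
                               grad_bytes: int = 2,        # fp16/bf16 gradients
                               optimizer_bytes: int = 12,  # Adam: fp32 master + two moments
                               dp_size: int = 1,           # sharded data parallelism (FSDP/ZeRO)
                               rdp_size: int = 1) -> float:  # replicated data parallelism (HSDP)
    total = num_params * (weight_bytes + grad_bytes + optimizer_bytes)
    # Only the sharded dimension divides the footprint; rdp_size replicates
    # shards across groups and changes communication, not per-GPU capacity.
    return total / dp_size


# 70e9 params, fully sharded over 8 GPUs, replicated across 4 groups:
# ~130.4 GiB per GPU, unchanged by rdp_size.
print(model_states_bytes_per_gpu(70e9, dp_size=8, rdp_size=4) / 1024**3)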
5 changes: 3 additions & 2 deletions .github/workflows/coverage.yml
@@ -7,7 +7,8 @@ jobs:
       matrix:
         python-version: ['3.8']
     steps:
-      - uses: actions/checkout@v3
+      - name: Checkout repository
+        uses: actions/checkout@v4
 
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v4
@@ -25,7 +26,7 @@ jobs:
           poetry run pytest --cov=./ --cov-report=xml
       - name: Upload Coverage to Codecov
-        uses: codecov/codecov-action@v3
+        uses: codecov/codecov-action@v4
         with:
           token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
          fail_ci_if_error: true # optional (default = false)
15 changes: 13 additions & 2 deletions .pre-commit-config.yaml
@@ -18,15 +18,26 @@ repos:
       - id: fix-encoding-pragma
         args: [--remove]
   - repo: https://github.com/pycqa/flake8
-    rev: 4.0.1
+    rev: 7.0.0
     hooks:
       - id: flake8
         args: ["--config=.flake8"]
   - repo: https://github.com/google/yapf
     rev: v0.32.0
     hooks:
       - id: yapf
-        additional_dependencies: [toml]
+        name: yapf
+        description: "A formatter for Python files."
+        entry: yapf
+        args: [-i, -vv, -p] # inplace
+        language: python
+        types: [python]
+        additional_dependencies:
+          - "toml"
+  - repo: https://github.com/pycqa/isort
+    hooks:
+      - id: isort
+    rev: 5.12.0
   - repo: https://github.com/codespell-project/codespell
     rev: v2.1.0
     hooks:
12 changes: 5 additions & 7 deletions examples/llama2/run_infer_cursor.py
@@ -1,12 +1,10 @@
-from llm_analysis.config import (
-    ParallelismConfig,
-    get_dtype_config_by_name,
-    get_gpu_config_by_name,
-    get_model_config_by_name,
-)
-from llm_analysis.analysis import LLMAnalysis
 import csv
+
+from llm_analysis.analysis import LLMAnalysis
+from llm_analysis.config import (ParallelismConfig, get_dtype_config_by_name,
+                                 get_gpu_config_by_name,
+                                 get_model_config_by_name)
 
 gpu_name = "a100-sxm-80gb"
 dtype_name = "w16a16e16"
 model_name = "upstage/Llama-2-70b-instruct-v2"
338 changes: 227 additions & 111 deletions llm_analysis/analysis.py

Large diffs are not rendered by default.
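The analysis.py diff carries the headline fix but is not rendered above. Based only on the "add loss_bwd_memory" bullet in the commit message, here is a hedged sketch of what a loss-backward memory term generally accounts for: during the backward pass of the cross-entropy loss, the gradient of the logits is materialized alongside the logits themselves, each of shape (batch, seq_len, vocab_size). Every name below is hypothetical, not the function actually added in this commit.

def loss_bwd_memory_bytes(batch_size: int,
                          seq_len: int,
                          vocab_size: int,
                          act_bytes: int = 2,   # fp16/bf16 activations
                          tp_size: int = 1) -> int:
    # Logits and their gradient coexist during the loss backward; the vocab
    # dimension is sharded under tensor parallelism.
    logits_bytes = batch_size * seq_len * vocab_size * act_bytes
    return 2 * logits_bytes // tp_size


# batch 4, seq 4096, 32k vocab, bf16: ~1.95 GiB
print(loss_bwd_memory_bytes(4, 4096, 32000) / 1024**3)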

14 changes: 6 additions & 8 deletions llm_analysis/config.py
@@ -21,11 +21,8 @@
 
 import fire
 
-from llm_analysis.constant import (
-    DTYPE_CONFIG_DIR_NAME,
-    GPU_CONFIG_DIR_NAME,
-    MODEL_CONFIG_DIR_NAME,
-)
+from llm_analysis.constant import (DTYPE_CONFIG_DIR_NAME, GPU_CONFIG_DIR_NAME,
+                                   MODEL_CONFIG_DIR_NAME)
 from llm_analysis.logger import logger
 
 try:
@@ -116,8 +113,9 @@ class ParallelismConfig:
     tp_size: int = 1  # tensor parallelism size, Megatron-LM tensor parallelism implementation
     pp_size: int = 1  # pipeline parallelism size, Megatron-LM pipeline parallelism implementation
     dp_size: int = (
-        1  # data parallelism size, DeepSpeed Zero parallelism implementation
+        1  # sharded data parallelism size, PyTorch FSDP or DeepSpeed Zero parallelism implementation
     )
+    rdp_size: int = 1  # replicated data parallelism size, PyTorch HSDP implementation
     ep_size: int = 1  # expert parallelism size
     sp_size: int = None  # sequence parallelism size, Megatron-LM sequence parallelism implementation

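Given the field comments above, a plausible consistency check, stated as an assumption rather than the repository's actual formula, is that the product of all parallelism dimensions equals the total GPU count, with rdp_size multiplying in as the HSDP replication factor:

def total_num_gpus(tp_size: int = 1, pp_size: int = 1,
                   dp_size: int = 1, rdp_size: int = 1) -> int:
    # Sharded DP times replicated DP gives the full data-parallel width.
    return tp_size * pp_size * dp_size * rdp_size


# e.g. TP=8 within a node, PP=2, FSDP over 4 GPUs, replicated across 2 groups:
assert total_num_gpus(tp_size=8, pp_size=2, dp_size=4, rdp_size=2) == 128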
Expand Down Expand Up @@ -357,10 +355,10 @@ def get_model_config_by_name(name_or_path: str) -> ModelConfig:
model_configs[config.name] = config
return config
except Exception as e:
raise ValueError(f"unknown gpu config name: {e}")
raise ValueError(f"unknown model config name: {e}")
model_config = get_model_config_from_hf(name_or_path)
if model_config is None:
raise (
raise ValueError(
f"unknown model config name: {name_or_path}, and none is found on HuggingFace Hub"
)
return model_config
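The second fix above matters because Python only allows raising BaseException subclasses: the old "raise (f-string)" would itself fail with a TypeError instead of surfacing the message. With ValueError, a caller can handle an unknown name cleanly; a small usage sketch (the model name is deliberately bogus):

from llm_analysis.config import get_model_config_by_name

try:
    config = get_model_config_by_name("no-such-org/no-such-model")
except ValueError as e:
    print(f"model config lookup failed: {e}")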
2 changes: 2 additions & 0 deletions llm_analysis/utils.py
@@ -14,6 +14,8 @@
 
 
 def _num_to_string(num, precision=2, divisor=1024):
+    if num is None:
+        return None
     if num < 0:
         sign = '-'
         num = -num
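The new guard lets _num_to_string pass None through instead of failing on the num < 0 comparison (a TypeError on Python 3). A self-contained sketch of the helper's overall shape, with illustrative unit suffixes rather than the repository's exact ones:

def num_to_string(num, precision=2, divisor=1024):
    if num is None:
        return None  # propagate "no value" instead of raising on comparison
    sign = ''
    if num < 0:
        sign, num = '-', -num
    for unit in ('', ' K', ' M', ' G', ' T'):
        if num < divisor:
            return f"{sign}{round(num, precision)}{unit}"
        num /= divisor
    return f"{sign}{round(num, precision)} P"


assert num_to_string(None) is None
assert num_to_string(3 * 1024**3) == "3.0 G"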
16 changes: 8 additions & 8 deletions pyproject.toml
@@ -8,17 +8,17 @@ readme = "README.md"
 packages = [{ include = "llm_analysis" }]
 
 [tool.poetry.dependencies]
-python = "^3.8"
+python = ">=3.8"
 fire = "^0.5.0"
-huggingface-hub = "^0.14.1"
-transformers = "^4.28.1"
+# huggingface-hub = "^0.14.1"
+# transformers = "^4.28.1"
 
 [tool.poetry.group.dev.dependencies]
-pytest = "^7.3.1"
-coverage = { extras = ["toml"], version = "^7.2.5" }
-sphinx = "^7.0.0"
-sphinx-autodoc-typehints = "^1.23.0"
-pytest-cov = "^4.0.0"
+pytest = ">=7.3.1"
+coverage = { extras = ["toml"], version = ">=7.2.5" }
+sphinx = ">=7.0.0"
+sphinx-autodoc-typehints = ">=1.23.0"
+pytest-cov = ">=4.0.0"
 
 [tool.coverage.run]
 omit = [".*", "*/site-packages/*"]
14 changes: 5 additions & 9 deletions tests/test_config.py
@@ -12,18 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from llm_analysis.config import (
-    ModelConfig,
-    GPUConfig,
-    DtypeConfig,
-    get_dtype_config_by_name,
-    get_gpu_config_by_name,
-    get_model_config_by_name,
-)
+from llm_analysis.config import (DtypeConfig, GPUConfig, ModelConfig,
+                                 get_dtype_config_by_name,
+                                 get_gpu_config_by_name,
+                                 get_model_config_by_name)
 
 
 def test_get_model_config_by_name():
-    model_name = "facebook/opt-125m"
+    model_name = "facebook_opt-125m"
     model_config = get_model_config_by_name(model_name)
     assert isinstance(model_config, ModelConfig)
     assert model_config.num_layers == 12
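The renamed test input suggests local model configs are now looked up by a file-safe key with the "/" in a HuggingFace name replaced by "_", matching JSON files shipped under the model config directory. A hedged sketch of that convention, not the repository's actual lookup code:

def to_local_config_key(name_or_path: str) -> str:
    # "/" is not valid in a filename, so HF-style names are flattened.
    return name_or_path.replace("/", "_")


assert to_local_config_key("facebook/opt-125m") == "facebook_opt-125m"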
13 changes: 5 additions & 8 deletions tests/test_inference.py
@@ -12,14 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from llm_analysis.utils import within_range
 from llm_analysis.analysis import LLMAnalysis
-from llm_analysis.config import (
-    ParallelismConfig,
-    get_dtype_config_by_name,
-    get_gpu_config_by_name,
-    get_model_config_by_name,
-)
+from llm_analysis.config import (ParallelismConfig, get_dtype_config_by_name,
+                                 get_gpu_config_by_name,
+                                 get_model_config_by_name)
+from llm_analysis.utils import within_range
 
 TOLERANCE = 0.1
 
@@ -55,7 +52,7 @@ def test_fastertransformer_13b_tp1():
 
 
 def test_llama2_70b():
-    model_name = "upstage/Llama-2-70b-instruct-v2"
+    model_name = "upstage_Llama-2-70b-instruct-v2"
     dtype_name = "w16a16e16"
     gpu_name = "a100-sxm-80gb"

9 changes: 3 additions & 6 deletions tests/test_training.py
@@ -13,12 +13,9 @@
 # limitations under the License.
 
 from llm_analysis.analysis import ActivationRecomputation, DSZeRO, LLMAnalysis
-from llm_analysis.config import (
-    ParallelismConfig,
-    get_dtype_config_by_name,
-    get_gpu_config_by_name,
-    get_model_config_by_name,
-)
+from llm_analysis.config import (ParallelismConfig, get_dtype_config_by_name,
+                                 get_gpu_config_by_name,
+                                 get_model_config_by_name)
 from llm_analysis.utils import _latency_to_string, _num_to_string, within_range
 
 TOLERANCE = 0.05
