From fb6c84b3d7fdfc3b52714cbeadcdf17cddc04eb5 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Tue, 21 May 2024 04:12:57 -0700 Subject: [PATCH] wip --- llm_analysis/__init__.py | 2 +- llm_analysis/analysis.py | 59 ++++++++++++++++++---------------------- llm_analysis/config.py | 4 +-- llm_analysis/utils.py | 2 ++ pyproject.toml | 16 +++++------ 5 files changed, 39 insertions(+), 44 deletions(-) diff --git a/llm_analysis/__init__.py b/llm_analysis/__init__.py index bcff761..e3f684f 100644 --- a/llm_analysis/__init__.py +++ b/llm_analysis/__init__.py @@ -10,4 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. +# limitations under the License. \ No newline at end of file diff --git a/llm_analysis/analysis.py b/llm_analysis/analysis.py index f887865..40a639d 100644 --- a/llm_analysis/analysis.py +++ b/llm_analysis/analysis.py @@ -393,8 +393,8 @@ def get_weight_memory_per_layer( weight_memory_per_layer = weight_memory_attn_per_layer + weight_memory_mlp_per_layer + weight_memory_layernorm_per_layer - logger.info( - f'weight_memory_attn_per_layer: {_num_to_string(weight_memory_attn_per_layer)}B, weight_memory_mlp_per_layer: {_num_to_string(weight_memory_mlp_per_layer)}B, weight_memory_layernorm_per_layer: {_num_to_string(weight_memory_layernorm_per_layer)}B' + logger.debug( + f'is_sharded: {is_sharded}, weight_memory_attn_per_layer: {_num_to_string(weight_memory_attn_per_layer)}B, weight_memory_mlp_per_layer: {_num_to_string(weight_memory_mlp_per_layer)}B, weight_memory_layernorm_per_layer: {_num_to_string(weight_memory_layernorm_per_layer)}B' ) if return_breakdown: @@ -1172,13 +1172,15 @@ def get_latency_fwd_per_layernorm( Returns: float: the latency in seconds for the forward pass of a single layernorm in a transformer layer """ + input_numel = seq_len * batch_size * self.model_config.hidden_dim + compute_latency = input_numel * 5 / (self.get_TFLOPS_per_gpu() * 10**12) activation_memory = self.get_activation_memory_per_layernorm( batch_size, seq_len, ) activation_memory_latency = activation_memory / ( self.get_gpu_hbm_bandwidth() * 10**9) - return activation_memory_latency + return max(compute_latency, activation_memory_latency) def get_latency_fwd_per_tp_comm(self, batch_size: int, seq_len: int, dtype_bytes: int) -> float: @@ -2060,19 +2062,7 @@ def training( estimated_bwd_prefetch_memory_per_gpu)) logger.info( - f"weight_memory_per_gpu: {_num_to_string(weight_memory_per_gpu)}B" - " (embedding_memory:" - f" {_num_to_string(weight_memory_embedding_per_gpu)}B)," - " optimizer_state_memory_per_gpu:" - f" {_num_to_string(optimizer_state_memory_per_gpu)}B," - " gradient_memory_per_gpu:" - f" {_num_to_string(gradient_memory_per_gpu)}B", - " estimated_fwd_prefetch_memory_per_gpu:" - f" {_num_to_string(estimated_fwd_prefetch_memory_per_gpu)}B", - " estimated_bwd_prefetch_memory_per_gpu:" - f" {_num_to_string(estimated_bwd_prefetch_memory_per_gpu)}B", - " memory_left:" - f" {_num_to_string(memory_left)}B", + f"weight_memory_per_gpu: {_num_to_string(weight_memory_per_gpu)}B (embedding_memory: {_num_to_string(weight_memory_embedding_per_gpu)}B), optimizer_state_memory_per_gpu: {_num_to_string(optimizer_state_memory_per_gpu)}B, gradient_memory_per_gpu: {_num_to_string(gradient_memory_per_gpu)}B, estimated_fwd_prefetch_memory_per_gpu: {_num_to_string(estimated_fwd_prefetch_memory_per_gpu)}B, 
estimated_bwd_prefetch_memory_per_gpu: {_num_to_string(estimated_bwd_prefetch_memory_per_gpu)}B" ) if memory_left < 0: @@ -2230,18 +2220,25 @@ def training( ds_zero=ds_zero, ) + latency_fwd_per_layer_attn_compute = self.get_latency_fwd_per_layer_attn( + batch_size_per_gpu, seq_len, False, activation_recomputation) + latency_fwd_per_layer_mlp_compute = self.get_latency_fwd_per_layer_mlp( + batch_size_per_gpu, seq_len, False, activation_recomputation) + latency_fwd_per_layernorm_compute = self.get_latency_fwd_per_layernorm( + batch_size_per_gpu, + seq_len, + layernorm_dtype_bytes, + ) + num_layers_per_gpu = int(self.model_config.num_layers / + self.parallelism_config.pp_size) if activation_recomputation == ActivationRecomputation.FULL: - latency_recompute = latency_fwd + latency_recompute = num_layers_per_gpu * (latency_fwd_per_layer_attn_compute + latency_fwd_per_layer_mlp_compute + 2 * latency_fwd_per_layernorm_compute) elif activation_recomputation == ActivationRecomputation.NORM_ATTN_NORM: - latency_recompute = self.get_latency_fwd_per_layer_attn( - batch_size_per_gpu, seq_len, False, activation_recomputation - ) + 2 * self.get_latency_fwd_per_layernorm( - batch_size_per_gpu, seq_len, layernorm_dtype_bytes) + latency_recompute = num_layers_per_gpu * (latency_fwd_per_layer_attn_compute + 2 * latency_fwd_per_layernorm_compute) elif activation_recomputation == ActivationRecomputation.ATTN: - latency_recompute = self.get_latency_fwd_per_layer_attn( - batch_size_per_gpu, seq_len, False, activation_recomputation) + latency_recompute = num_layers_per_gpu * latency_fwd_per_layer_attn_compute elif activation_recomputation == ActivationRecomputation.ATTN_COMPUTE: - latency_recompute = self.get_num_flops_total_attn_compute( + latency_recompute = num_layers_per_gpu * self.get_num_flops_total_attn_compute( batch_size_per_gpu, seq_len) / ( (self.parallelism_config.tp_size * self.parallelism_config.pp_size) * @@ -2300,6 +2297,7 @@ def training( else: total_training_latency = None + total_training_latency_using_flops = None gpu_hours = (total_training_latency * total_num_gpus / 3600 if total_training_latency is not None else None) @@ -2404,23 +2402,18 @@ def training( estimated_fwd_prefetch_memory_per_gpu, "estimated_bwd_prefetch_memory_per_gpu": estimated_bwd_prefetch_memory_per_gpu, - "estimated_peak_fwd_memory_per_gpu": - optimizer_state_memory_per_gpu + weight_memory_per_gpu + - activation_memory_per_gpu + estimated_fwd_prefetch_memory_per_gpu, - "estimated_peak_bwd_memory_per_gpu": - optimizer_state_memory_per_gpu + weight_memory_per_gpu + - activation_memory_per_gpu + estimated_bwd_prefetch_memory_per_gpu, - "memory_left_per_gpu": - memory_left, + "estimated_peak_allocated_memory_per_gpu": + self.gpu_config.mem_per_GPU_in_GB * 1024**3 - memory_left, "latency_per_micro_batch": latency_per_micro_batch, "latency_fwd": latency_fwd, } summary_dict.update(latency_fwd_breakdown) + device_tokens_per_sec = round(seq_len * batch_size_per_gpu / latency_per_iter, 2) summary_dict.update({ "latency_per_iter": latency_per_iter, - "iters_per_sec": round(1 / latency_per_iter, 2), + "device_tokens_per_sec": device_tokens_per_sec, "total_training_latency": total_training_latency, "latency_per_iter_using_flops": latency_per_iter_using_flops, "total_training_latency_using_flops": diff --git a/llm_analysis/config.py b/llm_analysis/config.py index a22d1c2..92f8e43 100644 --- a/llm_analysis/config.py +++ b/llm_analysis/config.py @@ -354,10 +354,10 @@ def get_model_config_by_name(name_or_path: str) -> ModelConfig: 
model_configs[config.name] = config return config except Exception as e: - raise ValueError(f"unknown gpu config name: {e}") + raise ValueError(f"unknown model config name: {e}") model_config = get_model_config_from_hf(name_or_path) if model_config is None: - raise ( + raise ValueError( f"unknown model config name: {name_or_path}, and none is found on HuggingFace Hub" ) return model_config diff --git a/llm_analysis/utils.py b/llm_analysis/utils.py index 3801b56..a1c2185 100644 --- a/llm_analysis/utils.py +++ b/llm_analysis/utils.py @@ -14,6 +14,8 @@ def _num_to_string(num, precision=2, divisor=1024): + if num is None: + return None if num < 0: sign = '-' num = -num diff --git a/pyproject.toml b/pyproject.toml index de49b27..68d5404 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,17 +8,17 @@ readme = "README.md" packages = [{ include = "llm_analysis" }] [tool.poetry.dependencies] -python = "^3.8" +python = ">=3.8" fire = "^0.5.0" -huggingface-hub = "^0.14.1" -transformers = "^4.28.1" +# huggingface-hub = "^0.14.1" +# transformers = "^4.28.1" [tool.poetry.group.dev.dependencies] -pytest = "^7.3.1" -coverage = { extras = ["toml"], version = "^7.2.5" } -sphinx = "^7.0.0" -sphinx-autodoc-typehints = "^1.23.0" -pytest-cov = "^4.0.0" +pytest = ">=7.3.1" +coverage = { extras = ["toml"], version = ">=7.2.5" } +sphinx = ">=7.0.0" +sphinx-autodoc-typehints = ">=1.23.0" +pytest-cov = ">=4.0.0" [tool.coverage.run] omit = [".*", "*/site-packages/*"]
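
Note on the get_latency_fwd_per_layernorm hunk: the patched method no longer returns the HBM-traffic latency alone; it takes the max of that term and a compute term of roughly 5 FLOPs per input element. The standalone sketch below illustrates this roofline-style estimate; the 2x read+write byte factor and the hardware numbers in the example call are assumptions for illustration, not values taken from llm_analysis configs.

# Standalone sketch of the patched layernorm forward-latency estimate:
# latency = max(compute term, HBM traffic term).
def layernorm_fwd_latency(batch_size: int, seq_len: int, hidden_dim: int,
                          dtype_bytes: int, tflops_per_gpu: float,
                          hbm_bandwidth_gbs: float) -> float:
    input_numel = batch_size * seq_len * hidden_dim
    # compute bound: ~5 FLOPs per element (mean, variance, normalize, scale, shift)
    compute_latency = input_numel * 5 / (tflops_per_gpu * 10**12)
    # memory bound: assumed read + write of the activation through HBM
    activation_bytes = 2 * input_numel * dtype_bytes
    memory_latency = activation_bytes / (hbm_bandwidth_gbs * 10**9)
    return max(compute_latency, memory_latency)

# Example: batch 4, seq 2048, hidden 4096, fp16, on a device assumed to
# sustain 312 TFLOPS and 2039 GB/s (A100-80GB-like numbers, illustration only).
print(layernorm_fwd_latency(4, 2048, 4096, 2, 312.0, 2039.0))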
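
Note on the activation-recomputation hunk in training(): the recompute latency is now assembled from precomputed per-layer attention, MLP, and layernorm forward latencies and scaled by the number of layers per pipeline stage, rather than reusing latency_fwd or unscaled per-layer calls. A minimal sketch of that dispatch follows; the enum member names mirror the patch, while their numeric values, the NONE level, and the helper signature are assumptions of the sketch.

from enum import Enum

class ActivationRecomputation(Enum):
    NONE = 0
    ATTN_COMPUTE = 1
    ATTN = 2
    NORM_ATTN_NORM = 3
    FULL = 4

def recompute_latency(level: ActivationRecomputation,
                      num_layers_per_gpu: int,
                      attn_latency: float,
                      mlp_latency: float,
                      layernorm_latency: float,
                      attn_compute_latency: float = 0.0) -> float:
    # per-layer forward latencies are assumed precomputed, as in the patch
    if level == ActivationRecomputation.FULL:
        return num_layers_per_gpu * (attn_latency + mlp_latency +
                                     2 * layernorm_latency)
    if level == ActivationRecomputation.NORM_ATTN_NORM:
        return num_layers_per_gpu * (attn_latency + 2 * layernorm_latency)
    if level == ActivationRecomputation.ATTN:
        return num_layers_per_gpu * attn_latency
    if level == ActivationRecomputation.ATTN_COMPUTE:
        return num_layers_per_gpu * attn_compute_latency
    return 0.0

print(recompute_latency(ActivationRecomputation.FULL, 8, 1e-3, 2e-3, 1e-4))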
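
Note on the summary-dict hunks: the patch reports device_tokens_per_sec in place of iters_per_sec and collapses the separate fwd/bwd peak-memory entries into a single estimated_peak_allocated_memory_per_gpu (total device memory minus memory_left). A small sketch of the two formulas with made-up inputs; in analysis.py they are derived from the computed latency_per_iter and memory_left values.

def device_tokens_per_sec(seq_len: int, batch_size_per_gpu: int,
                          latency_per_iter: float) -> float:
    # tokens processed by one device per second of training (replaces iters_per_sec)
    return round(seq_len * batch_size_per_gpu / latency_per_iter, 2)

def estimated_peak_allocated_memory_per_gpu(mem_per_gpu_in_gb: float,
                                            memory_left_bytes: float) -> float:
    # total device memory minus the analysis' leftover memory, replacing the
    # separate estimated_peak_fwd/bwd entries in the summary dict
    return mem_per_gpu_in_gb * 1024**3 - memory_left_bytes

print(device_tokens_per_sec(2048, 4, 1.5))                       # 5461.33
print(estimated_peak_allocated_memory_per_gpu(80, 12 * 1024**3))  # bytes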