diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 9aec3c7..03fb346 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -7,7 +7,8 @@ jobs: matrix: python-version: ['3.8'] steps: - - uses: actions/checkout@v3 + - name: Checkout repository + uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 @@ -25,7 +26,7 @@ jobs: poetry run pytest --cov=./ --cov-report=xml - name: Upload Coverage to Codecov - uses: codecov/codecov-action@v3 + uses: codecov/codecov-action@v4 with: token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos fail_ci_if_error: true # optional (default = false) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 92fb69b..129fc94 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,7 +18,7 @@ repos: - id: fix-encoding-pragma args: [--remove] - repo: https://github.com/pycqa/flake8 - rev: 4.0.1 + rev: 7.0.0 hooks: - id: flake8 args: ["--config=.flake8"] @@ -26,7 +26,18 @@ repos: rev: v0.32.0 hooks: - id: yapf - additional_dependencies: [toml] + name: yapf + description: "A formatter for Python files." + entry: yapf + args: [-i, -vv, -p] # inplace + language: python + types: [python] + additional_dependencies: + - "toml" + - repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort - repo: https://github.com/codespell-project/codespell rev: v2.1.0 hooks: diff --git a/examples/llama2/run_infer_cursor.py b/examples/llama2/run_infer_cursor.py index 1dd0e57..6f22a2a 100644 --- a/examples/llama2/run_infer_cursor.py +++ b/examples/llama2/run_infer_cursor.py @@ -1,12 +1,10 @@ -from llm_analysis.config import ( - ParallelismConfig, - get_dtype_config_by_name, - get_gpu_config_by_name, - get_model_config_by_name, -) -from llm_analysis.analysis import LLMAnalysis import csv +from llm_analysis.analysis import LLMAnalysis +from llm_analysis.config import (ParallelismConfig, get_dtype_config_by_name, + get_gpu_config_by_name, + get_model_config_by_name) + gpu_name = "a100-sxm-80gb" dtype_name = "w16a16e16" model_name = "upstage/Llama-2-70b-instruct-v2" diff --git a/llm_analysis/analysis.py b/llm_analysis/analysis.py index 71a3d6d..68d11bc 100644 --- a/llm_analysis/analysis.py +++ b/llm_analysis/analysis.py @@ -22,15 +22,10 @@ import fire -from llm_analysis.config import ( - DtypeConfig, - GPUConfig, - ModelConfig, - ParallelismConfig, - get_dtype_config_by_name, - get_gpu_config_by_name, - get_model_config_by_name, -) +from llm_analysis.config import (DtypeConfig, GPUConfig, ModelConfig, + ParallelismConfig, get_dtype_config_by_name, + get_gpu_config_by_name, + get_model_config_by_name) from llm_analysis.constant import * from llm_analysis.logger import logger from llm_analysis.utils import _latency_to_string, _num_to_string, within_range @@ -361,9 +356,11 @@ def get_num_active_params_total(self) -> int: self.get_num_params_last_layernorm()) def get_weight_memory_per_layer( - self, - ds_zero: DSZeRO = DSZeRO.NONE, - return_breakdown: bool = False) -> Union[float, tuple]: + self, + is_sharded: bool = False, + ds_zero: DSZeRO = DSZeRO.NONE, + return_breakdown: bool = False, + ) -> Union[float, tuple]: """Get the memory (in bytes) required to store the weights of a transformer layer, given the number of parameters in a transformer layer, the data type used for the weights, the tensor parallelism size, and the DeepSpeed ZeRO stage. 
With @@ -375,7 +372,7 @@ def get_weight_memory_per_layer( Returns: Union[float, tuple]: the memory (in bytes) required to store the weights of a transformer layer, or a tuple of its breakdown """ - if ds_zero == DSZeRO.STAGE_3: + if is_sharded and ds_zero == DSZeRO.STAGE_3: sharded_dp_size = self.parallelism_config.dp_size mlp_sharded_dp_size = self.parallelism_config.dp_size / self.parallelism_config.ep_size else: @@ -396,8 +393,8 @@ def get_weight_memory_per_layer( weight_memory_per_layer = weight_memory_attn_per_layer + weight_memory_mlp_per_layer + weight_memory_layernorm_per_layer - logger.info( - f'weight_memory_attn_per_layer: {_num_to_string(weight_memory_attn_per_layer)}B, weight_memory_mlp_per_layer: {_num_to_string(weight_memory_mlp_per_layer)}B, weight_memory_layernorm_per_layer: {_num_to_string(weight_memory_layernorm_per_layer)}B' + logger.debug( + f'is_sharded: {is_sharded}, weight_memory_attn_per_layer: {_num_to_string(weight_memory_attn_per_layer)}B, weight_memory_mlp_per_layer: {_num_to_string(weight_memory_mlp_per_layer)}B, weight_memory_layernorm_per_layer: {_num_to_string(weight_memory_layernorm_per_layer)}B' ) if return_breakdown: @@ -530,6 +527,7 @@ def get_memory_optimizer_state_and_gradient_last_layernorm( def get_memory_embedding( self, ds_zero: DSZeRO = DSZeRO.NONE, + is_sharded: bool = True, ) -> float: """Get the memory (in bytes) required to store the embedding layer, given the number of parameters in the embedding layer, the data type (defaults to FP32) @@ -545,6 +543,8 @@ def get_memory_embedding( dtype_bytes = self.dtype_config.embedding_bits / BITS_PER_BYTE memory_embedding = (self.get_num_params_embedding() / self.parallelism_config.tp_size) * dtype_bytes + if not is_sharded: + return memory_embedding if ds_zero == DSZeRO.STAGE_3: memory_embedding /= self.parallelism_config.dp_size @@ -692,8 +692,6 @@ def get_activation_memory_per_layer_mlp( bytes_per_gelu_input = mlp_activation_quant_bits / BITS_PER_BYTE bytes_per_2linear_input = mlp_activation_quant_bits / BITS_PER_BYTE - num_experts_per_gpu = self.model_config.moe_num_experts / ep_size - if is_inference: return max( bytes_per_1linear_input, @@ -757,11 +755,20 @@ def get_activation_memory_per_layernorm( return (seq_len * batch_size * self.model_config.hidden_dim / self.parallelism_config.sp_size) * dtype_bytes + def get_activation_memory_input_embedding(self, batch_size: int, + seq_len: int) -> float: + """Get the memory (in bytes) required to store the output activations of the input embedding layer (hidden states)""" + return self.model_config.hidden_dim * batch_size * seq_len * self.dtype_config.activation_bits / BITS_PER_BYTE / self.parallelism_config.tp_size + def get_activation_memory_output_embedding(self, batch_size: int, seq_len: int) -> float: """Get the memory (in bytes) required to store the activations of output embedding (logits)""" return self.model_config.vocab_size * batch_size * seq_len * self.dtype_config.activation_bits / BITS_PER_BYTE / self.parallelism_config.tp_size + def get_loss_bwd_memory(self, batch_size: int, seq_len: int) -> float: + """Get the temporary memory (in bytes) required for the backward pass of the loss function""" + return self.get_activation_memory_output_embedding(batch_size, seq_len) + def get_activation_memory_per_layer( self, batch_size: int, @@ -944,7 +951,9 @@ def get_num_flops_fwd_per_layer_mlp(self, batch_size: int, Returns: int: the number of floating point operations for the forward pass of the MLP module in a transformer layer """ - return 4 * batch_size * seq_len * 
self.model_config.hidden_dim**2 * self.model_config.expansion_ratio + return ( + 6 if self.model_config.mlp_gated_linear_units else 4 + ) * batch_size * seq_len * self.model_config.hidden_dim**2 * self.model_config.expansion_ratio def get_num_flops_fwd_per_layer( self, @@ -1091,7 +1100,7 @@ def get_latency_fwd_per_layer_mlp_moe_alltoall(self, batch_size: int, latency = data_bytes / ( (self.get_intra_node_bandwidth() if self.parallelism_config.ep_size <= 8 else self.get_inter_node_bandwidth()) * 10**9) - logger.info( + logger.debug( f'moe_alltoall data_bytes = {_num_to_string(data_bytes)}B, latency = {round(latency*1000, 3)} ms' ) return latency @@ -1174,13 +1183,16 @@ def get_latency_fwd_per_layernorm( Returns: float: the latency in seconds for the forward pass of a single layernorm in a transformer layer """ + input_numel = seq_len * batch_size * self.model_config.hidden_dim + compute_latency = input_numel * 5 / (self.get_TFLOPS_per_gpu() * + 10**12) activation_memory = self.get_activation_memory_per_layernorm( batch_size, seq_len, ) activation_memory_latency = activation_memory / ( self.get_gpu_hbm_bandwidth() * 10**9) - return activation_memory_latency + return max(compute_latency, activation_memory_latency) def get_latency_fwd_per_tp_comm(self, batch_size: int, seq_len: int, dtype_bytes: int) -> float: @@ -1500,9 +1512,10 @@ def output_summary_dict( summary_dict: dict, output_dir: str, print_human_readable: bool = True, + output_file_prefix: str = "", output_file_suffix: str = "", ): - file_name = self.get_configs_desc( + file_name = output_file_prefix + self.get_configs_desc( ) + output_file_suffix + "-summary.json" if not os.path.exists(output_dir): @@ -1519,9 +1532,8 @@ def output_summary_dict( f"Summary written to {os.path.join(output_dir, file_name)}") if print_human_readable: log_str = self.get_readable_summary_dict(summary_dict) - file_name = self.get_configs_desc( + file_name = output_file_prefix + self.get_configs_desc( ) + output_file_suffix + "-summary-readable.txt" - file_name = output_file_suffix + "-summary-readable.txt" with open(os.path.join(output_dir, file_name), "w") as f: f.write(log_str) logger.info( @@ -1539,6 +1551,7 @@ def inference( kv_cache_dtype_bytes: int = None, cost_per_gpu_hour: float = None, output_dir: str = None, + output_file_prefix: str = "", output_file_suffix: str = "", ) -> dict: """Inference analysis given the configs and inputs. 
@@ -1612,12 +1625,12 @@ def inference( is_inference=True, layernorm_dtype_bytes=layernorm_dtype_bytes, ) - prefill_activation_memory_embedding_output_batch_size_1 = self.get_activation_memory_output_embedding( + prefill_activation_memory_output_embedding_batch_size_1 = self.get_activation_memory_output_embedding( 1, seq_len) prefill_activation_memory_batch_size_1 = max( prefill_activation_memory_per_layer_batch_size_1, - prefill_activation_memory_embedding_output_batch_size_1) + prefill_activation_memory_output_embedding_batch_size_1) prefill_max_batch_size_per_gpu = int( memory_left / prefill_activation_memory_batch_size_1) @@ -1632,11 +1645,11 @@ def inference( is_inference=True, layernorm_dtype_bytes=layernorm_dtype_bytes, ) - prefill_activation_memory_embedding_output = self.get_activation_memory_output_embedding( + prefill_activation_memory_output_embedding = self.get_activation_memory_output_embedding( batch_size_per_gpu, seq_len) prefill_activation_memory_per_gpu = max( prefill_activation_memory_per_layer, - prefill_activation_memory_embedding_output) + prefill_activation_memory_output_embedding) logger.info("prefill_activation_memory_per_gpu with batch_size_per_gpu" f" {batch_size_per_gpu}:" @@ -1688,11 +1701,11 @@ def inference( is_inference=True, layernorm_dtype_bytes=layernorm_dtype_bytes, ) - decode_activation_memory_embedding_output = self.get_activation_memory_output_embedding( + decode_activation_memory_output_embedding = self.get_activation_memory_output_embedding( batch_size_per_gpu, 1) decode_activation_memory_per_gpu = max( decode_activation_memory_per_layer, - decode_activation_memory_embedding_output) + decode_activation_memory_output_embedding) logger.info( "kv_cache_memory_per_gpu:" @@ -1846,6 +1859,7 @@ def compute_cost_per_1k_tokens(tokens_per_sec): self.output_summary_dict(summary_dict, output_dir, print_human_readable=True, + output_file_prefix=output_file_prefix, output_file_suffix=output_file_suffix) return summary_dict @@ -1953,6 +1967,8 @@ def training( activation_recomputation: ActivationRecomputation = ActivationRecomputation.NONE, ds_zero: DSZeRO = DSZeRO.NONE, + fwd_prefetch: bool = True, + bwd_prefetch: bool = True, layernorm_dtype_bytes: int = BYTES_FP32, master_weights_dtype_bytes: int = BYTES_FP32, other_op_bytes: int = None, @@ -1964,6 +1980,7 @@ def training( mlp_2linear_quant_bits: int = None, mlp_recompute_gelu: bool = False, output_dir: str = None, + output_file_prefix: str = "", output_file_suffix: str = "", ) -> dict: """Training analysis given the configs and inputs. 
@@ -2018,17 +2035,20 @@ def training( "num_layers is not divisible by pp_size, taking the floor") weight_memory_embedding_per_gpu = self.get_memory_embedding(ds_zero) + unsharded_weight_memory_embedding = self.get_memory_embedding( + ds_zero, is_sharded=False) weight_memory_layers_per_gpu, weight_memory_attn_per_gpu, weight_memory_mlp_per_gpu, weight_memory_layernorm_per_gpu = [ - x * num_layers_per_gpu - for x in self.get_weight_memory_per_layer(ds_zero, - return_breakdown=True) + x * num_layers_per_gpu for x in self.get_weight_memory_per_layer( + is_sharded=True, ds_zero=ds_zero, return_breakdown=True) ] weight_memory_last_layernorm = self.get_weight_memory_last_layernorm( ds_zero) weight_memory_per_gpu = (weight_memory_embedding_per_gpu + weight_memory_layers_per_gpu + weight_memory_last_layernorm) + unsharded_weight_memory_per_layer, unsharded_weight_memory_attn_per_layer, unsharded_weight_memory_mlp_per_layer, unsharded_weight_memory_layernorm = self.get_weight_memory_per_layer( + is_sharded=False, ds_zero=ds_zero, return_breakdown=True) optimizer_state_memory_per_layer, gradient_memory_per_layer = self.get_memory_optimizer_state_and_gradient_per_layer( master_weights_dtype_bytes, other_op_bytes, ds_zero) @@ -2042,71 +2062,100 @@ def training( optimizer_state_memory_per_gpu = optimizer_state_memory_per_layer * num_layers_per_gpu + optimizer_state_memory_embedding + optimizer_state_memory_last_layernorm gradient_memory_per_gpu = gradient_memory_per_layer * num_layers_per_gpu + gradient_memory_embedding + gradient_memory_last_layernorm - self.weight_grad_op_state_memory_per_gpu = weight_memory_per_gpu + gradient_memory_per_gpu + optimizer_state_memory_per_gpu + self.weight_grad_op_state_memory_per_gpu = ( + weight_memory_per_gpu + optimizer_state_memory_per_gpu + + gradient_memory_per_gpu) + + estimated_fwd_prefetch_memory_per_gpu = unsharded_weight_memory_embedding + unsharded_weight_memory_per_layer + + estimated_bwd_prefetch_memory_per_gpu = ( + int(fwd_prefetch) + + int(bwd_prefetch)) * (unsharded_weight_memory_per_layer) + + estimated_prefetch_memory_per_gpu = max( + estimated_fwd_prefetch_memory_per_gpu, + estimated_bwd_prefetch_memory_per_gpu) memory_left = (self.gpu_config.mem_per_GPU_in_GB * 1024**3 - - self.weight_grad_op_state_memory_per_gpu) + weight_memory_per_gpu - optimizer_state_memory_per_gpu) logger.info( - f"weight_memory_per_gpu: {_num_to_string(weight_memory_per_gpu)}B" - " (embedding_memory:" - f" {_num_to_string(weight_memory_embedding_per_gpu)}B)," - " optimizer_state_memory_per_gpu:" - f" {_num_to_string(optimizer_state_memory_per_gpu)}B," - " gradient_memory_per_gpu:" - f" {_num_to_string(gradient_memory_per_gpu)}B, memory_left:" - f" {_num_to_string(memory_left)}B") + f"weight_memory_per_gpu: {_num_to_string(weight_memory_per_gpu)}B (embedding_memory: {_num_to_string(weight_memory_embedding_per_gpu)}B), optimizer_state_memory_per_gpu: {_num_to_string(optimizer_state_memory_per_gpu)}B, gradient_memory_per_gpu: {_num_to_string(gradient_memory_per_gpu)}B, estimated_fwd_prefetch_memory_per_gpu: {_num_to_string(estimated_fwd_prefetch_memory_per_gpu)}B, estimated_bwd_prefetch_memory_per_gpu: {_num_to_string(estimated_bwd_prefetch_memory_per_gpu)}B" ) if memory_left < 0: logger.warning( - "model weight/optimizer stage/gradient is too large (requiring" - f" {_num_to_string(weight_memory_per_gpu)}B /" - f" {_num_to_string(optimizer_state_memory_per_gpu)}B /" - f" {_num_to_string(gradient_memory_per_gpu)}B) to fit in total GPU" - " memory") + "model weight/optimizer 
state memory usage is too large to fit in GPU memory" + ) + + if memory_left - max(estimated_prefetch_memory_per_gpu, + gradient_memory_per_gpu) < 0: + logger.warning( + "model gradient or bwd prefetch memory usage is too large to fit in GPU memory" + ) + + loss_bwd_memory_batch_size_1 = self.get_loss_bwd_memory(1, seq_len) + if memory_left - loss_bwd_memory_batch_size_1 < 0: + logger.warning("loss_bwd_memory is too large to fit in GPU memory") # With pipeline parallelism, each stage contains L/p layers so the first stage must store p ×L/p = L layers worth of activations regardless of the pipeline parallel size p; activation memory required for the input embeddings, the last layer-norm, and the output layer are ignored here. Refer to https://arxiv.org/abs/2205.05198 for more details. - activation_memory_batch_size_1, activation_memory_attn_batch_size_1, mlp_activation_memory_batch_size_1, layernorm_activation_memory_batch_size_1 = [ + activation_memory_per_layer_batch_size_1, attn_activation_memory_per_layer_batch_size_1, mlp_activation_memory_per_layer_batch_size_1, layernorm_activation_memory_per_layer_batch_size_1 = self.get_activation_memory_per_layer( + 1, + seq_len, + is_inference=False, + activation_recomputation=activation_recomputation, + layernorm_dtype_bytes=layernorm_dtype_bytes, + flash_attn=flash_attn, + softmax_dropout=softmax_dropout, + mlp_activation_quant_bits=mlp_activation_quant_bits, + mlp_1linear_quant_bits=mlp_1linear_quant_bits, + mlp_gelu_input_quant_bits=mlp_gelu_input_quant_bits, + mlp_2linear_quant_bits=mlp_2linear_quant_bits, + mlp_recompute_gelu=mlp_recompute_gelu, + return_breakdown=True, + ) + activation_memory_batch_size_1, attn_activation_memory_batch_size_1, mlp_activation_memory_batch_size_1, layernorm_activation_memory_batch_size_1 = [ x * self.model_config.num_layers - for x in self.get_activation_memory_per_layer( - 1, - seq_len, - is_inference=False, - activation_recomputation=activation_recomputation, - layernorm_dtype_bytes=layernorm_dtype_bytes, - flash_attn=flash_attn, - softmax_dropout=softmax_dropout, - mlp_activation_quant_bits=mlp_activation_quant_bits, - mlp_1linear_quant_bits=mlp_1linear_quant_bits, - mlp_gelu_input_quant_bits=mlp_gelu_input_quant_bits, - mlp_2linear_quant_bits=mlp_2linear_quant_bits, - mlp_recompute_gelu=mlp_recompute_gelu, - return_breakdown=True, - ) + for x in (activation_memory_per_layer_batch_size_1, + attn_activation_memory_per_layer_batch_size_1, + mlp_activation_memory_per_layer_batch_size_1, + layernorm_activation_memory_per_layer_batch_size_1) ] - activation_memory_embedding_output_batch_size_1 = self.get_activation_memory_output_embedding( + + activation_memory_input_embedding_batch_size_1 = self.get_activation_memory_input_embedding( 1, seq_len) - logger.info( - f"activation_memory_embedding_output for micro batch size 1: {_num_to_string(activation_memory_embedding_output_batch_size_1)}B" - ) - activation_memory_batch_size_1 += activation_memory_embedding_output_batch_size_1 + activation_memory_batch_size_1 += activation_memory_input_embedding_batch_size_1 + activation_memory_output_embedding_batch_size_1 = self.get_activation_memory_output_embedding( + 1, seq_len) + activation_memory_batch_size_1 += activation_memory_output_embedding_batch_size_1 activation_memory_batch_size_1 += self.get_activation_memory_per_layernorm( 1, seq_len, layernorm_dtype_bytes, ) - max_batch_size_per_gpu = int(memory_left // - activation_memory_batch_size_1) - - if memory_left < activation_memory_batch_size_1: + if memory_left - 
max( + estimated_prefetch_memory_per_gpu, + loss_bwd_memory_batch_size_1) < activation_memory_batch_size_1: logger.warning( f"memory_left {_num_to_string(memory_left)} < activation_memory_batch_size_1 {_num_to_string(activation_memory_batch_size_1)}" ) logger.info( - f"activation_memory for micro batch size 1: {_num_to_string(activation_memory_batch_size_1)}B, max_batch_size_per_gpu: {max_batch_size_per_gpu}" + f"activation_memory_per_gpu with micro batch size 1: {_num_to_string(activation_memory_batch_size_1)}B (attn + mlp + layernorm + input_embed + output_embed: {_num_to_string(attn_activation_memory_batch_size_1)}B + {_num_to_string(mlp_activation_memory_batch_size_1)}B + {_num_to_string(layernorm_activation_memory_batch_size_1)}B + {_num_to_string(activation_memory_input_embedding_batch_size_1)}B + {_num_to_string(activation_memory_output_embedding_batch_size_1)}B)" + ) + + max_batch_size_per_gpu = int(memory_left // + activation_memory_batch_size_1) + while memory_left < max( + estimated_prefetch_memory_per_gpu, + self.get_loss_bwd_memory(max_batch_size_per_gpu, seq_len) + ) + activation_memory_batch_size_1 * max_batch_size_per_gpu: + max_batch_size_per_gpu -= 1 + + logger.info( + f"max_batch_size_per_gpu: {max_batch_size_per_gpu}, estimated_prefetch_memory_per_gpu: {_num_to_string(estimated_prefetch_memory_per_gpu)}B, loss_bwd_memory: {_num_to_string(self.get_loss_bwd_memory(max_batch_size_per_gpu, seq_len))}B" ) ( @@ -2121,7 +2170,9 @@ def training( ) if batch_size_per_gpu == 1: - activation_memory_per_gpu, activation_memory_attn_per_gpu, activation_memory_mlp_per_gpu, activation_memory_layernorm_per_gpu = activation_memory_batch_size_1, activation_memory_attn_batch_size_1, mlp_activation_memory_batch_size_1, layernorm_activation_memory_batch_size_1 + activation_memory_per_gpu, activation_memory_attn_per_gpu, activation_memory_mlp_per_gpu, activation_memory_layernorm_per_gpu = activation_memory_batch_size_1, attn_activation_memory_batch_size_1, mlp_activation_memory_batch_size_1, layernorm_activation_memory_batch_size_1 + activation_memory_input_embedding_per_gpu = activation_memory_input_embedding_batch_size_1 + activation_memory_output_embedding_per_gpu = activation_memory_output_embedding_batch_size_1 else: activation_memory_per_gpu, activation_memory_attn_per_gpu, activation_memory_mlp_per_gpu, activation_memory_layernorm_per_gpu = [ x * self.model_config.num_layers @@ -2141,27 +2192,35 @@ def training( return_breakdown=True, ) ] - activation_memory_embedding_output_per_gpu = self.get_activation_memory_output_embedding( - batch_size_per_gpu, seq_len) - activation_memory_per_gpu += activation_memory_embedding_output_per_gpu - activation_memory_per_gpu += self.get_activation_memory_per_layernorm( - batch_size_per_gpu, - seq_len, - layernorm_dtype_bytes, - ) + activation_memory_input_embedding_per_gpu = self.get_activation_memory_input_embedding( + batch_size_per_gpu, seq_len) + activation_memory_output_embedding_per_gpu = self.get_activation_memory_output_embedding( + batch_size_per_gpu, seq_len) + activation_memory_per_gpu += activation_memory_input_embedding_per_gpu + activation_memory_per_gpu += activation_memory_output_embedding_per_gpu + activation_memory_per_gpu += self.get_activation_memory_per_layernorm( + batch_size_per_gpu, + seq_len, + layernorm_dtype_bytes, + ) + logger.info( + f"activation_memory_per_gpu with micro batch size {batch_size_per_gpu}: {_num_to_string(activation_memory_per_gpu)}B (attn + mlp + layernorm + input_embed + output_embed: 
{_num_to_string(activation_memory_attn_per_gpu)}B + {_num_to_string(activation_memory_mlp_per_gpu)}B + {_num_to_string(activation_memory_layernorm_per_gpu)}B + {_num_to_string(activation_memory_input_embedding_per_gpu)}B + {_num_to_string(activation_memory_output_embedding_per_gpu)}B)" ) - logger.info("activation_memory_per_gpu with micro batch size" - f" {batch_size_per_gpu}:" - f" {_num_to_string(activation_memory_per_gpu)}B") - if memory_left < activation_memory_per_gpu: + loss_bwd_memory = self.get_loss_bwd_memory(batch_size_per_gpu, seq_len) + + if memory_left < activation_memory_per_gpu + max( + estimated_prefetch_memory_per_gpu, loss_bwd_memory): logger.warning( - "activation memory is too large with batch_size_per_gpu =" + "activation_memory_per_gpu or loss_bwd_memory is too large with batch_size_per_gpu =" f" {batch_size_per_gpu} to fit in GPU memory (requiring" - f" {_num_to_string(activation_memory_per_gpu)}B, memory_left after" + f" activation_memory_per_gpu={_num_to_string(activation_memory_per_gpu)}B, loss_bwd_memory={_num_to_string(loss_bwd_memory)}B, memory_left after" " fitting in model weights, gradients, and optimizer states =" f" {_num_to_string(memory_left)}B, max_batch_size_per_gpu =" f" {max_batch_size_per_gpu})") - memory_left -= activation_memory_per_gpu + + memory_left = memory_left - activation_memory_per_gpu - max( + estimated_prefetch_memory_per_gpu, loss_bwd_memory) num_flops_fwd_total = self.get_num_flops_fwd_total( batch_size_per_gpu, seq_len) @@ -2210,18 +2269,30 @@ def training( ds_zero=ds_zero, ) + latency_fwd_per_layer_attn_compute = self.get_latency_fwd_per_layer_attn( + batch_size_per_gpu, seq_len, False, activation_recomputation) + latency_fwd_per_layer_mlp_compute = self.get_latency_fwd_per_layer_mlp( + batch_size_per_gpu, seq_len, False, activation_recomputation) + latency_fwd_per_layernorm_compute = self.get_latency_fwd_per_layernorm( + batch_size_per_gpu, + seq_len, + layernorm_dtype_bytes, + ) + num_layers_per_gpu = int(self.model_config.num_layers / + self.parallelism_config.pp_size) if activation_recomputation == ActivationRecomputation.FULL: - latency_recompute = latency_fwd + latency_recompute = num_layers_per_gpu * ( + latency_fwd_per_layer_attn_compute + + latency_fwd_per_layer_mlp_compute + + 2 * latency_fwd_per_layernorm_compute) elif activation_recomputation == ActivationRecomputation.NORM_ATTN_NORM: - latency_recompute = self.get_latency_fwd_per_layer_attn( - batch_size_per_gpu, seq_len, False, activation_recomputation - ) + 2 * self.get_latency_fwd_per_layernorm( - batch_size_per_gpu, seq_len, layernorm_dtype_bytes) + latency_recompute = num_layers_per_gpu * ( + latency_fwd_per_layer_attn_compute + + 2 * latency_fwd_per_layernorm_compute) elif activation_recomputation == ActivationRecomputation.ATTN: - latency_recompute = self.get_latency_fwd_per_layer_attn( - batch_size_per_gpu, seq_len, False, activation_recomputation) + latency_recompute = num_layers_per_gpu * latency_fwd_per_layer_attn_compute elif activation_recomputation == ActivationRecomputation.ATTN_COMPUTE: - latency_recompute = self.get_num_flops_total_attn_compute( + latency_recompute = num_layers_per_gpu * self.get_num_flops_total_attn_compute( batch_size_per_gpu, seq_len) / ( (self.parallelism_config.tp_size * self.parallelism_config.pp_size) * @@ -2280,6 +2351,7 @@ def training( else: total_training_latency = None + total_training_latency_using_flops = None gpu_hours = (total_training_latency * total_num_gpus / 3600 if total_training_latency is not None else None) 
@@ -2349,12 +2421,28 @@ def training( weight_memory_mlp_per_gpu, "weight_memory_layernorm_per_gpu": weight_memory_layernorm_per_gpu, + "unsharded_weight_memory_embedding": + unsharded_weight_memory_embedding, + "unsharded_weight_memory_per_layer": + unsharded_weight_memory_per_layer, + "unsharded_weight_memory_attn_per_layer": + unsharded_weight_memory_attn_per_layer, + "unsharded_weight_memory_mlp_per_layer": + unsharded_weight_memory_mlp_per_layer, + "unsharded_weight_memory_layernorm": + unsharded_weight_memory_layernorm, "gradient_memory_per_gpu": gradient_memory_per_gpu, "optimizer_state_memory_per_gpu": optimizer_state_memory_per_gpu, - "(weight+op_state+grad)_memory_per_gpu": - self.weight_grad_op_state_memory_per_gpu, + "(weight+op_state)_memory_per_gpu": + optimizer_state_memory_per_gpu + weight_memory_per_gpu, + "estimated_fwd_prefetch_memory_per_gpu": + estimated_fwd_prefetch_memory_per_gpu, + "estimated_bwd_prefetch_memory_per_gpu": + estimated_bwd_prefetch_memory_per_gpu, + "loss_bwd_memory": + loss_bwd_memory, "activation_memory_batch_size_1": activation_memory_batch_size_1, "activation_memory_per_gpu": @@ -2365,22 +2453,30 @@ def training( activation_memory_mlp_per_gpu, "activation_memory_layernorm_per_gpu": activation_memory_layernorm_per_gpu, - "activation_memory_embedding_output_per_gpu": - activation_memory_embedding_output_per_gpu, - "(weight+op_state+grad+act)_memory_per_gpu": - self.weight_grad_op_state_memory_per_gpu + + "activation_memory_input_embedding_per_gpu": + activation_memory_input_embedding_per_gpu, + "activation_memory_output_embedding_per_gpu": + activation_memory_output_embedding_per_gpu, + "(weight+op_state+act)_memory_per_gpu": + optimizer_state_memory_per_gpu + weight_memory_per_gpu + activation_memory_per_gpu, - "memory_left_per_gpu": - memory_left, + "(weight+op_state+grad)_memory_per_gpu": + self.weight_grad_op_state_memory_per_gpu, + "estimated_peak_memory_per_gpu": + optimizer_state_memory_per_gpu + weight_memory_per_gpu + + max(activation_memory_per_gpu, gradient_memory_per_gpu) + + max(estimated_bwd_prefetch_memory_per_gpu, loss_bwd_memory), "latency_per_micro_batch": latency_per_micro_batch, "latency_fwd": latency_fwd, } summary_dict.update(latency_fwd_breakdown) + device_tokens_per_sec = round( + seq_len * batch_size_per_gpu / latency_per_iter, 2) summary_dict.update({ "latency_per_iter": latency_per_iter, - "iters_per_sec": round(1 / latency_per_iter, 2), + "device_tokens_per_sec": device_tokens_per_sec, "total_training_latency": total_training_latency, "latency_per_iter_using_flops": latency_per_iter_using_flops, "total_training_latency_using_flops": @@ -2394,6 +2490,7 @@ def training( self.output_summary_dict(summary_dict, output_dir, print_human_readable=True, + output_file_prefix=output_file_prefix, output_file_suffix=output_file_suffix) return summary_dict @@ -2423,6 +2520,7 @@ def infer( inter_node_memory_efficiency=INTER_NODE_MEMORY_EFFICIENCY, cost_per_gpu_hour: float = None, output_dir: str = None, + output_file_prefix: str = "", output_file_suffix: str = "", ) -> dict: """_summary_ @@ -2451,6 +2549,7 @@ def infer( inter_node_memory_efficiency (float, optional): inter-node memory efficiency, ranging from 0 to 1. Defaults to INTER_NODE_MEMORY_EFFICIENCY. cost_per_gpu_hour (float, optional): dollar cost per GPU hour. Defaults to None. output_dir (str, optional): if set to a directory path, write the return summary dict out to the directory with the setup. Defaults to None. 
+ output_file_prefix (str, optional): prefix of the output file. Defaults to "". output_file_suffix (str, optional): suffix of the output file. Defaults to "". Returns: @@ -2496,6 +2595,7 @@ def infer( kv_cache_dtype_bytes=kv_cache_dtype_bytes, cost_per_gpu_hour=cost_per_gpu_hour, output_dir=output_dir, + output_file_prefix=output_file_prefix, output_file_suffix=output_file_suffix, ) @@ -2514,6 +2614,8 @@ def train( total_num_tokens: int = None, activation_recomputation: int = 0, ds_zero: int = 0, + fwd_prefetch: bool = True, + bwd_prefetch: bool = True, dp_size: int = None, tp_size: int = 1, pp_size: int = 1, @@ -2537,6 +2639,7 @@ def train( inter_node_memory_efficiency=INTER_NODE_MEMORY_EFFICIENCY, num_gpus_per_node: int = NUM_GPUS_PER_NODE, output_dir: str = None, + output_file_prefix: str = "", output_file_suffix: str = "", ) -> dict: """Entry point function of training analysis for the command line interface. This @@ -2590,18 +2693,27 @@ def train( " and is best kept within a single node where high bandwidth NVLink" " is available.") + rdp_size = 1 if total_num_gpus and dp_size: - assert ( - total_num_gpus == dp_size * tp_size * pp_size - ), f"total_num_gpus {total_num_gpus} must be equal to dp_size * tp_size * pp_size {dp_size * tp_size * pp_size}" + assert total_num_gpus % ( + dp_size * tp_size * pp_size + ) == 0, f"total_num_gpus {total_num_gpus} must be divisible by dp_size * tp_size * pp_size {dp_size * tp_size * pp_size}" + rdp_size = total_num_gpus // (dp_size * tp_size * pp_size) elif total_num_gpus: - assert (total_num_gpus % (tp_size * pp_size) == 0 - ), f"total_num_gpus must be a multiple of tp_size * pp_size" + assert ( + total_num_gpus % (tp_size * pp_size) == 0 + ), f"dp_size is not specified, assuming total_num_gpus = dp_size * tp_size * pp_size, total_num_gpus must be a multiple of tp_size * pp_size" dp_size = total_num_gpus // (tp_size * pp_size) elif dp_size: total_num_gpus = dp_size * tp_size * pp_size + logger.info( + f'total_num_gpus is not specified, assuming total_num_gpus = dp_size * tp_size * pp_size' + ) else: dp_size = 1 + logger.info( + f'neither dp_size nor total_num_gpus is specified, assuming dp_size = 1' + ) model_config = get_model_config_by_name(model_name) gpu_config = get_gpu_config_by_name(gpu_name) @@ -2612,6 +2724,7 @@ def train( tp_size=tp_size, pp_size=pp_size, dp_size=dp_size, + rdp_size=rdp_size, sp_size=sp_size if sp_size else tp_size, ep_size=ep_size) @@ -2636,6 +2749,8 @@ def train( activation_recomputation=ActivationRecomputation( activation_recomputation), ds_zero=DSZeRO(ds_zero), + fwd_prefetch=fwd_prefetch, + bwd_prefetch=bwd_prefetch, layernorm_dtype_bytes=layernorm_dtype_bytes, master_weights_dtype_bytes=master_weights_dtype_bytes, other_op_bytes=other_op_bytes, @@ -2647,6 +2762,7 @@ def train( mlp_2linear_quant_bits=mlp_2linear_quant_bits, mlp_recompute_gelu=mlp_recompute_gelu, output_dir=output_dir, + output_file_prefix=output_file_prefix, output_file_suffix=output_file_suffix, ) diff --git a/llm_analysis/config.py b/llm_analysis/config.py index e8d81cc..1eb2915 100644 --- a/llm_analysis/config.py +++ b/llm_analysis/config.py @@ -21,11 +21,8 @@ import fire -from llm_analysis.constant import ( - DTYPE_CONFIG_DIR_NAME, - GPU_CONFIG_DIR_NAME, - MODEL_CONFIG_DIR_NAME, -) +from llm_analysis.constant import (DTYPE_CONFIG_DIR_NAME, GPU_CONFIG_DIR_NAME, + MODEL_CONFIG_DIR_NAME) from llm_analysis.logger import logger try: @@ -116,8 +113,9 @@ class ParallelismConfig: tp_size: int = 1 # tensor parallelism size, Megatron-LM tensor parallelism 
implementation pp_size: int = 1 # pipeline parallelism size, Megatron-LM pipeline parallelism implementation dp_size: int = ( - 1 # data parallelism size, DeepSpeed Zero parallelism implementation + 1 # sharded data parallelism size, PyTorch FSDP or DeepSpeed Zero parallelism implementation ) + rdp_size: int = 1 # replicated data parallelism size, PyTorch HSDP implementation ep_size: int = 1 # expert parallelism size sp_size: int = None # sequence parallelism size, Megatron-LM sequence parallelism implementation @@ -357,10 +355,10 @@ def get_model_config_by_name(name_or_path: str) -> ModelConfig: model_configs[config.name] = config return config except Exception as e: - raise ValueError(f"unknown gpu config name: {e}") + raise ValueError(f"unknown model config name: {e}") model_config = get_model_config_from_hf(name_or_path) if model_config is None: - raise ( + raise ValueError( f"unknown model config name: {name_or_path}, and none is found on HuggingFace Hub" ) return model_config diff --git a/llm_analysis/utils.py b/llm_analysis/utils.py index 3801b56..a1c2185 100644 --- a/llm_analysis/utils.py +++ b/llm_analysis/utils.py @@ -14,6 +14,8 @@ def _num_to_string(num, precision=2, divisor=1024): + if num is None: + return None if num < 0: sign = '-' num = -num diff --git a/pyproject.toml b/pyproject.toml index de49b27..68d5404 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,17 +8,17 @@ readme = "README.md" packages = [{ include = "llm_analysis" }] [tool.poetry.dependencies] -python = "^3.8" +python = ">=3.8" fire = "^0.5.0" -huggingface-hub = "^0.14.1" -transformers = "^4.28.1" +# huggingface-hub = "^0.14.1" +# transformers = "^4.28.1" [tool.poetry.group.dev.dependencies] -pytest = "^7.3.1" -coverage = { extras = ["toml"], version = "^7.2.5" } -sphinx = "^7.0.0" -sphinx-autodoc-typehints = "^1.23.0" -pytest-cov = "^4.0.0" +pytest = ">=7.3.1" +coverage = { extras = ["toml"], version = ">=7.2.5" } +sphinx = ">=7.0.0" +sphinx-autodoc-typehints = ">=1.23.0" +pytest-cov = ">=4.0.0" [tool.coverage.run] omit = [".*", "*/site-packages/*"] diff --git a/tests/test_config.py b/tests/test_config.py index 6d46221..72733f8 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -12,18 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -from llm_analysis.config import ( - ModelConfig, - GPUConfig, - DtypeConfig, - get_dtype_config_by_name, - get_gpu_config_by_name, - get_model_config_by_name, -) +from llm_analysis.config import (DtypeConfig, GPUConfig, ModelConfig, + get_dtype_config_by_name, + get_gpu_config_by_name, + get_model_config_by_name) def test_get_model_config_by_name(): - model_name = "facebook/opt-125m" + model_name = "facebook_opt-125m" model_config = get_model_config_by_name(model_name) assert isinstance(model_config, ModelConfig) assert model_config.num_layers == 12 diff --git a/tests/test_inference.py b/tests/test_inference.py index e94000b..42e11a2 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -12,14 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from llm_analysis.utils import within_range from llm_analysis.analysis import LLMAnalysis -from llm_analysis.config import ( - ParallelismConfig, - get_dtype_config_by_name, - get_gpu_config_by_name, - get_model_config_by_name, -) +from llm_analysis.config import (ParallelismConfig, get_dtype_config_by_name, + get_gpu_config_by_name, + get_model_config_by_name) +from llm_analysis.utils import within_range TOLERANCE = 0.1 @@ -55,7 +52,7 @@ def test_fastertransformer_13b_tp1(): def test_llama2_70b(): - model_name = "upstage/Llama-2-70b-instruct-v2" + model_name = "upstage_Llama-2-70b-instruct-v2" dtype_name = "w16a16e16" gpu_name = "a100-sxm-80gb" diff --git a/tests/test_training.py b/tests/test_training.py index e3955fb..b28aec3 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -13,12 +13,9 @@ # limitations under the License. from llm_analysis.analysis import ActivationRecomputation, DSZeRO, LLMAnalysis -from llm_analysis.config import ( - ParallelismConfig, - get_dtype_config_by_name, - get_gpu_config_by_name, - get_model_config_by_name, -) +from llm_analysis.config import (ParallelismConfig, get_dtype_config_by_name, + get_gpu_config_by_name, + get_model_config_by_name) from llm_analysis.utils import _latency_to_string, _num_to_string, within_range TOLERANCE = 0.05
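
Taken together, the changes above add three user-facing knobs to the training analysis: fwd_prefetch and bwd_prefetch, which budget unsharded weight memory for FSDP/ZeRO-3 prefetching; output_file_prefix, which is prepended to the summary file names; and a replicated data parallel degree (rdp_size) derived from total_num_gpus. Below is a minimal sketch (not part of the patch) of how they might be exercised through the module-level train() entry point; the model, GPU, and dtype names are the ones used in examples/llama2/run_infer_cursor.py, while argument names not visible in this diff (dtype_name, seq_len, batch_size_per_gpu) are assumptions.

from llm_analysis.analysis import train
from llm_analysis.utils import _num_to_string

summary = train(
    model_name="upstage/Llama-2-70b-instruct-v2",
    gpu_name="a100-sxm-80gb",
    dtype_name="w16a16e16",      # assumed parameter name, not shown in this diff
    seq_len=4096,                # assumed parameter name, not shown in this diff
    batch_size_per_gpu=1,        # assumed parameter name, not shown in this diff
    total_num_gpus=64,
    dp_size=8,
    tp_size=8,
    pp_size=1,
    ds_zero=3,                   # passed through DSZeRO(ds_zero); 3 assumed to map to STAGE_3
    fwd_prefetch=True,           # new: reserve memory for prefetched unsharded layer weights
    bwd_prefetch=True,           # new: same budget applied on the backward pass
    output_dir="outputs",
    output_file_prefix="llama2-70b-",  # new: prepended to the "-summary.json" file names
)
# estimated_peak_memory_per_gpu is one of the new keys in the returned summary dict
print(_num_to_string(summary["estimated_peak_memory_per_gpu"]) + "B")

With total_num_gpus=64 and dp_size * tp_size * pp_size = 64, the new divisibility check passes and rdp_size stays 1; doubling total_num_gpus to 128 with the same dp/tp/pp split would give rdp_size = 2, i.e. two replicas of the sharded group (HSDP). The same arguments are presumably also reachable through the fire-based CLI wrapper around train().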