wip
cli99 committed Oct 19, 2023
1 parent d16c2f9 commit c15cf33
Showing 13 changed files with 55 additions and 36 deletions.
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
@@ -21,19 +21,19 @@ repos:
- id: flake8
args:
[
- '--ignore=E,F403,F405,F541,F841,W',
- '--select=E9,F,W6',
- '--per-file-ignores=__init__.py:F401',
+ "--ignore=E,F403,F405,F541,F841,W",
+ "--select=E9,F,W6",
+ "--per-file-ignores=__init__.py:F401",
]
- repo: https://github.com/codespell-project/codespell
rev: v2.1.0
hooks:
- id: codespell
args: [
# Do not check files that are automatically generated
- '--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json',
+ "--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json",
'--ignore-regex=\\n', # Do not count the 'n' in an escaped newline as part of a word
- '--ignore-words-list=unsupport', # Word used in error messages that need rewording
+ "--ignore-words-list=unsupport", # Word used in error messages that need rewording
--check-filenames,
--check-hidden,
]
3 changes: 0 additions & 3 deletions examples/llama2/run_infer_cursor.py
@@ -1,7 +1,4 @@
from llm_analysis.config import (
- DtypeConfig,
- GPUConfig,
- ModelConfig,
ParallelismConfig,
get_dtype_config_by_name,
get_gpu_config_by_name,
56 changes: 40 additions & 16 deletions llm_analysis/analysis.py
@@ -294,7 +294,7 @@ def get_num_params_per_layer_attn(self) -> int:
int: the number of parameters in the attention linear layers
"""
num_heads_per_gpu = max(self.model_config.num_key_value_heads / self.parallelism_config.tp_size, 1) # At least one attention head on each tensor-parallel GPU
- return self.model_config.hidden_dim**2 + self.model_config.hidden_dim**2 + 2*self.model_config.hidden_dim*(self.model_config.hidden_dim * self.model_config.num_key_value_head /self.model_config.n_head)
+ return self.model_config.hidden_dim**2 + self.model_config.hidden_dim**2 + 2*self.model_config.hidden_dim*(self.model_config.hidden_dim * self.model_config.num_key_value_heads /self.model_config.n_head)

def get_num_params_per_layer_mlp(self) -> int:
"""Get the number of parameters in the MLP linear layers, including the
@@ -305,6 +305,15 @@ def get_num_params_per_layer_mlp(self) -> int:
"""
return 2 * self.model_config.hidden_dim*self.model_config.ffn_embed_dim*self.model_config.moe_num_experts

+ def get_num_params_per_layer_router(self)->int:
+ if self.model_config.moe_num_experts > 1:
+ return self.model_config.hidden_dim * self.model_config.moe_num_experts
+ else:
+ return 0
+
+ def get_num_params_per_layer_layernorm(self) -> int:
+ return 2 * self.model_config.hidden_dim
+
def get_num_params_per_layer(self) -> int:
"""Get the number of parameters in a transformer layer, including the
attention and MLP linear layers.
@@ -315,7 +324,7 @@ def get_num_params_per_layer(self) -> int:

return (
self.get_num_params_per_layer_attn()
- + self.get_num_params_per_layer_mlp()
+ + self.get_num_params_per_layer_mlp() + self.get_num_params_per_layer_router() + self.get_num_params_per_layer_layernorm()
)

def get_num_active_params_per_layer(self) -> int:
@@ -329,6 +338,7 @@ def get_num_active_params_per_layer(self) -> int:
return (
self.get_num_params_per_layer_attn()
+ self.get_num_params_per_layer_mlp()*self.model_config.moe_top_k/self.model_config.moe_num_experts
+ + self.get_num_params_per_layer_router() + self.get_num_params_per_layer_layernorm()
)
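Taken together with the router and layernorm helpers added above, the per-layer total now counts every expert's MLP weights, while the active count scales the MLP term by moe_top_k / moe_num_experts. A hedged numeric sketch (all values are illustrative assumptions):

```python
# Hedged sketch of total vs. active per-layer parameters for an MoE layer,
# mirroring get_num_params_per_layer / get_num_active_params_per_layer above.
hidden_dim = 4096
ffn_embed_dim = 4 * hidden_dim
moe_num_experts = 8
moe_top_k = 2

attn = 4 * hidden_dim ** 2                              # simplified MHA count (no GQA reduction)
mlp = 2 * hidden_dim * ffn_embed_dim * moe_num_experts  # weights of all experts
router = hidden_dim * moe_num_experts if moe_num_experts > 1 else 0
layernorm = 2 * hidden_dim                              # two LayerNorm weight vectors

total = attn + mlp + router + layernorm
active = attn + mlp * moe_top_k / moe_num_experts + router + layernorm
print(f"total={total:,} active={int(active):,}")
```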


@@ -341,7 +351,7 @@ def get_num_params_total(self) -> int:
"""
return (
self.model_config.num_layers * self.get_num_params_per_layer()
- + self.get_num_params_embedding()
+ + self.get_num_params_embedding() + self.get_num_params_per_layer_layernorm()
)

def get_memory_weight_per_layer(
@@ -360,10 +370,13 @@ def get_memory_weight_per_layer(
float: the memory (in bytes) required to store the weights of a transformer layer
"""
memory_weight_per_layer = (
- self.get_num_params_per_layer()
+ (
+ self.get_num_params_per_layer_attn()
+ + self.get_num_params_per_layer_mlp() / self.parallelism_config.ep_size + self.get_num_params_per_layer_router() + self.get_num_params_per_layer_layernorm()
+ )
* self.dtype_config.weight_bits
/ BITS_PER_BYTE
- / self.parallelism_config.tp_size / self.parallelism_config.ep_size
+ / self.parallelism_config.tp_size
)
if ds_zero == DSZeRO.STAGE_3:
memory_weight_per_layer /= self.parallelism_config.dp_size
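In other words, under the reworked formula the expert (MLP) weights are first divided across expert-parallel ranks, the whole layer is then divided across tensor-parallel ranks, and ZeRO stage 3 further shards across data-parallel ranks. A rough sketch with assumed sizes:

```python
# Hedged sketch of per-GPU weight memory for one layer, following the hunk above.
# All parameter counts and parallelism degrees below are assumptions for illustration.
BITS_PER_BYTE = 8
weight_bits = 16                     # e.g. bf16 weights
tp_size, ep_size, dp_size = 8, 4, 16

attn_params = 1.5e8
mlp_params = 2.7e9                   # all experts together
router_params = 3.3e4
layernorm_params = 8.2e3

per_gpu_params = (attn_params + mlp_params / ep_size + router_params + layernorm_params) / tp_size
memory_weight_per_layer = per_gpu_params * weight_bits / BITS_PER_BYTE
memory_weight_per_layer /= dp_size   # only when ZeRO stage 3 shards the weights
print(f"{memory_weight_per_layer / 2**30:.3f} GiB per GPU for this layer")
```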
@@ -509,7 +522,7 @@ def get_memory_activation_per_layer_attn(

memory_activation_per_layer_attn = (
(1 * seq_len * batch_size * hidden_dim / sp_size)
- + (4 * seq_len * batch_size * hidden_dim / tp_size)
+ + (+2*self.model_config.num_key_value_heads/self.model_config.n_head)*(seq_len * batch_size * hidden_dim / tp_size)
+ selective_compute_elems
) * bytes_per_activation + drop_out_masks

@@ -557,11 +570,16 @@ def get_memory_activation_per_layer_mlp(
seq_len * batch_size * hidden_dim / sp_size
)

- memory_activation_per_layer_mlp = (
- (1 * seq_len * batch_size * hidden_dim / sp_size)
- + (8 * seq_len * batch_size * hidden_dim / tp_size)
- ) * bytes_per_activation + drop_out_mask
-
+ if self.model_config.moe_num_experts == 1:
+ memory_activation_per_layer_mlp = (
+ (1 * seq_len * batch_size * hidden_dim / sp_size)
+ + (2 * seq_len * batch_size * hidden_dim * self.model_config.expansion_ratio / tp_size)
+ ) * bytes_per_activation + drop_out_mask
+ else:
+ memory_activation_per_layer_mlp = self.model_config.moe_top_k *(
+ (1 * seq_len * batch_size * hidden_dim / sp_size)
+ + (2 * seq_len * batch_size * hidden_dim * self.model_config.expansion_ratio * self.model_config.moe_num_experts/ ep_size / tp_size)
+ ) * bytes_per_activation + drop_out_mask
return memory_activation_per_layer_mlp

def get_memory_activation_per_layer_layernorm(
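The dense branch keeps the usual input-plus-intermediate activation term, while the MoE branch multiplies by moe_top_k and spreads the expert intermediates over the expert-parallel group. A hedged sketch with assumed sizes (the drop_out_mask term is approximated here as one byte per input element):

```python
# Hedged sketch of the dense vs. MoE branches of the MLP activation-memory formula above.
# All sizes and parallelism degrees are illustrative assumptions.
seq_len, batch_size, hidden_dim = 4096, 1, 4096
expansion_ratio = 4
bytes_per_activation = 2                  # e.g. fp16 activations
sp_size, tp_size, ep_size = 1, 8, 4
moe_num_experts, moe_top_k = 8, 2
drop_out_mask = seq_len * batch_size * hidden_dim / sp_size   # assumed: 1 byte per element

dense = (
    (1 * seq_len * batch_size * hidden_dim / sp_size)
    + (2 * seq_len * batch_size * hidden_dim * expansion_ratio / tp_size)
) * bytes_per_activation + drop_out_mask

moe = moe_top_k * (
    (1 * seq_len * batch_size * hidden_dim / sp_size)
    + (2 * seq_len * batch_size * hidden_dim * expansion_ratio * moe_num_experts / ep_size / tp_size)
) * bytes_per_activation + drop_out_mask

print(f"dense ~ {dense / 2**20:.0f} MiB, MoE ~ {moe / 2**20:.0f} MiB")
```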
@@ -1272,7 +1290,7 @@ def print_config(self, name="Training Configs") -> None:
logger.info(config_str)

def get_configs_desc(self) -> str:
return f"{self.model_config.name}-{self.gpu_config.name}-{self.dtype_config.name}-tp{self.parallelism_config.tp_size}-pp{self.parallelism_config.pp_size}-dp{self.parallelism_config.dp_size}-sp{self.parallelism_config.sp_size}-fe{round(self.flops_efficiency, 2)}-hbme{round(self.hbm_memory_efficiency, 2)}"
return f"{self.model_config.name}-{self.gpu_config.name}-{self.dtype_config.name}-tp{self.parallelism_config.tp_size}-pp{self.parallelism_config.pp_size}-dp{self.parallelism_config.dp_size}-sp{self.parallelism_config.sp_size}-fe{round(self.flops_efficiency, 2)}-ep{self.parallelism_config.ep_size}-hbme{round(self.hbm_memory_efficiency, 2)}"

def get_readable_summary_dict(
self, summary_dict: dict, title="Summary"
@@ -1282,7 +1300,7 @@
if "num_tokens" in key or "num_params" in key or "flops" in key:
log_str += f"{key}: {_num_to_string(value)}\n"
elif "gpu_hours" == key:
log_str += f"{key}: {int(value)}\n"
log_str += f"{key}: {int(value)}\n" if value else ""
elif "memory" in key and "efficiency" not in key:
log_str += f"{key}: {_num_to_string(value)}B\n"
elif "latency" in key:
@@ -1591,6 +1609,7 @@ def inference(
"batch_size_per_gpu": batch_size_per_gpu,
"seq_len": seq_len,
"tp_size": self.parallelism_config.tp_size,
"ep_size": self.parallelism_config.ep_size,
"pp_size": self.parallelism_config.pp_size,
"num_tokens_to_generate": num_tokens_to_generate,
"flops_efficiency": self.flops_efficiency,
@@ -1775,6 +1794,7 @@ def training(
ds_zero: DSZeRO = DSZeRO.NONE,
layernorm_dtype_bytes: int = BYTES_FP32,
output_dir: str = None,
+ output_file_suffix: str = "",
) -> dict:
"""Training analysis given the configs and inputs.
@@ -1878,7 +1898,7 @@ def training(
)
max_batch_size_per_gpu = int(memory_left // activation_memory_batch_size_1)
logger.info(
f"activation_memory_batch_size_1:{_num_to_string(activation_memory_batch_size_1)}B,"
f"activation_memory_batch_size_1: {_num_to_string(activation_memory_batch_size_1)}B,"
f" max_batch_size_per_gpu: {max_batch_size_per_gpu}"
)

@@ -2041,6 +2061,7 @@ def training(
"tp_size": self.parallelism_config.tp_size,
"pp_size": self.parallelism_config.pp_size,
"sp_size": self.parallelism_config.sp_size,
"ep_size": self.parallelism_config.ep_size,
"ds_zero": DSZeRO(ds_zero).name,
"total_num_gpus": total_num_gpus,
"seq_len": seq_len,
@@ -2074,7 +2095,7 @@ def training(

if output_dir is not None:
self.output_summary_dict(
- summary_dict, output_dir, print_human_readable=True
+ summary_dict, output_dir, print_human_readable=True, output_file_suffix=output_file_suffix
)

return summary_dict
@@ -2191,6 +2212,7 @@ def train(
tp_size: int = 1,
pp_size: int = 1,
sp_size: int = None,
+ ep_size: int = 1,
total_num_gpus: int = None,
layernorm_dtype_bytes: int = BYTES_FP32,
achieved_tflops: float = None,
@@ -2200,6 +2222,7 @@
inter_node_memory_efficiency=INTER_NODE_MEMORY_EFFICIENCY,
num_gpus_per_node: int = NUM_GPUS_PER_NODE,
output_dir: str = None,
+ output_file_suffix: str = "",
) -> dict:
"""Entry point function of training analysis for the command line
interface. This uses pre-defined name-to-configuration mapping and common
@@ -2260,7 +2283,7 @@ def train(
gpu_config = get_gpu_config_by_name(gpu_name)
dtype_config = get_dtype_config_by_name(dtype_name)
parallel_config = ParallelismConfig(
- tp_size=tp_size, pp_size=pp_size, dp_size=dp_size, sp_size=sp_size if sp_size else tp_size
+ tp_size=tp_size, pp_size=pp_size, dp_size=dp_size, sp_size=sp_size if sp_size else tp_size, ep_size=ep_size
)

analysis = LLMAnalysis(
@@ -2287,6 +2310,7 @@ def train(
ds_zero=DSZeRO(ds_zero),
layernorm_dtype_bytes=layernorm_dtype_bytes,
output_dir=output_dir,
+ output_file_suffix=output_file_suffix,
)

return summary_dict
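Putting the new knobs together, a hypothetical call to the train entry point could look like the sketch below. Only ep_size and output_file_suffix are taken from this diff; the model, GPU, and dtype names are placeholders and the remaining keyword arguments are assumed from the surrounding signature.

```python
# Hedged usage sketch of the updated train() entry point; config names are placeholders.
from llm_analysis.analysis import train

summary = train(
    model_name="decapoda-research/llama-7b-hf",  # placeholder model config name
    gpu_name="h100-sxm-80gb",                    # matches a config file in this repo
    dtype_name="w16a16e16",                      # placeholder dtype config name
    tp_size=8,
    pp_size=1,
    ep_size=8,                                   # new: expert parallelism degree
    output_dir="outputs",
    output_file_suffix="_moe",                   # new: forwarded to output_summary_dict
)
print(summary["ep_size"])
```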
4 changes: 1 addition & 3 deletions llm_analysis/config.py
@@ -116,14 +116,12 @@ class ParallelismConfig:
dp_size: int = (
1 # data parallelism size, DeepSpeed Zero parallelism implementation
)
+ ep_size: int = 1 # expert parallelism size
sp_size: int = None # sequence parallelism size, Megatron-LM sequence parallelism implementation
- ep_size: int = None # expert parallelism size

def __post_init__(self):
if self.sp_size is None:
self.sp_size = self.tp_size
- if self.ep_size is None:
- self.ep_size = 1

# model name and configurations mapping populated from MODEL_CONFIG_DIR_NAME
model_configs = {}
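With the default simplified to ep_size: int = 1, an expert-parallel setup can be expressed directly on the dataclass; a minimal sketch (parallelism degrees chosen arbitrarily):

```python
# Hedged sketch: building a ParallelismConfig for an expert-parallel (MoE) run.
from llm_analysis.config import ParallelismConfig

parallelism = ParallelismConfig(tp_size=8, pp_size=1, dp_size=4, ep_size=8)
# sp_size is left as None and falls back to tp_size in __post_init__.
print(parallelism.sp_size, parallelism.ep_size)  # -> 8 8
```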
2 changes: 1 addition & 1 deletion llm_analysis/gpu_configs/a10-pcie-28gb.json
@@ -8,4 +8,4 @@
"peak_i8_TFLOPS": 250,
"peak_i4_TFLOPS": 500,
"inter_node_bandwidth_in_GB_per_sec": 200
- }
+ }
2 changes: 1 addition & 1 deletion llm_analysis/gpu_configs/a10g-pcie-24gb.json
@@ -8,4 +8,4 @@
"peak_i8_TFLOPS": 140,
"peak_i4_TFLOPS": 280,
"inter_node_bandwidth_in_GB_per_sec": 200
- }
+ }
2 changes: 1 addition & 1 deletion llm_analysis/gpu_configs/h100-pcie-80gb.json
@@ -7,5 +7,5 @@
"peak_fp16_TFLOPS": 756,
"peak_i8_TFLOPS": 1513,
"peak_i4_TFLOPS": 3026,
"inter_node_bandwidth_in_GB_per_sec": 200
"inter_node_bandwidth_in_GB_per_sec": 400
}
2 changes: 1 addition & 1 deletion llm_analysis/gpu_configs/h100-sxm-80gb.json
@@ -7,5 +7,5 @@
"peak_fp16_TFLOPS": 989,
"peak_i8_TFLOPS": 1979,
"peak_i4_TFLOPS": 3958,
"inter_node_bandwidth_in_GB_per_sec": 200
"inter_node_bandwidth_in_GB_per_sec": 400
}
2 changes: 1 addition & 1 deletion llm_analysis/gpu_configs/v100-pcie-16gb.json
@@ -8,4 +8,4 @@
"peak_i8_TFLOPS": 0,
"peak_i4_TFLOPS": 0,
"inter_node_bandwidth_in_GB_per_sec": 200
- }
+ }
2 changes: 1 addition & 1 deletion llm_analysis/gpu_configs/v100-pcie-32gb.json
@@ -8,4 +8,4 @@
"peak_i8_TFLOPS": 0,
"peak_i4_TFLOPS": 0,
"inter_node_bandwidth_in_GB_per_sec": 200
- }
+ }
2 changes: 1 addition & 1 deletion llm_analysis/gpu_configs/v100-sxm-16gb.json
@@ -8,4 +8,4 @@
"peak_i8_TFLOPS": 0,
"peak_i4_TFLOPS": 0,
"inter_node_bandwidth_in_GB_per_sec": 200
- }
+ }
2 changes: 1 addition & 1 deletion llm_analysis/gpu_configs/v100-sxm-32gb.json
@@ -8,4 +8,4 @@
"peak_i8_TFLOPS": 0,
"peak_i4_TFLOPS": 0,
"inter_node_bandwidth_in_GB_per_sec": 200
- }
+ }
2 changes: 1 addition & 1 deletion tests/test_inference.py
@@ -81,4 +81,4 @@ def test_llama2_70b():

assert within_range(
summary_dict["total_decode_latency"], 17.05, TOLERANCE
- )
+ )
