Commit

wip
cli99 committed May 21, 2024
1 parent ff25f1f commit fb6c84b
Showing 5 changed files with 39 additions and 44 deletions.
2 changes: 1 addition & 1 deletion llm_analysis/__init__.py
@@ -10,4 +10,4 @@
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
-# limitations under the License.
+# limitations under the License.
59 changes: 26 additions & 33 deletions llm_analysis/analysis.py
@@ -393,8 +393,8 @@ def get_weight_memory_per_layer(

weight_memory_per_layer = weight_memory_attn_per_layer + weight_memory_mlp_per_layer + weight_memory_layernorm_per_layer

-logger.info(
-f'weight_memory_attn_per_layer: {_num_to_string(weight_memory_attn_per_layer)}B, weight_memory_mlp_per_layer: {_num_to_string(weight_memory_mlp_per_layer)}B, weight_memory_layernorm_per_layer: {_num_to_string(weight_memory_layernorm_per_layer)}B'
+logger.debug(
+f'is_sharded: {is_sharded}, weight_memory_attn_per_layer: {_num_to_string(weight_memory_attn_per_layer)}B, weight_memory_mlp_per_layer: {_num_to_string(weight_memory_mlp_per_layer)}B, weight_memory_layernorm_per_layer: {_num_to_string(weight_memory_layernorm_per_layer)}B'
)

if return_breakdown:
@@ -1172,13 +1172,15 @@ def get_latency_fwd_per_layernorm(
Returns:
float: the latency in seconds for the forward pass of a single layernorm in a transformer layer
"""
+input_numel = seq_len * batch_size * self.model_config.hidden_dim
+compute_latency = input_numel * 5 / (self.get_TFLOPS_per_gpu() * 10**12)
activation_memory = self.get_activation_memory_per_layernorm(
batch_size,
seq_len,
)
activation_memory_latency = activation_memory / (
self.get_gpu_hbm_bandwidth() * 10**9)
-return activation_memory_latency
+return max(compute_latency, activation_memory_latency)

def get_latency_fwd_per_tp_comm(self, batch_size: int, seq_len: int,
dtype_bytes: int) -> float:
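The new return value treats the layernorm with a roofline-style bound: the op is either compute-bound or HBM-bandwidth-bound, so the estimate takes the maximum of the two latencies instead of the memory term alone. Below is a minimal, self-contained sketch of that estimate; the function name, the hardware numbers in the usage line, and the assumption that activation traffic is roughly input_numel * dtype_bytes are illustrative, not the library's API.

```python
def layernorm_fwd_latency(batch_size: int, seq_len: int, hidden_dim: int,
                          dtype_bytes: int, tflops_per_gpu: float,
                          hbm_bandwidth_GBps: float) -> float:
    """Roofline-style estimate: a layernorm runs at max(compute time, memory time)."""
    input_numel = seq_len * batch_size * hidden_dim
    # ~5 elementwise FLOPs per input element (mean, variance, normalize, scale, shift)
    compute_latency = input_numel * 5 / (tflops_per_gpu * 10**12)
    # bytes streamed through HBM; assumed ~input_numel * dtype_bytes for this sketch
    activation_memory_latency = (input_numel * dtype_bytes) / (hbm_bandwidth_GBps * 10**9)
    return max(compute_latency, activation_memory_latency)

# e.g. batch 4, seq 2048, hidden 4096, fp16 activations, A100-like peak numbers
print(layernorm_fwd_latency(4, 2048, 4096, 2, 312, 2039))
```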
@@ -2060,19 +2062,7 @@ def training(
estimated_bwd_prefetch_memory_per_gpu))

logger.info(
f"weight_memory_per_gpu: {_num_to_string(weight_memory_per_gpu)}B"
" (embedding_memory:"
f" {_num_to_string(weight_memory_embedding_per_gpu)}B),"
" optimizer_state_memory_per_gpu:"
f" {_num_to_string(optimizer_state_memory_per_gpu)}B,"
" gradient_memory_per_gpu:"
f" {_num_to_string(gradient_memory_per_gpu)}B",
" estimated_fwd_prefetch_memory_per_gpu:"
f" {_num_to_string(estimated_fwd_prefetch_memory_per_gpu)}B",
" estimated_bwd_prefetch_memory_per_gpu:"
f" {_num_to_string(estimated_bwd_prefetch_memory_per_gpu)}B",
" memory_left:"
f" {_num_to_string(memory_left)}B",
f"weight_memory_per_gpu: {_num_to_string(weight_memory_per_gpu)}B (embedding_memory: {_num_to_string(weight_memory_embedding_per_gpu)}B), optimizer_state_memory_per_gpu: {_num_to_string(optimizer_state_memory_per_gpu)}B, gradient_memory_per_gpu: {_num_to_string(gradient_memory_per_gpu)}B, estimated_fwd_prefetch_memory_per_gpu: {_num_to_string(estimated_fwd_prefetch_memory_per_gpu)}B, estimated_bwd_prefetch_memory_per_gpu: {_num_to_string(estimated_bwd_prefetch_memory_per_gpu)}B"
)

if memory_left < 0:
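For orientation, the quantities in this message feed a per-GPU memory budget: memory_left is what remains of device memory after the listed components, and the branch above warns when it goes negative. A rough sketch of that kind of check, assuming the budget is simply device memory minus the sum of the components (the exact accounting, including how activation and prefetch memory are combined, lives elsewhere in analysis.py and is not shown in this hunk):

```python
def check_memory_budget(mem_per_gpu_GB: float, weight: float, optimizer_state: float,
                        gradient: float, activation: float, prefetch: float) -> float:
    """Return leftover bytes on one GPU; a negative value means the setup does not fit."""
    memory_left = mem_per_gpu_GB * 1024**3 - (
        weight + optimizer_state + gradient + activation + prefetch)
    if memory_left < 0:
        print("warning: estimated memory exceeds GPU memory; reduce the batch size, "
              "increase parallelism, or enable recomputation/offloading")
    return memory_left
```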
@@ -2230,18 +2220,25 @@ def training(
ds_zero=ds_zero,
)

+latency_fwd_per_layer_attn_compute = self.get_latency_fwd_per_layer_attn(
+batch_size_per_gpu, seq_len, False, activation_recomputation)
+latency_fwd_per_layer_mlp_compute = self.get_latency_fwd_per_layer_mlp(
+batch_size_per_gpu, seq_len, False, activation_recomputation)
+latency_fwd_per_layernorm_compute = self.get_latency_fwd_per_layernorm(
+batch_size_per_gpu,
+seq_len,
+layernorm_dtype_bytes,
+)
+num_layers_per_gpu = int(self.model_config.num_layers /
+self.parallelism_config.pp_size)
if activation_recomputation == ActivationRecomputation.FULL:
-latency_recompute = latency_fwd
+latency_recompute = num_layers_per_gpu * (latency_fwd_per_layer_attn_compute + latency_fwd_per_layer_mlp_compute + 2 * latency_fwd_per_layernorm_compute)
elif activation_recomputation == ActivationRecomputation.NORM_ATTN_NORM:
-latency_recompute = self.get_latency_fwd_per_layer_attn(
-batch_size_per_gpu, seq_len, False, activation_recomputation
-) + 2 * self.get_latency_fwd_per_layernorm(
-batch_size_per_gpu, seq_len, layernorm_dtype_bytes)
+latency_recompute = num_layers_per_gpu * (latency_fwd_per_layer_attn_compute + 2 * latency_fwd_per_layernorm_compute)
elif activation_recomputation == ActivationRecomputation.ATTN:
-latency_recompute = self.get_latency_fwd_per_layer_attn(
-batch_size_per_gpu, seq_len, False, activation_recomputation)
+latency_recompute = num_layers_per_gpu * latency_fwd_per_layer_attn_compute
elif activation_recomputation == ActivationRecomputation.ATTN_COMPUTE:
-latency_recompute = self.get_num_flops_total_attn_compute(
+latency_recompute = num_layers_per_gpu * self.get_num_flops_total_attn_compute(
batch_size_per_gpu, seq_len) / (
(self.parallelism_config.tp_size *
self.parallelism_config.pp_size) *
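The rewritten branches make the recomputation cost explicit: the extra forward work redone during the backward pass is the per-layer latency of whatever gets recomputed, scaled by the number of layers on each pipeline stage. A compact sketch of the same selection logic; the per-layer latencies are passed in as plain floats, and the enum is redeclared locally with illustrative member values rather than imported from llm_analysis:

```python
from enum import Enum

class ActivationRecomputation(Enum):  # illustrative stand-in for the library's enum
    NONE = 0
    ATTN_COMPUTE = 1
    ATTN = 2
    NORM_ATTN_NORM = 3
    FULL = 4

def recompute_latency(level: ActivationRecomputation, num_layers_per_gpu: int,
                      attn: float, mlp: float, layernorm: float,
                      attn_compute_only: float = 0.0) -> float:
    """Extra forward latency re-spent in the backward pass, for one pipeline stage."""
    if level == ActivationRecomputation.FULL:
        # re-run attention, MLP, and both layernorms of every layer
        return num_layers_per_gpu * (attn + mlp + 2 * layernorm)
    if level == ActivationRecomputation.NORM_ATTN_NORM:
        return num_layers_per_gpu * (attn + 2 * layernorm)
    if level == ActivationRecomputation.ATTN:
        return num_layers_per_gpu * attn
    if level == ActivationRecomputation.ATTN_COMPUTE:
        # only the attention score/softmax computation is redone
        return num_layers_per_gpu * attn_compute_only
    return 0.0
```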
@@ -2300,6 +2297,7 @@ def training(

else:
total_training_latency = None
+total_training_latency_using_flops = None

gpu_hours = (total_training_latency * total_num_gpus /
3600 if total_training_latency is not None else None)
@@ -2404,23 +2402,18 @@ def training(
estimated_fwd_prefetch_memory_per_gpu,
"estimated_bwd_prefetch_memory_per_gpu":
estimated_bwd_prefetch_memory_per_gpu,
"estimated_peak_fwd_memory_per_gpu":
optimizer_state_memory_per_gpu + weight_memory_per_gpu +
activation_memory_per_gpu + estimated_fwd_prefetch_memory_per_gpu,
"estimated_peak_bwd_memory_per_gpu":
optimizer_state_memory_per_gpu + weight_memory_per_gpu +
activation_memory_per_gpu + estimated_bwd_prefetch_memory_per_gpu,
"memory_left_per_gpu":
memory_left,
"estimated_peak_allocated_memory_per_gpu":
self.gpu_config.mem_per_GPU_in_GB * 1024**3 - memory_left,
"latency_per_micro_batch":
latency_per_micro_batch,
"latency_fwd":
latency_fwd,
}
summary_dict.update(latency_fwd_breakdown)
device_tokens_per_sec = round(seq_len * batch_size_per_gpu / latency_per_iter, 2)
summary_dict.update({
"latency_per_iter": latency_per_iter,
"iters_per_sec": round(1 / latency_per_iter, 2),
"device_tokens_per_sec": device_tokens_per_sec,
"total_training_latency": total_training_latency,
"latency_per_iter_using_flops": latency_per_iter_using_flops,
"total_training_latency_using_flops":
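The new device_tokens_per_sec entry is derived directly from the per-iteration latency already in the summary. A two-line sketch of the added throughput metrics, with illustrative values:

```python
seq_len, batch_size_per_gpu, latency_per_iter = 2048, 4, 1.7  # illustrative values
iters_per_sec = round(1 / latency_per_iter, 2)
device_tokens_per_sec = round(seq_len * batch_size_per_gpu / latency_per_iter, 2)
print(iters_per_sec, device_tokens_per_sec)  # 0.59 4818.82
```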
4 changes: 2 additions & 2 deletions llm_analysis/config.py
@@ -354,10 +354,10 @@ def get_model_config_by_name(name_or_path: str) -> ModelConfig:
model_configs[config.name] = config
return config
except Exception as e:
raise ValueError(f"unknown gpu config name: {e}")
raise ValueError(f"unknown model config name: {e}")
model_config = get_model_config_from_hf(name_or_path)
if model_config is None:
-raise (
+raise ValueError(
f"unknown model config name: {name_or_path}, and none is found on HuggingFace Hub"
)
return model_config
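Both fixes tighten the lookup-with-fallback flow in get_model_config_by_name: try the built-in registry, then the HuggingFace Hub, and raise a real ValueError naming the model (a bare `raise (...)` on a string would itself fail with a TypeError, since only exceptions can be raised). A schematic sketch of that flow; `model_configs` and `get_model_config_from_hf` here are stand-ins for the library's internals:

```python
def get_model_config_by_name(name_or_path, model_configs, get_model_config_from_hf):
    """Resolve a model config: local registry first, then the HuggingFace Hub."""
    if name_or_path in model_configs:           # known built-in config
        return model_configs[name_or_path]
    model_config = get_model_config_from_hf(name_or_path)  # Hub fallback
    if model_config is None:
        raise ValueError(
            f"unknown model config name: {name_or_path}, and none is found on HuggingFace Hub"
        )
    return model_config
```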
2 changes: 2 additions & 0 deletions llm_analysis/utils.py
@@ -14,6 +14,8 @@


def _num_to_string(num, precision=2, divisor=1024):
+if num is None:
+return None
if num < 0:
sign = '-'
num = -num
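The added guard lets callers format metrics that may legitimately be None (for example, total_training_latency is set to None in the else branch shown above) instead of hitting a TypeError on the `num < 0` comparison. A hypothetical minimal body around the shown signature and guard; the real helper in utils.py handles more cases than this:

```python
def _num_to_string(num, precision=2, divisor=1024):
    """Human-readable magnitude string; passes None through unchanged."""
    if num is None:
        return None  # avoids TypeError on the `num < 0` comparison below
    sign = ''
    if num < 0:
        sign, num = '-', -num
    for unit in ['', 'K', 'M', 'G', 'T']:
        if num < divisor or unit == 'T':
            return f"{sign}{num:.{precision}f}{unit}"
        num /= divisor

print(_num_to_string(None), _num_to_string(3 * 1024**3))  # None 3.00G
```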
16 changes: 8 additions & 8 deletions pyproject.toml
@@ -8,17 +8,17 @@ readme = "README.md"
packages = [{ include = "llm_analysis" }]

[tool.poetry.dependencies]
python = "^3.8"
python = ">=3.8"
fire = "^0.5.0"
-huggingface-hub = "^0.14.1"
-transformers = "^4.28.1"
+# huggingface-hub = "^0.14.1"
+# transformers = "^4.28.1"

[tool.poetry.group.dev.dependencies]
pytest = "^7.3.1"
coverage = { extras = ["toml"], version = "^7.2.5" }
sphinx = "^7.0.0"
sphinx-autodoc-typehints = "^1.23.0"
pytest-cov = "^4.0.0"
pytest = ">=7.3.1"
coverage = { extras = ["toml"], version = ">=7.2.5" }
sphinx = ">=7.0.0"
sphinx-autodoc-typehints = ">=1.23.0"
pytest-cov = ">=4.0.0"

[tool.coverage.run]
omit = [".*", "*/site-packages/*"]
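For reference, Poetry's caret constraint `python = "^3.8"` is shorthand for `>=3.8,<4.0`, so the switch to `">=3.8"` (and the matching change for the dev dependencies) removes the implicit upper bound, while commenting out huggingface-hub and transformers drops them from the installed dependencies.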
