"""Model FLOPs Utilization (MFU) computation.
Implements the PaLM paper formula for estimating achieved FLOPS relative
to hardware peak, with auto-detection of GPU capabilities.
MFU = achieved_tflops / peak_tflops
Where:
model_flops_per_token = 6*P + 12*L*D*S (forward + backward)
achieved_tflops = model_flops_per_token * tokens_per_sec / 1e12
"""

from __future__ import annotations

import logging

import torch

from kempnerforge.config.schema import ModelConfig

logger = logging.getLogger(__name__)

# Peak bf16 TFLOPS for common GPU types (per GPU)
# Source: NVIDIA specs (dense tensor core throughput)
_GPU_PEAK_TFLOPS: dict[str, float] = {
    # H-series
    "H200": 989.0,
    "H100": 989.0,
    "H100 SXM": 989.0,
    "H100 PCIe": 756.0,
    "H800": 989.0,
    # A-series
    "A100": 312.0,
    "A100 SXM": 312.0,
    "A100 PCIe": 312.0,
    "A100-SXM4-80GB": 312.0,
    "A100-SXM4-40GB": 312.0,
    "A100-PCIE-80GB": 312.0,
    "A100-PCIE-40GB": 312.0,
    # Consumer / other
    "A10G": 125.0,
    "L40S": 362.0,
    "RTX 4090": 330.0,
    "RTX 3090": 142.0,
}


def get_gpu_peak_tflops(device: int = 0) -> float:
    """Auto-detect GPU peak bf16 TFLOPS.

    Tries to match the GPU name against known models. Falls back to a
    conservative estimate based on compute capability.

    Args:
        device: CUDA device index.

    Returns:
        Peak bf16 TFLOPS for this GPU.
    """
    if not torch.cuda.is_available():
        return 1.0  # dummy value for CPU-only environments

    name = torch.cuda.get_device_name(device)

    # Substring-match against known models, preferring the longest matching
    # key so that e.g. "H100 PCIe" (756) beats the bare "H100" entry (989).
    matches = {k: v for k, v in _GPU_PEAK_TFLOPS.items() if k in name}
    if matches:
        tflops = matches[max(matches, key=len)]
        logger.info(f"Detected GPU: {name} → {tflops} bf16 TFLOPS")
        return tflops
    # Fallback: estimate from compute capability
    major, minor = torch.cuda.get_device_capability(device)
    if major >= 9:
        # Hopper-class
        tflops = 989.0
    elif major >= 8:
        # Ampere-class
        tflops = 312.0
    else:
        tflops = 100.0
    logger.warning(
        f"Unknown GPU: {name} (cc {major}.{minor}). "
        f"Using estimated {tflops} bf16 TFLOPS. "
        f"Add this GPU to _GPU_PEAK_TFLOPS for accuracy."
    )
    return tflops
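
# Example (hypothetical device strings): torch.cuda.get_device_name() returns
# names like "NVIDIA H100 80GB HBM3", which matches only the "H100" entry
# (989.0), while "NVIDIA H100 PCIe" matches both "H100" and "H100 PCIe" and
# the longest-match rule above resolves it to "H100 PCIe" (756.0).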


def estimate_model_flops_per_token(config: ModelConfig, seq_len: int | None = None) -> int:
    """Estimate FLOPs per token for the forward + backward pass.

    Uses the PaLM paper approximation: ``6*P + 12*L*D*S``.

    For MoE models, only active params are counted (the top_k experts per
    layer, not all experts). The embedding is excluded (a table lookup, not
    a matmul); the output projection is included.

    The ``12*L*D*S`` attention term does not discount GQA: FlashAttention
    expands GQA internally, so the hardware performs full attention compute.
    Router FLOPs (dim × num_experts) are intentionally omitted; they are
    negligible.

    Args:
        config: Model configuration.
        seq_len: Actual training sequence length. Falls back to
            ``config.max_seq_len`` if not provided.

    Returns:
        Estimated FLOPs per token.
    """
    s = seq_len if seq_len is not None else config.max_seq_len
    if config.is_moe:
        return _moe_flops_per_token(config, s)
    return _dense_flops_per_token(config, s)


def _dense_flops_per_token(config: ModelConfig, seq_len: int) -> int:
    head_dim = config.head_dim
    attn_params = (
        config.dim * (config.n_heads * head_dim)  # Q
        + 2 * config.dim * (config.n_kv_heads * head_dim)  # type: ignore[reportOptionalOperand]  # K+V
        + (config.n_heads * head_dim) * config.dim  # O
    )
    mlp_params = 3 * config.dim * config.computed_ffn_hidden_dim  # SwiGLU
    per_layer = attn_params + mlp_params
    output_params = config.vocab_size * config.dim
    active_params = config.n_layers * per_layer + output_params
    return 6 * active_params + 12 * config.n_layers * config.dim * seq_len
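
# Worked example (illustrative shapes, roughly Llama-2-7B; not from the
# source): dim=4096, n_heads=n_kv_heads=32, head_dim=128, ffn_hidden=11008,
# n_layers=32, vocab_size=32000, seq_len=4096 gives
#   attn_params   = 4 * 4096**2                        ≈ 6.71e7 per layer
#   mlp_params    = 3 * 4096 * 11008                   ≈ 1.35e8 per layer
#   active_params = 32 * (attn + mlp) + 32000 * 4096   ≈ 6.61e9
#   flops/token   = 6 * 6.61e9 + 12 * 32 * 4096 * 4096 ≈ 4.61e10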


def _moe_flops_per_token(config: ModelConfig, seq_len: int) -> int:
    head_dim = config.head_dim
    attn_params = (
        config.dim * (config.n_heads * head_dim)  # Q
        + 2 * config.dim * (config.n_kv_heads * head_dim)  # type: ignore[reportOptionalOperand]  # K+V
        + (config.n_heads * head_dim) * config.dim  # O
    )
    mlp_params = 3 * config.dim * config.computed_ffn_hidden_dim  # per-expert SwiGLU
    # Every moe_frequency-th layer is MoE; the rest use a dense MLP.
    n_moe_layers = sum(1 for i in range(config.n_layers) if (i + 1) % config.moe_frequency == 0)
    n_dense_layers = config.n_layers - n_moe_layers
    dense_active = n_dense_layers * (attn_params + mlp_params)
    shared_mlp = config.moe_shared_experts * mlp_params
    moe_active = n_moe_layers * (attn_params + config.moe_top_k * mlp_params + shared_mlp)
    output_params = config.vocab_size * config.dim
    active_params = dense_active + moe_active + output_params
    return 6 * active_params + 12 * config.n_layers * config.dim * seq_len
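
# Example (illustrative): with n_layers=24 and moe_frequency=2, layers
# 2, 4, ..., 24 are MoE (n_moe_layers=12, n_dense_layers=12); with
# moe_top_k=2, each token pays for 2 expert MLPs per MoE layer, plus
# moe_shared_experts always-active shared MLPs.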


def compute_mfu(
    config: ModelConfig,
    tokens_per_sec: float,
    num_gpus: int = 1,
    gpu_peak_tflops: float | None = None,
    seq_len: int | None = None,
) -> float:
    """Compute Model FLOPs Utilization.

    Args:
        config: Model configuration.
        tokens_per_sec: Global throughput (tokens/sec across all GPUs).
        num_gpus: Number of GPUs.
        gpu_peak_tflops: Peak bf16 TFLOPS per GPU. Auto-detected if None.
        seq_len: Actual training sequence length for the attention FLOPs
            term. Falls back to ``config.max_seq_len`` if not provided.

    Returns:
        MFU as a fraction (0.0 to 1.0).
    """
    if gpu_peak_tflops is None:
        gpu_peak_tflops = get_gpu_peak_tflops()
    flops_per_token = estimate_model_flops_per_token(config, seq_len=seq_len)
    achieved_tflops = flops_per_token * tokens_per_sec / 1e12
    peak_tflops = gpu_peak_tflops * num_gpus
    if peak_tflops == 0:
        return 0.0
    return achieved_tflops / peak_tflops
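

if __name__ == "__main__":
    # Minimal smoke test of the MFU arithmetic with illustrative numbers
    # (the Llama-2-7B-like worked example above). This deliberately avoids
    # constructing a ModelConfig, so it is a sketch of the formula rather
    # than a real training readout; all inputs below are assumed values.
    flops_per_token = 46_084_915_200  # ≈ 4.61e10, from the worked example
    tokens_per_sec = 80_000.0  # assumed global throughput across all GPUs
    num_gpus = 8
    peak_tflops = 989.0 * num_gpus  # 8x H100 SXM, bf16 dense
    achieved_tflops = flops_per_token * tokens_per_sec / 1e12
    print(f"MFU = {achieved_tflops / peak_tflops:.3f}")  # ≈ 0.466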