Source code for kempnerforge.metrics.mfu

"""Model FLOPs Utilization (MFU) computation.

Implements the PaLM paper formula for estimating achieved FLOPS relative
to hardware peak, with auto-detection of GPU capabilities.

MFU = achieved_tflops / peak_tflops

Where (P = parameter count, L = n_layers, D = model dim, S = sequence length):
  model_flops_per_token = 6*P + 12*L*D*S  (forward + backward)
  achieved_tflops = model_flops_per_token * tokens_per_sec / 1e12
"""

from __future__ import annotations

import logging

import torch

from kempnerforge.config.schema import ModelConfig

logger = logging.getLogger(__name__)

# Peak bf16 TFLOPS for common GPU types (per GPU)
# Source: NVIDIA specs (dense tensor core throughput)
_GPU_PEAK_TFLOPS: dict[str, float] = {
    # H-series
    "H200": 989.0,
    "H100": 989.0,
    "H100 SXM": 989.0,
    "H100 PCIe": 756.0,
    "H800": 989.0,
    # A-series
    "A100": 312.0,
    "A100 SXM": 312.0,
    "A100 PCIe": 312.0,
    "A100-SXM4-80GB": 312.0,
    "A100-SXM4-40GB": 312.0,
    "A100-PCIE-80GB": 312.0,
    "A100-PCIE-40GB": 312.0,
    # Consumer / other
    "A10G": 125.0,
    "L40S": 362.0,
    "RTX 4090": 330.0,
    "RTX 3090": 142.0,
}


def get_gpu_peak_tflops(device: int = 0) -> float:
    """Auto-detect GPU peak bf16 TFLOPS.

    Tries to match the GPU name against known models. Falls back to a
    conservative estimate based on compute capability.

    Args:
        device: CUDA device index.

    Returns:
        Peak bf16 TFLOPS for this GPU.
    """
    if not torch.cuda.is_available():
        return 1.0  # dummy for CPU-only

    name = torch.cuda.get_device_name(device)

    # Try substring matches, longest keys first so that specific entries
    # like "H100 PCIe" win over the generic "H100" entry.
    for gpu_name, tflops in sorted(
        _GPU_PEAK_TFLOPS.items(), key=lambda kv: len(kv[0]), reverse=True
    ):
        if gpu_name in name:
            logger.info(f"Detected GPU: {name} → {tflops} bf16 TFLOPS")
            return tflops

    # Fallback: estimate from compute capability
    major, minor = torch.cuda.get_device_capability(device)
    if major >= 9:  # Hopper-class
        tflops = 989.0
    elif major >= 8:  # Ampere-class
        tflops = 312.0
    else:
        tflops = 100.0

    logger.warning(
        f"Unknown GPU: {name} (cc {major}.{minor}). "
        f"Using estimated {tflops} bf16 TFLOPS. "
        f"Add this GPU to _GPU_PEAK_TFLOPS for accuracy."
    )
    return tflops
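

# Fallback sketch (for illustration): an unlisted Ada-generation card reports
# compute capability 8.9, so it lands in the `major >= 8` branch and receives
# the Ampere-class estimate of 312.0; the warning above nudges users to add a
# proper _GPU_PEAK_TFLOPS entry instead of relying on this guess.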


def estimate_model_flops_per_token(config: ModelConfig, seq_len: int | None = None) -> int:
    """Estimate FLOPS per token for forward + backward pass.

    Uses the PaLM paper approximation: ``6*P + 12*L*D*S``

    For MoE: uses active params (top_k experts per layer, not all experts).
    Excludes embedding (table lookup, not matmul). Includes output projection.

    The 12*L*D*S attention term does not discount GQA — FlashAttention
    expands GQA internally, so the hardware performs full attention compute.

    Router FLOPS (dim × num_experts) are intentionally omitted — negligible.

    Args:
        config: Model configuration.
        seq_len: Actual training sequence length. Falls back to
            config.max_seq_len if not provided.

    Returns:
        Estimated FLOPS per token.
    """
    s = seq_len if seq_len is not None else config.max_seq_len
    if config.is_moe:
        return _moe_flops_per_token(config, s)
    return _dense_flops_per_token(config, s)
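

# Illustrative parameter count feeding the formula (hypothetical values, not
# a shipped config): with dim=2048, n_heads=16, n_kv_heads=16, head_dim=128,
# ffn_hidden=8192, n_layers=16, vocab_size=32000:
#   attn_params ≈ 4 * 2048 * 2048 ≈ 16.8M per layer
#   mlp_params  = 3 * 2048 * 8192 ≈ 50.3M per layer
#   P ≈ 16 * 67.1M + 32000 * 2048 ≈ 1.14e9 active params
# which is the P used in the module-level example above.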


def _dense_flops_per_token(config: ModelConfig, seq_len: int) -> int:
    head_dim = config.head_dim
    attn_params = (
        config.dim * (config.n_heads * head_dim)  # Q
        + 2 * config.dim * (config.n_kv_heads * head_dim)  # type: ignore[reportOptionalOperand]  # K+V
        + (config.n_heads * head_dim) * config.dim  # O
    )
    mlp_params = 3 * config.dim * config.computed_ffn_hidden_dim  # SwiGLU
    per_layer = attn_params + mlp_params
    output_params = config.vocab_size * config.dim
    active_params = config.n_layers * per_layer + output_params
    return 6 * active_params + 12 * config.n_layers * config.dim * seq_len


def _moe_flops_per_token(config: ModelConfig, seq_len: int) -> int:
    head_dim = config.head_dim
    attn_params = (
        config.dim * (config.n_heads * head_dim)
        + 2 * config.dim * (config.n_kv_heads * head_dim)  # type: ignore[reportOptionalOperand]
        + (config.n_heads * head_dim) * config.dim
    )
    mlp_params = 3 * config.dim * config.computed_ffn_hidden_dim
    n_moe_layers = sum(
        1 for i in range(config.n_layers) if (i + 1) % config.moe_frequency == 0
    )
    n_dense_layers = config.n_layers - n_moe_layers
    dense_active = n_dense_layers * (attn_params + mlp_params)
    shared_mlp = config.moe_shared_experts * mlp_params
    moe_active = n_moe_layers * (attn_params + config.moe_top_k * mlp_params + shared_mlp)
    output_params = config.vocab_size * config.dim
    active_params = dense_active + moe_active + output_params
    return 6 * active_params + 12 * config.n_layers * config.dim * seq_len
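

# MoE layer counting, worked through (hypothetical values): with n_layers = 12
# and moe_frequency = 3, layers 3, 6, 9, and 12 are MoE (4 layers), leaving 8
# dense layers. With moe_top_k = 2 and moe_shared_experts = 1, each MoE layer
# contributes attn_params + 3 * mlp_params of *active* compute per token,
# independent of the total number of experts.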


def compute_mfu(
    config: ModelConfig,
    tokens_per_sec: float,
    num_gpus: int = 1,
    gpu_peak_tflops: float | None = None,
    seq_len: int | None = None,
) -> float:
    """Compute Model FLOPs Utilization.

    Args:
        config: Model configuration.
        tokens_per_sec: Global throughput (tokens/sec across all GPUs).
        num_gpus: Number of GPUs.
        gpu_peak_tflops: Peak bf16 TFLOPS per GPU. Auto-detected if None.
        seq_len: Actual training sequence length for attention FLOPS.
            Falls back to config.max_seq_len if not provided.

    Returns:
        MFU as a fraction (0.0 to 1.0).
    """
    if gpu_peak_tflops is None:
        gpu_peak_tflops = get_gpu_peak_tflops()

    flops_per_token = estimate_model_flops_per_token(config, seq_len=seq_len)
    achieved_tflops = flops_per_token * tokens_per_sec / 1e12
    peak_tflops = gpu_peak_tflops * num_gpus

    if peak_tflops == 0:
        return 0.0
    return achieved_tflops / peak_tflops
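

# Example usage (a minimal sketch; `cfg` stands in for a real ModelConfig and
# the throughput figure is hypothetical):
#
#     from kempnerforge.metrics.mfu import compute_mfu
#
#     mfu = compute_mfu(cfg, tokens_per_sec=250_000, num_gpus=8, seq_len=4096)
#     print(f"MFU: {mfu:.1%}")
#
# tokens_per_sec is the global throughput across all GPUs, which is why
# peak_tflops is scaled by num_gpus before the final division.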