# kempnerforge.config.model
"""Model architecture configuration."""
from __future__ import annotations
import math
from dataclasses import dataclass
from enum import StrEnum

class NormType(StrEnum):
    """Normalization layer variants."""

    rmsnorm = "rmsnorm"
    layernorm = "layernorm"

class Activation(StrEnum):
    """Activation functions for the FFN."""

    silu = "silu"
    gelu = "gelu"
    relu = "relu"
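
# Note: StrEnum members (Python 3.11+) compare equal to their string values,
# e.g. Activation.silu == "silu", so plain strings from YAML/JSON configs
# coerce cleanly via Activation("silu").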

@dataclass
class ModelConfig:
    """Architecture hyperparameters for a transformer model."""

    dim: int = 4096
    n_layers: int = 32
    n_heads: int = 32
    n_kv_heads: int | None = None  # None -> same as n_heads (MHA)
    vocab_size: int = 32000
    ffn_dim_multiplier: float = 1.0
    ffn_hidden_dim: int | None = None  # Override computed hidden dim
    norm_type: NormType = NormType.rmsnorm
    norm_eps: float = 1e-5
    activation: Activation = Activation.silu
    max_seq_len: int = 2048
    rope_theta: float = 10000.0
    tie_embeddings: bool = False
    qk_norm: bool = False  # Apply RMSNorm to Q/K per-head before RoPE (Gemma, DeepSeek-V3)
    init_std: float = 0.02  # Std for weight initialization (GPT-2/Llama default)
    model_type: str = "transformer"  # Registry key for model builder
    # SDPA backend: "auto" lets PyTorch select (recommended). Override to force
    # a specific kernel for benchmarking or debugging.
    sdpa_backend: str = "auto"  # "auto", "flash", "efficient", "cudnn", "math"
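    # A forced backend would typically be applied at the attention call site;
    # a sketch, assuming PyTorch >= 2.3 and F = torch.nn.functional:
    #   from torch.nn.attention import sdpa_kernel, SDPBackend
    #   with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    #       out = F.scaled_dot_product_attention(q, k, v, is_causal=True)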
    # MoE (all defaults produce a dense model -- zero behavior change)
    num_experts: int = 0  # 0 = dense, >0 = MoE
    moe_top_k: int = 2  # experts selected per token
    moe_frequency: int = 1  # MoE every N layers (1=all, 2=alternating)
    moe_router: str = "softmax_topk"  # registry key for router type
    moe_shared_experts: int = 0  # shared experts that process all tokens
    moe_aux_loss_weight: float = 0.01  # aux loss coefficient in training loss
    moe_capacity_factor: float = 0.0  # 0=no drop, >0=cap tokens/expert (e.g. 1.25)
    moe_sequence_aux_loss_weight: float = 0.0  # Sequence-level balance loss (0=off)
    moe_gradient_scale: bool = False  # Per-expert gradient normalization
    moe_bias_schedule: str = "constant"  # "constant", "cosine_decay", "linear_warmup"
    moe_packed_experts: bool = False  # Pack expert weights into one tensor per projection
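
    # Illustrative example: num_experts=8, moe_top_k=2, moe_frequency=2 on a
    # 32-layer model puts MoE in every other layer (16 MoE layers). Each token
    # is routed to 2 of the 8 experts per MoE layer (plus any shared experts),
    # so active MLP compute scales with moe_top_k while total parameter count
    # scales with num_experts.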

    def __post_init__(self) -> None:
        if self.n_kv_heads is None:
            self.n_kv_heads = self.n_heads
        # Positivity checks first (before the divisibility checks below)
        if self.dim <= 0 or self.n_layers <= 0 or self.n_heads <= 0:
            raise ValueError("dim, n_layers, and n_heads must be positive")
        if self.vocab_size <= 0:
            raise ValueError("vocab_size must be positive")
        if self.n_kv_heads <= 0:
            raise ValueError("n_kv_heads must be positive")
        # Divisibility checks
        if self.dim % self.n_heads != 0:
            raise ValueError(f"dim ({self.dim}) must be divisible by n_heads ({self.n_heads})")
        if self.n_heads % self.n_kv_heads != 0:
            raise ValueError(
                f"n_heads ({self.n_heads}) must be divisible by n_kv_heads ({self.n_kv_heads})"
            )
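        # e.g. n_heads=32 with n_kv_heads=8 is grouped-query attention (each
        # KV head shared by 4 query heads); n_kv_heads=1 is multi-query attention.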
        # SDPA backend validation
        if self.sdpa_backend not in ("auto", "flash", "efficient", "cudnn", "math"):
            raise ValueError(
                f"Unknown sdpa_backend: '{self.sdpa_backend}'. "
                "Options: 'auto', 'flash', 'efficient', 'cudnn', 'math'"
            )
        # MoE validation
        if self.num_experts > 0:
            if self.moe_top_k <= 0:
                raise ValueError("moe_top_k must be positive when num_experts > 0")
            if self.moe_top_k > self.num_experts:
                raise ValueError(
                    f"moe_top_k ({self.moe_top_k}) must be <= num_experts ({self.num_experts})"
                )
            if self.moe_frequency <= 0:
                raise ValueError("moe_frequency must be positive")
            if self.moe_sequence_aux_loss_weight < 0:
                raise ValueError("moe_sequence_aux_loss_weight must be non-negative")
            if self.moe_bias_schedule not in ("constant", "cosine_decay", "linear_warmup"):
                raise ValueError(
                    f"Unknown moe_bias_schedule: '{self.moe_bias_schedule}'. "
                    "Options: 'constant', 'cosine_decay', 'linear_warmup'"
                )

    @property
    def is_moe(self) -> bool:
        """Whether this config uses Mixture-of-Experts."""
        return self.num_experts > 0

    @property
    def head_dim(self) -> int:
        """Dimension of each attention head (dim / n_heads)."""
        return self.dim // self.n_heads

    @property
    def computed_ffn_hidden_dim(self) -> int:
        """FFN hidden dimension, rounded up to a multiple of 256 for hardware efficiency."""
        if self.ffn_hidden_dim is not None:
            return self.ffn_hidden_dim
        # Llama-style: 4 * dim * (2/3) * ffn_dim_multiplier, rounded up to a multiple of 256
        raw = int(4 * self.dim * (2 / 3) * self.ffn_dim_multiplier)
        return 256 * math.ceil(raw / 256)
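    # Worked example with the defaults: raw = int(4 * 4096 * 2/3) = 10922,
    # which rounds up to 11008 -- the FFN width of Llama-2 7B.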

    @property
    def num_params_estimate(self) -> int:
        """Rough total parameter count estimate.

        The output projection is excluded when tie_embeddings is True. For MoE
        models, counts all expert parameters (total, not active).
        """
        d = self.dim
        h = self.computed_ffn_hidden_dim
        n_kv = self.n_kv_heads
        head_d = self.head_dim
        # Attention projections per layer: Q, K, V, O (K and V use n_kv_heads)
        attn = d * (self.n_heads * head_d) + 2 * d * (n_kv * head_d) + (self.n_heads * head_d) * d  # type: ignore[reportOptionalOperand]
        mlp = d * h + d * h + h * d  # gate + up + down (SwiGLU has 3 matrices)
        norm = 2 * d  # 2 norms per layer
        if self.is_moe:
            n_moe = sum(1 for i in range(self.n_layers) if (i + 1) % self.moe_frequency == 0)
            n_dense = self.n_layers - n_moe
            router = d * self.num_experts  # gate linear per MoE layer
            shared_mlp = self.moe_shared_experts * mlp
            moe_per_layer = attn + self.num_experts * mlp + router + shared_mlp + norm
            dense_per_layer = attn + mlp + norm
            layer_params = n_moe * moe_per_layer + n_dense * dense_per_layer
        else:
            layer_params = self.n_layers * (attn + mlp + norm)
        embedding = self.vocab_size * d
        output = 0 if self.tie_embeddings else self.vocab_size * d
        final_norm = d
        return layer_params + embedding + output + final_norm
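

if __name__ == "__main__":
    # Minimal usage sketch (illustrative; not part of the library API). The
    # defaults above describe a Llama-7B-like architecture.
    cfg = ModelConfig()
    print(cfg.head_dim)                 # 128
    print(cfg.computed_ffn_hidden_dim)  # 11008
    print(f"{cfg.num_params_estimate / 1e9:.2f}B params")  # ~6.74B, like Llama-2 7B

    # MoE variant: 8 experts, top-2 routing, MoE in every other layer.
    moe = ModelConfig(num_experts=8, moe_top_k=2, moe_frequency=2)
    print(moe.is_moe, f"{moe.num_params_estimate / 1e9:.2f}B total params")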