kempnerforge.config.model

Model architecture configuration.

Classes

Activation

Supported activation functions.

ModelConfig

Architecture hyperparameters for a transformer model.

NormType

Supported normalization layer types.

class kempnerforge.config.model.NormType

Bases: StrEnum

rmsnorm = 'rmsnorm'
layernorm = 'layernorm'
class kempnerforge.config.model.Activation

Bases: StrEnum

silu = 'silu'
gelu = 'gelu'
relu = 'relu'
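
Both enums derive from StrEnum, so members compare equal to their plain string values, which is convenient when populating a config from YAML or JSON. A minimal sketch, assuming kempnerforge is importable:

    from kempnerforge.config.model import Activation, NormType

    # StrEnum members (Python 3.11+) are str instances and compare
    # equal to their string values.
    assert NormType("layernorm") is NormType.layernorm
    assert Activation.silu == "silu"

    print(NormType.rmsnorm)  # prints: rmsnorm
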
class kempnerforge.config.model.ModelConfig

Bases: object

Architecture hyperparameters for a transformer model.

dim: int = 4096
n_layers: int = 32
n_heads: int = 32
n_kv_heads: int | None = None
vocab_size: int = 32000
ffn_dim_multiplier: float = 1.0
ffn_hidden_dim: int | None = None
norm_type: NormType = NormType.rmsnorm
norm_eps: float = 1e-05
activation: Activation = Activation.silu
max_seq_len: int = 2048
rope_theta: float = 10000.0
tie_embeddings: bool = False
qk_norm: bool = False
init_std: float = 0.02
model_type: str = 'transformer'
sdpa_backend: str = 'auto'
num_experts: int = 0
moe_top_k: int = 2
moe_frequency: int = 1
moe_router: str = 'softmax_topk'
moe_shared_experts: int = 0
moe_aux_loss_weight: float = 0.01
moe_capacity_factor: float = 0.0
moe_sequence_aux_loss_weight: float = 0.0
moe_gradient_scale: bool = False
moe_bias_schedule: str = 'constant'
moe_packed_experts: bool = False
property is_moe: bool

Whether this config uses Mixture-of-Experts.

property head_dim: int

Dimension of each attention head.

property computed_ffn_hidden_dim: int

FFN hidden dimension, rounded to the nearest multiple of 256 for hardware efficiency.

property num_params_estimate: int

Rough estimate of the total parameter count (excluding the embedding if tied).

For MoE models, counts all expert parameters (total, not active).
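
The derived properties let you sanity-check a configuration before building a model. A minimal sketch, assuming the defaults documented above (the exact FFN sizing formula is internal, so only the multiple-of-256 rounding is shown):

    from kempnerforge.config.model import ModelConfig

    cfg = ModelConfig(dim=1024, n_layers=8, n_heads=16, vocab_size=32000)

    print(cfg.head_dim)                 # per-head dimension
    print(cfg.is_moe)                   # expected False: num_experts defaults to 0
    print(cfg.computed_ffn_hidden_dim)  # a multiple of 256
    print(cfg.num_params_estimate)      # rough total parameter count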

__init__(dim=4096, n_layers=32, n_heads=32, n_kv_heads=None, vocab_size=32000, ffn_dim_multiplier=1.0, ffn_hidden_dim=None, norm_type=NormType.rmsnorm, norm_eps=1e-05, activation=Activation.silu, max_seq_len=2048, rope_theta=10000.0, tie_embeddings=False, qk_norm=False, init_std=0.02, model_type='transformer', sdpa_backend='auto', num_experts=0, moe_top_k=2, moe_frequency=1, moe_router='softmax_topk', moe_shared_experts=0, moe_aux_loss_weight=0.01, moe_capacity_factor=0.0, moe_sequence_aux_loss_weight=0.0, moe_gradient_scale=False, moe_bias_schedule='constant', moe_packed_experts=False)
Parameters:
  • dim (int)

  • n_layers (int)

  • n_heads (int)

  • n_kv_heads (int | None)

  • vocab_size (int)

  • ffn_dim_multiplier (float)

  • ffn_hidden_dim (int | None)

  • norm_type (NormType)

  • norm_eps (float)

  • activation (Activation)

  • max_seq_len (int)

  • rope_theta (float)

  • tie_embeddings (bool)

  • qk_norm (bool)

  • init_std (float)

  • model_type (str)

  • sdpa_backend (str)

  • num_experts (int)

  • moe_top_k (int)

  • moe_frequency (int)

  • moe_router (str)

  • moe_shared_experts (int)

  • moe_aux_loss_weight (float)

  • moe_capacity_factor (float)

  • moe_sequence_aux_loss_weight (float)

  • moe_gradient_scale (bool)

  • moe_bias_schedule (str)

  • moe_packed_experts (bool)

Return type:

None
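
Judging by the defaults, Mixture-of-Experts is toggled via num_experts, with the remaining moe_* fields tuning routing and load balancing. A hedged sketch, with field semantics inferred from the names and defaults above:

    from kempnerforge.config.model import ModelConfig

    moe_cfg = ModelConfig(
        dim=1024,
        n_layers=8,
        n_heads=16,
        num_experts=8,    # presumably switches the FFN blocks to MoE
        moe_top_k=2,      # experts activated per token
        moe_frequency=1,  # presumably: every layer uses an MoE FFN
    )

    print(moe_cfg.is_moe)               # expected True once num_experts > 0
    print(moe_cfg.num_params_estimate)  # counts all experts (total, not active)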