kempnerforge.config

Configuration system for KempnerForge.

class kempnerforge.config.CheckpointConfig[source]

Bases: object

Checkpointing settings.

dir: str = 'checkpoints'
interval: int = 1000
async_mode: AsyncCheckpointMode = 'disabled'
keep_last_n: int = 3
load_path: str | None = None
export_dtype: Literal['float32', 'bfloat16'] = 'bfloat16'
exclude_from_loading: list[str]
__init__(dir='checkpoints', interval=1000, async_mode=AsyncCheckpointMode.disabled, keep_last_n=3, load_path=None, export_dtype='bfloat16', exclude_from_loading=<factory>)
Return type:

None
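
A minimal usage sketch (not taken from the library's own examples): only the field names come from this page, the values and the resume path are illustrative.

    from kempnerforge.config import CheckpointConfig

    ckpt = CheckpointConfig(
        dir="checkpoints",
        interval=500,                        # save every 500 steps
        keep_last_n=5,                       # keep only the 5 most recent checkpoints
        load_path="checkpoints/step_10000",  # illustrative resume path
        export_dtype="bfloat16",
    )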

class kempnerforge.config.DataConfig[source]

Bases: object

Data pipeline settings.

dataset_path: str = ''
file_pattern: str = '*.npy'
tokenizer_path: str = ''
num_workers: int = 4
pin_memory: bool = True
prefetch_factor: int = 2
hf_dataset_name: str | None = None
hf_dataset_config: str | None = None
hf_dataset_split: str = 'train'
hf_dataset_text_field: str = 'text'
hf_streaming: bool = False
pack_sequences: bool = False
datasets: list[DatasetSource]
mix_temperature: float = 1.0
phases: list[TrainingPhase]
anneal_start_step: int = 0
anneal_weights: dict[str, float]
__init__(dataset_path='', file_pattern='*.npy', tokenizer_path='', num_workers=4, pin_memory=True, prefetch_factor=2, hf_dataset_name=None, hf_dataset_config=None, hf_dataset_split='train', hf_dataset_text_field='text', hf_streaming=False, pack_sequences=False, datasets=<factory>, mix_temperature=1.0, phases=<factory>, anneal_start_step=0, anneal_weights=<factory>)
Return type:

None
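
Two hedged sketches of how these fields might be combined, one for pre-tokenized local .npy shards and one for a streamed Hugging Face dataset. The paths and dataset name are placeholders, not values the library expects.

    from kempnerforge.config import DataConfig

    # Local, pre-tokenized shards (placeholder path).
    local_data = DataConfig(
        dataset_path="/data/tokens",
        file_pattern="*.npy",
        num_workers=8,
    )

    # Streaming a Hugging Face text dataset (placeholder dataset name).
    hf_data = DataConfig(
        hf_dataset_name="my-org/my-corpus",
        hf_dataset_split="train",
        hf_dataset_text_field="text",
        hf_streaming=True,
        tokenizer_path="/models/tokenizer",
        pack_sequences=True,
    )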

class kempnerforge.config.DistributedConfig[source]

Bases: object

Parallelism dimensions and distributed settings.

dp_shard: int = -1
dp_replicate: int = 1
tp: int = 1
pp: int = 1
pp_schedule: PipelineSchedule = '1f1b'
cp: int = 1
ep: int = 1
nccl_timeout_sec: int = 1800
backend: str = 'cpu:gloo,cuda:nccl'
validate_world_size(world_size)[source]

Validate that parallelism dimensions match world size.

Parameters:

world_size (int)

Return type:

None

resolve(world_size)[source]

Return a copy with dp_shard resolved to a concrete value.

Parameters:

world_size (int)

Return type:

DistributedConfig

__init__(dp_shard=-1, dp_replicate=1, tp=1, pp=1, pp_schedule=PipelineSchedule.schedule_1f1b, cp=1, ep=1, nccl_timeout_sec=1800, backend='cpu:gloo,cuda:nccl')
Return type:

None
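
A sketch of resolving the dp_shard=-1 placeholder against a concrete world size. This page only says resolve() returns a copy with a concrete value; the assumption that -1 means "infer from the remaining ranks" (so 16 ranks with tp=2 would give dp_shard=8) is the usual convention, not something stated here.

    from kempnerforge.config import DistributedConfig

    dist = DistributedConfig(dp_shard=-1, tp=2)

    world_size = 16                           # example rank count
    resolved = dist.resolve(world_size)       # copy with dp_shard made concrete
    resolved.validate_world_size(world_size)  # assumed to raise if dimensions don't match
    print(resolved.dp_shard)                  # presumably 8 under the assumption above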

class kempnerforge.config.EvalConfig[source]

Bases: object

Evaluation pipeline settings (disabled by default).

enabled: bool = False
interval: int = 1000
steps: int = 50
dataset_path: str = ''
file_pattern: str = '*.npy'
hf_dataset_name: str | None = None
hf_dataset_config: str | None = None
hf_dataset_split: str = 'validation'
__init__(enabled=False, interval=1000, steps=50, dataset_path='', file_pattern='*.npy', hf_dataset_name=None, hf_dataset_config=None, hf_dataset_split='validation')
Parameters:
  • enabled (bool)

  • interval (int)

  • steps (int)

  • dataset_path (str)

  • file_pattern (str)

  • hf_dataset_name (str | None)

  • hf_dataset_config (str | None)

  • hf_dataset_split (str)

Return type:

None
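
A sketch enabling periodic held-out evaluation; the dataset name is a placeholder.

    from kempnerforge.config import EvalConfig

    eval_cfg = EvalConfig(
        enabled=True,
        interval=2000,                       # evaluate every 2000 training steps
        steps=100,                           # evaluation batches per pass
        hf_dataset_name="my-org/my-corpus",  # placeholder
        hf_dataset_split="validation",
    )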

class kempnerforge.config.JobConfig[source]

Bases: object

Top-level configuration aggregating all sub-configs.

model: ModelConfig
train: TrainConfig
optimizer: OptimizerConfig
scheduler: SchedulerConfig
data: DataConfig
eval: EvalConfig
distributed: DistributedConfig
checkpoint: CheckpointConfig
metrics: MetricsConfig
profiling: ProfilingConfig
validate(world_size=1)[source]

Run cross-config validations.

Parameters:

world_size (int)

Return type:

None

__init__(model=<factory>, train=<factory>, optimizer=<factory>, scheduler=<factory>, data=<factory>, eval=<factory>, distributed=<factory>, checkpoint=<factory>, metrics=<factory>, profiling=<factory>)
Return type:

None
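
Because every field has a factory default, a JobConfig can be built directly in Python and cross-validated; a sketch with illustrative values:

    from kempnerforge.config import JobConfig, ModelConfig, TrainConfig

    job = JobConfig(
        model=ModelConfig(dim=2048, n_layers=16, n_heads=16),
        train=TrainConfig(batch_size=16, max_steps=10_000),
    )
    # Cross-config checks (e.g. parallelism vs. world size); pass the real
    # rank count once the distributed environment is known.
    job.validate(world_size=1)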

class kempnerforge.config.MetricsConfig[source]

Bases: object

Logging and metrics settings.

log_interval: int = 10
enable_wandb: bool = False
enable_tensorboard: bool = False
wandb_project: str = 'kempnerforge'
wandb_run_name: str | None = None
wandb_run_id: str = ''
tensorboard_dir: str = 'tb_logs'
__init__(log_interval=10, enable_wandb=False, enable_tensorboard=False, wandb_project='kempnerforge', wandb_run_name=None, wandb_run_id='', tensorboard_dir='tb_logs')
Parameters:
  • log_interval (int)

  • enable_wandb (bool)

  • enable_tensorboard (bool)

  • wandb_project (str)

  • wandb_run_name (str | None)

  • wandb_run_id (str)

  • tensorboard_dir (str)

Return type:

None
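
A sketch turning on both Weights & Biases and TensorBoard logging; the run name is a placeholder.

    from kempnerforge.config import MetricsConfig

    metrics = MetricsConfig(
        log_interval=20,
        enable_wandb=True,
        wandb_project="kempnerforge",
        wandb_run_name="baseline-run",   # placeholder
        enable_tensorboard=True,
        tensorboard_dir="tb_logs",
    )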

class kempnerforge.config.ModelConfig[source]

Bases: object

Architecture hyperparameters for a transformer model.

dim: int = 4096
n_layers: int = 32
n_heads: int = 32
n_kv_heads: int | None = None
vocab_size: int = 32000
ffn_dim_multiplier: float = 1.0
ffn_hidden_dim: int | None = None
norm_type: NormType = 'rmsnorm'
norm_eps: float = 1e-05
activation: Activation = 'silu'
max_seq_len: int = 2048
rope_theta: float = 10000.0
tie_embeddings: bool = False
qk_norm: bool = False
init_std: float = 0.02
model_type: str = 'transformer'
sdpa_backend: str = 'auto'
num_experts: int = 0
moe_top_k: int = 2
moe_frequency: int = 1
moe_router: str = 'softmax_topk'
moe_shared_experts: int = 0
moe_aux_loss_weight: float = 0.01
moe_capacity_factor: float = 0.0
moe_sequence_aux_loss_weight: float = 0.0
moe_gradient_scale: bool = False
moe_bias_schedule: str = 'constant'
moe_packed_experts: bool = False
property is_moe: bool

Whether this config uses Mixture-of-Experts.

property head_dim: int
property computed_ffn_hidden_dim: int

FFN hidden dimension, rounded to the nearest multiple of 256 for hardware efficiency.

property num_params_estimate: int

Rough total parameter count estimate (excluding embedding if tied).

For MoE models, counts all expert parameters (total, not active).

__init__(dim=4096, n_layers=32, n_heads=32, n_kv_heads=None, vocab_size=32000, ffn_dim_multiplier=1.0, ffn_hidden_dim=None, norm_type=NormType.rmsnorm, norm_eps=1e-05, activation=Activation.silu, max_seq_len=2048, rope_theta=10000.0, tie_embeddings=False, qk_norm=False, init_std=0.02, model_type='transformer', sdpa_backend='auto', num_experts=0, moe_top_k=2, moe_frequency=1, moe_router='softmax_topk', moe_shared_experts=0, moe_aux_loss_weight=0.01, moe_capacity_factor=0.0, moe_sequence_aux_loss_weight=0.0, moe_gradient_scale=False, moe_bias_schedule='constant', moe_packed_experts=False)
Parameters:
  • dim (int)

  • n_layers (int)

  • n_heads (int)

  • n_kv_heads (int | None)

  • vocab_size (int)

  • ffn_dim_multiplier (float)

  • ffn_hidden_dim (int | None)

  • norm_type (NormType)

  • norm_eps (float)

  • activation (Activation)

  • max_seq_len (int)

  • rope_theta (float)

  • tie_embeddings (bool)

  • qk_norm (bool)

  • init_std (float)

  • model_type (str)

  • sdpa_backend (str)

  • num_experts (int)

  • moe_top_k (int)

  • moe_frequency (int)

  • moe_router (str)

  • moe_shared_experts (int)

  • moe_aux_loss_weight (float)

  • moe_capacity_factor (float)

  • moe_sequence_aux_loss_weight (float)

  • moe_gradient_scale (bool)

  • moe_bias_schedule (str)

  • moe_packed_experts (bool)

Return type:

None
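
A sketch of a small dense configuration and its derived properties. The exact numbers depend on formulas this page only summarizes (head_dim is presumably dim // n_heads, and the FFN width is rounded to a multiple of 256), so they are printed rather than asserted.

    from kempnerforge.config import ModelConfig

    model = ModelConfig(
        dim=1024,
        n_layers=12,
        n_heads=16,
        vocab_size=32000,
        max_seq_len=2048,
    )
    print(model.is_moe)                   # False, since num_experts defaults to 0
    print(model.head_dim)                 # presumably 1024 // 16 = 64
    print(model.computed_ffn_hidden_dim)  # rounded to a multiple of 256
    print(model.num_params_estimate)      # rough total parameter count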

class kempnerforge.config.OptimizerConfig[source]

Bases: object

Optimizer settings.

name: str = 'adamw'
lr: float = 0.0003
weight_decay: float = 0.1
betas: tuple[float, float] = (0.9, 0.95)
eps: float = 1e-08
fused: bool = True
muon_momentum: float = 0.95
muon_ns_steps: int = 5
muon_adam_lr: float | None = None
schedule_free_warmup_steps: int = 0
__init__(name='adamw', lr=0.0003, weight_decay=0.1, betas=(0.9, 0.95), eps=1e-08, fused=True, muon_momentum=0.95, muon_ns_steps=5, muon_adam_lr=None, schedule_free_warmup_steps=0)
Return type:

None
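
A sketch mirroring the documented AdamW defaults with a lower learning rate. The muon_* and schedule_free_* fields presumably only take effect when the corresponding optimizer name is selected; the set of accepted names is not listed on this page.

    from kempnerforge.config import OptimizerConfig

    opt = OptimizerConfig(
        name="adamw",
        lr=1e-4,                 # illustrative value
        weight_decay=0.1,
        betas=(0.9, 0.95),
        fused=True,
    )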

class kempnerforge.config.ProfilingConfig[source]

Bases: object

Performance profiling settings.

enable: bool = False
start_step: int = 5
end_step: int = 8
trace_dir: str = 'profiler_traces'
__init__(enable=False, start_step=5, end_step=8, trace_dir='profiler_traces')
Return type:

None
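
A sketch capturing a short profiler window a few steps into training, once kernels and data loading have warmed up.

    from kempnerforge.config import ProfilingConfig

    profiling = ProfilingConfig(
        enable=True,
        start_step=10,               # skip the first steps while caches warm up
        end_step=13,                 # a short window keeps the trace small
        trace_dir="profiler_traces",
    )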

class kempnerforge.config.SchedulerConfig[source]

Bases: object

Learning rate schedule settings.

name: SchedulerType = 'cosine'
warmup_steps: int = 2000
decay_steps: int | None = None
min_lr_ratio: float = 0.1
stable_steps: int | None = None
wsd_decay_type: str = 'cosine'
rex_alpha: float = 1.0
__init__(name=SchedulerType.cosine, warmup_steps=2000, decay_steps=None, min_lr_ratio=0.1, stable_steps=None, wsd_decay_type='cosine', rex_alpha=1.0)
Return type:

None
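
A sketch of the default cosine schedule with a longer warmup. The interpretation of min_lr_ratio as the final LR expressed as a fraction of the peak LR is assumed, not stated on this page.

    from kempnerforge.config import SchedulerConfig

    sched = SchedulerConfig(
        warmup_steps=4000,       # warmup length; the exact warmup shape is not stated here
        decay_steps=100_000,     # illustrative decay horizon
        min_lr_ratio=0.1,        # assumed: floor at 10% of the peak LR
    )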

class kempnerforge.config.TrainConfig[source]

Bases: object

Training hyperparameters.

batch_size: int = 8
seq_len: int = 2048
max_steps: int = 100000
grad_accum_steps: int = 1
grad_clip_norm: float = 1.0
seed: int = 42
compile_model: bool = True
mixed_precision: Literal['bf16', 'fp16', 'fp32', 'fp8'] = 'bf16'
activation_checkpointing: ActivationCheckpointing = 'none'
loss_fn: str = 'cross_entropy'
z_loss_weight: float = 0.0
ce_chunk_size: int = 0
shutdown_timeout_sec: float = 600.0
nccl_health_check_interval: int = 0
property param_dtype: torch.dtype

Resolve mixed_precision to the master weight dtype.

FP8 uses bf16 master weights – FP8 is a compute mode, not a storage dtype.

property is_fp8: bool

Whether FP8 mixed precision is enabled.

__init__(batch_size=8, seq_len=2048, max_steps=100000, grad_accum_steps=1, grad_clip_norm=1.0, seed=42, compile_model=True, mixed_precision='bf16', activation_checkpointing=ActivationCheckpointing.none, loss_fn='cross_entropy', z_loss_weight=0.0, ce_chunk_size=0, shutdown_timeout_sec=600.0, nccl_health_check_interval=0)
Return type:

None
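
A sketch of the mixed-precision properties; per the property docs above, selecting 'fp8' still yields bf16 master weights.

    import torch

    from kempnerforge.config import TrainConfig

    train = TrainConfig(batch_size=16, seq_len=4096, mixed_precision="fp8")
    print(train.is_fp8)                         # True
    print(train.param_dtype is torch.bfloat16)  # True: FP8 is a compute mode, not a storage dtype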

kempnerforge.config.load_config(config_path=None, cli_args=None)[source]

Load a JobConfig from an optional TOML file plus CLI overrides.

The returned config has all sub-config __post_init__ validations applied. Cross-config validation (e.g., parallelism vs world_size) requires calling config.validate(world_size=…) separately at distributed setup time.

Parameters:
  • config_path (str | Path | None) – Path to a TOML config file (or None for defaults).

  • cli_args (list[str] | None) – CLI arguments to parse (defaults to sys.argv[1:]).

Returns:

A JobConfig with layered defaults → TOML → CLI overrides.

Return type:

JobConfig
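
A sketch of the layering described above. The assumption that TOML tables are named after the JobConfig fields ([model], [train], and so on) is not stated on this page; cli_args=[] is passed explicitly so the example does not pick up sys.argv.

    from pathlib import Path

    from kempnerforge.config import load_config

    # Table names assumed to mirror JobConfig field names.
    Path("job.toml").write_text(
        "[model]\n"
        "dim = 1024\n"
        "n_layers = 12\n"
        "\n"
        "[train]\n"
        "max_steps = 5000\n"
        "batch_size = 16\n"
    )

    cfg = load_config("job.toml", cli_args=[])   # defaults, then TOML (no CLI overrides here)
    print(cfg.model.dim, cfg.train.max_steps)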

Modules

  • checkpoint: Checkpoint configuration.

  • data: Data pipeline configuration.

  • distributed: Distributed parallelism configuration.

  • eval: Evaluation configuration.

  • job: Top-level job configuration aggregating all sub-configs.

  • loader: Config loading (TOML files → dataclass configs with CLI overrides).

  • metrics: Metrics configuration.

  • model: Model architecture configuration.

  • optimizer: Optimizer configuration.

  • profiling: Profiling configuration.

  • registry: Central registry for named components.

  • scheduler: LR scheduler configuration.

  • schema: Backward-compatible re-exports.

  • training: Training configuration.