Index _ | A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | Z _ __init__() (kempnerforge.checkpoint.async_save.AsyncCheckpointer method) (kempnerforge.checkpoint.AsyncCheckpointer method) (kempnerforge.checkpoint.CheckpointManager method) (kempnerforge.checkpoint.manager.CheckpointManager method) (kempnerforge.config.checkpoint.CheckpointConfig method) (kempnerforge.config.CheckpointConfig method) (kempnerforge.config.data.DataConfig method) (kempnerforge.config.data.DatasetSource method) (kempnerforge.config.data.TrainingPhase method) (kempnerforge.config.DataConfig method) (kempnerforge.config.distributed.DistributedConfig method) (kempnerforge.config.DistributedConfig method) (kempnerforge.config.eval.EvalConfig method) (kempnerforge.config.EvalConfig method) (kempnerforge.config.job.JobConfig method) (kempnerforge.config.JobConfig method) (kempnerforge.config.metrics.MetricsConfig method) (kempnerforge.config.MetricsConfig method) (kempnerforge.config.model.ModelConfig method) (kempnerforge.config.ModelConfig method) (kempnerforge.config.optimizer.OptimizerConfig method) (kempnerforge.config.OptimizerConfig method) (kempnerforge.config.profiling.ProfilingConfig method) (kempnerforge.config.ProfilingConfig method) (kempnerforge.config.scheduler.SchedulerConfig method) (kempnerforge.config.SchedulerConfig method) (kempnerforge.config.TrainConfig method) (kempnerforge.config.training.TrainConfig method) (kempnerforge.data.dataloader.StatefulDataLoader method) (kempnerforge.data.dataset.HuggingFaceDataset method) (kempnerforge.data.dataset.MemoryMappedDataset method) (kempnerforge.data.dataset.MixtureDataset method) (kempnerforge.data.dataset.StreamingHuggingFaceDataset method) (kempnerforge.data.DistributedSampler method) (kempnerforge.data.HuggingFaceDataset method) (kempnerforge.data.MemoryMappedDataset method) (kempnerforge.data.sampler.DistributedSampler method) (kempnerforge.data.sampler.MixtureSampler method) (kempnerforge.data.StatefulDataLoader method) (kempnerforge.distributed.pipeline_parallel.PipelineStageModule method) (kempnerforge.metrics.DeviceMemoryMonitor method) (kempnerforge.metrics.memory.DeviceMemoryMonitor method) (kempnerforge.metrics.MetricsTracker method) (kempnerforge.metrics.StepMetrics method) (kempnerforge.metrics.tracker.MetricsTracker method) (kempnerforge.metrics.tracker.StepMetrics method) (kempnerforge.metrics.tracker.TensorBoardBackend method) (kempnerforge.metrics.tracker.WandBBackend method) (kempnerforge.model.attention.Attention method) (kempnerforge.model.attention.KVCache method) (kempnerforge.model.embedding.OutputHead method) (kempnerforge.model.embedding.TokenEmbedding method) (kempnerforge.model.hooks.ActivationStore method) (kempnerforge.model.mlp.StandardMLP method) (kempnerforge.model.mlp.SwiGLUMLP method) (kempnerforge.model.moe.MoEMLP method) (kempnerforge.model.norm.RMSNorm method) (kempnerforge.model.router.SigmoidTopKRouter method) (kempnerforge.model.router.SoftmaxTopKRouter method) (kempnerforge.model.Transformer method) (kempnerforge.model.transformer.Transformer method) (kempnerforge.model.transformer.TransformerBlock method) (kempnerforge.model.TransformerBlock method) (kempnerforge.profiling.cuda_timer.CUDATimer method) (kempnerforge.profiling.cuda_timer.CUDATimerCollection method) (kempnerforge.profiling.CUDATimer method) (kempnerforge.profiling.CUDATimerCollection method) (kempnerforge.resilience.elastic.SLURMInfo method) (kempnerforge.resilience.health.NaNDetector method) (kempnerforge.resilience.health.NaNState method) (kempnerforge.resilience.NaNDetector method) (kempnerforge.resilience.NaNState method) (kempnerforge.resilience.ShutdownHandler method) (kempnerforge.resilience.signal_handler.ShutdownHandler method) (kempnerforge.resilience.SLURMInfo method) (kempnerforge.training.hooks.HookRunner method) (kempnerforge.training.hooks.StepContext method) (kempnerforge.training.optimizer.Lion method) (kempnerforge.training.optimizer.Muon method) (kempnerforge.training.optimizer.ScheduleFreeAdamW method) __new__() (kempnerforge.config.checkpoint.AsyncCheckpointMode method) (kempnerforge.config.distributed.PipelineSchedule method) (kempnerforge.config.model.Activation method) (kempnerforge.config.model.NormType method) (kempnerforge.config.scheduler.SchedulerType method) (kempnerforge.config.training.ActivationCheckpointing method) A Activation (class in kempnerforge.config.model) activation (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) activation_checkpointing (kempnerforge.config.TrainConfig attribute) (kempnerforge.config.training.TrainConfig attribute) ActivationCheckpointing (class in kempnerforge.config.training) activations (kempnerforge.model.hooks.ActivationStore property) ActivationStore (class in kempnerforge.model.hooks) allocated_gb (kempnerforge.metrics.StepMetrics attribute) (kempnerforge.metrics.tracker.StepMetrics attribute) anneal_start_step (kempnerforge.config.data.DataConfig attribute) (kempnerforge.config.DataConfig attribute) anneal_weights (kempnerforge.config.data.DataConfig attribute) (kempnerforge.config.DataConfig attribute) apply_ac() (in module kempnerforge.distributed) (in module kempnerforge.distributed.parallel) apply_dataloader_state() (kempnerforge.checkpoint.CheckpointManager method) (kempnerforge.checkpoint.manager.CheckpointManager method) apply_expert_parallel() (in module kempnerforge.distributed.expert_parallel) apply_float8() (in module kempnerforge.distributed.parallel) apply_fsdp2() (in module kempnerforge.distributed) (in module kempnerforge.distributed.parallel) apply_rope() (in module kempnerforge.model.position) apply_tensor_parallel() (in module kempnerforge.distributed) (in module kempnerforge.distributed.tensor_parallel) array_task_id (kempnerforge.resilience.elastic.SLURMInfo attribute) (kempnerforge.resilience.SLURMInfo attribute) async_ (kempnerforge.config.checkpoint.AsyncCheckpointMode attribute) async_mode (kempnerforge.config.checkpoint.CheckpointConfig attribute) (kempnerforge.config.CheckpointConfig attribute) async_pinned (kempnerforge.config.checkpoint.AsyncCheckpointMode attribute) AsyncCheckpointer (class in kempnerforge.checkpoint) (class in kempnerforge.checkpoint.async_save) AsyncCheckpointMode (class in kempnerforge.config.checkpoint) Attention (class in kempnerforge.model.attention) aux_loss (kempnerforge.model.moe.MoEMLP property) B backend (kempnerforge.config.distributed.DistributedConfig attribute) (kempnerforge.config.DistributedConfig attribute) batch_size (kempnerforge.config.TrainConfig attribute) (kempnerforge.config.training.TrainConfig attribute) betas (kempnerforge.config.optimizer.OptimizerConfig attribute) (kempnerforge.config.OptimizerConfig attribute) build_loss_fn() (in module kempnerforge.training) (in module kempnerforge.training.loss) build_mlp() (in module kempnerforge.model.mlp) build_moe() (in module kempnerforge.model.moe) build_norm() (in module kempnerforge.model.norm) build_optimizer() (in module kempnerforge.training) (in module kempnerforge.training.optimizer) build_parallel_model() (in module kempnerforge.distributed.parallel) build_pipeline_schedule() (in module kempnerforge.distributed) (in module kempnerforge.distributed.pipeline_parallel) build_pipeline_stage() (in module kempnerforge.distributed) (in module kempnerforge.distributed.pipeline_parallel) build_profiler() (in module kempnerforge.profiling) (in module kempnerforge.profiling.profiler) build_scheduler() (in module kempnerforge.training) (in module kempnerforge.training.scheduler) build_stage_module() (in module kempnerforge.distributed) (in module kempnerforge.distributed.pipeline_parallel) build_train_state() (in module kempnerforge.checkpoint) (in module kempnerforge.checkpoint.state) C capture_snapshot() (kempnerforge.metrics.DeviceMemoryMonitor method) (kempnerforge.metrics.memory.DeviceMemoryMonitor method) ce_chunk_size (kempnerforge.config.TrainConfig attribute) (kempnerforge.config.training.TrainConfig attribute) check_gpu_health() (in module kempnerforge.resilience) (in module kempnerforge.resilience.health) check_gradients() (kempnerforge.resilience.health.NaNDetector method) (kempnerforge.resilience.NaNDetector method) check_loss() (kempnerforge.resilience.health.NaNDetector method) (kempnerforge.resilience.NaNDetector method) check_nccl_health() (in module kempnerforge.resilience) (in module kempnerforge.resilience.health) checkpoint (kempnerforge.config.job.JobConfig attribute) (kempnerforge.config.JobConfig attribute) CheckpointConfig (class in kempnerforge.config) (class in kempnerforge.config.checkpoint) CheckpointManager (class in kempnerforge.checkpoint) (class in kempnerforge.checkpoint.manager) chunked_cross_entropy_loss() (in module kempnerforge.training.loss) clear() (kempnerforge.model.hooks.ActivationStore method) clip_grad_norm_() (in module kempnerforge.distributed) (in module kempnerforge.distributed.utils) close() (kempnerforge.data.dataset.MemoryMappedDataset method) (kempnerforge.data.MemoryMappedDataset method) (kempnerforge.metrics.MetricsTracker method) (kempnerforge.metrics.tracker.MetricsTracker method) (kempnerforge.metrics.tracker.TensorBoardBackend method) (kempnerforge.metrics.tracker.WandBBackend method) compile_model (kempnerforge.config.TrainConfig attribute) (kempnerforge.config.training.TrainConfig attribute) compute_layer_assignment() (in module kempnerforge.distributed) (in module kempnerforge.distributed.pipeline_parallel) compute_mfu() (in module kempnerforge.metrics) (in module kempnerforge.metrics.mfu) computed_ffn_hidden_dim (kempnerforge.config.model.ModelConfig property) (kempnerforge.config.ModelConfig property) consecutive_nans (kempnerforge.resilience.health.NaNState attribute) (kempnerforge.resilience.NaNState attribute) constant (kempnerforge.config.scheduler.SchedulerType attribute) cosine (kempnerforge.config.scheduler.SchedulerType attribute) cp (kempnerforge.config.distributed.DistributedConfig attribute) (kempnerforge.config.DistributedConfig attribute) cross_entropy_loss() (in module kempnerforge.training.loss) CUDATimer (class in kempnerforge.profiling) (class in kempnerforge.profiling.cuda_timer) CUDATimerCollection (class in kempnerforge.profiling) (class in kempnerforge.profiling.cuda_timer) cumulative_sizes (kempnerforge.data.dataset.MixtureDataset property) D data (kempnerforge.config.job.JobConfig attribute) (kempnerforge.config.JobConfig attribute) DataConfig (class in kempnerforge.config) (class in kempnerforge.config.data) dataset_names (kempnerforge.data.dataset.MixtureDataset property) dataset_path (kempnerforge.config.data.DataConfig attribute) (kempnerforge.config.DataConfig attribute) (kempnerforge.config.eval.EvalConfig attribute) (kempnerforge.config.EvalConfig attribute) dataset_weights (kempnerforge.config.data.TrainingPhase attribute) datasets (kempnerforge.config.data.DataConfig attribute) (kempnerforge.config.DataConfig attribute) DatasetSource (class in kempnerforge.config.data) decay_steps (kempnerforge.config.scheduler.SchedulerConfig attribute) (kempnerforge.config.SchedulerConfig attribute) default_mp_policy() (in module kempnerforge.distributed) (in module kempnerforge.distributed.parallel) destroy_distributed() (in module kempnerforge.distributed) (in module kempnerforge.distributed.setup) DeviceMemoryMonitor (class in kempnerforge.metrics) (class in kempnerforge.metrics.memory) dim (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) dir (kempnerforge.config.checkpoint.CheckpointConfig attribute) (kempnerforge.config.CheckpointConfig attribute) disable() (kempnerforge.model.hooks.ActivationStore method) disabled (kempnerforge.config.checkpoint.AsyncCheckpointMode attribute) distributed (kempnerforge.config.job.JobConfig attribute) (kempnerforge.config.JobConfig attribute) DistributedConfig (class in kempnerforge.config) (class in kempnerforge.config.distributed) DistributedSampler (class in kempnerforge.data) (class in kempnerforge.data.sampler) dp_replicate (kempnerforge.config.distributed.DistributedConfig attribute) (kempnerforge.config.DistributedConfig attribute) dp_shard (kempnerforge.config.distributed.DistributedConfig attribute) (kempnerforge.config.DistributedConfig attribute) E elapsed_all() (kempnerforge.profiling.cuda_timer.CUDATimerCollection method) (kempnerforge.profiling.CUDATimerCollection method) elapsed_ms() (kempnerforge.profiling.cuda_timer.CUDATimer method) (kempnerforge.profiling.cuda_timer.CUDATimerCollection method) (kempnerforge.profiling.CUDATimer method) (kempnerforge.profiling.CUDATimerCollection method) enable (kempnerforge.config.profiling.ProfilingConfig attribute) (kempnerforge.config.ProfilingConfig attribute) enable() (kempnerforge.model.hooks.ActivationStore method) enable_tensorboard (kempnerforge.config.metrics.MetricsConfig attribute) (kempnerforge.config.MetricsConfig attribute) enable_wandb (kempnerforge.config.metrics.MetricsConfig attribute) (kempnerforge.config.MetricsConfig attribute) enabled (kempnerforge.config.eval.EvalConfig attribute) (kempnerforge.config.EvalConfig attribute) (kempnerforge.model.hooks.ActivationStore property) (kempnerforge.profiling.cuda_timer.CUDATimerCollection property) (kempnerforge.profiling.CUDATimerCollection property) end_step (kempnerforge.config.profiling.ProfilingConfig attribute) (kempnerforge.config.ProfilingConfig attribute) end_step() (kempnerforge.metrics.MetricsTracker method) (kempnerforge.metrics.tracker.MetricsTracker method) ep (kempnerforge.config.distributed.DistributedConfig attribute) (kempnerforge.config.DistributedConfig attribute) ep_dispatch_and_compute() (in module kempnerforge.distributed.expert_parallel) eps (kempnerforge.config.optimizer.OptimizerConfig attribute) (kempnerforge.config.OptimizerConfig attribute) estimate_model_flops_per_token() (in module kempnerforge.metrics) (in module kempnerforge.metrics.mfu) eval (kempnerforge.config.job.JobConfig attribute) (kempnerforge.config.JobConfig attribute) eval_params() (kempnerforge.training.optimizer.ScheduleFreeAdamW method) EvalConfig (class in kempnerforge.config) (class in kempnerforge.config.eval) exclude_from_loading (kempnerforge.config.checkpoint.CheckpointConfig attribute) (kempnerforge.config.CheckpointConfig attribute) expert_counts (kempnerforge.model.moe.MoEMLP property) export_dtype (kempnerforge.config.checkpoint.CheckpointConfig attribute) (kempnerforge.config.CheckpointConfig attribute) extract_representations() (in module kempnerforge.model.hooks) F ffn_dim_multiplier (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) ffn_hidden_dim (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) file_pattern (kempnerforge.config.data.DataConfig attribute) (kempnerforge.config.DataConfig attribute) (kempnerforge.config.eval.EvalConfig attribute) (kempnerforge.config.EvalConfig attribute) finish() (kempnerforge.resilience.ShutdownHandler method) (kempnerforge.resilience.signal_handler.ShutdownHandler method) format_memory_stats() (in module kempnerforge.metrics) (in module kempnerforge.metrics.memory) format_metrics() (in module kempnerforge.metrics) (in module kempnerforge.metrics.logger) forward() (kempnerforge.distributed.pipeline_parallel.PipelineStageModule method) (kempnerforge.model.attention.Attention method) (kempnerforge.model.embedding.OutputHead method) (kempnerforge.model.embedding.TokenEmbedding method) (kempnerforge.model.mlp.StandardMLP method) (kempnerforge.model.mlp.SwiGLUMLP method) (kempnerforge.model.moe.MoEMLP method) (kempnerforge.model.norm.RMSNorm method) (kempnerforge.model.router.SigmoidTopKRouter method) (kempnerforge.model.router.SoftmaxTopKRouter method) (kempnerforge.model.Transformer method) (kempnerforge.model.transformer.Transformer method) (kempnerforge.model.transformer.TransformerBlock method) (kempnerforge.model.TransformerBlock method) full (kempnerforge.config.training.ActivationCheckpointing attribute) fused (kempnerforge.config.optimizer.OptimizerConfig attribute) (kempnerforge.config.OptimizerConfig attribute) G gelu (kempnerforge.config.model.Activation attribute) generate() (in module kempnerforge.model.generate) get() (kempnerforge.model.hooks.ActivationStore method) get_dp_info() (in module kempnerforge.distributed.utils) get_dp_mesh() (in module kempnerforge.distributed) (in module kempnerforge.distributed.parallel) get_expert_counts() (kempnerforge.model.Transformer method) (kempnerforge.model.transformer.Transformer method) get_gpu_peak_tflops() (in module kempnerforge.metrics) (in module kempnerforge.metrics.mfu) get_logger() (in module kempnerforge.metrics) (in module kempnerforge.metrics.logger) get_memory_stats() (in module kempnerforge.metrics) (in module kempnerforge.metrics.memory) get_memory_utilization() (in module kempnerforge.metrics) (in module kempnerforge.metrics.memory) get_moe_aux_loss() (kempnerforge.model.Transformer method) (kempnerforge.model.transformer.Transformer method) get_pp_mesh() (in module kempnerforge.distributed) (in module kempnerforge.distributed.pipeline_parallel) get_pp_rank() (in module kempnerforge.distributed) (in module kempnerforge.distributed.pipeline_parallel) get_pp_size() (in module kempnerforge.distributed) (in module kempnerforge.distributed.pipeline_parallel) get_rng_state() (in module kempnerforge.checkpoint) (in module kempnerforge.checkpoint.state) get_slurm_info() (in module kempnerforge.resilience) (in module kempnerforge.resilience.elastic) get_tp_mesh() (in module kempnerforge.distributed) (in module kempnerforge.distributed.tensor_parallel) get_world_info() (in module kempnerforge.distributed) (in module kempnerforge.distributed.setup) gpipe (kempnerforge.config.distributed.PipelineSchedule attribute) grad_accum_steps (kempnerforge.config.TrainConfig attribute) (kempnerforge.config.training.TrainConfig attribute) grad_clip_norm (kempnerforge.config.TrainConfig attribute) (kempnerforge.config.training.TrainConfig attribute) grad_norm (kempnerforge.metrics.StepMetrics attribute) (kempnerforge.metrics.tracker.StepMetrics attribute) (kempnerforge.training.hooks.StepContext attribute) grouped_expert_forward() (in module kempnerforge.model.moe) grouped_expert_forward_packed() (in module kempnerforge.model.moe) H has_dp_mesh() (in module kempnerforge.distributed.parallel) head_dim (kempnerforge.config.model.ModelConfig property) (kempnerforge.config.ModelConfig property) hf_config (kempnerforge.config.data.DatasetSource attribute) hf_dataset_config (kempnerforge.config.data.DataConfig attribute) (kempnerforge.config.DataConfig attribute) (kempnerforge.config.eval.EvalConfig attribute) (kempnerforge.config.EvalConfig attribute) hf_dataset_name (kempnerforge.config.data.DataConfig attribute) (kempnerforge.config.DataConfig attribute) (kempnerforge.config.eval.EvalConfig attribute) (kempnerforge.config.EvalConfig attribute) hf_dataset_split (kempnerforge.config.data.DataConfig attribute) (kempnerforge.config.DataConfig attribute) (kempnerforge.config.eval.EvalConfig attribute) (kempnerforge.config.EvalConfig attribute) hf_dataset_text_field (kempnerforge.config.data.DataConfig attribute) (kempnerforge.config.DataConfig attribute) hf_name (kempnerforge.config.data.DatasetSource attribute) hf_streaming (kempnerforge.config.data.DataConfig attribute) (kempnerforge.config.DataConfig attribute) HookRunner (class in kempnerforge.training.hooks) hooks (kempnerforge.training.hooks.HookRunner attribute) HuggingFaceDataset (class in kempnerforge.data) (class in kempnerforge.data.dataset) I init_backends() (kempnerforge.metrics.MetricsTracker method) (kempnerforge.metrics.tracker.MetricsTracker method) init_distributed() (in module kempnerforge.distributed) (in module kempnerforge.distributed.setup) init_std (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) init_weights() (in module kempnerforge.model.init) init_weights_and_freqs() (kempnerforge.distributed.pipeline_parallel.PipelineStageModule method) (kempnerforge.model.Transformer method) (kempnerforge.model.transformer.Transformer method) interleaved_1f1b (kempnerforge.config.distributed.PipelineSchedule attribute) interval (kempnerforge.config.checkpoint.CheckpointConfig attribute) (kempnerforge.config.CheckpointConfig attribute) (kempnerforge.config.eval.EvalConfig attribute) (kempnerforge.config.EvalConfig attribute) is_fp8 (kempnerforge.config.TrainConfig property) (kempnerforge.config.training.TrainConfig property) is_moe (kempnerforge.config.model.ModelConfig property) (kempnerforge.config.ModelConfig property) is_pending (kempnerforge.checkpoint.async_save.AsyncCheckpointer property) (kempnerforge.checkpoint.AsyncCheckpointer property) is_rank_zero() (in module kempnerforge.distributed) (in module kempnerforge.distributed.setup) is_requeued (kempnerforge.resilience.elastic.SLURMInfo property) (kempnerforge.resilience.SLURMInfo property) is_slurm_job() (in module kempnerforge.resilience) (in module kempnerforge.resilience.elastic) is_slurm_requeue() (in module kempnerforge.resilience) (in module kempnerforge.resilience.elastic) J job_id (kempnerforge.resilience.elastic.SLURMInfo attribute) (kempnerforge.resilience.SLURMInfo attribute) job_name (kempnerforge.resilience.elastic.SLURMInfo attribute) (kempnerforge.resilience.SLURMInfo attribute) JobConfig (class in kempnerforge.config) (class in kempnerforge.config.job) K keep_last_n (kempnerforge.config.checkpoint.CheckpointConfig attribute) (kempnerforge.config.CheckpointConfig attribute) kempnerforge.checkpoint module kempnerforge.checkpoint.async_save module kempnerforge.checkpoint.manager module kempnerforge.checkpoint.state module kempnerforge.config module kempnerforge.config.checkpoint module kempnerforge.config.data module kempnerforge.config.distributed module kempnerforge.config.eval module kempnerforge.config.job module kempnerforge.config.loader module kempnerforge.config.metrics module kempnerforge.config.model module kempnerforge.config.optimizer module kempnerforge.config.profiling module kempnerforge.config.scheduler module kempnerforge.config.schema module kempnerforge.config.training module kempnerforge.data module kempnerforge.data.dataloader module kempnerforge.data.dataset module kempnerforge.data.sampler module kempnerforge.distributed module kempnerforge.distributed.expert_parallel module kempnerforge.distributed.parallel module kempnerforge.distributed.pipeline_parallel module kempnerforge.distributed.setup module kempnerforge.distributed.tensor_parallel module kempnerforge.distributed.utils module kempnerforge.metrics module kempnerforge.metrics.logger module kempnerforge.metrics.memory module kempnerforge.metrics.mfu module kempnerforge.metrics.tracker module kempnerforge.model module kempnerforge.model.attention module kempnerforge.model.embedding module kempnerforge.model.generate module kempnerforge.model.hooks module kempnerforge.model.init module kempnerforge.model.mlp module kempnerforge.model.moe module kempnerforge.model.norm module kempnerforge.model.position module kempnerforge.model.router module kempnerforge.model.transformer module kempnerforge.profiling module kempnerforge.profiling.cuda_timer module kempnerforge.profiling.profiler module kempnerforge.resilience module kempnerforge.resilience.elastic module kempnerforge.resilience.health module kempnerforge.resilience.signal_handler module kempnerforge.training module kempnerforge.training.eval module kempnerforge.training.grad module kempnerforge.training.hooks module kempnerforge.training.loss module kempnerforge.training.optimizer module kempnerforge.training.scheduler module KVCache (class in kempnerforge.model.attention) L last_good_loss (kempnerforge.resilience.health.NaNState attribute) (kempnerforge.resilience.NaNState attribute) last_good_step (kempnerforge.resilience.health.NaNState attribute) (kempnerforge.resilience.NaNState attribute) layer_names (kempnerforge.model.hooks.ActivationStore property) layernorm (kempnerforge.config.model.NormType attribute) linear (kempnerforge.config.scheduler.SchedulerType attribute) Lion (class in kempnerforge.training.optimizer) load() (kempnerforge.checkpoint.CheckpointManager method) (kempnerforge.checkpoint.manager.CheckpointManager method) load_config() (in module kempnerforge.config) (in module kempnerforge.config.loader) load_path (kempnerforge.config.checkpoint.CheckpointConfig attribute) (kempnerforge.config.CheckpointConfig attribute) load_state_dict() (kempnerforge.data.dataloader.StatefulDataLoader method) (kempnerforge.data.dataset.HuggingFaceDataset method) (kempnerforge.data.dataset.MemoryMappedDataset method) (kempnerforge.data.dataset.MixtureDataset method) (kempnerforge.data.dataset.StreamingHuggingFaceDataset method) (kempnerforge.data.DistributedSampler method) (kempnerforge.data.HuggingFaceDataset method) (kempnerforge.data.MemoryMappedDataset method) (kempnerforge.data.sampler.DistributedSampler method) (kempnerforge.data.sampler.MixtureSampler method) (kempnerforge.data.StatefulDataLoader method) (kempnerforge.training.optimizer.Muon method) (kempnerforge.training.optimizer.ScheduleFreeAdamW method) load_toml() (in module kempnerforge.config.loader) log() (kempnerforge.metrics.tracker.TensorBoardBackend method) (kempnerforge.metrics.tracker.WandBBackend method) log_eval() (kempnerforge.metrics.MetricsTracker method) (kempnerforge.metrics.tracker.MetricsTracker method) log_interval (kempnerforge.config.metrics.MetricsConfig attribute) (kempnerforge.config.MetricsConfig attribute) log_job_info() (in module kempnerforge.resilience) (in module kempnerforge.resilience.elastic) loss (kempnerforge.metrics.StepMetrics attribute) (kempnerforge.metrics.tracker.StepMetrics attribute) (kempnerforge.training.hooks.StepContext attribute) loss_fn (kempnerforge.config.TrainConfig attribute) (kempnerforge.config.training.TrainConfig attribute) lr (kempnerforge.config.optimizer.OptimizerConfig attribute) (kempnerforge.config.OptimizerConfig attribute) (kempnerforge.metrics.StepMetrics attribute) (kempnerforge.metrics.tracker.StepMetrics attribute) (kempnerforge.training.hooks.StepContext attribute) lr_scale (kempnerforge.config.data.TrainingPhase attribute) M max_seq_len (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) max_steps (kempnerforge.config.TrainConfig attribute) (kempnerforge.config.training.TrainConfig attribute) maybe_no_sync() (in module kempnerforge.training) (in module kempnerforge.training.grad) mem_utilization (kempnerforge.metrics.StepMetrics attribute) (kempnerforge.metrics.tracker.StepMetrics attribute) MemoryMappedDataset (class in kempnerforge.data) (class in kempnerforge.data.dataset) metrics (kempnerforge.config.job.JobConfig attribute) (kempnerforge.config.JobConfig attribute) MetricsConfig (class in kempnerforge.config) (class in kempnerforge.config.metrics) MetricsTracker (class in kempnerforge.metrics) (class in kempnerforge.metrics.tracker) mfu (kempnerforge.metrics.StepMetrics attribute) (kempnerforge.metrics.tracker.StepMetrics attribute) min_lr_ratio (kempnerforge.config.scheduler.SchedulerConfig attribute) (kempnerforge.config.SchedulerConfig attribute) mix_temperature (kempnerforge.config.data.DataConfig attribute) (kempnerforge.config.DataConfig attribute) mixed_precision (kempnerforge.config.TrainConfig attribute) (kempnerforge.config.training.TrainConfig attribute) MixtureDataset (class in kempnerforge.data.dataset) MixtureSampler (class in kempnerforge.data.sampler) model (kempnerforge.config.job.JobConfig attribute) (kempnerforge.config.JobConfig attribute) (kempnerforge.training.hooks.StepContext attribute) model_type (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) ModelConfig (class in kempnerforge.config) (class in kempnerforge.config.model) module kempnerforge.checkpoint kempnerforge.checkpoint.async_save kempnerforge.checkpoint.manager kempnerforge.checkpoint.state kempnerforge.config kempnerforge.config.checkpoint kempnerforge.config.data kempnerforge.config.distributed kempnerforge.config.eval kempnerforge.config.job kempnerforge.config.loader kempnerforge.config.metrics kempnerforge.config.model kempnerforge.config.optimizer kempnerforge.config.profiling kempnerforge.config.scheduler kempnerforge.config.schema kempnerforge.config.training kempnerforge.data kempnerforge.data.dataloader kempnerforge.data.dataset kempnerforge.data.sampler kempnerforge.distributed kempnerforge.distributed.expert_parallel kempnerforge.distributed.parallel kempnerforge.distributed.pipeline_parallel kempnerforge.distributed.setup kempnerforge.distributed.tensor_parallel kempnerforge.distributed.utils kempnerforge.metrics kempnerforge.metrics.logger kempnerforge.metrics.memory kempnerforge.metrics.mfu kempnerforge.metrics.tracker kempnerforge.model kempnerforge.model.attention kempnerforge.model.embedding kempnerforge.model.generate kempnerforge.model.hooks kempnerforge.model.init kempnerforge.model.mlp kempnerforge.model.moe kempnerforge.model.norm kempnerforge.model.position kempnerforge.model.router kempnerforge.model.transformer kempnerforge.profiling kempnerforge.profiling.cuda_timer kempnerforge.profiling.profiler kempnerforge.resilience kempnerforge.resilience.elastic kempnerforge.resilience.health kempnerforge.resilience.signal_handler kempnerforge.training kempnerforge.training.eval kempnerforge.training.grad kempnerforge.training.hooks kempnerforge.training.loss kempnerforge.training.optimizer kempnerforge.training.scheduler moe_aux_loss_weight (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) moe_bias_schedule (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) moe_capacity_factor (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) moe_frequency (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) moe_gradient_scale (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) moe_packed_experts (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) moe_router (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) moe_sequence_aux_loss_weight (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) moe_shared_experts (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) moe_top_k (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) MoEMLP (class in kempnerforge.model.moe) Muon (class in kempnerforge.training.optimizer) muon_adam_lr (kempnerforge.config.optimizer.OptimizerConfig attribute) (kempnerforge.config.OptimizerConfig attribute) muon_momentum (kempnerforge.config.optimizer.OptimizerConfig attribute) (kempnerforge.config.OptimizerConfig attribute) muon_ns_steps (kempnerforge.config.optimizer.OptimizerConfig attribute) (kempnerforge.config.OptimizerConfig attribute) N n_heads (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) n_kv_heads (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) n_layers (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) name (kempnerforge.config.data.DatasetSource attribute) (kempnerforge.config.optimizer.OptimizerConfig attribute) (kempnerforge.config.OptimizerConfig attribute) (kempnerforge.config.scheduler.SchedulerConfig attribute) (kempnerforge.config.SchedulerConfig attribute) nan_steps (kempnerforge.resilience.health.NaNState attribute) (kempnerforge.resilience.NaNState attribute) NaNDetector (class in kempnerforge.resilience) (class in kempnerforge.resilience.health) NaNState (class in kempnerforge.resilience) (class in kempnerforge.resilience.health) nccl_health_check_interval (kempnerforge.config.TrainConfig attribute) (kempnerforge.config.training.TrainConfig attribute) nccl_timeout_sec (kempnerforge.config.distributed.DistributedConfig attribute) (kempnerforge.config.DistributedConfig attribute) node_list (kempnerforge.resilience.elastic.SLURMInfo attribute) (kempnerforge.resilience.SLURMInfo attribute) none (kempnerforge.config.scheduler.SchedulerType attribute) (kempnerforge.config.training.ActivationCheckpointing attribute) norm_eps (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) norm_type (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) NormType (class in kempnerforge.config.model) ntasks_per_node (kempnerforge.resilience.elastic.SLURMInfo attribute) (kempnerforge.resilience.SLURMInfo attribute) num_experts (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) num_nodes (kempnerforge.resilience.elastic.SLURMInfo attribute) (kempnerforge.resilience.SLURMInfo attribute) num_params_estimate (kempnerforge.config.model.ModelConfig property) (kempnerforge.config.ModelConfig property) num_workers (kempnerforge.config.data.DataConfig attribute) (kempnerforge.config.DataConfig attribute) O on_checkpoint_save() (kempnerforge.training.hooks.HookRunner method) (kempnerforge.training.hooks.TrainingHook method) on_eval_end() (kempnerforge.training.hooks.HookRunner method) (kempnerforge.training.hooks.TrainingHook method) on_step_end() (kempnerforge.training.hooks.HookRunner method) (kempnerforge.training.hooks.TrainingHook method) on_train_begin() (kempnerforge.training.hooks.HookRunner method) (kempnerforge.training.hooks.TrainingHook method) on_train_end() (kempnerforge.training.hooks.HookRunner method) (kempnerforge.training.hooks.TrainingHook method) optimizer (kempnerforge.config.job.JobConfig attribute) (kempnerforge.config.JobConfig attribute) (kempnerforge.training.hooks.StepContext attribute) OptimizerConfig (class in kempnerforge.config) (class in kempnerforge.config.optimizer) OutputHead (class in kempnerforge.model.embedding) P pack_sequences (kempnerforge.config.data.DataConfig attribute) (kempnerforge.config.DataConfig attribute) param_dtype (kempnerforge.config.TrainConfig property) (kempnerforge.config.training.TrainConfig property) partition (kempnerforge.resilience.elastic.SLURMInfo attribute) (kempnerforge.resilience.SLURMInfo attribute) path (kempnerforge.config.data.DatasetSource attribute) peak_gb (kempnerforge.metrics.StepMetrics attribute) (kempnerforge.metrics.tracker.StepMetrics attribute) phases (kempnerforge.config.data.DataConfig attribute) (kempnerforge.config.DataConfig attribute) pin_memory (kempnerforge.config.data.DataConfig attribute) (kempnerforge.config.DataConfig attribute) PipelineSchedule (class in kempnerforge.config.distributed) PipelineStageModule (class in kempnerforge.distributed.pipeline_parallel) pp (kempnerforge.config.distributed.DistributedConfig attribute) (kempnerforge.config.DistributedConfig attribute) pp_schedule (kempnerforge.config.distributed.DistributedConfig attribute) (kempnerforge.config.DistributedConfig attribute) precompute_rope_frequencies() (in module kempnerforge.model.position) prefetch_factor (kempnerforge.config.data.DataConfig attribute) (kempnerforge.config.DataConfig attribute) print_profiler_summary() (in module kempnerforge.profiling) (in module kempnerforge.profiling.profiler) profiling (kempnerforge.config.job.JobConfig attribute) (kempnerforge.config.JobConfig attribute) ProfilingConfig (class in kempnerforge.config) (class in kempnerforge.config.profiling) Q qk_norm (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) R register() (kempnerforge.resilience.ShutdownHandler method) (kempnerforge.resilience.signal_handler.ShutdownHandler method) registry (in module kempnerforge.config) relu (kempnerforge.config.model.Activation attribute) report() (kempnerforge.metrics.DeviceMemoryMonitor method) (kempnerforge.metrics.memory.DeviceMemoryMonitor method) reserved_gb (kempnerforge.metrics.StepMetrics attribute) (kempnerforge.metrics.tracker.StepMetrics attribute) reset() (kempnerforge.resilience.health.NaNDetector method) (kempnerforge.resilience.NaNDetector method) reset_peak_memory() (in module kempnerforge.metrics) (in module kempnerforge.metrics.memory) resolve() (kempnerforge.config.distributed.DistributedConfig method) (kempnerforge.config.DistributedConfig method) resolve_resume_path() (in module kempnerforge.resilience) (in module kempnerforge.resilience.elastic) restart_count (kempnerforge.resilience.elastic.SLURMInfo attribute) (kempnerforge.resilience.SLURMInfo attribute) restore_train_state() (in module kempnerforge.checkpoint) (in module kempnerforge.checkpoint.state) rex (kempnerforge.config.scheduler.SchedulerType attribute) rex_alpha (kempnerforge.config.scheduler.SchedulerConfig attribute) (kempnerforge.config.SchedulerConfig attribute) RMSNorm (class in kempnerforge.model.norm) rmsnorm (kempnerforge.config.model.NormType attribute) rope_theta (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) run_eval() (in module kempnerforge.training) (in module kempnerforge.training.eval) S sample() (in module kempnerforge.model.generate) save() (kempnerforge.checkpoint.async_save.AsyncCheckpointer method) (kempnerforge.checkpoint.AsyncCheckpointer method) (kempnerforge.checkpoint.CheckpointManager method) (kempnerforge.checkpoint.manager.CheckpointManager method) save_activations() (in module kempnerforge.model.hooks) schedule_1f1b (kempnerforge.config.distributed.PipelineSchedule attribute) schedule_free_warmup_steps (kempnerforge.config.optimizer.OptimizerConfig attribute) (kempnerforge.config.OptimizerConfig attribute) ScheduleFreeAdamW (class in kempnerforge.training.optimizer) scheduler (kempnerforge.config.job.JobConfig attribute) (kempnerforge.config.JobConfig attribute) SchedulerConfig (class in kempnerforge.config) (class in kempnerforge.config.scheduler) SchedulerType (class in kempnerforge.config.scheduler) sdpa_backend (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) seed (kempnerforge.config.TrainConfig attribute) (kempnerforge.config.training.TrainConfig attribute) selective (kempnerforge.config.training.ActivationCheckpointing attribute) seq_len (kempnerforge.config.TrainConfig attribute) (kempnerforge.config.training.TrainConfig attribute) set_epoch() (kempnerforge.data.DistributedSampler method) (kempnerforge.data.sampler.DistributedSampler method) (kempnerforge.data.sampler.MixtureSampler method) set_moe_step() (kempnerforge.model.Transformer method) (kempnerforge.model.transformer.Transformer method) set_rng_state() (in module kempnerforge.checkpoint) (in module kempnerforge.checkpoint.state) set_skip() (kempnerforge.data.DistributedSampler method) (kempnerforge.data.sampler.DistributedSampler method) (kempnerforge.data.sampler.MixtureSampler method) set_step() (kempnerforge.model.router.SigmoidTopKRouter method) should_rollback (kempnerforge.resilience.health.NaNDetector property) (kempnerforge.resilience.NaNDetector property) should_shutdown() (kempnerforge.resilience.ShutdownHandler method) (kempnerforge.resilience.signal_handler.ShutdownHandler method) shutdown_requested (kempnerforge.resilience.ShutdownHandler property) (kempnerforge.resilience.signal_handler.ShutdownHandler property) shutdown_timeout_sec (kempnerforge.config.TrainConfig attribute) (kempnerforge.config.training.TrainConfig attribute) ShutdownHandler (class in kempnerforge.resilience) (class in kempnerforge.resilience.signal_handler) SigmoidTopKRouter (class in kempnerforge.model.router) signal_received (kempnerforge.resilience.ShutdownHandler property) (kempnerforge.resilience.signal_handler.ShutdownHandler property) silu (kempnerforge.config.model.Activation attribute) SLURMInfo (class in kempnerforge.resilience) (class in kempnerforge.resilience.elastic) SoftmaxTopKRouter (class in kempnerforge.model.router) stable_steps (kempnerforge.config.scheduler.SchedulerConfig attribute) (kempnerforge.config.SchedulerConfig attribute) StandardMLP (class in kempnerforge.model.mlp) start() (kempnerforge.profiling.cuda_timer.CUDATimer method) (kempnerforge.profiling.cuda_timer.CUDATimerCollection method) (kempnerforge.profiling.CUDATimer method) (kempnerforge.profiling.CUDATimerCollection method) start_step (kempnerforge.config.data.TrainingPhase attribute) (kempnerforge.config.profiling.ProfilingConfig attribute) (kempnerforge.config.ProfilingConfig attribute) start_step() (kempnerforge.metrics.MetricsTracker method) (kempnerforge.metrics.tracker.MetricsTracker method) state_dict() (kempnerforge.data.dataloader.StatefulDataLoader method) (kempnerforge.data.dataset.HuggingFaceDataset method) (kempnerforge.data.dataset.MemoryMappedDataset method) (kempnerforge.data.dataset.MixtureDataset method) (kempnerforge.data.dataset.StreamingHuggingFaceDataset method) (kempnerforge.data.DistributedSampler method) (kempnerforge.data.HuggingFaceDataset method) (kempnerforge.data.MemoryMappedDataset method) (kempnerforge.data.sampler.DistributedSampler method) (kempnerforge.data.sampler.MixtureSampler method) (kempnerforge.data.StatefulDataLoader method) (kempnerforge.training.optimizer.Muon method) (kempnerforge.training.optimizer.ScheduleFreeAdamW method) StatefulDataLoader (class in kempnerforge.data) (class in kempnerforge.data.dataloader) step (kempnerforge.training.hooks.StepContext attribute) step() (kempnerforge.training.optimizer.Lion method) (kempnerforge.training.optimizer.Muon method) (kempnerforge.training.optimizer.ScheduleFreeAdamW method) step_time_sec (kempnerforge.metrics.StepMetrics attribute) (kempnerforge.metrics.tracker.StepMetrics attribute) StepContext (class in kempnerforge.training.hooks) StepMetrics (class in kempnerforge.metrics) (class in kempnerforge.metrics.tracker) steps (kempnerforge.config.eval.EvalConfig attribute) (kempnerforge.config.EvalConfig attribute) stop() (kempnerforge.profiling.cuda_timer.CUDATimer method) (kempnerforge.profiling.cuda_timer.CUDATimerCollection method) (kempnerforge.profiling.CUDATimer method) (kempnerforge.profiling.CUDATimerCollection method) StreamingHuggingFaceDataset (class in kempnerforge.data.dataset) SwiGLUMLP (class in kempnerforge.model.mlp) T tensorboard_dir (kempnerforge.config.metrics.MetricsConfig attribute) (kempnerforge.config.MetricsConfig attribute) TensorBoardBackend (class in kempnerforge.metrics.tracker) tie_embeddings (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) tie_weights() (kempnerforge.model.embedding.OutputHead method) TokenEmbedding (class in kempnerforge.model.embedding) tokenizer_path (kempnerforge.config.data.DataConfig attribute) (kempnerforge.config.DataConfig attribute) tokens_per_sec (kempnerforge.metrics.StepMetrics attribute) (kempnerforge.metrics.tracker.StepMetrics attribute) tokens_seen (kempnerforge.training.hooks.StepContext attribute) total_gb (kempnerforge.metrics.StepMetrics attribute) (kempnerforge.metrics.tracker.StepMetrics attribute) total_nans (kempnerforge.resilience.health.NaNState attribute) (kempnerforge.resilience.NaNState attribute) tp (kempnerforge.config.distributed.DistributedConfig attribute) (kempnerforge.config.DistributedConfig attribute) trace_dir (kempnerforge.config.profiling.ProfilingConfig attribute) (kempnerforge.config.ProfilingConfig attribute) train (kempnerforge.config.job.JobConfig attribute) (kempnerforge.config.JobConfig attribute) train_params() (kempnerforge.training.optimizer.ScheduleFreeAdamW method) TrainConfig (class in kempnerforge.config) (class in kempnerforge.config.training) TrainingHook (class in kempnerforge.training.hooks) TrainingPhase (class in kempnerforge.config.data) Transformer (class in kempnerforge.model) (class in kempnerforge.model.transformer) TransformerBlock (class in kempnerforge.model) (class in kempnerforge.model.transformer) U unregister() (kempnerforge.resilience.ShutdownHandler method) (kempnerforge.resilience.signal_handler.ShutdownHandler method) update() (kempnerforge.model.attention.KVCache method) update_weights() (kempnerforge.data.sampler.MixtureSampler method) V validate() (kempnerforge.config.job.JobConfig method) (kempnerforge.config.JobConfig method) validate_world_size() (kempnerforge.config.distributed.DistributedConfig method) (kempnerforge.config.DistributedConfig method) vocab_size (kempnerforge.config.model.ModelConfig attribute) (kempnerforge.config.ModelConfig attribute) W wait() (kempnerforge.checkpoint.async_save.AsyncCheckpointer method) (kempnerforge.checkpoint.AsyncCheckpointer method) (kempnerforge.checkpoint.CheckpointManager method) (kempnerforge.checkpoint.manager.CheckpointManager method) wandb_project (kempnerforge.config.metrics.MetricsConfig attribute) (kempnerforge.config.MetricsConfig attribute) wandb_run_id (kempnerforge.config.metrics.MetricsConfig attribute) (kempnerforge.config.MetricsConfig attribute) wandb_run_name (kempnerforge.config.metrics.MetricsConfig attribute) (kempnerforge.config.MetricsConfig attribute) WandBBackend (class in kempnerforge.metrics.tracker) warmup_steps (kempnerforge.config.scheduler.SchedulerConfig attribute) (kempnerforge.config.SchedulerConfig attribute) weight (kempnerforge.config.data.DatasetSource attribute) weight_decay (kempnerforge.config.optimizer.OptimizerConfig attribute) (kempnerforge.config.OptimizerConfig attribute) wsd (kempnerforge.config.scheduler.SchedulerType attribute) wsd_decay_type (kempnerforge.config.scheduler.SchedulerConfig attribute) (kempnerforge.config.SchedulerConfig attribute) Z z_loss() (in module kempnerforge.training.loss) z_loss_weight (kempnerforge.config.TrainConfig attribute) (kempnerforge.config.training.TrainConfig attribute)