Source code for kempnerforge.config.data
"""Data pipeline configuration."""
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass
class DatasetSource:
    """A single data source in a multi-dataset mixture.

    Either ``path`` (pre-tokenized) or ``hf_name`` (HuggingFace) must be set.
    ``weight`` controls the relative sampling probability.
    """

    path: str = ""  # Pre-tokenized data directory
    weight: float = 1.0  # Relative sampling weight
    name: str = ""  # Name for per-dataset metrics (auto-derived if empty)
    hf_name: str = ""  # HuggingFace dataset name
    hf_config: str = ""  # HuggingFace dataset config

    def __post_init__(self) -> None:
        if self.weight <= 0:
            raise ValueError(f"Dataset weight must be positive, got {self.weight}")
@dataclass
class TrainingPhase:
    """A training phase with custom dataset weights and LR scaling.

    Used for data annealing: at ``start_step``, the mixture sampler switches
    to ``dataset_weights`` and the learning rate is multiplied by ``lr_scale``.
    """

    start_step: int = 0
    dataset_weights: dict[str, float] = field(default_factory=dict)
    lr_scale: float = 1.0

    def __post_init__(self) -> None:
        if self.start_step < 0:
            raise ValueError("TrainingPhase.start_step must be non-negative")
        if self.lr_scale <= 0:
            raise ValueError("TrainingPhase.lr_scale must be positive")
        for name, w in self.dataset_weights.items():
            if w < 0:
                raise ValueError(f"TrainingPhase.dataset_weights['{name}'] must be non-negative")
@dataclass
class DataConfig:
    """Data pipeline settings."""

    dataset_path: str = ""
    file_pattern: str = "*.npy"  # Glob pattern for data files (e.g., "*.npy", "tokenized_*.bin")
    tokenizer_path: str = ""
    num_workers: int = 4
    pin_memory: bool = True
    prefetch_factor: int = 2
    # For HuggingFace datasets
    hf_dataset_name: str | None = None
    hf_dataset_config: str | None = None  # Dataset config name (e.g., "wikitext-2-raw-v1")
    hf_dataset_split: str = "train"
    hf_dataset_text_field: str = "text"
    hf_streaming: bool = False  # Use streaming (IterableDataset) for large HF datasets
    pack_sequences: bool = False  # Document-aware packing with cross-doc isolation
    # Multi-dataset mixing (overrides dataset_path/hf_dataset_name when non-empty)
    datasets: list[DatasetSource] = field(default_factory=list)
    mix_temperature: float = 1.0  # Temperature for weight scaling (1.0 = use weights as-is, >1 flattens toward uniform)
    # Phase scheduling (step-triggered weight/LR transitions)
    phases: list[TrainingPhase] = field(default_factory=list)
    # Annealing shortcut (syntactic sugar for a common 2-phase pattern)
    anneal_start_step: int = 0  # 0 = disabled
    anneal_weights: dict[str, float] = field(default_factory=dict)

    def __post_init__(self) -> None:
        if self.num_workers < 0:
            raise ValueError("num_workers must be non-negative")
        if self.prefetch_factor < 1:
            raise ValueError("prefetch_factor must be >= 1")
        if self.mix_temperature <= 0:
            raise ValueError("mix_temperature must be positive")
        for src in self.datasets:
            if not src.path and not src.hf_name:
                raise ValueError(f"DatasetSource '{src.name}' must have either path or hf_name")
        # Phase validation
        if self.phases:
            steps = [p.start_step for p in self.phases]
            if steps != sorted(steps) or len(steps) != len(set(steps)):
                raise ValueError(
                    "data.phases start_steps must be strictly monotonically increasing"
                )
        if self.anneal_start_step < 0:
            raise ValueError("anneal_start_step must be non-negative")
        if self.phases and self.anneal_start_step > 0:
            raise ValueError(
                "Cannot use both data.phases and data.anneal_start_step — "
                "use phases for multi-phase scheduling"
            )
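def _example_data_config() -> DataConfig:
    # Illustrative sketch (not part of the original module): a full mixture
    # configuration using the annealing shortcut instead of explicit phases.
    # Paths, dataset names, and step counts are hypothetical placeholders.
    return DataConfig(
        tokenizer_path="/checkpoints/tokenizer",
        datasets=[
            DatasetSource(path="/data/pile_tokens", name="pile", weight=3.0),
            DatasetSource(hf_name="wikitext", hf_config="wikitext-103-raw-v1", name="wiki"),
        ],
        mix_temperature=1.0,
        # Annealing shortcut: from step 90_000 onward, sample only the "wiki" source.
        anneal_start_step=90_000,
        anneal_weights={"wiki": 1.0},
    )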
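def _example_temperature_weights(sources: list[DatasetSource], temperature: float) -> list[float]:
    # Sketch of one common convention for ``mix_temperature`` (an assumption;
    # the actual sampler in this package may normalize differently): raw
    # weights are raised to 1/temperature and renormalized, so a temperature
    # of 1.0 keeps the weights as-is and larger values flatten the sampling
    # distribution toward uniform.
    scaled = [src.weight ** (1.0 / temperature) for src in sources]
    total = sum(scaled)
    return [w / total for w in scaled]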