Source code for kempnerforge.config.video

"""Video input configuration.

``VideoConfig`` is the ``[video]`` top-level section. When present, the job
trains on a video dataset through the VLM wrapper: a clip is decoded into an
ordered set of frames, each preprocessed like an image and fed to the vision
encoder. The section is a sibling of ``[vision_encoder]`` / ``[adapter]`` /
``[vlm]`` and requires ``[vlm]`` to be set.

Frame-sampling defaults follow the Molmo2 paper (sample at ``fps`` per second,
include the first and last frame, cap at ``max_frames``). ``max_frames`` is the
per-clip frame budget; the number of visual tokens it implies
(``max_frames * tokens_per_frame``) feeds the residual-stream / sequence-length
math once the model consumes video.
"""

from __future__ import annotations

from dataclasses import dataclass

_VIDEO_SPLITS = ("train", "validation")



[docs]
@dataclass
class VideoConfig:
    """Video dataset location and frame-sampling knobs.

    Every field is validated in ``__post_init__``, so an instance that was
    constructed successfully already satisfies the constraints listed below.

    Fields:
        data_root: Root directory of the on-disk video dataset.
        dataset_type: Registry key for the dataset builder (``"webvid"`` default).
        dataset_name: On-disk corpus name within a style (e.g. ``"webvid-10M"``).
        sampling_policy: Registry key for the frame-sampling policy (``"uniform"``).
        split: Which split to read (``"train"`` or ``"validation"``).
        max_samples: Cap the manifest to this many examples (``0`` = all).
        max_frames: Maximum frames sampled per clip (the per-clip budget).
        min_frames: Minimum frames sampled per clip; short clips pad up to this.
        fps: Target sampling rate in frames per second (Molmo2 uses 2).
        frame_size: Square pixel size each frame is resized to.
        prompt: Optional instruction prepended to the target text, masked from loss.

    Raises:
        ValueError: If ``dataset_type`` or ``sampling_policy`` is not a key known
            to the registry, ``split`` is not one of ``_VIDEO_SPLITS``,
            ``max_samples`` is negative, ``min_frames`` or ``max_frames`` is
            below 1, ``min_frames`` exceeds ``max_frames``, ``fps`` is not
            strictly positive (NaN included), or ``frame_size`` is not positive.
    """

    data_root: str = ""
    dataset_type: str = "webvid"
    dataset_name: str = "webvid-10M"
    sampling_policy: str = "uniform"
    split: str = "train"
    max_samples: int = 0
    max_frames: int = 16
    min_frames: int = 4
    fps: float = 2.0
    frame_size: int = 224
    prompt: str = ""

    def __post_init__(self) -> None:
        """Validate all fields; raise ``ValueError`` on the first violation."""
        # Late imports populate the dataset/sampling registries (their decorators
        # run on import) and avoid a config->data import cycle; only hit for a
        # video job. ``av`` is not required here (it is lazy inside the decoder).
        import kempnerforge.data.video_dataset  # noqa: F401, PLC0415
        import kempnerforge.data.video_io  # noqa: F401, PLC0415
        from kempnerforge.config.registry import registry  # noqa: PLC0415

        # Fetch each listing once; it serves both the membership test and the
        # error message.
        video_datasets = registry.list_video_datasets()
        if self.dataset_type not in video_datasets:
            raise ValueError(
                f"video.dataset_type must be one of {sorted(video_datasets)} "
                f"(got {self.dataset_type!r})"
            )
        sampling_policies = registry.list_sampling_policies()
        if self.sampling_policy not in sampling_policies:
            raise ValueError(
                "video.sampling_policy must be one of "
                f"{sorted(sampling_policies)} (got {self.sampling_policy!r})"
            )
        if self.split not in _VIDEO_SPLITS:
            raise ValueError(f"video.split must be one of {_VIDEO_SPLITS} (got {self.split!r})")
        self._validate_numeric_fields()

    def _validate_numeric_fields(self) -> None:
        """Range-check the numeric fields; needs no registry or data imports.

        Raises:
            ValueError: On the first numeric field found out of range.
        """
        if self.max_samples < 0:
            raise ValueError(f"video.max_samples must be non-negative (got {self.max_samples})")
        if self.min_frames < 1:
            raise ValueError(f"video.min_frames must be >= 1 (got {self.min_frames})")
        if self.max_frames < 1:
            raise ValueError(f"video.max_frames must be >= 1 (got {self.max_frames})")
        if self.min_frames > self.max_frames:
            raise ValueError(
                f"video.min_frames ({self.min_frames}) must be <= video.max_frames "
                f"({self.max_frames})"
            )
        # ``not fps > 0`` rather than ``fps <= 0``: every ordering comparison with
        # NaN is False, so the latter would let a NaN sampling rate slip through.
        if not self.fps > 0:
            raise ValueError(f"video.fps must be positive (got {self.fps})")
        if self.frame_size <= 0:
            raise ValueError(f"video.frame_size must be positive (got {self.frame_size})")