# Source code for kempnerforge.config.video

"""Video input configuration.

``VideoConfig`` is the ``[video]`` top-level section. When present, the job
trains on a video dataset through the VLM wrapper: a clip is decoded into an
ordered set of frames, each preprocessed like an image and fed to the vision
encoder. The section is a sibling of ``[vision_encoder]`` / ``[adapter]`` /
``[vlm]`` and requires ``[vlm]`` to be set.

Frame-sampling defaults follow the Molmo2 paper (sample at ``fps`` per second,
include the first and last frame, cap at ``max_frames``). ``max_frames`` is the
per-clip frame budget; the number of visual tokens it implies
(``max_frames * tokens_per_frame``) feeds the residual-stream / sequence-length
math once the model consumes video.
"""

from __future__ import annotations

from dataclasses import dataclass

# Dataset splits accepted for ``VideoConfig.split``; ``VideoConfig.__post_init__``
# raises ``ValueError`` for any value not listed here.
_VIDEO_SPLITS = ("train", "validation")


@dataclass
class VideoConfig:
    """Video dataset location and frame-sampling knobs.

    All fields are validated in ``__post_init__``; an invalid value raises
    ``ValueError`` naming the offending ``video.<field>``.

    Fields:
        data_root: Root directory of the on-disk video dataset.
        dataset_type: Registry key for the dataset builder (``"webvid"`` default).
        dataset_name: On-disk corpus name within a style (e.g. ``"webvid-10M"``).
        sampling_policy: Registry key for the frame-sampling policy (``"uniform"``).
        split: Which split to read (``"train"`` or ``"validation"``).
        max_samples: Cap the manifest to this many examples (``0`` = all).
        max_frames: Maximum frames sampled per clip (the per-clip budget).
        min_frames: Minimum frames sampled per clip; short clips pad up to this.
        fps: Target sampling rate in frames per second (Molmo2 uses 2).
        frame_size: Square pixel size each frame is resized to.
        prompt: Optional instruction prepended to the target text, masked from loss.

    Raises:
        ValueError: If ``dataset_type`` or ``sampling_policy`` is not a
            registered key, ``split`` is not a known split, ``max_samples`` is
            negative, ``min_frames`` / ``max_frames`` are below 1 or
            ``min_frames`` exceeds ``max_frames``, ``fps`` is not positive
            (NaN included), or ``frame_size`` is not positive.
    """

    data_root: str = ""
    dataset_type: str = "webvid"
    dataset_name: str = "webvid-10M"
    sampling_policy: str = "uniform"
    split: str = "train"
    max_samples: int = 0
    max_frames: int = 16
    min_frames: int = 4
    fps: float = 2.0
    frame_size: int = 224
    prompt: str = ""

    def __post_init__(self) -> None:
        """Validate every field, raising ``ValueError`` on the first bad one."""
        # Late imports populate the dataset/sampling registries (their decorators
        # run on import) and avoid a config->data import cycle; only hit for a
        # video job. ``av`` is not required here (it is lazy inside the decoder).
        import kempnerforge.data.video_dataset  # noqa: F401, PLC0415
        import kempnerforge.data.video_io  # noqa: F401, PLC0415
        from kempnerforge.config.registry import registry  # noqa: PLC0415

        # Fetch each registry listing once and reuse it for the error message.
        known_datasets = registry.list_video_datasets()
        if self.dataset_type not in known_datasets:
            raise ValueError(
                f"video.dataset_type must be one of {sorted(known_datasets)} "
                f"(got {self.dataset_type!r})"
            )

        known_policies = registry.list_sampling_policies()
        if self.sampling_policy not in known_policies:
            raise ValueError(
                "video.sampling_policy must be one of "
                f"{sorted(known_policies)} (got {self.sampling_policy!r})"
            )

        if self.split not in _VIDEO_SPLITS:
            raise ValueError(f"video.split must be one of {_VIDEO_SPLITS} (got {self.split!r})")

        self._validate_numeric_fields()

    def _validate_numeric_fields(self) -> None:
        """Range-check the numeric fields; needs no registry or project imports."""
        if self.max_samples < 0:
            raise ValueError(f"video.max_samples must be non-negative (got {self.max_samples})")
        if self.min_frames < 1:
            raise ValueError(f"video.min_frames must be >= 1 (got {self.min_frames})")
        if self.max_frames < 1:
            raise ValueError(f"video.max_frames must be >= 1 (got {self.max_frames})")
        if self.min_frames > self.max_frames:
            raise ValueError(
                f"video.min_frames ({self.min_frames}) must be <= video.max_frames "
                f"({self.max_frames})"
            )
        # Written as ``not (fps > 0)`` rather than ``fps <= 0`` so that NaN is
        # rejected too: every ordered comparison with NaN is False, so the
        # ``<=`` form would silently accept it.
        if not (self.fps > 0):
            raise ValueError(f"video.fps must be positive (got {self.fps})")
        if self.frame_size <= 0:
            raise ValueError(f"video.frame_size must be positive (got {self.frame_size})")