# Source code for kempnerforge.config.video
"""Video input configuration.
``VideoConfig`` is the ``[video]`` top-level section. When present, the job
trains on a video dataset through the VLM wrapper: a clip is decoded into an
ordered set of frames, each preprocessed like an image and fed to the vision
encoder. The section is a sibling of ``[vision_encoder]`` / ``[adapter]`` /
``[vlm]`` and requires ``[vlm]`` to be set.
Frame-sampling defaults follow the Molmo2 paper (sample at ``fps`` per second,
include the first and last frame, cap at ``max_frames``). ``max_frames`` is the
per-clip frame budget; the number of visual tokens it implies
(``max_frames * tokens_per_frame``) feeds the residual-stream / sequence-length
math once the model consumes video.
"""
from __future__ import annotations
from dataclasses import dataclass
# Dataset splits accepted for ``VideoConfig.split``; ``__post_init__`` rejects
# any value that is not a member of this tuple.
_VIDEO_SPLITS = ("train", "validation")
# [docs]
@dataclass
class VideoConfig:
    """Video dataset location and frame-sampling knobs.

    Every field is validated in ``__post_init__``; an invalid value raises
    ``ValueError`` with a message naming the offending ``video.<field>``.

    Fields:
        data_root: Root directory of the on-disk video dataset.
        dataset_type: Registry key for the dataset builder (``"webvid"`` default).
        dataset_name: On-disk corpus name within a style (e.g. ``"webvid-10M"``).
        sampling_policy: Registry key for the frame-sampling policy (``"uniform"``).
        split: Which split to read (``"train"`` or ``"validation"``).
        max_samples: Cap the manifest to this many examples (``0`` = all).
        max_frames: Maximum frames sampled per clip (the per-clip budget).
        min_frames: Minimum frames sampled per clip; short clips pad up to this.
        fps: Target sampling rate in frames per second (Molmo2 uses 2). Must be
            a finite, strictly positive number.
        frame_size: Square pixel size each frame is resized to.
        prompt: Optional instruction prepended to the target text, masked from loss.
    """

    data_root: str = ""
    dataset_type: str = "webvid"
    dataset_name: str = "webvid-10M"
    sampling_policy: str = "uniform"
    split: str = "train"
    max_samples: int = 0
    max_frames: int = 16
    min_frames: int = 4
    fps: float = 2.0
    frame_size: int = 224
    prompt: str = ""

    def __post_init__(self) -> None:
        """Validate the section after the dataclass has populated its fields.

        Checks run in a fixed order (registry keys, split, numeric knobs) and
        stop at the first failure.

        Raises:
            ValueError: If ``dataset_type`` or ``sampling_policy`` is not a
                registered key, ``split`` is not an accepted split, or any
                numeric field is out of range.
        """
        self._validate_registry_keys()
        if self.split not in _VIDEO_SPLITS:
            raise ValueError(f"video.split must be one of {_VIDEO_SPLITS} (got {self.split!r})")
        self._validate_numeric_fields()

    def _validate_registry_keys(self) -> None:
        """Check ``dataset_type`` and ``sampling_policy`` against the registry.

        Raises:
            ValueError: If either key is not currently registered.
        """
        # Late imports populate the dataset/sampling registries (their decorators
        # run on import) and avoid a config->data import cycle; only hit for a
        # video job. ``av`` is not required here (it is lazy inside the decoder).
        import kempnerforge.data.video_dataset  # noqa: F401, PLC0415
        import kempnerforge.data.video_io  # noqa: F401, PLC0415
        from kempnerforge.config.registry import registry  # noqa: PLC0415

        if self.dataset_type not in registry.list_video_datasets():
            raise ValueError(
                f"video.dataset_type must be one of {sorted(registry.list_video_datasets())} "
                f"(got {self.dataset_type!r})"
            )
        if self.sampling_policy not in registry.list_sampling_policies():
            raise ValueError(
                "video.sampling_policy must be one of "
                f"{sorted(registry.list_sampling_policies())} (got {self.sampling_policy!r})"
            )

    def _validate_numeric_fields(self) -> None:
        """Range-check the sample cap, frame budget, sampling rate and frame size.

        Raises:
            ValueError: If ``max_samples`` is negative, ``min_frames`` or
                ``max_frames`` is below 1, ``min_frames`` exceeds
                ``max_frames``, ``fps`` is not a finite positive number, or
                ``frame_size`` is not positive.
        """
        # Function-scope import, matching the late-import style used elsewhere
        # in this class, so the module's import block stays untouched.
        import math  # noqa: PLC0415

        if self.max_samples < 0:
            raise ValueError(f"video.max_samples must be non-negative (got {self.max_samples})")
        if self.min_frames < 1:
            raise ValueError(f"video.min_frames must be >= 1 (got {self.min_frames})")
        if self.max_frames < 1:
            raise ValueError(f"video.max_frames must be >= 1 (got {self.max_frames})")
        if self.min_frames > self.max_frames:
            raise ValueError(
                f"video.min_frames ({self.min_frames}) must be <= video.max_frames "
                f"({self.max_frames})"
            )
        if self.fps <= 0:
            raise ValueError(f"video.fps must be positive (got {self.fps})")
        # NaN compares False against every bound and +inf is greater than zero,
        # so the ``<= 0`` test above lets both through; reject them explicitly.
        if not math.isfinite(self.fps):
            raise ValueError(f"video.fps must be finite (got {self.fps})")
        if self.frame_size <= 0:
            raise ValueError(f"video.frame_size must be positive (got {self.frame_size})")