Source code for kempnerforge.config.vision
"""Vision-encoder configuration.
``VisionEncoderConfig`` selects and parameterizes the vision encoder
that the ``VLMWrapper`` composes alongside the text backbone and
adapter. It is a top-level section in TOML (``[vision_encoder]``),
sibling to ``[model]``, ``[adapter]``, and ``[vlm]``.
Field summary:
- ``type`` selects the encoder by registry key
(see ``registry.register_vision_encoder``). Defaults to ``"random"``
for tests; production configs set ``"siglip2"`` / ``"clip"`` etc.
- ``path`` is the HF Hub id or local path passed to the encoder
builder. Empty string is accepted for stub encoders (``"random"``).
- ``feature_dim`` is the output feature dim of the encoder. ``0`` means
"infer from the encoder at build time".
- ``num_tokens`` is the number of image tokens the encoder produces per
image. ``0`` means "infer at build time". When ``> 0`` it is cross-
checked against ``model.max_seq_len`` at config time inside
``JobConfig.__post_init__``.
"""
from __future__ import annotations
from dataclasses import dataclass
from kempnerforge.config.registry import registry
[docs]
@dataclass
class VisionEncoderConfig:
"""Configuration for the vision encoder component of a VLM."""
type: str = "random"
path: str = ""
feature_dim: int = 0
num_tokens: int = 0
def __post_init__(self) -> None:
# Late import: importing the encoder module triggers the
# ``@registry.register_vision_encoder`` decorators that populate the
# registry. Without this, ``list_vision_encoders()`` would return an
# empty list when this dataclass is constructed before any encoder
# module has been imported (e.g. in unit-test isolation).
import kempnerforge.model.vision # noqa: F401, PLC0415
registered = tuple(registry.list_vision_encoders())
if self.type not in registered:
raise ValueError(
f"Unknown vision_encoder.type: {self.type!r}. Registered: {sorted(registered)}."
)
if self.feature_dim < 0:
raise ValueError(
f"vision_encoder.feature_dim must be non-negative (got {self.feature_dim})"
)
if self.num_tokens < 0:
raise ValueError(
f"vision_encoder.num_tokens must be non-negative (got {self.num_tokens})"
)