Source code for kempnerpulse.config

"""Cross-cutting tier — parsed configuration.

Turns command-line arguments into a single frozen ``Config`` value that the
lifecycle, reader, translate, compute, and present layers all read from. This
module owns *parsing and validation only*: it builds the argument parser,
resolves the weight preset, applies the backend-aware ``--poll`` default, and
reports poll-validation problems as data. It never prints, never exits, and
never installs signal handlers — those are the lifecycle's responsibility.

Runtime dependencies are the standard library only.
"""
from __future__ import annotations

import argparse
import math
from dataclasses import dataclass
from typing import Optional, Tuple

from .compute.presets import (
    DEFAULT_PRESET_NAME,
    PRESETS,
    Weights,
    preset_name_for_weights,
)
from .reader.base import BackendKind

# DCGM profiling counters refresh at ~10 Hz through the shared hardware-counter
# multiplexer; below this floor a direct ``dcgmi`` stream returns mostly-blank
# profiling rows, so a sub-floor ``--poll`` is clamped up to it. Kept in sync
# with the reader's ``DCGM_STREAM_MIN_INTERVAL_MS``.
DCGM_STREAM_MIN_INTERVAL_MS = 100

# Backend-aware ``--poll`` defaults (seconds), applied when ``--poll`` is unset.
# ``build_config`` uses the DCGM value only for the dcgm backend; every other
# backend receives the Prometheus value.
DEFAULT_POLL_DCGM_SECONDS = 0.1
DEFAULT_POLL_PROMETHEUS_SECONDS = 1.0

# Minimum samples retained for sparkline history regardless of ``--history``;
# ``build_config`` raises smaller requests up to this floor.
MIN_HISTORY_LENGTH = 10

# Default value of ``--source``.
_DEFAULT_SOURCE = "http://localhost:9400/metrics"

# CLI backend token -> reader BackendKind. The keys must cover every
# ``--backend`` choice offered by the parser, because ``build_config`` indexes
# this mapping directly with the parsed token.
_BACKEND_BY_NAME = {
    "dcgm": BackendKind.DCGMI,
    "prometheus": BackendKind.PROMETHEUS,
    "replay": BackendKind.REPLAY,   # replay a saved capture (--source FILE); no GPU needed
}


@dataclass(frozen=True)
class Config:
    """Immutable, fully-resolved settings for a single run.

    An instance is built once from the command line and then only read.

    Notes on individual fields:

    * ``gpu_ids`` holds the *explicit* ``--gpus`` selection as a tuple of
      string ids, or ``None`` when the flag was not given. Resolving the
      environment / accessible GPUs is the selection layer's job, not
      config's.
    * ``weights`` is normalized to sum to 1, and ``preset_name`` names the
      matching preset (``"ai"`` / ``"hpc"`` / ``"mem"``) or is ``"custom"``.
    * ``export_spec`` is ``None`` (no export), ``"default"`` (bare
      ``--export``), ``"all"``, or a comma-separated column list.
    * ``focus_gpu`` is the GPU id to start focused on, or ``None``.
    """

    # Which reader backend supplies the metrics.
    backend: BackendKind
    # Sampling / refresh interval in seconds (as requested, or the backend default).
    poll_seconds: float
    # Value of ``--source``.
    source: str
    # Explicit ``--gpus`` selection, or ``None`` when the flag was absent.
    gpu_ids: Optional[Tuple[str, ...]]
    # ``--show-all`` flag.
    show_all: bool
    # Real-util weights in SM, TENSOR, DRAM, GR order.
    weights: Weights
    # Name of the preset matching ``weights``, or ``"custom"``.
    preset_name: str
    # ``--export`` column specification, or ``None`` when not exporting.
    export_spec: Optional[str]
    # ``--once`` flag: render one snapshot and exit.
    once: bool
    # GPU id to open in focused view, or ``None``.
    focus_gpu: Optional[str]
    # Number of samples kept for sparkline history.
    history_length: int
def _pkg_version() -> str:
    """Return the installed ``kempnerpulse`` version for ``--version``.

    This is strictly best-effort: if the metadata machinery cannot be
    imported, the package is not installed, or the lookup fails for any other
    reason, the literal string ``"unknown"`` is returned instead of raising.
    """
    fallback = "unknown"
    try:
        from importlib import metadata
    except ImportError:  # pragma: no cover - importlib.metadata is stdlib >=3.8
        return fallback
    try:
        installed = metadata.version("kempnerpulse")
    except Exception:
        # Covers PackageNotFoundError as well as any other lookup failure;
        # a version banner must never be the reason the CLI fails to start.
        return fallback
    return installed
def parse_weights(raw: str) -> Weights:
    """Validate a ``--weights`` string into a normalized 4-tuple.

    Requires exactly four comma-separated values in ``SM,TENSOR,DRAM,GR``
    order. Every value must be a finite, non-negative number and the values
    must sum to a positive, finite total; the tuple is then normalized to sum
    to 1 (left untouched when it already does, within ``1e-6``).

    Empty fields (for example a trailing comma) are ignored before counting.

    Raises ``argparse.ArgumentTypeError`` on any malformed input so argparse
    reports it cleanly.
    """
    parts = [field for field in (p.strip() for p in raw.split(",")) if field]
    if len(parts) != 4:
        raise argparse.ArgumentTypeError(
            "--weights requires four comma-separated values: SM,TENSOR,DRAM,GR"
        )
    try:
        vals = tuple(float(p) for p in parts)
    except ValueError as exc:
        raise argparse.ArgumentTypeError("--weights values must be numeric") from exc

    # float() accepts "nan" and "inf". NaN compares False against everything,
    # so it would pass both the sum check and the normalization check below
    # and leak into the score.
    if not all(math.isfinite(v) for v in vals):
        raise argparse.ArgumentTypeError("--weights values must be finite numbers")
    # A negative weight has no meaning in a weighted utilization mix and can
    # still pass the positive-sum check when the other weights outweigh it.
    if any(v < 0 for v in vals):
        raise argparse.ArgumentTypeError("--weights values must be non-negative")

    total = sum(vals)
    if total <= 0:
        raise argparse.ArgumentTypeError("--weights must sum to a positive value")
    # Individually finite values can still overflow to inf when summed, which
    # would normalize every weight to 0.
    if not math.isfinite(total):
        raise argparse.ArgumentTypeError("--weights values are too large")

    if abs(total - 1.0) > 1e-6:
        vals = tuple(v / total for v in vals)
    return vals  # type: ignore[return-value]
# Long-form ``--help`` epilog. It is rendered verbatim (the parser uses
# ``RawDescriptionHelpFormatter``), so the line breaks and indentation below
# are exactly what the user sees. Every ``--backend`` choice accepted by the
# parser is described here, including ``replay``.
_HELP_EPILOG = """
Application:
  KempnerPulse is a terminal dashboard for NVIDIA DCGM hardware-counter metrics.
  It is designed to help distinguish idle GPUs, real compute, memory pressure,
  transfer/copy pressure, and hardware-health issues at a glance.

Real util equation:
  RealUtil = clamp(0, 100,
      Wsm * SM_ACTIVE + Wtensor * TENSOR_ACTIVE + Wdram * DRAM_ACTIVE + Wgr * GR_ENGINE_ACTIVE)

  GR_ENGINE_ACTIVE is a profiling-level hardware counter (DCGM field 1001).
  If it is unavailable the dashboard falls back to GPU_UTIL (field 203).

Weight presets (convenience flags):
  --ai-weights    AI / LLM training and inference   (0.35,0.35,0.20,0.10) [default]
  --hpc-weights   General mixed CUDA / HPC          (0.45,0.15,0.25,0.15)
  --mem-weights   Memory-bound / bandwidth-heavy    (0.35,0.10,0.40,0.15)

  Or supply custom weights with --weights W_SM,W_TENSOR,W_DRAM,W_GR (normalized to sum to 1).

GPU visibility selection:
  The dashboard uses the first matching source in this order:
    1. --gpus
    2. CUDA_VISIBLE_DEVICES
    3. NVIDIA_VISIBLE_DEVICES
    4. SLURM_STEP_GPUS
    5. SLURM_JOB_GPUS

  If none are usable, all GPUs accessible to the process are shown.
  Use --show-all to ignore the environment and show every accessible GPU,
  or --gpus to force an explicit list.

  All selections are filtered against GPUs accessible to the current process
  (as reported by nvidia-smi), respecting cgroup and container restrictions.

Backend selection:
  --backend dcgm (default)
      Query dcgmi dmon directly for true per-sample resolution (down to a
      100ms floor). Best for single-node workload profiling. Requires the
      DCGM host engine.
  --backend prometheus
      Read metrics from the dcgm-exporter Prometheus HTTP endpoint. Profiling
      fields update at the exporter's configured interval (typically ~30s).
      Best for fleet-level monitoring; requires --poll >= 1.0.
  --backend replay
      Replay a saved capture given as --source FILE. No GPU is needed.

Examples:
  kempnerpulse
  kempnerpulse --poll 1.0
  kempnerpulse --focus-gpu 0
  kempnerpulse --hpc-weights
  kempnerpulse --weights 0.40,0.30,0.20,0.10
  kempnerpulse --gpus 2,3
  kempnerpulse --show-all
  kempnerpulse --source http://otherhost:9400/metrics
  kempnerpulse --backend dcgm --poll 0.5
  kempnerpulse --backend dcgm --export all --poll 0.1
"""
def build_parser() -> argparse.ArgumentParser:
    """Construct the KempnerPulse command-line parser.

    Returned standalone so callers (and tests) can introspect or reuse it
    without triggering a parse. Defaults mirror the legacy CLI surface; the
    weight presets and custom-weight default are sourced from the compute
    layer's ``PRESETS`` so the two never drift.

    The ``--weights`` option and the three preset flags all write to the same
    ``weights`` destination, so whichever appears last on the command line
    wins. ``--poll`` defaults to ``None`` here; the backend-aware default is
    applied later by ``build_config``.
    """
    default_weights = PRESETS[DEFAULT_PRESET_NAME]

    parser = argparse.ArgumentParser(
        prog="kempnerpulse",
        description=(
            "KempnerPulse: CLI dashboard for NVIDIA DCGM hardware-counter metrics "
            "with SLURM/CUDA GPU visibility awareness"
        ),
        epilog=_HELP_EPILOG,
        # Keep the hand-formatted epilog exactly as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--version", action="version", version=f"%(prog)s {_pkg_version()}"
    )
    parser.add_argument(
        "--source",
        default=_DEFAULT_SOURCE,
        help=(
            "Path to a dcgm-exporter text file or an http(s) /metrics endpoint. "
            "With --backend replay, the saved capture file to replay. "
            f"Default: {_DEFAULT_SOURCE}"
        ),
    )
    parser.add_argument(
        "--backend",
        choices=["prometheus", "dcgm", "replay"],
        default="dcgm",
        help=(
            "Metric collection backend. 'prometheus' reads from the dcgm-exporter "
            "HTTP endpoint (~30s resolution for profiling fields). 'dcgm' queries "
            "dcgmi dmon directly for true high-resolution sampling (down to 100ms). "
            "'replay' replays a saved capture given via --source (no GPU needed). "
            "Default: dcgm"
        ),
    )
    parser.add_argument(
        "--poll",
        type=float,
        # None means "not supplied"; build_config picks the backend default.
        default=None,
        help=(
            "Sampling/refresh interval in seconds. With --backend dcgm, drives a "
            "persistent dcgmi stream honored down to a 100ms floor (DCGM profiling "
            "counters refresh at ~10Hz internally; smaller values would yield blank "
            "profiling rows). With --backend prometheus, must be >= 1.0 "
            "(dcgm-exporter scrapes profiling fields at ~30s, so sub-second values "
            "just duplicate samples). Default: 0.1 (dcgm) / 1.0 (prometheus, replay)."
        ),
    )
    parser.add_argument(
        "--history",
        type=int,
        default=120,
        help="Number of samples kept for sparkline history. Default: 120",
    )
    parser.add_argument(
        "--focus-gpu",
        default=None,
        help="Start in focused view for one GPU id, for example 0",
    )
    parser.add_argument(
        "--once",
        action="store_true",
        help="Render one snapshot and exit instead of running live",
    )
    parser.add_argument(
        "--gpus",
        default=None,
        help=(
            "Explicit GPU ids or ranges to show, for example 0,1 or 0-3. "
            "Overrides SLURM/CUDA visibility env vars."
        ),
    )
    parser.add_argument(
        "--show-all",
        action="store_true",
        help="Ignore SLURM/CUDA visibility env vars and show every accessible GPU",
    )
    parser.add_argument(
        "--weights",
        type=parse_weights,
        default=default_weights,
        help=(
            "Comma-separated real-util weights in SM,TENSOR,DRAM,GR order. Values "
            "are normalized to sum to 1. Example: --weights 0.40,0.30,0.20,0.10"
        ),
    )
    # The preset flags share dest="weights" with --weights.
    parser.add_argument(
        "--ai-weights",
        dest="weights",
        action="store_const",
        const=PRESETS["ai"],
        help="Use AI/LLM training weight preset (0.35,0.35,0.20,0.10) — the default",
    )
    parser.add_argument(
        "--hpc-weights",
        dest="weights",
        action="store_const",
        const=PRESETS["hpc"],
        help="Use general HPC weight preset (0.45,0.15,0.25,0.15)",
    )
    parser.add_argument(
        "--mem-weights",
        dest="weights",
        action="store_const",
        const=PRESETS["mem"],
        help="Use memory-bound weight preset (0.35,0.10,0.40,0.15)",
    )
    parser.add_argument(
        "--export",
        # Bare --export yields "default"; omitting the flag yields None.
        nargs="?",
        const="default",
        default=None,
        metavar="COLS",
        help=(
            "Output CSV to stdout. Use --export for default columns, --export all "
            "for every column, or --export col1,col2,... for a custom set."
        ),
    )
    return parser
def build_config(argv: Optional[list[str]] = None) -> Config:
    """Parse ``argv`` (or ``sys.argv`` when ``None``) into a frozen ``Config``.

    Steps performed:

    * map the ``--backend`` token to its ``BackendKind``;
    * fill in the backend-aware ``--poll`` default when the flag was omitted
      (the DCGM default for the dcgm backend, the Prometheus default for
      every other backend);
    * name the weight preset matching the chosen weights;
    * split ``--gpus`` on commas into a tuple of tokens, passed through
      unchanged (no stripping or range expansion happens here);
    * floor the history length at ``MIN_HISTORY_LENGTH``.

    Poll *values* are not validated here — call :func:`validate_poll` on the
    returned config. Argument errors are reported by argparse itself.
    """
    args = build_parser().parse_args(argv)

    backend = _BACKEND_BY_NAME[args.backend]

    if args.poll is not None:
        poll_seconds = args.poll
    elif backend is BackendKind.DCGMI:
        poll_seconds = DEFAULT_POLL_DCGM_SECONDS
    else:
        poll_seconds = DEFAULT_POLL_PROMETHEUS_SECONDS

    weights: Weights = tuple(args.weights)  # type: ignore[assignment]
    preset_name = preset_name_for_weights(weights)

    # ``None`` (flag absent) stays ``None``; a string becomes a tuple of tokens.
    requested_gpus = args.gpus
    gpu_ids: Optional[Tuple[str, ...]] = (
        tuple(requested_gpus.split(",")) if isinstance(requested_gpus, str) else None
    )

    history_length = max(MIN_HISTORY_LENGTH, args.history)

    return Config(
        backend=backend,
        poll_seconds=poll_seconds,
        source=args.source,
        gpu_ids=gpu_ids,
        show_all=args.show_all,
        weights=weights,
        preset_name=preset_name,
        export_spec=args.export,
        once=args.once,
        focus_gpu=args.focus_gpu,
        history_length=history_length,
    )
@dataclass(frozen=True)
class PollValidation:
    """Result of checking the poll interval, handed back as plain data.

    Nothing here prints or exits; the lifecycle decides what to do with it.

    * ``error`` — user-facing message when the configured poll is invalid
      (the lifecycle should print it and exit non-zero), otherwise ``None``.
    * ``clamped`` / ``note`` — when the dcgm backend is asked for a sub-floor
      interval, ``clamped`` is ``True`` and ``note`` carries an advisory the
      lifecycle may print to stderr.
    * ``effective_poll_seconds`` — the interval that will actually be used.
    """

    # Validation failure message, or None when the poll value is acceptable.
    error: Optional[str]
    # True when the requested interval was raised to the profiling floor.
    clamped: bool
    # Advisory text accompanying a clamp, else None.
    note: Optional[str]
    # Interval (seconds) to use after validation.
    effective_poll_seconds: float
def validate_poll(config: Config) -> PollValidation:
    """Validate ``config.poll_seconds`` against the backend, returning data only.

    Rules (ported from the legacy CLI):

    * a non-finite poll (``nan`` / ``inf``) or ``poll <= 0`` is an error for
      any backend;
    * prometheus requires ``poll >= 1.0`` (the exporter's scrape interval is
      the true ceiling; sub-second values just duplicate samples);
    * dcgm allows a sub-100ms request but clamps it up to the 100ms profiling
      floor, reported via ``clamped`` / ``note`` rather than printed here.

    No printing or exiting happens in this function; the lifecycle owns that.
    """
    poll = config.poll_seconds

    # ``--poll nan`` parses as a float and NaN compares False against
    # everything, so ``poll <= 0`` alone lets it through: the Prometheus path
    # would accept it and the clamp below would raise from int(round(...)).
    # Infinity overflows at the same spot. Reject both as data, like any other
    # invalid poll.
    if not math.isfinite(poll) or poll <= 0:
        return PollValidation(
            error=(
                f"--poll must be a positive number of seconds (got {poll}). "
                "Use e.g. --poll 0.1 for 100ms or --poll 2 for 2s."
            ),
            clamped=False,
            note=None,
            effective_poll_seconds=poll,
        )

    if config.backend is BackendKind.PROMETHEUS:
        if poll < 1.0:
            return PollValidation(
                error=(
                    f"--poll {poll}s is below the Prometheus backend's effective "
                    "sampling rate. dcgm-exporter scrapes DCGM at ~30s for "
                    "profiling fields, so sub-second --poll values produce "
                    "duplicate samples with no new data. Use --backend dcgm for "
                    "true high-resolution sampling, or raise --poll to >= 1.0."
                ),
                clamped=False,
                note=None,
                effective_poll_seconds=poll,
            )
        return PollValidation(
            error=None, clamped=False, note=None, effective_poll_seconds=poll
        )

    # dcgm backend: clamp sub-floor requests up to the profiling floor.
    # NOTE(review): every non-Prometheus backend reaches this point, so
    # ``replay`` is clamped (and receives the DCGM-worded note) as well —
    # confirm that is intended.
    requested_ms = int(round(poll * 1000))
    if requested_ms < DCGM_STREAM_MIN_INTERVAL_MS:
        effective = DCGM_STREAM_MIN_INTERVAL_MS / 1000.0
        note = (
            "DCGM profiling counters (SM/Tensor/DRAM Active, etc.) refresh at "
            f"~10Hz internally; --poll {poll}s would yield mostly-blank profiling "
            f"rows. Clamping to {DCGM_STREAM_MIN_INTERVAL_MS}ms."
        )
        return PollValidation(
            error=None, clamped=True, note=note, effective_poll_seconds=effective
        )
    return PollValidation(
        error=None, clamped=False, note=None, effective_poll_seconds=poll
    )