Coverage for src/ai_jury/adapters.py: 99%

1"""Agent adapters — each wraps one native coding-agent CLI in headless mode.

3Every adapter turns a prompt into a subprocess invocation and captures stdout as

4the agent's response. Adapters are intentionally thin: the orchestrator owns the

5prompt content and the round structure; an adapter only knows how to *invoke its

6CLI*.

8Headless invocations (verified against installed CLIs, early 2026). The prompt

9embeds the redacted diff, so it is delivered on STDIN (never argv) for every

10real adapter so it is not exposed in the process list (issue #287):

11 - Claude Code : ``claude -p --output-format text`` (prompt piped via stdin)

12 - Codex CLI : ``codex exec <args>`` (prompt piped via stdin)

13 - Antigravity : ``agy --print`` (prompt piped via stdin)

14"""

16from __future__ import annotations

18import contextlib

19import os

20import re

21import shutil

22import signal

23import subprocess

24import time

25from dataclasses import dataclass, field

27from . import privilege, redaction

28from .config import AgentSpec

# Upper bound on the body read from a single local-model HTTP response (issue
# #293/F-9). Chat completions are small, so an unbounded read would only give
# a malicious or buggy endpoint a way to exhaust this process's memory.
_MAX_RESPONSE_BYTES = 16 * 1024**2

def _kill_process_group(proc: subprocess.Popen) -> None:
    """Kill ``proc`` together with its process group, ignoring failures (issue #293/F-7).

    Where the platform provides ``os.killpg``, SIGKILL is sent to the group
    that ``proc.pid`` belongs to. If that is unavailable or fails, the
    function falls back to killing only the direct child. Nothing is raised
    for OS-level errors: this is cleanup, not control flow.
    """
    killpg = getattr(os, "killpg", None)
    if killpg is not None:
        try:
            killpg(os.getpgid(proc.pid), signal.SIGKILL)
        except OSError:
            # Covers ProcessLookupError / PermissionError as well (both are
            # OSError subclasses); fall through to the single-process kill.
            pass
        else:
            return
    with contextlib.suppress(OSError):
        proc.kill()

def _spawn(argv: list[str], stdin: str | None, timeout: int) -> subprocess.CompletedProcess:
    """Run a CLI with stdout/stderr captured, killing the whole group on failure.

    ``subprocess.run(timeout=…)`` SIGKILLs only the direct child, so an agent
    CLI that wraps node/python can leak orphaned grandchildren (issue
    #293/F-7). Where ``os.setsid`` exists the child is started in its own
    session (process-group leader) so the entire group can be killed.

    Args:
        argv: Command and arguments; executed without a shell.
        stdin: Text to feed on the child's stdin, or ``None`` to leave stdin
            unredirected.
        timeout: Seconds to wait for the child to finish.

    Returns:
        A ``CompletedProcess`` carrying the return code and the captured
        stdout/stderr as text.

    Raises:
        subprocess.TimeoutExpired: The child exceeded ``timeout``; its group
            has been killed and reaped before the exception propagates, so
            the caller's handling is unchanged.
        BaseException: Anything else raised while waiting (for example
            ``KeyboardInterrupt``) is re-raised after the same cleanup.
    """
    popen_kwargs: dict = {
        "stdin": subprocess.PIPE if stdin is not None else None,
        "stdout": subprocess.PIPE,
        "stderr": subprocess.PIPE,
        "text": True,
    }
    if hasattr(os, "setsid"):
        popen_kwargs["start_new_session"] = True
    proc = subprocess.Popen(argv, **popen_kwargs)
    try:
        out, err = proc.communicate(input=stdin, timeout=timeout)
    except subprocess.TimeoutExpired:
        _kill_process_group(proc)
        with contextlib.suppress(Exception):
            proc.communicate()  # reap the killed child
        raise
    except BaseException:
        # The child lives in its own session, so a Ctrl-C in the terminal is
        # delivered to us but NOT to it: without this cleanup an interrupted
        # run would leave the agent (and its grandchildren) running. Only
        # signal while the child is still unreaped, so a recycled pid is
        # never targeted.
        if proc.returncode is None:
            _kill_process_group(proc)
            with contextlib.suppress(Exception):
                proc.communicate()  # reap the killed child
        raise
    return subprocess.CompletedProcess(argv, proc.returncode, out, err)

def _read_only_extra_args(spec: AgentSpec) -> list[str]:
    """Return ``spec.extra_args`` after the read-only sandbox has been enforced.

    The enforcement is delegated to ``privilege.enforce_read_only`` and is
    applied here, at the adapter layer (issue #288), so that a missing or
    misconfigured ``extra_args`` cannot strip the sandbox from a reviewer that
    is reading an attacker-controlled diff. Config may knowingly widen a codex
    sandbox, but it can never remove the restriction.
    """
    vendor, agent_name, configured_args = spec.vendor, spec.name, spec.extra_args
    return privilege.enforce_read_only(vendor, agent_name, configured_args)

# Capability/version probes are best-effort diagnostics: they must never slow
# down or block a normal run, so they get a deliberately short timeout.
_VERSION_PROBE_TIMEOUT = 10

# A version-looking token: two or three dot-separated numbers, e.g. "1.2",
# "1.2.3", or the "0.45.1" inside "v0.45.1".
_VERSION_RE = re.compile(r"\d+\.\d+(?:\.\d+)?")

# Statuses reported by a capability/version probe.
CAP_OK = "ok"
CAP_UNKNOWN_VERSION = "unknown_version"
CAP_UNAVAILABLE = "unavailable"

# Typed, stable error taxonomy for failed agent executions. Reports and
# CI/policy key off these codes to tell retryable failures from deterministic
# ones, instead of pattern-matching free-text error strings.
ERR_MISSING_CLI = "missing_cli"
ERR_AUTH_REQUIRED = "auth_required"
ERR_PERMISSION_PROMPT = "permission_prompt"
ERR_TIMEOUT = "timeout"
ERR_NONZERO_EXIT = "nonzero_exit"
ERR_EMPTY_OUTPUT = "empty_output"
ERR_SPAWN_FAILED = "spawn_failed"
ERR_RATE_LIMITED = "rate_limited"
# Local/HTTP adapter could not reach its server (issue #43): connection
# refused, DNS failure, or the local model server is not running.
ERR_CONNECTION = "connection_error"
# Hosted-API adapter (issue #430): the vendor's API-key env var is unset. Kept
# apart from ERR_AUTH_REQUIRED (a key was sent but the server rejected it) so a
# report can tell "never configured" from "misconfigured/expired/revoked".
ERR_MISSING_API_KEY = "missing_api_key"
# Hosted-API adapter (issue #430): the configured key contains a control
# character and was rejected BEFORE being sent as a header, rather than letting
# http.client raise (and risk echoing a transformed/escaped copy of the secret
# in its exception text — see _HostedApiAdapter._invalid_key_reason).
ERR_INVALID_API_KEY = "invalid_api_key"
ERR_UNKNOWN = "unknown"

# Every code an AgentResult.error_code may carry.
ERROR_CODES = frozenset(
    (
        ERR_MISSING_CLI,
        ERR_AUTH_REQUIRED,
        ERR_PERMISSION_PROMPT,
        ERR_TIMEOUT,
        ERR_NONZERO_EXIT,
        ERR_EMPTY_OUTPUT,
        ERR_SPAWN_FAILED,
        ERR_RATE_LIMITED,
        ERR_CONNECTION,
        ERR_MISSING_API_KEY,
        ERR_INVALID_API_KEY,
        ERR_UNKNOWN,
    )
)

# The typically-transient subset that is worth retrying (issue #30): a timeout,
# a rate limit, a process that failed to spawn, or a local server that was
# briefly unreachable (#43). Auth, missing-CLI, permission-prompt, empty-output
# and generic nonzero-exit are treated as deterministic — retrying them just
# burns time and tokens.
RETRYABLE_ERROR_CODES = frozenset(
    (
        ERR_TIMEOUT,
        ERR_RATE_LIMITED,
        ERR_SPAWN_FAILED,
        ERR_CONNECTION,
    )
)

154

155

# Keyword groups used by classify_stderr. Every keyword is anchored on word
# boundaries (\b...\b) so that an incidental substring cannot trigger a false
# classification: bare "auth" matches "auth error" but not "author identity",
# and "login" matches "login required" but not "login_attempts" ("_" is a word
# character, so there is no boundary inside "login_attempts"). The tokens of a
# multi-word phrase may be separated by spaces OR underscores (e.g.
# "rate limit" / "rate_limit").
def _keyword_pattern(*keywords: str) -> re.Pattern[str]:
    """Compile ``keywords`` into one word-boundary-anchored alternation."""
    alternatives = []
    for phrase in keywords:
        escaped_tokens = map(re.escape, phrase.split())
        alternatives.append(r"[ _]+".join(escaped_tokens))
    return re.compile(r"\b(?:" + "|".join(alternatives) + r")\b")


# Consulted in this order by classify_stderr: auth first, then rate limit,
# then the more generic permission group, before the nonzero-exit fallback.
_AUTH_RE = _keyword_pattern(
    "not authenticated",
    "unauthenticated",
    "authentication",
    "unauthorized",
    "api key",
    "auth",
    "log in",
    "login",
    "credential",
    "credentials",
)
_RATE_LIMIT_RE = _keyword_pattern(
    "rate limit",
    "429",
    "quota",
    "too many requests",
)
_PERMISSION_RE = _keyword_pattern(
    "permission",
    "permissions",
    "approve",
    "approval",
    "confirm",
    "confirmation",
)

190

191

def classify_stderr(returncode: int, stderr: str) -> str:
    """Map the stderr of a nonzero-exit failure to one of the ``ERR_*`` codes.

    The lowercased text is tested against each word-boundary keyword group in
    priority order — auth, then rate limit, then permission — and the first
    group that matches decides the code. Because the groups are
    word-boundary regexes, an incidental substring (e.g. "author" containing
    "auth") does not cause a misclassification. When nothing matches the
    generic ``ERR_NONZERO_EXIT`` is returned.

    ``returncode`` is accepted for interface symmetry but does not influence
    the result. A falsy ``stderr`` is treated as empty text.
    """
    del returncode
    haystack = stderr.lower() if stderr else ""
    ordered_rules = (
        (_AUTH_RE, ERR_AUTH_REQUIRED),
        (_RATE_LIMIT_RE, ERR_RATE_LIMITED),
        (_PERMISSION_RE, ERR_PERMISSION_PROMPT),
    )
    for pattern, code in ordered_rules:
        if pattern.search(haystack):
            return code
    return ERR_NONZERO_EXIT

210

211

@dataclass
class AgentResult:
    """Outcome of one agent invocation, successful or not."""

    # Identity of the agent that produced this result.
    agent: str
    vendor: str
    # True only for a usable response; failures carry ``error``/``error_code``.
    ok: bool
    # The agent's response text (empty on failure).
    output: str
    # Wall-clock duration of the attempt, in seconds.
    duration_s: float
    # Human-readable failure description, when there is one.
    error: str | None = None
    findings: list = field(default_factory=list)
    warnings: list = field(default_factory=list)
    # Typed failure code (one of the ``ERR_*`` constants), when there is one.
    error_code: str | None = None
    # How many attempts were made for this result (issue #30). 1 means no
    # retry; a larger value records that a transient failure was retried
    # before this outcome.
    attempts: int = 1

226

227

class Adapter:
    """Base adapter. Subclasses build the argv for their CLI.

    An adapter wraps one ``AgentSpec``: it decides how the CLI is invoked
    (``build_argv`` / ``_stdin_for``), runs it via ``_spawn`` and converts the
    outcome into an ``AgentResult`` with a typed error code on failure.
    """

    # Declarative capability metadata. Real coding-agent CLIs support a headless
    # (non-interactive) invocation and model selection; subclasses override where
    # this differs. ``MockAdapter`` reports synthetic capabilities.
    SUPPORTS_HEADLESS = True
    SUPPORTS_MODEL_SELECTION = True

    # Args passed to the CLI to print its version. Subclasses override if the CLI
    # uses a different verb/flag (e.g. ``codex --version``).
    _VERSION_ARGS = ("--version",)

    def __init__(self, spec: AgentSpec):
        self.spec = spec

    @property
    def name(self) -> str:
        """The configured agent name."""
        return self.spec.name

    def available(self) -> bool:
        """True when the agent's command resolves on PATH."""
        return shutil.which(self.spec.command) is not None

    def build_argv(self, prompt: str) -> list[str]:  # pragma: no cover - overridden
        raise NotImplementedError

    def _stdin_for(self, prompt: str) -> str | None:
        """Prompt to feed on stdin, or None to pass it in argv (the default)."""
        del prompt
        return None

    def _version_argv(self) -> list[str]:
        """Argv used to probe the CLI's version."""
        return [self.spec.command, *self._VERSION_ARGS]

    def detect_capabilities(self) -> dict:
        """Best-effort probe of this agent's version and capabilities.

        Returns a dict shaped like::

            {
                "version": "<str|None>",
                "supports_headless": bool,
                "supports_model_selection": bool,
                "raw_version_output": "<short str>",
                "status": "ok|unknown_version|unavailable",
                "warnings": [...],
            }

        This is intentionally fast and forgiving: it runs the version argv
        with a SHORT timeout and swallows spawn errors, timeouts, nonzero
        exits and garbage output, reporting them through ``status`` and
        ``warnings`` so that it is safe to call from diagnostics without
        blocking or crashing a run.
        """
        caps = {
            "version": None,
            "supports_headless": self.SUPPORTS_HEADLESS,
            "supports_model_selection": self.SUPPORTS_MODEL_SELECTION,
            "raw_version_output": "",
            "status": CAP_UNAVAILABLE,
            "warnings": [],
        }

        # Not on PATH: report unavailable without spawning a subprocess.
        if not self.available():
            return caps

        try:
            # Via _spawn so the probe also runs in its own process group and the
            # whole group is killed on timeout (issue #303/L-1) — matching the
            # main run path; a bare subprocess.run would orphan grandchildren.
            proc = _spawn(self._version_argv(), None, _VERSION_PROBE_TIMEOUT)
        except subprocess.TimeoutExpired:
            caps["status"] = CAP_UNKNOWN_VERSION
            caps["warnings"].append(
                f"version probe for '{self.spec.command}' timed out after {_VERSION_PROBE_TIMEOUT}s"
            )
            return caps
        except Exception as exc:  # noqa: BLE001 - swallow any spawn failure
            caps["status"] = CAP_UNKNOWN_VERSION
            caps["warnings"].append(
                f"version probe for '{self.spec.command}' failed: {redaction.redact(str(exc))[0]}"
            )
            return caps

        raw = ((proc.stdout or "") + (proc.stderr or "")).strip()
        # Redact FIRST, then truncate (the same order ``run`` uses): cutting
        # the text before redaction could split a token so that it no longer
        # matches a redaction pattern and its leading part leaks.
        caps["raw_version_output"] = redaction.redact(raw)[0][:200]
        match = _VERSION_RE.search(raw)
        if proc.returncode == 0 and match:
            caps["version"] = match.group(0)
            caps["status"] = CAP_OK
        else:
            caps["status"] = CAP_UNKNOWN_VERSION
            caps["warnings"].append(
                f"could not determine version of '{self.spec.command}' "
                f"(exit {proc.returncode}); capabilities assumed from vendor defaults"
            )
        return caps

    def run(self, prompt: str, phase: str = "review", timeout: int | None = None) -> AgentResult:
        """Invoke the CLI with ``prompt`` and return its outcome.

        Args:
            prompt: The prompt handed to ``build_argv`` and ``_stdin_for``.
            phase: Accepted for interface compatibility; unused here.
            timeout: Optional caller override (the run budget, issue #30).
                The effective timeout is the smaller of this and the agent's
                own ``spec.timeout``, but never below one second.

        Returns:
            An ``AgentResult``. Failures (missing CLI, timeout, spawn error,
            nonzero exit, empty stdout) are returned with ``ok=False`` and a
            typed ``error_code`` rather than raised.
        """
        del phase
        if not self.available():
            return AgentResult(
                self.name,
                self.spec.vendor,
                False,
                "",
                0.0,
                f"command not found on PATH: {self.spec.command}",
                error_code=ERR_MISSING_CLI,
            )
        effective_timeout = self.spec.timeout
        if timeout is not None:
            effective_timeout = max(1, min(self.spec.timeout, int(timeout)))
        argv = self.build_argv(prompt)
        stdin = self._stdin_for(prompt)
        start = time.monotonic()
        try:
            proc = _spawn(argv, stdin, effective_timeout)
        except subprocess.TimeoutExpired:
            return AgentResult(
                self.name,
                self.spec.vendor,
                False,
                "",
                time.monotonic() - start,
                f"timed out after {effective_timeout}s",
                error_code=ERR_TIMEOUT,
            )
        except Exception as exc:  # noqa: BLE001 - surface any spawn failure
            return AgentResult(
                self.name,
                self.spec.vendor,
                False,
                "",
                time.monotonic() - start,
                f"spawn failed: {redaction.redact(str(exc))[0]}",
                error_code=ERR_SPAWN_FAILED,
            )
        dur = time.monotonic() - start
        out = (proc.stdout or "").strip()
        # A nonzero exit is ALWAYS a failure, even with stdout (issue #101): a
        # crashing CLI can still print partial or error output, and counting that
        # as a clean review would silently feed it into consensus, synthesis, and
        # the CI gate. We classify from stderr (falling back to any stdout) and
        # keep a short snippet in the error for debugging — but ok=False, so the
        # orchestrator excludes it.
        if proc.returncode != 0:
            stderr = (proc.stderr or "").strip()
            detail = stderr or out
            # Redact before embedding in the error: a crashing CLI can dump an
            # env var / token into its stderr, and this string is rendered into
            # the report and posted to the PR. Mirrors the LocalAdapter path
            # (#293/F-8); the asymmetry was a secret-leak vector (audit
            # 2026-06-13/N-1). Classify on the raw text (no secrets in codes).
            safe_detail = redaction.redact(detail)[0]
            return AgentResult(
                self.name,
                self.spec.vendor,
                False,
                "",
                dur,
                f"exit {proc.returncode}: {safe_detail[:500]}",
                error_code=classify_stderr(proc.returncode, stderr or out),
            )
        if not out:
            # Exit 0 but nothing on stdout: the agent produced no usable review.
            return AgentResult(
                self.name,
                self.spec.vendor,
                False,
                "",
                dur,
                f"exit {proc.returncode}: empty output",
                error_code=ERR_EMPTY_OUTPUT,
            )
        return AgentResult(self.name, self.spec.vendor, True, out, dur)

405

406

class ClaudeAdapter(Adapter):
    """Adapter for the Claude Code CLI in print (``-p``) mode.

    The prompt embeds the (redacted) diff and PR/issue context, so it is
    delivered on STDIN rather than as a process argument: that keeps it out of
    ``ps`` / ``/proc/<pid>/cmdline`` where other local users could read it
    (issue #287). ``claude -p`` reads the prompt from stdin when no positional
    prompt is given.
    """

    def build_argv(self, prompt: str) -> list[str]:
        """Build the argv; the prompt itself never appears in it."""
        del prompt
        head = [self.spec.command, "-p"]
        model_args = ["--model", self.spec.model] if self.spec.model else []
        return head + model_args + _read_only_extra_args(self.spec)

    def _stdin_for(self, prompt: str) -> str | None:
        """Deliver the whole prompt on stdin."""
        return prompt

421

422

class CodexAdapter(Adapter):
    """Adapter for ``codex exec``.

    The prompt is piped on stdin (not passed positionally) so ``codex exec``
    never blocks waiting for input in non-interactive runs. Sandbox flags live
    in ``extra_args``; the shipped default is ``-s read-only`` (secure by
    default, #100) — the reviewer only reads its prompt, since the jury
    fetches the diff via ``gh``.
    """

    def build_argv(self, prompt: str) -> list[str]:
        """Build the argv; the prompt itself never appears in it."""
        del prompt
        head = [self.spec.command, "exec"]
        model_args = ["-m", self.spec.model] if self.spec.model else []
        return head + model_args + _read_only_extra_args(self.spec)

    def _stdin_for(self, prompt: str) -> str | None:
        """Deliver the whole prompt on stdin."""
        return prompt

437

438

class AgyAdapter(Adapter):
    """Adapter for ``agy --print``.

    Prompt on STDIN, not argv (issue #287): ``agy --print`` reads the prompt
    from stdin when no positional prompt is given (verified against agy
    1.0.6), so the redacted diff is not exposed in the process list.
    """

    def build_argv(self, prompt: str) -> list[str]:
        """Build the argv; the prompt itself never appears in it."""
        del prompt
        head = [self.spec.command, "--print"]
        model_args = ["--model", self.spec.model] if self.spec.model else []
        return head + model_args + _read_only_extra_args(self.spec)

    def _stdin_for(self, prompt: str) -> str | None:
        """Deliver the whole prompt on stdin."""
        return prompt

452

453

454_DEFAULT_LOCAL_ENDPOINT = "http://localhost:11434/v1"

455

456

def _http_only_opener():
    """Build a ``urllib`` opener that speaks ONLY http/https (issue #291, SSRF defense).

    The default ``urllib`` opener honors ``file://`` and ``ftp://``, so an
    attacker-influenced ``endpoint`` could read local files or reach other
    schemes. The director built here registers no ``FileHandler`` /
    ``FTPHandler``; any non-http(s) URL therefore raises
    ``URLError("unknown url type")`` regardless of config validation —
    defense in depth alongside ``config._endpoint_issues``.

    NO ``HTTPRedirectHandler`` is registered either (review of #291):
    otherwise a malicious/compromised endpoint could 302-redirect to an
    internal/metadata host (e.g. ``169.254.169.254``) and the opener would
    follow it, bypassing the configured-URL validation. Without the handler
    a 3xx surfaces as an ``HTTPError`` (a failed review) and is never
    followed.
    """
    import urllib.request

    director = urllib.request.OpenerDirector()
    handler_types = (
        urllib.request.HTTPHandler,
        urllib.request.HTTPSHandler,
        urllib.request.HTTPDefaultErrorHandler,
        urllib.request.HTTPErrorProcessor,
        # Raises URLError("unknown url type: …") for any scheme that has no
        # registered handler, so file://, ftp://, etc. fail loudly instead of
        # silently resolving to None.
        urllib.request.UnknownHandler,
    )
    for make_handler in handler_types:
        director.add_handler(make_handler())
    return director

487

488

def _open(target, timeout):
    """Open ``target`` (an http/https URL or ``Request``) via the restricted opener (issue #291).

    This is the single seam for every local-adapter HTTP call, which
    guarantees the SSRF-safe opener (no file/ftp handlers) is always the one
    in use. A fresh opener is built for each call.
    """
    restricted = _http_only_opener()
    return restricted.open(target, timeout=timeout)

496

497

def list_local_models(endpoint: str = _DEFAULT_LOCAL_ENDPOINT) -> list[str]:
    """List model ids from a local OpenAI-compatible server (issue #109).

    GETs ``{endpoint}/models`` (the OpenAI-compatible listing that Ollama,
    vLLM, LM Studio, etc. expose) and returns the model ids, as strings, in
    their reported order. Best-effort and stdlib-only: any failure (server
    down, bad JSON, unexpected shape) yields ``[]`` so callers can fall back
    gracefully.

    The endpoint is validated here at the seam (issue #309) so EVERY caller —
    including the un-gated ``jury init --local-endpoint`` discovery path —
    gets the same SSRF gate that ``config._endpoint_issues`` enforces for
    config-file endpoints: a non-``http(s)`` scheme or a non-loopback host
    (without the ``JURY_ALLOW_REMOTE_ENDPOINT`` opt-in) yields ``[]`` without
    any network call.
    """
    import json as _json

    from .config import _endpoint_issues

    root = (endpoint or _DEFAULT_LOCAL_ENDPOINT).rstrip("/")
    try:
        # The SSRF gate sits INSIDE the try (review of #309): `_endpoint_issues`
        # calls urlsplit, which raises ValueError on a malformed URL (e.g.
        # `http://[::1`); the "any failure -> []" contract must still hold.
        hard_errors = _endpoint_issues(root, "local-endpoint")[0]
        if hard_errors:  # hard-error issues -> refuse
            return []
        listing_url = root if root.endswith("/models") else f"{root}/models"
        with _open(listing_url, _VERSION_PROBE_TIMEOUT) as resp:  # noqa: S310
            body = resp.read(_MAX_RESPONSE_BYTES)
            payload = _json.loads(body.decode("utf-8", errors="replace"))
    except Exception:  # noqa: BLE001 - discovery is best-effort
        return []
    if not isinstance(payload, dict):
        return []
    entries = payload.get("data")
    if not isinstance(entries, list):
        return []
    # Keep only well-formed entries that carry a truthy id.
    return [str(entry["id"]) for entry in entries if isinstance(entry, dict) and entry.get("id")]

533

534

class LocalAdapter(Adapter):
    """Open-weight / local-model reviewer over an OpenAI-compatible API (issue #43).

    Targets the ``/v1/chat/completions`` endpoint exposed by common local servers
    (Ollama, llama.cpp ``llama-server``, vLLM, LM Studio). It talks plain HTTP via
    the stdlib (``urllib``) — no new dependencies and no subprocess — so one panel
    seat can run free and fully offline, adding model diversity (the load-bearing
    advantage) at zero marginal cost.

    Configure as a normal ``[[agent]]`` with ``vendor = "local"``, an
    ``endpoint`` (base URL, default ``http://localhost:11434/v1``), and a
    ``model``. ``extra_args`` is unused. An unreachable server fails with the
    typed ``connection_error`` code (issue #29) rather than a crash.
    """

    SUPPORTS_HEADLESS = True
    SUPPORTS_MODEL_SELECTION = True

    @property
    def endpoint(self) -> str:
        """The configured base URL (or the default) without trailing slashes."""
        return (self.spec.endpoint or _DEFAULT_LOCAL_ENDPOINT).rstrip("/")

    def completions_url(self) -> str:
        """Resolve the chat-completions URL from the configured base endpoint.

        Accepts either a base URL (``…/v1``) or a full completions URL; pure so it
        can be unit-tested without network.
        """
        base = self.endpoint
        if base.endswith("/chat/completions"):
            return base
        return f"{base}/chat/completions"

    def build_payload(self, prompt: str) -> dict:
        """Build the OpenAI-compatible chat-completions request body (pure)."""
        return {
            "model": self.spec.model or "",
            "messages": [{"role": "user", "content": prompt}],
            "stream": False,
            "temperature": 0,
        }

    @staticmethod
    def parse_content(data: dict) -> str:
        """Extract the assistant message text from a chat-completions response.

        The response comes from a possibly-untrusted endpoint, so every level
        of the expected ``choices[0].message.content`` shape is checked. Any
        deviation (a non-object body, missing/empty ``choices``, a non-object
        choice or message, non-string content) yields ``""``, which ``run``
        reports as a typed empty-output failure instead of crashing.
        """
        if not isinstance(data, dict):
            return ""
        choices = data.get("choices")
        if not isinstance(choices, (list, tuple)) or not choices:
            return ""
        first = choices[0]
        if not isinstance(first, dict):
            return ""
        message = first.get("message")
        if not isinstance(message, dict):
            return ""
        content = message.get("content")
        return content.strip() if isinstance(content, str) else ""

    @staticmethod
    def classify_http_status(status: int) -> str:
        """Map an HTTP error status to a typed error code (issue #29)."""
        if status in (401, 403):
            return ERR_AUTH_REQUIRED
        if status == 429:
            return ERR_RATE_LIMITED
        return ERR_NONZERO_EXIT

    def available(self) -> bool:
        """A local agent is 'available' when its server answers a quick probe.

        Probes ``{endpoint}/models`` with a short timeout. Any response below
        500 — including an HTTP error such as a 404 — counts as "server is
        up"; every other failure counts as unavailable. Never raises.
        """
        import urllib.error
        import urllib.request

        url = f"{self.endpoint}/models"
        try:
            with _open(url, _VERSION_PROBE_TIMEOUT) as resp:  # noqa: S310
                return 200 <= resp.status < 500
        except urllib.error.HTTPError as exc:
            # A 4xx (e.g. 404 on /models) still means the server is up.
            return exc.code < 500
        except Exception:  # noqa: BLE001 - unreachable server -> not available
            return False

    def detect_capabilities(self) -> dict:
        """Report capabilities based on reachability; no version is probed."""
        reachable = self.available()
        return {
            "version": None,
            "supports_headless": self.SUPPORTS_HEADLESS,
            "supports_model_selection": self.SUPPORTS_MODEL_SELECTION,
            "raw_version_output": f"local endpoint {self.endpoint}",
            "status": CAP_OK if reachable else CAP_UNAVAILABLE,
            "warnings": ([] if reachable else [f"local server unreachable at {self.endpoint}"]),
        }

    def run(self, prompt: str, phase: str = "review", timeout: int | None = None) -> AgentResult:
        """POST ``prompt`` to the chat-completions endpoint and return the outcome.

        Args:
            prompt: Sent as the single user message.
            phase: Accepted for interface compatibility; unused here.
            timeout: Optional caller override; the effective timeout is the
                smaller of this and ``spec.timeout``, never below one second.

        Returns:
            An ``AgentResult``. HTTP errors, timeouts, unreachable servers,
            malformed responses and empty content all come back with
            ``ok=False`` and a typed ``error_code`` rather than raising.
        """
        import json as _json
        import urllib.error
        import urllib.request

        del phase
        effective_timeout = self.spec.timeout
        if timeout is not None:
            effective_timeout = max(1, min(self.spec.timeout, int(timeout)))
        body = _json.dumps(self.build_payload(prompt)).encode("utf-8")
        req = urllib.request.Request(
            self.completions_url(),
            data=body,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        start = time.monotonic()
        try:
            with _open(req, effective_timeout) as resp:  # noqa: S310
                raw = resp.read(_MAX_RESPONSE_BYTES).decode("utf-8", errors="replace")
            data = _json.loads(raw)
        except urllib.error.HTTPError as exc:
            detail = ""
            try:
                detail = exc.read(_MAX_RESPONSE_BYTES).decode("utf-8", errors="replace")
            except Exception:  # noqa: BLE001
                detail = exc.reason or ""
            # The body is from a possibly-untrusted endpoint and is surfaced in
            # the report; redact recognized secrets before embedding
            # (#293/F-8). Redact FIRST and truncate afterwards: cutting the
            # text before redaction could split a token so that it no longer
            # matches a redaction pattern and its leading part leaks.
            detail = redaction.redact(detail)[0][:300]
            return AgentResult(
                self.name,
                self.spec.vendor,
                False,
                "",
                time.monotonic() - start,
                f"HTTP {exc.code}: {detail}",
                error_code=self.classify_http_status(exc.code),
            )
        except TimeoutError:
            return AgentResult(
                self.name,
                self.spec.vendor,
                False,
                "",
                time.monotonic() - start,
                f"timed out after {effective_timeout}s",
                error_code=ERR_TIMEOUT,
            )
        except urllib.error.URLError as exc:
            return AgentResult(
                self.name,
                self.spec.vendor,
                False,
                "",
                time.monotonic() - start,
                f"could not reach local server at {self.endpoint}: {redaction.redact(str(exc.reason))[0]}",
                error_code=ERR_CONNECTION,
            )
        except Exception as exc:  # noqa: BLE001 - surface any other failure
            return AgentResult(
                self.name,
                self.spec.vendor,
                False,
                "",
                time.monotonic() - start,
                f"local request failed: {redaction.redact(str(exc))[0]}",
                error_code=ERR_UNKNOWN,
            )
        dur = time.monotonic() - start
        # parse_content tolerates any JSON shape, so a well-formed but
        # unexpected body ends up in the empty-content branch below.
        content = self.parse_content(data)
        if not content:
            return AgentResult(
                self.name,
                self.spec.vendor,
                False,
                "",
                dur,
                "local model returned empty content",
                error_code=ERR_EMPTY_OUTPUT,
            )
        return AgentResult(self.name, self.spec.vendor, True, content, dur)

707

708

# Endpoints of the hosted vendor APIs (issue #430). These are fixed rather than
# configurable: `local`'s user-supplied `endpoint` needs the SSRF validation in
# config._endpoint_issues, whereas a hosted vendor's URL is a known constant
# that neither an attacker nor an operator can influence, so there is nothing
# to validate.
_ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages"
_ANTHROPIC_API_VERSION = "2023-06-01"
_OPENAI_API_URL = "https://api.openai.com/v1/chat/completions"
# Every Anthropic Messages API request must carry max_tokens; there is no
# server-side default. The value is generous enough for a review response yet
# small enough to bound cost/latency if a run is ever misconfigured to loop.
_HOSTED_API_MAX_TOKENS = 4096

720

721

def _hosted_api_status_code(status: int) -> str:
    """Translate a hosted-API HTTP status into a typed error code (issue #430).

    Used by every hosted-API adapter. The mapping is the same one
    ``LocalAdapter.classify_http_status`` applies — 401/403 mean auth, 429
    means rate limit, anything else is the generic nonzero-exit code — and it
    is a free function because it needs no per-adapter state.
    """
    if status == 429:
        return ERR_RATE_LIMITED
    return ERR_AUTH_REQUIRED if status in (401, 403) else ERR_NONZERO_EXIT

734

735

def _post_json(
    url: str, payload: dict, headers: dict[str, str], timeout: int
) -> tuple[dict | None, str | None, str | None]:
    """POST a JSON body and parse a JSON response (issue #430).

    Shared HTTP mechanics for the hosted-API adapters: build the request,
    route it through the SSRF-safe opener (``_open``, no file/ftp handlers, no
    redirect following — the same seam ``LocalAdapter`` uses), cap the response
    read at ``_MAX_RESPONSE_BYTES``, and classify any failure into a typed
    error code.

    Args:
        url: Endpoint to POST to.
        payload: JSON-serializable request body.
        headers: Request headers.
        timeout: Socket timeout in seconds.

    Returns:
        ``(response, None, None)`` on success, where ``response`` is the
        parsed JSON body, or ``(None, error_message, error_code)`` on failure
        — exactly one shape. Network-originated text is redacted before being
        placed in an error message, since it is surfaced in the report.
    """
    import json as _json
    import urllib.error
    import urllib.request

    body = _json.dumps(payload).encode("utf-8")
    req = urllib.request.Request(url, data=body, headers=headers, method="POST")
    try:
        with _open(req, timeout) as resp:  # noqa: S310
            raw = resp.read(_MAX_RESPONSE_BYTES).decode("utf-8", errors="replace")
        return _json.loads(raw), None, None
    except urllib.error.HTTPError as exc:
        detail = ""
        try:
            detail = exc.read(_MAX_RESPONSE_BYTES).decode("utf-8", errors="replace")
        except Exception:  # noqa: BLE001 - reading the error body is best-effort
            detail = exc.reason or ""
        # Redact FIRST, truncate afterwards: cutting the text before redaction
        # could split a token so that it no longer matches a redaction pattern
        # and its leading part leaks into the report.
        detail = redaction.redact(detail)[0][:300]
        return None, f"HTTP {exc.code}: {detail}", _hosted_api_status_code(exc.code)
    except TimeoutError:
        return None, f"timed out after {timeout}s", ERR_TIMEOUT
    except urllib.error.URLError as exc:
        return (
            None,
            f"could not reach {url}: {redaction.redact(str(exc.reason))[0]}",
            ERR_CONNECTION,
        )
    except Exception as exc:  # noqa: BLE001 - surface any other failure
        return None, f"request failed: {redaction.redact(str(exc))[0]}", ERR_UNKNOWN

778

779

class _HostedApiAdapter(Adapter):
    """Shared base for hosted-vendor-API reviewers keyed by an env-var API key.

    No CLI install, no interactive login, no subprocess: just an HTTP call
    over stdlib ``urllib`` to the vendor's real hosted API (issue #430), the
    same no-subprocess/no-new-dependency design as ``LocalAdapter`` but
    pointed at a hosted endpoint instead of a local server. The API key is
    read from the environment ONLY, never from ``jury.toml``, so it cannot
    leak into a checked-in config; the endpoint is a fixed per-vendor
    constant, not a config value, so there is no SSRF surface to guard the
    way `local`'s `endpoint` needs.

    Subclasses supply ``_API_KEY_ENV``, :meth:`_api_url`, :meth:`build_payload`,
    :meth:`_headers` and :meth:`parse_content`. Every failure path in
    :meth:`run` (missing key, unusable key, transport error, unparseable
    body, empty content) is reported as a failed ``AgentResult`` rather than
    raised.
    """

    SUPPORTS_HEADLESS = True
    SUPPORTS_MODEL_SELECTION = True

    # Subclasses override.
    _API_KEY_ENV: str = ""

    def _api_key(self) -> str:
        """Return the API key from the environment, or ``""`` when unset."""
        return os.environ.get(self._API_KEY_ENV, "")

    def _api_url(self) -> str:  # pragma: no cover - overridden
        """Return the URL the request is POSTed to (subclass hook)."""
        raise NotImplementedError

    def _invalid_key_reason(self) -> str | None:
        """None if the key is safe to use as an HTTP header value; else why not.

        A key containing a control character (most plausibly a stray
        trailing ``\\n`` from a file/k8s-secret/`.env` mount) trips CPython's
        ``http.client`` header-injection guard. That guard reports the
        rejected value via ``repr()`` (e.g. an embedded newline becomes the
        two literal characters ``\\`` ``n``), which does **not** byte-for-byte
        match the raw key — so a literal substring scrub of the exception
        text (see :meth:`_scrub_secret`) cannot reliably catch it; the
        transformed text is no longer equal to the original secret. Validate
        and reject *before* the key ever reaches a header instead of trying
        to scrub it back out afterward.
        """
        # C0 controls (< 0x20) plus DEL (0x7F).
        if any(ord(ch) < 0x20 or ord(ch) == 0x7F for ch in self._api_key()):
            return (
                f"{self._API_KEY_ENV} contains a control character (e.g. a stray "
                f"trailing newline from how the secret was loaded) and cannot be "
                f"used as an HTTP header value"
            )
        return None

    def _scrub_secret(self, text: str) -> str:
        """Strip the literal API key value from an error message (issue #430).

        Defense-in-depth alongside ``redaction.redact()`` (which only
        recognizes known vendor-token *shapes* via regex) for any leak path
        NOT already ruled out by :meth:`_invalid_key_reason` — e.g. a
        well-formed key that still ends up quoted in some other library's
        error text. Not a substitute for that check: once a value contains
        control characters, downstream formatting (``repr()``, percent-
        encoding, ...) can transform it before it reaches an error message,
        and a literal match against the *original* key would then silently
        miss it — which is exactly why control characters are rejected
        upfront in :meth:`run` instead of relying on this alone.
        """
        key = self._api_key()
        # An empty key must never be used as a replace() needle.
        if key and key in text:
            return text.replace(key, "[REDACTED]")
        return text

    def available(self) -> bool:
        """Available when a *usable* API key is set — a fast, network-free check.

        Unlike ``LocalAdapter.available()`` (which probes the server, since a
        local endpoint's reachability is genuinely uncertain), a hosted
        vendor's API is assumed reachable; the two real unknowns locally are
        whether the operator configured a key at all, and whether it's
        actually usable as a header value (see :meth:`_invalid_key_reason`) —
        a key that will be rejected by :meth:`run` should not report as
        available here either, or a capability check (``jury --doctor``)
        would give a falsely reassuring answer.
        """
        return bool(self._api_key()) and self._invalid_key_reason() is None

    def detect_capabilities(self) -> dict:
        """Describe this adapter for capability reporting, without any network I/O.

        Returns:
            A dict with ``version`` (always ``None``), the two ``supports_*``
            flags, ``raw_version_output`` (the target URL), ``status``
            (``CAP_OK`` only when a usable key is set) and ``warnings``
            (one entry explaining a missing or unusable key, else empty).
        """
        key_set = bool(self._api_key())
        invalid_reason = self._invalid_key_reason() if key_set else None
        has_key = key_set and invalid_reason is None
        if not key_set:
            warnings = [f"{self._API_KEY_ENV} is not set in the environment"]
        elif invalid_reason:
            warnings = [invalid_reason]
        else:
            warnings = []
        return {
            "version": None,
            "supports_headless": self.SUPPORTS_HEADLESS,
            "supports_model_selection": self.SUPPORTS_MODEL_SELECTION,
            "raw_version_output": f"hosted API {self._api_url()}",
            "status": CAP_OK if has_key else CAP_UNAVAILABLE,
            "warnings": warnings,
        }

    def build_payload(self, prompt: str) -> dict:  # pragma: no cover - overridden
        """Return the JSON request body for *prompt* (subclass hook)."""
        raise NotImplementedError

    def _headers(self) -> dict[str, str]:  # pragma: no cover - overridden
        """Return the HTTP request headers, including the key (subclass hook)."""
        raise NotImplementedError

    @staticmethod
    def parse_content(data: dict) -> str:  # pragma: no cover - overridden
        """Extract the reply text from a decoded response body (subclass hook)."""
        raise NotImplementedError

    def run(self, prompt: str, phase: str = "review", timeout: int | None = None) -> AgentResult:
        """Send *prompt* to the hosted API and wrap the outcome in an ``AgentResult``.

        Args:
            prompt: Text handed to :meth:`build_payload`.
            phase: Unused by hosted adapters (discarded immediately).
            timeout: Optional cap; when given, the request uses the smaller
                of it and ``self.spec.timeout``, floored at 1.

        Returns:
            A successful result carrying the parsed content, or a failed one
            with an error message and ``error_code``. Failures are returned,
            not raised, including a response body that cannot be parsed.
        """
        del phase
        # Checked independently of available() (not just "not available()"):
        # available() now also returns False for a key that IS set but
        # invalid, and that case needs its own distinct error_code/message
        # below rather than the misleading "is not set" one.
        if not self._api_key():
            return AgentResult(
                self.name,
                self.spec.vendor,
                False,
                "",
                0.0,
                f"{self._API_KEY_ENV} is not set in the environment",
                error_code=ERR_MISSING_API_KEY,
            )
        invalid_reason = self._invalid_key_reason()
        if invalid_reason is not None:
            # Reject before the key ever reaches a header — see
            # _invalid_key_reason for why post-hoc scrubbing can't be trusted
            # here. This message never echoes the key itself.
            return AgentResult(
                self.name, self.spec.vendor, False, "", 0.0,
                invalid_reason, error_code=ERR_INVALID_API_KEY,
            )
        effective_timeout = self.spec.timeout
        if timeout is not None:
            # The caller may only tighten the configured timeout, never extend it.
            effective_timeout = max(1, min(self.spec.timeout, int(timeout)))
        start = time.monotonic()
        data, err_msg, err_code = _post_json(
            self._api_url(), self.build_payload(prompt), self._headers(), effective_timeout
        )
        dur = time.monotonic() - start
        if err_msg is not None:
            return AgentResult(
                self.name, self.spec.vendor, False, "", dur,
                self._scrub_secret(err_msg), error_code=err_code,
            )
        try:
            content = self.parse_content(data or {})
        except Exception as exc:  # noqa: BLE001 - a malformed body must not crash the run
            # Valid JSON with an unexpected shape (e.g. a null where an object
            # is expected) can make a parser raise. Report it like any other
            # failure; only the exception type is echoed, never body content.
            return AgentResult(
                self.name, self.spec.vendor, False, "", dur,
                f"hosted API returned a malformed response ({type(exc).__name__})",
                error_code=ERR_UNKNOWN,
            )
        if not content:
            return AgentResult(
                self.name,
                self.spec.vendor,
                False,
                "",
                dur,
                "hosted API returned empty content",
                error_code=ERR_EMPTY_OUTPUT,
            )
        return AgentResult(self.name, self.spec.vendor, True, content, dur)

939

940

class AnthropicApiAdapter(_HostedApiAdapter):
    """Hosted Anthropic Messages API reviewer, keyed by ``ANTHROPIC_API_KEY`` (issue #430).

    Configure as a normal ``[[agent]]`` with ``vendor = "anthropic-api"`` and a
    ``model`` (e.g. a current Claude model id) — no ``command``, no ``claude``
    CLI install or interactive login needed.
    """

    _API_KEY_ENV = "ANTHROPIC_API_KEY"

    def _api_url(self) -> str:
        """Return the fixed Messages API endpoint."""
        return _ANTHROPIC_API_URL

    def build_payload(self, prompt: str) -> dict:
        """Build the Anthropic Messages API request body (pure).

        The whole prompt is sent as a single ``user`` message; an unset model
        is sent as an empty string.
        """
        return {
            "model": self.spec.model or "",
            "max_tokens": _HOSTED_API_MAX_TOKENS,
            "messages": [{"role": "user", "content": prompt}],
        }

    def _headers(self) -> dict[str, str]:
        """Return the request headers; the key travels in ``x-api-key``."""
        return {
            "Content-Type": "application/json",
            "x-api-key": self._api_key(),
            "anthropic-version": _ANTHROPIC_API_VERSION,
        }

    @staticmethod
    def parse_content(data: dict) -> str:
        """Extract the assistant text from a Messages API response.

        Concatenates the ``text`` of every ``{"type": "text"}`` block in
        ``content`` and strips the result. Anything that does not have the
        expected shape — a non-dict body, a ``content`` that is not a list,
        a non-dict block, a block of another type, or a ``text`` that is not
        a string — is skipped, so a malformed body yields ``""`` instead of
        raising.
        """
        if not isinstance(data, dict):
            return ""
        blocks = data.get("content")
        if not isinstance(blocks, (list, tuple)):
            return ""
        texts = [
            block["text"]
            for block in blocks
            if isinstance(block, dict)
            and block.get("type") == "text"
            # A null/non-string ``text`` would make ``"".join`` raise TypeError.
            and isinstance(block.get("text"), str)
        ]
        return "".join(texts).strip()

981

982

class OpenAiApiAdapter(_HostedApiAdapter):
    """Hosted OpenAI Chat Completions API reviewer, keyed by ``OPENAI_API_KEY`` (issue #430).

    Configure as a normal ``[[agent]]`` with ``vendor = "openai-api"`` and a
    ``model`` (e.g. a current GPT model id) — no ``command``, no ``codex`` CLI
    install or interactive login needed. Same request/response shape as
    ``LocalAdapter`` (both are OpenAI-compatible chat completions), just
    against the real hosted API with an ``Authorization`` header.
    """

    _API_KEY_ENV = "OPENAI_API_KEY"

    def _api_url(self) -> str:
        """Return the fixed chat-completions endpoint."""
        return _OPENAI_API_URL

    def build_payload(self, prompt: str) -> dict:
        """Build the OpenAI chat-completions request body (pure).

        The whole prompt is sent as a single ``user`` message; an unset model
        is sent as an empty string.
        """
        return {
            "model": self.spec.model or "",
            "messages": [{"role": "user", "content": prompt}],
        }

    def _headers(self) -> dict[str, str]:
        """Return the request headers; the key travels as a ``Bearer`` token."""
        return {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self._api_key()}",
        }

    @staticmethod
    def parse_content(data: dict) -> str:
        """Extract the assistant message text from a chat-completions response.

        Reads ``choices[0].message.content`` and strips it. Every level is
        type-checked, so a body with an unexpected shape — ``choices`` not a
        non-empty list, a non-dict choice or message, or a ``content`` that
        is null or not a string — yields ``""`` instead of raising.
        """
        if not isinstance(data, dict):
            return ""
        choices = data.get("choices")
        if not isinstance(choices, (list, tuple)) or not choices:
            return ""
        first = choices[0]
        if not isinstance(first, dict):
            return ""
        message = first.get("message")
        if not isinstance(message, dict):
            return ""
        content = message.get("content")
        # ``content`` may be null or a non-string value; only a plain string
        # can be stripped and returned.
        return content.strip() if isinstance(content, str) else ""

1021

1022

# Base URL for Gemini model endpoints; ``GoogleApiAdapter._api_url`` appends
# ``/{model}:generateContent`` to it.
_GEMINI_API_BASE = "https://generativelanguage.googleapis.com/v1beta/models"

1024

1025

class GoogleApiAdapter(_HostedApiAdapter):
    """Reviewer backed by the hosted Google Gemini API (issue #432).

    The key comes from ``GEMINI_API_KEY``. Configure it as an ordinary
    ``[[agent]]`` with ``vendor = "google-api"`` and a ``model`` (e.g. a
    current Gemini model id); no ``command``, ``agy`` CLI install, or
    interactive login is involved.

    How it differs from the other two hosted adapters:

    - Gemini puts the model id in the URL **path**
      (``.../models/{model}:generateContent``) rather than the request body,
      so ``_api_url()`` is rebuilt from ``self.spec.model`` on every call
      instead of returning a fixed constant.
    - The key is sent in the ``x-goog-api-key`` header. Gemini would also
      take it as a ``?key=...`` query parameter, but that form leaks far
      more easily (proxy/access logs, anything that prints the request URL),
      so it is deliberately not supported.

    A prompt blocked by Gemini's safety filters comes back with an empty
    ``candidates`` list (and a ``promptFeedback.blockReason``). That case is
    not told apart from a genuinely empty response; both currently surface as
    the generic ``ERR_EMPTY_OUTPUT`` — a possible future refinement, not
    needed for parity with the other two adapters.
    """

    _API_KEY_ENV = "GEMINI_API_KEY"

    def _api_url(self) -> str:
        """Return the ``generateContent`` URL for the configured model."""
        from urllib.parse import quote

        # Percent-encode the model id as one path segment (issue #432 review):
        # with ``safe=""`` even `/`, `?` and `#` are escaped, so an
        # operator-configured model cannot alter the request's path or query.
        segment = quote(self.spec.model or "", safe="")
        return f"{_GEMINI_API_BASE}/{segment}:generateContent"

    def build_payload(self, prompt: str) -> dict:
        """Build the Gemini ``generateContent`` request body (pure)."""
        text_part = {"text": prompt}
        return {"contents": [{"parts": [text_part]}]}

    def _headers(self) -> dict[str, str]:
        """Return the request headers; the key travels in ``x-goog-api-key``."""
        headers = {"Content-Type": "application/json"}
        headers["x-goog-api-key"] = self._api_key()
        return headers

    @staticmethod
    def parse_content(data: dict) -> str:
        """Extract the assistant text from a ``generateContent`` response.

        Joins the string ``text`` of every part in
        ``candidates[0].content.parts`` and strips the result; a missing or
        mis-typed level yields ``""``.
        """
        if not isinstance(data, dict):
            return ""
        candidates = data.get("candidates")
        if not candidates:
            return ""
        first = candidates[0]
        if not isinstance(first, dict):
            return ""
        content = first.get("content")
        if not isinstance(content, dict):
            return ""
        pieces = []
        for part in content.get("parts") or []:
            if not isinstance(part, dict):
                continue
            text = part.get("text", "")
            if isinstance(text, str):
                pieces.append(text)
        return "".join(pieces).strip()

1091

1092

class MockAdapter(Adapter):
    """Offline adapter for tests and ``--mock`` runs.

    Produces deterministic, phase-aware text so the full orchestration pipeline
    can run end-to-end without live CLIs, auth, or token spend.
    """

    # Synthetic capabilities: the mock is offline and runs no real CLI.
    SUPPORTS_HEADLESS = True
    SUPPORTS_MODEL_SELECTION = False

    def available(self) -> bool:
        """Always available: the mock has nothing to install or reach."""
        return True

    def detect_capabilities(self) -> dict:
        """Deterministic fake capabilities so doctor/tests stay stable offline."""
        return {
            "version": "mock-1.0",
            "supports_headless": self.SUPPORTS_HEADLESS,
            "supports_model_selection": self.SUPPORTS_MODEL_SELECTION,
            "raw_version_output": "mock-1.0",
            "status": CAP_OK,
            "warnings": [],
        }

    def run(self, prompt: str, phase: str = "review", timeout: int | None = None) -> AgentResult:
        """Return canned, phase-specific output attributed to this agent.

        Args:
            prompt: Ignored; the mock's output does not depend on it.
            phase: ``"review"``, ``"debate"`` or ``"verify"`` select their own
                canned body; any other value produces the synthesis body.
            timeout: Ignored.

        Returns:
            A successful ``AgentResult`` with a duration of ``0.0``.
        """
        # Function-scope import: the review phase serialises its findings
        # with ``json`` so the agent name is always escaped correctly.
        import json

        del prompt, timeout
        n = self.name
        if phase == "review":
            findings = [
                {
                    "severity": "major",
                    "file": "src/example.py",
                    "line": 42,
                    "claim": f"{n}: unchecked return value may swallow an error",
                    "evidence": "the added code ignores the return value of int(x)",
                    "suggested_fix": "check the result and raise on failure",
                    "confidence": "high",
                    "reviewer": f"{n}",
                },
                {
                    "severity": "minor",
                    "file": "src/example.py",
                    "line": 7,
                    "claim": f"{n}: missing docstring",
                    "evidence": "the new function parse() has no docstring",
                    "suggested_fix": "add a one-line docstring",
                    "confidence": "medium",
                    "reviewer": f"{n}",
                },
            ]
            # json.dumps' default separators give the same one-object-per-line
            # text as hand-written JSON for ordinary names, and additionally
            # escape a name containing `"`, `\` or control characters, which
            # raw interpolation would turn into invalid JSON.
            # ensure_ascii=False leaves non-ASCII names unescaped.
            rows = ",\n".join(
                f" {json.dumps(finding, ensure_ascii=False)}" for finding in findings
            )
            body = (
                f"- **[major]** `src/example.py:42` — {n}: unchecked return value "
                f"may swallow an error.\n"
                f"- **[minor]** `src/example.py:7` — {n}: missing docstring.\n\n"
                "```json\n"
                "[\n"
                f"{rows}\n"
                "]\n"
                "```"
            )
        elif phase == "debate":
            body = (
                f"## AGREE\n- {n}: confirm the unchecked-return finding at "
                f"`src/example.py:42`.\n"
                f"## DISPUTE\n- {n}: the missing-docstring finding is a nit, not blocking.\n"
                f"## MISSED\n- {n}: no test covers the error branch."
            )
        elif phase == "verify":
            body = (
                "Verification: confirming the unchecked-return finding at "
                "`src/example.py:42`; the missing-docstring claim at `:7` is a nit "
                "not supported as blocking.\n\n"
                "```json\n"
                "[\n"
                ' {"file": "src/example.py", "line": 42, '
                '"claim": "unchecked return value may swallow an error", '
                '"status": "verified", '
                '"reasoning": "the added code ignores the return value of int(x)"},\n'
                ' {"file": "src/example.py", "line": 7, '
                '"claim": "missing docstring", '
                '"status": "unsupported", '
                '"reasoning": "a missing docstring is not a defect the diff introduces"}\n'
                "]\n"
                "```"
            )
        else:  # synthesis
            body = (
                "## Verdict\nREQUEST CHANGES — one confirmed major issue.\n\n"
                "## Consensus findings\n- **[major]** `src/example.py:42` — unchecked "
                "return value (raised by all reviewers).\n\n"
                "## Disputed findings\n- Missing docstring: ruled non-blocking.\n\n"
                "## Notable single-reviewer findings\n- Missing test for the error branch."
            )
        return AgentResult(n, self.spec.vendor, True, body, 0.0)

1175

1176

# Maps a configured ``vendor`` string to the adapter class that serves it.
# ``make_adapter`` looks vendors up here; a vendor missing from this table
# falls back to ``AgyAdapter`` there.
_VENDOR_ADAPTERS: dict[str, type[Adapter]] = {
    "anthropic": ClaudeAdapter,
    "openai": CodexAdapter,
    "google": AgyAdapter,
    "local": LocalAdapter,
    # Hosted-API variants: plain HTTP calls keyed by an env-var API key.
    "anthropic-api": AnthropicApiAdapter,
    "openai-api": OpenAiApiAdapter,
    "google-api": GoogleApiAdapter,
}

1186

1187

def make_adapter(spec: AgentSpec, mock: bool = False) -> Adapter:
    """Instantiate the adapter for *spec*.

    With ``mock`` set, a ``MockAdapter`` is returned regardless of vendor.
    Otherwise the class is chosen by ``spec.vendor`` from
    ``_VENDOR_ADAPTERS``; a vendor not listed there gets an ``AgyAdapter``.
    """
    if mock:
        return MockAdapter(spec)
    # Unknown vendor: fall back to AgyAdapter as the generic CLI wrapper.
    adapter_cls = _VENDOR_ADAPTERS.get(spec.vendor, AgyAdapter)
    return adapter_cls(spec)

1195 return cls(spec)