Coverage for src/ai_jury/adapters.py: 100%
253 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-05 20:29 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-05 20:29 +0000
1"""Agent adapters — each wraps one native coding-agent CLI in headless mode.
3Every adapter turns a prompt into a subprocess invocation and captures stdout as
4the agent's response. Adapters are intentionally thin: the orchestrator owns the
5prompt content and the round structure; an adapter only knows how to *invoke its
6CLI*.
8Headless invocations (verified against installed CLIs, early 2026):
9 - Claude Code : ``claude -p "<prompt>" --output-format text``
10 - Codex CLI : ``codex exec <args> < <prompt>`` (prompt piped via stdin)
11 - Antigravity : ``agy --print "<prompt>"``
12"""
13from __future__ import annotations
15import re
16import shutil
17import subprocess
18import time
19from dataclasses import dataclass, field
21from .config import AgentSpec
# Short timeout, in seconds, for capability/version probes. Detection is
# best-effort and must never slow down or block a normal run, so probes are
# deliberately snappy.
_VERSION_PROBE_TIMEOUT = 10

# Finds a version-looking number such as "1.2" or "1.2.3". For a token like
# "v0.45.1" the match is the numeric part "0.45.1"; the leading "v" is not
# part of the match.
_VERSION_RE = re.compile(r"\d+\.\d+(?:\.\d+)?")

# Capability/version probe statuses reported under the "status" key of the
# dict returned by ``detect_capabilities``.
CAP_OK = "ok"
CAP_UNKNOWN_VERSION = "unknown_version"
CAP_UNAVAILABLE = "unavailable"
# Stable, typed error taxonomy for failed agent executions. These codes let
# reports and CI/policy distinguish retryable from non-retryable failures
# instead of pattern-matching free-text error strings.
ERR_MISSING_CLI = "missing_cli"
ERR_AUTH_REQUIRED = "auth_required"
ERR_PERMISSION_PROMPT = "permission_prompt"
ERR_TIMEOUT = "timeout"
ERR_NONZERO_EXIT = "nonzero_exit"
ERR_EMPTY_OUTPUT = "empty_output"
ERR_SPAWN_FAILED = "spawn_failed"
ERR_RATE_LIMITED = "rate_limited"
# Local/HTTP adapter could not reach its server (issue #43): connection refused,
# DNS failure, or the local model server is not running.
ERR_CONNECTION = "connection_error"
ERR_UNKNOWN = "unknown"

# The complete set of codes above, frozen so it can be used for membership
# checks without risk of mutation.
ERROR_CODES = frozenset({
    ERR_MISSING_CLI,
    ERR_AUTH_REQUIRED,
    ERR_PERMISSION_PROMPT,
    ERR_TIMEOUT,
    ERR_NONZERO_EXIT,
    ERR_EMPTY_OUTPUT,
    ERR_SPAWN_FAILED,
    ERR_RATE_LIMITED,
    ERR_CONNECTION,
    ERR_UNKNOWN,
})
# Failures that are worth retrying because they are typically transient (issue
# #30): a timeout, a rate-limit, a process that failed to spawn, or a local
# server that was briefly unreachable (#43). Auth, missing-CLI,
# permission-prompt, empty-output, and generic nonzero-exit are treated as
# deterministic — retrying them just burns time and tokens.
# Every member is also a member of ERROR_CODES.
RETRYABLE_ERROR_CODES = frozenset({
    ERR_TIMEOUT,
    ERR_RATE_LIMITED,
    ERR_SPAWN_FAILED,
    ERR_CONNECTION,
})
# Ordered keyword groups for classify_stderr. Each keyword is matched on word
# boundaries (\b...\b) so incidental substrings do NOT trigger a false
# classification: bare "auth" matches "auth error" but not "author identity",
# and "login" matches "login required" but not "login_attempts" ("_" is a word
# char, so there is no boundary inside "login_attempts"). Multi-word phrases
# tolerate a space OR "_" between tokens (e.g. "rate limit"/"rate_limit").
def _keyword_pattern(*keywords: str) -> re.Pattern[str]:
    """Compile ``keywords`` into a single word-boundary alternation regex.

    Each keyword is split on whitespace; its tokens are regex-escaped and
    rejoined with ``[ _]+`` so a phrase matches whether its words are separated
    by spaces or underscores. The alternatives are wrapped in ``\\b(?:...)\\b``.

    Blank keywords (empty or whitespace-only) are ignored. Without that guard
    they would contribute an empty alternative, and an empty alternative
    between two ``\\b`` anchors matches at every word boundary — i.e. in
    virtually any text. If no usable keyword remains, a pattern that can never
    match is returned instead.

    Args:
        *keywords: Literal words or space-separated phrases to look for.

    Returns:
        The compiled pattern; use ``.search()`` on already-lowercased text.
    """
    parts = [
        r"[ _]+".join(re.escape(tok) for tok in kw.split())
        for kw in keywords
        if kw.strip()
    ]
    if not parts:
        # "(?!)" is an empty negative lookahead: it fails at every position.
        return re.compile(r"(?!)")
    return re.compile(r"\b(?:" + "|".join(parts) + r")\b")
# Order matters: auth and rate-limit signals are checked before the generic
# permission and nonzero-exit fallbacks.
#
# All three patterns are applied to lowercased stderr, so keywords are written
# in lowercase. Because matching is on word boundaries, singular and plural
# forms ("credential"/"credentials", "permission"/"permissions") are listed
# separately.

# Signals that the CLI wants credentials or a login.
_AUTH_RE = _keyword_pattern(
    "not authenticated", "unauthenticated", "authentication", "unauthorized",
    "api key", "auth", "log in", "login", "credential", "credentials",
)
# Signals throttling or an exhausted quota.
_RATE_LIMIT_RE = _keyword_pattern("rate limit", "429", "quota", "too many requests")
# Signals that the CLI stopped to ask for permission/approval/confirmation.
_PERMISSION_RE = _keyword_pattern(
    "permission", "permissions", "approve", "approval", "confirm", "confirmation",
)
def classify_stderr(returncode: int, stderr: str) -> str:
    """Map the stderr of a nonzero-exit run onto one of the ``ERR_*`` codes.

    The text is lowercased and tested against the keyword patterns in a fixed
    priority order: authentication first, then rate limiting, then permission
    prompts. The patterns are word-boundary regexes, so a keyword embedded in
    a longer word (e.g. "auth" inside "author") does not count. When nothing
    matches, the generic ``ERR_NONZERO_EXIT`` is returned.

    Args:
        returncode: Exit status of the process. Accepted for the caller's
            convenience; it does not influence the classification.
        stderr: Captured error text; ``None`` or empty is treated as "".

    Returns:
        One of ``ERR_AUTH_REQUIRED``, ``ERR_RATE_LIMITED``,
        ``ERR_PERMISSION_PROMPT`` or ``ERR_NONZERO_EXIT``.
    """
    del returncode  # intentionally unused
    haystack = stderr.lower() if stderr else ""
    # First hit wins, so the order of this table is the priority order.
    ordered_checks = (
        (_AUTH_RE, ERR_AUTH_REQUIRED),
        (_RATE_LIMIT_RE, ERR_RATE_LIMITED),
        (_PERMISSION_RE, ERR_PERMISSION_PROMPT),
    )
    for pattern, code in ordered_checks:
        if pattern.search(haystack):
            return code
    return ERR_NONZERO_EXIT
@dataclass
class AgentResult:
    """Outcome of a single agent invocation, successful or failed."""

    # Name of the agent; adapters in this module pass ``spec.name``.
    agent: str
    # Vendor key; adapters in this module pass ``spec.vendor``.
    vendor: str
    # True for a usable response; every failure path in this module sets False.
    ok: bool
    # The agent's response text. Adapters in this module leave it empty on
    # failure.
    output: str
    # Elapsed time in seconds (adapters here measure it with time.monotonic()).
    duration_s: float
    # Human-readable failure description; left as None on success.
    error: str | None = None
    # Not populated by the adapters in this module — presumably filled in by
    # later pipeline stages; TODO confirm against the orchestrator.
    findings: list = field(default_factory=list)
    warnings: list = field(default_factory=list)
    # One of the ERR_* codes on failure; left as None on success.
    error_code: str | None = None
    # Number of attempts made for this result (issue #30): 1 means no retry.
    # >1 records that a transient failure was retried before this outcome.
    attempts: int = 1
class Adapter:
    """Base adapter that runs one coding-agent CLI as a headless subprocess.

    Subclasses build the argv for their CLI via :meth:`build_argv` and may
    override :meth:`_stdin_for` to feed the prompt on stdin instead. The base
    class owns the PATH availability check, the version/capability probe, and
    the mapping of subprocess outcomes onto ``AgentResult`` error codes.

    The agent spec is read for ``name``, ``command``, ``vendor`` and
    ``timeout`` here; subclasses in this module additionally read ``model``
    and ``extra_args``.
    """

    # Declarative capability metadata. Real coding-agent CLIs support a headless
    # (non-interactive) invocation and model selection; subclasses override where
    # this differs. ``MockAdapter`` reports synthetic capabilities.
    SUPPORTS_HEADLESS = True
    SUPPORTS_MODEL_SELECTION = True

    # Args passed to the CLI to print its version. Subclasses override if the CLI
    # uses a different verb/flag (e.g. ``codex --version``).
    _VERSION_ARGS = ("--version",)

    def __init__(self, spec: AgentSpec):
        """Remember the agent spec this adapter invokes."""
        self.spec = spec

    @property
    def name(self) -> str:
        """The agent's configured name (``spec.name``)."""
        return self.spec.name

    def available(self) -> bool:
        """Return True when ``spec.command`` resolves to an executable on PATH."""
        return shutil.which(self.spec.command) is not None

    def build_argv(self, prompt: str) -> list[str]:  # pragma: no cover - overridden
        """Return the full argv for one headless invocation (subclass hook)."""
        raise NotImplementedError

    def _stdin_for(self, prompt: str) -> str | None:
        """Prompt to feed on stdin, or None to pass it in argv (the default)."""
        del prompt
        return None

    def _version_argv(self) -> list[str]:
        """Argv used to probe the CLI's version."""
        return [self.spec.command, *self._VERSION_ARGS]

    def detect_capabilities(self) -> dict:
        """Best-effort probe of this agent's version and capabilities.

        Returns a dict shaped like::

            {
                "version": "<str|None>",
                "supports_headless": bool,
                "supports_model_selection": bool,
                "raw_version_output": "<short str>",
                "status": "ok|unknown_version|unavailable",
                "warnings": [...],
            }

        This is intentionally fast and forgiving: it runs ``<command> --version``
        with a SHORT timeout and swallows ALL errors (missing CLI, timeout,
        nonzero exit, garbage output). It NEVER raises, so it is safe to call
        from diagnostics without blocking or crashing a run.
        """
        caps = {
            "version": None,
            "supports_headless": self.SUPPORTS_HEADLESS,
            "supports_model_selection": self.SUPPORTS_MODEL_SELECTION,
            "raw_version_output": "",
            "status": CAP_UNAVAILABLE,
            "warnings": [],
        }

        # Not on PATH: report unavailable without spawning a subprocess.
        if not self.available():
            return caps

        try:
            proc = subprocess.run(
                self._version_argv(),
                capture_output=True,
                text=True,
                # Garbage bytes in the version banner are replaced rather than
                # raising UnicodeDecodeError, so the regex below still gets a
                # chance to find a version number.
                errors="replace",
                timeout=_VERSION_PROBE_TIMEOUT,
            )
        except subprocess.TimeoutExpired:
            caps["status"] = CAP_UNKNOWN_VERSION
            caps["warnings"].append(
                f"version probe for '{self.spec.command}' timed out after "
                f"{_VERSION_PROBE_TIMEOUT}s"
            )
            return caps
        except Exception as exc:  # noqa: BLE001 - swallow any spawn failure
            caps["status"] = CAP_UNKNOWN_VERSION
            caps["warnings"].append(
                f"version probe for '{self.spec.command}' failed: {exc}"
            )
            return caps

        # Some CLIs print their version on stderr, so both streams are searched.
        raw = ((proc.stdout or "") + (proc.stderr or "")).strip()
        caps["raw_version_output"] = raw[:200]
        match = _VERSION_RE.search(raw)
        if proc.returncode == 0 and match:
            caps["version"] = match.group(0)
            caps["status"] = CAP_OK
        else:
            caps["status"] = CAP_UNKNOWN_VERSION
            caps["warnings"].append(
                f"could not determine version of '{self.spec.command}' "
                f"(exit {proc.returncode}); capabilities assumed from vendor defaults"
            )
        return caps

    def run(self, prompt: str, phase: str = "review", timeout: int | None = None) -> AgentResult:
        """Invoke the CLI once with ``prompt`` and capture its stdout.

        Args:
            prompt: Prompt text, passed in argv or on stdin depending on the
                subclass (see :meth:`_stdin_for`).
            phase: Unused by the base adapter.
            timeout: Optional caller budget in seconds. It can only shorten
                the agent's own ``spec.timeout`` and is clamped to >= 1.

        Returns:
            An ``AgentResult``. ``ok`` is True only for exit code 0 with
            non-empty stdout; every failure carries an ``ERR_*`` code. This
            method does not raise for subprocess failures.
        """
        del phase
        if not self.available():
            return AgentResult(
                self.name, self.spec.vendor, False, "",
                0.0, f"command not found on PATH: {self.spec.command}",
                error_code=ERR_MISSING_CLI,
            )
        # The effective timeout is the caller's override (the run budget, issue
        # #30) when smaller than the agent's own bound, else the agent timeout.
        effective_timeout = self.spec.timeout
        if timeout is not None:
            effective_timeout = max(1, min(self.spec.timeout, int(timeout)))
        argv = self.build_argv(prompt)
        stdin = self._stdin_for(prompt)
        start = time.monotonic()
        try:
            proc = subprocess.run(
                argv,
                input=stdin,
                capture_output=True,
                text=True,
                # Decode leniently: with strict decoding a single undecodable
                # byte in the CLI's output raises UnicodeDecodeError, which the
                # broad handler below would misreport as a retryable
                # "spawn failed" and throw the whole response away.
                errors="replace",
                timeout=effective_timeout,
            )
        except subprocess.TimeoutExpired:
            return AgentResult(
                self.name, self.spec.vendor, False, "",
                time.monotonic() - start, f"timed out after {effective_timeout}s",
                error_code=ERR_TIMEOUT,
            )
        except Exception as exc:  # noqa: BLE001 - surface any spawn failure
            return AgentResult(
                self.name, self.spec.vendor, False, "",
                time.monotonic() - start, f"spawn failed: {exc}",
                error_code=ERR_SPAWN_FAILED,
            )
        dur = time.monotonic() - start
        out = (proc.stdout or "").strip()
        # A nonzero exit is ALWAYS a failure, even with stdout (issue #101): a
        # crashing CLI can still print partial or error output, and counting that
        # as a clean review would silently feed it into consensus, synthesis, and
        # the CI gate. We classify from stderr (falling back to any stdout) and
        # keep a short snippet in the error for debugging — but ok=False, so the
        # orchestrator excludes it.
        if proc.returncode != 0:
            stderr = (proc.stderr or "").strip()
            detail = stderr or out
            return AgentResult(
                self.name, self.spec.vendor, False, "",
                dur, f"exit {proc.returncode}: {detail[:500]}",
                error_code=classify_stderr(proc.returncode, stderr or out),
            )
        if not out:
            # Exit 0 but nothing on stdout: the agent produced no usable review.
            return AgentResult(
                self.name, self.spec.vendor, False, "",
                dur, f"exit {proc.returncode}: empty output",
                error_code=ERR_EMPTY_OUTPUT,
            )
        return AgentResult(self.name, self.spec.vendor, True, out, dur)
class ClaudeAdapter(Adapter):
    """Adapter for the Claude Code CLI; the prompt is passed with ``-p``."""

    def build_argv(self, prompt: str) -> list[str]:
        """Return ``<command> -p <prompt> [--model <model>]`` plus ``extra_args``."""
        # The model flag is only emitted when a model is configured.
        model_flags = ["--model", self.spec.model] if self.spec.model else []
        return [self.spec.command, "-p", prompt] + model_flags + self.spec.extra_args
class CodexAdapter(Adapter):
    """Adapter for ``codex exec``; the prompt travels on stdin, not in argv."""

    # Feeding the prompt through stdin (rather than as a positional argument)
    # keeps ``codex exec`` from blocking on input in non-interactive runs.
    # Sandbox flags are carried in extra_args; the shipped default is
    # ``-s read-only`` (secure by default, #100) — the reviewer only reads its
    # prompt, since the jury fetches the diff via ``gh``.
    def build_argv(self, prompt: str) -> list[str]:
        """Return ``<command> exec [-m <model>]`` plus ``extra_args``."""
        del prompt  # delivered via _stdin_for instead
        model_flags = ["-m", self.spec.model] if self.spec.model else []
        return [self.spec.command, "exec"] + model_flags + self.spec.extra_args

    def _stdin_for(self, prompt: str) -> str | None:
        """Send the whole prompt on the child's stdin."""
        return prompt
class AgyAdapter(Adapter):
    """Adapter for the ``agy`` CLI; the prompt is passed with ``--print``."""

    def build_argv(self, prompt: str) -> list[str]:
        """Return ``<command> --print <prompt> [--model <model>]`` plus ``extra_args``."""
        # The model flag is only emitted when a model is configured.
        model_flags = ["--model", self.spec.model] if self.spec.model else []
        return [self.spec.command, "--print", prompt] + model_flags + self.spec.extra_args
_DEFAULT_LOCAL_ENDPOINT = "http://localhost:11434/v1"


def list_local_models(endpoint: str = _DEFAULT_LOCAL_ENDPOINT) -> list[str]:
    """Return the model ids advertised by a local OpenAI-compatible server (issue #109).

    Issues a GET to ``{endpoint}/models`` — the listing exposed by Ollama,
    vLLM, LM Studio and similar servers — and returns the ids in the order the
    server reported them. An endpoint that already ends in ``/models`` is used
    as-is; an empty endpoint falls back to the default.

    Discovery is best-effort and stdlib-only: any failure while fetching or
    decoding (server down, bad JSON, ...) and any unexpected response shape
    yields ``[]`` so callers can fall back gracefully.
    """
    import json as _json
    import urllib.request

    root = (endpoint or _DEFAULT_LOCAL_ENDPOINT).rstrip("/")
    listing_url = root if root.endswith("/models") else f"{root}/models"
    try:
        with urllib.request.urlopen(listing_url, timeout=_VERSION_PROBE_TIMEOUT) as resp:  # noqa: S310
            payload = _json.loads(resp.read().decode("utf-8"))
    except Exception:  # noqa: BLE001 - discovery is best-effort
        return []

    if not isinstance(payload, dict):
        return []
    entries = payload.get("data")
    if not isinstance(entries, list):
        return []

    found = []
    for entry in entries:
        # Skip anything that is not an object carrying a truthy "id".
        if not isinstance(entry, dict):
            continue
        model_id = entry.get("id")
        if model_id:
            found.append(str(model_id))
    return found
class LocalAdapter(Adapter):
    """Open-weight / local-model reviewer over an OpenAI-compatible API (issue #43).

    Targets the ``/v1/chat/completions`` endpoint exposed by common local servers
    (Ollama, llama.cpp ``llama-server``, vLLM, LM Studio). It talks plain HTTP via
    the stdlib (``urllib``) — no new dependencies and no subprocess — so one panel
    seat can run free and fully offline, adding model diversity (the load-bearing
    advantage) at zero marginal cost.

    Configure as a normal ``[[agent]]`` with ``vendor = "local"``, an
    ``endpoint`` (base URL, default ``http://localhost:11434/v1``), and a
    ``model``. ``extra_args`` is unused. An unreachable server fails with the
    typed ``connection_error`` code (issue #29) rather than a crash.
    """

    SUPPORTS_HEADLESS = True
    SUPPORTS_MODEL_SELECTION = True

    @property
    def endpoint(self) -> str:
        """Configured base URL (or the default) without trailing slashes."""
        return (self.spec.endpoint or _DEFAULT_LOCAL_ENDPOINT).rstrip("/")

    def completions_url(self) -> str:
        """Resolve the chat-completions URL from the configured base endpoint.

        Accepts either a base URL (``…/v1``) or a full completions URL; pure so it
        can be unit-tested without network.
        """
        base = self.endpoint
        if base.endswith("/chat/completions"):
            return base
        return f"{base}/chat/completions"

    def build_payload(self, prompt: str) -> dict:
        """Build the OpenAI-compatible chat-completions request body (pure)."""
        return {
            "model": self.spec.model or "",
            "messages": [{"role": "user", "content": prompt}],
            "stream": False,
            "temperature": 0,
        }

    @staticmethod
    def parse_content(data: dict) -> str:
        """Extract the assistant message text from a chat-completions response.

        Reads ``choices[0].message.content`` and strips it. Any deviation from
        that shape (``data`` not an object, ``choices`` missing/empty or not a
        list, a non-object choice or message, non-string content) yields ``""``
        instead of raising, so a malformed-but-valid-JSON response surfaces
        from :meth:`run` as a typed empty-output failure rather than a crash.
        """
        if not isinstance(data, dict):
            return ""
        choices = data.get("choices")
        if not isinstance(choices, list) or not choices:
            return ""
        first = choices[0]
        if not isinstance(first, dict):
            return ""
        message = first.get("message")
        if not isinstance(message, dict):
            return ""
        content = message.get("content")
        return content.strip() if isinstance(content, str) else ""

    @staticmethod
    def classify_http_status(status: int) -> str:
        """Map an HTTP error status to a typed error code (issue #29)."""
        if status in (401, 403):
            return ERR_AUTH_REQUIRED
        if status == 429:
            return ERR_RATE_LIMITED
        return ERR_NONZERO_EXIT

    def available(self) -> bool:
        """A local agent is 'available' when its server answers a quick probe.

        Probes ``{endpoint}/models`` with a short timeout. Any response with a
        status below 500 — including an HTTP error such as 404 — counts as
        "the server is up". Network-only; never raises.
        """
        import urllib.error
        import urllib.request

        url = f"{self.endpoint}/models"
        try:
            with urllib.request.urlopen(url, timeout=_VERSION_PROBE_TIMEOUT) as resp:  # noqa: S310
                return 200 <= resp.status < 500
        except urllib.error.HTTPError as exc:
            # A 4xx (e.g. 404 on /models) still means the server is up.
            return exc.code < 500
        except Exception:  # noqa: BLE001 - unreachable server -> not available
            return False

    def detect_capabilities(self) -> dict:
        """Report capabilities from a reachability probe (no version is detected).

        Returns the same dict shape as ``Adapter.detect_capabilities``:
        ``status`` is ``CAP_OK`` when :meth:`available` succeeds, otherwise
        ``CAP_UNAVAILABLE`` with a warning naming the endpoint.
        """
        reachable = self.available()
        return {
            "version": None,
            "supports_headless": self.SUPPORTS_HEADLESS,
            "supports_model_selection": self.SUPPORTS_MODEL_SELECTION,
            "raw_version_output": f"local endpoint {self.endpoint}",
            "status": CAP_OK if reachable else CAP_UNAVAILABLE,
            "warnings": (
                [] if reachable else [f"local server unreachable at {self.endpoint}"]
            ),
        }

    def _failure(self, started: float, message: str, code: str) -> AgentResult:
        """Build a failed result whose duration is measured from ``started``."""
        return AgentResult(
            self.name, self.spec.vendor, False, "",
            time.monotonic() - started, message,
            error_code=code,
        )

    def run(self, prompt: str, phase: str = "review", timeout: int | None = None) -> AgentResult:
        """POST ``prompt`` to the chat-completions endpoint and return the reply.

        Args:
            prompt: Sent as the single user message.
            phase: Unused by this adapter.
            timeout: Optional caller budget in seconds. It can only shorten
                the agent's own ``spec.timeout`` and is clamped to >= 1.

        Returns:
            An ``AgentResult``. Failures are typed: HTTP error statuses go
            through :meth:`classify_http_status`, timeouts map to
            ``ERR_TIMEOUT``, an unreachable server to ``ERR_CONNECTION``, an
            empty or malformed reply to ``ERR_EMPTY_OUTPUT``, and anything
            else raised while requesting or decoding to ``ERR_UNKNOWN``.
        """
        import json as _json
        import socket
        import urllib.error
        import urllib.request

        del phase
        effective_timeout = self.spec.timeout
        if timeout is not None:
            effective_timeout = max(1, min(self.spec.timeout, int(timeout)))
        body = _json.dumps(self.build_payload(prompt)).encode("utf-8")
        req = urllib.request.Request(
            self.completions_url(),
            data=body,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        # socket.timeout is an alias of TimeoutError on Python 3.10+, but a
        # separate OSError subclass before that; naming both keeps the
        # classification identical across versions.
        timeout_errors = (TimeoutError, socket.timeout)
        timeout_message = f"timed out after {effective_timeout}s"
        start = time.monotonic()
        try:
            with urllib.request.urlopen(req, timeout=effective_timeout) as resp:  # noqa: S310
                raw = resp.read().decode("utf-8")
            data = _json.loads(raw)
        except urllib.error.HTTPError as exc:
            # Must precede URLError: HTTPError is a subclass of it.
            detail = ""
            try:
                detail = exc.read().decode("utf-8")[:300]
            except Exception:  # noqa: BLE001
                detail = exc.reason or ""
            return self._failure(
                start, f"HTTP {exc.code}: {detail}", self.classify_http_status(exc.code),
            )
        except timeout_errors:
            # Timeout while waiting for or reading the response.
            return self._failure(start, timeout_message, ERR_TIMEOUT)
        except urllib.error.URLError as exc:
            # urllib wraps failures from the connect/send phase in URLError, so
            # a timeout there arrives as ``reason`` rather than as a bare
            # TimeoutError; report it as a timeout, not as an unreachable server.
            if isinstance(exc.reason, timeout_errors):
                return self._failure(start, timeout_message, ERR_TIMEOUT)
            return self._failure(
                start,
                f"could not reach local server at {self.endpoint}: {exc.reason}",
                ERR_CONNECTION,
            )
        except Exception as exc:  # noqa: BLE001 - surface any other failure
            return self._failure(start, f"local request failed: {exc}", ERR_UNKNOWN)
        dur = time.monotonic() - start
        content = self.parse_content(data)
        if not content:
            return AgentResult(
                self.name, self.spec.vendor, False, "",
                dur, "local model returned empty content",
                error_code=ERR_EMPTY_OUTPUT,
            )
        return AgentResult(self.name, self.spec.vendor, True, content, dur)
class MockAdapter(Adapter):
    """Offline stand-in used by tests and ``--mock`` runs.

    Emits fixed, phase-dependent text so the whole orchestration pipeline can
    be exercised end-to-end with no live CLI, no auth and no token spend.
    """

    # Synthetic capabilities: the mock is offline and runs no real CLI.
    SUPPORTS_HEADLESS = True
    SUPPORTS_MODEL_SELECTION = False

    def available(self) -> bool:
        """The mock is always available."""
        return True

    def detect_capabilities(self) -> dict:
        """Return fixed fake capabilities so doctor/tests stay stable offline."""
        fake_version = "mock-1.0"
        return {
            "version": fake_version,
            "supports_headless": self.SUPPORTS_HEADLESS,
            "supports_model_selection": self.SUPPORTS_MODEL_SELECTION,
            "raw_version_output": fake_version,
            "status": CAP_OK,
            "warnings": [],
        }

    def run(self, prompt: str, phase: str = "review", timeout: int | None = None) -> AgentResult:
        """Return the canned response for ``phase``; always ok, zero duration.

        ``prompt`` and ``timeout`` are ignored. Phases "review", "debate" and
        "verify" have dedicated bodies; any other phase gets the synthesis body.
        """
        del prompt, timeout
        who = self.name
        if phase == "review":
            text = self._review_text(who)
        elif phase == "debate":
            text = self._debate_text(who)
        elif phase == "verify":
            text = self._verify_text()
        else:  # synthesis
            text = self._synthesis_text()
        return AgentResult(who, self.spec.vendor, True, text, 0.0)

    @staticmethod
    def _review_text(who: str) -> str:
        """Markdown findings followed by the same findings as a fenced JSON list."""
        return (
            f"- **[major]** `src/example.py:42` — {who}: unchecked return value "
            f"may swallow an error.\n"
            f"- **[minor]** `src/example.py:7` — {who}: missing docstring.\n\n"
            "```json\n"
            "[\n"
            ' {"severity": "major", "file": "src/example.py", "line": 42, '
            f'"claim": "{who}: unchecked return value may swallow an error", '
            '"evidence": "the added code ignores the return value of int(x)", '
            '"suggested_fix": "check the result and raise on failure", '
            f'"confidence": "high", "reviewer": "{who}"}},\n'
            ' {"severity": "minor", "file": "src/example.py", "line": 7, '
            f'"claim": "{who}: missing docstring", '
            '"evidence": "the new function parse() has no docstring", '
            '"suggested_fix": "add a one-line docstring", '
            f'"confidence": "medium", "reviewer": "{who}"}}\n'
            "]\n"
            "```"
        )

    @staticmethod
    def _debate_text(who: str) -> str:
        """AGREE / DISPUTE / MISSED sections attributed to ``who``."""
        return (
            f"## AGREE\n- {who}: confirm the unchecked-return finding at "
            f"`src/example.py:42`.\n"
            f"## DISPUTE\n- {who}: the missing-docstring finding is a nit, not blocking.\n"
            f"## MISSED\n- {who}: no test covers the error branch."
        )

    @staticmethod
    def _verify_text() -> str:
        """Verification prose followed by a fenced JSON list of verdicts."""
        return (
            "Verification: confirming the unchecked-return finding at "
            "`src/example.py:42`; the missing-docstring claim at `:7` is a nit "
            "not supported as blocking.\n\n"
            "```json\n"
            "[\n"
            ' {"file": "src/example.py", "line": 42, '
            '"claim": "unchecked return value may swallow an error", '
            '"status": "verified", '
            '"reasoning": "the added code ignores the return value of int(x)"},\n'
            ' {"file": "src/example.py", "line": 7, '
            '"claim": "missing docstring", '
            '"status": "unsupported", '
            '"reasoning": "a missing docstring is not a defect the diff introduces"}\n'
            "]\n"
            "```"
        )

    @staticmethod
    def _synthesis_text() -> str:
        """Final verdict with consensus, disputed and single-reviewer sections."""
        return (
            "## Verdict\nREQUEST CHANGES — one confirmed major issue.\n\n"
            "## Consensus findings\n- **[major]** `src/example.py:42` — unchecked "
            "return value (raised by all reviewers).\n\n"
            "## Disputed findings\n- Missing docstring: ruled non-blocking.\n\n"
            "## Notable single-reviewer findings\n- Missing test for the error branch."
        )
# Registry mapping an agent spec's ``vendor`` string to the adapter class that
# knows how to invoke that vendor's agent. ``make_adapter`` consults it and
# falls back to ``AgyAdapter`` for vendors that are not listed.
_VENDOR_ADAPTERS: dict[str, type[Adapter]] = {
    "anthropic": ClaudeAdapter,
    "openai": CodexAdapter,
    "google": AgyAdapter,
    "local": LocalAdapter,
}
def make_adapter(spec: AgentSpec, mock: bool = False) -> Adapter:
    """Instantiate the adapter that should run ``spec``.

    Args:
        spec: Agent spec; its ``vendor`` selects the adapter class.
        mock: When true, always return a ``MockAdapter`` regardless of vendor.

    Returns:
        A ``MockAdapter`` in mock mode, otherwise the class registered for
        ``spec.vendor`` in ``_VENDOR_ADAPTERS``. A vendor with no registration
        gets an ``AgyAdapter``, i.e. the command is treated as a print-style
        CLI.
    """
    if mock:
        return MockAdapter(spec)
    adapter_cls = _VENDOR_ADAPTERS.get(spec.vendor)
    # Unknown vendor: treat command as a print-style CLI.
    return AgyAdapter(spec) if adapter_cls is None else adapter_cls(spec)