Coverage for src/ai_jury/benchmark.py: 100%
160 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-05 20:29 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-05 20:29 +0000
1"""Offline benchmark for jury review quality (issue #12).
3Measures, in a small and *directional* way, whether a jury's structured
4findings line up with hand-authored expectations for a set of fixture diffs.
6Design (honesty matters here)
7-----------------------------
8The :class:`~ai_jury.adapters.MockAdapter` emits a *fixed* canned
9finding regardless of the diff it is given, so ``--mock`` output does **not**
10reflect a fixture's content. Running ``--mock`` per fixture and scoring it would
11be fake signal. We therefore separate two concerns:
13* A **scorer** (:func:`score_fixture`) that compares an arbitrary list of
14 finding dicts against a fixture's ``expected`` spec. This is pure and
15 deterministic.
16* A **finding source**:
17 - *Offline / CI default ("recorded" mode):* every fixture ships a
18 hand-authored ``<id>.expected.json`` AND a recorded ``<id>.findings.json``
19 (a realistic sample of what a jury produced for that diff). The offline
20 benchmark scores recorded -> expected. It validates the SCORER + FIXTURES
21 and provides recorded baselines. It runs with no live CLIs and no network.
22 - *Optional live mode:* gated behind ``JURY_BENCH_LIVE=1``. It runs the
23 real jury (``run_jury(..., mock=False)``) per fixture diff and scores
24 the live findings. OFF by default; never in CI.
26What this benchmark does and does NOT claim
27-------------------------------------------
28It is small and directional. The offline run does not measure live review
29quality; it validates the scorer and the recorded baselines. True quality
30measurement requires live mode against real agent CLIs. See ``benchmark/README.md``.
32This module is stdlib-only and has no import-time dependency on the live
33adapters; ``run_live`` imports the orchestrator lazily.
34"""
35from __future__ import annotations
37import json
38import os
39from dataclasses import dataclass, field
40from pathlib import Path
# Severity ranking, kept local so this module does not require the rest of the
# package at import time (the orchestrator is only imported lazily for live mode).
# Ordered from most to least severe; a label's tuple index is its rank.
SEVERITIES: tuple[str, ...] = ("critical", "major", "minor", "nit", "info")
# Severity label -> index in SEVERITIES (0 == "critical"), so a smaller rank
# means a more severe finding.
_SEVERITY_RANK: dict[str, int] = {sev: i for i, sev in enumerate(SEVERITIES)}

# Severities that count as "blocking" for the must_not_flag / max_blocking
# checks. Mirrors the default CI ``fail_on`` set (critical, major).
BLOCKING_SEVERITIES: frozenset[str] = frozenset({"critical", "major"})

# Default line tolerance for a positional match. A finding at line L matches an
# expected line E when ``abs(L - E) <= LINE_TOLERANCE``. Diffs shift line
# numbers slightly between reviewers, so an exact match is too strict.
LINE_TOLERANCE = 3

#: Directory holding the shipped fixtures (``benchmark/`` at the repo root).
BENCHMARK_DIR = Path(__file__).resolve().parents[2] / "benchmark"
60# ---------------------------------------------------------------------------
61# Data model
62# ---------------------------------------------------------------------------
@dataclass
class Fixture:
    """A parsed benchmark fixture: a diff plus its expected/recorded data.

    Instances are built by :func:`load_fixture` from the ``<id>.diff``,
    ``<id>.expected.json`` and (optional) ``<id>.findings.json`` files.
    """

    # Fixture identifier; also the filename stem shared by the fixture's files.
    id: str
    # Free-text description taken from the expected.json "description" key.
    description: str
    # Raw text of the fixture's diff file.
    diff: str
    # The "expect" object from expected.json with the fixture "id" merged in.
    expected: dict
    # Recorded findings from findings.json; empty when that file is absent.
    recorded: list[dict] = field(default_factory=list)
@dataclass
class FixtureScore:
    """The scored result of one fixture, as produced by :func:`score_fixture`."""

    # Fixture id copied from the expected spec ("" when the spec has none).
    id: str
    # True when every check in the expected spec was satisfied.
    passed: bool
    # Number of must_match entries satisfied by at least one finding.
    matched: int
    # Number of must_match entries that no finding satisfied.
    missed: int
    # must_not_flag hits plus any blocking findings beyond max_blocking.
    false_positives: int
    # Total number of must_match entries in the spec.
    expected_count: int
    # matched / (matched + false_positives); 1.0 when that denominator is 0.
    precision: float
    # matched / expected_count; 1.0 when there are no must_match entries.
    recall: float
    # Human-readable explanation for each failed check.
    reasons: list[str] = field(default_factory=list)
89# ---------------------------------------------------------------------------
90# Match rule
91# ---------------------------------------------------------------------------
def _severity_rank(sev: object) -> int:
    """Map a severity label to its rank, where 0 is the most severe.

    Labels are compared after trimming whitespace and lower-casing. Anything
    that is not a known severity string falls back to the lowest rank.
    """
    lowest = len(SEVERITIES) - 1
    if not isinstance(sev, str):
        return lowest
    return _SEVERITY_RANK.get(sev.strip().lower(), lowest)
def _is_blocking(finding: dict) -> bool:
    """Tell whether ``finding`` carries a blocking severity.

    A missing or non-string ``severity`` is never blocking; string labels are
    trimmed and lower-cased before the membership check.
    """
    label = finding.get("severity")
    if not isinstance(label, str):
        return False
    return label.strip().lower() in BLOCKING_SEVERITIES
def _line_matches(finding_line: object, expected_line: object, tol: int) -> bool:
    """Return True when a finding line is within +/- ``tol`` of the expected line.

    When the expected entry has no line, any finding line matches (the entry is
    file/keyword scoped). When the finding has no line but the expected entry
    does, it cannot positionally match.

    Both values are coerced with ``int()``, so numeric strings are accepted.
    Values that cannot be coerced (non-numeric text, NaN, +/-infinity, or
    unsupported types) never match rather than raising.
    """
    if expected_line is None:
        return True
    if finding_line is None:
        return False
    try:
        return abs(int(finding_line) - int(expected_line)) <= tol
    except (TypeError, ValueError, OverflowError):
        # int() raises OverflowError for +/-inf (which JSON input can carry as
        # ``Infinity``); treat it like any other unusable line number.
        return False
def _keywords_match(finding: dict, keywords: list) -> bool:
    """Return True when at least one keyword appears in claim or evidence.

    The comparison is case-insensitive substring matching against the
    finding's ``claim`` and ``evidence`` joined by a single space.

    An empty/absent keyword list is treated as "no keyword constraint" -> match.
    A ``claim`` or ``evidence`` that is missing or explicitly ``None`` counts
    as empty text; it must not be searched as the literal word "None".
    """
    if not keywords:
        return True
    fields = (finding.get("claim"), finding.get("evidence"))
    # str(None) would be "None" and spuriously satisfy keywords such as "none",
    # so null fields are mapped to empty text before building the haystack.
    haystack = " ".join("" if value is None else str(value) for value in fields).lower()
    return any(str(kw).lower() in haystack for kw in keywords)
def finding_matches_expected(finding: dict, entry: dict, tol: int = LINE_TOLERANCE) -> bool:
    """Decide whether ``finding`` satisfies one expectation ``entry``.

    Every constraint the entry declares must hold; constraints are checked in
    the order below and evaluation stops at the first failure:

    * **file**: when the entry has a ``file``, the finding's ``file`` must be
      the same string (exact comparison). No ``file`` means unconstrained.
    * **line**: the finding's ``line`` must lie within ``+/- tol`` of the
      entry's ``line`` (default :data:`LINE_TOLERANCE`). No ``line`` on the
      entry means the position is unconstrained.
    * **severity**: the finding must be at least as severe as the entry's
      ``severity`` (an expected ``major`` accepts ``major`` or ``critical``).
      No ``severity`` means unconstrained.
    * **keywords**: at least one of the entry's ``keywords`` must occur
      (case-insensitive) in the finding's ``claim`` or ``evidence``. An empty
      list imposes no keyword constraint.
    """
    wanted_file = entry.get("file")
    wanted_severity = entry.get("severity")
    return (
        (wanted_file is None or str(finding.get("file", "")) == str(wanted_file))
        and _line_matches(finding.get("line"), entry.get("line"), tol)
        and (
            wanted_severity is None
            # A lower rank is more severe, so "at least as severe" is <=.
            or _severity_rank(finding.get("severity")) <= _severity_rank(wanted_severity)
        )
        and _keywords_match(finding, entry.get("keywords", []))
    )
161# ---------------------------------------------------------------------------
162# Scoring
163# ---------------------------------------------------------------------------
def score_fixture(
    findings: list[dict], expected: dict, tol: int = LINE_TOLERANCE
) -> FixtureScore:
    """Score a list of finding dicts against a fixture's ``expected`` spec.

    The ``expected`` spec is the ``"expect"`` object documented in the fixture
    schema and supports these (all optional) keys:

    * ``must_match`` (list of entries): each entry SHOULD be matched by at least
      one finding (see :func:`finding_matches_expected`). An unmatched entry is a
      *missed* finding.
    * ``must_not_flag`` (list of entries): findings matching any of these entries
      are *false positives* (the diff is correct here; flagging it is wrong).
      Each offending finding is counted once, even when it matches several
      ``must_not_flag`` entries.
    * ``min_findings`` (int): the run must produce at least this many findings.
    * ``max_blocking`` (int): at most this many *blocking* (critical/major)
      findings are allowed. Used by false-positive-trap and docs-only fixtures
      to encode "no blocking finding" via ``{"max_blocking": 0}``. Each blocking
      finding beyond the limit adds one to the false-positive count.

    Pass/fail: a fixture passes when every ``must_match`` entry is matched, no
    ``must_not_flag`` entry is matched, ``min_findings`` is met, and the blocking
    count does not exceed ``max_blocking``.

    Precision/recall are computed over the ``must_match`` entries:

    * recall = matched / number_of_must_match_entries (1.0 when there are none)
    * precision = matched / (matched + false_positives) (1.0 when both are 0)

    These are directional indicators, not statistically rigorous metrics.

    Returns a :class:`FixtureScore` whose ``id`` is the spec's ``"id"`` (or
    ``""``) and whose ``reasons`` lists one message per failed check.
    """
    must_match = expected.get("must_match", []) or []
    must_not_flag = expected.get("must_not_flag", []) or []
    reasons: list[str] = []

    matched = 0
    for entry in must_match:
        if any(finding_matches_expected(f, entry, tol) for f in findings):
            matched += 1
        else:
            reasons.append(f"missed expected finding: {entry}")
    missed = len(must_match) - matched

    # Track offending findings by position so that one finding tripping several
    # must_not_flag entries is a single false positive; counting it once per
    # entry would inflate the count and deflate precision.
    flagged: set[int] = set()
    for entry in must_not_flag:
        hits = [
            index
            for index, f in enumerate(findings)
            if finding_matches_expected(f, entry, tol)
        ]
        if hits:
            flagged.update(hits)
            reasons.append(f"flagged must_not_flag entry: {entry}")
    false_positives = len(flagged)

    passed = matched == len(must_match) and false_positives == 0

    min_findings = expected.get("min_findings")
    if isinstance(min_findings, int) and len(findings) < min_findings:
        passed = False
        reasons.append(f"min_findings not met: got {len(findings)}, want >= {min_findings}")

    max_blocking = expected.get("max_blocking")
    if isinstance(max_blocking, int):
        blocking = sum(1 for f in findings if _is_blocking(f))
        if blocking > max_blocking:
            passed = False
            # Only the excess over the allowance counts against precision.
            false_positives += blocking - max_blocking
            reasons.append(
                f"too many blocking findings: got {blocking}, want <= {max_blocking}"
            )

    recall = matched / len(must_match) if must_match else 1.0
    denom = matched + false_positives
    precision = matched / denom if denom else 1.0

    return FixtureScore(
        id=expected.get("id", ""),
        passed=passed,
        matched=matched,
        missed=missed,
        false_positives=false_positives,
        expected_count=len(must_match),
        precision=precision,
        recall=recall,
        reasons=reasons,
    )
def aggregate(scores: list[FixtureScore]) -> dict:
    """Roll per-fixture scores up into a single summary dict.

    Counts are summed across fixtures; precision and recall are then recomputed
    from those pooled counts (not averaged per fixture), each defaulting to 1.0
    when its denominator is zero.
    """
    pass_count = 0
    pooled = {"matched": 0, "missed": 0, "false_positives": 0, "expected_count": 0}
    for score in scores:
        if score.passed:
            pass_count += 1
        for name in pooled:
            pooled[name] += getattr(score, name)

    fixture_count = len(scores)
    hits = pooled["matched"]
    hits_plus_fp = hits + pooled["false_positives"]
    return {
        "fixtures": fixture_count,
        "passed": pass_count,
        "failed": fixture_count - pass_count,
        "matched": hits,
        "missed": pooled["missed"],
        "false_positives": pooled["false_positives"],
        "expected_count": pooled["expected_count"],
        "precision": hits / hits_plus_fp if hits_plus_fp else 1.0,
        "recall": hits / pooled["expected_count"] if pooled["expected_count"] else 1.0,
    }
270# ---------------------------------------------------------------------------
271# Fixture loading
272# ---------------------------------------------------------------------------
def load_fixture(fixture_id: str, base_dir: Path | None = None) -> Fixture:
    """Read one fixture's files from disk and assemble a :class:`Fixture`.

    Looks in ``base_dir`` (or :data:`BENCHMARK_DIR` when omitted) for
    ``<fixture_id>.diff`` and ``<fixture_id>.expected.json``, which must exist,
    and ``<fixture_id>.findings.json``, which is optional; without it the
    fixture has no recorded findings. All files are read as UTF-8.

    The returned fixture's ``expected`` is the spec's ``"expect"`` object with
    an ``"id"`` key merged in (the spec's own ``"id"``, else ``fixture_id``).
    """
    root = base_dir or BENCHMARK_DIR

    diff_text = (root / f"{fixture_id}.diff").read_text(encoding="utf-8")

    spec = json.loads((root / f"{fixture_id}.expected.json").read_text(encoding="utf-8"))
    spec.setdefault("id", fixture_id)

    recorded_file = root / f"{fixture_id}.findings.json"
    recorded: list[dict] = (
        json.loads(recorded_file.read_text(encoding="utf-8"))
        if recorded_file.exists()
        else []
    )

    return Fixture(
        id=fixture_id,
        description=str(spec.get("description", "")),
        diff=diff_text,
        expected=spec.get("expect", {}) | {"id": spec.get("id", fixture_id)},
        recorded=recorded,
    )
def discover_fixture_ids(base_dir: Path | None = None) -> list[str]:
    """List fixture ids found in ``base_dir``, in sorted order.

    A fixture is any file named ``<id>.expected.json``; the id is the filename
    with that suffix removed. Defaults to :data:`BENCHMARK_DIR`, and yields an
    empty list when the directory does not exist.
    """
    suffix = ".expected.json"
    root = base_dir or BENCHMARK_DIR
    if not root.exists():
        return []
    ids = [path.name[: -len(suffix)] for path in root.glob(f"*{suffix}")]
    ids.sort()
    return ids
def load_fixtures(base_dir: Path | None = None) -> list[Fixture]:
    """Load all discovered fixtures in id order, so results are deterministic."""
    loaded: list[Fixture] = []
    for fixture_id in discover_fixture_ids(base_dir):
        loaded.append(load_fixture(fixture_id, base_dir))
    return loaded
312# ---------------------------------------------------------------------------
313# Runners
314# ---------------------------------------------------------------------------
def run_offline(base_dir: Path | None = None) -> tuple[list[FixtureScore], dict]:
    """Score every fixture's *recorded* findings against its expected spec.

    Uses only the files on disk, with no live CLIs involved, so the result is
    deterministic. Returns ``(scores, aggregate)``.
    """
    scores: list[FixtureScore] = []
    for fixture in load_fixtures(base_dir):
        scores.append(score_fixture(fixture.recorded, fixture.expected))
    return scores, aggregate(scores)
def live_enabled() -> bool:
    """Report whether live mode was opted into via ``JURY_BENCH_LIVE=1``.

    Only the exact value ``"1"`` enables it; unset or any other value does not.
    """
    flag = os.getenv("JURY_BENCH_LIVE")
    return flag == "1"
def run_live(base_dir: Path | None = None) -> tuple[list[FixtureScore], dict]:
    """Run the real jury on each fixture diff and score what it reports.

    Requires ``JURY_BENCH_LIVE=1``; otherwise raises :class:`RuntimeError`.
    The config and orchestrator modules are imported inside the function so
    that importing this module, and the offline path, never load the live
    machinery. This invokes real agent CLIs and is never run in CI.

    Returns ``(scores, aggregate)`` in fixture-id order.
    """
    if not live_enabled():
        raise RuntimeError(
            "live benchmark is disabled; set JURY_BENCH_LIVE=1 to enable it"
        )

    # Deferred imports: only reached once live mode is confirmed.
    from .config import DEFAULT_CONFIG, _from_dict
    from .orchestrator import run_jury

    config = _from_dict(DEFAULT_CONFIG)
    scores: list[FixtureScore] = []
    for fixture in load_fixtures(base_dir):
        outcome = run_jury(config, fixture.diff, mock=False)
        live_findings = list(map(_finding_to_dict, outcome.findings))
        scores.append(score_fixture(live_findings, fixture.expected))
    return scores, aggregate(scores)
def _finding_to_dict(finding) -> dict:
    """Project a finding object onto the plain dict shape the scorer reads.

    Only ``severity``, ``file``, ``line``, ``claim`` and ``evidence`` are
    copied; an attribute the object lacks gets the default listed below.
    """
    defaults = (
        ("severity", "info"),
        ("file", ""),
        ("line", None),
        ("claim", ""),
        ("evidence", ""),
    )
    return {name: getattr(finding, name, fallback) for name, fallback in defaults}
366# ---------------------------------------------------------------------------
367# Reporting
368# ---------------------------------------------------------------------------
def format_table(scores: list[FixtureScore], summary: dict) -> str:
    """Build the plain-text score table: one row per fixture plus a TOTAL row.

    Columns are left-aligned at fixed widths (fixture 24, pass 5, match 6,
    miss 5, fp 4, prec 5, recall 6) and separated by single spaces; precision
    and recall are shown to two decimals. Dashed rules the width of the header
    separate the header, the fixture rows and the TOTAL row. ``summary`` is the
    dict produced by :func:`aggregate`.
    """

    def _row(label, status, matched, missed, fps, precision, recall) -> str:
        # Shared layout for fixture rows and the TOTAL row.
        return (
            f"{label:<24} {status:<5} {matched:<6} {missed:<5} {fps:<4} "
            f"{precision:<5.2f} {recall:<6.2f}"
        )

    columns = (
        ("fixture", 24),
        ("pass", 5),
        ("match", 6),
        ("miss", 5),
        ("fp", 4),
        ("prec", 5),
        ("recall", 6),
    )
    header = " ".join(title.ljust(width) for title, width in columns)
    rule = "-" * len(header)

    fixture_rows = [
        _row(
            s.id,
            "yes" if s.passed else "NO",
            s.matched,
            s.missed,
            s.false_positives,
            s.precision,
            s.recall,
        )
        for s in scores
    ]
    total_row = _row(
        "TOTAL",
        f"{summary['passed']}/{summary['fixtures']}",
        summary["matched"],
        summary["missed"],
        summary["false_positives"],
        summary["precision"],
        summary["recall"],
    )
    return "\n".join([header, rule, *fixture_rows, rule, total_row])
def main() -> int:
    """Print the benchmark score table and return a process exit code.

    Runs the live benchmark when ``JURY_BENCH_LIVE=1`` and the offline/recorded
    benchmark otherwise. Returns 0 when every fixture passed and 1 when any
    failed, so the command can be used as a check.
    """
    live = live_enabled()
    if live:
        mode = "live (JURY_BENCH_LIVE=1)"
    else:
        mode = "offline/recorded"

    print(f"ai-jury benchmark — mode: {mode}")
    print(
        "NOTE: small and directional, not a universal quality claim. "
        "Offline mode validates the scorer + recorded baselines, not live quality.\n"
    )

    runner = run_live if live else run_offline
    scores, summary = runner()
    print(format_table(scores, summary))

    # A non-zero status signals that at least one fixture failed.
    return 1 if summary["failed"] != 0 else 0
# Script entry point: exit the process with main()'s return value as its status.
if __name__ == "__main__":
    raise SystemExit(main())