Coverage for src/ai_jury/benchmark.py: 100%

160 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-05 20:29 +0000

1"""Offline benchmark for jury review quality (issue #12). 

2 

3Measures, in a small and *directional* way, whether a jury's structured 

4findings line up with hand-authored expectations for a set of fixture diffs. 

5 

6Design (honesty matters here) 

7----------------------------- 

8The :class:`~ai_jury.adapters.MockAdapter` emits a *fixed* canned 

9finding regardless of the diff it is given, so ``--mock`` output does **not** 

10reflect a fixture's content. Running ``--mock`` per fixture and scoring it would 

11be fake signal. We therefore separate two concerns: 

12 

13* A **scorer** (:func:`score_fixture`) that compares an arbitrary list of 

14 finding dicts against a fixture's ``expected`` spec. This is pure and 

15 deterministic. 

16* A **finding source**: 

17 - *Offline / CI default ("recorded" mode):* every fixture ships a 

18 hand-authored ``<id>.expected.json`` AND a recorded ``<id>.findings.json`` 

19 (a realistic sample of what a jury produced for that diff). The offline 

20 benchmark scores recorded -> expected. It validates the SCORER + FIXTURES 

21 and provides recorded baselines. It runs with no live CLIs and no network. 

22 - *Optional live mode:* gated behind ``JURY_BENCH_LIVE=1``. It runs the 

23 real jury (``run_jury(..., mock=False)``) per fixture diff and scores 

24 the live findings. OFF by default; never in CI. 

25 

26What this benchmark does and does NOT claim 

27------------------------------------------- 

28It is small and directional. The offline run does not measure live review 

29quality; it validates the scorer and the recorded baselines. True quality 

30measurement requires live mode against real agent CLIs. See ``benchmark/README.md``. 

31 

32This module is stdlib-only and has no import-time dependency on the live 

33adapters; ``run_live`` imports the orchestrator lazily. 

34""" 

35from __future__ import annotations 

36 

37import json 

38import os 

39from dataclasses import dataclass, field 

40from pathlib import Path 

41 

# Severity names ordered from most to least severe. Declared locally so that
# importing this module does not pull in the rest of the package (the
# orchestrator is only imported lazily for live mode).
SEVERITIES: tuple[str, ...] = ("critical", "major", "minor", "nit", "info")
# Severity name -> index in SEVERITIES (0 is the most severe).
_SEVERITY_RANK: dict[str, int] = dict(zip(SEVERITIES, range(len(SEVERITIES))))

# Severities that count as "blocking" when findings are checked against a
# fixture's blocking limit. Mirrors the default CI ``fail_on`` set
# (critical, major).
BLOCKING_SEVERITIES: frozenset[str] = frozenset(("critical", "major"))

# Positional slack for line matching: a finding at line L satisfies an expected
# line E when ``abs(L - E) <= LINE_TOLERANCE``. Reviewers report slightly
# shifted line numbers for the same diff, so exact equality is too strict.
LINE_TOLERANCE = 3

55 

#: Where the shipped fixtures live: the ``benchmark/`` folder at the repo root,
#: i.e. two directory levels above the directory that contains this module.
BENCHMARK_DIR = Path(__file__).resolve().parents[2].joinpath("benchmark")

58 

59 

60# --------------------------------------------------------------------------- 

61# Data model 

62# --------------------------------------------------------------------------- 

@dataclass
class Fixture:
    """One benchmark case: a diff bundled with its expected and recorded data."""

    # Identifier of the fixture.
    id: str
    # Human-readable summary of what the fixture exercises.
    description: str
    # Text of the diff under review.
    diff: str
    # Expectation spec that findings are scored against.
    expected: dict
    # Recorded finding dicts; every instance gets its own empty list by default.
    recorded: list[dict] = field(default_factory=list)

72 

73 

@dataclass
class FixtureScore:
    """Outcome of scoring a single fixture."""

    # Identifier of the scored fixture.
    id: str
    # True when every check of the fixture held.
    passed: bool
    # Number of expected entries that were matched.
    matched: int
    # Number of expected entries that were not matched.
    missed: int
    # Number of findings counted as false positives.
    false_positives: int
    # Total number of expected entries.
    expected_count: int
    # Precision and recall for this fixture.
    precision: float
    recall: float
    # Human-readable explanations; every instance gets its own empty list by default.
    reasons: list[str] = field(default_factory=list)

87 

88 

89# --------------------------------------------------------------------------- 

90# Match rule 

91# --------------------------------------------------------------------------- 

def _severity_rank(sev: object) -> int:
    """Return the index of ``sev`` in ``SEVERITIES`` (lower means more severe).

    Strings are stripped and lower-cased before the lookup. Unknown names and
    non-string values fall back to the last rank in ``SEVERITIES``.
    """
    fallback = len(SEVERITIES) - 1
    if not isinstance(sev, str):
        return fallback
    normalized = sev.strip().lower()
    return _SEVERITY_RANK.get(normalized, fallback)

96 

97 

def _is_blocking(finding: dict) -> bool:
    """Tell whether ``finding`` carries a blocking severity.

    The ``severity`` value must be a string; it is stripped and lower-cased
    before being looked up in ``BLOCKING_SEVERITIES``. A missing or non-string
    severity is never blocking.
    """
    severity = finding.get("severity")
    if not isinstance(severity, str):
        return False
    return severity.strip().lower() in BLOCKING_SEVERITIES

101 

102 

103def _line_matches(finding_line: object, expected_line: object, tol: int) -> bool: 

104 """A finding line matches the expected line when within +/- ``tol``. 

105 

106 When the expected entry has no line, any finding line matches (the entry is 

107 file/keyword scoped). When the finding has no line but the expected entry 

108 does, it cannot positionally match. 

109 """ 

110 if expected_line is None: 

111 return True 

112 if finding_line is None: 

113 return False 

114 try: 

115 return abs(int(finding_line) - int(expected_line)) <= tol 

116 except (TypeError, ValueError): 

117 return False 

118 

119 

120def _keywords_match(finding: dict, keywords: list) -> bool: 

121 """At least one keyword (case-insensitive) appears in claim or evidence. 

122 

123 An empty/absent keyword list is treated as "no keyword constraint" -> match. 

124 """ 

125 if not keywords: 

126 return True 

127 haystack = ( 

128 str(finding.get("claim", "")) + " " + str(finding.get("evidence", "")) 

129 ).lower() 

130 return any(str(kw).lower() in haystack for kw in keywords) 

131 

132 

def finding_matches_expected(finding: dict, entry: dict, tol: int = LINE_TOLERANCE) -> bool:
    """Decide whether ``finding`` satisfies the expectation ``entry``.

    Every constraint the entry declares has to hold:

    * **file** -- when the entry has a ``file``, the finding's ``file`` must be
      the same string (exact comparison); otherwise the file is unconstrained.
    * **line** -- when the entry has a ``line``, the finding's ``line`` must lie
      within ``+/- tol`` of it (``tol`` defaults to :data:`LINE_TOLERANCE`);
      otherwise the position is unconstrained.
    * **severity** -- when the entry has a ``severity``, the finding must be at
      least that severe (an expected ``major`` accepts ``major`` and
      ``critical``); otherwise the severity is unconstrained.
    * **keywords** -- when the entry lists ``keywords``, at least one must
      appear (case-insensitively) in the finding's ``claim`` or ``evidence``;
      an empty list adds no constraint.
    """
    wanted_file = entry.get("file")
    file_ok = wanted_file is None or str(finding.get("file", "")) == str(wanted_file)
    if not file_ok:
        return False

    if not _line_matches(finding.get("line"), entry.get("line"), tol):
        return False

    wanted_severity = entry.get("severity")
    if wanted_severity is not None:
        # A larger rank means a milder severity than the entry asks for.
        too_mild = _severity_rank(finding.get("severity")) > _severity_rank(wanted_severity)
        if too_mild:
            return False

    return _keywords_match(finding, entry.get("keywords", []))

159 

160 

161# --------------------------------------------------------------------------- 

162# Scoring 

163# --------------------------------------------------------------------------- 

def score_fixture(
    findings: list[dict], expected: dict, tol: int = LINE_TOLERANCE
) -> FixtureScore:
    """Compare ``findings`` with a fixture's ``expected`` spec and score the result.

    ``expected`` is the fixture's ``"expect"`` object. Every key is optional:

    * ``must_match`` -- entries that each need at least one matching finding
      (see :func:`finding_matches_expected`); an entry nobody matches counts as
      *missed*.
    * ``must_not_flag`` -- entries describing code that is fine as written; each
      finding that matches one is a *false positive* (a finding matching
      several entries is counted once per entry).
    * ``min_findings`` -- lower bound on the total number of findings.
    * ``max_blocking`` -- upper bound on blocking (critical/major) findings;
      every finding over the bound is added to the false-positive count.
      ``{"max_blocking": 0}`` expresses "no blocking finding allowed".

    The fixture passes only when nothing is missed, there are no
    ``must_not_flag`` hits, and both numeric bounds hold.

    Directional (not statistically rigorous) metrics:

    * recall = matched / number of ``must_match`` entries (1.0 with no entries)
    * precision = matched / (matched + false_positives) (1.0 when both are 0)

    Returns a :class:`FixtureScore` whose ``id`` is ``expected["id"]`` (empty
    string when absent) and whose ``reasons`` list explains each failed check.
    """
    wanted = expected.get("must_match", []) or []
    forbidden = expected.get("must_not_flag", []) or []
    reasons: list[str] = []

    # Expected entries that no finding satisfies.
    unmatched = [
        entry
        for entry in wanted
        if not any(finding_matches_expected(f, entry, tol) for f in findings)
    ]
    reasons.extend(f"missed expected finding: {entry}" for entry in unmatched)
    missed = len(unmatched)
    matched = len(wanted) - missed

    # Findings that hit an entry which must stay unflagged.
    false_positives = 0
    for entry in forbidden:
        hit_count = sum(1 for f in findings if finding_matches_expected(f, entry, tol))
        if hit_count:
            false_positives += hit_count
            reasons.append(f"flagged must_not_flag entry: {entry}")

    passed = not unmatched and false_positives == 0

    floor = expected.get("min_findings")
    if isinstance(floor, int) and len(findings) < floor:
        passed = False
        reasons.append(f"min_findings not met: got {len(findings)}, want >= {floor}")

    ceiling = expected.get("max_blocking")
    if isinstance(ceiling, int):
        blocking = len([f for f in findings if _is_blocking(f)])
        excess = blocking - ceiling
        if excess > 0:
            passed = False
            false_positives += excess
            reasons.append(
                f"too many blocking findings: got {blocking}, want <= {ceiling}"
            )

    recall = matched / len(wanted) if wanted else 1.0
    judged = matched + false_positives
    precision = matched / judged if judged else 1.0

    return FixtureScore(
        id=expected.get("id", ""),
        passed=passed,
        matched=matched,
        missed=missed,
        false_positives=false_positives,
        expected_count=len(wanted),
        precision=precision,
        recall=recall,
        reasons=reasons,
    )

244 

245 

def aggregate(scores: list[FixtureScore]) -> dict:
    """Roll per-fixture scores up into one summary dict.

    Counts are summed across ``scores``. Recall is ``matched / expected_count``
    and precision is ``matched / (matched + false_positives)``; either is 1.0
    when its denominator is zero.
    """

    def total_of(attr: str) -> int:
        return sum(getattr(score, attr) for score in scores)

    fixture_count = len(scores)
    pass_count = len([score for score in scores if score.passed])
    matched = total_of("matched")
    missed = total_of("missed")
    false_positives = total_of("false_positives")
    expected_count = total_of("expected_count")
    judged = matched + false_positives

    return {
        "fixtures": fixture_count,
        "passed": pass_count,
        "failed": fixture_count - pass_count,
        "matched": matched,
        "missed": missed,
        "false_positives": false_positives,
        "expected_count": expected_count,
        "precision": matched / judged if judged else 1.0,
        "recall": matched / expected_count if expected_count else 1.0,
    }

268 

269 

270# --------------------------------------------------------------------------- 

271# Fixture loading 

272# --------------------------------------------------------------------------- 

def load_fixture(fixture_id: str, base_dir: Path | None = None) -> Fixture:
    """Read one fixture from disk by id.

    Looks in ``base_dir`` (``BENCHMARK_DIR`` when not given) for
    ``<id>.diff`` and ``<id>.expected.json``, which must exist, and for
    ``<id>.findings.json``, which is optional (no file -> no recorded
    findings). The returned fixture's ``expected`` is the ``"expect"`` object
    of the expected file with an ``"id"`` key added; the id defaults to
    ``fixture_id`` when the file does not declare one.
    """
    root = base_dir or BENCHMARK_DIR

    diff_text = (root / f"{fixture_id}.diff").read_text(encoding="utf-8")

    spec = json.loads((root / f"{fixture_id}.expected.json").read_text(encoding="utf-8"))
    spec.setdefault("id", fixture_id)

    findings_file = root / f"{fixture_id}.findings.json"
    if findings_file.exists():
        recorded = json.loads(findings_file.read_text(encoding="utf-8"))
    else:
        recorded = []

    return Fixture(
        id=fixture_id,
        description=str(spec.get("description", "")),
        diff=diff_text,
        expected={**spec.get("expect", {}), "id": spec.get("id", fixture_id)},
        recorded=recorded,
    )

294 

295 

296def discover_fixture_ids(base_dir: Path | None = None) -> list[str]: 

297 """Return the sorted ids of all fixtures (those with an expected.json).""" 

298 base = base_dir or BENCHMARK_DIR 

299 if not base.exists(): 

300 return [] 

301 return sorted( 

302 p.name[: -len(".expected.json")] 

303 for p in base.glob("*.expected.json") 

304 ) 

305 

306 

def load_fixtures(base_dir: Path | None = None) -> list[Fixture]:
    """Load all fixtures found in ``base_dir``, ordered by id for determinism."""
    loaded: list[Fixture] = []
    for fixture_id in discover_fixture_ids(base_dir):
        loaded.append(load_fixture(fixture_id, base_dir))
    return loaded

310 

311 

312# --------------------------------------------------------------------------- 

313# Runners 

314# --------------------------------------------------------------------------- 

def run_offline(base_dir: Path | None = None) -> tuple[list[FixtureScore], dict]:
    """Score the *recorded* findings of every fixture against its expected spec.

    Works purely from the fixture files, so it is deterministic and needs no
    live CLIs. Returns ``(scores, aggregate)``.
    """
    scores: list[FixtureScore] = []
    for fixture in load_fixtures(base_dir):
        scores.append(score_fixture(fixture.recorded, fixture.expected))
    return scores, aggregate(scores)

323 

324 

def live_enabled() -> bool:
    """Report whether live mode was opted into (``JURY_BENCH_LIVE`` is exactly ``"1"``)."""
    flag = os.getenv("JURY_BENCH_LIVE")
    return flag == "1"

328 

329 

def run_live(base_dir: Path | None = None) -> tuple[list[FixtureScore], dict]:
    """Run the real jury on each fixture diff and score what it reports.

    Requires ``JURY_BENCH_LIVE=1`` and raises :class:`RuntimeError` otherwise.
    The config and orchestrator modules are imported inside the function so
    that the offline path never loads the live machinery. This invokes real
    agent CLIs and is never run in CI. Returns ``(scores, aggregate)``.
    """
    if not live_enabled():
        raise RuntimeError(
            "live benchmark is disabled; set JURY_BENCH_LIVE=1 to enable it"
        )
    # Deferred imports keep module import free of live dependencies.
    from .config import DEFAULT_CONFIG, _from_dict
    from .orchestrator import run_jury

    config = _from_dict(DEFAULT_CONFIG)

    def score_one(fixture: Fixture) -> FixtureScore:
        outcome = run_jury(config, fixture.diff, mock=False)
        live_findings = [_finding_to_dict(item) for item in outcome.findings]
        return score_fixture(live_findings, fixture.expected)

    scores = [score_one(fixture) for fixture in load_fixtures(base_dir)]
    return scores, aggregate(scores)

353 

354 

355def _finding_to_dict(finding) -> dict: 

356 """Convert a Finding dataclass into the plain dict the scorer consumes.""" 

357 return { 

358 "severity": getattr(finding, "severity", "info"), 

359 "file": getattr(finding, "file", ""), 

360 "line": getattr(finding, "line", None), 

361 "claim": getattr(finding, "claim", ""), 

362 "evidence": getattr(finding, "evidence", ""), 

363 } 

364 

365 

366# --------------------------------------------------------------------------- 

367# Reporting 

368# --------------------------------------------------------------------------- 

def format_table(scores: list[FixtureScore], summary: dict) -> str:
    """Build the plain-text score table: one row per fixture plus a TOTAL row.

    ``summary`` must provide the ``passed``, ``fixtures``, ``matched``,
    ``missed``, ``false_positives``, ``precision`` and ``recall`` keys.
    Columns are left-aligned with fixed widths and the two ratios are printed
    with two decimals.
    """
    header = "{:<24} {:<5} {:<6} {:<5} {:<4} {:<5} {:<6}".format(
        "fixture", "pass", "match", "miss", "fp", "prec", "recall"
    )
    rule = "-" * len(header)
    # Fixture rows and the TOTAL row share one layout.
    row = "{:<24} {:<5} {:<6} {:<5} {:<4} {:<5.2f} {:<6.2f}".format

    body = [
        row(
            s.id,
            "yes" if s.passed else "NO",
            s.matched,
            s.missed,
            s.false_positives,
            s.precision,
            s.recall,
        )
        for s in scores
    ]
    total = row(
        "TOTAL",
        f"{summary['passed']}/{summary['fixtures']}",
        summary["matched"],
        summary["missed"],
        summary["false_positives"],
        summary["precision"],
        summary["recall"],
    )
    return "\n".join([header, rule, *body, rule, total])

387 

388 

def main() -> int:
    """Print the benchmark score table and return a process exit code.

    Runs in live mode when :func:`live_enabled` says so, otherwise offline
    against the recorded findings. Returns 0 when no fixture failed, else 1.
    """
    if live_enabled():
        mode = "live (JURY_BENCH_LIVE=1)"
        runner = run_live
    else:
        mode = "offline/recorded"
        runner = run_offline

    print(f"ai-jury benchmark — mode: {mode}")
    print(
        "NOTE: small and directional, not a universal quality claim. "
        "Offline mode validates the scorer + recorded baselines, not live quality.\n"
    )

    scores, summary = runner()
    print(format_table(scores, summary))
    # A non-zero exit on any failed fixture lets the table double as a check.
    return int(summary["failed"] != 0)


if __name__ == "__main__":
    raise SystemExit(main())