Coverage for src/ai_jury/benchmark.py: 99%

"""Offline benchmark for jury review quality (issue #12).

Measures, in a small and *directional* way, whether a jury's structured
findings line up with hand-authored expectations for a set of fixture diffs.

Design (honesty matters here)
-----------------------------
The :class:`~ai_jury.adapters.MockAdapter` emits a *fixed* canned
finding regardless of the diff it is given, so ``--mock`` output does **not**
reflect a fixture's content. Running ``--mock`` per fixture and scoring it would
be fake signal. We therefore separate two concerns:

* A **scorer** (:func:`score_fixture`) that compares an arbitrary list of
  finding dicts against a fixture's ``expected`` spec. This is pure and
  deterministic.
* A **finding source**:
  - *Offline / CI default ("recorded" mode):* every fixture ships a
    hand-authored ``<id>.expected.json`` AND a recorded ``<id>.findings.json``
    (a realistic sample of what a jury produced for that diff). The offline
    benchmark scores recorded -> expected. It validates the SCORER + FIXTURES
    and provides recorded baselines. It runs with no live CLIs and no network.
  - *Optional live mode:* gated behind ``JURY_BENCH_LIVE=1``. It runs the
    real jury (``run_jury(..., mock=False)``) per fixture diff and scores
    the live findings. OFF by default; never in CI.

What this benchmark does and does NOT claim
-------------------------------------------
It is small and directional. The offline run does not measure live review
quality; it validates the scorer and the recorded baselines. True quality
measurement requires live mode against real agent CLIs. See ``benchmark/README.md``.

This module is stdlib-only and has no import-time dependency on the live
adapters; ``run_live`` imports the orchestrator lazily.
"""

36from __future__ import annotations

38import json

39import os

40from dataclasses import dataclass, field

41from pathlib import Path

# Known severities, ordered from most severe to least severe. They are defined
# here rather than imported so that loading this module never requires the rest
# of the package (the orchestrator is imported lazily, and only for live mode).
SEVERITIES: tuple[str, ...] = ("critical", "major", "minor", "nit", "info")
# Index of each severity within SEVERITIES; a lower rank means more severe.
_SEVERITY_RANK: dict[str, int] = {name: rank for rank, name in enumerate(SEVERITIES)}

# Severities counted as "blocking" by the ``max_blocking`` check. Intended to
# mirror the default CI ``fail_on`` set (critical, major).
BLOCKING_SEVERITIES: frozenset[str] = frozenset(("critical", "major"))

# Default slack for a positional match: a finding at line L matches an expected
# line E when ``abs(L - E) <= LINE_TOLERANCE``. Reviewers report slightly
# shifted line numbers for the same diff, so exact equality is too strict.
LINE_TOLERANCE = 3

#: Directory holding the shipped fixtures (``benchmark/`` at the repo root).
BENCHMARK_DIR = Path(__file__).resolve().parents[2] / "benchmark"

# ---------------------------------------------------------------------------
# Data model
# ---------------------------------------------------------------------------

@dataclass
class Fixture:
    """One benchmark fixture: a diff bundled with its expected and recorded data."""

    # Identifier of the fixture.
    id: str
    # Human-readable description of the fixture.
    description: str
    # Text of the diff under review.
    diff: str
    # Expectation spec that findings are scored against.
    expected: dict
    # Recorded findings for this diff; a fresh empty list when none are given.
    recorded: list[dict] = field(default_factory=list)

@dataclass
class FixtureScore:
    """Outcome of scoring a single fixture."""

    # Identifier of the scored fixture.
    id: str
    # Whether the fixture passed overall.
    passed: bool
    # Number of expected entries that were matched.
    matched: int
    # Number of expected entries that were not matched.
    missed: int
    # Number of false positives counted.
    false_positives: int
    # Total number of expected entries.
    expected_count: int
    # Directional precision and recall for this fixture.
    precision: float
    recall: float
    # Human-readable explanations for each failed check; empty by default.
    reasons: list[str] = field(default_factory=list)

# ---------------------------------------------------------------------------
# Match rule
# ---------------------------------------------------------------------------

def _severity_rank(sev: object) -> int:
    """Return the position of ``sev`` in ``SEVERITIES`` (0 is the most severe).

    Matching ignores case and surrounding whitespace. Anything that is not a
    string, or is not a known severity, gets the rank of the last (least
    severe) entry.
    """
    least_severe = len(SEVERITIES) - 1
    if not isinstance(sev, str):
        return least_severe
    return _SEVERITY_RANK.get(sev.strip().lower(), least_severe)

def _is_blocking(finding: dict) -> bool:
    """Return True when the finding's severity is one of ``BLOCKING_SEVERITIES``.

    The severity is compared case-insensitively with surrounding whitespace
    removed; a missing or non-string severity is never blocking.
    """
    severity = finding.get("severity")
    if not isinstance(severity, str):
        return False
    return severity.strip().lower() in BLOCKING_SEVERITIES

102

103

def _line_matches(finding_line: object, expected_line: object, tol: int) -> bool:
    """Return True when ``finding_line`` is within +/- ``tol`` of ``expected_line``.

    When the expected entry has no line, any finding line matches (the entry is
    file/keyword scoped). When the finding has no line but the expected entry
    does, it cannot positionally match. Values that cannot be converted to an
    integer (non-numeric strings, NaN, infinities, unsupported types) never
    match.

    Args:
        finding_line: line reported by the finding, or ``None``.
        expected_line: line required by the expected entry, or ``None``.
        tol: maximum allowed absolute distance between the two lines.

    Returns:
        ``True`` on a positional match (or when unconstrained), else ``False``.
    """
    if expected_line is None:
        return True
    if finding_line is None:
        return False
    try:
        return abs(int(finding_line) - int(expected_line)) <= tol
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers int(float("inf")); without it an infinite line
        # value would crash the scorer while NaN (ValueError) merely fails to
        # match.
        return False

119

120

def _keywords_match(finding: dict, keywords: list) -> bool:
    """Return True when at least one keyword occurs in the claim or evidence.

    The comparison is a case-insensitive substring test against the finding's
    ``claim`` and ``evidence`` joined by a space. An empty/absent keyword list
    is treated as "no keyword constraint" and always matches.

    Args:
        finding: finding dict; ``claim`` / ``evidence`` may be missing or ``None``.
        keywords: keywords to look for. A bare string is accepted and treated
            as a single keyword.

    Returns:
        ``True`` when unconstrained or when any keyword is found.
    """
    if not keywords:
        return True
    if isinstance(keywords, str):
        # Iterating a bare string would test each character separately and
        # match almost anything; treat it as one keyword instead.
        keywords = [keywords]
    # A None field must contribute no text: str(None) would inject the word
    # "None" and let the keyword "none" match spuriously.
    fields = (finding.get("claim"), finding.get("evidence"))
    haystack = " ".join("" if value is None else str(value) for value in fields).lower()
    return any(str(kw).lower() in haystack for kw in keywords)

130

131

def finding_matches_expected(finding: dict, entry: dict, tol: int = LINE_TOLERANCE) -> bool:
    """Decide whether ``finding`` satisfies one expectation ``entry``.

    Every constraint the entry carries must hold; a constraint the entry omits
    is simply not applied:

    * **file**: the finding's ``file`` equals the entry's ``file`` (compared as
      exact strings).
    * **line**: the finding's ``line`` lies within ``+/- tol`` of the entry's
      ``line`` (``tol`` defaults to :data:`LINE_TOLERANCE`).
    * **severity**: the finding is at least as severe as the entry's
      ``severity`` (an expected ``major`` accepts ``major`` or ``critical``).
    * **keywords**: at least one of the entry's ``keywords`` occurs,
      case-insensitively, in the finding's ``claim`` or ``evidence``; an empty
      list imposes no constraint.
    """
    wanted_file = entry.get("file")
    file_ok = wanted_file is None or str(finding.get("file", "")) == str(wanted_file)
    if not file_ok:
        return False

    if not _line_matches(finding.get("line"), entry.get("line"), tol):
        return False

    wanted_severity = entry.get("severity")
    if wanted_severity is not None:
        # A larger rank means less severe, i.e. the finding is too weak.
        if _severity_rank(finding.get("severity")) > _severity_rank(wanted_severity):
            return False

    return _keywords_match(finding, entry.get("keywords", []))

158

159

# ---------------------------------------------------------------------------
# Scoring
# ---------------------------------------------------------------------------

def score_fixture(findings: list[dict], expected: dict, tol: int = LINE_TOLERANCE) -> FixtureScore:
    """Compare ``findings`` with a fixture's ``expected`` spec and score them.

    ``expected`` is the ``"expect"`` object of the fixture schema. All of its
    keys are optional:

    * ``must_match`` (list of entries): each entry should be satisfied by at
      least one finding (see :func:`finding_matches_expected`); an entry no
      finding satisfies counts as *missed*.
    * ``must_not_flag`` (list of entries): every finding that satisfies one of
      these entries counts as a *false positive*.
    * ``min_findings`` (int): lower bound on the total number of findings.
    * ``max_blocking`` (int): upper bound on the number of *blocking*
      (critical/major) findings; ``{"max_blocking": 0}`` encodes "no blocking
      finding allowed".

    The fixture passes only when nothing is missed, nothing forbidden is
    flagged, ``min_findings`` is met and the blocking count stays within
    ``max_blocking``. Each violated check adds a message to ``reasons``.

    Metrics (directional indicators, not statistically rigorous):

    * recall = matched / number of ``must_match`` entries (1.0 when there are none)
    * precision = matched / (matched + false_positives) (1.0 when both are 0)

    ``tol`` is the line tolerance forwarded to the match rule.
    """
    wanted = expected.get("must_match", []) or []
    forbidden = expected.get("must_not_flag", []) or []
    reasons: list[str] = []

    # Count the expected entries that at least one finding satisfies.
    matched = 0
    for entry in wanted:
        if not any(finding_matches_expected(f, entry, tol) for f in findings):
            reasons.append(f"missed expected finding: {entry}")
            continue
        matched += 1
    missed = len(wanted) - matched

    # NOTE(review): hits are counted per entry, and the blocking excess below is
    # added on top, so one finding can contribute to false_positives more than
    # once.
    false_positives = 0
    for entry in forbidden:
        hit_count = sum(1 for f in findings if finding_matches_expected(f, entry, tol))
        if hit_count:
            false_positives += hit_count
            reasons.append(f"flagged must_not_flag entry: {entry}")

    # The base verdict is taken before the count-based checks can lower it.
    passed = missed == 0 and false_positives == 0

    findings_floor = expected.get("min_findings")
    if isinstance(findings_floor, int) and len(findings) < findings_floor:
        passed = False
        reasons.append(f"min_findings not met: got {len(findings)}, want >= {findings_floor}")

    blocking_cap = expected.get("max_blocking")
    if isinstance(blocking_cap, int):
        blocking = len([f for f in findings if _is_blocking(f)])
        excess = blocking - blocking_cap
        if excess > 0:
            passed = False
            # Each blocking finding above the cap counts as a false positive.
            false_positives += excess
            reasons.append(f"too many blocking findings: got {blocking}, want <= {blocking_cap}")

    recall = matched / len(wanted) if wanted else 1.0
    scored = matched + false_positives
    precision = matched / scored if scored else 1.0

    return FixtureScore(
        id=expected.get("id", ""),
        passed=passed,
        matched=matched,
        missed=missed,
        false_positives=false_positives,
        expected_count=len(wanted),
        precision=precision,
        recall=recall,
        reasons=reasons,
    )

239

240

def aggregate(scores: list[FixtureScore]) -> dict:
    """Roll per-fixture scores up into a single summary dict.

    Counts are summed across fixtures. Precision and recall are recomputed from
    the summed counts (not averaged per fixture) and default to 1.0 when their
    denominator is zero.
    """
    fixture_count = len(scores)
    pass_count = sum(1 for score in scores if score.passed)
    matched_total = sum(score.matched for score in scores)
    missed_total = sum(score.missed for score in scores)
    fp_total = sum(score.false_positives for score in scores)
    expected_total = sum(score.expected_count for score in scores)

    recall = matched_total / expected_total if expected_total else 1.0
    flagged_total = matched_total + fp_total
    precision = matched_total / flagged_total if flagged_total else 1.0

    return {
        "fixtures": fixture_count,
        "passed": pass_count,
        "failed": fixture_count - pass_count,
        "matched": matched_total,
        "missed": missed_total,
        "false_positives": fp_total,
        "expected_count": expected_total,
        "precision": precision,
        "recall": recall,
    }

270

271

# ---------------------------------------------------------------------------
# Fixture loading
# ---------------------------------------------------------------------------

def load_fixture(fixture_id: str, base_dir: Path | None = None) -> Fixture:
    """Load a single fixture (diff + expected.json + findings.json) by id.

    Args:
        fixture_id: file stem shared by ``<id>.diff``, ``<id>.expected.json``
            and the optional ``<id>.findings.json``.
        base_dir: directory containing the fixture files; defaults to
            :data:`BENCHMARK_DIR`.

    Returns:
        A :class:`Fixture` whose ``expected`` is the file's ``"expect"`` object
        with the fixture id added under ``"id"``, and whose ``recorded`` is the
        content of the findings file (empty when the file is absent).

    Raises:
        OSError: the diff or expected file cannot be read.
        json.JSONDecodeError: a JSON file is malformed.
    """
    base = base_dir or BENCHMARK_DIR
    diff_path = base / f"{fixture_id}.diff"
    expected_path = base / f"{fixture_id}.expected.json"
    findings_path = base / f"{fixture_id}.findings.json"

    diff = diff_path.read_text(encoding="utf-8")
    expected = json.loads(expected_path.read_text(encoding="utf-8"))
    expected.setdefault("id", fixture_id)

    recorded: list[dict] = []
    if findings_path.exists():
        loaded = json.loads(findings_path.read_text(encoding="utf-8"))
        # A findings file holding JSON ``null`` means "nothing recorded"; keep
        # ``recorded`` a list so the scorer can iterate and len() it.
        recorded = [] if loaded is None else loaded

    # ``"expect": null`` is treated like a missing key (an empty spec) instead
    # of failing on ``None | dict``.
    expect = expected.get("expect")
    if expect is None:
        expect = {}

    return Fixture(
        id=fixture_id,
        description=str(expected.get("description", "")),
        diff=diff,
        # The id on the right-hand side overrides any "id" inside ``expect``.
        expected=expect | {"id": expected.get("id", fixture_id)},
        recorded=recorded,
    )

296

297

def discover_fixture_ids(base_dir: Path | None = None) -> list[str]:
    """List, in sorted order, the ids of all fixtures found in the directory.

    A fixture is any file named ``<id>.expected.json``. ``base_dir`` defaults
    to ``BENCHMARK_DIR``; a directory that does not exist yields an empty list.
    """
    suffix = ".expected.json"
    root = base_dir or BENCHMARK_DIR
    if not root.exists():
        return []
    # Strip the suffix from each file name to recover the fixture id.
    ids = [path.name[: -len(suffix)] for path in root.glob("*" + suffix)]
    ids.sort()
    return ids

304

305

def load_fixtures(base_dir: Path | None = None) -> list[Fixture]:
    """Load all fixtures discovered in ``base_dir``, in sorted-id order."""
    fixtures: list[Fixture] = []
    for fixture_id in discover_fixture_ids(base_dir):
        fixtures.append(load_fixture(fixture_id, base_dir))
    return fixtures

309

310

# ---------------------------------------------------------------------------
# Runners
# ---------------------------------------------------------------------------

def run_offline(base_dir: Path | None = None) -> tuple[list[FixtureScore], dict]:
    """Score every fixture's *recorded* findings against its expected spec.

    Works purely from the fixture files, so no live CLI is involved. Returns
    ``(scores, aggregate)``.
    """
    scores: list[FixtureScore] = []
    for fixture in load_fixtures(base_dir):
        scores.append(score_fixture(fixture.recorded, fixture.expected))
    summary = aggregate(scores)
    return scores, summary

322

323

def live_enabled() -> bool:
    """Report whether live mode was requested via ``JURY_BENCH_LIVE=1``.

    Only the exact value ``"1"`` enables it; unset or any other value does not.
    """
    flag = os.getenv("JURY_BENCH_LIVE")
    return flag == "1"

327

328

def run_live(base_dir: Path | None = None) -> tuple[list[FixtureScore], dict]:
    """Run the real jury on each fixture diff and score what it reports.

    Requires ``JURY_BENCH_LIVE=1`` and raises ``RuntimeError`` otherwise. The
    config and orchestrator modules are imported inside the function so the
    offline path never loads the live machinery. This invokes real agent CLIs
    and is not meant to run in CI. Returns ``(scores, aggregate)``.
    """
    if not live_enabled():
        raise RuntimeError("live benchmark is disabled; set JURY_BENCH_LIVE=1 to enable it")

    # Deferred imports keep module import (and offline runs) free of live
    # dependencies.
    from .config import DEFAULT_CONFIG, _from_dict
    from .orchestrator import run_jury

    config = _from_dict(DEFAULT_CONFIG)
    scores: list[FixtureScore] = []
    for fixture in load_fixtures(base_dir):
        outcome = run_jury(config, fixture.diff, mock=False)
        # The scorer consumes plain dicts, not Finding objects.
        live_findings = list(map(_finding_to_dict, outcome.findings))
        scores.append(score_fixture(live_findings, fixture.expected))
    return scores, aggregate(scores)

350

351

def _finding_to_dict(finding) -> dict:
    """Copy the scorer-relevant attributes of ``finding`` into a plain dict.

    Each missing attribute falls back to its default: ``"info"`` for
    ``severity``, ``None`` for ``line`` and an empty string for the rest.
    """
    defaults = (
        ("severity", "info"),
        ("file", ""),
        ("line", None),
        ("claim", ""),
        ("evidence", ""),
    )
    return {name: getattr(finding, name, fallback) for name, fallback in defaults}

361

362

# ---------------------------------------------------------------------------
# Reporting
# ---------------------------------------------------------------------------

def format_table(scores: list[FixtureScore], summary: dict) -> str:
    """Build the plain-text score table: one row per fixture plus a TOTAL row.

    Columns are left-aligned to fixed widths. The result has a header, a dashed
    rule, the fixture rows, another rule and the totals taken from ``summary``,
    joined by newlines with no trailing newline.
    """
    header = " ".join(
        (
            f"{'fixture':<24}",
            f"{'pass':<5}",
            f"{'match':<6}",
            f"{'miss':<5}",
            f"{'fp':<4}",
            f"{'prec':<5}",
            f"{'recall':<6}",
        )
    )
    rule = "-" * len(header)

    rows = [header, rule]
    for score in scores:
        verdict = "yes" if score.passed else "NO"
        rows.append(
            f"{score.id:<24} {verdict:<5} {score.matched:<6} {score.missed:<5} "
            f"{score.false_positives:<4} {score.precision:<5.2f} {score.recall:<6.2f}"
        )
    rows.append(rule)

    # The "pass" column of the totals row shows passed/total fixtures.
    pass_ratio = f"{summary['passed']}/{summary['fixtures']}"
    rows.append(
        f"{'TOTAL':<24} {pass_ratio:<5} {summary['matched']:<6} {summary['missed']:<5} "
        f"{summary['false_positives']:<4} {summary['precision']:<5.2f} {summary['recall']:<6.2f}"
    )
    return "\n".join(rows)

386

387

def main() -> int:
    """Print the benchmark score table and return a process exit code.

    Live mode runs when ``live_enabled()`` is true; otherwise the offline,
    recorded run is used. Returns 0 when no fixture failed and 1 otherwise, so
    the command can serve as a check.
    """
    live = live_enabled()
    if live:
        mode = "live (JURY_BENCH_LIVE=1)"
        runner = run_live
    else:
        mode = "offline/recorded"
        runner = run_offline

    print(f"ai-jury benchmark — mode: {mode}")
    print(
        "NOTE: small and directional, not a universal quality claim. "
        "Offline mode validates the scorer + recorded baselines, not live quality.\n"
    )

    scores, summary = runner()
    print(format_table(scores, summary))
    return 1 if summary["failed"] != 0 else 0

401

402

# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    exit_code = main()
    raise SystemExit(exit_code)