Coverage for src/ai_jury/orchestrator.py: 99%

"""Jury orchestration: review -> debate -> synthesis.

The orchestrator owns the round structure and prompt assembly; adapters only run
their CLI. Rounds run agents concurrently (thread pool) because each call is an
independent, IO-bound subprocess.
"""

8from __future__ import annotations

10import random

11import string

12import time

13from concurrent.futures import ThreadPoolExecutor

14from dataclasses import dataclass, field, replace

16from . import convergence, injection, largediff, prompts

17from .adapters import RETRYABLE_ERROR_CODES, Adapter, AgentResult, make_adapter

18from .config import JuryConfig

19from .consensus import FindingGroup, demote_local_only_groups, group_findings

20from .findings import Finding, Verdict, parse_findings, parse_verdicts

21from .policy import ReviewPolicy, render_policy_section

22from .privilege import audit_privilege

23from .redaction import redact

class RunBudget:
    """Wall-clock allowance for a single jury run (issue #30).

    The clock starts when the instance is constructed. From the optional
    whole-run cap (``total``) and per-phase cap (``phase``) the budget derives
    the timeout handed to each individual agent call. A cap of ``None`` means
    "unlimited"; when both caps are unset ``call_timeout`` yields ``None`` so
    the adapter's own per-agent timeout is the only bound, exactly as if no
    budget existed.
    """

    def __init__(self, total_timeout: int | None, phase_timeout: int | None):
        self.phase = phase_timeout
        self.total = total_timeout
        # Monotonic clock: immune to wall-clock adjustments during the run.
        self._start = time.monotonic()

    def elapsed(self) -> float:
        """Seconds elapsed since this budget was created."""
        now = time.monotonic()
        return now - self._start

    def remaining(self) -> float | None:
        """Seconds left of the total cap (never negative), or ``None`` if uncapped."""
        if self.total is None:
            return None
        left = self.total - self.elapsed()
        return left if left > 0.0 else 0.0

    def expired(self) -> bool:
        """Whether a total cap exists and has been fully consumed."""
        if self.total is None:
            return False
        return self.elapsed() >= self.total

    def call_timeout(self) -> int | None:
        """Timeout for one agent call: the tighter of phase cap and time left.

        The adapter itself takes the minimum of this value and its configured
        per-agent timeout, so that limit is deliberately not considered here.

        Returns:
            ``None`` when neither cap applies; otherwise a whole number of
            seconds, truncated and floored at 1 (so even a spent budget still
            yields a 1-second timeout rather than 0).
        """
        limits: list[float] = [
            float(cap) for cap in (self.phase,) if cap is not None
        ]
        left = self.remaining()
        if left is not None:
            limits.append(left)
        if not limits:
            return None
        return max(1, int(min(limits)))

def _run_with_retry(
    adapter: Adapter,
    prompt: str,
    phase: str,
    budget: RunBudget,
    retries: int,
    log,
) -> AgentResult:
    """Execute one agent for one phase, re-running transient failures (issue #30).

    A failure is retried only when its typed ``error_code`` is a member of
    ``RETRYABLE_ERROR_CODES``; any other failure is handed back at once. At
    most ``retries`` extra attempts are made (always at least one attempt in
    total), and no further attempt is started once the run budget has expired,
    so a retry cannot begin after the total timeout.

    Args:
        adapter: The agent adapter to invoke.
        prompt: Fully assembled prompt text.
        phase: Phase label forwarded to the adapter and used in log lines.
        budget: Run budget supplying the per-call timeout and expiry check.
        retries: Number of additional attempts allowed after the first.
        log: Callable receiving one progress message per retry.

    Returns:
        The final attempt's result, with ``attempts`` set to the number of
        tries actually made.
    """
    attempt_limit = max(1, retries + 1)
    tries = 0
    while True:
        # The timeout is re-derived per attempt because the budget shrinks.
        outcome = adapter.run(prompt, phase=phase, timeout=budget.call_timeout())
        tries += 1
        transient = (not outcome.ok) and outcome.error_code in RETRYABLE_ERROR_CODES
        if not transient or tries >= attempt_limit or budget.expired():
            break
        log(f"{adapter.name}: {phase} attempt {tries} failed ({outcome.error_code}); retrying")
    outcome.attempts = tries
    return outcome

102

103

def _order_by_agents(results: list[AgentResult], order: list[str]) -> list[AgentResult]:
    """Return ``results`` rearranged into the configured agent order.

    Reports and every downstream consumer must not depend on how a phase
    happened to be executed, so each phase's results are normalised to the
    position of their agent in ``order`` (the stable enabled-agent list).
    ``Executor.map`` already yields results in submission order; this sort is
    the explicit guarantee that keeps holding if the execution strategy ever
    changes.

    Results whose agent is missing from ``order`` (not expected) are placed
    after all known agents. The sort is stable, so such results — and any
    results sharing an agent — keep their relative input order.

    Args:
        results: Phase results in any order; not modified.
        order: Agent names in the desired output order.

    Returns:
        A new list with the same result objects, reordered.
    """
    position: dict[str, int] = {}
    for rank, agent_name in enumerate(order):
        position[agent_name] = rank
    unknown_rank = len(order)

    def rank_of(result: AgentResult) -> int:
        return position.get(result.agent, unknown_rank)

    ordered = list(results)
    ordered.sort(key=rank_of)
    return ordered

118

119

@dataclass
class JuryOutcome:
    """Everything one jury run produced, as assembled by ``run_jury``."""

    # Round-1 results, one per agent that ran, in stable agent order.
    reviews: list[AgentResult]
    # Results of the last debate round executed (empty when no debate ran).
    debate: list[AgentResult]
    # The chair's synthesis result; None when synthesis was skipped.
    synthesis: AgentResult | None
    # Name of the chair resolved for this run.
    chair: str
    # Aggregated structured findings (including any synthetic injection finding).
    findings: list[Finding] = field(default_factory=list)
    # Accumulated human-readable warnings from all phases.
    warnings: list[str] = field(default_factory=list)
    # Consensus groups built from ``findings``.
    groups: list[FindingGroup] = field(default_factory=list)
    # The chair's verification result and the verdicts parsed from it.
    verify: AgentResult | None = None
    verdicts: list[Verdict] = field(default_factory=list)
    # Context/redaction settings that were in effect, and how many secrets
    # were redacted before anything was sent to agents.
    context_mode: str = "diff-only"
    redact_secrets: bool = True
    redaction_count: int = 0
    # Raw hits from the prompt-injection heuristic scan.
    injection_hits: list = field(default_factory=list)
    # Execution/partial-result signals (issue #30): agents skipped because their
    # CLI was unavailable (name, reason), and whether the run budget was
    # exhausted before all phases completed.
    skipped: list = field(default_factory=list)
    budget_exhausted: bool = False
    # Adaptive-rounds signals (issue #40): rounds actually executed and a short
    # human-readable reason for why the debate ran / stopped.
    rounds_executed: int = 1
    stop_reason: str = ""
    # Set when this outcome was served from the local result cache (issue #33),
    # so the report/metadata can mark it as cached rather than freshly computed.
    from_cache: bool = False

147

148

def _run_phase(
    adapters: list[Adapter],
    prompt_for: dict[str, str],
    phase: str,
    parallel: bool,
    *,
    budget: RunBudget,
    retries: int,
    log,
) -> list[AgentResult]:
    """Run every adapter once for ``phase`` and collect the results.

    Args:
        adapters: Agents to run.
        prompt_for: Prompt text keyed by adapter name; must contain an entry
            for every adapter.
        phase: Phase label forwarded to each call.
        parallel: When true and there is more than one adapter, calls run
            concurrently on a thread pool with one worker per adapter;
            otherwise they run sequentially.
        budget: Shared run budget forwarded to each call.
        retries: Extra attempts allowed per agent for retryable failures.
        log: Progress logger forwarded to each call.

    Returns:
        One result per adapter, in the same order as ``adapters``.
    """

    def run_one(agent: Adapter) -> AgentResult:
        return _run_with_retry(agent, prompt_for[agent.name], phase, budget, retries, log)

    if not parallel or len(adapters) <= 1:
        return [run_one(agent) for agent in adapters]
    with ThreadPoolExecutor(max_workers=len(adapters)) as executor:
        return list(executor.map(run_one, adapters))

166

167

def _others(reviews: list[AgentResult], me: str) -> str:
    """Render peer reviews WITH real identities (``anonymize_debate = false``).

    This is the legacy (pre-#37) presentation: every other reviewer's round-1
    output appears under a heading naming its agent and vendor, in the order
    the reviews are given. It therefore exposes both identity and position;
    the anonymizing path is the default.

    Args:
        reviews: Round-1 results.
        me: Name of the debater whose own review must be left out.

    Returns:
        The peers' sections joined by blank lines, or a placeholder string
        when no other successful, non-empty review exists.
    """
    sections: list[str] = []
    for review in reviews:
        if review.agent == me or not review.ok or not review.output:
            continue
        sections.append(f"### {review.agent} ({review.vendor})\n{review.output}")
    if not sections:
        return "_(no other reviews available)_"
    return "\n\n".join(sections)

181

182

def _anon_label(i: int) -> str:
    """Map a zero-based index to a spreadsheet-style letter label.

    0 -> 'A', 1 -> 'B', ..., 25 -> 'Z', 26 -> 'AA', 27 -> 'AB', and so on
    (bijective base 26). A negative index yields the empty string.
    """
    alphabet = string.ascii_uppercase
    remaining = i + 1
    reversed_chars: list[str] = []
    while remaining > 0:
        # Subtracting 1 first is what makes this bijective (there is no zero digit).
        remaining, offset = divmod(remaining - 1, 26)
        reversed_chars.append(alphabet[offset])
    return "".join(reversed(reversed_chars))

192

193

def _anonymize_peers(
    reviews: list[AgentResult], me: str, rng: random.Random
) -> tuple[str, dict[str, str]]:
    """Build the Chatham House view of peer reviews for one debater (#37).

    Every *other* successful, non-empty round-1 review is rendered under an
    anonymous ``### Reviewer A`` / ``### Reviewer B`` heading with no agent or
    vendor name. The debater's own review is left out (it reaches the prompt
    separately as ``own_review``). The presentation order is shuffled with
    ``rng`` so that position is not a stable signal either; the shuffle is
    reproducible for a given RNG state.

    Args:
        reviews: Round-1 results; this list is not modified.
        me: Name of the debater the view is built for.
        rng: RNG used for the shuffle. It is consumed only when at least one
            peer review exists.

    Returns:
        ``(prompt_text, label_to_agent)``. The mapping goes from anonymous
        label to real agent name so callers can still recover authorship.
        With no peers the text is a placeholder and the mapping is empty.
    """
    # A fresh list, so shuffling it leaves the caller's ``reviews`` untouched.
    shuffled = [
        review
        for review in reviews
        if review.agent != me and review.ok and review.output
    ]
    if not shuffled:
        return "_(no other reviews available)_", {}
    rng.shuffle(shuffled)

    sections: list[str] = []
    authors: dict[str, str] = {}
    for position, review in enumerate(shuffled):
        heading = "Reviewer " + _anon_label(position)
        authors[heading] = review.agent
        sections.append(f"### {heading}\n{review.output}")
    return "\n\n".join(sections), authors

224

225

def _debate_round(
    debaters: list[Adapter],
    reviews: list[AgentResult],
    diff: str,
    config: JuryConfig,
    run_rng: random.Random,
    agent_order: list[str],
    prior: list[AgentResult],
    budget: RunBudget,
    retries: int,
    log,
    round_no: int,
    template: str = prompts.DEBATE,
) -> list[AgentResult]:
    """Run one debate round and return its results in stable agent order.

    ``prior`` holds the previous round's debate outputs (empty for the first
    debate round); when present they are appended to each debater's prompt as a
    "prior debate" addendum so later rounds in an adaptive run (issue #40) build
    on, rather than repeat, earlier cross-examination. The peer-review anonymizing
    path (#37) is preserved unchanged.

    Args:
        debaters: Agents taking part in this round.
        reviews: Round-1 results, used for each debater's own review and for
            the peer reviews it is shown.
        diff: Text under review (untrusted).
        config: Supplies ``anonymize_debate`` and ``parallel``.
        run_rng: Shared run RNG; one value is drawn per debater when
            ``config.anonymize_debate`` is set, none otherwise.
        agent_order: Agent names defining the order of the returned results.
        prior: Previous round's debate results, or an empty list.
        budget: Shared run budget.
        retries: Extra attempts per agent for retryable failures.
        log: Progress logger.
        round_no: Round number, used only in the log line.
        template: Debate prompt template with ``name``, ``diff``,
            ``own_review``, ``other_reviews`` and ``notice`` fields.

    Returns:
        One result per debater, ordered by ``agent_order``.
    """
    log(f"round {round_no}: {len(debaters)} agents cross-examining")
    # Each debater's own successful round-1 output, keyed by agent name.
    own = {r.agent: r.output for r in reviews if r.ok}
    # Prior-round debate output quotes attacker-controlled diff text, so it is
    # untrusted: neutralize sentinels (issue #316/L-1) before it is fenced and
    # appended below, matching every other peer-output slot.
    # NOTE(review): these headings carry the real agent name even when
    # ``config.anonymize_debate`` is set — confirm that is intended.
    prior_txt = prompts.neutralize_sentinels(
        "\n\n".join(f"### {r.agent}\n{r.output}" for r in prior if r.ok and r.output)
    )
    debate_prompt: dict[str, str] = {}
    for a in debaters:
        if config.anonymize_debate:
            # Per-debater deterministic shuffle: derive a child RNG from the
            # shared run RNG so each debater gets an independent but reproducible
            # peer ordering (same seed -> same order).
            peer_rng = random.Random(run_rng.random())
            # The label -> agent map is not needed here; only the text is used.
            other_reviews, _label_map = _anonymize_peers(reviews, a.name, peer_rng)
        else:
            other_reviews = _others(reviews, a.name)
        # Every untrusted slot is sentinel-neutralized before formatting.
        text = template.format(
            name=a.name,
            diff=prompts.neutralize_sentinels(diff),
            own_review=prompts.neutralize_sentinels(
                own.get(a.name, "_(your review was unavailable)_")
            ),
            other_reviews=prompts.neutralize_sentinels(other_reviews),
            notice=prompts._UNTRUSTED_NOTICE,
        )
        if prior_txt:
            # Appended after formatting, inside its own untrusted fence.
            text += (
                "\n\n=== PRIOR DEBATE (earlier round) ===\n"
                "Build on this; do not just repeat it. Only keep a DISPUTE or "
                "MISSED item if it is still unresolved.\n\n"
                "<<<UNTRUSTED_REVIEW\n" + prior_txt + "\nUNTRUSTED_REVIEW>>>\n"
            )
        debate_prompt[a.name] = text
    results = _run_phase(
        debaters,
        debate_prompt,
        "debate",
        config.parallel,
        budget=budget,
        retries=retries,
        log=log,
    )
    # Same stable-ordering guarantee as round 1: independent of thread-pool
    # completion order.
    return _order_by_agents(results, agent_order)

295

296

def run_jury(
    config: JuryConfig,
    diff: str,
    *,
    context: str = "",
    mock: bool = False,
    strict: bool = False,
    seed: int | None = None,
    policy: ReviewPolicy | None = None,
    log=lambda _msg: None,
    budget: RunBudget | None = None,
    on_event=None,
    mode: str = "code",
) -> JuryOutcome:
    """Run one full jury deliberation and return its outcome.

    Phases, in order: independent round-1 reviews, optional debate round(s),
    optional chair verification, optional local-only demotion, and chair
    synthesis. Debate, verification and synthesis are each skipped (and
    ``budget_exhausted`` set on the outcome) if the run budget has already
    expired when they would start.

    Args:
        config: Agent-runtime configuration (enabled agents, rounds,
            ``early_stop``, ``verify``, timeouts, retries, ...).
        diff: Untrusted text under review.
        context: Extra untrusted context; discarded when the configured
            context mode is ``"diff-only"``.
        mock: Forwarded to ``make_adapter``.
        strict: When true, any least-privilege warning or an unavailable agent
            CLI raises instead of being logged / skipped.
        seed: Seed for the shared run RNG; falls back to ``config.seed``.
        policy: Optional repository review policy rendered into the review
            prompts.
        log: Callable receiving progress messages.
        budget: Optional shared budget; when omitted a fresh one is built from
            ``config.total_timeout`` and ``config.phase_timeout``.
        on_event: Optional callback ``on_event(kind, result[, round_no])``.
        mode: Selects the prompt template set via ``prompts.for_mode``.

    Returns:
        The assembled ``JuryOutcome``.

    Raises:
        RuntimeError: In strict mode on a privilege warning or an unavailable
            agent CLI; in any mode when no agent is usable.
    """
    # Jury mode (issue #221): "code" (default) reviews a diff with the code-review
    # rubric; "issue" reviews a GitHub issue's prose for completeness/clarity.
    # Only the prompt TEMPLATES differ — the round structure, consensus, voting,
    # verification, ordering, and determinism are identical. ``tmpl`` selects the
    # four phase templates; each is threaded into the phase that uses it so the
    # call sites are otherwise unchanged.
    tmpl = prompts.for_mode(mode)
    # Live play-by-play hook (issue #210): an optional callback fired after each
    # phase result is produced — ``on_event(kind, result, round_no=None)`` with
    # kind in {"review", "debate", "verify", "synthesis"}. It lets a caller stream
    # the deliberation as it happens (CLI ``--live``) without the orchestrator
    # doing any I/O itself. Fired in stable per-phase order (not thread-completion
    # order) so the event sequence is deterministic. Defaults to a no-op.
    emit = on_event or (lambda *_a, **_k: None)
    # Repository review policy (optional, #8): maintainer-authored, TRUSTED
    # content rendered into each REVIEW prompt in a clearly separated section.
    # When ``policy`` is None a sentinel placeholder is used, so the prompt is
    # unchanged except for that section. The policy is distinct from the
    # agent-runtime ``config`` and never enters the untrusted diff/context fences.
    policy_section = render_policy_section(policy)
    # Run reproducibility: a single shared RNG seeds every randomized
    # orchestration decision (future: anonymized-rebuttal order, rotating
    # chair, tie-breaks). The seed comes from the explicit ``seed`` argument if
    # given, else from ``config.seed``. We construct a dedicated
    # ``random.Random`` instance rather than touching the global ``random``
    # module so seeding a jury run never perturbs unrelated global state.
    # When the seed is None the RNG is unseeded (still deterministic
    # orchestration; randomness, if any, is just not reproducible run-to-run).
    # LLM output itself is never made deterministic by this — only the
    # orchestration around it. ``run_rng`` is the shared run RNG: pass it to
    # any feature that needs reproducible randomness instead of using ``random``.
    run_seed = seed if seed is not None else config.seed
    run_rng = random.Random(run_seed)  # shared run RNG (see comment above)

    # Run budget (issue #30): a single wall-clock budget threaded through every
    # phase. Defaults (both None) leave behaviour identical to no budget, with
    # each agent bounded only by its own per-agent timeout. ``retries`` is the
    # number of extra attempts for transient (retryable) failures. A caller may
    # pass a SHARED budget so ``total_timeout`` spans a whole chunked review
    # rather than resetting per chunk (issue #31 / review finding).
    if budget is None:
        budget = RunBudget(config.total_timeout, config.phase_timeout)
    retries = config.retries

    # Context policy: diff-only sends only the diff; expanded includes context.
    # ``getattr`` with defaults tolerates a config without a ``context`` section.
    ctx_cfg = getattr(config, "context", None)
    context_mode = getattr(ctx_cfg, "mode", "diff-only") if ctx_cfg else "diff-only"
    redact_on = getattr(ctx_cfg, "redact_secrets", True) if ctx_cfg else True
    if context_mode == "diff-only":
        context = ""
    redaction_count = 0
    if redact_on:
        diff, _n1 = redact(diff)
        context, _n2 = redact(context)
        redaction_count = _n1 + _n2
        if redaction_count:
            log(f"redacted {redaction_count} secret(s) before sending to agents")

    # Prompt-injection heuristic (OWASP LLM01): scan untrusted diff/context for
    # patterns that try to override instructions, then SURFACE them as a synthetic
    # finding/warning. We never act on them; the CI gate is derived from
    # structured consensus (see ci.evaluate_ci), so an injected "APPROVE"
    # cannot flip the verdict.
    injection_hits = injection.scan_inputs(diff, context)
    injection_findings: list[Finding] = []
    if injection_hits:
        log(f"prompt-injection heuristic: {len(injection_hits)} suspicious pattern(s) flagged")
        syn = injection.hits_to_finding(injection_hits)
        if syn is not None:
            injection_findings.append(syn)

    # Least-privilege audit: warn when a configured agent could perform
    # write/tool actions while reviewing attacker-controlled content.
    privilege_warnings = audit_privilege(config.enabled_agents)
    for w in privilege_warnings:
        log(f"least-privilege warning: {w}")
    if strict and privilege_warnings:
        raise RuntimeError(
            "least-privilege check failed (--strict): " + "; ".join(privilege_warnings)
        )

    specs = config.enabled_agents
    adapters = [make_adapter(s, mock=mock) for s in specs]

    # Filter to available agents (unless strict, where a missing CLI is fatal).
    # Skipped agents are recorded (name, reason) so the report can state exactly
    # which agents never ran — part of the partial-result policy (issue #30).
    usable: list[Adapter] = []
    skipped: list[tuple[str, str]] = []
    for a in adapters:
        if a.available():
            usable.append(a)
        elif strict:
            raise RuntimeError(f"agent '{a.name}' CLI not available: {a.spec.command}")
        else:
            reason = f"CLI not found ({a.spec.command})"
            log(f"skipping '{a.name}': {reason}")
            skipped.append((a.name, reason))
    if not usable:
        raise RuntimeError("no usable agents — install at least one agent CLI or use --mock")

    # Names of the usable agents; handed to the chair resolver below.
    usable_names = [a.name for a in usable]

    # Round 1: independent reviews.
    log(f"round 1: {len(usable)} agents reviewing")
    review_prompt = {
        a.name: tmpl["review"].format(
            name=a.name,
            context=prompts.neutralize_sentinels(context or "_(none)_"),
            diff=prompts.neutralize_sentinels(diff),
            policy=policy_section,
            notice=prompts._UNTRUSTED_NOTICE,
        )
        for a in usable
    }
    reviews = _run_phase(
        usable,
        review_prompt,
        "review",
        config.parallel,
        budget=budget,
        retries=retries,
        log=log,
    )
    # Stable ordering: the thread pool can return results in any completion
    # order. Reorder to the enabled-agent order so the report (and every
    # downstream consumer) is independent of which thread finished first.
    # (Same contents as ``usable_names`` above.)
    agent_order = [a.name for a in usable]
    reviews = _order_by_agents(reviews, agent_order)

    # Parse structured findings from each successful review and aggregate them.
    # Seed with the synthetic injection finding/warnings so they surface in the
    # report and outcome.warnings without ever influencing agent behaviour.
    all_findings: list[Finding] = list(injection_findings)
    all_warnings: list[str] = injection.hits_to_warnings(injection_hits)
    all_warnings.extend(privilege_warnings)
    for r in reviews:
        if not r.ok:
            continue
        found, warns = parse_findings(r.output, r.agent)
        # Per-result parse output is stored on the result itself as well.
        r.findings = found
        r.warnings = warns
        all_findings.extend(found)
        all_warnings.extend(warns)

    # Stream round-1 reviews as they're now finalized (stable order).
    for r in reviews:
        emit("review", r)

    # Deterministic consensus grouping across reviewers.
    groups = group_findings(all_findings, len(reviews))

    # Names of agents whose round-1 review succeeded — the chair resolver uses
    # this to (optionally) prefer a non-reviewer chair (#38).
    reviewer_names = [r.agent for r in reviews if r.ok]

    # Resolve the chair ONCE for the whole run so verify and synthesis use the
    # SAME chair. ``chair = "rotate"`` and prefer-non-reviewer both consume the
    # shared run RNG / reviewer info, so resolving once (rather than recomputing
    # per phase) is what keeps a rotating chair stable within a run (#38).
    chair_name = resolve_chair(config, usable_names, reviewer_names, run_rng)

    # Round 2+: debate. Only agents whose round-1 review succeeded participate.
    # Two modes (issue #40):
    #   - fixed (early_stop = false): honour ``rounds`` exactly — run one debate
    #     round iff rounds >= 2. Reproducible fixed-N behaviour for benchmarking.
    #   - adaptive (early_stop = true): skip the debate when round-1 reviewers
    #     already agree, otherwise run debate up to ``max_rounds`` rounds and stop
    #     as soon as a round resolves all disputes.
    debate: list[AgentResult] = []
    rounds_executed = 1
    stop_reason = ""
    budget_exhausted = False
    debaters = [a for a in usable if any(r.agent == a.name and r.ok for r in reviews)]
    can_debate = len(debaters) >= 2

    if config.early_stop:
        max_rounds = config.effective_max_rounds
        if not can_debate:
            stop_reason = "stopped after round 1: need >=2 successful reviews to debate"
            log(stop_reason)
        elif max_rounds < 2:
            stop_reason = "stopped after round 1: max_rounds < 2"
            log(stop_reason)
        else:
            converged, why = convergence.review_convergence(groups, len(reviews))
            if converged:
                stop_reason = f"early stop after round 1: {why}"
                log(stop_reason)
            else:
                log(f"early stop active: {why}; running debate up to {max_rounds} round(s)")
                prior: list[AgentResult] = []
                # ``round_no`` counts the review round as 1, so the first
                # debate round is round 2.
                round_no = 1
                while round_no < max_rounds:
                    if budget.expired():
                        budget_exhausted = True
                        stop_reason = f"stopped at round {rounds_executed}: run budget exhausted"
                        log(stop_reason)
                        break
                    round_no += 1
                    debate = _debate_round(
                        debaters,
                        reviews,
                        diff,
                        config,
                        run_rng,
                        agent_order,
                        prior,
                        budget,
                        retries,
                        log,
                        round_no,
                        template=tmpl["debate"],
                    )
                    rounds_executed = round_no
                    for r in debate:
                        emit("debate", r, round_no)
                    dconv, dwhy = convergence.debate_convergence(debate)
                    if dconv:
                        stop_reason = f"converged after round {round_no}: {dwhy}"
                        log(stop_reason)
                        break
                    # Not converged: this round becomes the next round's "prior".
                    prior = debate
                    stop_reason = f"ran {round_no} rounds: {dwhy}"
                else:
                    # while-else: reached only when the loop ended without a
                    # ``break``. An already-set ``stop_reason`` is kept.
                    stop_reason = stop_reason or (
                        f"reached max_rounds ({max_rounds}) with disagreement remaining"
                    )
    else:
        # Fixed-N: exactly the historical behaviour.
        if config.rounds >= 2 and can_debate:
            if budget.expired():
                budget_exhausted = True
                stop_reason = "round 2 skipped: run budget exhausted"
                log(stop_reason)
            else:
                debate = _debate_round(
                    debaters,
                    reviews,
                    diff,
                    config,
                    run_rng,
                    agent_order,
                    [],
                    budget,
                    retries,
                    log,
                    2,
                    template=tmpl["debate"],
                )
                rounds_executed = 2
                for r in debate:
                    emit("debate", r, 2)
        elif config.rounds >= 2:
            stop_reason = "round 2 skipped: need >=2 successful reviews to debate"
            log(stop_reason)
        else:
            stop_reason = "single round (rounds = 1)"

    # Verification: the chair judges candidate findings to reduce false
    # positives. Skipped when the run budget is exhausted (issue #30) so a
    # partial run still returns what completed instead of overrunning.
    verify_result: AgentResult | None = None
    verdicts: list[Verdict] = []
    if config.verify:
        if budget.expired():
            budget_exhausted = True
            msg = "verification skipped: run budget exhausted"
            log(msg)
            all_warnings.append(msg)
        else:
            verify_result, verdicts, verify_warnings = _verify(
                chair_name,
                usable,
                all_findings,
                diff,
                context,
                budget,
                retries,
                log,
                template=tmpl["verify"],
            )
            all_warnings.extend(verify_warnings)
            _apply_verdicts(groups, verdicts)
            if verify_result is not None:
                emit("verify", verify_result)

    # Local-only demotion (issue #442) runs AFTER verification, never before:
    # _reject_targets' member-tier guard (orchestrator._reject_targets) assumes
    # group.severity == max(member severities) to decide whether a rejecting
    # verdict may suppress the whole group. Demoting group.severity earlier
    # would desync it from that invariant and could let a verdict aimed at a
    # minor local-only duplicate collateral-reject a genuinely critical,
    # never-verified co-located finding merged into the same group.
    if config.demote_local_only:
        vendor_by_reviewer = {a.name: a.vendor for a in config.agents}
        demote_local_only_groups(groups, vendor_by_reviewer)

    # Synthesis: the chair consolidates. When the resolved chair is ALSO a
    # round-1 reviewer, feed it an anonymized view of the reviews (#38 guardrail)
    # so it cannot preferentially weight its own findings; the report still
    # attributes by real name because it renders the real outcome data, not this
    # synthesis prompt.
    synthesis: AgentResult | None = None
    if budget.expired():
        budget_exhausted = True
        msg = "synthesis skipped: run budget exhausted"
        log(msg)
        # Guard against recording the same warning twice.
        if msg not in all_warnings:
            all_warnings.append(msg)
    else:
        chair_is_reviewer = chair_name in reviewer_names
        anonymize_synthesis = config.anonymize_debate and chair_is_reviewer
        synthesis = _synthesize(
            chair_name,
            usable,
            reviews,
            debate,
            diff,
            budget,
            retries,
            log,
            verdicts=verdicts,
            anonymize_reviews=anonymize_synthesis,
            rng=run_rng,
            template=tmpl["synthesis"],
        )
        if synthesis is not None:
            emit("synthesis", synthesis)

    return JuryOutcome(
        reviews=reviews,
        debate=debate,
        synthesis=synthesis,
        chair=chair_name,
        findings=all_findings,
        warnings=all_warnings,
        groups=groups,
        verify=verify_result,
        verdicts=verdicts,
        context_mode=context_mode,
        redact_secrets=redact_on,
        redaction_count=redaction_count,
        injection_hits=injection_hits,
        skipped=skipped,
        budget_exhausted=budget_exhausted,
        rounds_executed=rounds_executed,
        stop_reason=stop_reason,
    )

660

661

def resolve_chair(
    config: JuryConfig,
    usable: list[str],
    reviewers: list[str],
    rng: random.Random,
) -> str:
    """Pick the chair for a run from plain names, without any I/O (#38).

    Rules, first match wins:

    0. No usable agent at all: ``config.chair`` is returned verbatim.
    1. ``config.chair == "rotate"``: one of the distinct usable names, taken
       in sorted order and indexed with a single ``rng.randrange`` draw. For
       the same RNG state and the same set of usable names the pick is the
       same; because the RNG is shared, the pick does depend on how much of
       it was consumed before this call.
    2. ``config.chair`` names a usable agent: that agent (an operator-chosen
       chair always wins).
    3. ``config.prefer_non_reviewer_chair`` is set and some usable agent is
       not in ``reviewers``: the first such agent in ``usable`` order (a
       neutral chair). Only reachable when the configured chair is not usable.
    4. Otherwise the first usable agent (legacy behaviour).

    Working on names only (no Adapter objects) keeps this directly
    unit-testable; the caller resolves the chair ONCE and threads the result
    through both verification and synthesis so the two always agree.

    Args:
        config: Supplies ``chair`` and ``prefer_non_reviewer_chair``.
        usable: Names of usable agents, in enabled-agent order.
        reviewers: Names of agents whose round-1 review succeeded.
        rng: Shared run RNG; consumed only for ``"rotate"``.

    Returns:
        The resolved chair name.
    """
    if not usable:
        return config.chair

    configured = config.chair
    available = set(usable)

    if configured == "rotate":
        # Sorted so the candidate order does not depend on set iteration order.
        pool = sorted(available)
        return pool[rng.randrange(len(pool))]

    if configured in available:
        return configured

    if config.prefer_non_reviewer_chair:
        reviewed = set(reviewers)
        for candidate in usable:
            if candidate not in reviewed:
                return candidate

    return usable[0]

709

710

def _format_findings_for_verify(findings: list[Finding]) -> str:
    """Format candidate findings as a bullet list for the verification prompt.

    The reviewer's identity is deliberately left out (#250) so the chair
    cannot favour its own findings while judging them — the same principle as
    the #37/#38 anonymization. Verdicts are matched back by file/line/claim,
    so nothing is lost by omitting it.

    Returns:
        One ``- [severity] file[:line] — claim`` line per finding (a missing
        file is shown as ``?``), or a placeholder when there are no findings.
    """
    if not findings:
        return "_(no candidate findings)_"

    def render(finding: Finding) -> str:
        where = finding.file or "?"
        if finding.line is not None:
            where = f"{where}:{finding.line}"
        return f"- [{finding.severity}] {where} — {finding.claim}"

    return "\n".join(render(finding) for finding in findings)

727

728

def _format_verdicts(verdicts: list[Verdict]) -> str:
    """Format verification verdicts as a bullet list.

    Returns:
        One ``- [status] file[:line] — claim: reasoning`` line per verdict (a
        missing file is shown as ``?``), or a placeholder when there are none.
    """
    if not verdicts:
        return "_(no verification verdicts)_"

    def render(verdict: Verdict) -> str:
        where = verdict.file or "?"
        if verdict.line is not None:
            where = f"{where}:{verdict.line}"
        return f"- [{verdict.status}] {where} — {verdict.claim}: {verdict.reasoning}"

    return "\n".join(render(verdict) for verdict in verdicts)

739

740

def _verify(
    chair_name,
    usable,
    findings,
    diff,
    context,
    budget,
    retries,
    log,
    template=prompts.VERIFY,
) -> tuple[AgentResult | None, list[Verdict], list[str]]:
    """Have the chair judge the candidate findings.

    Args:
        chair_name: Name of the resolved chair.
        usable: Usable adapters; the chair is looked up among them by name.
        findings: Candidate findings to be judged.
        diff: Text under review (untrusted).
        context: Extra untrusted context; a placeholder is used when empty.
        budget: Shared run budget.
        retries: Extra attempts for retryable failures.
        log: Progress logger.
        template: Verification prompt template with ``diff``, ``findings``,
            ``context`` and ``notice`` fields.

    Returns:
        ``(result, verdicts, warnings)``. ``(None, [], [])`` when the chair is
        not among ``usable``; on a failed call the result plus a single
        "verification failed" warning and no verdicts.
    """
    chair = None
    for candidate in usable:
        if candidate.name == chair_name:
            chair = candidate
            break
    if chair is None:
        return None, [], []

    log(f"verification: chair '{chair_name}' judging {len(findings)} candidate findings")
    # Every untrusted slot is sentinel-neutralized before it enters the prompt.
    safe = prompts.neutralize_sentinels
    prompt = template.format(
        diff=safe(diff),
        findings=safe(_format_findings_for_verify(findings)),
        context=safe(context or "_(none)_"),
        notice=prompts._UNTRUSTED_NOTICE,
    )
    result = _run_with_retry(chair, prompt, "verify", budget, retries, log)
    if result.ok:
        verdicts, warnings = parse_verdicts(result.output, chair_name)
        return result, verdicts, warnings
    return result, [], [f"verification failed: {result.error}"]

767

768

def _verdict_matches_group(verdict: Verdict, group: FindingGroup) -> bool:
    """Decide whether ``verdict`` refers to ``group``'s representative finding.

    A match needs all of:

    * the same path, compared case-EXACTLY — on a case-sensitive filesystem
      ``Config.py`` != ``config.py``, so a verdict must not reject a finding
      it only case-collapses onto (audit 2026-06-13 r6/M);
    * when both sides carry a line, lines at most 3 apart;
    * a compatible claim: identical after normalization, or a token-set
      Jaccard similarity of at least 0.5.

    An empty verdict claim may still match the finding *at this location*
    (the verifier may omit the claim and refer to it by position), but only
    when BOTH the verdict and the finding carry a line. A verdict with
    neither claim nor line has no location precision at all: it would
    otherwise match — and, when ``unsupported``, REJECT — every finding group
    in the file, including unrelated criticals, flipping the CI gate from
    FAIL to PASS (security audit 2026-06-13 r6/M).
    """
    from .consensus import _normalize_claim, _normalize_path

    rep = group.representative

    verdict_path = _normalize_path(verdict.file, fold_case=False)
    finding_path = _normalize_path(rep.file, fold_case=False)
    if verdict_path != finding_path:
        return False

    both_pinned = verdict.line is not None and rep.line is not None
    if both_pinned and abs(verdict.line - rep.line) > 3:
        return False

    verdict_claim = _normalize_claim(verdict.claim)
    finding_claim = _normalize_claim(rep.claim)
    if not verdict_claim:
        # Claim-less verdict: honoured only when a concrete line pins it.
        return both_pinned
    if verdict_claim == finding_claim:
        return True

    verdict_words = set(verdict_claim.split())
    finding_words = set(finding_claim.split())
    if not (verdict_words and finding_words):
        return False
    shared = len(verdict_words & finding_words)
    combined = len(verdict_words | finding_words)
    return shared / combined >= 0.5

801

802

def _claim_sim(a_claim: str, b_claim: str) -> float:
    """Similarity of two claims in the range [0.0, 1.0].

    Both claims are normalized first. The result is 0.0 when either is then
    empty, 1.0 when they are identical, and otherwise the Jaccard index of
    their whitespace-separated token sets.
    """
    from .consensus import _normalize_claim

    left = _normalize_claim(a_claim)
    right = _normalize_claim(b_claim)
    if not (left and right):
        return 0.0
    if left == right:
        return 1.0

    left_words, right_words = set(left.split()), set(right.split())
    overlap = len(left_words & right_words)
    total = len(left_words | right_words)
    return overlap / total if total else 0.0

818

819

# Verdict statuses that suppress a finding by moving it into a non-blocking
# bucket.
_REJECTING_STATUSES = frozenset(("unsupported", "needs_human_decision"))
# Sort key used when applying verdicts: the most-blocking status goes first so
# that a contradictory pair of verdicts on one finding resolves fail-closed. A
# `verified` (blocking) judgement is recorded before any
# `unsupported`/`needs_human_decision`, so the order of the verdict array
# cannot flip it to non-blocking (audit 2026-06-13 r8/M).
_STATUS_PRIORITY = dict(verified=0, needs_human_decision=1, unsupported=2)
# Minimum claim similarity for a rejecting verdict to count as being "about" a
# finding at all. The PRIMARY defence against a verdict dismissing a co-located
# *distinct* finding is `_reject_targets`, which lets a rejection attach only
# to the best-similarity tier of groups: a verdict whose claim copies a benign
# neighbour is routed to that neighbour rather than to the co-located critical
# (audit 2026-06-13 r8/M). The threshold is kept moderate so that legitimate
# paraphrased rejections from the verifier (it drops the reviewer-name prefix
# etc.) still apply.
_REJECT_CLAIM_THRESHOLD = 0.5

835

836

def _reject_targets(verdict: Verdict, groups: list[FindingGroup]) -> list[FindingGroup]:
    """Select the status-less groups a rejecting verdict is allowed to suppress.

    The selection is fail-closed (r7/r8). Each rule below closes a separate
    collateral-rejection vector found across audit rounds 6-9:

    0. **Line required.** A verdict without a concrete line selects nothing.
       It is too imprecise to suppress a finding safely and would behave as a
       file-wide-by-claim wildcard (audit r9/M, the claim-ful counterpart of
       the round-6 line-less-wildcard fix).
    1. **Member-tier guard.** A group can merge findings of different
       severities (consensus keeps the max). The verdict is taken to be
       "about" the member whose claim it matches best; when that member is
       *less severe* than the group as a whole, the verdict is dismissing a
       lesser co-located finding and the (e.g. critical) group is skipped.
    2. **Best-tier only.** Among the surviving candidates only the groups at
       the highest similarity are kept, so a verdict copying a benign
       neighbour rejects that neighbour (and duplicate phrasings that tie)
       but not a separate, less-similar critical group.
    3. **Least-severe within a tie.** When the top tier still spans several
       severities (an exact ``_claim_sim`` tie between a critical and a benign
       decoy), only the *least*-severe groups are returned, so a tie never
       drags a critical down with a decoy (audit r9/M).

    Returns an empty list when nothing qualifies.
    """
    from .findings import SEVERITY_ORDER

    if verdict.line is None:
        return []

    def rank(severity):
        # Lower rank means more severe; unknown severities sort last (99).
        return SEVERITY_ORDER.get(severity, 99)

    candidates: list[tuple[float, FindingGroup]] = []
    for group in groups:
        if group.status or not _verdict_matches_group(verdict, group):
            continue

        members = getattr(group, "members", None) or [group.representative]
        scored_members = [(_claim_sim(verdict.claim, m.claim), m) for m in members]
        top_sim = max(sim for sim, _ in scored_members)
        # On a tie the earliest member wins.
        top_member = next(m for sim, m in scored_members if sim == top_sim)

        if top_sim < _REJECT_CLAIM_THRESHOLD:
            continue
        # Member-tier guard: the best-named member must be as severe as the
        # group's own severity, otherwise the group is left alone.
        if rank(top_member.severity) > rank(group.severity):
            continue
        candidates.append((top_sim, group))

    if not candidates:
        return []

    peak = max(sim for sim, _ in candidates)
    top_tier = [group for sim, group in candidates if sim >= peak]
    # Inside the top tier only the least-severe (highest-rank) groups remain.
    least_severe_rank = max(rank(group.severity) for group in top_tier)
    return [group for group in top_tier if rank(group.severity) == least_severe_rank]

890

891

def _apply_verdicts(groups: list[FindingGroup], verdicts: list[Verdict]) -> None:
    """Record verification statuses on consensus groups, in place.

    * ``unsupported`` sets the status and moves the group to bucket
      ``'rejected'``.
    * ``needs_human_decision`` sets the status and moves the group to bucket
      ``'disputed'``.
    * any other status (e.g. ``verified``) is recorded with its reasoning and
      leaves the bucket untouched.

    A group that already has a status is never overwritten.
    """

    def blocking_rank(verdict):
        # Unknown statuses sort after every known one.
        return _STATUS_PRIORITY.get(verdict.status, 3)

    # sorted() is stable, so verdicts of equal priority keep their input
    # order; blocking statuses are applied first, which makes contradictions
    # resolve fail-closed.
    for verdict in sorted(verdicts, key=blocking_rank):
        if verdict.status not in _REJECTING_STATUSES:
            # A non-suppressing verdict may attach to every matching group:
            # reviewers can phrase one issue differently so it lands in more
            # than one group, and each should carry the judgement.
            for group in groups:
                if not group.status and _verdict_matches_group(verdict, group):
                    group.status = verdict.status
                    group.status_reasoning = verdict.reasoning
            continue

        # A suppressing verdict touches only the best-similarity tier it
        # names -- never, as collateral, a co-located but less-similar
        # distinct finding.
        bucket = "rejected" if verdict.status == "unsupported" else "disputed"
        for target in _reject_targets(verdict, groups):
            target.status = verdict.status
            target.status_reasoning = verdict.reasoning
            target.bucket = bucket

918

919

def _synthesize(
    chair_name,
    usable,
    reviews,
    debate,
    diff,
    budget,
    retries,
    log,
    verdicts=None,
    anonymize_reviews=False,
    rng=None,
    template=prompts.SYNTHESIS,
) -> AgentResult | None:
    """Ask the chair agent to consolidate reviews and debate into one verdict.

    The chair is the first entry of ``usable`` whose ``name`` equals
    ``chair_name``; when there is none the function returns ``None`` without
    logging or running anything. Otherwise the synthesis prompt is built from
    ``template`` (fields ``diff``, ``reviews``, ``debate``, ``notice``), the
    optional verification verdicts are appended inside an untrusted fence, and
    the prompt is run through ``_run_with_retry`` under the phase name
    ``"synthesis"``.

    Only results with ``ok`` set and a non-empty ``output`` contribute to the
    reviews and debate sections. With ``anonymize_reviews`` the reviews text
    comes from ``_anonymize_peers`` instead of the name/vendor-headed layout.
    """
    chair = None
    for adapter in usable:
        if adapter.name == chair_name:
            chair = adapter
            break
    if chair is None:
        return None

    log(f"synthesis: chair '{chair_name}' consolidating verdict")

    if not anonymize_reviews:
        review_sections = [
            f"### {r.agent} ({r.vendor})\n{r.output}" for r in reviews if r.ok and r.output
        ]
        reviews_txt = "\n\n".join(review_sections) or "_(no reviews)_"
    else:
        # Chair self-preference guardrail (#38): round-1 reviews reach the
        # chair under anonymous labels (no agent/vendor identity, no stable
        # order) so it cannot tell which review is "its own". The generator is
        # seeded from the shared run RNG when one is supplied, giving a
        # deterministic-but-unstable ordering. Passing ``None`` as ``me``
        # keeps ALL reviews: identity is stripped, no debater is excluded.
        if rng is None:
            peer_rng = random.Random()
        else:
            peer_rng = random.Random(rng.random())
        reviews_txt, _label_map = _anonymize_peers(reviews, None, peer_rng)

    debate_sections = [f"### {r.agent}\n{r.output}" for r in debate if r.ok and r.output]
    debate_txt = "\n\n".join(debate_sections) or "_(no debate round)_"

    prompt = template.format(
        diff=prompts.neutralize_sentinels(diff),
        reviews=prompts.neutralize_sentinels(reviews_txt),
        debate=prompts.neutralize_sentinels(debate_txt),
        notice=prompts._UNTRUSTED_NOTICE,
    )

    if verdicts:
        # Verdicts quote candidate findings, which in turn quote untrusted
        # diff text (issue v1.5.0/M-1: this addendum was the one slot the
        # #316/L-1 fix missed). It is fenced and neutralized like every other
        # peer-output slot so an embedded closing token cannot break out.
        fenced_verdicts = prompts.neutralize_sentinels(_format_verdicts(verdicts))
        prompt = (
            prompt
            + "\n\n=== VERIFICATION VERDICTS (may quote UNTRUSTED text) ===\n"
            + "<<<UNTRUSTED_FINDINGS\n"
            + fenced_verdicts
            + "\nUNTRUSTED_FINDINGS>>>\n"
        )

    return _run_with_retry(chair, prompt, "synthesis", budget, retries, log)

975

976

def _merge_results_by_agent(phase_lists: list[list[AgentResult]]) -> list[AgentResult]:
    """Collapse each agent's per-chunk results into a single result (issue #31).

    For every agent, in order of first appearance across ``phase_lists``:

    * ``ok`` is true if any of its parts succeeded;
    * the output joins the successful, non-empty parts, each under a
      ``#### chunk N`` header, where N is the part's 1-based position among
      that agent's own parts;
    * the duration is the sum of all parts, rounded to 3 decimals;
    * ``attempts`` is the largest value seen, so a retried chunk stays visible;
    * ``error``/``error_code`` come from the first failed part, and only when
      no part succeeded;
    * ``vendor`` is taken from the agent's first part.
    """
    # Plain dicts keep insertion order, which is exactly first-appearance order.
    parts_by_agent: dict[str, list[AgentResult]] = {}
    for results in phase_lists:
        for result in results:
            parts_by_agent.setdefault(result.agent, []).append(result)

    merged: list[AgentResult] = []
    for agent, parts in parts_by_agent.items():
        succeeded = any(p.ok for p in parts)
        first_failure = next((p for p in parts if not p.ok), None)
        body = "\n\n".join(
            f"#### chunk {number}\n{p.output}"
            for number, p in enumerate(parts, 1)
            if p.ok and p.output
        )

        # Plain left-to-right float addition. The built-in sum() uses a
        # different (more accurate) float algorithm on Python 3.12+, which
        # could alter the rounded total.
        elapsed = 0.0
        for p in parts:
            elapsed += p.duration_s

        most_attempts = max([0, *(p.attempts for p in parts)])

        if succeeded or not first_failure:
            error = error_code = None
        else:
            error = first_failure.error
            error_code = first_failure.error_code

        merged.append(
            AgentResult(
                agent,
                parts[0].vendor,
                succeeded,
                body,
                round(elapsed, 3),
                error=error,
                error_code=error_code,
                attempts=most_attempts,
            )
        )
    return merged

1032

1033

def _combine_chair_results(results: list[AgentResult], chair: str) -> AgentResult | None:
    """Fold per-chunk chair results (verify/synthesis) into one labelled result.

    Only results that are ``ok`` and have a non-empty ``output`` are combined.
    When there are none, the first entry of ``results`` is returned unchanged,
    or ``None`` if ``results`` is empty.

    Otherwise a new successful ``AgentResult`` named ``chair`` is returned. Its
    vendor comes from the first usable result, its output joins the usable
    outputs under ``### chunk N`` headers, and its duration is the usable
    durations summed and rounded to 3 decimals. N counts usable results only,
    so it may differ from the original chunk index when a chunk was dropped.
    """
    usable = [r for r in results if r.ok and r.output]
    if not usable:
        if results:
            return results[0]
        return None

    # Plain left-to-right float addition. The built-in sum() uses a different
    # float algorithm on Python 3.12+, which could alter the rounded total.
    elapsed = 0.0
    for part in usable:
        elapsed += part.duration_s

    combined = "\n\n".join(
        f"### chunk {number}\n{part.output}" for number, part in enumerate(usable, 1)
    )
    return AgentResult(chair, usable[0].vendor, True, combined, round(elapsed, 3))

1050

1051

def _merge_chunk_outcomes(outcomes: list[JuryOutcome], config: JuryConfig) -> JuryOutcome:
    """Combine per-chunk outcomes (issue #31) into one renderable JuryOutcome.

    A single outcome is returned as-is. Otherwise findings from every chunk
    are pooled and re-grouped so the consensus view is global, each chunk's
    verdicts are re-applied to the merged groups, and review/debate/chair
    outputs are merged per agent under chunk labels. ``chair``,
    ``context_mode``, ``redact_secrets`` and ``skipped`` are taken from the
    first outcome, so ``outcomes`` must not be empty.
    """
    if len(outcomes) == 1:
        return outcomes[0]
    first = outcomes[0]

    reviews = _merge_results_by_agent([o.reviews for o in outcomes])
    if any(o.debate for o in outcomes):
        debate = _merge_results_by_agent([o.debate for o in outcomes])
    else:
        debate = []
    findings = [finding for o in outcomes for finding in o.findings]
    groups = group_findings(findings, len(reviews))
    verdicts = [verdict for o in outcomes for verdict in o.verdicts]

    # Each chunk's verdicts are scoped to that chunk's own findings. A verdict
    # is produced while verifying ONE chunk (its prompt held only that chunk's
    # findings, yet attacker-controlled diff text could steer it); applied
    # unscoped after the global merge it could reject a *different* chunk's
    # structured critical and flip the CI gate (audit 2026-06-13 r7/M). Chunks
    # are file-disjoint, so a chunk's verdicts are applied only to groups
    # located in one of that chunk's files.
    from .consensus import _normalize_path

    def exact_path(path):
        return _normalize_path(path, fold_case=False)

    for outcome in outcomes:
        own_files = {exact_path(f.file) for f in outcome.findings if f.file}
        own_groups = [g for g in groups if exact_path(g.representative.file) in own_files]
        _apply_verdicts(own_groups, outcome.verdicts)

    # Demotion happens only AFTER the verdicts are applied: the member-tier
    # guard in _reject_targets relies on group.severity being the true max
    # member severity, and demoting earlier would break that invariant.
    if config.demote_local_only:
        vendor_by_reviewer = {agent.name: agent.vendor for agent in config.agents}
        demote_local_only_groups(groups, vendor_by_reviewer)

    warnings = [warning for o in outcomes for warning in o.warnings]

    synthesis = _combine_chair_results(
        [o.synthesis for o in outcomes if o.synthesis], first.chair
    )
    verify = _combine_chair_results([o.verify for o in outcomes if o.verify], first.chair)

    return JuryOutcome(
        reviews=reviews,
        debate=debate,
        synthesis=synthesis,
        chair=first.chair,
        findings=findings,
        warnings=warnings,
        groups=groups,
        verify=verify,
        verdicts=verdicts,
        context_mode=first.context_mode,
        redact_secrets=first.redact_secrets,
        redaction_count=sum(o.redaction_count for o in outcomes),
        injection_hits=[hit for o in outcomes for hit in o.injection_hits],
        skipped=first.skipped,
        budget_exhausted=any(o.budget_exhausted for o in outcomes),
        # Floor of 0 mirrors a running maximum that starts at zero.
        rounds_executed=max([0, *(o.rounds_executed for o in outcomes)]),
        stop_reason=f"chunked review across {len(outcomes)} part(s)",
    )

1135

1136

def review_diff(
    config: JuryConfig,
    diff: str,
    *,
    context: str = "",
    mock: bool = False,
    strict: bool = False,
    seed: int | None = None,
    policy: ReviewPolicy | None = None,
    log=lambda _msg: None,
    on_event=None,
) -> tuple[JuryOutcome, largediff.DiffPlan]:
    """Plan a diff (filter + size + mode), then run the jury on it (issue #31).

    This is the single entry point used by the CLI. The diff is measured and
    filtered according to ``config.diff``, the size and chosen handling mode
    are logged, and the work is dispatched by mode:

    - ``full`` -- the filtered diff is reviewed in one ``run_jury`` pass;
    - ``chunked`` -- every chunk is reviewed and the outcomes are merged;
    - ``too_large`` -- ``RuntimeError`` is raised with an actionable message.

    ``context``, ``mock``, ``strict``, ``seed``, ``policy``, ``log`` and
    ``on_event`` are forwarded to every ``run_jury`` call.

    Returns:
        ``(outcome, plan)`` so the caller can surface the plan.

    Raises:
        RuntimeError: the diff is too large to review, or no chunk is left
            after filtering.
    """
    diff_cfg = config.diff
    plan = largediff.plan_diff(
        diff,
        max_bytes=diff_cfg.max_bytes,
        chunk=diff_cfg.chunk,
        chunk_max_bytes=diff_cfg.chunk_max_bytes,
        exclude_generated=diff_cfg.exclude_generated,
        exclude=diff_cfg.exclude,
        include=diff_cfg.include,
    )

    size_summary = (
        f"diff size: {plan.total_bytes} B total, {plan.kept_bytes} B after filters "
        f"({len(plan.kept)} file(s) kept, {len(plan.excluded)} excluded); "
        f"mode: {plan.mode}"
    )
    log(size_summary)
    if plan.excluded:
        excluded_listing = ", ".join(f"{p} [{why}]" for p, why in plan.excluded)
        log("excluded: " + excluded_listing)
    log(plan.reason)

    if plan.mode == largediff.MODE_TOO_LARGE:
        raise RuntimeError(f"diff too large to review: {plan.reason}")
    if not plan.chunks:
        raise RuntimeError(
            "nothing to review after filters — all files were excluded "
            "(check [jury.diff] include/exclude patterns)"
        )

    # A single budget is shared by all chunks so that ``total_timeout`` bounds
    # the WHOLE review and not each chunk separately. ``phase_timeout`` and
    # the per-agent timeouts still apply per call through the same object.
    shared_budget = RunBudget(config.total_timeout, config.phase_timeout)

    # The shared context is redacted ONCE, before fan-out (#249). Every chunk
    # is reviewed against the same context; if each per-chunk ``run_jury``
    # redacted it, the context's secrets would be counted once per chunk and
    # the merge would add them up (a 1-secret context over 8 chunks reported
    # 8). After pre-redaction the placeholders no longer match, so per-chunk
    # redaction counts 0 for the context and the one-time count is added back
    # at the end. Diff/chunk redactions stay counted per chunk, which is right
    # because each chunk's diff is distinct.
    # ``config.context`` is the correct attribute: the ``[jury]`` table is
    # flattened onto JuryConfig, so ``[jury.context]`` lives at
    # ``config.context`` and there is no ``config.jury``.
    ctx_cfg = getattr(config, "context", None)
    if ctx_cfg:
        ctx_mode = getattr(ctx_cfg, "mode", "diff-only")
        redact_on = getattr(ctx_cfg, "redact_secrets", True)
    else:
        ctx_mode = "diff-only"
        redact_on = True

    context_redactions = 0
    if redact_on and ctx_mode != "diff-only" and context:
        context, context_redactions = redact(context)

    # Arguments that are identical for every chunk; collected after the
    # context has been redacted so each run sees the redacted text.
    jury_kwargs = dict(
        context=context,
        mock=mock,
        strict=strict,
        seed=seed,
        policy=policy,
        log=log,
        budget=shared_budget,
        on_event=on_event,
    )

    def review_chunk(chunk: str) -> JuryOutcome:
        return run_jury(config, chunk, **jury_kwargs)

    def add_context_redactions(outcome: JuryOutcome) -> JuryOutcome:
        # Per-chunk runs saw an already-redacted context and counted 0 for it,
        # so the one-time context count is added here.
        if context_redactions:
            return replace(
                outcome, redaction_count=outcome.redaction_count + context_redactions
            )
        return outcome

    if plan.mode == largediff.MODE_FULL:
        return add_context_redactions(review_chunk(plan.chunks[0])), plan

    chunk_total = len(plan.chunks)
    outcomes = []
    for position, chunk in enumerate(plan.chunks, 1):
        log(f"reviewing chunk {position}/{chunk_total}")
        outcomes.append(review_chunk(chunk))
    return add_context_redactions(_merge_chunk_outcomes(outcomes, config)), plan

1240 return _finalize(_merge_chunk_outcomes(outcomes, config)), plan