Coverage for src/ai_jury/report.py: 100%
319 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-05 20:29 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-05 20:29 +0000
1"""Render the jury run into a single markdown report."""
2from __future__ import annotations
4from . import classification as _classification
5from .adapters import AgentResult
6from .findings import SEVERITY_ORDER, Finding
9def _block(title: str, body: str) -> str:
10 return f"### {title}\n\n{body.strip() or '_(no output)_'}\n"
13def _fail_status(r: AgentResult) -> str:
14 """Failed-agent status line with a concise typed-error-code prefix."""
15 prefix = f"[{r.error_code}] " if getattr(r, "error_code", None) else ""
16 return f"⚠️ {prefix}{r.error}"
19def _finding_line(f: Finding) -> str:
20 loc = f.file or "?"
21 if f.line is not None:
22 loc = f"{loc}:{f.line}"
23 return f"- [{f.severity}] {loc} — {f.claim} ({f.confidence}, by {f.reviewer})"
26_BUCKET_LABELS = {
27 "consensus": "Consensus (all reviewers)",
28 "majority": "Majority",
29 "single_reviewer": "Single reviewer",
30 "disputed": "Disputed (needs human decision)",
31 "rejected": "Rejected (unsupported by verifier)",
32}
33_BUCKET_ORDER = ["consensus", "majority", "single_reviewer", "disputed", "rejected"]
35_STATUS_LABELS = {
36 "verified": "verified",
37 "unsupported": "unsupported",
38 "needs_human_decision": "needs human decision",
39}
42def _group_line(g) -> str:
43 f = g.representative
44 loc = f.file or "?"
45 if f.line is not None:
46 loc = f"{loc}:{f.line}"
47 reviewers = ", ".join(g.reviewers) if g.reviewers else "(unknown)"
49 parts = [f"- [{g.severity}] {loc} — {f.claim} (reviewers: {reviewers})"]
51 # Surface the reviewer's supporting evidence — the "why" behind the claim —
52 # so the verdict is auditable, not just asserted (issue: evidence surfacing).
53 if getattr(f, "evidence", ""):
54 parts.append(f"\n - _evidence:_ {f.evidence}")
56 status = getattr(g, "status", "")
57 if status:
58 reasoning = getattr(g, "status_reasoning", "")
59 if reasoning:
60 parts.append(f"\n - _verification:_ {_STATUS_LABELS.get(status, status)} — {reasoning}")
61 else:
62 parts.append(f"\n - _verification:_ {_STATUS_LABELS.get(status, status)}")
64 if f.suggested_fix:
65 parts.append(f"\n - _fix:_ {f.suggested_fix}")
67 return "".join(parts) if len(parts) > 1 else parts[0]
70def _metadata_block(metadata: dict) -> list[str]:
71 """Render the deterministic run-metadata section.
73 Intentionally omits non-deterministic fields (e.g. ``generated_at``) so the
74 Markdown report stays stable for snapshot tests. Per-agent durations are
75 deterministic under mock (0s) and scrubbed by the golden test's duration
76 normalizer otherwise. Wall-clock is labelled a cost proxy, not a dollar cost.
77 """
78 lines = ["## Run metadata\n"]
79 lines.append(f"- rounds executed: {metadata['rounds_executed']}")
80 # Adaptive-rounds explanation (issue #40): only shown when the orchestrator
81 # recorded a reason, so a plain fixed-N run stays unchanged.
82 if metadata.get("from_cache"):
83 lines.append("- ♻️ served from local cache (not re-computed)")
84 stop_reason = metadata.get("stop_reason")
85 if stop_reason:
86 lines.append(f"- rounds decision: {stop_reason}")
87 lines.append(f"- verify: {'on' if metadata['verify_enabled'] else 'off'}")
88 lines.append(f"- context mode: {metadata['context_mode']}")
89 # Partial-result signals (issue #30): only rendered when relevant so a
90 # complete, unbudgeted run is unaffected.
91 if metadata.get("budget_exhausted"):
92 lines.append("- ⚠️ run budget exhausted: some phases were skipped")
93 skipped = metadata.get("skipped") or []
94 if skipped:
95 names = ", ".join(f"{s['name']} ({s['reason']})" for s in skipped)
96 lines.append(f"- skipped agents (never ran): {names}")
97 retried = metadata.get("retried") or []
98 if retried:
99 lines.append(f"- retried agents: {', '.join(retried)}")
100 total = metadata["total_wall_clock_s"]
101 lines.append(f"- total wall-clock (cost proxy, not $): {total:.0f}s")
102 lines.append("")
103 lines.append("| agent | vendor | status | duration |")
104 lines.append("| --- | --- | --- | --- |")
105 for a in metadata["agents"]:
106 code = a.get("error_code")
107 status = a["status"] if not code else f"{a['status']} ({code})"
108 # Note a retried agent inline; attempts == 1 leaves the row unchanged.
109 attempts = a.get("attempts", 1)
110 if attempts and attempts > 1:
111 status += f", {attempts} attempts"
112 lines.append(
113 f"| {a['name']} | {a['vendor']} | {status} | {a['duration_s']:.0f}s |"
114 )
115 lines.append("")
116 lines.append(
117 "_Wall-clock seconds are an approximate cost proxy (no token counts are "
118 "available from the CLIs), not a dollar cost._\n"
119 )
120 return lines
def _classification_block(classification: dict) -> list[str]:
    """Build the lines of the compact ``## Classification`` section.

    The single summary line comes from ``_classification.summary_line``; the
    trailing empty string yields a blank line after the section once the
    report lines are joined.
    """
    summary = _classification.summary_line(classification)
    return ["## Classification\n", summary, ""]
def _consensus_block(groups) -> list[str]:
    """Build the lines of the ``## Consensus`` section, one sub-section per bucket.

    Buckets listed in ``_BUCKET_ORDER`` are rendered first, in that order. Any
    group whose ``bucket`` is not listed there is rendered afterwards (in
    first-seen order) under its raw bucket name, so such groups are never
    silently dropped from the report. Buckets with no groups are skipped.
    """
    lines = ["## Consensus\n"]
    # Seed the known buckets first: dicts preserve insertion order, so unknown
    # buckets added by ``setdefault`` below trail the known ones.
    by_bucket: dict[str, list] = {b: [] for b in _BUCKET_ORDER}
    for g in groups:
        by_bucket.setdefault(g.bucket, []).append(g)
    for bucket, members in by_bucket.items():
        if not members:
            continue
        # Unknown buckets have no label entry and fall back to the raw name.
        lines.append(f"### {_BUCKET_LABELS.get(bucket, bucket)}\n")
        lines.extend(_group_line(g) for g in members)
        lines.append("")
    return lines
153def _vote_block(vote) -> list[str]:
154 """Render the panel-vote verdict + tally + per-reviewer ballots (issue #220).
156 Vocabulary-agnostic: the tally renders whatever stances the vote carries
157 (code: REQUEST CHANGES/COMMENT/APPROVE; issue: NEEDS-INFO/UNCLEAR/READY).
158 """
159 lines = ["## Verdict — panel vote\n"]
160 tally = " · ".join(f"{n} {label.lower()}" for label, n in vote.tally.items())
161 lines.append(f"**{vote.verdict}** — {tally}\n")
162 for b in vote.ballots:
163 lines.append(f"- `{b.reviewer}`: **{b.vote}** ({b.reason})")
164 lines.append("")
165 return lines
168def _verdict_headline(synthesis, vote) -> str | None:
169 """One-line verdict for the report's TL;DR callout (pure, deterministic).
171 Prefers the panel vote's verdict when voting; otherwise lifts the opening
172 ``## Verdict`` line out of the chair's synthesis prose — both the code and
173 issue synthesis prompts mandate a ``## Verdict\\n<LABEL> — <one sentence>``
174 first section, so the lift is reliable. The verdict sentence may wrap across
175 lines; they are joined into one. Returns ``None`` when neither source is
176 available (failed/absent synthesis, deviating output) so the caller simply
177 omits the callout — it is purely additive, never replacing a section.
178 """
179 if vote is not None and getattr(vote, "verdict", None):
180 return vote.verdict
181 if synthesis is None or not getattr(synthesis, "ok", False):
182 return None
183 rows = (synthesis.output or "").splitlines()
184 for i, row in enumerate(rows):
185 if row.strip().lower().lstrip("#").strip() == "verdict":
186 collected: list[str] = []
187 for nxt in rows[i + 1:]:
188 if nxt.strip().startswith("#"):
189 break
190 if not nxt.strip():
191 if collected:
192 break
193 continue
194 collected.append(nxt.strip())
195 return " ".join(collected) or None
196 return None
def render(
    reviews: list[AgentResult],
    debate: list[AgentResult],
    synthesis: AgentResult | None,
    *,
    chair: str,
    findings: list[Finding] | None = None,
    warnings: list[str] | None = None,
    groups: list | None = None,
    verify: AgentResult | None = None,
    context_mode: str | None = None,
    redact_secrets: bool | None = None,
    redaction_count: int = 0,
    metadata: dict | None = None,
    classification: dict | None = None,
    review_scope: str | None = None,
    vote=None,
) -> str:
    """Render the default single-document report as one markdown string.

    Section order: title, optional TL;DR callout, panel, optional review-scope
    note, classification, optional context policy, consensus, panel vote,
    verification, chair synthesis, structured findings (plus agent warnings),
    the raw Round 1 / Round 2 outputs, optional run metadata, and the footer.
    ``None`` for ``findings``/``warnings``/``groups`` is treated as empty, and
    ``classification`` is derived via ``_classification.classify`` when omitted.
    """
    findings = findings or []
    warnings = warnings or []
    groups = groups or []
    out: list[str] = ["# 🏛️ AI Jury\n"]

    # TL;DR callout: hoist the verdict to the very top so the outcome is the
    # first thing a reader sees. Purely additive — omitted without a verdict.
    headline = _verdict_headline(synthesis, vote)
    if headline:
        out.append(f"> ⚡ **TL;DR · {headline}**\n")

    members = [f"`{r.agent}` ({r.vendor})" for r in reviews]
    out.append(f"**Panel:** {', '.join(members)}\n")

    # Review-scope note (issue #9): rendered only when the caller supplies it,
    # so the default report is unchanged.
    if review_scope:
        out.append(f"{review_scope}\n")

    # Compact PR-level classification (issue #7); derived from the structured
    # findings/groups when not passed in, so the section always renders.
    if classification is None:
        classification = _classification.classify(findings=findings, groups=groups)
    out.extend(_classification_block(classification))

    if not (context_mode is None and redact_secrets is None):
        out.append("## Context policy\n")
        if context_mode is not None:
            out.append(f"- context mode: {context_mode}")
        if redact_secrets is not None:
            if redact_secrets:
                out.append(f"- secret redaction: on ({redaction_count} redacted)")
            else:
                out.append("- secret redaction: off")
        out.append("")

    if groups:
        out.extend(_consensus_block(groups))
        out.append("---\n")

    # Panel-vote verdict (issue #220): when voting, the tally is the headline
    # verdict and the chair's synthesis becomes supporting reasoning.
    if vote is not None:
        out.extend(_vote_block(vote))
        out.append("---\n")

    if verify is not None:
        out.append("## Verification\n")
        out.append(f"> Verified by `{chair}`\n")
        if verify.ok:
            out.append(verify.output.strip() + "\n")
        else:
            out.append(f"_Verification failed: {verify.error}_\n")
        out.append("---\n")

    chair_heading = "Chair verdict" if vote is None else "Chair's reasoning"
    if synthesis:
        out.append(f"## {chair_heading}\n")
        if synthesis.ok:
            out.append(f"> Synthesized by `{chair}`\n")
            out.append(synthesis.output.strip() + "\n")
        else:
            out.append(f"_Synthesis failed: {synthesis.error}_\n")

    out.append("---\n")
    out.append("## Structured findings\n")
    if findings:

        def _rank(item):
            # ``file``/``line`` may be None (a finding need not be located);
            # coerce them so None is never compared against str/int.
            return (SEVERITY_ORDER.get(item.severity, 99), item.file or "", item.line or 0)

        out.extend(_finding_line(item) for item in sorted(findings, key=_rank))
        out.append("")
    else:
        out.append("_(no structured findings parsed)_\n")

    if warnings:
        out.append("> ⚠️ agent output warnings\n")
        out.extend(f"- {w}" for w in warnings)
        out.append("")

    out.append("## Round 1 — independent reviews\n")
    for res in reviews:
        if res.ok:
            tail, text = f"{res.duration_s:.0f}s", res.output
        else:
            tail, text = _fail_status(res), ""
        out.append(_block(f"`{res.agent}` ({res.vendor}) — {tail}", text))

    if debate:
        out.append("## Round 2 — cross-examination\n")
        for res in debate:
            if res.ok:
                tail, text = f"{res.duration_s:.0f}s", res.output
            else:
                tail, text = _fail_status(res), ""
            out.append(_block(f"`{res.agent}` — {tail}", text))

    if metadata is not None:
        out.append("---\n")
        out.extend(_metadata_block(metadata))

    out.append("---")
    out.append(
        "\n<sub>Generated by "
        "[ai-jury](https://github.com/berkayturanci/ai-jury)"
        " — a cross-vendor multi-agent PR review jury.</sub>"
    )
    return "\n".join(out)
328_LIVE_LABELS = {
329 "review": "Round 1 review",
330 "debate": "Cross-examination",
331 "verify": "Verification",
332 "synthesis": "Decision — verdict & reasoning",
333}
def render_live_step(kind: str, result: AgentResult, round_no: int | None = None) -> tuple[str, str]:
    """Format one streamed step as ``(title, body)`` for live output (issue #210).

    Pure — no I/O; the CLI ``--live`` handler prints the pair and (with
    ``--pr``) posts it as its own comment as each step completes. ``kind`` is
    one of review / debate / verify / synthesis; any other value is used
    verbatim as the label. A truthy ``round_no`` is shown only for debate
    steps. Verify and synthesis steps are attributed to the chair. The body is
    the stripped output of a successful result, or ``_(no output)_`` when the
    result failed or produced nothing.
    """
    if kind == "debate" and round_no:
        label = f"Cross-examination · round {round_no}"
    else:
        label = _LIVE_LABELS.get(kind, kind)

    chaired = kind in ("verify", "synthesis")
    who = f"chair `{result.agent}`" if chaired else f"`{result.agent}` ({result.vendor})"

    if result.ok:
        status = f"{result.duration_s:.0f}s"
    else:
        status = _fail_status(result)
    title = f"🏛️ AI Jury — {label}: {who} — {status}"

    body = result.output.strip() if result.ok else ""
    if not body:
        body = "_(no output)_"
    return title, body
def _conversation_blocks(
    reviews: list[AgentResult],
    debate: list[AgentResult],
    synthesis: AgentResult | None,
    verify: AgentResult | None,
    *,
    chair: str,
) -> list[str]:
    """Build the chronological deliberation as report lines.

    Order: every reviewer's raw Round 1 output, the debate exchanges (when
    any), the verification (when one ran), and finally the chair's decision
    with its reasoning — so a reader can follow who said what and why the
    chair ruled as it did. The decision heading is always emitted; a
    placeholder is shown when no synthesis was produced.
    """
    out: list[str] = ["## Round 1 — independent reviews\n"]
    for res in reviews:
        if res.ok:
            tail, text = f"{res.duration_s:.0f}s", res.output
        else:
            tail, text = _fail_status(res), ""
        out.append(_block(f"`{res.agent}` ({res.vendor}) — {tail}", text))

    if debate:
        out.append("## Round 2 — cross-examination (debate)\n")
        for res in debate:
            if res.ok:
                tail, text = f"{res.duration_s:.0f}s", res.output
            else:
                tail, text = _fail_status(res), ""
            out.append(_block(f"`{res.agent}` — {tail}", text))

    if verify is not None:
        out.append("## Verification\n")
        out.append(f"> Verified by `{chair}`\n")
        if verify.ok:
            out.append(verify.output.strip() + "\n")
        else:
            out.append(f"_Verification failed: {verify.error}_\n")

    out.append("## Decision — verdict & reasoning\n")
    if not synthesis:
        out.append("_(no synthesis produced)_\n")
    elif synthesis.ok:
        out.append(f"> Decided by `{chair}`\n")
        out.append(synthesis.output.strip() + "\n")
    else:
        out.append(f"_Synthesis failed: {synthesis.error}_\n")
    return out
def _summary_blocks(
    findings: list[Finding],
    warnings: list[str],
    groups: list,
    classification: dict,
    vote=None,
) -> list[str]:
    """Build the at-a-glance recap: classification, vote, consensus, findings.

    The vote and consensus sections appear only when a vote / groups are
    given. The structured-findings heading is always emitted, with a
    placeholder when there are no findings; findings are ordered by severity
    rank, then file, then line. Agent output warnings follow when present.
    """
    out = [*_classification_block(classification)]
    if vote is not None:
        out += _vote_block(vote)
    if groups:
        out += _consensus_block(groups)

    out.append("## Structured findings\n")
    if not findings:
        out.append("_(no structured findings parsed)_\n")
    else:

        def _rank(item):
            # ``file``/``line`` may be None; coerce so the tuples stay comparable.
            return (SEVERITY_ORDER.get(item.severity, 99), item.file or "", item.line or 0)

        for item in sorted(findings, key=_rank):
            out.append(_finding_line(item))
        out.append("")

    if warnings:
        out.append("> ⚠️ agent output warnings\n")
        for w in warnings:
            out.append(f"- {w}")
        out.append("")
    return out
def render_transcript(
    reviews: list[AgentResult],
    debate: list[AgentResult],
    synthesis: AgentResult | None,
    *,
    chair: str,
    findings: list[Finding] | None = None,
    warnings: list[str] | None = None,
    groups: list | None = None,
    verify: AgentResult | None = None,
    context_mode: str | None = None,
    redact_secrets: bool | None = None,
    redaction_count: int = 0,
    metadata: dict | None = None,
    classification: dict | None = None,
    review_scope: str | None = None,
    lead_with_summary: bool = False,
    vote=None,
) -> str:
    """Render the full play-by-play transcript (``--transcript`` / ``--verbose``).

    One function, two layouts:

    * ``lead_with_summary=False`` (``--transcript``) — conversation first:
      Round 1 → debate → verification → the chair's decision & reasoning,
      followed by a compact consensus/findings recap for auditability.
    * ``lead_with_summary=True`` (``--verbose``) — the consensus/verdict
      summary first, then the same full transcript beneath it.

    Both layouts share the title, TL;DR callout, panel line, review-scope
    note, context policy, optional run metadata and footer. The default
    :func:`render` is a separate code path and is not affected by this one.
    """
    findings = findings or []
    warnings = warnings or []
    groups = groups or []
    if classification is None:
        classification = _classification.classify(findings=findings, groups=groups)

    if lead_with_summary:
        title = "# 🏛️ AI Jury — verbose report\n"
    else:
        title = "# 🏛️ AI Jury — full transcript\n"
    out: list[str] = [title]

    # TL;DR callout (parity with render()): the verdict headline leads the
    # verbose/transcript report too, so every renderer shows the outcome first.
    headline = _verdict_headline(synthesis, vote)
    if headline:
        out.append(f"> ⚡ **TL;DR · {headline}**\n")

    members = [f"`{r.agent}` ({r.vendor})" for r in reviews]
    out.append(f"**Panel:** {', '.join(members)}\n")
    if review_scope:
        out.append(f"{review_scope}\n")

    # Disclose the context/redaction policy (parity with render()): readers of
    # a shared transcript should see whether secrets were redacted before the
    # diff reached the agents.
    if not (context_mode is None and redact_secrets is None):
        out.append("## Context policy\n")
        if context_mode is not None:
            out.append(f"- context mode: {context_mode}")
        if redact_secrets is not None:
            if redact_secrets:
                out.append(f"- secret redaction: on ({redaction_count} redacted)")
            else:
                out.append("- secret redaction: off")
        out.append("")

    if lead_with_summary:
        out += _summary_blocks(findings, warnings, groups, classification, vote=vote)
        out.append("---\n")
        out.append("# Full transcript\n")
        out += _conversation_blocks(reviews, debate, synthesis, verify, chair=chair)
    else:
        out += _conversation_blocks(reviews, debate, synthesis, verify, chair=chair)
        out.append("---\n")
        out += _summary_blocks(findings, warnings, groups, classification, vote=vote)

    if metadata is not None:
        out.append("---\n")
        out += _metadata_block(metadata)

    out.append("---")
    out.append(
        "\n<sub>Generated by "
        "[ai-jury](https://github.com/berkayturanci/ai-jury)"
        " — a cross-vendor multi-agent PR review jury.</sub>"
    )
    return "\n".join(out)
def render_sections(
    reviews: list[AgentResult],
    debate: list[AgentResult],
    synthesis: AgentResult | None,
    *,
    chair: str,
    findings: list[Finding] | None = None,
    warnings: list[str] | None = None,
    groups: list | None = None,
    verify: AgentResult | None = None,
    classification: dict | None = None,
    vote=None,
) -> list[tuple[str, str]]:
    """Split the report into ordered ``(title, body)`` sections for phased posting.

    Returns up to three sections so a PR can show the flow as separate,
    readable comments (issue #127):

    * **Round 1** — the panel line plus every independent review;
    * **Round 2** — the debate, omitted when there was none;
    * **Decision** — classification, panel vote, consensus, verification,
      chair synthesis, structured findings and agent warnings.

    Each body is the section's lines joined by newlines and stripped.
    ``render()`` (the single-blob report) is a separate code path.
    """
    findings = findings or []
    warnings = warnings or []
    groups = groups or []
    sections: list[tuple[str, str]] = []

    # Round 1 — independent reviews.
    members = ", ".join(f"`{r.agent}` ({r.vendor})" for r in reviews)
    round_one = [f"**Panel:** {members}\n"]
    for res in reviews:
        if res.ok:
            tail, text = f"{res.duration_s:.0f}s", res.output
        else:
            tail, text = _fail_status(res), ""
        round_one.append(_block(f"`{res.agent}` ({res.vendor}) — {tail}", text))
    sections.append(
        ("🏛️ AI Jury — Round 1: independent reviews", "\n".join(round_one).strip())
    )

    # Round 2 — cross-examination (only if a debate ran).
    if debate:
        round_two = []
        for res in debate:
            if res.ok:
                tail, text = f"{res.duration_s:.0f}s", res.output
            else:
                tail, text = _fail_status(res), ""
            round_two.append(_block(f"`{res.agent}` — {tail}", text))
        sections.append(
            ("🏛️ AI Jury — Round 2: cross-examination (debate)", "\n".join(round_two).strip())
        )

    # Decision — classification, vote, consensus, verification, synthesis, findings.
    if classification is None:
        classification = _classification.classify(findings=findings, groups=groups)
    decision: list[str] = [*_classification_block(classification)]
    if vote is not None:
        decision += _vote_block(vote)
    if groups:
        decision += _consensus_block(groups)

    if verify is not None:
        decision.append("## Verification\n")
        decision.append(f"> Verified by `{chair}`\n")
        if verify.ok:
            decision.append(verify.output.strip() + "\n")
        else:
            decision.append(f"_Verification failed: {verify.error}_\n")

    chair_heading = "Chair verdict" if vote is None else "Chair's reasoning"
    if synthesis:
        # Heading and detail are separate entries; joined by "\n" they yield
        # the same text as a single "heading\n\ndetail" string.
        decision.append(f"## {chair_heading}\n")
        if synthesis.ok:
            decision.append(f"> Synthesized by `{chair}`\n")
            decision.append(synthesis.output.strip() + "\n")
        else:
            decision.append(f"_Synthesis failed: {synthesis.error}_\n")

    if findings:

        def _rank(item):
            # ``file``/``line`` may be None; coerce so the tuples stay comparable.
            return (SEVERITY_ORDER.get(item.severity, 99), item.file or "", item.line or 0)

        decision.append("## Structured findings\n")
        for item in sorted(findings, key=_rank):
            decision.append(_finding_line(item))

    if warnings:
        decision.append("\n> ⚠️ agent output warnings\n")
        for w in warnings:
            decision.append(f"- {w}")

    sections.append(
        ("🏛️ AI Jury — Decision: verdict & consensus", "\n".join(decision).strip())
    )
    return sections