Coverage for src/ai_jury/metadata.py: 100%
31 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-05 20:29 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-05 20:29 +0000
1"""Run metadata and cost-awareness (wall-clock proxy) reporting.
3Builds a machine-readable metadata dict describing a jury run: which
4agents participated, their per-agent status and wall-clock duration, how many
5rounds ran, whether verification was enabled, and timestamps.
7IMPORTANT: This metadata deliberately contains NO diff text, NO prompt text,
8NO agent output, and NO secrets -- only structural/operational signals.
10There are no token counts available from the underlying CLIs, so wall-clock
11seconds are used as an approximate cost *proxy*, not a dollar cost.
12"""
14from __future__ import annotations
16from datetime import UTC, datetime
17from typing import TYPE_CHECKING
19if TYPE_CHECKING: # pragma: no cover - typing only
20 from .config import JuryConfig
21 from .orchestrator import JuryOutcome
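# v2 (issue #30/#40) added: stop_reason, skipped, retried, budget_exhausted,
# execution{...}, and per-agent ``attempts``.
# NOTE(review): the constant below is 3, but only the v2 additions are listed
# here -- confirm what v3 introduced and record it alongside the v2 entry.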
25SCHEMA_VERSION = 3
28def _agent_entry(result) -> dict:
29 """Build a single agent metadata entry.
31 Only operational fields are copied -- never ``output`` or ``error`` text,
32 which could contain raw prompt/diff content or secrets.
33 """
34 return {
35 "name": result.agent,
36 "vendor": result.vendor,
37 "status": "ok" if result.ok else "failed",
38 "duration_s": round(float(result.duration_s), 3),
39 "error_code": result.error_code,
40 # Number of attempts made (issue #30): >1 means a transient failure was
41 # retried before this outcome.
42 "attempts": int(getattr(result, "attempts", 1) or 1),
43 }
46def _rounds_executed(outcome: JuryOutcome) -> int:
47 # Prefer the orchestrator's authoritative count (adaptive rounds, issue #40);
48 # fall back to inferring it from the phases that produced output.
49 recorded = getattr(outcome, "rounds_executed", None)
50 if isinstance(recorded, int) and recorded >= 1:
51 return recorded
52 rounds = 1 if outcome.reviews else 0
53 if outcome.debate:
54 rounds += 1
55 return rounds
def build_run_metadata(outcome: JuryOutcome, config: JuryConfig, *, decision=None, vote=None) -> dict:
    """Assemble the machine-readable metadata dict describing a jury run.

    The result is safe to serialize as JSON and contains no diff text, prompt
    text, agent output, or secrets.

    ``agents`` holds one entry per review-panel (round 1) result, while
    ``total_wall_clock_s`` adds up the durations of every phase result
    (review, debate, synthesis, verify), so it reflects the full run cost
    proxy even though the later phases re-run panel agents rather than adding
    distinct participants.

    ``decision`` is the effective final-verdict mode (issue #220): the value
    passed in, or ``config.decision`` when that is falsy. ``vote`` is the
    voting tally object, or ``None`` when no vote took place.
    """
    # Function-scope imports of the package helpers used below.
    from .classification import classify
    from .config import config_hash

    # Canonical per-agent view: round-1 participants only, which avoids
    # listing the chair again for later phases.
    panel = [_agent_entry(review) for review in outcome.reviews]

    # Every phase result contributes to the wall-clock total.
    phase_results = [*outcome.reviews, *outcome.debate]
    if outcome.synthesis is not None:
        phase_results.append(outcome.synthesis)
    if outcome.verify is not None:
        phase_results.append(outcome.verify)
    wall_clock = round(sum(float(item.duration_s) for item in phase_results), 3)

    # Execution / partial-result signals (issue #30): ``skipped`` lists agents
    # whose CLI was unavailable so they never ran; ``retried`` names panel
    # agents that needed more than one attempt.
    unavailable = getattr(outcome, "skipped", []) or []
    skipped = [{"name": who, "reason": why} for who, why in unavailable]
    retried = [entry["name"] for entry in panel if entry["attempts"] > 1]

    # Final-verdict mode (issue #220): CLI override wins, else the config.
    effective_decision = decision if decision else config.decision

    if vote is None:
        vote_meta = None
    else:
        ballots = [
            {"reviewer": ballot.reviewer, "vote": ballot.vote, "reason": ballot.reason}
            for ballot in vote.ballots
        ]
        vote_meta = {
            "verdict": vote.verdict,
            "tally": vote.tally,
            "ballots": ballots,
        }

    execution = {
        "total_timeout": config.total_timeout,
        "phase_timeout": config.phase_timeout,
        "retries": config.retries,
        "early_stop": config.early_stop,
        "max_rounds": config.effective_max_rounds,
    }

    return {
        "schema_version": SCHEMA_VERSION,
        "decision": effective_decision,
        "vote": vote_meta,
        "agents": panel,
        "rounds_executed": _rounds_executed(outcome),
        "from_cache": bool(getattr(outcome, "from_cache", False)),
        # Adaptive-round signal (issue #40): why debate ran or stopped.
        "stop_reason": getattr(outcome, "stop_reason", "") or "",
        "skipped": skipped,
        "retried": retried,
        # Flags a run that stopped early on the total timeout.
        "budget_exhausted": bool(getattr(outcome, "budget_exhausted", False)),
        "execution": execution,
        "verify_enabled": bool(config.verify),
        "context_mode": outcome.context_mode,
        "redact_secrets": bool(outcome.redact_secrets),
        "redaction_count": outcome.redaction_count,
        # Reproducibility signals (issue #41): the configured run seed (may be
        # None when unseeded) and a hash of the effective config.
        "seed": config.seed,
        "config_hash": config_hash(config),
        # PR-level classification (issue #7), derived from the outcome; no
        # diff text is included.
        "classification": classify(outcome),
        # Wall-clock is an approximate COST PROXY, not a dollar cost. No
        # token counts are available from the underlying CLIs.
        "total_wall_clock_s": wall_clock,
        "cost_signal": "wall-clock-proxy",
        "generated_at": datetime.now(UTC).isoformat(),
    }
146# end