Coverage for src/ai_jury/metadata.py: 100%

1"""Run metadata and cost-awareness (wall-clock proxy) reporting.

3Builds a machine-readable metadata dict describing a jury run: which

4agents participated, their per-agent status and wall-clock duration, how many

5rounds ran, whether verification was enabled, and timestamps.

7IMPORTANT: This metadata deliberately contains NO diff text, NO prompt text,

8NO agent output, and NO secrets -- only structural/operational signals.

10There are no token counts available from the underlying CLIs, so wall-clock

11seconds are used as an approximate cost *proxy*, not a dollar cost.

12"""

14from __future__ import annotations

16from datetime import UTC, datetime

17from typing import TYPE_CHECKING

19if TYPE_CHECKING: # pragma: no cover - typing only

20 from .config import JuryConfig

21 from .orchestrator import JuryOutcome

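# v2 (issue #30/#40) added: stop_reason, skipped, retried, budget_exhausted,
# execution{...}, and per-agent ``attempts``.
# NOTE(review): the constant below is 3, but no v3 additions are listed here --
# presumably the ``decision``/``vote`` fields (issue #220); confirm and record.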

25SCHEMA_VERSION = 3

def _agent_entry(result) -> dict:
    """Summarize one agent result as a metadata entry.

    Copies operational fields only. The result's ``output`` and ``error``
    text are deliberately left out, since they could carry raw prompt/diff
    content or secrets.

    Returns a dict with the keys ``name``, ``vendor``, ``status``
    (``"ok"`` or ``"failed"``), ``duration_s`` (rounded to 3 decimals),
    ``error_code`` and ``attempts``.
    """
    entry = dict(
        name=result.agent,
        vendor=result.vendor,
        status="ok" if result.ok else "failed",
        duration_s=round(float(result.duration_s), 3),
        error_code=result.error_code,
    )
    # Attempt count (issue #30): a value above 1 means a transient failure was
    # retried before this outcome. A missing or falsy value counts as one
    # attempt.
    attempt_count = getattr(result, "attempts", 1)
    entry["attempts"] = int(attempt_count) if attempt_count else 1
    return entry

def _rounds_executed(outcome: JuryOutcome) -> int:
    """Return how many rounds the run executed.

    Uses ``outcome.rounds_executed`` when it is an int of at least 1 (the
    orchestrator's own count; adaptive rounds, issue #40). Otherwise the
    count is inferred from the phases that produced output: one round if
    there are reviews, plus one if there is debate output.
    """
    authoritative = getattr(outcome, "rounds_executed", None)
    if not isinstance(authoritative, int) or authoritative < 1:
        # No usable recorded count: infer from the populated phases.
        return int(bool(outcome.reviews)) + int(bool(outcome.debate))
    return authoritative

def build_run_metadata(
    outcome: JuryOutcome, config: JuryConfig, *, decision=None, vote=None
) -> dict:
    """Assemble the machine-readable metadata record for one jury run.

    By design the record carries only structural/operational signals: no
    diff text, prompt text, agent output, or secrets.

    Args:
        outcome: The finished run. Read for ``reviews``, ``debate``,
            ``synthesis``, ``verify``, ``context_mode``, ``redact_secrets``
            and ``redaction_count``, plus the optional attributes
            ``skipped``, ``from_cache``, ``stop_reason``,
            ``budget_exhausted`` and ``rounds_executed``.
        config: The run configuration (timeouts, retries, seed, ...).
        decision: Final-verdict mode override (issue #220); when falsy,
            ``config.decision`` is reported instead.
        vote: Voting tally object exposing ``verdict``, ``tally`` and
            ``ballots``, or ``None`` when no vote took place.

    Returns:
        A dict of run metadata. ``agents`` lists only the round-1 review
        panel, while ``total_wall_clock_s`` sums the durations of every
        phase result (review, debate, synthesis, verify).
    """
    # Round-1 reviewers are the canonical per-agent view; later phases re-run
    # panel members, so listing them again would duplicate entries.
    panel = [_agent_entry(review) for review in outcome.reviews]

    # Every phase result contributes to the wall-clock cost proxy.
    timed_results = [*outcome.reviews, *outcome.debate]
    if outcome.synthesis is not None:
        timed_results.append(outcome.synthesis)
    if outcome.verify is not None:
        timed_results.append(outcome.verify)
    wall_clock_total = round(
        sum(float(item.duration_s) for item in timed_results), 3
    )

    # Imported at call time rather than at module import.
    # Reproducibility signals (issue #41): ``config_hash`` supplies the config
    # hash reported alongside the configured seed (which may be None).
    from .classification import classify
    from .config import config_hash

    # Partial-result signals (issue #30): ``skipped`` holds (name, reason)
    # pairs for agents that never ran; ``retried`` names panel agents that
    # needed more than one attempt.
    skipped_pairs = getattr(outcome, "skipped", []) or []
    skipped_agents = [{"name": who, "reason": why} for who, why in skipped_pairs]
    retried_agents = [entry["name"] for entry in panel if entry["attempts"] > 1]

    # Final-verdict mode (issue #220): explicit override, else the config value.
    effective_decision = decision if decision else config.decision

    if vote is None:
        vote_summary = None
    else:
        # NOTE(review): ballot ``reason`` is copied verbatim -- confirm it
        # cannot carry raw agent output.
        vote_summary = {
            "verdict": vote.verdict,
            "tally": vote.tally,
            "ballots": [
                {
                    "reviewer": ballot.reviewer,
                    "vote": ballot.vote,
                    "reason": ballot.reason,
                }
                for ballot in vote.ballots
            ],
        }

    # Built in sections; insertion order fixes the serialized key order.
    metadata = {
        "schema_version": SCHEMA_VERSION,
        "decision": effective_decision,
        "vote": vote_summary,
        "agents": panel,
        "rounds_executed": _rounds_executed(outcome),
        "from_cache": bool(getattr(outcome, "from_cache", False)),
        "stop_reason": getattr(outcome, "stop_reason", "") or "",
        "skipped": skipped_agents,
        "retried": retried_agents,
        "budget_exhausted": bool(getattr(outcome, "budget_exhausted", False)),
    }

    # Configured execution limits (issue #30 / #40).
    metadata["execution"] = {
        "total_timeout": config.total_timeout,
        "phase_timeout": config.phase_timeout,
        "retries": config.retries,
        "early_stop": config.early_stop,
        "max_rounds": config.effective_max_rounds,
    }

    metadata["verify_enabled"] = bool(config.verify)
    metadata["context_mode"] = outcome.context_mode
    metadata["redact_secrets"] = bool(outcome.redact_secrets)
    metadata["redaction_count"] = outcome.redaction_count
    metadata["seed"] = config.seed
    metadata["config_hash"] = config_hash(config)

    # PR-level classification (issue #7), produced by ``classify``.
    metadata["classification"] = classify(outcome)

    # Wall-clock is an approximate COST PROXY, not a dollar cost: the
    # underlying CLIs expose no token counts.
    metadata["total_wall_clock_s"] = wall_clock_total
    metadata["cost_signal"] = "wall-clock-proxy"
    metadata["generated_at"] = datetime.now(UTC).isoformat()
    return metadata

144

145

146# end