Coverage for src/ai_jury/voting.py: 100%

1"""Panel voting: derive a verdict by tallying the reviewers instead of letting a

2single chair decide (issue #220).

4Pure and deterministic — a function of the consensus groups and the reviewer

5names only (no I/O, no wall-clock, no randomness). Each reviewer's vote is

6derived from the worst-severity finding they raised that the verifier did not

7reject ("unsupported"); the panel verdict is the majority, ties broken toward the

8more conservative stance. This is a rendering/aggregation layer: it never changes

9how agents run, and the severity-based CI gate (:func:`ai_jury.ci.evaluate_ci`)

10remains the independent hard safety check.

11"""

13from __future__ import annotations

15from dataclasses import dataclass, field

17from .findings import SEVERITIES, SEVERITY_ORDER

# Code-review verdict vocabulary: the stances a reviewer (and the panel) can
# take on a diff.
APPROVE = "APPROVE"
COMMENT = "COMMENT"
REQUEST_CHANGES = "REQUEST CHANGES"
# Mode-independent verdict returned by ``tally_votes`` when there are no ballots
# to count.
NO_QUORUM = "NO QUORUM"

# Issue-review verdict vocabulary (issue #230): the panel votes over an issue's
# completeness instead of a diff's correctness.
READY = "READY"
NEEDS_INFO = "NEEDS-INFO"
UNCLEAR = "UNCLEAR"

# Per-mode vocabulary: the worst gap/finding a reviewer raised maps to a stance
# (blocking severity → strict, middling → soft, none → clear), and ties resolve
# to the strictest stance via the ``order`` (higher = stricter).
#
# ``tally_votes`` looks a mode up by name and falls back to the ``"code"`` entry
# for any unknown mode. The key order of each ``order`` mapping is also the key
# order of the tally dict that ``tally_votes`` returns.
_MODES = {
    "code": {
        "blocking": REQUEST_CHANGES,
        "middling": COMMENT,
        "clear": APPROVE,
        # Keys are listed strictest-first so the rendered tally reads
        # "request changes · comment · approve"; the integer values (not key
        # order) drive the tie-break, so display order is free to be intuitive.
        "order": {REQUEST_CHANGES: 2, COMMENT: 1, APPROVE: 0},
    },
    "issue": {
        "blocking": NEEDS_INFO,
        "middling": UNCLEAR,
        "clear": READY,
        # Same convention as the "code" entry: strictest-first keys, with the
        # integer values deciding ties.
        "order": {NEEDS_INFO: 2, UNCLEAR: 1, READY: 0},
    },
}

# A reviewer that did not actually review (an empty reply, or a short refusal /
# safety-decline like "I can't assist with that request") must NOT be counted as
# a "clear" vote — a non-answer is not an approval (issue #251). Such reviewers
# abstain and are dropped from the tally entirely. Kept conservative: a genuine
# clean review has substantive prose and won't match. Markers are written in the
# *normalized* (expanded-contraction) form below.
#
# Matching is a plain substring test against lowercased, contraction-expanded
# text, so every marker must itself be lowercase and use "cannot" / "will not" /
# "i am" rather than the contracted spelling.
_ABSTENTION_MARKERS = (
    "i cannot assist",
    "i cannot help",
    "cannot help with that",
    "cannot comply",
    "cannot do that",
    "i am unable to",
    "i will not be able to",
    "unable to assist",
    "unable to help",
)

69def _normalize_refusal(text: str) -> str:

70 """Fold contraction/apostrophe variants so "can't X" and "cannot X" match the

71 same marker (issue #251 follow-up: the jury caught "can't comply" slipping

72 past a "cannot comply"-only list)."""

73 t = text.replace("’", "'") # smart apostrophe → ascii

74 return t.replace("can't", "cannot").replace("won't", "will not").replace("i'm ", "i am ")

77def is_abstention(output) -> bool:

78 """True when a review is empty or a recognizable non-review (refusal).

80 Empty/whitespace output always abstains. A short reply (no real review body)

81 that contains a refusal marker also abstains; longer substantive reviews do

82 not, even if they happen to quote one of these phrases. Contraction variants

83 (``can't``/``cannot``, ``won't``/``will not``, ``i'm``/``i am``) are folded so

84 a marker need only be listed once.

85 """

86 text = (output or "").strip().lower()

87 if not text:

88 return True

89 return len(text) < 400 and any(m in _normalize_refusal(text) for m in _ABSTENTION_MARKERS)

# Worst-severity thresholds. critical/major are blocking; minor/nit are middling.
# ``_severity_to_vote`` compares a rank against these with ``<=``, so a lower
# rank means a more severe finding.
# NOTE(review): assumes SEVERITY_ORDER ranks "critical" at or below "major" and
# "minor" between "major" and "nit" — confirm against ``findings``.
_MAJOR_RANK = SEVERITY_ORDER["major"]
_NIT_RANK = SEVERITY_ORDER["nit"]

def _severity_to_vote(rank: int, vocab: dict) -> str:
    """Translate a worst-finding severity rank into a stance from ``vocab``.

    Args:
        rank: Severity rank of the reviewer's worst finding; lower is more severe.
        vocab: A vocabulary mapping with ``"blocking"``, ``"middling"`` and
            ``"clear"`` keys (one entry of ``_MODES``).

    Returns:
        The blocking stance for ranks up to ``_MAJOR_RANK``, the middling stance
        for ranks up to ``_NIT_RANK``, and the clear stance for anything beyond.
    """
    bucket = (
        "blocking"
        if rank <= _MAJOR_RANK
        else "middling"
        if rank <= _NIT_RANK
        else "clear"
    )
    return vocab[bucket]

103

104

@dataclass
class Ballot:
    """One reviewer's vote, plus a short human-readable justification."""

    # Name of the reviewer who cast this ballot.
    reviewer: str
    # Stance string taken from the active mode's vocabulary.
    vote: str
    # Why the reviewer landed on that stance (e.g. their worst finding).
    reason: str

110

111

@dataclass
class VoteResult:
    """Outcome of a panel vote: the verdict, the per-stance counts, the ballots."""

    # Winning stance, or the no-quorum verdict when nobody voted.
    verdict: str
    # Stance -> number of ballots cast for it.
    tally: dict = field(default_factory=dict)
    # Individual ballots, in the order they were recorded.
    ballots: list[Ballot] = field(default_factory=list)

117

118

def tally_votes(groups, reviewers, *, mode: str = "code") -> VoteResult:
    """Tally a panel verdict from the consensus ``groups`` and ``reviewers``.

    Each reviewer votes from the worst-severity group they contributed to that was
    not marked ``unsupported`` by the verifier. The verdict is whichever stance has
    the most ballots; a tie resolves toward the strictest stance. The vocabulary is
    mode-aware (issue #230): ``code`` → REQUEST CHANGES > COMMENT > APPROVE;
    ``issue`` → NEEDS-INFO > UNCLEAR > READY (the panel judges completeness, not
    correctness). With no reviewers the verdict is ``NO QUORUM``.

    This function does not filter abstentions itself: every name in
    ``reviewers`` receives exactly one ballot.

    Args:
        groups: Iterable of consensus groups. Each is read via ``getattr`` for
            ``reviewers`` (names that raised it), ``status`` and ``severity``;
            missing or ``None`` attributes are tolerated. May be a one-shot
            iterable — it is materialized once up front.
        reviewers: Iterable of reviewer names; ``None`` is treated as empty.
        mode: Vocabulary to vote in; unknown values fall back to ``"code"``.

    Returns:
        A ``VoteResult`` carrying the verdict, a stance → count tally (every
        stance present, strictest first) and one ``Ballot`` per reviewer.
    """
    vocab = _MODES.get(mode, _MODES["code"])
    order = vocab["order"]
    # Unknown/missing severities count as the last (least severe) known one.
    fallback_rank = len(SEVERITIES) - 1

    # Materialize the groups once and drop verifier-rejected ones. The groups
    # are scanned again for every reviewer, so a generator would otherwise be
    # exhausted after the first reviewer and everyone else would vote "clear".
    supported = [
        g for g in (groups or ()) if (getattr(g, "status", "") or "") != "unsupported"
    ]

    ballots: list[Ballot] = []
    for rv in reviewers or ():
        # ``or ()`` also covers a group whose ``reviewers`` is explicitly None,
        # matching how a None ``status`` is tolerated above.
        ranks = [
            SEVERITY_ORDER.get(getattr(g, "severity", ""), fallback_rank)
            for g in supported
            if rv in (getattr(g, "reviewers", None) or ())
        ]
        if not ranks:
            ballots.append(Ballot(rv, vocab["clear"], "no supported findings raised"))
            continue
        worst_rank = min(ranks)  # lower rank = more severe
        sev = SEVERITIES[worst_rank]
        ballots.append(
            Ballot(rv, _severity_to_vote(worst_rank, vocab), f"worst finding: {sev}")
        )

    # Seed every stance with zero so the tally always lists the full vocabulary.
    tally = dict.fromkeys(order, 0)
    for b in ballots:
        tally[b.vote] += 1

    if not ballots:
        return VoteResult(NO_QUORUM, tally, ballots)

    # Most votes wins; tie-break toward the strictest stance.
    verdict = max(tally, key=lambda v: (tally[v], order[v]))
    return VoteResult(verdict, tally, ballots)