Coverage for src/ai_jury/voting.py: 100%

63 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-05 20:29 +0000

1"""Panel voting: derive a verdict by tallying the reviewers instead of letting a 

2single chair decide (issue #220). 

3 

4Pure and deterministic — a function of the consensus groups and the reviewer 

5names only (no I/O, no wall-clock, no randomness). Each reviewer's vote is 

6derived from the worst-severity finding they raised that the verifier did not 

7reject ("unsupported"); the panel verdict is the majority, ties broken toward the 

8more conservative stance. This is a rendering/aggregation layer: it never changes 

9how agents run, and the severity-based CI gate (:func:`ai_jury.ci.evaluate_ci`) 

10remains the independent hard safety check. 

11""" 

12from __future__ import annotations 

13 

14from dataclasses import dataclass, field 

15 

16from .findings import SEVERITIES, SEVERITY_ORDER 

17 

# Code-review verdicts, plus the verdict reported when nobody cast a ballot.
APPROVE = "APPROVE"
COMMENT = "COMMENT"
REQUEST_CHANGES = "REQUEST CHANGES"
NO_QUORUM = "NO QUORUM"
# Issue-review verdict vocabulary (issue #230): the panel votes over an issue's
# completeness instead of a diff's correctness.
READY = "READY"
NEEDS_INFO = "NEEDS-INFO"
UNCLEAR = "UNCLEAR"


def _vocabulary(blocking: str, middling: str, clear: str) -> dict:
    """Build one mode's vocabulary from its three stances.

    The worst gap/finding a reviewer raised maps to a stance (blocking
    severity → strict, middling → soft, none → clear). ``order`` assigns each
    stance a strictness score (higher = stricter) that drives the tie-break.
    Its keys are inserted strictest-first so a rendered tally reads e.g.
    "request changes · comment · approve"; the integer values, not the key
    order, decide ties, so display order is free to be intuitive.
    """
    return {
        "blocking": blocking,
        "middling": middling,
        "clear": clear,
        "order": {blocking: 2, middling: 1, clear: 0},
    }


# Per-mode vocabulary, keyed by review mode.
_MODES = {
    "code": _vocabulary(REQUEST_CHANGES, COMMENT, APPROVE),
    "issue": _vocabulary(NEEDS_INFO, UNCLEAR, READY),
}

44 

# A reviewer that did not actually review (an empty reply, or a short refusal /
# safety-decline like "I can't assist with that request") must NOT be counted as
# a "clear" vote — a non-answer is not an approval (issue #251). Such reviewers
# abstain and are dropped from the tally entirely (the dropping itself happens
# outside these helpers, which only classify a reply). Kept conservative: a
# genuine clean review has substantive prose and won't match. Markers are
# written in the *normalized* (expanded-contraction, ASCII-apostrophe) form.
_ABSTENTION_MARKERS = (
    "i cannot assist", "i cannot help", "cannot help with that", "cannot comply",
    "cannot do that", "i am unable to", "i will not be able to",
    "unable to assist", "unable to help",
)

# A stripped reply this long or longer is treated as a substantive review and
# never abstains, even if it quotes one of the markers.
_ABSTENTION_MAX_CHARS = 400

# Typographic apostrophes that are folded to ASCII "'" before contractions are
# expanded: U+2018, U+2019, U+201B, U+02BC and U+FF07.
_APOSTROPHE_FOLD = str.maketrans(
    dict.fromkeys("\u2018\u2019\u201b\u02bc\uff07", "'")
)


def _normalize_refusal(text: str) -> str:
    """Fold apostrophe/contraction variants so one marker covers every spelling.

    "can't X", "can not X" and "cannot X" all normalize to "cannot X" (issue
    #251 follow-up: the jury caught "can't comply" slipping past a
    "cannot comply"-only list); likewise "won't" → "will not" and "i'm " →
    "i am ". Matching is case-sensitive, so callers pass lower-cased text.

    Args:
        text: The (already lower-cased) reply to normalize.

    Returns:
        ``text`` with typographic apostrophes replaced by ``'`` and the
        contractions above expanded.
    """
    ascii_quoted = text.translate(_APOSTROPHE_FOLD)
    return (
        ascii_quoted.replace("can't", "cannot")
        .replace("can not", "cannot")
        .replace("won't", "will not")
        .replace("i'm ", "i am ")
    )


def is_abstention(output: str | None) -> bool:
    """True when a review is empty or a recognizable non-review (refusal).

    Empty/whitespace output (or ``None``) always abstains. A short reply (fewer
    than ``_ABSTENTION_MAX_CHARS`` characters after stripping) that contains a
    refusal marker also abstains; longer substantive reviews do not, even if
    they happen to quote one of these phrases. Contraction and apostrophe
    variants are folded by ``_normalize_refusal`` so a marker need only be
    listed once.

    Args:
        output: The reviewer's raw reply text, or ``None``.

    Returns:
        ``True`` if the reply should be treated as an abstention.
    """
    text = (output or "").strip().lower()
    if not text:
        return True
    if len(text) >= _ABSTENTION_MAX_CHARS:
        return False
    # Normalize once, then test every marker against the same string.
    normalized = _normalize_refusal(text)
    return any(marker in normalized for marker in _ABSTENTION_MARKERS)

83 

84 

# Worst-severity thresholds. critical/major are blocking; minor/nit are middling.
_MAJOR_RANK = SEVERITY_ORDER["major"]
_NIT_RANK = SEVERITY_ORDER["nit"]


def _severity_to_vote(rank: int, vocab: dict) -> str:
    """Translate a worst-finding severity rank into the mode's stance.

    Args:
        rank: Severity rank as found in ``SEVERITY_ORDER``.
        vocab: One mode's vocabulary with "blocking", "middling" and "clear"
            entries.

    Returns:
        ``vocab["blocking"]`` for ranks up to and including "major",
        ``vocab["middling"]`` for ranks up to and including "nit", and
        ``vocab["clear"]`` for anything beyond that.
    """
    if rank <= _MAJOR_RANK:
        stance = "blocking"
    elif rank <= _NIT_RANK:
        stance = "middling"
    else:
        stance = "clear"
    return vocab[stance]

96 

97 

@dataclass
class Ballot:
    """A single reviewer's vote together with the reason behind it."""

    # Name of the reviewer who cast this ballot.
    reviewer: str
    # Stance voted for (one of the active mode's verdict strings).
    vote: str
    # Short human-readable justification, e.g. "worst finding: major".
    reason: str

103 

104 

@dataclass
class VoteResult:
    """Outcome of a panel vote: the verdict plus the data it was derived from."""

    # Winning stance, or the no-quorum verdict when no ballots were cast.
    verdict: str
    # Ballot count per stance; a fresh dict per instance.
    tally: dict = field(default_factory=dict)
    # The individual ballots; a fresh list per instance.
    ballots: list[Ballot] = field(default_factory=list)

110 

111 

def tally_votes(groups, reviewers, *, mode: str = "code") -> VoteResult:
    """Tally a panel verdict from the consensus ``groups`` and ``reviewers``.

    Each reviewer votes from the worst-severity group they contributed to that was
    not marked ``unsupported`` by the verifier. The verdict is whichever stance has
    the most ballots; a tie resolves toward the strictest stance. The vocabulary is
    mode-aware (issue #230): ``code`` → REQUEST CHANGES > COMMENT > APPROVE;
    ``issue`` → NEEDS-INFO > UNCLEAR > READY (the panel judges completeness, not
    correctness). With no reviewers the verdict is ``NO QUORUM``.

    Args:
        groups: Iterable of consensus groups. Each is read through ``getattr``
            for ``reviewers`` (container of reviewer names; missing or ``None``
            means nobody), ``status`` (``"unsupported"`` excludes the group) and
            ``severity`` (a key of ``SEVERITY_ORDER``; an unknown value is ranked
            as the last entry of ``SEVERITIES``). May be a one-shot iterator.
        reviewers: Iterable of reviewer names that get a ballot each.
        mode: Vocabulary to vote in; an unknown mode falls back to ``"code"``.

    Returns:
        A ``VoteResult`` with the verdict, the per-stance tally (every stance of
        the mode present, strictest first) and one ``Ballot`` per reviewer.
    """
    vocab = _MODES.get(mode, _MODES["code"])
    order = vocab["order"]
    unknown_rank = len(SEVERITIES) - 1

    # Resolve every verifier-supported group once, up front: status and rank do
    # not depend on the reviewer, and ``groups`` may be a one-shot iterator.
    # ``or ()`` tolerates a ``reviewers`` attribute that is None, mirroring the
    # ``or ""`` guard on ``status``.
    supported = [
        (
            getattr(g, "reviewers", None) or (),
            SEVERITY_ORDER.get(getattr(g, "severity", ""), unknown_rank),
        )
        for g in groups
        if (getattr(g, "status", "") or "") != "unsupported"
    ]

    ballots: list[Ballot] = []
    for rv in reviewers:
        # Lower rank = more severe, so the worst finding is the minimum rank.
        worst_rank = min(
            (rank for members, rank in supported if rv in members),
            default=None,
        )
        if worst_rank is None:
            ballots.append(Ballot(rv, vocab["clear"], "no supported findings raised"))
        else:
            sev = SEVERITIES[worst_rank]
            ballots.append(
                Ballot(rv, _severity_to_vote(worst_rank, vocab), f"worst finding: {sev}")
            )

    # Start from zero for every stance so the tally always lists all of them.
    tally = dict.fromkeys(order, 0)
    for b in ballots:
        tally[b.vote] += 1

    if not ballots:
        return VoteResult(NO_QUORUM, tally, ballots)

    # Most votes wins; tie-break toward the strictest stance.
    verdict = max(tally, key=lambda v: (tally[v], order[v]))
    return VoteResult(verdict, tally, ballots)