Coverage for src/ai_jury/voting.py: 100%
63 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-05 20:29 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-05 20:29 +0000
1"""Panel voting: derive a verdict by tallying the reviewers instead of letting a
2single chair decide (issue #220).
4Pure and deterministic — a function of the consensus groups and the reviewer
5names only (no I/O, no wall-clock, no randomness). Each reviewer's vote is
6derived from the worst-severity finding they raised that the verifier did not
7reject ("unsupported"); the panel verdict is the majority, ties broken toward the
8more conservative stance. This is a rendering/aggregation layer: it never changes
9how agents run, and the severity-based CI gate (:func:`ai_jury.ci.evaluate_ci`)
10remains the independent hard safety check.
11"""
12from __future__ import annotations
14from dataclasses import dataclass, field
16from .findings import SEVERITIES, SEVERITY_ORDER
# Code-review verdict vocabulary.
APPROVE = "APPROVE"
COMMENT = "COMMENT"
REQUEST_CHANGES = "REQUEST CHANGES"
# Verdict used when there is nothing to tally.
NO_QUORUM = "NO QUORUM"
# Issue-review verdict vocabulary (issue #230): the panel votes over an issue's
# completeness instead of a diff's correctness.
READY = "READY"
NEEDS_INFO = "NEEDS-INFO"
UNCLEAR = "UNCLEAR"


def _build_vocab(blocking: str, middling: str, clear: str) -> dict:
    """Assemble one mode's vocabulary from its three stances.

    The worst gap/finding a reviewer raised maps to a stance (blocking
    severity -> strict, middling -> soft, none -> clear). ``order`` ranks the
    stances for tie-breaking (higher = stricter). Its keys are inserted
    strictest-first so a rendered tally reads strict -> soft -> clear; the
    integer values, not the key order, drive the tie-break.
    """
    return {
        "blocking": blocking,
        "middling": middling,
        "clear": clear,
        "order": {blocking: 2, middling: 1, clear: 0},
    }


# Per-mode vocabulary, keyed by review mode.
_MODES = {
    "code": _build_vocab(REQUEST_CHANGES, COMMENT, APPROVE),
    "issue": _build_vocab(NEEDS_INFO, UNCLEAR, READY),
}
# Issue #251: a non-answer is not an approval. A reviewer that did not actually
# review (an empty reply, or a short refusal / safety-decline such as
# "I can't assist with that request") must NOT be counted as a "clear" vote;
# such reviewers abstain and are dropped from the tally entirely. The list is
# kept conservative: a genuine clean review has substantive prose and won't
# match. Every marker is spelled in the *normalized* (expanded-contraction)
# form, i.e. "cannot" / "will not" / "i am" rather than "can't" / "won't" / "i'm".
_ABSTENTION_MARKERS = (
    "i cannot assist",
    "i cannot help",
    "cannot help with that",
    "cannot comply",
    "cannot do that",
    "i am unable to",
    "i will not be able to",
    "unable to assist",
    "unable to help",
)
# Apostrophe look-alikes folded to the ASCII apostrophe before contractions are
# expanded: right/left single quotation marks, modifier-letter apostrophe,
# fullwidth apostrophe, backtick and acute accent.
_APOSTROPHE_FOLD = str.maketrans(
    dict.fromkeys("\u2019\u2018\u02bc\uff07`\u00b4", "'")
)


def _normalize_refusal(text: str) -> str:
    """Fold contraction/apostrophe variants so "can't X" and "cannot X" match the
    same marker (issue #251 follow-up: the jury caught "can't comply" slipping
    past a "cannot comply"-only list).

    ``text`` is matched case-sensitively against lowercase contractions, so the
    caller is expected to pass already-lowercased text. Any apostrophe
    look-alike (not only the right single quotation mark) is first mapped to
    ``'`` so that e.g. "can`t" or "can\u2018t" expand as well; text without such
    characters is returned exactly as before.

    Returns the text with ``can't`` -> ``cannot``, ``won't`` -> ``will not`` and
    ``i'm `` -> ``i am `` expanded.
    """
    folded = text.translate(_APOSTROPHE_FOLD)
    return (
        folded.replace("can't", "cannot")
        .replace("won't", "will not")
        # Trailing space keeps the expansion anchored to the whole word "i'm".
        .replace("i'm ", "i am ")
    )
def is_abstention(output) -> bool:
    """Report whether a review is empty or a recognizable non-review (refusal).

    A missing, empty or whitespace-only ``output`` always abstains. Otherwise
    the reply abstains only when it is short (fewer than 400 characters after
    stripping) and contains one of the refusal markers. Longer, substantive
    reviews never abstain, even if they happen to quote one of these phrases.
    Matching is case-insensitive, and contraction variants (``can't``/``cannot``,
    ``won't``/``will not``, ``i'm``/``i am``) are folded first so a marker need
    only be listed once.
    """
    reply = (output or "").strip().lower()
    if not reply:
        return True

    short_reply_limit = 400
    if len(reply) >= short_reply_limit:
        return False

    # Normalize once, then scan for the first marker that appears.
    folded = _normalize_refusal(reply)
    for marker in _ABSTENTION_MARKERS:
        if marker in folded:
            return True
    return False
# Rank cut-offs for the worst severity a reviewer raised: anything ranked at or
# before "major" (critical/major) is blocking; anything after that but at or
# before "nit" (minor/nit) is middling.
_MAJOR_RANK = SEVERITY_ORDER["major"]
_NIT_RANK = SEVERITY_ORDER["nit"]


def _severity_to_vote(rank: int, vocab: dict) -> str:
    """Translate a worst-severity ``rank`` into the matching stance of ``vocab``.

    ``vocab`` is one mode's vocabulary mapping ``"blocking"``, ``"middling"``
    and ``"clear"`` to stance names. Ranks up to the "major" cut-off vote the
    blocking stance, ranks up to the "nit" cut-off vote the middling stance, and
    anything beyond votes the clear stance.
    """
    if rank <= _MAJOR_RANK:
        stance_key = "blocking"
    elif rank <= _NIT_RANK:
        stance_key = "middling"
    else:
        stance_key = "clear"
    return vocab[stance_key]
@dataclass
class Ballot:
    """A single reviewer's vote together with the reason it was cast."""

    # Name of the reviewer this ballot belongs to.
    reviewer: str
    # Stance the reviewer voted for (one of the verdict strings).
    vote: str
    # Short human-readable explanation of how the vote was derived.
    reason: str
@dataclass
class VoteResult:
    """Outcome of a panel vote: the verdict plus the data it was derived from."""

    # Winning stance, or the no-quorum verdict when nobody voted.
    verdict: str
    # Ballot count per stance; each instance gets its own fresh dict.
    tally: dict = field(default_factory=dict)
    # The individual ballots, one per voting reviewer; fresh list per instance.
    ballots: list[Ballot] = field(default_factory=list)
def _worst_supported_rank(reviewer, groups) -> int | None:
    """Return the worst severity rank ``reviewer`` raised that was not rejected.

    Considers only groups that list ``reviewer`` among their ``reviewers`` and
    whose ``status`` is not ``"unsupported"``. Returns the smallest
    ``SEVERITY_ORDER`` rank among them (smaller ranks are treated as more
    severe by the vote mapping), or ``None`` when no such group exists.
    """
    ranks = [
        # A missing or unrecognized severity falls back to the last SEVERITIES rank.
        SEVERITY_ORDER.get(getattr(g, "severity", ""), len(SEVERITIES) - 1)
        for g in groups
        # ``or ()`` tolerates a group whose ``reviewers`` attribute is None,
        # mirroring the ``or ""`` guard applied to ``status``.
        if reviewer in (getattr(g, "reviewers", None) or ())
        # Verifier-rejected groups don't count toward this reviewer's vote.
        and (getattr(g, "status", "") or "") != "unsupported"
    ]
    return min(ranks, default=None)


def tally_votes(groups, reviewers, *, mode: str = "code") -> VoteResult:
    """Tally a panel verdict from the consensus ``groups`` and ``reviewers``.

    Each reviewer votes from the worst-severity group they contributed to that was
    not marked ``unsupported`` by the verifier; a reviewer with no such group
    votes the mode's "clear" stance. The verdict is whichever stance has the most
    ballots; a tie resolves toward the strictest stance. The vocabulary is
    mode-aware (issue #230): ``code`` → REQUEST CHANGES > COMMENT > APPROVE;
    ``issue`` → NEEDS-INFO > UNCLEAR > READY (the panel judges completeness, not
    correctness). With no reviewers the verdict is ``NO QUORUM``.

    Args:
        groups: Consensus groups; each is read via its ``reviewers``, ``status``
            and ``severity`` attributes (missing or ``None`` attributes are
            tolerated).
        reviewers: Names of the reviewers who vote, one ballot each.
        mode: Vocabulary to vote in; an unrecognized mode falls back to ``code``.

    Returns:
        A ``VoteResult`` with the verdict, the per-stance tally (every stance of
        the mode is present, zero if unused) and the individual ballots.
    """
    vocab = _MODES.get(mode, _MODES["code"])
    order = vocab["order"]

    ballots: list[Ballot] = []
    for rv in reviewers:
        worst_rank = _worst_supported_rank(rv, groups)
        if worst_rank is None:
            ballots.append(Ballot(rv, vocab["clear"], "no supported findings raised"))
        else:
            ballots.append(
                Ballot(
                    rv,
                    _severity_to_vote(worst_rank, vocab),
                    f"worst finding: {SEVERITIES[worst_rank]}",
                )
            )

    # Seed every stance with zero so the tally always lists the full vocabulary.
    tally = dict.fromkeys(order, 0)
    for ballot in ballots:
        tally[ballot.vote] += 1

    if not ballots:
        return VoteResult(NO_QUORUM, tally, ballots)

    # Most votes wins; tie-break toward the strictest stance.
    verdict = max(tally, key=lambda stance: (tally[stance], order[stance]))
    return VoteResult(verdict, tally, ballots)