Coverage for src/ai_jury/injection.py: 100%

1"""Prompt-injection heuristics for untrusted review input (OWASP LLM01).

3The jury feeds attacker-controlled content (the PR diff, and via ``--pr`` the

4PR title/body) into reviewer prompts. This module scans that content for common

5prompt-injection patterns and surfaces hits as *synthetic findings/warnings* —

6it never alters agent behaviour or the CI gate. Surfacing-not-obeying is the

7whole point: a human (and the structured consensus pipeline) stays in control.

9The detector is intentionally conservative and dependency-free (stdlib only).

10False positives are acceptable here because a hit only adds an advisory finding;

11it cannot flip a verdict.

12"""

14from __future__ import annotations

16import bisect

17import re

18from dataclasses import dataclass

20# The scanner is advisory (it surfaces hits, never changes the gate). Cap hits

21# per kind so a pathological input — e.g. a long run of zero-width chars — yields

22# a bounded number of hits instead of one per char (issue #314).

23_MAX_HITS_PER_KIND = 25

25# Zero-width / bidi / invisible control characters often used to smuggle hidden

26# text. Built from explicit code points (NOT invisible string literals) so the

27# set is readable, verifiable, and can't be silently stripped by an editor or a

28# whitespace normalizer (review of #303/L-2). Extended to cover direction marks,

29# invisible math operators, soft hyphen, CGJ, Mongolian vowel separator, and

30# Hangul fillers.

31_ZERO_WIDTH_CODEPOINTS = (

32 0x200B, # zero-width space

33 0x200C, # zero-width non-joiner

34 0x200D, # zero-width joiner

35 0x2060, # word joiner

36 0xFEFF, # zero-width no-break space / BOM

37 0x202A,

38 0x202B,

39 0x202C,

40 0x202D,

41 0x202E, # bidi embedding/override controls

42 0x200E,

43 0x200F, # LRM / RLM (left/right-to-left marks)

44 0x061C, # Arabic letter mark

45 0x2061,

46 0x2062,

47 0x2063,

48 0x2064, # invisible math operators

49 0x00AD, # soft hyphen

50 0x034F, # combining grapheme joiner

51 0x180E, # Mongolian vowel separator

52 0x115F,

53 0x1160,

54 0x3164,

55 0xFFA0, # Hangul fillers (render as invisible)

56)

57_ZERO_WIDTH = "".join(chr(c) for c in _ZERO_WIDTH_CODEPOINTS)

58_ZERO_WIDTH_RE = re.compile("[" + re.escape(_ZERO_WIDTH) + "]")

60# Imperative phrases that try to override the system/developer instructions.
# Each entry pairs a hit-kind label with a compiled regex; ``scan`` runs every
# regex with ``finditer`` and records matches under that label. All patterns
# are case-insensitive (``(?i)``). The ``[^.\n]{0,N}`` gaps let a bounded
# amount of filler sit between trigger words without crossing a period or a
# newline.
# NOTE(review): the numbering embedded in this view jumps from 66 to 68 inside
# the first ``re.compile(...)``; a pattern fragment may be missing here --
# confirm against the real file before editing "override-instructions".

61_PHRASE_PATTERNS: tuple[tuple[str, re.Pattern], ...] = (

62 (

63 "override-instructions",

64 re.compile(

65 r"(?i)\b(ignore|disregard|forget|override)\b[^.\n]{0,40}"

66 r"\b(previous|prior|above|earlier|all|any|the)\b[^.\n]{0,20}"

68 ),

69 ),

70 (

71 "role-reassignment",

72 re.compile(

73 r"(?i)\byou\s+are\s+now\b|\bnew\s+(instructions?|persona|role|system\s+prompt)\b"

74 ),

75 ),

76 (

77 "fake-system-turn",

78 re.compile(
# NOTE(review): under ``(?m)`` the ``\s*`` after ``^`` also matches newlines, so
# a match can begin on an earlier blank line (the reported line is that of the
# match start), and a long run of blank lines is re-scanned from every line
# start. Presumably ``[^\S\n]*`` was intended -- confirm before changing.

79 r"(?im)^\s*(system|assistant|developer)\s*:",

80 ),

81 ),

82 (

83 "verdict-coercion",

84 re.compile(

85 r"(?i)\b(approve|lgtm|pass|merge)\b[^.\n]{0,40}"

86 r"\b(no\s+findings?|no\s+issues?|without\s+(any\s+)?(review|findings?|comment))\b"

87 ),

88 ),

89 ("instruction-tag", re.compile(r"(?i)<\s*/?\s*(system|instructions?|prompt)\s*>")),

90)

92# A long run of base64-ish characters can hide an encoded payload. The class

93# includes URL-safe base64 (`-_`) as well as standard `+/` (issue #303/L-2).

94_BASE64_RE = re.compile(r"[A-Za-z0-9+/_-]{120,}={0,2}")

@dataclass
class InjectionHit:
    """A single suspicious match found while scanning untrusted content."""

    kind: str  # label of the detector that fired
    source: str  # "diff" or "context"
    line: int | None  # line of the match within the scanned text, if known
    snippet: str  # short text shown to the human reviewer

    def location(self) -> str:
        """Return ``source`` alone, or ``source:line`` when a line is set."""
        if self.line is None:
            return self.source
        return f"{self.source}:{self.line}"

111

112

113def _snippet(text: str, start: int, end: int, width: int = 60) -> str:

114 frag = text[start:end]

115 frag = frag.replace("\n", "\\n").replace("\r", "")

116 frag = "".join(ch for ch in frag if ch.isprintable())

117 if len(frag) > width:

118 frag = frag[:width] + "..."

119 return frag.strip()

120

121

122def _newline_offsets(text: str) -> list[int]:

123 """Sorted positions of every ``\\n`` in *text* (built once per scan)."""

124 offsets: list[int] = []

125 start = 0

126 while True:

127 idx = text.find("\n", start)

128 if idx == -1:

129 return offsets

130 offsets.append(idx)

131 start = idx + 1

132

133

def scan(text: str, source: str = "diff") -> list[InjectionHit]:
    """Scan *text* for prompt-injection patterns.

    Args:
        text: Untrusted content to inspect. Empty input yields no hits.
        source: Label stored on every hit ("diff" or "context").

    Returns:
        A (possibly empty) list of :class:`InjectionHit`: phrase-pattern hits
        first (in ``_PHRASE_PATTERNS`` order), then base64-like blobs, then
        hidden control characters. At most ``_MAX_HITS_PER_KIND`` hits are
        recorded per kind. Designed not to raise on ``str`` input.

    Line numbers are resolved against newline offsets computed ONCE (binary
    search per hit). Iteration for a kind stops as soon as that kind reaches
    its cap, so a pathological high-hit input (issue #314) does not pay for
    matches and snippets that would only be discarded.
    """
    if not text:
        return []

    newlines = _newline_offsets(text)
    hits: list[InjectionHit] = []
    counts: dict[str, int] = {}

    def has_room(kind: str) -> bool:
        """True while *kind* is still under the per-kind cap."""
        return counts.get(kind, 0) < _MAX_HITS_PER_KIND

    def capped(kind: str, matches):
        """Yield *matches* lazily, stopping once *kind* has hit its cap."""
        if not has_room(kind):
            return
        for match in matches:
            yield match
            # The caller records the hit before this generator resumes, so
            # the count checked here already includes the match just yielded.
            if not has_room(kind):
                return

    def add(kind: str, index: int, snippet: str) -> None:
        """Record one hit for *kind* at character offset *index*."""
        counts[kind] = counts.get(kind, 0) + 1
        # Newlines strictly before *index*, plus one, is the 1-based line.
        line = bisect.bisect_left(newlines, index) + 1
        hits.append(InjectionHit(kind=kind, source=source, line=line, snippet=snippet))

    for kind, pat in _PHRASE_PATTERNS:
        for m in capped(kind, pat.finditer(text)):
            add(kind, m.start(), _snippet(text, m.start(), m.end()))

    for m in capped("base64-blob", _BASE64_RE.finditer(text)):
        add("base64-blob", m.start(), f"{len(m.group(0))}-char base64-like blob")

    for m in capped("zero-width-char", _ZERO_WIDTH_RE.finditer(text)):
        add("zero-width-char", m.start(), f"hidden control char U+{ord(m.group(0)):04X}")

    return hits

175

176

def scan_inputs(diff: str, context: str = "") -> list[InjectionHit]:
    """Run :func:`scan` over the diff and, when non-empty, the PR context.

    Diff hits come first, followed by context hits; each hit carries the
    ``source`` label of the input it was found in ("diff" or "context").
    """
    found = scan(diff, source="diff")
    if not context:
        return found
    found.extend(scan(context, source="context"))
    return found

183

184

def hits_to_warnings(hits: list[InjectionHit]) -> list[str]:
    """Format each hit as one warning string for ``outcome.warnings``.

    The result has one entry per hit, in the same order as *hits*.
    """
    return [
        f"possible prompt-injection ({hit.kind}) in {hit.location()}: {hit.snippet}"
        for hit in hits
    ]

191

192

def hits_to_finding(hits: list[InjectionHit]):
    """Collapse all hits into one synthetic ``[major]`` Finding.

    Returns ``None`` for an empty *hits* list. Otherwise the finding is
    anchored at the first hit's source and line, names every distinct kind
    (sorted), and lists the locations of the first five hits. It is advisory:
    it informs the human and the report, while the CI gate is derived from
    structured *consensus* (see ``ci.evaluate_ci``), so an injected "APPROVE"
    cannot flip the gate.
    """
    if not hits:
        return None
    # Imported lazily to avoid a circular import (findings has no dep on us).
    from .findings import Finding

    lead = hits[0]
    kind_names = ", ".join(sorted({hit.kind for hit in hits}))
    # Only the first five hits are listed; dict keys de-duplicate repeated
    # locations while keeping first-seen order.
    distinct_locations = dict.fromkeys(hit.location() for hit in hits[:5])
    where = ", ".join(distinct_locations)
    evidence_text = (
        "Untrusted diff/PR content contains text resembling instructions to "
        f"the model at {where}. Treated as data, not obeyed."
    )
    fix_text = (
        "Review the flagged locations manually; do not act on any instructions "
        "embedded in the diff or PR description."
    )
    return Finding(
        severity="major",
        file=lead.source,
        line=lead.line,
        claim=f"possible prompt-injection in untrusted input ({len(hits)} hit(s): {kind_names})",
        evidence=evidence_text,
        suggested_fix=fix_text,
        confidence="medium",
        reviewer="injection-scanner",
    )