Coverage for src/ai_jury/privilege.py: 99%

1"""Least-privilege auditing for review agents (OWASP LLM01 defense-in-depth).

3Reviewers process attacker-controlled content (the PR diff and, via ``--pr``, the

4PR title/body). If an agent CLI is invoked with write/tool/network powers, a

5successful prompt injection could escalate from "bad review text" to real

6side effects. The jury mitigates this by running agents read-only.

8This module inspects each configured agent's ``extra_args`` and WARNS when an

9agent could perform write or tool actions during review. It is advisory by

10default (a warning, surfaced via ``run_jury``); ``--strict`` promotes the

11warnings to a hard failure.

13Required read-only invocation per adapter (documented here and in docs/security.md):

15- ``claude`` : pass ``--disallowed-tools Edit,Write,NotebookEdit,Bash`` so the

16 reviewer cannot edit files or run shell commands.

17- ``codex`` : ``-s read-only`` (the shipped default, issue #100). A wider

18 sandbox (``workspace-write``/``danger-full-access``) is flagged

19 here so operators opt in knowingly.

20- ``agy``/gemini : run under ``--sandbox`` (the shipped default). A bare

21 ``--dangerously-skip-permissions`` / ``--yolo`` without a sandbox

22 is flagged.

23"""

25from __future__ import annotations

# CLI tokens that hand an agent broad write/tool/network powers. They are
# dangerous on a reviewer; ``audit_agent`` reports the first one it finds on an
# agent that is not otherwise sandboxed, so the order below is significant.
_DANGEROUS_FLAGS: tuple[str, ...] = (
    "--dangerously-skip-permissions",
    "--yolo",
    "danger-full-access",
    "--full-auto",
)

# Tool names that permit filesystem writes or shell execution; a claude
# reviewer must have every one of them listed under ``--disallowed-tools``.
_WRITE_TOOLS: tuple[str, ...] = (
    "Edit",
    "Write",
    "NotebookEdit",
    "Bash",
)

def _args_str(extra_args: list[str]) -> str:
    """Flatten ``extra_args`` into a single space-delimited string.

    The result is used for substring matching, so a flag embedded inside a
    larger token is still visible to the audit.
    """
    separator = " "
    return separator.join(extra_args)

# Codex sandbox VALUES that genuinely restrict the agent. A sandbox value such
# as ``workspace-write`` / ``danger-full-access`` does NOT (issue #292), so the
# audit must never read the mere presence of a ``-s``/``--sandbox`` token as
# proof of a read-only run when the value it carries grants write/tool powers.
_RESTRICTING_SANDBOX_VALUES: tuple[str, ...] = ("read-only",)

def _is_sandboxed(extra_args: list[str], vendor: str = "", name: str = "") -> bool:
    """Report whether a non-claude agent runs under a *restricting* sandbox.

    The check is vendor-aware (issue #292) so that a bare ``--sandbox`` token
    cannot give false assurance:

    - Any vendor: a sandbox flag carrying a restricting value counts, in the
      space form (``-s read-only`` / ``--sandbox read-only``) or the equals
      form (``-s=read-only`` / ``--sandbox=read-only``, issue #316/L-6).
    - agy/gemini only (vendor ``google``, or a name containing ``agy`` or
      ``gemini``): the boolean ``--sandbox`` also counts when it is bare, i.e.
      last on the line, followed by another flag, or written ``--sandbox=``.

    A bare ``--sandbox`` from any other vendor is not accepted. When a
    restricting sandbox is active, an otherwise-broad flag no longer grants
    real powers (issue #100), which is why callers stop auditing on ``True``.

    Args:
        extra_args: The agent's configured CLI arguments.
        vendor: Agent vendor; ``None``/empty is treated as unknown.
        name: Agent name; ``None``/empty is treated as unknown.

    Returns:
        ``True`` as soon as one restricting sandbox token is found, else ``False``.
    """
    vendor_key = (vendor or "").lower()
    name_key = (name or "").lower()
    boolean_sandbox_ok = (
        vendor_key == "google" or "agy" in name_key or "gemini" in name_key
    )

    tokens = list(extra_args)
    # Pair every token with its successor; "" stands in for "nothing follows".
    for token, following in zip(tokens, [*tokens[1:], ""]):
        flag, equals, inline_value = token.partition("=")

        if equals and flag in ("-s", "--sandbox"):
            # Equals form. `enforce_read_only` already accepts it, so the audit
            # must too or it false-positives a safe config under `--strict`.
            if inline_value in _RESTRICTING_SANDBOX_VALUES:
                return True
            if flag == "--sandbox" and boolean_sandbox_ok and not inline_value:
                return True
            continue

        if token not in ("-s", "--sandbox"):
            continue

        # Space form, any vendor: an explicit read-only sandbox value.
        if following in _RESTRICTING_SANDBOX_VALUES:
            return True
        # agy/gemini: bare boolean --sandbox (no value, or another flag next).
        if (
            token == "--sandbox"
            and boolean_sandbox_ok
            and (not following or following.startswith("-"))
        ):
            return True

    return False

def _ensure_claude_disallowed(extra_args: list[str]) -> list[str]:
    """Return a copy of ``extra_args`` whose deny list covers every write tool.

    Each existing ``--disallowed-tools`` value, in either the space form
    (``--disallowed-tools Edit,Write``) or the equals form
    (``--disallowed-tools=Edit,Write``), has the mandatory write tools merged
    in, so config may ADD denials but never REMOVE the mandatory ones
    (issue #288). When the flag is absent it is injected at the front.
    Existing entries keep their order and the form they were written in; the
    input list is not modified.
    """
    flag = "--disallowed-tools"
    inline_prefix = flag + "="

    def _with_mandatory(csv: str) -> str:
        # Keep the operator's entries first, then append whatever is missing.
        names = [part.strip() for part in csv.split(",") if part.strip()]
        missing = [tool for tool in _WRITE_TOOLS if tool not in names]
        return ",".join(names + missing)

    tokens = list(extra_args)
    rebuilt: list[str] = []
    saw_flag = False
    value_consumed = False
    for pos, token in enumerate(tokens):
        if value_consumed:
            # This token was already merged as the previous flag's value.
            value_consumed = False
            continue
        if token == flag and pos + 1 < len(tokens):
            rebuilt += [token, _with_mandatory(tokens[pos + 1])]
            saw_flag = value_consumed = True
        elif token.startswith(inline_prefix):
            # A narrower =-value must be widened as well: if the CLI is
            # last-wins it could otherwise shrink the deny set (review of #288).
            rebuilt.append(inline_prefix + _with_mandatory(token[len(inline_prefix):]))
            saw_flag = True
        else:
            rebuilt.append(token)

    if saw_flag:
        return rebuilt
    return [flag, ",".join(_WRITE_TOOLS), *rebuilt]

129

130

def _ensure_value_sandbox(extra_args: list[str], default: list[str]) -> list[str]:
    """Return a copy of ``extra_args`` that carries SOME sandbox flag.

    An operator-supplied ``-s``/``--sandbox`` is respected as-is, even with a
    wider value such as codex ``workspace-write``: that is a documented,
    audited opt-in. ``default`` is prepended only when no sandbox flag exists
    at all, which is the actual hole (empty/misconfigured ``extra_args``,
    issue #288).

    Args:
        extra_args: The agent's configured CLI arguments.
        default: Tokens to prepend when no sandbox flag is present.

    Returns:
        A new list; neither input is modified.
    """
    tokens = list(extra_args)
    for token in tokens:
        # The text before the first "=" is the flag name, which covers both
        # the space form (-s read-only) and the equals form
        # (--sandbox=read-only), so a sandbox is never double-specified.
        if token.partition("=")[0] in ("-s", "--sandbox"):
            return tokens
    return list(default) + tokens

145

146

def enforce_read_only(vendor: str, name: str, extra_args: list[str]) -> list[str]:
    """Return ``extra_args`` with the mandatory read-only restriction guaranteed.

    The restriction is enforced here (issue #288) rather than left to config,
    so an empty or misconfigured ``extra_args`` cannot produce a write-capable
    reviewer of an attacker-controlled diff. Config may still WIDEN a codex
    sandbox (``-s workspace-write``), an explicit opt-in the audit warns
    about, but it can never REMOVE the restriction.

    Routing, first match wins:

    - ``local`` and hosted-API vendors (issue #430): returned unchanged. They
      run no subprocess, so there is no ``extra_args``/sandbox concept to
      enforce.
    - claude (vendor ``anthropic`` or ``claude`` in the name): the write tools
      are forced into ``--disallowed-tools``.
    - codex (vendor ``openai`` or ``codex`` in the name): ``-s read-only`` is
      injected when no sandbox flag is present.
    - everything else, including **unknown vendors** (issue #310, completes
      #300): ``--sandbox`` is injected as for agy. An unknown vendor routes to
      the generic ``AgyAdapter``; an agy-compatible CLI then runs sandboxed and
      an incompatible one fails on the unknown flag, so this is fail-closed
      either way, never fail-open.

    Args:
        vendor: Agent vendor (case-insensitive; ``None`` allowed).
        name: Agent name (case-insensitive; ``None`` allowed).
        extra_args: Configured CLI arguments (``None`` treated as empty).

    Returns:
        A new argument list; the input is never mutated.
    """
    vendor_key = (vendor or "").lower()
    name_key = (name or "").lower()
    args = list(extra_args or [])

    # Network-only vendors are checked FIRST (review of #310): the
    # name-substring checks below would otherwise mis-handle e.g. a local
    # agent named "local-claude" / "my-codex".
    if vendor_key in ("local", "anthropic-api", "openai-api", "google-api"):
        return args

    if vendor_key == "anthropic" or "claude" in name_key:
        return _ensure_claude_disallowed(args)

    if vendor_key == "openai" or "codex" in name_key:
        sandbox_default = ["-s", "read-only"]
    else:
        # google / agy / gemini AND any unknown vendor.
        sandbox_default = ["--sandbox"]
    return _ensure_value_sandbox(args, sandbox_default)

180

181

def _claude_is_locked_down(extra_args: list[str]) -> bool:
    """Report whether claude's ``--disallowed-tools`` covers every write tool.

    Both spellings of the flag are recognized:

    - space form:  ``--disallowed-tools Edit,Write,NotebookEdit,Bash``
    - equals form: ``--disallowed-tools=Edit,Write,NotebookEdit,Bash``

    ``_ensure_claude_disallowed`` accepts and preserves the equals form, so the
    audit must parse it too. Otherwise a genuinely locked-down config is
    reported as "not restricted to read-only" and fails under ``--strict``.
    Denials from every occurrence of the flag are unioned; tool names are
    comma-separated and surrounding whitespace is ignored.

    Args:
        extra_args: The agent's configured CLI arguments.

    Returns:
        ``True`` only when every name in ``_WRITE_TOOLS`` is denied.
    """
    flag = "--disallowed-tools"
    inline_prefix = flag + "="

    denied: set[str] = set()
    args = list(extra_args)
    for i, arg in enumerate(args):
        if arg == flag and i + 1 < len(args):
            value = args[i + 1]
        elif arg.startswith(inline_prefix):
            value = arg[len(inline_prefix):]
        else:
            # Unrelated token, or a trailing flag with no value to read.
            continue
        denied.update(tool.strip() for tool in value.split(",") if tool.strip())
    return denied.issuperset(_WRITE_TOOLS)

190

191

def audit_agent(spec) -> list[str]:
    """Return the least-privilege warnings for one agent spec.

    ``spec`` is read through ``getattr`` for ``name``, ``vendor`` and
    ``extra_args``, so a missing or ``None`` attribute is treated as empty.
    The result holds at most one warning:

    - ``local`` / hosted-API vendors: never warned. They run no subprocess
      (issues #43, #430), so there is no write/tool/network surface to flag.
    - claude: warned unless ``--disallowed-tools`` covers every write tool.
    - any other agent: nothing when a restricting sandbox is active (issue
      #100); otherwise a warning naming the first dangerous flag present, or
      failing that a generic "not sandboxed" warning (issue #300).
    """
    name = (getattr(spec, "name", "") or "").lower()
    vendor = (getattr(spec, "vendor", "") or "").lower()
    extra_args = list(getattr(spec, "extra_args", []) or [])
    args_text = _args_str(extra_args)
    label = getattr(spec, "name", "agent")

    # Out of scope: a hosted-API call has strictly less access than even a
    # sandboxed CLI (no filesystem, no shell, nothing to disallow).
    if vendor in ("local", "anthropic-api", "openai-api", "google-api"):
        return []

    if "claude" in name or vendor == "anthropic":
        # claude's own default config also uses --dangerously-skip-permissions;
        # that is safe only *because* write tools are disallowed, so no
        # separate warning is raised once the agent is locked down.
        if _claude_is_locked_down(extra_args):
            return []
        tools_csv = ",".join(_WRITE_TOOLS)
        return [
            f"agent '{label}' (claude) is not restricted to read-only: add "
            f"`--disallowed-tools {tools_csv}` so a prompt "
            f"injection in the diff cannot edit files or run commands."
        ]

    # Non-claude agents must run under a restricting sandbox (issue #100).
    if _is_sandboxed(extra_args, vendor=vendor, name=name):
        return []

    # Not sandboxed: a broad-powers flag gets a specific message...
    offending = next(
        (
            flag
            for flag in _DANGEROUS_FLAGS
            if flag in extra_args or flag in args_text
        ),
        None,
    )
    if offending is not None:
        return [
            f"agent '{label}' is configured with `{offending}`, granting "
            f"write/tool/network powers while reviewing untrusted content; "
            f"prefer a read-only sandbox (e.g. codex `-s read-only` or agy "
            f"`--sandbox`)."
        ]

    # ...otherwise warn that it simply isn't sandboxed. This closes the audit
    # blind spot (issue #300): an unknown-vendor or no-flag agent previously
    # produced ZERO warnings, so `--strict` could not fail it on this basis.
    return [
        f"agent '{label}' is not running under a recognized read-only sandbox "
        f"(no `-s read-only` / `--sandbox`); a prompt injection in the diff could "
        f"reach write/tool/network. Add a sandbox, or run with `--strict` to fail."
    ]

246

247

def audit_privilege(specs) -> list[str]:
    """Collect the least-privilege warnings of every configured agent.

    Each spec is audited with ``audit_agent``; the warnings are concatenated
    in the order the specs are supplied. An empty ``specs`` yields ``[]``.
    """
    return [warning for spec in specs for warning in audit_agent(spec)]