Coverage for src/ai_jury/largediff.py: 93%

"""Large-diff handling: filtering and chunking (issue #31).

The jury sends the diff to every agent, so a large or generated diff inflates
cost, runtime, and prompt size. This module measures a diff, drops files that
should not be reviewed (binary blobs, generated/vendored files, and anything the
configured path filters exclude), and decides a handling mode:

- ``full`` — the kept diff fits the budget; review it in one pass.
- ``chunked`` — the kept diff is over budget and chunking is enabled; split it
  into per-file chunks, each within the chunk budget.
- ``too_large`` — the kept diff is over budget and chunking is disabled; the
  caller should fail with a clear message.

Everything here is PURE and deterministic: parsing, classification, and chunk
boundaries are a function of the diff text and config only, so the plan is
reproducible and unit-testable.
"""

19from __future__ import annotations

21import fnmatch

22import re

23from dataclasses import dataclass, field

# Path globs treated as "generated / not worth reviewing" by default. The list
# is deliberately conservative and language-agnostic; users extend or replace
# it through ``[jury.diff] exclude``.
DEFAULT_GENERATED_GLOBS: tuple[str, ...] = (
    # --- dependency lockfiles ---
    "*.lock",
    "package-lock.json",
    "yarn.lock",
    "pnpm-lock.yaml",
    "poetry.lock",
    "Cargo.lock",
    "composer.lock",
    "Gemfile.lock",
    "go.sum",
    # --- minified bundles and source maps ---
    "*.min.js",
    "*.min.css",
    "*.map",
    # --- test snapshots and common generated code ---
    "*.snap",
    "*.pb.go",
    "*_pb2.py",
    "*_pb2_grpc.py",
    # --- vendored / build-output directories ---
    "vendor/**",
    "node_modules/**",
    "dist/**",
    "build/**",
)

# Reasons recorded next to a path when a file is dropped from the review.
EXCLUDE_BINARY = "binary"
EXCLUDE_GENERATED = "generated"
EXCLUDE_FILTER = "excluded-by-filter"
EXCLUDE_NOT_INCLUDED = "not-in-include-filter"

# Handling modes a plan can resolve to.
MODE_FULL = "full"
MODE_CHUNKED = "chunked"
MODE_TOO_LARGE = "too_large"

@dataclass
class DiffFile:
    """A single file's slice of a unified diff.

    ``path`` is the file path associated with the segment and ``text`` is the
    raw diff text of that segment.
    """

    path: str
    text: str

    @property
    def size_bytes(self) -> int:
        """Length of ``text`` in bytes once encoded as UTF-8."""
        encoded = self.text.encode("utf-8")
        return len(encoded)

@dataclass
class DiffPlan:
    """The outcome of planning how a diff will be handled."""

    # One of the MODE_* strings: "full", "chunked" or "too_large".
    mode: str
    # Diff texts to review; empty when there is nothing to send.
    chunks: list[str] = field(default_factory=list)
    # File segments that survived filtering, in diff order.
    kept: list[DiffFile] = field(default_factory=list)
    # (path, reason) pairs for every dropped segment.
    excluded: list[tuple[str, str]] = field(default_factory=list)
    # UTF-8 size of the whole input diff.
    total_bytes: int = 0
    # UTF-8 size of the kept segments joined together.
    kept_bytes: int = 0
    # Human-readable explanation of the chosen mode.
    reason: str = ""

    @property
    def kept_paths(self) -> list[str]:
        """Paths of the kept segments, in order."""
        return [entry.path for entry in self.kept]

91def _strip_ab(path: str) -> str:

92 for prefix in ("a/", "b/"):

93 if path.startswith(prefix):

94 return path[len(prefix) :]

95 return path

98def _unquote_git_path(path: str) -> str:

99 """Undo git's C-style quoting of paths with special chars (best-effort).

100

101 git wraps a path in double quotes and octal-escapes special/non-ASCII bytes

102 when ``core.quotepath`` is on. We decode it back so the full path is

103 recovered for glob filtering and classification.

104 """

105 if len(path) >= 2 and path.startswith('"') and path.endswith('"'):

106 inner = path[1:-1]

107 try:

108 return (

109 inner.encode("latin-1", "backslashreplace")

110 .decode("unicode_escape")

111 .encode("latin-1")

112 .decode("utf-8", "replace")

113 )

114 except (UnicodeDecodeError, UnicodeEncodeError):

115 return inner.replace('\\"', '"').replace("\\\\", "\\")

116 return path

117

118

119def _path_from_marker(line: str) -> str | None:

120 """Path from a ``+++ b/`` or ``--- a/`` line, or None for /dev/null.

121

122 These marker lines carry a single, unambiguous path even when it contains

123 spaces or quoted special chars — unlike the ``diff --git a/ b/``

124 header, which a ``str.split()`` truncates at the first space, hiding or

125 mislabeling the file (security audit 2026-06-13/L-4,N-3).

126 """

127 rest = line[4:].rstrip("\r\n")

128 # Some diff formats append a tab + timestamp; the path ends at the tab.

129 if "\t" in rest: 129 ↛ 130line 129 didn't jump to line 130 because the condition on line 129 was never true

130 rest = rest.split("\t", 1)[0]

131 if rest == "/dev/null": 131 ↛ 132line 131 didn't jump to line 132 because the condition on line 131 was never true

132 return None

133 return _strip_ab(_unquote_git_path(rest))

134

135

def _path_from_git_header(line: str) -> str:
    """Best-effort new-side path from a ``diff --git a/PATH b/PATH`` header.

    ``str.split()[3]`` truncates a space-containing name, so the header is
    taken apart by its separators instead and git's C-quoting is undone
    (audit 2026-06-13/L-4, r3 marker-less case).

    Strategies, in order:

    1. Symmetric unquoted header (``a/P b/P``): recover ``P`` by halving.
    2. Quoted b-side (header ends with ``"``): split on the last `` "b/``.
    3. Unquoted b-side: split on the last `` b/``.
    4. Any remaining `` "b/`` separator, then a whitespace-split fallback.

    ``line`` is expected to start with ``diff --git ``.
    """
    rest = line[len("diff --git ") :].rstrip("\r\n")
    # Non-rename headers are symmetric: ``a/P b/P`` with the SAME P on both
    # sides. Recover P by halving, which is robust even when P itself
    # contains `` b/`` (a mode-change-only segment has no +++/--- or rename
    # marker to fall back on — audit 2026-06-13 r4/L). len(body) = 2*len(P)+3.
    if rest.startswith("a/"):
        body = rest[2:]
        half = (len(body) - 3) // 2
        if (
            len(body) >= 3
            and body[half : half + 3] == " b/"
            and body[:half] == body[half + 3 :]
        ):
            return _unquote_git_path(body[:half])
    # Quoted b-side: git C-quotes special paths as ``"a/P" "b/P"``, so the
    # separator is `` "b/`` (audit 2026-06-13 r5/L). This must be checked
    # BEFORE the plain `` b/`` search: inside a quoted path every ``"`` is
    # backslash-escaped, so `` "b/`` is unambiguous, whereas a quoted path
    # that merely contains `` b/`` would otherwise be cut at the wrong spot.
    if rest.endswith('"'):
        quoted_sep = rest.rfind(' "b/')
        if quoted_sep != -1:
            return _strip_ab(_unquote_git_path(rest[quoted_sep + 1 :]))
    idx = rest.rfind(" b/")
    if idx != -1:
        return _strip_ab(_unquote_git_path(rest[idx + 1 :]))
    # Kept for headers that carry a quoted b-side but do not end in ``"``.
    qidx = rest.rfind(' "b/')
    if qidx != -1:
        return _strip_ab(_unquote_git_path(rest[qidx + 1 :]))
    # Last resort: whitespace split (may truncate names containing spaces).
    parts = line.split()
    return _strip_ab(parts[3]) if len(parts) >= 4 else _strip_ab(parts[-1])

167

168

def _segment_path(text: str, start: int = 0) -> str:
    """Recover a segment's path by scanning its header lines.

    Scanning begins at offset ``start`` in ``text`` and stops after the first
    hunk header (``@@ ``) or at the end of the text. Returns ``""`` when no
    path could be determined.
    """
    cur_path: str | None = None
    pos = start
    while pos < len(text):
        nl = text.find("\n", pos)
        line = text[pos:] if nl == -1 else text[pos : nl + 1]

        if line.startswith("diff --git "):
            cur_path = _path_from_git_header(line)
        elif line.startswith("+++ ") or (line.startswith("--- ") and not cur_path):
            # ``+++`` always wins; ``---`` is only used while no path is known
            # yet. A /dev/null marker yields None and leaves cur_path alone.
            marker_path = _path_from_marker(line)
            if marker_path is not None:
                cur_path = marker_path
        elif line.startswith(("rename to ", "copy to ")):
            target = _strip_ab(
                _unquote_git_path(line.split(" to ", 1)[1].rstrip("\r\n"))
            )
            if target:
                cur_path = target

        # Header lines end where the first hunk begins.
        if line.startswith("@@ ") or nl == -1:
            break
        pos = nl + 1
    return cur_path or ""


def split_diff(diff: str) -> list[DiffFile]:
    """Split a unified diff into per-file segments.

    Segments start at ``diff --git a/PATH b/PATH`` headers (the git format the
    adapters emit). Only a header at the START of a line opens a segment, so
    text that merely mentions ``diff --git`` mid-line never splits the diff.
    Any preamble before the first header is attached to the first file so no
    bytes are silently dropped. A diff with no ``diff --git`` header is
    returned as a single segment (it cannot be chunked by file); its path is
    taken from its ``+++``/``---`` markers when present, else ``""``.

    Returns an empty list for an empty diff. Joining the ``text`` of the
    returned segments reproduces ``diff`` exactly.
    """
    if not diff:
        return []

    header = "diff --git "
    # Offset of the first line-anchored header. ``find`` returns -1 when
    # there is none, so ``+ 1`` yields 0 and the whole diff is scanned as a
    # single segment.
    if diff.startswith(header):
        header_at = 0
    else:
        header_at = diff.find("\n" + header) + 1

    files: list[DiffFile] = []
    # The first segment starts at offset 0 so that it includes any preamble;
    # path scanning still starts at the header itself.
    seg_start = 0
    while True:
        next_nl = diff.find("\n" + header, header_at)
        seg_end = len(diff) if next_nl == -1 else next_nl + 1
        text = diff[seg_start:seg_end]
        files.append(
            DiffFile(path=_segment_path(text, header_at - seg_start), text=text)
        )
        if next_nl == -1:
            break
        seg_start = header_at = seg_end

    return files

236

237

238_BINARY_RE = re.compile(r"(?m)^\s*(?:GIT binary patch|Binary files .* differ)\s*$")

239

240

241def _is_binary(text: str) -> bool:

242 """True when a file segment is a git *binary* diff.

243

244 Matches the binary marker on its own header line — ``Binary files … differ``

245 or a standalone ``GIT binary patch`` — rather than the substring anywhere in

246 the text. A diff's content lines are prefixed with ``+``/``-``/`` ``, so this

247 never misfires on source code that merely *mentions* those strings (e.g. this

248 module's own detector).

249 """

250 # bolt: avoid allocating a huge list of strings from splitlines()

251 # and generator overhead by using C-optimized regex finding.

252 return bool(_BINARY_RE.search(text))

253

254

255def _matches_any(path: str, patterns) -> bool:

256 """True when ``path`` matches any glob in ``patterns``.

257

258 Supports a trailing ``/**`` to mean "anything under this directory" and the

259 basename for simple ``*.ext`` patterns, in addition to a full-path match.

260 """

261 name = path.rsplit("/", 1)[-1]

262 for pat in patterns:

263 if pat.endswith("/**"):

264 prefix = pat[:-2] # keep trailing slash

265 if path.startswith(prefix):

266 return True

267 if fnmatch.fnmatch(path, pat) or fnmatch.fnmatch(name, pat):

268 return True

269 return False

270

271

272def _chunk_files(kept: list[DiffFile], chunk_max_bytes: int) -> list[str]:

273 """Greedily pack kept files into chunks no larger than the budget.

274

275 Files keep their order. A file that alone exceeds the budget becomes its own

276 chunk (it cannot be split without breaking the diff), so chunking degrades

277 gracefully rather than failing.

278 """

279 chunks: list[str] = []

280 current: list[str] = []

281 current_bytes = 0

282 for f in kept:

283 fb = f.size_bytes

284 if current and current_bytes + fb > chunk_max_bytes:

285 chunks.append("".join(current))

286 current, current_bytes = [], 0

287 current.append(f.text)

288 current_bytes += fb

289 if current:

290 chunks.append("".join(current))

291 return chunks

292

293

def plan_diff(
    diff: str,
    *,
    max_bytes: int,
    chunk: bool,
    chunk_max_bytes: int | None = None,
    exclude_generated: bool = True,
    exclude: tuple[str, ...] | list[str] = (),
    include: tuple[str, ...] | list[str] = (),
) -> DiffPlan:
    """Measure, filter, and decide a handling mode for ``diff`` (issue #31).

    Args:
        diff: The diff text to plan for.
        max_bytes: Budget (UTF-8 bytes) for reviewing the kept diff in one pass.
        chunk: Whether an over-budget diff may be split into chunks.
        chunk_max_bytes: Budget per chunk; falls back to ``max_bytes`` when
            ``None`` or ``0``.
        exclude_generated: Drop files matching the default generated globs
            (plus ``exclude``) with reason ``generated``.
        exclude: Extra globs for files to drop.
        include: Optional allow-list; when non-empty, files matching none of
            these globs are dropped.

    Returns:
        A ``DiffPlan`` with the mode, the chunks to review, the kept and
        excluded files, the byte counts, and a human-readable reason.
    """
    segments = split_diff(diff)
    total_bytes = len(diff.encode("utf-8"))
    chunk_budget = chunk_max_bytes or max_bytes

    # User ``exclude`` globs are part of this list too, so while
    # ``exclude_generated`` is on they are reported as "generated"; the
    # "excluded-by-filter" reason is only reachable when it is off.
    generated_globs = tuple(DEFAULT_GENERATED_GLOBS) + tuple(exclude)

    def _drop_reason(entry: DiffFile) -> str | None:
        """Why ``entry`` is dropped, or None to keep it (checks are ordered)."""
        # An include allow-list, when present, drops anything not matching.
        if include and not _matches_any(entry.path, include):
            return EXCLUDE_NOT_INCLUDED
        if _is_binary(entry.text):
            return EXCLUDE_BINARY
        if exclude_generated and _matches_any(entry.path, generated_globs):
            return EXCLUDE_GENERATED
        if exclude and _matches_any(entry.path, exclude):
            return EXCLUDE_FILTER
        return None

    kept: list[DiffFile] = []
    excluded: list[tuple[str, str]] = []
    for entry in segments:
        why = _drop_reason(entry)
        if why is None:
            kept.append(entry)
        else:
            excluded.append((entry.path, why))

    filtered_diff = "".join(entry.text for entry in kept)
    kept_bytes = len(filtered_diff.encode("utf-8"))
    over_budget = kept_bytes > max_bytes

    if over_budget and chunk:
        mode = MODE_CHUNKED
        chunks = _chunk_files(kept, chunk_budget)
        reason = (
            f"{kept_bytes} B over budget ({max_bytes} B); chunked into "
            f"{len(chunks)} part(s) of <= {chunk_budget} B"
        )
    elif over_budget:
        mode = MODE_TOO_LARGE
        chunks = []
        reason = (
            f"{kept_bytes} B over budget ({max_bytes} B) and chunking is disabled; "
            f"enable [jury.diff] chunk = true or narrow the diff with "
            f"include/exclude filters"
        )
    elif kept_bytes:
        mode = MODE_FULL
        chunks = [filtered_diff]
        reason = f"{kept_bytes} B within budget ({max_bytes} B); reviewing in one pass"
    else:
        # Within budget only because nothing survived the filters.
        mode = MODE_FULL
        chunks = []
        reason = "nothing left to review after filters"

    return DiffPlan(
        mode=mode,
        chunks=chunks,
        kept=kept,
        excluded=excluded,
        total_bytes=total_bytes,
        kept_bytes=kept_bytes,
        reason=reason,
    )

364 )