Coverage for src/ai_jury/largediff.py: 100%

123 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-05 20:29 +0000

1"""Large-diff handling: filtering and chunking (issue #31). 

2 

3The jury sends the diff to every agent, so a large or generated diff inflates 

4cost, runtime, and prompt size. This module measures a diff, drops files that 

5should not be reviewed (binary blobs, generated/vendored files, and anything the 

6configured path filters exclude), and decides a handling mode: 

7 

8- ``full`` — kept diff fits the budget; review it in one pass. 

9- ``chunked`` — kept diff is over budget and chunking is enabled; split it 

10 into per-file chunks each within the chunk budget. 

11- ``too_large`` — over budget and chunking is disabled; the caller should fail 

12 with a clear message. 

13 

14Everything here is PURE and deterministic: parsing, classification, and chunk 

15boundaries are a function of the diff text and config only, so the plan is 

16reproducible and unit-testable. 

17""" 

18from __future__ import annotations 

19 

20import fnmatch 

21from dataclasses import dataclass, field 

22 

# Default "generated / not worth reviewing" path globs. Conservative and
# language-agnostic; users extend or replace via ``[jury.diff] exclude``.
DEFAULT_GENERATED_GLOBS: tuple[str, ...] = (
    # Dependency lockfiles.
    "*.lock",
    "package-lock.json",
    "yarn.lock",
    "pnpm-lock.yaml",
    "poetry.lock",
    "Cargo.lock",
    "composer.lock",
    "Gemfile.lock",
    "go.sum",
    # Minified / map artifacts.
    "*.min.js",
    "*.min.css",
    "*.map",
    # Snapshots and common generated code.
    "*.snap",
    "*.pb.go",
    "*_pb2.py",
    "*_pb2_grpc.py",
    # Vendored / build output directories.
    "vendor/**",
    "node_modules/**",
    "dist/**",
    "build/**",
)

# Reason labels stored next to each dropped path in ``DiffPlan.excluded``.
# The segment is a git binary diff.
EXCLUDE_BINARY = "binary"
# The path matched the generated-file globs.
EXCLUDE_GENERATED = "generated"
# The path matched the caller's ``exclude`` patterns.
EXCLUDE_FILTER = "excluded-by-filter"
# An ``include`` allow-list was given and the path matched none of it.
EXCLUDE_NOT_INCLUDED = "not-in-include-filter"

# Handling modes stored in ``DiffPlan.mode``.
# The kept diff fits the budget and is reviewed in one pass.
MODE_FULL = "full"
# The kept diff is over budget and was split into per-file chunks.
MODE_CHUNKED = "chunked"
# The kept diff is over budget and chunking is disabled.
MODE_TOO_LARGE = "too_large"

60 

61 

@dataclass
class DiffFile:
    """A single file's slice of a unified diff.

    ``path`` names the file the slice belongs to (empty when the slice has no
    ``diff --git`` header) and ``text`` holds the slice's raw diff text.
    """

    path: str
    text: str

    @property
    def size_bytes(self) -> int:
        """Length of ``text`` in bytes once encoded as UTF-8."""
        encoded = self.text.encode("utf-8")
        return len(encoded)

72 

73 

74@dataclass 

75class DiffPlan: 

76 mode: str 

77 chunks: list[str] = field(default_factory=list) 

78 kept: list[DiffFile] = field(default_factory=list) 

79 excluded: list[tuple[str, str]] = field(default_factory=list) 

80 total_bytes: int = 0 

81 kept_bytes: int = 0 

82 reason: str = "" 

83 

84 @property 

85 def kept_paths(self) -> list[str]: 

86 return [f.path for f in self.kept] 

87 

88 

89def _strip_ab(path: str) -> str: 

90 for prefix in ("a/", "b/"): 

91 if path.startswith(prefix): 

92 return path[len(prefix):] 

93 return path 

94 

95 

def _diff_header_path(header: str) -> str:
    """Return the new-side path named by a ``diff --git`` header line.

    Git does not quote spaces in paths, so splitting the header on whitespace
    misreads ``a/my dir/x b/my dir/x``. The path is recovered in three steps:

    1. Same name on both sides: the two halves have equal length, so the
       separating space sits exactly in the middle of the remainder.
    2. ``a/<old> b/<new>`` renames: take what follows the last `` b/``.
    3. Anything else (custom or missing prefixes, quoted paths): fall back to
       whitespace tokens and use the fourth one, or the last when fewer exist.
    """
    rest = header[len("diff --git "):].strip()

    mid = len(rest) // 2
    if len(rest) % 2 == 1 and rest[mid] == " ":
        old_side, new_side = rest[:mid], rest[mid + 1:]
        if _strip_ab(old_side) == _strip_ab(new_side):
            return _strip_ab(new_side)

    if rest.startswith("a/") and " b/" in rest:
        return rest.rpartition(" b/")[2]

    parts = header.split()
    return _strip_ab(parts[3]) if len(parts) >= 4 else _strip_ab(parts[-1])


def split_diff(diff: str) -> list[DiffFile]:
    """Split a unified diff into per-file segments.

    Segments start at ``diff --git a/<p> b/<p>`` headers (the git format the
    adapters emit), and each segment's path is the header's new-side path with
    the ``a/``/``b/`` prefix removed; paths containing spaces are supported.

    Text that precedes the first header is returned as its own segment with an
    empty path, so no bytes are dropped: joining the ``text`` of every segment
    reproduces ``diff``. A diff with no ``diff --git`` header is returned as a
    single unnamed segment (it cannot be chunked by file). An empty ``diff``
    yields an empty list.
    """
    if not diff:
        return []

    files: list[DiffFile] = []
    cur_path = ""  # stays empty until the first header is seen
    cur: list[str] = []

    for line in diff.splitlines(keepends=True):
        if line.startswith("diff --git "):
            # A new header closes whatever segment was being collected.
            if cur:
                files.append(DiffFile(path=cur_path, text="".join(cur)))
            cur = [line]
            cur_path = _diff_header_path(line)
        else:
            cur.append(line)

    if cur:
        files.append(DiffFile(path=cur_path, text="".join(cur)))
    return files

128 

129 

130def _is_binary(text: str) -> bool: 

131 """True when a file segment is a git *binary* diff. 

132 

133 Matches the binary marker on its own header line — ``Binary files … differ`` 

134 or a standalone ``GIT binary patch`` — rather than the substring anywhere in 

135 the text. A diff's content lines are prefixed with ``+``/``-``/`` ``, so this 

136 never misfires on source code that merely *mentions* those strings (e.g. this 

137 module's own detector). 

138 """ 

139 for line in text.splitlines(): 

140 stripped = line.strip() 

141 if stripped == "GIT binary patch": 

142 return True 

143 if stripped.startswith("Binary files ") and stripped.endswith(" differ"): 

144 return True 

145 return False 

146 

147 

148def _matches_any(path: str, patterns) -> bool: 

149 """True when ``path`` matches any glob in ``patterns``. 

150 

151 Supports a trailing ``/**`` to mean "anything under this directory" and the 

152 basename for simple ``*.ext`` patterns, in addition to a full-path match. 

153 """ 

154 name = path.rsplit("/", 1)[-1] 

155 for pat in patterns: 

156 if pat.endswith("/**"): 

157 prefix = pat[:-2] # keep trailing slash 

158 if path.startswith(prefix): 

159 return True 

160 if fnmatch.fnmatch(path, pat) or fnmatch.fnmatch(name, pat): 

161 return True 

162 return False 

163 

164 

165def _chunk_files(kept: list[DiffFile], chunk_max_bytes: int) -> list[str]: 

166 """Greedily pack kept files into chunks no larger than the budget. 

167 

168 Files keep their order. A file that alone exceeds the budget becomes its own 

169 chunk (it cannot be split without breaking the diff), so chunking degrades 

170 gracefully rather than failing. 

171 """ 

172 chunks: list[str] = [] 

173 current: list[str] = [] 

174 current_bytes = 0 

175 for f in kept: 

176 fb = f.size_bytes 

177 if current and current_bytes + fb > chunk_max_bytes: 

178 chunks.append("".join(current)) 

179 current, current_bytes = [], 0 

180 current.append(f.text) 

181 current_bytes += fb 

182 if current: 

183 chunks.append("".join(current)) 

184 return chunks 

185 

186 

def plan_diff(
    diff: str,
    *,
    max_bytes: int,
    chunk: bool,
    chunk_max_bytes: int | None = None,
    exclude_generated: bool = True,
    exclude: tuple[str, ...] | list[str] = (),
    include: tuple[str, ...] | list[str] = (),
) -> DiffPlan:
    """Measure, filter, and decide a handling mode for ``diff`` (issue #31).

    Args:
        diff: The unified diff text to plan for.
        max_bytes: Budget for the kept diff, in UTF-8 bytes.
        chunk: Whether an over-budget diff may be split into chunks.
        chunk_max_bytes: Budget per chunk; a falsy value (``None`` or ``0``)
            falls back to ``max_bytes``.
        exclude_generated: Drop files matching the generated-file globs.
        exclude: Extra path globs to drop.
        include: Allow-list of path globs; when non-empty, files matching
            none of them are dropped.

    Returns:
        A ``DiffPlan`` carrying the mode, the chunks to review, the kept and
        excluded files, the byte counts, and a human-readable reason.
    """
    files = split_diff(diff)
    total_bytes = len(diff.encode("utf-8"))
    if not chunk_max_bytes:
        chunk_max_bytes = max_bytes

    # User ``exclude`` patterns are appended to the generated globs, so while
    # ``exclude_generated`` is on, a path matching either list is recorded as
    # EXCLUDE_GENERATED; the separate EXCLUDE_FILTER check only decides when
    # ``exclude_generated`` is off.
    generated_globs = tuple(DEFAULT_GENERATED_GLOBS) + tuple(exclude)

    def exclusion_reason(entry: DiffFile) -> str | None:
        """Return why ``entry`` is dropped, or ``None`` when it is kept."""
        # Checks run in a fixed order; the first hit wins.
        if include and not _matches_any(entry.path, include):
            return EXCLUDE_NOT_INCLUDED
        if _is_binary(entry.text):
            return EXCLUDE_BINARY
        if exclude_generated and _matches_any(entry.path, generated_globs):
            return EXCLUDE_GENERATED
        if exclude and _matches_any(entry.path, exclude):
            return EXCLUDE_FILTER
        return None

    kept: list[DiffFile] = []
    excluded: list[tuple[str, str]] = []
    for entry in files:
        why = exclusion_reason(entry)
        if why is None:
            kept.append(entry)
        else:
            excluded.append((entry.path, why))

    filtered_diff = "".join(entry.text for entry in kept)
    kept_bytes = len(filtered_diff.encode("utf-8"))
    over_budget = kept_bytes > max_bytes

    chunks: list[str]
    if not over_budget:
        mode = MODE_FULL
        if kept_bytes:
            chunks = [filtered_diff]
            reason = (
                f"{kept_bytes} B within budget ({max_bytes} B); reviewing in one pass"
            )
        else:
            chunks = []
            reason = "nothing left to review after filters"
    elif not chunk:
        mode = MODE_TOO_LARGE
        chunks = []
        reason = (
            f"{kept_bytes} B over budget ({max_bytes} B) and chunking is disabled; "
            f"enable [jury.diff] chunk = true or narrow the diff with "
            f"include/exclude filters"
        )
    else:
        mode = MODE_CHUNKED
        chunks = _chunk_files(kept, chunk_max_bytes)
        reason = (
            f"{kept_bytes} B over budget ({max_bytes} B); chunked into "
            f"{len(chunks)} part(s) of <= {chunk_max_bytes} B"
        )

    return DiffPlan(
        mode=mode,
        chunks=chunks,
        kept=kept,
        excluded=excluded,
        total_bytes=total_bytes,
        kept_bytes=kept_bytes,
        reason=reason,
    )