Coverage for src/ai_jury/largediff.py: 100%
123 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-05 20:29 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-05 20:29 +0000
1"""Large-diff handling: filtering and chunking (issue #31).
3The jury sends the diff to every agent, so a large or generated diff inflates
4cost, runtime, and prompt size. This module measures a diff, drops files that
5should not be reviewed (binary blobs, generated/vendored files, and anything the
6configured path filters exclude), and decides a handling mode:
8- ``full`` — kept diff fits the budget; review it in one pass.
9- ``chunked`` — kept diff is over budget and chunking is enabled; split it
10 into per-file chunks each within the chunk budget.
11- ``too_large`` — over budget and chunking is disabled; the caller should fail
12 with a clear message.
14Everything here is PURE and deterministic: parsing, classification, and chunk
15boundaries are a function of the diff text and config only, so the plan is
16reproducible and unit-testable.
17"""
18from __future__ import annotations
20import fnmatch
21from dataclasses import dataclass, field
# Path globs for files that are generated or otherwise not worth reviewing.
# Kept conservative and language-agnostic; users extend or replace the list
# via ``[jury.diff] exclude``. The groups below are concatenated in order.
_LOCKFILE_GLOBS = (
    "*.lock",
    "package-lock.json",
    "yarn.lock",
    "pnpm-lock.yaml",
    "poetry.lock",
    "Cargo.lock",
    "composer.lock",
    "Gemfile.lock",
    "go.sum",
)
_MINIFIED_GLOBS = ("*.min.js", "*.min.css", "*.map")
_SNAPSHOT_AND_CODEGEN_GLOBS = ("*.snap", "*.pb.go", "*_pb2.py", "*_pb2_grpc.py")
_VENDORED_DIR_GLOBS = ("vendor/**", "node_modules/**", "dist/**", "build/**")

DEFAULT_GENERATED_GLOBS: tuple[str, ...] = (
    _LOCKFILE_GLOBS
    + _MINIFIED_GLOBS
    + _SNAPSHOT_AND_CODEGEN_GLOBS
    + _VENDORED_DIR_GLOBS
)

# Reasons recorded next to each excluded path in ``DiffPlan.excluded``.
EXCLUDE_BINARY = "binary"
EXCLUDE_GENERATED = "generated"
EXCLUDE_FILTER = "excluded-by-filter"
EXCLUDE_NOT_INCLUDED = "not-in-include-filter"

# Handling modes stored in ``DiffPlan.mode``.
MODE_FULL = "full"
MODE_CHUNKED = "chunked"
MODE_TOO_LARGE = "too_large"
@dataclass
class DiffFile:
    """A single file's slice of a unified diff.

    ``path`` is the file path parsed from the segment header (empty when the
    segment has no header) and ``text`` is the raw diff text of the segment.
    """

    path: str
    text: str

    @property
    def size_bytes(self) -> int:
        """Length of ``text`` in bytes once encoded as UTF-8."""
        encoded = self.text.encode("utf-8")
        return len(encoded)
@dataclass
class DiffPlan:
    """Outcome of planning how a diff should be reviewed."""

    # One of the ``MODE_*`` values.
    mode: str
    # Diff texts to review, one entry per pass (empty when nothing is reviewed).
    chunks: list[str] = field(default_factory=list)
    # File segments that survived filtering, in diff order.
    kept: list[DiffFile] = field(default_factory=list)
    # ``(path, reason)`` pairs for every dropped segment.
    excluded: list[tuple[str, str]] = field(default_factory=list)
    # UTF-8 size of the original diff.
    total_bytes: int = 0
    # UTF-8 size of the kept segments joined together.
    kept_bytes: int = 0
    # Human-readable explanation of the chosen mode.
    reason: str = ""

    @property
    def kept_paths(self) -> list[str]:
        """Paths of the kept segments, in order."""
        paths: list[str] = []
        for segment in self.kept:
            paths.append(segment.path)
        return paths
def _strip_ab(path: str) -> str:
    """Drop one leading ``a/`` or ``b/`` diff prefix from ``path``, if present.

    Only a single prefix is removed, so ``a/b/x`` becomes ``b/x``.
    """
    if path[:2] in ("a/", "b/"):
        return path[2:]
    return path
def _header_path(header: str) -> str:
    """Extract the new-side path from a ``diff --git`` header line.

    Handles paths that contain spaces, which a plain whitespace split breaks.
    Git's quoted form for paths with special characters is not decoded.
    """
    rest = header[len("diff --git "):].strip()

    # Non-rename headers are symmetric ("a/<p> b/<p>"), so the separating
    # space is the exact middle character even when <p> contains spaces.
    mid = len(rest) // 2
    if len(rest) % 2 == 1 and rest[mid] == " ":
        old_side = _strip_ab(rest[:mid])
        new_side = _strip_ab(rest[mid + 1:])
        if old_side == new_side:
            return new_side

    # Rename/copy headers: the new side starts at the last " b/".
    _, marker, new_side = rest.rpartition(" b/")
    if marker:
        return new_side

    # No recognisable prefixes (e.g. ``--no-prefix``): fall back to whitespace
    # tokens, preferring the second (new-side) one.
    tokens = rest.split()
    if len(tokens) >= 2:
        return _strip_ab(tokens[1])
    return _strip_ab(tokens[0]) if tokens else ""


def split_diff(diff: str) -> list[DiffFile]:
    """Split a unified diff into per-file segments.

    Segments start at ``diff --git a/<p> b/<p>`` headers (the git format the
    adapters emit) and are named after the new-side (``b/``) path. Any preamble
    before the first header is returned as its own leading segment with an
    empty path, so no bytes are silently dropped. A diff with no ``diff --git``
    header is returned as a single unnamed segment (it cannot be chunked by
    file). An empty diff yields an empty list.

    Concatenating the ``text`` of the returned segments reproduces ``diff``.
    """
    if not diff:
        return []

    files: list[DiffFile] = []
    cur_path = ""  # stays empty for a preamble / header-less diff
    cur: list[str] = []

    for line in diff.splitlines(keepends=True):
        if line.startswith("diff --git "):
            # A new header closes whatever was accumulated so far.
            if cur:
                files.append(DiffFile(path=cur_path, text="".join(cur)))
            cur = [line]
            cur_path = _header_path(line)
        else:
            cur.append(line)

    if cur:
        files.append(DiffFile(path=cur_path, text="".join(cur)))
    return files
def _is_binary(text: str) -> bool:
    """True when a file segment is a git *binary* diff.

    Matches the binary marker on its own header line — ``Binary files … differ``
    or a standalone ``GIT binary patch`` — rather than the substring anywhere in
    the text. A diff's content lines are prefixed with ``+``/``-``/`` ``, so the
    marker must start at column 0: only trailing whitespace is stripped before
    comparing. Stripping leading whitespace too would remove the prefix of a
    context line and misfire on source code that merely *contains* such a line
    (e.g. a test fixture or this module's own detector).
    """
    for line in text.splitlines():
        # rstrip only: a leading space marks a context line, not a header.
        marker = line.rstrip()
        if marker == "GIT binary patch":
            return True
        if marker.startswith("Binary files ") and marker.endswith(" differ"):
            return True
    return False
def _matches_any(path: str, patterns: tuple[str, ...] | list[str]) -> bool:
    """True when ``path`` matches any glob in ``patterns``.

    Supports a trailing ``/**`` to mean "anything under this directory"
    (anchored at the start of ``path``) and the basename for simple ``*.ext``
    patterns, in addition to a full-path match.

    Matching is always case-sensitive: ``fnmatch.fnmatchcase`` is used instead
    of ``fnmatch.fnmatch`` because the latter normalises case per platform,
    which would make the result depend on the host OS rather than on the diff
    and config alone.
    """
    name = path.rsplit("/", 1)[-1]
    for pat in patterns:
        if pat.endswith("/**"):
            prefix = pat[:-2]  # drop "**" but keep the trailing slash
            if path.startswith(prefix):
                return True
        if fnmatch.fnmatchcase(path, pat) or fnmatch.fnmatchcase(name, pat):
            return True
    return False
def _chunk_files(kept: list[DiffFile], chunk_max_bytes: int) -> list[str]:
    """Pack kept files, in order, into chunks that respect the byte budget.

    Packing is greedy: a file joins the open chunk unless that would push the
    chunk past ``chunk_max_bytes``, in which case the open chunk is closed
    first. A file larger than the budget on its own still gets a chunk to
    itself (a file is never split), so chunking degrades gracefully rather
    than failing.
    """
    packed: list[str] = []
    pending: list[str] = []
    pending_size = 0

    for entry in kept:
        entry_size = entry.size_bytes
        would_overflow = pending_size + entry_size > chunk_max_bytes
        if pending and would_overflow:
            # Close the open chunk before starting a new one with this file.
            packed.append("".join(pending))
            pending = []
            pending_size = 0
        pending.append(entry.text)
        pending_size += entry_size

    if pending:
        packed.append("".join(pending))
    return packed
def plan_diff(
    diff: str,
    *,
    max_bytes: int,
    chunk: bool,
    chunk_max_bytes: int | None = None,
    exclude_generated: bool = True,
    exclude: tuple[str, ...] | list[str] = (),
    include: tuple[str, ...] | list[str] = (),
) -> DiffPlan:
    """Measure, filter, and decide a handling mode for ``diff`` (issue #31).

    Args:
        diff: Unified diff text.
        max_bytes: Budget (UTF-8 bytes) for reviewing the kept diff in one pass.
        chunk: Whether an over-budget diff may be split into per-file chunks.
        chunk_max_bytes: Budget per chunk; a falsy value falls back to
            ``max_bytes``.
        exclude_generated: Drop files matching the default generated globs.
            While this is on, ``exclude`` patterns are folded into that list,
            so their matches are reported as ``generated``.
        exclude: Extra globs to drop; reported as ``excluded-by-filter`` when
            ``exclude_generated`` is off.
        include: Optional allow-list; when non-empty, files matching none of
            these globs are dropped first.

    Returns:
        A ``DiffPlan`` with the mode, the chunks to review, kept and excluded
        files, byte totals, and a human-readable reason.
    """
    segments = split_diff(diff)
    original_size = len(diff.encode("utf-8"))
    per_chunk_budget = chunk_max_bytes or max_bytes
    generated_patterns = tuple(DEFAULT_GENERATED_GLOBS) + tuple(exclude)

    def rejection(segment: DiffFile) -> str | None:
        """Return the exclusion reason for ``segment``, or None to keep it."""
        # The include allow-list, when present, is checked before anything else.
        if include and not _matches_any(segment.path, include):
            return EXCLUDE_NOT_INCLUDED
        if _is_binary(segment.text):
            return EXCLUDE_BINARY
        if exclude_generated and _matches_any(segment.path, generated_patterns):
            return EXCLUDE_GENERATED
        if exclude and _matches_any(segment.path, exclude):
            return EXCLUDE_FILTER
        return None

    survivors: list[DiffFile] = []
    dropped: list[tuple[str, str]] = []
    for segment in segments:
        why = rejection(segment)
        if why is None:
            survivors.append(segment)
        else:
            dropped.append((segment.path, why))

    review_text = "".join(segment.text for segment in survivors)
    survivor_size = len(review_text.encode("utf-8"))

    if survivor_size > max_bytes:
        if chunk:
            mode = MODE_CHUNKED
            parts = _chunk_files(survivors, per_chunk_budget)
            reason = (
                f"{survivor_size} B over budget ({max_bytes} B); chunked into "
                f"{len(parts)} part(s) of <= {per_chunk_budget} B"
            )
        else:
            mode = MODE_TOO_LARGE
            parts = []
            reason = (
                f"{survivor_size} B over budget ({max_bytes} B) and chunking is disabled; "
                f"enable [jury.diff] chunk = true or narrow the diff with "
                f"include/exclude filters"
            )
    elif survivor_size:
        mode = MODE_FULL
        parts = [review_text]
        reason = f"{survivor_size} B within budget ({max_bytes} B); reviewing in one pass"
    else:
        # Everything was filtered out (or the diff was empty).
        mode = MODE_FULL
        parts = []
        reason = "nothing left to review after filters"

    return DiffPlan(
        mode=mode,
        chunks=parts,
        kept=survivors,
        excluded=dropped,
        total_bytes=original_size,
        kept_bytes=survivor_size,
        reason=reason,
    )