ast-chunking.test.ts 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199
/**
 * Integration tests for AST-aware chunking.
 *
 * Migrated from the standalone test-ast-chunking.mjs script into the
 * vitest suite. Covers the integration between AST break point extraction
 * and the chunking pipeline — areas not tested by the unit-level ast.test.ts.
 */
  8. import { describe, test, expect } from "vitest";
  9. import { getASTBreakPoints } from "../src/ast.js";
  10. import {
  11. chunkDocument,
  12. chunkDocumentAsync,
  13. chunkDocumentWithBreakPoints,
  14. mergeBreakPoints,
  15. scanBreakPoints,
  16. findCodeFences,
  17. } from "../src/store.js";
  18. // ==========================================================================
  19. // mergeBreakPoints
  20. // ==========================================================================
  21. describe("mergeBreakPoints", () => {
  22. test("merges regex and AST break points, higher score wins at same position", () => {
  23. const regexPoints = [
  24. { pos: 10, score: 20, type: "blank" },
  25. { pos: 50, score: 1, type: "newline" },
  26. { pos: 100, score: 20, type: "blank" },
  27. ];
  28. const astPoints = [
  29. { pos: 10, score: 90, type: "ast:func" },
  30. { pos: 75, score: 100, type: "ast:class" },
  31. { pos: 100, score: 60, type: "ast:import" },
  32. ];
  33. const merged = mergeBreakPoints(regexPoints, astPoints);
  34. expect(merged).toHaveLength(4);
  35. expect(merged.find(p => p.pos === 10)?.score).toBe(90); // AST wins (90 > 20)
  36. expect(merged.find(p => p.pos === 50)?.score).toBe(1); // regex only
  37. expect(merged.find(p => p.pos === 75)?.score).toBe(100); // AST only
  38. expect(merged.find(p => p.pos === 100)?.score).toBe(60); // AST wins (60 > 20)
  39. });
  40. test("result is sorted by position", () => {
  41. const merged = mergeBreakPoints(
  42. [{ pos: 100, score: 10, type: "a" }],
  43. [{ pos: 5, score: 50, type: "b" }],
  44. );
  45. expect(merged[0]!.pos).toBeLessThan(merged[1]!.pos);
  46. });
  47. });
  48. // ==========================================================================
  49. // AST vs Regex chunking comparison
  50. // ==========================================================================
  51. describe("AST vs Regex chunking", () => {
  52. // Generate a large TS file with 30 functions
  53. const parts: string[] = [];
  54. for (let i = 0; i < 30; i++) {
  55. parts.push(`
  56. export function handler${i}(req: Request, res: Response): void {
  57. const startTime = Date.now();
  58. const userId = req.params.userId;
  59. const sessionToken = req.headers.authorization;
  60. if (!userId || !sessionToken) {
  61. res.status(400).json({ error: "Missing required parameters" });
  62. return;
  63. }
  64. console.log(\`Processing request ${i} for user \${userId}\`);
  65. const result = processBusinessLogic${i}(userId, sessionToken);
  66. const elapsed = Date.now() - startTime;
  67. res.json({ data: result, processingTimeMs: elapsed });
  68. }
  69. `);
  70. }
  71. const largeTS = parts.join("\n");
  72. function countSplitFunctions(chunks: { text: string; pos: number }[]): number {
  73. let splits = 0;
  74. for (let i = 0; i < 30; i++) {
  75. const funcStart = largeTS.indexOf(`function handler${i}(`);
  76. const nextFunc = largeTS.indexOf(`function handler${i + 1}(`, funcStart + 1);
  77. const funcEnd = nextFunc > 0 ? nextFunc : largeTS.length;
  78. const chunkIndices = new Set<number>();
  79. for (let ci = 0; ci < chunks.length; ci++) {
  80. const chunkStart = chunks[ci]!.pos;
  81. const chunkEnd = chunkStart + chunks[ci]!.text.length;
  82. if (chunkStart < funcEnd && chunkEnd > funcStart) {
  83. chunkIndices.add(ci);
  84. }
  85. }
  86. if (chunkIndices.size > 1) splits++;
  87. }
  88. return splits;
  89. }
  90. test("AST splits fewer functions across chunk boundaries than regex", async () => {
  91. const regexChunks = chunkDocument(largeTS);
  92. const astChunks = await chunkDocumentAsync(largeTS, undefined, undefined, undefined, "handlers.ts", "auto");
  93. const regexSplits = countSplitFunctions(regexChunks);
  94. const astSplits = countSplitFunctions(astChunks);
  95. expect(astSplits).toBeLessThanOrEqual(regexSplits);
  96. });
  97. test("markdown files produce identical chunks in auto vs regex mode", async () => {
  98. const sections: string[] = [];
  99. for (let i = 0; i < 15; i++) {
  100. sections.push(`# Section ${i}\n\n${"Lorem ipsum dolor sit amet. ".repeat(40)}\n`);
  101. }
  102. const largeMD = sections.join("\n");
  103. const mdRegex = chunkDocument(largeMD);
  104. const mdAst = await chunkDocumentAsync(largeMD, undefined, undefined, undefined, "readme.md", "auto");
  105. expect(mdAst).toHaveLength(mdRegex.length);
  106. for (let i = 0; i < mdRegex.length; i++) {
  107. expect(mdAst[i]?.text).toBe(mdRegex[i]?.text);
  108. expect(mdAst[i]?.pos).toBe(mdRegex[i]?.pos);
  109. }
  110. });
  111. test("regex strategy bypasses AST entirely", async () => {
  112. const regexOnly = await chunkDocumentAsync(largeTS, undefined, undefined, undefined, "handlers.ts", "regex");
  113. const syncRegex = chunkDocument(largeTS);
  114. expect(regexOnly).toHaveLength(syncRegex.length);
  115. for (let i = 0; i < syncRegex.length; i++) {
  116. expect(regexOnly[i]?.text).toBe(syncRegex[i]?.text);
  117. }
  118. });
  119. test("no filepath falls back to regex", async () => {
  120. const noPathChunks = await chunkDocumentAsync(largeTS, undefined, undefined, undefined, undefined, "auto");
  121. const syncRegex = chunkDocument(largeTS);
  122. expect(noPathChunks).toHaveLength(syncRegex.length);
  123. });
  124. test("small file produces single chunk", async () => {
  125. const smallChunks = await chunkDocumentAsync("export const x = 1;", undefined, undefined, undefined, "s.ts", "auto");
  126. expect(smallChunks).toHaveLength(1);
  127. });
  128. });
  129. // ==========================================================================
  130. // chunkDocumentWithBreakPoints equivalence
  131. // ==========================================================================
  132. describe("chunkDocumentWithBreakPoints equivalence", () => {
  133. test("produces identical output to chunkDocument for the same content", () => {
  134. const content = "a".repeat(5000) + "\n\n" + "b".repeat(5000);
  135. const old = chunkDocument(content);
  136. const withBP = chunkDocumentWithBreakPoints(content, scanBreakPoints(content), findCodeFences(content));
  137. expect(withBP).toHaveLength(old.length);
  138. for (let i = 0; i < old.length; i++) {
  139. expect(withBP[i]?.text).toBe(old[i]?.text);
  140. expect(withBP[i]?.pos).toBe(old[i]?.pos);
  141. }
  142. });
  143. });
  144. // ==========================================================================
  145. // Score assertions not covered by ast.test.ts unit tests
  146. // ==========================================================================
  147. describe("AST break point scores", () => {
  148. test("TypeScript export (class) scores 90", async () => {
  149. const code = `export class Foo {}\nexport function bar() {}`;
  150. const points = await getASTBreakPoints(code, "a.ts");
  151. const exportPoint = points.find(p => p.type === "ast:export");
  152. expect(exportPoint?.score).toBe(90);
  153. });
  154. test("Python class scores 100", async () => {
  155. const code = `class Foo:\n pass\n\ndef bar():\n pass`;
  156. const points = await getASTBreakPoints(code, "a.py");
  157. expect(points.find(p => p.type === "ast:class")?.score).toBe(100);
  158. });
  159. test("Go type scores 80", async () => {
  160. const code = `package main\n\ntype Server struct {\n port int\n}\n\nfunc main() {}`;
  161. const points = await getASTBreakPoints(code, "a.go");
  162. expect(points.find(p => p.type === "ast:type")?.score).toBe(80);
  163. });
  164. test("Rust enum scores 80", async () => {
  165. const code = `enum State {\n On,\n Off,\n}\n\nfn main() {}`;
  166. const points = await getASTBreakPoints(code, "a.rs");
  167. expect(points.find(p => p.type === "ast:enum")?.score).toBe(80);
  168. });
  169. });