/**
 * Integration tests for AST-aware chunking.
 *
 * Migrated from the standalone test-ast-chunking.mjs script into the
 * vitest suite. Covers the integration between AST break point extraction
 * and the chunking pipeline — areas not tested by the unit-level ast.test.ts.
 */

import { describe, test, expect } from "vitest";
import { getASTBreakPoints } from "../src/ast.js";
import {
  chunkDocument,
  chunkDocumentAsync,
  chunkDocumentWithBreakPoints,
  mergeBreakPoints,
  scanBreakPoints,
  findCodeFences,
} from "../src/store.js";

/** Minimal chunk shape these tests read: the chunk text and its offset in the source content. */
type ChunkLike = { text: string; pos: number };

/**
 * Asserts that two chunk lists are equivalent: same length, and every chunk
 * matches its counterpart in both `text` and `pos`.
 *
 * @param actual   chunks produced by the code path under test
 * @param expected chunks produced by the reference code path
 */
function expectSameChunks(actual: readonly ChunkLike[], expected: readonly ChunkLike[]): void {
  expect(actual).toHaveLength(expected.length);
  expected.forEach((want, i) => {
    expect(actual[i]?.text).toBe(want.text);
    expect(actual[i]?.pos).toBe(want.pos);
  });
}

// ==========================================================================
// mergeBreakPoints
// ==========================================================================

describe("mergeBreakPoints", () => {
  test("merges regex and AST break points, higher score wins at same position", () => {
    const regexPoints = [
      { pos: 10, score: 20, type: "blank" },
      { pos: 50, score: 1, type: "newline" },
      { pos: 100, score: 20, type: "blank" },
    ];
    const astPoints = [
      { pos: 10, score: 90, type: "ast:func" },
      { pos: 75, score: 100, type: "ast:class" },
      { pos: 100, score: 60, type: "ast:import" },
    ];

    const merged = mergeBreakPoints(regexPoints, astPoints);
    const scoreAt = (pos: number) => merged.find(p => p.pos === pos)?.score;

    // Six inputs, two shared positions (10 and 100) → four merged points.
    expect(merged).toHaveLength(4);
    expect(scoreAt(10)).toBe(90); // AST wins (90 > 20)
    expect(scoreAt(50)).toBe(1); // regex only
    expect(scoreAt(75)).toBe(100); // AST only
    expect(scoreAt(100)).toBe(60); // AST wins (60 > 20)
  });

  test("result is sorted by position", () => {
    // Inputs are deliberately passed in descending position order.
    const merged = mergeBreakPoints(
      [{ pos: 100, score: 10, type: "a" }],
      [{ pos: 5, score: 50, type: "b" }],
    );

    // Checking the full position sequence also pins the length, so a
    // truncated result cannot slip through an index-based comparison.
    expect(merged.map(p => p.pos)).toEqual([5, 100]);
  });
});

// ==========================================================================
// AST vs Regex chunking comparison
// ==========================================================================

describe("AST vs Regex chunking", () => {
  /** Number of generated handler functions; shared by the fixture and the split counter. */
  const HANDLER_COUNT = 30;

  /** Source text of the i-th generated handler function. */
  const handlerSource = (i: number): string =>
    ` export function handler${i}(req: Request, res: Response): void { const startTime = Date.now(); const userId = req.params.userId; const sessionToken = req.headers.authorization; if (!userId || !sessionToken) { res.status(400).json({ error: "Missing required parameters" }); return; } console.log(\`Processing request ${i} for user \${userId}\`); const result = processBusinessLogic${i}(userId, sessionToken); const elapsed = Date.now() - startTime; res.json({ data: result, processingTimeMs: elapsed }); } `;

  // Generate a large TS file with 30 functions
  const largeTS = Array.from({ length: HANDLER_COUNT }, (_, i) => handlerSource(i)).join("\n");

  /**
   * Counts how many generated handlers overlap more than one chunk.
   *
   * A handler's span runs from its `function handlerN(` marker up to the next
   * handler's marker (or the end of the fixture for the last one).
   *
   * @param chunks chunks of `largeTS`, each with its text and start offset
   * @returns the number of handlers whose span intersects two or more chunks
   * @throws Error if a handler marker is missing from the fixture
   */
  function countSplitFunctions(chunks: { text: string; pos: number }[]): number {
    let splits = 0;
    for (let i = 0; i < HANDLER_COUNT; i++) {
      const funcStart = largeTS.indexOf(`function handler${i}(`);
      if (funcStart === -1) {
        // Without this guard a missing marker would be silently counted as a split.
        throw new Error(`fixture is missing handler${i}`);
      }
      const nextFunc = largeTS.indexOf(`function handler${i + 1}(`, funcStart + 1);
      const funcEnd = nextFunc !== -1 ? nextFunc : largeTS.length;

      // Half-open interval overlap between [chunk.pos, chunk end) and [funcStart, funcEnd).
      const overlapping = chunks.filter(
        chunk => chunk.pos < funcEnd && chunk.pos + chunk.text.length > funcStart,
      ).length;
      if (overlapping > 1) splits++;
    }
    return splits;
  }

  test("AST splits fewer functions across chunk boundaries than regex", async () => {
    const regexChunks = chunkDocument(largeTS);
    const astChunks = await chunkDocumentAsync(largeTS, undefined, undefined, undefined, "handlers.ts", "auto");

    const regexSplits = countSplitFunctions(regexChunks);
    const astSplits = countSplitFunctions(astChunks);

    expect(astSplits).toBeLessThanOrEqual(regexSplits);
  });

  test("markdown files produce identical chunks in auto vs regex mode", async () => {
    const sections: string[] = [];
    for (let i = 0; i < 15; i++) {
      sections.push(`# Section ${i}\n\n${"Lorem ipsum dolor sit amet. ".repeat(40)}\n`);
    }
    const largeMD = sections.join("\n");

    const mdRegex = chunkDocument(largeMD);
    const mdAst = await chunkDocumentAsync(largeMD, undefined, undefined, undefined, "readme.md", "auto");

    expectSameChunks(mdAst, mdRegex);
  });

  test("regex strategy bypasses AST entirely", async () => {
    const regexOnly = await chunkDocumentAsync(largeTS, undefined, undefined, undefined, "handlers.ts", "regex");
    const syncRegex = chunkDocument(largeTS);

    // Compares positions as well as text, matching the markdown equivalence test above.
    expectSameChunks(regexOnly, syncRegex);
  });

  test("no filepath falls back to regex", async () => {
    const noPathChunks = await chunkDocumentAsync(largeTS, undefined, undefined, undefined, undefined, "auto");
    const syncRegex = chunkDocument(largeTS);

    expect(noPathChunks).toHaveLength(syncRegex.length);
  });

  test("small file produces single chunk", async () => {
    const smallChunks = await chunkDocumentAsync("export const x = 1;", undefined, undefined, undefined, "s.ts", "auto");

    expect(smallChunks).toHaveLength(1);
  });
});

// ==========================================================================
// chunkDocumentWithBreakPoints equivalence
// ==========================================================================

describe("chunkDocumentWithBreakPoints equivalence", () => {
  test("produces identical output to chunkDocument for the same content", () => {
    const content = "a".repeat(5000) + "\n\n" + "b".repeat(5000);

    const old = chunkDocument(content);
    const withBP = chunkDocumentWithBreakPoints(content, scanBreakPoints(content), findCodeFences(content));

    expectSameChunks(withBP, old);
  });
});

// ==========================================================================
// Score assertions not covered by ast.test.ts unit tests
// ==========================================================================

describe("AST break point scores", () => {
  test("TypeScript export (class) scores 90", async () => {
    const code = `export class Foo {}\nexport function bar() {}`;
    const points = await getASTBreakPoints(code, "a.ts");
    const exportPoint = points.find(p => p.type === "ast:export");
    expect(exportPoint?.score).toBe(90);
  });

  test("Python class scores 100", async () => {
    const code = `class Foo:\n pass\n\ndef bar():\n pass`;
    const points = await getASTBreakPoints(code, "a.py");
    expect(points.find(p => p.type === "ast:class")?.score).toBe(100);
  });

  test("Go type scores 80", async () => {
    const code = `package main\n\ntype Server struct {\n port int\n}\n\nfunc main() {}`;
    const points = await getASTBreakPoints(code, "a.go");
    expect(points.find(p => p.type === "ast:type")?.score).toBe(80);
  });

  test("Rust enum scores 80", async () => {
    const code = `enum State {\n On,\n Off,\n}\n\nfn main() {}`;
    const points = await getASTBreakPoints(code, "a.rs");
    expect(points.find(p => p.type === "ast:enum")?.score).toBe(80);
  });
});

// ==========================================================================
// Function-level chunk strategy (Phase 2)
// ==========================================================================

describe("chunkDocumentAsync with chunkStrategy='function'", () => {
  const TS_CODE = `import { X } from "./x"; export function alpha(): number { const start = Date.now(); return start; } export function beta(): number { return 42; } export class Gamma { constructor() {} run(): void {} } `;

  /** Chunks `content` with the "function" strategy, leaving the three middle arguments at their defaults. */
  const chunkByFunction = (content: string, filepath: string) =>
    chunkDocumentAsync(content, undefined, undefined, undefined, filepath, "function");

  test("produces one chunk per top-level code unit + import gap", async () => {
    const chunks = await chunkByFunction(TS_CODE, "x.ts");

    // alpha, beta and Gamma account for the asserted minimum of 3 chunks.
    // A separate import-gap chunk would make it 4, but that is not asserted here.
    expect(chunks.length).toBeGreaterThanOrEqual(3);
  });

  test("each function chunk contains exactly one function/class body", async () => {
    const chunks = await chunkByFunction(TS_CODE, "x.ts");

    const alphaChunk = chunks.find(c => c.text.includes("function alpha"));
    const betaChunk = chunks.find(c => c.text.includes("function beta"));
    const classChunk = chunks.find(c => c.text.includes("class Gamma"));

    expect(alphaChunk).toBeDefined();
    expect(betaChunk).toBeDefined();
    expect(classChunk).toBeDefined();

    // A chunk must not run on into the next top-level unit.
    expect(alphaChunk?.text).not.toContain("function beta");
    expect(betaChunk?.text).not.toContain("class Gamma");
  });

  test("pos reflects absolute offset in original content", async () => {
    const chunks = await chunkByFunction(TS_CODE, "x.ts");

    for (const c of chunks) {
      expect(c.pos).toBeGreaterThanOrEqual(0);

      // Compare the first (up to) 20 characters of the chunk with the
      // original content at the reported offset.
      const probeLength = Math.min(20, c.text.length);
      const slice = TS_CODE.slice(c.pos, c.pos + probeLength);
      const head = c.text.slice(0, probeLength);
      expect(slice).toBe(head);
    }
  });

  test("markdown falls back to auto behavior when chunkStrategy='function'", async () => {
    // Markdown → detectLanguage returns null → getASTFunctionRanges returns []
    // → fall through to auto behavior → short markdown = 1 chunk.
    const md = "# Heading\n\nSome paragraph text.";
    const chunks = await chunkByFunction(md, "readme.md");

    expect(chunks.length).toBe(1);
    expect(chunks[0]?.text).toBe(md);
  });

  test("code file with only bare statements falls back to auto (no ranges)", async () => {
    const bare = "const x = 1;\nconst y = 2;\n";
    const chunks = await chunkByFunction(bare, "bare.ts");

    expect(chunks.length).toBe(1);
  });
});