test-ast-chunking.mjs 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823
  1. #!/usr/bin/env npx tsx
  2. /**
  3. * Thorough integration test + real-collection performance report for
  4. * AST-aware chunking.
  5. *
  6. * Usage:
  7. * npx tsx test-ast-chunking.mjs # synthetic tests only
  8. * npx tsx test-ast-chunking.mjs /path/to/code # + scan a real directory
  9. * npx tsx test-ast-chunking.mjs ~/dev/myproject # works with ~
  10. * npx tsx test-ast-chunking.mjs --help
  11. *
  12. * The real-collection scan walks the directory tree, finds supported code
  13. * files (.ts/.js/.py/.go/.rs) and markdown (.md), chunks each file with
  14. * both strategies, and prints a comparative performance report.
  15. */
  16. import { readFileSync, readdirSync, statSync } from "node:fs";
  17. import { join, relative, extname, resolve } from "node:path";
  18. import { homedir } from "node:os";
  19. import { detectLanguage, getASTBreakPoints } from "./src/ast.js";
  20. import {
  21. chunkDocument,
  22. chunkDocumentAsync,
  23. chunkDocumentWithBreakPoints,
  24. mergeBreakPoints,
  25. scanBreakPoints,
  26. findCodeFences,
  27. CHUNK_SIZE_CHARS,
  28. } from "./src/store.js";
  29. // ============================================================================
  30. // Helpers
  31. // ============================================================================
// Running tallies for the synthetic tests: check() bumps exactly one of these per call.
let passed = 0;
// A non-zero count makes the script exit with status 1 at the very end.
let failed = 0;
  34. function section(title) {
  35. console.log(`\n${"=".repeat(70)}`);
  36. console.log(` ${title}`);
  37. console.log("=".repeat(70));
  38. }
  39. function check(label, condition, detail) {
  40. if (condition) {
  41. console.log(` PASS ${label}`);
  42. passed++;
  43. } else {
  44. console.log(` FAIL ${label}`);
  45. if (detail) console.log(` ${detail}`);
  46. failed++;
  47. }
  48. }
  49. function formatBytes(bytes) {
  50. if (bytes < 1024) return `${bytes} B`;
  51. if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
  52. return `${(bytes / 1024 / 1024).toFixed(1)} MB`;
  53. }
  54. function pct(n, d) {
  55. if (d === 0) return "N/A";
  56. return `${((n / d) * 100).toFixed(1)}%`;
  57. }
  58. const SKIP_DIRS = new Set([
  59. "node_modules", ".git", ".cache", "vendor", "dist", "build",
  60. "__pycache__", ".tox", ".venv", "venv", ".mypy_cache", "target",
  61. ".next", ".nuxt", "coverage", ".turbo",
  62. ]);
  63. const CODE_EXTS = new Set([
  64. ".ts", ".tsx", ".js", ".jsx", ".mts", ".cts", ".mjs", ".cjs",
  65. ".py", ".go", ".rs",
  66. ]);
  67. const ALL_EXTS = new Set([...CODE_EXTS, ".md"]);
  68. function walkDir(dir, maxFiles = 5000) {
  69. const results = [];
  70. const queue = [dir];
  71. while (queue.length > 0 && results.length < maxFiles) {
  72. const current = queue.shift();
  73. let entries;
  74. try {
  75. entries = readdirSync(current, { withFileTypes: true });
  76. } catch {
  77. continue;
  78. }
  79. for (const entry of entries) {
  80. if (results.length >= maxFiles) break;
  81. if (entry.name.startsWith(".")) continue;
  82. const full = join(current, entry.name);
  83. if (entry.isDirectory()) {
  84. if (!SKIP_DIRS.has(entry.name)) queue.push(full);
  85. } else if (entry.isFile()) {
  86. const ext = extname(entry.name).toLowerCase();
  87. if (ALL_EXTS.has(ext)) results.push(full);
  88. }
  89. }
  90. }
  91. return results;
  92. }
  93. // ============================================================================
  94. // Parse CLI args
  95. // ============================================================================
  96. const args = process.argv.slice(2);
  97. let scanDir = null;
  98. let skipSynthetic = false;
  99. for (const arg of args) {
  100. if (arg === "--help" || arg === "-h") {
  101. console.log(`Usage: npx tsx test-ast-chunking.mjs [options] [directory]
  102. Options:
  103. --help, -h Show this help
  104. --scan-only Skip synthetic tests, only scan directory
  105. Arguments:
  106. directory Path to scan for a real-collection performance report.
  107. Walks the tree for .ts/.tsx/.js/.jsx/.py/.go/.rs/.md files.
  108. Examples:
  109. npx tsx test-ast-chunking.mjs # synthetic tests only
  110. npx tsx test-ast-chunking.mjs ~/dev/myproject # synthetic + real scan
  111. npx tsx test-ast-chunking.mjs --scan-only ~/dev # real scan only
  112. `);
  113. process.exit(0);
  114. }
  115. if (arg === "--scan-only") {
  116. skipSynthetic = true;
  117. } else if (!arg.startsWith("-")) {
  118. scanDir = arg.startsWith("~") ? arg.replace("~", homedir()) : resolve(arg);
  119. }
  120. }
  121. // ============================================================================
  122. // PART 1: Synthetic Tests
  123. // ============================================================================
  124. if (!skipSynthetic) {
  125. // --------------------------------------------------------------------------
  126. // 1. Language Detection
  127. // --------------------------------------------------------------------------
  128. section("1. Language Detection");
  129. const langTests = [
  130. ["src/auth.ts", "typescript"],
  131. ["src/App.tsx", "tsx"],
  132. ["src/util.js", "javascript"],
  133. ["src/App.jsx", "tsx"],
  134. ["src/auth.mts", "typescript"],
  135. ["src/auth.cjs", "javascript"],
  136. ["src/auth.py", "python"],
  137. ["src/auth.go", "go"],
  138. ["src/auth.rs", "rust"],
  139. ["docs/README.md", null],
  140. ["data/file.csv", null],
  141. ["Makefile", null],
  142. ["qmd://myproject/src/auth.ts", "typescript"],
  143. ["qmd://docs/notes.md", null],
  144. ];
  145. for (const [path, expected] of langTests) {
  146. const result = detectLanguage(path);
  147. check(`detectLanguage("${path}") = ${result}`, result === expected,
  148. `expected ${expected}, got ${result}`);
  149. }
  150. // --------------------------------------------------------------------------
  151. // 2. AST Break Points - TypeScript
  152. // --------------------------------------------------------------------------
  153. section("2. AST Break Points - TypeScript");
  154. const TS_SAMPLE = `import { Database } from './db';
  155. import type { User } from './types';
  156. interface AuthConfig {
  157. secret: string;
  158. ttl: number;
  159. }
  160. type UserId = string;
  161. export class AuthService {
  162. constructor(private db: Database) {}
  163. async authenticate(user: User, token: string): Promise<boolean> {
  164. const session = await this.db.findSession(token);
  165. return session?.userId === user.id;
  166. }
  167. validateToken(token: string): boolean {
  168. return token.length === 64;
  169. }
  170. }
  171. export function hashPassword(password: string): string {
  172. return crypto.createHash('sha256').update(password).digest('hex');
  173. }
  174. const helper = (x: number) => x * 2;
  175. `;
  176. const tsPoints = await getASTBreakPoints(TS_SAMPLE, "auth.ts");
  177. console.log(`\n TypeScript break points (${tsPoints.length} total):`);
  178. for (const p of tsPoints) {
  179. const snippet = TS_SAMPLE.slice(p.pos, p.pos + 40).replace(/\n/g, "\\n");
  180. console.log(` pos=${String(p.pos).padStart(4)} score=${String(p.score).padStart(3)} type=${p.type.padEnd(15)} text="${snippet}..."`);
  181. }
  182. check("Has import break points", tsPoints.some(p => p.type === "ast:import"));
  183. check("Has interface break point", tsPoints.some(p => p.type === "ast:iface"));
  184. check("Has type break point", tsPoints.some(p => p.type === "ast:type"));
  185. check("Has export break point (class)", tsPoints.some(p => p.type === "ast:export"));
  186. check("Has method break points", tsPoints.filter(p => p.type === "ast:method").length >= 2);
  187. check("Import scores 60", tsPoints.find(p => p.type === "ast:import")?.score === 60);
  188. check("Interface scores 100", tsPoints.find(p => p.type === "ast:iface")?.score === 100);
  189. check("Method scores 90", tsPoints.find(p => p.type === "ast:method")?.score === 90);
  190. check("Export scores 90", tsPoints.find(p => p.type === "ast:export")?.score === 90);
  191. check("Break points sorted by position", tsPoints.every((p, i) => i === 0 || p.pos >= tsPoints[i-1].pos));
  192. const firstImport = tsPoints.find(p => p.type === "ast:import");
  193. check("First import position is correct",
  194. TS_SAMPLE.slice(firstImport.pos, firstImport.pos + 6) === "import",
  195. `at pos ${firstImport.pos}: "${TS_SAMPLE.slice(firstImport.pos, firstImport.pos + 10)}"`);
  196. // --------------------------------------------------------------------------
  197. // 3. AST Break Points - Python
  198. // --------------------------------------------------------------------------
  199. section("3. AST Break Points - Python");
  200. const PY_SAMPLE = `import os
  201. from typing import Optional, List
  202. class UserService:
  203. def __init__(self, db):
  204. self.db = db
  205. async def find_user(self, user_id: str) -> Optional[dict]:
  206. return await self.db.find(user_id)
  207. def validate(self, user: dict) -> bool:
  208. return "id" in user and "name" in user
  209. def create_user(name: str, email: str) -> dict:
  210. return {"name": name, "email": email}
  211. @login_required
  212. def protected_endpoint():
  213. return "secret"
  214. `;
  215. const pyPoints = await getASTBreakPoints(PY_SAMPLE, "service.py");
  216. console.log(`\n Python break points (${pyPoints.length} total):`);
  217. for (const p of pyPoints) {
  218. const snippet = PY_SAMPLE.slice(p.pos, p.pos + 40).replace(/\n/g, "\\n");
  219. console.log(` pos=${String(p.pos).padStart(4)} score=${String(p.score).padStart(3)} type=${p.type.padEnd(15)} text="${snippet}..."`);
  220. }
  221. check("Has import break points", pyPoints.filter(p => p.type === "ast:import").length >= 2);
  222. check("Has class break point", pyPoints.some(p => p.type === "ast:class"));
  223. check("Has function break points (methods)", pyPoints.filter(p => p.type === "ast:func").length >= 3);
  224. check("Has decorated definition", pyPoints.some(p => p.type === "ast:decorated"));
  225. check("Class scores 100", pyPoints.find(p => p.type === "ast:class")?.score === 100);
  226. // --------------------------------------------------------------------------
  227. // 4. AST Break Points - Go
  228. // --------------------------------------------------------------------------
  229. section("4. AST Break Points - Go");
  230. const GO_SAMPLE = `package main
  231. import (
  232. "fmt"
  233. "net/http"
  234. )
  235. type Server struct {
  236. port int
  237. db *Database
  238. }
  239. type Config interface {
  240. GetPort() int
  241. }
  242. func NewServer(port int) *Server {
  243. return &Server{port: port}
  244. }
  245. func (s *Server) Start() error {
  246. return http.ListenAndServe(fmt.Sprintf(":%d", s.port), nil)
  247. }
  248. func (s *Server) Stop() {
  249. fmt.Println("stopping")
  250. }
  251. `;
  252. const goPoints = await getASTBreakPoints(GO_SAMPLE, "server.go");
  253. console.log(`\n Go break points (${goPoints.length} total):`);
  254. for (const p of goPoints) {
  255. const snippet = GO_SAMPLE.slice(p.pos, p.pos + 40).replace(/\n/g, "\\n");
  256. console.log(` pos=${String(p.pos).padStart(4)} score=${String(p.score).padStart(3)} type=${p.type.padEnd(15)} text="${snippet}..."`);
  257. }
  258. check("Has import break point", goPoints.some(p => p.type === "ast:import"));
  259. check("Has type break points", goPoints.filter(p => p.type === "ast:type").length >= 2);
  260. check("Has function break point", goPoints.some(p => p.type === "ast:func"));
  261. check("Has method break points", goPoints.filter(p => p.type === "ast:method").length >= 2);
  262. check("Type scores 80", goPoints.find(p => p.type === "ast:type")?.score === 80);
  263. // --------------------------------------------------------------------------
  264. // 5. AST Break Points - Rust
  265. // --------------------------------------------------------------------------
  266. section("5. AST Break Points - Rust");
  267. const RS_SAMPLE = `use std::collections::HashMap;
  268. use std::io;
  269. pub struct Config {
  270. port: u16,
  271. host: String,
  272. }
  273. impl Config {
  274. pub fn new(port: u16, host: String) -> Self {
  275. Config { port, host }
  276. }
  277. pub fn address(&self) -> String {
  278. format!("{}:{}", self.host, self.port)
  279. }
  280. }
  281. pub trait Configurable {
  282. fn configure(&mut self, config: &Config);
  283. }
  284. pub enum ServerState {
  285. Running,
  286. Stopped,
  287. Error(String),
  288. }
  289. pub fn start_server(config: Config) -> io::Result<()> {
  290. Ok(())
  291. }
  292. `;
  293. const rsPoints = await getASTBreakPoints(RS_SAMPLE, "config.rs");
  294. console.log(`\n Rust break points (${rsPoints.length} total):`);
  295. for (const p of rsPoints) {
  296. const snippet = RS_SAMPLE.slice(p.pos, p.pos + 40).replace(/\n/g, "\\n");
  297. console.log(` pos=${String(p.pos).padStart(4)} score=${String(p.score).padStart(3)} type=${p.type.padEnd(15)} text="${snippet}..."`);
  298. }
  299. check("Has use/import break points", rsPoints.filter(p => p.type === "ast:import").length >= 2);
  300. check("Has struct break point", rsPoints.some(p => p.type === "ast:struct"));
  301. check("Has impl break point", rsPoints.some(p => p.type === "ast:impl"));
  302. check("Has trait break point", rsPoints.some(p => p.type === "ast:trait"));
  303. check("Has enum break point", rsPoints.some(p => p.type === "ast:enum"));
  304. check("Has function break point", rsPoints.some(p => p.type === "ast:func"));
  305. check("Struct scores 100", rsPoints.find(p => p.type === "ast:struct")?.score === 100);
  306. check("Impl scores 100", rsPoints.find(p => p.type === "ast:impl")?.score === 100);
  307. check("Trait scores 100", rsPoints.find(p => p.type === "ast:trait")?.score === 100);
  308. check("Enum scores 80", rsPoints.find(p => p.type === "ast:enum")?.score === 80);
  309. // --------------------------------------------------------------------------
  310. // 6. Merge Break Points
  311. // --------------------------------------------------------------------------
  312. section("6. mergeBreakPoints");
  313. const regexPoints = [
  314. { pos: 10, score: 20, type: "blank" },
  315. { pos: 50, score: 1, type: "newline" },
  316. { pos: 100, score: 20, type: "blank" },
  317. ];
  318. const astPointsMerge = [
  319. { pos: 10, score: 90, type: "ast:func" },
  320. { pos: 75, score: 100, type: "ast:class" },
  321. { pos: 100, score: 60, type: "ast:import" },
  322. ];
  323. const merged = mergeBreakPoints(regexPoints, astPointsMerge);
  324. console.log(`\n Merged break points (${merged.length} total):`);
  325. for (const p of merged) {
  326. console.log(` pos=${String(p.pos).padStart(4)} score=${String(p.score).padStart(3)} type=${p.type}`);
  327. }
  328. check("Merge has 4 unique positions", merged.length === 4);
  329. check("pos 10: AST wins (90 > 20)", merged.find(p => p.pos === 10)?.score === 90);
  330. check("pos 50: regex only (1)", merged.find(p => p.pos === 50)?.score === 1);
  331. check("pos 75: AST only (100)", merged.find(p => p.pos === 75)?.score === 100);
  332. check("pos 100: AST wins (60 > 20)", merged.find(p => p.pos === 100)?.score === 60);
  333. check("Sorted by position", merged.every((p, i) => i === 0 || p.pos >= merged[i-1].pos));
  334. // --------------------------------------------------------------------------
  335. // 7. AST vs Regex Chunking Comparison (Large Synthetic File)
  336. // --------------------------------------------------------------------------
  337. section("7. AST vs Regex Chunking Comparison");
  338. const largeTSParts = [];
  339. for (let i = 0; i < 30; i++) {
  340. largeTSParts.push(`
  341. export function handler${i}(req: Request, res: Response): void {
  342. const startTime = Date.now();
  343. const userId = req.params.userId;
  344. const sessionToken = req.headers.authorization;
  345. // Validate the incoming request parameters
  346. if (!userId || !sessionToken) {
  347. res.status(400).json({ error: "Missing required parameters" });
  348. return;
  349. }
  350. // Process the request with detailed logging
  351. console.log(\`Processing request \${i} for user \${userId}\`);
  352. const result = processBusinessLogic${i}(userId, sessionToken);
  353. // Return the response with timing info
  354. const elapsed = Date.now() - startTime;
  355. res.json({ data: result, processingTimeMs: elapsed });
  356. }
  357. `);
  358. }
  359. const largeTS = largeTSParts.join("\n");
  360. console.log(`\n Large TS file: ${largeTS.length} chars, ${largeTSParts.length} functions`);
  361. const regexChunks = chunkDocument(largeTS);
  362. const astChunks = await chunkDocumentAsync(largeTS, undefined, undefined, undefined, "handlers.ts", "auto");
  363. console.log(` Regex chunks: ${regexChunks.length}`);
  364. console.log(` AST chunks: ${astChunks.length}`);
  365. function countSplitFunctions(chunks, source) {
  366. let splits = 0;
  367. for (let i = 0; i < 30; i++) {
  368. const funcStart = source.indexOf(`function handler${i}(`);
  369. const nextFunc = source.indexOf(`function handler${i + 1}(`, funcStart + 1);
  370. const funcEnd = nextFunc > 0 ? nextFunc : source.length;
  371. const chunkIndices = new Set();
  372. for (let ci = 0; ci < chunks.length; ci++) {
  373. const chunkStart = chunks[ci].pos;
  374. const chunkEnd = chunkStart + chunks[ci].text.length;
  375. if (chunkStart < funcEnd && chunkEnd > funcStart) {
  376. chunkIndices.add(ci);
  377. }
  378. }
  379. if (chunkIndices.size > 1) splits++;
  380. }
  381. return splits;
  382. }
  383. const regexSplits = countSplitFunctions(regexChunks, largeTS);
  384. const astSplitsSynth = countSplitFunctions(astChunks, largeTS);
  385. console.log(`\n Functions split across chunks:`);
  386. console.log(` Regex: ${regexSplits} / 30`);
  387. console.log(` AST: ${astSplitsSynth} / 30`);
  388. check("AST splits fewer functions than regex", astSplitsSynth <= regexSplits,
  389. `AST split ${astSplitsSynth}, regex split ${regexSplits}`);
  390. // --------------------------------------------------------------------------
  391. // 8. Markdown Files Unchanged
  392. // --------------------------------------------------------------------------
  393. section("8. Markdown Files Unchanged in Auto Mode");
  394. const mdContent = [];
  395. for (let i = 0; i < 15; i++) {
  396. mdContent.push(`# Section ${i}\n\n${"Lorem ipsum dolor sit amet. ".repeat(40)}\n`);
  397. }
  398. const largeMD = mdContent.join("\n");
  399. const mdRegex = chunkDocument(largeMD);
  400. const mdAst = await chunkDocumentAsync(largeMD, undefined, undefined, undefined, "readme.md", "auto");
  401. check("Same number of chunks", mdRegex.length === mdAst.length,
  402. `regex=${mdRegex.length}, ast=${mdAst.length}`);
  403. let mdIdentical = true;
  404. for (let i = 0; i < mdRegex.length; i++) {
  405. if (mdRegex[i]?.text !== mdAst[i]?.text || mdRegex[i]?.pos !== mdAst[i]?.pos) {
  406. mdIdentical = false;
  407. break;
  408. }
  409. }
  410. check("Chunk content is identical", mdIdentical);
  411. // --------------------------------------------------------------------------
  412. // 9-11. Strategy bypass, no-filepath fallback, error handling
  413. // --------------------------------------------------------------------------
  414. section("9. Regex Strategy Bypass");
  415. const regexOnly = await chunkDocumentAsync(largeTS, undefined, undefined, undefined, "handlers.ts", "regex");
  416. const syncRegex = chunkDocument(largeTS);
  417. check("Same chunks as sync regex", regexOnly.length === syncRegex.length &&
  418. regexOnly.every((c, i) => c.text === syncRegex[i]?.text));
  419. section("10. No Filepath Falls Back to Regex");
  420. const noPathChunks = await chunkDocumentAsync(largeTS, undefined, undefined, undefined, undefined, "auto");
  421. check("Same chunks as regex", noPathChunks.length === syncRegex.length);
  422. section("11. Error Handling & Edge Cases");
  423. check("Empty file -> []", (await getASTBreakPoints("", "e.ts")).length === 0);
  424. check("Broken syntax doesn't crash", Array.isArray(await getASTBreakPoints("function { %%", "x.ts")));
  425. check("Unknown ext -> []", (await getASTBreakPoints("data", "f.csv")).length === 0);
  426. check("Markdown -> []", (await getASTBreakPoints("# H", "r.md")).length === 0);
  427. const smallChunks = await chunkDocumentAsync("export const x = 1;", undefined, undefined, undefined, "s.ts", "auto");
  428. check("Small file -> 1 chunk", smallChunks.length === 1);
  429. // --------------------------------------------------------------------------
  430. // 12. chunkDocumentWithBreakPoints Equivalence
  431. // --------------------------------------------------------------------------
  432. section("12. chunkDocumentWithBreakPoints Equivalence");
  433. const eqContent = "a".repeat(5000) + "\n\n" + "b".repeat(5000);
  434. const eqOld = chunkDocument(eqContent);
  435. const eqNew = chunkDocumentWithBreakPoints(eqContent, scanBreakPoints(eqContent), findCodeFences(eqContent));
  436. check("Identical output", eqOld.length === eqNew.length &&
  437. eqOld.every((c, i) => c.text === eqNew[i]?.text && c.pos === eqNew[i]?.pos));
  438. // --------------------------------------------------------------------------
  439. // 13. Synthetic performance
  440. // --------------------------------------------------------------------------
  441. section("13. Synthetic Performance");
  442. const t0 = performance.now();
  443. for (let i = 0; i < 10; i++) await getASTBreakPoints(largeTS, "p.ts");
  444. const astExtractMs = (performance.now() - t0) / 10;
  445. const t1 = performance.now();
  446. for (let i = 0; i < 10; i++) scanBreakPoints(largeTS);
  447. const regexExtractMs = (performance.now() - t1) / 10;
  448. const t2 = performance.now();
  449. for (let i = 0; i < 10; i++) await chunkDocumentAsync(largeTS, undefined, undefined, undefined, "p.ts", "auto");
  450. const astFullMs = (performance.now() - t2) / 10;
  451. const t3 = performance.now();
  452. for (let i = 0; i < 10; i++) chunkDocument(largeTS);
  453. const regexFullMs = (performance.now() - t3) / 10;
  454. console.log(`\n File size: ${formatBytes(largeTS.length)}`);
  455. console.log(` AST break point extraction: ${astExtractMs.toFixed(1)}ms`);
  456. console.log(` Regex break point extraction: ${regexExtractMs.toFixed(1)}ms`);
  457. console.log(` Full AST chunking: ${astFullMs.toFixed(1)}ms`);
  458. console.log(` Full regex chunking: ${regexFullMs.toFixed(1)}ms`);
  459. console.log(` Overhead per file: ${(astFullMs - regexFullMs).toFixed(1)}ms`);
  460. check("AST chunking < 50ms per file", astFullMs < 50, `was ${astFullMs.toFixed(1)}ms`);
  461. // End of synthetic tests
  462. section("Synthetic Test Results");
  463. console.log(`\n ${passed} passed, ${failed} failed`);
  464. } // end if (!skipSynthetic)
  465. // ============================================================================
  466. // PART 2: Real Collection Scan
  467. // ============================================================================
  468. if (scanDir) {
  469. section(`Real Collection Scan: ${scanDir}`);
  470. console.log(`\n Discovering files...`);
  471. const realFiles = walkDir(scanDir);
  472. console.log(` Found ${realFiles.length} files\n`);
  473. if (realFiles.length === 0) {
  474. console.log(" No supported files found. Supported: .ts .tsx .js .jsx .py .go .rs .md");
  475. } else {
  476. // Classify files
  477. const byLang = {};
  478. let totalBytes = 0;
  479. const fileEntries = [];
  480. for (const filepath of realFiles) {
  481. let content;
  482. try {
  483. const stat = statSync(filepath);
  484. if (stat.size > 500_000) continue; // skip files > 500KB
  485. content = readFileSync(filepath, "utf-8");
  486. } catch {
  487. continue;
  488. }
  489. if (!content.trim()) continue;
  490. const rel = relative(scanDir, filepath);
  491. const lang = detectLanguage(filepath);
  492. const langLabel = lang ?? "markdown";
  493. byLang[langLabel] = (byLang[langLabel] || 0) + 1;
  494. totalBytes += content.length;
  495. fileEntries.push({ filepath, rel, lang, langLabel, content });
  496. }
  497. // Print file distribution
  498. console.log(" File distribution:");
  499. for (const [lang, count] of Object.entries(byLang).sort((a, b) => b[1] - a[1])) {
  500. console.log(` ${lang.padEnd(14)} ${count} files`);
  501. }
  502. console.log(` ${"total".padEnd(14)} ${fileEntries.length} files (${formatBytes(totalBytes)})`);
  503. // ---- Per-file analysis ----
  504. // Accumulators
  505. const perLang = {};
  506. let totalRegexChunks = 0;
  507. let totalAstChunks = 0;
  508. let totalRegexMs = 0;
  509. let totalAstMs = 0;
  510. let filesWithDifference = 0;
  511. let multiChunkFiles = 0;
  512. const bigDiffs = []; // files where AST made the biggest difference
  513. console.log(`\n Analyzing ${fileEntries.length} files...\n`);
  514. for (const entry of fileEntries) {
  515. const { rel, lang, langLabel, content } = entry;
  516. const isCode = lang !== null;
  517. // Regex chunking
  518. const rt0 = performance.now();
  519. const rChunks = chunkDocument(content);
  520. const rMs = performance.now() - rt0;
  521. // AST chunking
  522. const at0 = performance.now();
  523. const aChunks = await chunkDocumentAsync(content, undefined, undefined, undefined, rel, "auto");
  524. const aMs = performance.now() - at0;
  525. totalRegexChunks += rChunks.length;
  526. totalAstChunks += aChunks.length;
  527. totalRegexMs += rMs;
  528. totalAstMs += aMs;
  529. if (rChunks.length > 1 || aChunks.length > 1) multiChunkFiles++;
  530. const chunkDiff = aChunks.length - rChunks.length;
  531. const contentDiffers = rChunks.length !== aChunks.length ||
  532. rChunks.some((c, i) => c.text !== aChunks[i]?.text);
  533. if (contentDiffers) filesWithDifference++;
  534. // Per-language stats
  535. if (!perLang[langLabel]) {
  536. perLang[langLabel] = {
  537. files: 0, bytes: 0, regexChunks: 0, astChunks: 0,
  538. regexMs: 0, astMs: 0, astBreakpoints: 0, diffs: 0,
  539. };
  540. }
  541. const s = perLang[langLabel];
  542. s.files++;
  543. s.bytes += content.length;
  544. s.regexChunks += rChunks.length;
  545. s.astChunks += aChunks.length;
  546. s.regexMs += rMs;
  547. s.astMs += aMs;
  548. if (contentDiffers) s.diffs++;
  549. // Count AST breakpoints for code files
  550. if (isCode) {
  551. const bp = await getASTBreakPoints(content, rel);
  552. s.astBreakpoints += bp.length;
  553. }
  554. // Track big differences for the detailed report
  555. if (contentDiffers && isCode && (rChunks.length > 1 || aChunks.length > 1)) {
  556. bigDiffs.push({
  557. rel, lang: langLabel, bytes: content.length,
  558. regexN: rChunks.length, astN: aChunks.length,
  559. diff: chunkDiff, overheadMs: aMs - rMs,
  560. });
  561. }
  562. }
  563. // ---- Aggregate report ----
  564. section("Per-Language Summary");
  565. const langOrder = Object.entries(perLang).sort((a, b) => b[1].files - a[1].files);
  566. const colW = { lang: 14, files: 7, bytes: 10, rChunks: 9, aChunks: 9, bps: 6, diffs: 6, rMs: 9, aMs: 9 };
  567. console.log(
  568. `\n ${"Language".padEnd(colW.lang)}${"Files".padStart(colW.files)}${"Size".padStart(colW.bytes)}` +
  569. `${"Rx Chnk".padStart(colW.rChunks)}${"AST Chnk".padStart(colW.aChunks)}` +
  570. `${"BPs".padStart(colW.bps)}${"Diffs".padStart(colW.diffs)}` +
  571. `${"Rx ms".padStart(colW.rMs)}${"AST ms".padStart(colW.aMs)}`
  572. );
  573. console.log(" " + "-".repeat(Object.values(colW).reduce((a, b) => a + b, 0)));
  574. for (const [lang, s] of langOrder) {
  575. console.log(
  576. ` ${lang.padEnd(colW.lang)}` +
  577. `${String(s.files).padStart(colW.files)}` +
  578. `${formatBytes(s.bytes).padStart(colW.bytes)}` +
  579. `${String(s.regexChunks).padStart(colW.rChunks)}` +
  580. `${String(s.astChunks).padStart(colW.aChunks)}` +
  581. `${String(s.astBreakpoints).padStart(colW.bps)}` +
  582. `${String(s.diffs).padStart(colW.diffs)}` +
  583. `${s.regexMs.toFixed(1).padStart(colW.rMs)}` +
  584. `${s.astMs.toFixed(1).padStart(colW.aMs)}`
  585. );
  586. }
  587. console.log(" " + "-".repeat(Object.values(colW).reduce((a, b) => a + b, 0)));
  588. console.log(
  589. ` ${"TOTAL".padEnd(colW.lang)}` +
  590. `${String(fileEntries.length).padStart(colW.files)}` +
  591. `${formatBytes(totalBytes).padStart(colW.bytes)}` +
  592. `${String(totalRegexChunks).padStart(colW.rChunks)}` +
  593. `${String(totalAstChunks).padStart(colW.aChunks)}` +
  594. `${"".padStart(colW.bps)}` +
  595. `${String(filesWithDifference).padStart(colW.diffs)}` +
  596. `${totalRegexMs.toFixed(1).padStart(colW.rMs)}` +
  597. `${totalAstMs.toFixed(1).padStart(colW.aMs)}`
  598. );
  599. // ---- Headline stats ----
  600. section("Headline Stats");
  601. const codeFiles = fileEntries.filter(e => e.lang !== null).length;
  602. const mdFiles = fileEntries.filter(e => e.lang === null).length;
  603. const avgOverheadMs = codeFiles > 0
  604. ? (langOrder.filter(([l]) => l !== "markdown").reduce((s, [, v]) => s + v.astMs - v.regexMs, 0)) / codeFiles
  605. : 0;
  606. console.log(`
  607. Files scanned: ${fileEntries.length} (${codeFiles} code, ${mdFiles} markdown)
  608. Multi-chunk files: ${multiChunkFiles} (files large enough to produce >1 chunk)
  609. Files where AST differed: ${filesWithDifference} / ${fileEntries.length} (${pct(filesWithDifference, fileEntries.length)})
  610. Total chunks (regex): ${totalRegexChunks}
  611. Total chunks (AST): ${totalAstChunks} (${totalAstChunks > totalRegexChunks ? "+" : ""}${totalAstChunks - totalRegexChunks})
  612. Total time (regex): ${totalRegexMs.toFixed(1)}ms
  613. Total time (AST): ${totalAstMs.toFixed(1)}ms (+${(totalAstMs - totalRegexMs).toFixed(1)}ms overhead)
  614. Avg overhead per code file: ${avgOverheadMs.toFixed(2)}ms
  615. `);
  616. // ---- Top differences ----
  617. if (bigDiffs.length > 0) {
  618. section("Top Files Where AST Changed Chunking");
  619. bigDiffs.sort((a, b) => Math.abs(b.diff) - Math.abs(a.diff));
  620. const topN = bigDiffs.slice(0, 20);
  621. console.log(
  622. `\n ${"File".padEnd(50)} ${"Lang".padEnd(12)} ${"Size".padStart(8)} ` +
  623. `${"Rx".padStart(4)} ${"AST".padStart(4)} ${"Diff".padStart(5)} ` +
  624. `${"OH ms".padStart(7)}`
  625. );
  626. console.log(" " + "-".repeat(94));
  627. for (const d of topN) {
  628. const sign = d.diff > 0 ? "+" : "";
  629. console.log(
  630. ` ${d.rel.slice(0, 49).padEnd(50)} ${d.lang.padEnd(12)} ${formatBytes(d.bytes).padStart(8)} ` +
  631. `${String(d.regexN).padStart(4)} ${String(d.astN).padStart(4)} ${(sign + d.diff).padStart(5)} ` +
  632. `${d.overheadMs.toFixed(1).padStart(7)}`
  633. );
  634. }
  635. if (bigDiffs.length > 20) {
  636. console.log(`\n ... and ${bigDiffs.length - 20} more files with differences`);
  637. }
  638. }
  639. // ---- Markdown regression check ----
  640. const mdEntries = fileEntries.filter(e => e.lang === null);
  641. if (mdEntries.length > 0) {
  642. section("Markdown Regression Check");
  643. let mdRegressions = 0;
  644. for (const entry of mdEntries) {
  645. const rChunks = chunkDocument(entry.content);
  646. const aChunks = await chunkDocumentAsync(entry.content, undefined, undefined, undefined, entry.rel, "auto");
  647. const same = rChunks.length === aChunks.length &&
  648. rChunks.every((c, i) => c.text === aChunks[i]?.text);
  649. if (!same) {
  650. mdRegressions++;
  651. console.log(` REGRESSION: ${entry.rel} (regex=${rChunks.length}, ast=${aChunks.length})`);
  652. }
  653. }
  654. if (mdRegressions === 0) {
  655. console.log(`\n All ${mdEntries.length} markdown files produce identical chunks. No regressions.`);
  656. } else {
  657. console.log(`\n ${mdRegressions} / ${mdEntries.length} markdown files differ (unexpected!)`);
  658. }
  659. }
  660. } // end if realFiles.length > 0
  661. } // end if scanDir
  662. // ============================================================================
  663. // Final Summary
  664. // ============================================================================
  665. console.log(`\n${"=".repeat(70)}`);
  666. if (!skipSynthetic) {
  667. console.log(` SYNTHETIC TESTS: ${passed} passed, ${failed} failed`);
  668. }
  669. if (scanDir) {
  670. console.log(` COLLECTION SCAN: complete (see report above)`);
  671. }
  672. if (!scanDir && !skipSynthetic) {
  673. console.log(`\n Tip: Run with a directory argument to scan real files:`);
  674. console.log(` npx tsx test-ast-chunking.mjs ~/dev/my-project`);
  675. }
  676. console.log("=".repeat(70));
  677. if (failed > 0) process.exit(1);