3 месяцев назад · 244ddf5ecb
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,18 @@
 
															 ## [Unreleased]
														
 
															+### Added
														
 
															+
														
 
															+- AST-aware chunking for code files via `web-tree-sitter`. Supported
														
 
															+  languages: TypeScript/JavaScript, Python, Go, and Rust. Code files
														
 
															+  are chunked at function, class, and import boundaries instead of
														
 
															+  arbitrary text positions. Markdown and unknown file types are unchanged.
														
 
															+- `--chunk-strategy <auto|regex>` flag for `qmd embed` and `qmd query`.
														
 
															+  Default is `regex` (existing behavior). Use `auto` to enable AST-aware
														
 
															+  chunking for code files.
														
 
															+- `qmd status` now shows AST grammar availability.
														
 
															+- SDK: `chunkStrategy` option on `embed()` and `search()` methods.
														
 
															+
														
 
															 ### Fixes
														
 
															 - Sync stale `bun.lock` (`better-sqlite3` 11.x → 12.x). CI and release
														
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -138,6 +138,7 @@ bun test --preload ./src/test-preload.ts test/
 
															 - node-llama-cpp for embeddings (embeddinggemma), reranking (qwen3-reranker), and query expansion (Qwen3)
														
 
															 - Reciprocal Rank Fusion (RRF) for combining results
														
 
															 - Smart chunking: 900 tokens/chunk with 15% overlap, prefers markdown headings as boundaries
														
 
															+- AST-aware chunking: use `--chunk-strategy auto` to chunk code files (.ts/.js/.py/.go/.rs) at function/class/import boundaries via tree-sitter. Default is `regex` (existing behavior). Markdown and unknown file types always use regex chunking.
														
 
															 ## Important: Do NOT run automatically
														
--- a/README.md
+++ b/README.md
@@ -318,6 +318,7 @@ const result = await store.update({
 
															 // Generate vector embeddings
														
 
															 const embedResult = await store.embed({
														
 
															   force: false,           // true to re-embed everything
														
 
															+  chunkStrategy: "auto",  // "regex" (default) or "auto" (AST for code files)
														
 
															   onProgress: ({ current, total, collection }) => {
														
 
															     console.log(`Embedding ${current}/${total}`)
														
 
															   },
														
@@ -564,8 +565,27 @@ qmd embed
 
															 # Force re-embed everything
														
 
															 qmd embed -f
														
 
															+
														
 
															+# Enable AST-aware chunking for code files (TS, JS, Python, Go, Rust)
														
 
															+qmd embed --chunk-strategy auto
														
 
															+
														
 
															+# Also works with query for consistent chunk selection
														
 
															+qmd query "auth flow" --chunk-strategy auto
														
 
															 ```
														
 
															+**AST-aware chunking** (`--chunk-strategy auto`) uses tree-sitter to chunk code
														
 
															+files at function, class, and import boundaries instead of arbitrary text
														
 
															+positions. This produces higher-quality chunks and better search results for
														
 
															+codebases. Markdown and other file types always use regex-based chunking
														
 
															+regardless of strategy.
														
 
															+
														
 
															+The default is `regex` (existing behavior). Use `--chunk-strategy auto` to
														
 
															+opt in. Run `qmd status` to verify which grammars are available.
														
 
															+
														
 
															+> **Note:** Tree-sitter grammars are optional dependencies. If they are not
														
 
															+> installed, `--chunk-strategy auto` falls back to regex-only chunking
														
 
															+> automatically. Tested on both Node.js and Bun.
														
 
															+
														
 
															 ### Context Management
														
 
															 Context adds descriptive metadata to collections and paths, helping search understand your content.
														
@@ -813,6 +833,19 @@ The squared distance decay means a heading 200 tokens back (score ~30) still bea
 
															 **Code Fence Protection:** Break points inside code blocks are ignored—code stays together. If a code block exceeds the chunk size, it's kept whole when possible.
														
 
															+**AST-Aware Chunking (Code Files):**
														
 
															+
														
 
															+For supported code files, QMD also parses the source with [tree-sitter](https://tree-sitter.github.io/) and adds AST-derived break points that are merged with the regex scores above:
														
 
															+
														
 
															+| AST Node | Score | Languages |
														
 
															+|----------|-------|-----------|
														
 
															+| Class / interface / struct / impl / trait | 100 | All |
														
 
															+| Function / method | 90 | All |
														
 
															+| Type alias / enum | 80 | All |
														
 
															+| Import / use declaration | 60 | All |
														
 
															+
														
 
															+Supported for `.ts`, `.tsx`, `.js`, `.jsx`, `.py`, `.go`, and `.rs` files. Enable with `--chunk-strategy auto`. Markdown and other file types always use regex chunking.
														
 
															+
														
 
															 ### Query Flow (Hybrid)
														
 
															 ```
														
--- a/package.json
+++ b/package.json
@@ -51,6 +51,7 @@
 
															     "node-llama-cpp": "^3.17.1",
														
 
															     "picomatch": "^4.0.0",
														
 
															     "sqlite-vec": "^0.1.7-alpha.2",
														
 
															+    "web-tree-sitter": "0.26.7",
														
 
															     "yaml": "^2.8.2",
														
 
															     "zod": "4.2.1"
														
 
															   },
														
@@ -59,7 +60,11 @@
 
															     "sqlite-vec-darwin-x64": "^0.1.7-alpha.2",
														
 
															     "sqlite-vec-linux-arm64": "^0.1.7-alpha.2",
														
 
															     "sqlite-vec-linux-x64": "^0.1.7-alpha.2",
														
 
															-    "sqlite-vec-windows-x64": "^0.1.7-alpha.2"
														
 
															+    "sqlite-vec-windows-x64": "^0.1.7-alpha.2",
														
 
															+    "tree-sitter-go": "0.23.4",
														
 
															+    "tree-sitter-python": "0.23.4",
														
 
															+    "tree-sitter-rust": "0.24.0",
														
 
															+    "tree-sitter-typescript": "0.23.2"
														
 
															   },
														
 
															   "devDependencies": {
														
 
															     "@types/better-sqlite3": "^7.6.0",
														
--- a/src/ast.ts
+++ b/src/ast.ts
@@ -0,0 +1,391 @@
 
															+/**
														
 
															+ * AST-aware chunking support via web-tree-sitter.
														
 
															+ *
														
 
															+ * Provides language detection, AST break point extraction for supported
														
 
															+ * code file types, and a stub for future symbol extraction.
														
 
															+ *
														
 
															+ * All functions degrade gracefully: parse failures or unsupported languages
														
 
															+ * return empty arrays, falling back to regex-only chunking.
														
 
															+ *
														
 
															+ * ## Dependency Note
														
 
															+ *
														
 
															+ * Grammar packages (tree-sitter-typescript, etc.) are listed as
														
 
															+ * optionalDependencies with pinned versions. They ship native prebuilds
														
 
															+ * and source files (~72 MB total) but QMD only uses the .wasm files
														
 
															+ * (~5 MB). If install size becomes a concern, the .wasm files can be
														
 
															+ * bundled directly in the repo (e.g. assets/grammars/) and resolved
														
 
															+ * via import.meta.url instead of require.resolve(), eliminating the
														
 
															+ * grammar packages entirely.
														
 
															+ */
														
 
															+
														
 
															+import { createRequire } from "node:module";
														
 
															+import { extname } from "node:path";
														
 
															+import type { BreakPoint } from "./store.js";
														
 
															+
														
 
															+// web-tree-sitter types — imported dynamically to avoid top-level WASM init
														
 
															+type ParserType = import("web-tree-sitter").Parser;
														
 
															+type LanguageType = import("web-tree-sitter").Language;
														
 
															+type QueryType = import("web-tree-sitter").Query;
														
 
															+
														
 
															+// =============================================================================
														
 
															+// Language Detection
														
 
															+// =============================================================================
														
 
															+
														
 
															+export type SupportedLanguage = "typescript" | "tsx" | "javascript" | "python" | "go" | "rust";
														
 
															+
														
 
															+const EXTENSION_MAP: Record<string, SupportedLanguage> = {
														
 
															+  ".ts": "typescript",
														
 
															+  ".tsx": "tsx",
														
 
															+  ".js": "javascript",
														
 
															+  ".jsx": "tsx",
														
 
															+  ".mts": "typescript",
														
 
															+  ".cts": "typescript",
														
 
															+  ".mjs": "javascript",
														
 
															+  ".cjs": "javascript",
														
 
															+  ".py": "python",
														
 
															+  ".go": "go",
														
 
															+  ".rs": "rust",
														
 
															+};
														
 
															+
														
 
															+/**
														
 
															+ * Detect language from file path extension.
														
 
															+ * Returns null for unsupported or unknown extensions (including .md).
														
 
															+ */
														
 
															+export function detectLanguage(filepath: string): SupportedLanguage | null {
														
 
															+  const ext = extname(filepath).toLowerCase();
														
 
															+  return EXTENSION_MAP[ext] ?? null;
														
 
															+}
														
 
															+
														
 
															+// =============================================================================
														
 
															+// Grammar Resolution
														
 
															+// =============================================================================
														
 
															+
														
 
															+/**
														
 
															+ * Maps language to the npm package and wasm filename for the grammar.
														
 
															+ */
														
 
															+const GRAMMAR_MAP: Record<SupportedLanguage, { pkg: string; wasm: string }> = {
														
 
															+  typescript: { pkg: "tree-sitter-typescript", wasm: "tree-sitter-typescript.wasm" },
														
 
															+  tsx:        { pkg: "tree-sitter-typescript", wasm: "tree-sitter-tsx.wasm" },
														
 
															+  javascript: { pkg: "tree-sitter-typescript", wasm: "tree-sitter-typescript.wasm" },
														
 
															+  python:     { pkg: "tree-sitter-python",     wasm: "tree-sitter-python.wasm" },
														
 
															+  go:         { pkg: "tree-sitter-go",         wasm: "tree-sitter-go.wasm" },
														
 
															+  rust:       { pkg: "tree-sitter-rust",        wasm: "tree-sitter-rust.wasm" },
														
 
															+};
														
 
															+
														
 
															+// =============================================================================
														
 
															+// Per-Language Query Definitions
														
 
															+// =============================================================================
														
 
															+
														
 
															+/**
														
 
															+ * Tree-sitter S-expression queries for each language.
														
 
															+ * Each capture name maps to a break point score via SCORE_MAP.
														
 
															+ *
														
 
															+ * For TypeScript/JavaScript, we match export_statement wrappers to get the
														
 
															+ * correct start position (before `export`), plus bare declarations for
														
 
															+ * non-exported code.
														
 
															+ */
														
 
															+const LANGUAGE_QUERIES: Record<SupportedLanguage, string> = {
														
 
															+  typescript: `
														
 
															+    (export_statement) @export
														
 
															+    (class_declaration) @class
														
 
															+    (function_declaration) @func
														
 
															+    (method_definition) @method
														
 
															+    (interface_declaration) @iface
														
 
															+    (type_alias_declaration) @type
														
 
															+    (enum_declaration) @enum
														
 
															+    (import_statement) @import
														
 
															+    (lexical_declaration (variable_declarator value: (arrow_function))) @func
														
 
															+    (lexical_declaration (variable_declarator value: (function_expression))) @func
														
 
															+  `,
														
 
															+  tsx: `
														
 
															+    (export_statement) @export
														
 
															+    (class_declaration) @class
														
 
															+    (function_declaration) @func
														
 
															+    (method_definition) @method
														
 
															+    (interface_declaration) @iface
														
 
															+    (type_alias_declaration) @type
														
 
															+    (enum_declaration) @enum
														
 
															+    (import_statement) @import
														
 
															+    (lexical_declaration (variable_declarator value: (arrow_function))) @func
														
 
															+    (lexical_declaration (variable_declarator value: (function_expression))) @func
														
 
															+  `,
														
 
															+  javascript: `
														
 
															+    (export_statement) @export
														
 
															+    (class_declaration) @class
														
 
															+    (function_declaration) @func
														
 
															+    (method_definition) @method
														
 
															+    (import_statement) @import
														
 
															+    (lexical_declaration (variable_declarator value: (arrow_function))) @func
														
 
															+    (lexical_declaration (variable_declarator value: (function_expression))) @func
														
 
															+  `,
														
 
															+  python: `
														
 
															+    (class_definition) @class
														
 
															+    (function_definition) @func
														
 
															+    (decorated_definition) @decorated
														
 
															+    (import_statement) @import
														
 
															+    (import_from_statement) @import
														
 
															+  `,
														
 
															+  go: `
														
 
															+    (type_declaration) @type
														
 
															+    (function_declaration) @func
														
 
															+    (method_declaration) @method
														
 
															+    (import_declaration) @import
														
 
															+  `,
														
 
															+  rust: `
														
 
															+    (struct_item) @struct
														
 
															+    (impl_item) @impl
														
 
															+    (function_item) @func
														
 
															+    (trait_item) @trait
														
 
															+    (enum_item) @enum
														
 
															+    (use_declaration) @import
														
 
															+    (type_item) @type
														
 
															+    (mod_item) @mod
														
 
															+  `,
														
 
															+};
														
 
															+
														
 
															+/**
														
 
															+ * Score mapping from capture names to break point scores.
														
 
															+ * Aligned with the markdown BREAK_PATTERNS scale (h1=100, h2=90, etc.)
														
 
															+ * so findBestCutoff() decay works unchanged.
														
 
															+ */
														
 
															+const SCORE_MAP: Record<string, number> = {
														
 
															+  class:     100,
														
 
															+  iface:     100,
														
 
															+  struct:    100,
														
 
															+  trait:     100,
														
 
															+  impl:      100,
														
 
															+  mod:       100,
														
 
															+  export:     90,
														
 
															+  func:       90,
														
 
															+  method:     90,
														
 
															+  decorated:  90,
														
 
															+  type:       80,
														
 
															+  enum:       80,
														
 
															+  import:     60,
														
 
															+};
														
 
															+
														
 
															+// =============================================================================
														
 
															+// Parser Caching & Initialization
														
 
															+// =============================================================================
														
 
															+
														
 
															+let ParserClass: typeof import("web-tree-sitter").Parser | null = null;
														
 
															+let LanguageClass: typeof import("web-tree-sitter").Language | null = null;
														
 
															+let QueryClass: typeof import("web-tree-sitter").Query | null = null;
														
 
															+let initPromise: Promise<void> | null = null;
														
 
															+
														
 
															+/** Languages that have already failed to load — warn only once per process. */
														
 
															+const failedLanguages = new Set<string>();
														
 
															+
														
 
															+/** Cached grammar load promises. */
														
 
															+const grammarCache = new Map<string, Promise<LanguageType>>();
														
 
															+
														
 
															+/** Cached compiled queries per language. */
														
 
															+const queryCache = new Map<string, QueryType>();
														
 
															+
														
 
															+/**
														
 
															+ * Initialize web-tree-sitter. Called once and cached.
														
 
															+ */
														
 
															+async function ensureInit(): Promise<void> {
														
 
															+  if (!initPromise) {
														
 
															+    initPromise = (async () => {
														
 
															+      const mod = await import("web-tree-sitter");
														
 
															+      ParserClass = mod.Parser;
														
 
															+      LanguageClass = mod.Language;
														
 
															+      QueryClass = mod.Query;
														
 
															+      await ParserClass.init();
														
 
															+    })();
														
 
															+  }
														
 
															+  return initPromise;
														
 
															+}
														
 
															+
														
 
															+/**
														
 
															+ * Resolve the filesystem path to a grammar .wasm file.
														
 
															+ * Uses createRequire to resolve from installed dependency packages.
														
 
															+ */
														
 
															+function resolveGrammarPath(language: SupportedLanguage): string {
														
 
															+  const { pkg, wasm } = GRAMMAR_MAP[language];
														
 
															+  const require = createRequire(import.meta.url);
														
 
															+  return require.resolve(`${pkg}/${wasm}`);
														
 
															+}
														
 
															+
														
 
															+/**
														
 
															+ * Load and cache a grammar for the given language.
														
 
															+ * Returns null on failure (logs once per language).
														
 
															+ */
														
 
															+async function loadGrammar(language: SupportedLanguage): Promise<LanguageType | null> {
														
 
															+  if (failedLanguages.has(language)) return null;
														
 
															+
														
 
															+  const wasmKey = GRAMMAR_MAP[language].wasm;
														
 
															+  if (!grammarCache.has(wasmKey)) {
														
 
															+    grammarCache.set(wasmKey, (async () => {
														
 
															+      const path = resolveGrammarPath(language);
														
 
															+      return LanguageClass!.load(path);
														
 
															+    })());
														
 
															+  }
														
 
															+
														
 
															+  try {
														
 
															+    return await grammarCache.get(wasmKey)!;
														
 
															+  } catch (err) {
														
 
															+    failedLanguages.add(language);
														
 
															+    grammarCache.delete(wasmKey);
														
 
															+    console.warn(`[qmd] Failed to load tree-sitter grammar for ${language}: ${err}`);
														
 
															+    return null;
														
 
															+  }
														
 
															+}
														
 
															+
														
 
															+/**
														
 
															+ * Get or create a compiled query for the given language.
														
 
															+ */
														
 
															+function getQuery(language: SupportedLanguage, grammar: LanguageType): QueryType {
														
 
															+  if (!queryCache.has(language)) {
														
 
															+    const source = LANGUAGE_QUERIES[language];
														
 
															+    const query = new QueryClass!(grammar, source);
														
 
															+    queryCache.set(language, query);
														
 
															+  }
														
 
															+  return queryCache.get(language)!;
														
 
															+}
														
 
															+
														
 
															+// =============================================================================
														
 
															+// AST Break Point Extraction
														
 
															+// =============================================================================
														
 
															+
														
 
															+/**
														
 
															+ * Parse a source file and return break points at AST node boundaries.
														
 
															+ *
														
 
															+ * Returns an empty array for unsupported languages, parse failures,
														
 
															+ * or grammar loading failures. Never throws.
														
 
															+ *
														
 
															+ * @param content - The file content to parse.
														
 
															+ * @param filepath - The file path (used for language detection).
														
 
															+ * @returns Array of BreakPoint objects suitable for merging with regex break points.
														
 
															+ */
														
 
															+export async function getASTBreakPoints(
														
 
															+  content: string,
														
 
															+  filepath: string,
														
 
															+): Promise<BreakPoint[]> {
														
 
															+  const language = detectLanguage(filepath);
														
 
															+  if (!language) return [];
														
 
															+
														
 
															+  try {
														
 
															+    await ensureInit();
														
 
															+
														
 
															+    const grammar = await loadGrammar(language);
														
 
															+    if (!grammar) return [];
														
 
															+
														
 
															+    const parser = new ParserClass!();
														
 
															+    parser.setLanguage(grammar);
														
 
															+
														
 
															+    const tree = parser.parse(content);
														
 
															+    if (!tree) {
														
 
															+      parser.delete();
														
 
															+      return [];
														
 
															+    }
														
 
															+
														
 
															+    const query = getQuery(language, grammar);
														
 
															+    const captures = query.captures(tree.rootNode);
														
 
															+
														
 
															+    // Deduplicate: at each byte position, keep the highest-scoring capture.
														
 
															+    // This handles cases like export_statement wrapping a class_declaration
														
 
															+    // at different offsets — we want the outermost (earliest) position.
														
 
															+    const seen = new Map<number, BreakPoint>();
														
 
															+
														
 
															+    for (const cap of captures) {
														
 
															+      const pos = cap.node.startIndex;
														
 
															+      const score = SCORE_MAP[cap.name] ?? 20;
														
 
															+      const type = `ast:${cap.name}`;
														
 
															+
														
 
															+      const existing = seen.get(pos);
														
 
															+      if (!existing || score > existing.score) {
														
 
															+        seen.set(pos, { pos, score, type });
														
 
															+      }
														
 
															+    }
														
 
															+
														
 
															+    tree.delete();
														
 
															+    parser.delete();
														
 
															+
														
 
															+    return Array.from(seen.values()).sort((a, b) => a.pos - b.pos);
														
 
															+  } catch (err) {
														
 
															+    console.warn(`[qmd] AST parse failed for ${filepath}, falling back to regex: ${err instanceof Error ? err.message : err}`);
														
 
															+    return [];
														
 
															+  }
														
 
															+}
														
 
															+
														
 
															+// =============================================================================
														
 
															+// Health / Status
														
 
															+// =============================================================================
														
 
															+
														
 
															+/**
														
 
															+ * Check which tree-sitter grammars are available.
														
 
															+ * Returns a status object for each supported language.
														
 
															+ */
														
 
															+export async function getASTStatus(): Promise<{
														
 
															+  available: boolean;
														
 
															+  languages: { language: SupportedLanguage; available: boolean; error?: string }[];
														
 
															+}> {
														
 
															+  const languages: { language: SupportedLanguage; available: boolean; error?: string }[] = [];
														
 
															+
														
 
															+  try {
														
 
															+    await ensureInit();
														
 
															+  } catch (err) {
														
 
															+    return {
														
 
															+      available: false,
														
 
															+      languages: (Object.keys(GRAMMAR_MAP) as SupportedLanguage[]).map(lang => ({
														
 
															+        language: lang,
														
 
															+        available: false,
														
 
															+        error: `web-tree-sitter init failed: ${err instanceof Error ? err.message : err}`,
														
 
															+      })),
														
 
															+    };
														
 
															+  }
														
 
															+
														
 
															+  for (const lang of Object.keys(GRAMMAR_MAP) as SupportedLanguage[]) {
														
 
															+    try {
														
 
															+      const grammar = await loadGrammar(lang);
														
 
															+      if (grammar) {
														
 
															+        // Also verify the query compiles
														
 
															+        getQuery(lang, grammar);
														
 
															+        languages.push({ language: lang, available: true });
														
 
															+      } else {
														
 
															+        languages.push({ language: lang, available: false, error: "grammar failed to load" });
														
 
															+      }
														
 
															+    } catch (err) {
														
 
															+      languages.push({
														
 
															+        language: lang,
														
 
															+        available: false,
														
 
															+        error: err instanceof Error ? err.message : String(err),
														
 
															+      });
														
 
															+    }
														
 
															+  }
														
 
															+
														
 
															+  return {
														
 
															+    available: languages.some(l => l.available),
														
 
															+    languages,
														
 
															+  };
														
 
															+}
														
 
															+
														
 
															+// =============================================================================
														
 
															+// Symbol Extraction (Phase 2 Stub)
														
 
															+// =============================================================================
														
 
															+
														
 
															+/**
														
 
															+ * Metadata about a code symbol within a chunk.
														
 
															+ * Stubbed for Phase 2 — always returns empty array in Phase 1.
														
 
															+ */
														
 
															+export interface SymbolInfo {
														
 
															+  name: string;
														
 
															+  kind: string;
														
 
															+  signature?: string;
														
 
															+  line: number;
														
 
															+}
														
 
															+
														
 
															+/**
														
 
															+ * Extract symbol metadata for code within a byte range.
														
 
															+ * Stubbed for Phase 2 — returns empty array.
														
 
															+ */
														
 
															+export function extractSymbols(
														
 
															+  _content: string,
														
 
															+  _language: string,
														
 
															+  _startPos: number,
														
 
															+  _endPos: number,
														
 
															+): SymbolInfo[] {
														
 
															+  return [];
														
 
															+}
														
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@@ -75,6 +75,7 @@ import {
 
															   generateEmbeddings,
														
 
															   syncConfigToDb,
														
 
															   type ReindexResult,
														
 
															+  type ChunkStrategy,
														
 
															 } from "../store.js";
														
 
															 import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
														
 
															 import {
														
@@ -372,6 +373,32 @@ async function showStatus(): Promise<void> {
 
															     });
														
 
															   }
														
 
															+  // AST chunking status
														
 
															+  try {
														
 
															+    const { getASTStatus } = await import("../ast.js");
														
 
															+    const ast = await getASTStatus();
														
 
															+    console.log(`\n${c.bold}AST Chunking${c.reset}`);
														
 
															+    if (ast.available) {
														
 
															+      const ok = ast.languages.filter(l => l.available).map(l => l.language);
														
 
															+      const fail = ast.languages.filter(l => !l.available);
														
 
															+      console.log(`  Status:   ${c.green}active${c.reset}`);
														
 
															+      console.log(`  Languages: ${ok.join(", ")}`);
														
 
															+      if (fail.length > 0) {
														
 
															+        for (const f of fail) {
														
 
															+          console.log(`  ${c.yellow}Unavailable: ${f.language} (${f.error})${c.reset}`);
														
 
															+        }
														
 
															+      }
														
 
															+    } else {
														
 
															+      console.log(`  Status:   ${c.yellow}unavailable${c.reset} (falling back to regex chunking)`);
														
 
															+      for (const l of ast.languages) {
														
 
															+        if (l.error) console.log(`  ${c.dim}${l.language}: ${l.error}${c.reset}`);
														
 
															+      }
														
 
															+    }
														
 
															+  } catch {
														
 
															+    console.log(`\n${c.bold}AST Chunking${c.reset}`);
														
 
															+    console.log(`  Status:   ${c.dim}not available${c.reset}`);
														
 
															+  }
														
 
															+
														
 
															   if (collections.length > 0) {
														
 
															     console.log(`\n${c.bold}Collections${c.reset}`);
														
 
															     for (const col of collections) {
														
@@ -1617,10 +1644,17 @@ function parseEmbedBatchOption(name: string, value: unknown): number | undefined
 
															   return parsed;
														
 
															 }
														
 
															+function parseChunkStrategy(value: unknown): ChunkStrategy | undefined {
														
 
															+  if (value === undefined) return undefined;
														
 
															+  const s = String(value);
														
 
															+  if (s === "auto" || s === "regex") return s;
														
 
															+  throw new Error(`--chunk-strategy must be "auto" or "regex" (got "${s}")`);
														
 
															+}
														
 
															+
														
 
															 async function vectorIndex(
														
 
															   model: string = DEFAULT_EMBED_MODEL,
														
 
															   force: boolean = false,
														
 
															-  batchOptions?: { maxDocsPerBatch?: number; maxBatchBytes?: number },
														
 
															+  batchOptions?: { maxDocsPerBatch?: number; maxBatchBytes?: number; chunkStrategy?: ChunkStrategy },
														
 
															 ): Promise<void> {
														
 
															   const storeInstance = getStore();
														
 
															   const db = storeInstance.db;
														
@@ -1653,6 +1687,7 @@ async function vectorIndex(
 
															     model,
														
 
															     maxDocsPerBatch: batchOptions?.maxDocsPerBatch,
														
 
															     maxBatchBytes: batchOptions?.maxBatchBytes,
														
 
															+    chunkStrategy: batchOptions?.chunkStrategy,
														
 
															     onProgress: (info) => {
														
 
															       if (info.totalBytes === 0) return;
														
 
															       const percent = (info.bytesProcessed / info.totalBytes) * 100;
														
@@ -1746,6 +1781,7 @@ type OutputOptions = {
 
															   candidateLimit?: number;  // Max candidates to rerank (default: 40)
														
 
															   intent?: string;       // Domain intent for disambiguation
														
 
															   skipRerank?: boolean;  // Skip LLM reranking, use RRF scores only
														
 
															+  chunkStrategy?: ChunkStrategy;  // "auto" (default) or "regex"
														
 
															 };
														
 
															 // Highlight query terms in text (skip short words < 3 chars)
														
@@ -2231,6 +2267,7 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
 
															         skipRerank: opts.skipRerank,
														
 
															         explain: !!opts.explain,
														
 
															         intent,
														
 
															+        chunkStrategy: opts.chunkStrategy,
														
 
															         hooks: {
														
 
															           onEmbedStart: (count) => {
														
 
															             process.stderr.write(`${c.dim}Embedding ${count} ${count === 1 ? 'query' : 'queries'}...${c.reset}`);
														
@@ -2258,6 +2295,7 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
 
															         skipRerank: opts.skipRerank,
														
 
															         explain: !!opts.explain,
														
 
															         intent,
														
 
															+        chunkStrategy: opts.chunkStrategy,
														
 
															         hooks: {
														
 
															           onStrongSignal: (score) => {
														
 
															             process.stderr.write(`${c.dim}Strong BM25 signal (${score.toFixed(2)}) — skipping expansion${c.reset}\n`);
														
@@ -2372,6 +2410,8 @@ function parseCLI() {
 
															       "candidate-limit": { type: "string", short: "C" },
														
 
															       "no-rerank": { type: "boolean", default: false },
														
 
															       intent: { type: "string" },
														
 
															+      // Chunking options
														
 
															+      "chunk-strategy": { type: "string" },  // "regex" (default) or "auto" (AST for code files)
														
 
															       // MCP HTTP transport options
														
 
															       http: { type: "boolean" },
														
 
															       daemon: { type: "boolean" },
														
@@ -2413,6 +2453,7 @@ function parseCLI() {
 
															     skipRerank: !!values["no-rerank"],
														
 
															     explain: !!values.explain,
														
 
															     intent: values.intent as string | undefined,
														
 
															+    chunkStrategy: parseChunkStrategy(values["chunk-strategy"]),
														
 
															   };
														
 
															   return {
														
@@ -2635,6 +2676,9 @@ function showHelp(): void {
 
															   console.log("  --files | --json | --csv | --md | --xml  - Output format");
														
 
															   console.log("  -c, --collection <name>    - Filter by one or more collections");
														
 
															   console.log("");
														
 
															+  console.log("Embed/query options:");
														
 
															+  console.log("  --chunk-strategy <auto|regex> - Chunking mode (default: regex; auto uses AST for code files)");
														
 
															+  console.log("");
														
 
															   console.log("Multi-get options:");
														
 
															   console.log("  -l <num>                   - Maximum lines per file");
														
 
															   console.log("  --max-bytes <num>          - Skip files larger than N bytes (default 10240)");
														
@@ -2957,9 +3001,11 @@ if (isMain) {
 
															       try {
														
 
															         const maxDocsPerBatch = parseEmbedBatchOption("maxDocsPerBatch", cli.values["max-docs-per-batch"]);
														
 
															         const maxBatchMb = parseEmbedBatchOption("maxBatchBytes", cli.values["max-batch-mb"]);
														
 
															+        const embedChunkStrategy = parseChunkStrategy(cli.values["chunk-strategy"]);
														
 
															         await vectorIndex(DEFAULT_EMBED_MODEL, !!cli.values.force, {
														
 
															           maxDocsPerBatch,
														
 
															           maxBatchBytes: maxBatchMb === undefined ? undefined : maxBatchMb * 1024 * 1024,
														
 
															+          chunkStrategy: embedChunkStrategy,
														
 
															         });
														
 
															       } catch (error) {
														
 
															         console.error(error instanceof Error ? error.message : String(error));
														
--- a/src/index.ts
+++ b/src/index.ts
@@ -62,6 +62,7 @@ import {
 
															   type ReindexResult,
														
 
															   type EmbedProgress,
														
 
															   type EmbedResult,
														
 
															+  type ChunkStrategy,
														
 
															 } from "./store.js";
														
 
															 import {
														
 
															   LlamaCpp,
														
@@ -108,8 +109,9 @@ export type {
 
															 // Re-export the internal Store type for advanced consumers
														
 
															 export type { InternalStore };
														
 
															-// Re-export utility functions used by frontends
														
 
															+// Re-export utility functions and types used by frontends
														
 
															 export { extractSnippet, addLineNumbers, DEFAULT_MULTI_GET_MAX_BYTES };
														
 
															+export type { ChunkStrategy } from "./store.js";
														
 
															 // Re-export getDefaultDbPath for CLI/MCP that need the default database location
														
 
															 export { getDefaultDbPath } from "./store.js";
														
@@ -161,6 +163,8 @@ export interface SearchOptions {
 
															   minScore?: number;
														
 
															   /** Include explain traces */
														
 
															   explain?: boolean;
														
 
															+  /** Chunk strategy: "auto" (default, uses AST for code files) or "regex" (legacy) */
														
 
															+  chunkStrategy?: ChunkStrategy;
														
 
															 }
														
 
															 /**
														
@@ -288,6 +292,7 @@ export interface QMDStore {
 
															     model?: string;
														
 
															     maxDocsPerBatch?: number;
														
 
															     maxBatchBytes?: number;
														
 
															+    chunkStrategy?: ChunkStrategy;
														
 
															     onProgress?: (info: EmbedProgress) => void;
														
 
															   }): Promise<EmbedResult>;
														
@@ -391,6 +396,7 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
 
															           explain: opts.explain,
														
 
															           intent: opts.intent,
														
 
															           skipRerank,
														
 
															+          chunkStrategy: opts.chunkStrategy,
														
 
															         });
														
 
															       }
														
@@ -402,6 +408,7 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
 
															         explain: opts.explain,
														
 
															         intent: opts.intent,
														
 
															         skipRerank,
														
 
															+        chunkStrategy: opts.chunkStrategy,
														
 
															       });
														
 
															     },
														
 
															     searchLex: async (q, opts) => internal.searchFTS(q, opts?.limit, opts?.collection),
														
@@ -506,6 +513,7 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
 
															         model: embedOpts?.model,
														
 
															         maxDocsPerBatch: embedOpts?.maxDocsPerBatch,
														
 
															         maxBatchBytes: embedOpts?.maxBatchBytes,
														
 
															+        chunkStrategy: embedOpts?.chunkStrategy,
														
 
															         onProgress: embedOpts?.onProgress,
														
 
															       });
														
 
															     },
														
--- a/src/store.ts
+++ b/src/store.ts
@@ -223,6 +223,89 @@ export function findBestCutoff(
 
															   return bestPos;
														
 
															 }
														
 
															+// =============================================================================
														
 
															+// Chunk Strategy
														
 
															+// =============================================================================
														
 
															+
														
 
															+export type ChunkStrategy = "auto" | "regex";
														
 
															+
														
 
															+/**
														
 
															+ * Merge two sets of break points (e.g. regex + AST), keeping the highest
														
 
															+ * score at each position. Result is sorted by position.
														
 
															+ */
														
 
															+export function mergeBreakPoints(a: BreakPoint[], b: BreakPoint[]): BreakPoint[] {
														
 
															+  const seen = new Map<number, BreakPoint>();
														
 
															+  for (const bp of a) {
														
 
															+    const existing = seen.get(bp.pos);
														
 
															+    if (!existing || bp.score > existing.score) {
														
 
															+      seen.set(bp.pos, bp);
														
 
															+    }
														
 
															+  }
														
 
															+  for (const bp of b) {
														
 
															+    const existing = seen.get(bp.pos);
														
 
															+    if (!existing || bp.score > existing.score) {
														
 
															+      seen.set(bp.pos, bp);
														
 
															+    }
														
 
															+  }
														
 
															+  return Array.from(seen.values()).sort((a, b) => a.pos - b.pos);
														
 
															+}
														
 
															+
														
 
															+/**
														
 
															+ * Core chunk algorithm that operates on precomputed break points and code fences.
														
 
															+ * This is the shared implementation used by both regex-only and AST-aware chunking.
														
 
															+ */
														
 
															+export function chunkDocumentWithBreakPoints(
														
 
															+  content: string,
														
 
															+  breakPoints: BreakPoint[],
														
 
															+  codeFences: CodeFenceRegion[],
														
 
															+  maxChars: number = CHUNK_SIZE_CHARS,
														
 
															+  overlapChars: number = CHUNK_OVERLAP_CHARS,
														
 
															+  windowChars: number = CHUNK_WINDOW_CHARS
														
 
															+): { text: string; pos: number }[] {
														
 
															+  if (content.length <= maxChars) {
														
 
															+    return [{ text: content, pos: 0 }];
														
 
															+  }
														
 
															+
														
 
															+  const chunks: { text: string; pos: number }[] = [];
														
 
															+  let charPos = 0;
														
 
															+
														
 
															+  while (charPos < content.length) {
														
 
															+    const targetEndPos = Math.min(charPos + maxChars, content.length);
														
 
															+    let endPos = targetEndPos;
														
 
															+
														
 
															+    if (endPos < content.length) {
														
 
															+      const bestCutoff = findBestCutoff(
														
 
															+        breakPoints,
														
 
															+        targetEndPos,
														
 
															+        windowChars,
														
 
															+        0.7,
														
 
															+        codeFences
														
 
															+      );
														
 
															+
														
 
															+      if (bestCutoff > charPos && bestCutoff <= targetEndPos) {
														
 
															+        endPos = bestCutoff;
														
 
															+      }
														
 
															+    }
														
 
															+
														
 
															+    if (endPos <= charPos) {
														
 
															+      endPos = Math.min(charPos + maxChars, content.length);
														
 
															+    }
														
 
															+
														
 
															+    chunks.push({ text: content.slice(charPos, endPos), pos: charPos });
														
 
															+
														
 
															+    if (endPos >= content.length) {
														
 
															+      break;
														
 
															+    }
														
 
															+    charPos = endPos - overlapChars;
														
 
															+    const lastChunkPos = chunks.at(-1)!.pos;
														
 
															+    if (charPos <= lastChunkPos) {
														
 
															+      charPos = endPos;
														
 
															+    }
														
 
															+  }
														
 
															+
														
 
															+  return chunks;
														
 
															+}
														
 
															+
														
 
															 // Hybrid query: strong BM25 signal detection thresholds
														
 
															 // Skip expensive LLM expansion when top result is strong AND clearly separated from runner-up
														
 
															 export const STRONG_SIGNAL_MIN_SCORE = 0.85;
														
@@ -1197,6 +1280,7 @@ export type EmbedOptions = {
 
															   model?: string;
														
 
															   maxDocsPerBatch?: number;
														
 
															   maxBatchBytes?: number;
														
 
															+  chunkStrategy?: ChunkStrategy;
														
 
															   onProgress?: (info: EmbedProgress) => void;
														
 
															 };
														
@@ -1345,7 +1429,12 @@ export async function generateEmbeddings(
 
															         if (!doc.body.trim()) continue;
														
 
															         const title = extractTitle(doc.body, doc.path);
														
 
															-        const chunks = await chunkDocumentByTokens(doc.body);
														
 
															+        const chunks = await chunkDocumentByTokens(
														
 
															+          doc.body,
														
 
															+          undefined, undefined, undefined,
														
 
															+          doc.path,
														
 
															+          options?.chunkStrategy,
														
 
															+        );
														
 
															         for (let seq = 0; seq < chunks.length; seq++) {
														
 
															           batchChunks.push({
														
@@ -2021,78 +2110,66 @@ export function getActiveDocumentPaths(db: Database, collectionName: string): st
 
															 export { formatQueryForEmbedding, formatDocForEmbedding };
														
 
															+/**
														
 
															+ * Chunk a document using regex-only break point detection.
														
 
															+ * This is the sync, backward-compatible API used by tests and legacy callers.
														
 
															+ */
														
 
															 export function chunkDocument(
														
 
															   content: string,
														
 
															   maxChars: number = CHUNK_SIZE_CHARS,
														
 
															   overlapChars: number = CHUNK_OVERLAP_CHARS,
														
 
															   windowChars: number = CHUNK_WINDOW_CHARS
														
 
															 ): { text: string; pos: number }[] {
														
 
															-  if (content.length <= maxChars) {
														
 
															-    return [{ text: content, pos: 0 }];
														
 
															-  }
														
 
															-
														
 
															-  // Pre-scan all break points and code fences once
														
 
															   const breakPoints = scanBreakPoints(content);
														
 
															   const codeFences = findCodeFences(content);
														
 
															+  return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
														
 
															+}
														
 
															-  const chunks: { text: string; pos: number }[] = [];
														
 
															-  let charPos = 0;
														
 
															-
														
 
															-  while (charPos < content.length) {
														
 
															-    // Calculate target end position for this chunk
														
 
															-    const targetEndPos = Math.min(charPos + maxChars, content.length);
														
 
															-
														
 
															-    let endPos = targetEndPos;
														
 
															-
														
 
															-    // If not at the end, find the best break point
														
 
															-    if (endPos < content.length) {
														
 
															-      // Find best cutoff using scored algorithm
														
 
															-      const bestCutoff = findBestCutoff(
														
 
															-        breakPoints,
														
 
															-        targetEndPos,
														
 
															-        windowChars,
														
 
															-        0.7,
														
 
															-        codeFences
														
 
															-      );
														
 
															-
														
 
															-      // Only use the cutoff if it's within our current chunk
														
 
															-      if (bestCutoff > charPos && bestCutoff <= targetEndPos) {
														
 
															-        endPos = bestCutoff;
														
 
															-      }
														
 
															-    }
														
 
															-
														
 
															-    // Ensure we make progress
														
 
															-    if (endPos <= charPos) {
														
 
															-      endPos = Math.min(charPos + maxChars, content.length);
														
 
															-    }
														
 
															-
														
 
															-    chunks.push({ text: content.slice(charPos, endPos), pos: charPos });
														
 
															+/**
														
 
															+ * Async AST-aware chunking. Detects language from filepath, computes AST
														
 
															+ * break points for supported code files, merges with regex break points,
														
 
															+ * and delegates to the shared chunk algorithm.
														
 
															+ *
														
 
															+ * Falls back to regex-only when strategy is "regex", filepath is absent,
														
 
															+ * or language is unsupported.
														
 
															+ */
														
 
															+export async function chunkDocumentAsync(
														
 
															+  content: string,
														
 
															+  maxChars: number = CHUNK_SIZE_CHARS,
														
 
															+  overlapChars: number = CHUNK_OVERLAP_CHARS,
														
 
															+  windowChars: number = CHUNK_WINDOW_CHARS,
														
 
															+  filepath?: string,
														
 
															+  chunkStrategy: ChunkStrategy = "regex",
														
 
															+): Promise<{ text: string; pos: number }[]> {
														
 
															+  const regexPoints = scanBreakPoints(content);
														
 
															+  const codeFences = findCodeFences(content);
														
 
															-    // Move forward, but overlap with previous chunk
														
 
															-    // For last chunk, don't overlap (just go to the end)
														
 
															-    if (endPos >= content.length) {
														
 
															-      break;
														
 
															-    }
														
 
															-    charPos = endPos - overlapChars;
														
 
															-    const lastChunkPos = chunks.at(-1)!.pos;
														
 
															-    if (charPos <= lastChunkPos) {
														
 
															-      // Prevent infinite loop - move forward at least a bit
														
 
															-      charPos = endPos;
														
 
															+  let breakPoints = regexPoints;
														
 
															+  if (chunkStrategy === "auto" && filepath) {
														
 
															+    const { getASTBreakPoints } = await import("./ast.js");
														
 
															+    const astPoints = await getASTBreakPoints(content, filepath);
														
 
															+    if (astPoints.length > 0) {
														
 
															+      breakPoints = mergeBreakPoints(regexPoints, astPoints);
														
 
															     }
														
 
															   }
														
 
															-  return chunks;
														
 
															+  return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
														
 
															 }
														
 
															 /**
														
 
															  * Chunk a document by actual token count using the LLM tokenizer.
														
 
															  * More accurate than character-based chunking but requires async.
														
 
															+ *
														
 
															+ * When filepath and chunkStrategy are provided, uses AST-aware break points
														
 
															+ * for supported code files.
														
 
															  */
														
 
															 export async function chunkDocumentByTokens(
														
 
															   content: string,
														
 
															   maxTokens: number = CHUNK_SIZE_TOKENS,
														
 
															   overlapTokens: number = CHUNK_OVERLAP_TOKENS,
														
 
															-  windowTokens: number = CHUNK_WINDOW_TOKENS
														
 
															+  windowTokens: number = CHUNK_WINDOW_TOKENS,
														
 
															+  filepath?: string,
														
 
															+  chunkStrategy: ChunkStrategy = "regex",
														
 
															 ): Promise<{ text: string; pos: number; tokens: number }[]> {
														
 
															   const llm = getDefaultLlamaCpp();
														
@@ -2104,7 +2181,8 @@ export async function chunkDocumentByTokens(
 
															   const windowChars = windowTokens * avgCharsPerToken;
														
 
															   // Chunk in character space with conservative estimate
														
 
															-  let charChunks = chunkDocument(content, maxChars, overlapChars, windowChars);
														
 
															+  // Use AST-aware chunking for the first pass when filepath/strategy provided
														
 
															+  let charChunks = await chunkDocumentAsync(content, maxChars, overlapChars, windowChars, filepath, chunkStrategy);
														
 
															   // Tokenize and split any chunks that still exceed limit
														
 
															   const results: { text: string; pos: number; tokens: number }[] = [];
														
@@ -3674,6 +3752,7 @@ export interface HybridQueryOptions {
 
															   explain?: boolean;        // include backend/RRF/rerank score traces
														
 
															   intent?: string;          // domain intent hint for disambiguation
														
 
															   skipRerank?: boolean;     // skip LLM reranking, use only RRF scores
														
 
															+  chunkStrategy?: ChunkStrategy;
														
 
															   hooks?: SearchHooks;
														
 
															 }
														
@@ -3841,8 +3920,9 @@ export async function hybridQuery(
 
															   const intentTerms = intent ? extractIntentTerms(intent) : [];
														
 
															   const docChunkMap = new Map<string, { chunks: { text: string; pos: number }[]; bestIdx: number }>();
														
 
															+  const chunkStrategy = options?.chunkStrategy;
														
 
															   for (const cand of candidates) {
														
 
															-    const chunks = chunkDocument(cand.body);
														
 
															+    const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, chunkStrategy);
														
 
															     if (chunks.length === 0) continue;
														
 
															     // Pick chunk with most keyword overlap (fallback: first chunk)
														
@@ -4082,6 +4162,7 @@ export interface StructuredSearchOptions {
 
															   intent?: string;
														
 
															   /** Skip LLM reranking, use only RRF scores */
														
 
															   skipRerank?: boolean;
														
 
															+  chunkStrategy?: ChunkStrategy;
														
 
															   hooks?: SearchHooks;
														
 
															 }
														
@@ -4230,9 +4311,10 @@ export async function structuredSearch(
 
															   const queryTerms = primaryQuery.toLowerCase().split(/\s+/).filter(t => t.length > 2);
														
 
															   const intentTerms = intent ? extractIntentTerms(intent) : [];
														
 
															   const docChunkMap = new Map<string, { chunks: { text: string; pos: number }[]; bestIdx: number }>();
														
 
															+  const ssChunkStrategy = options?.chunkStrategy;
														
 
															   for (const cand of candidates) {
														
 
															-    const chunks = chunkDocument(cand.body);
														
 
															+    const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, ssChunkStrategy);
														
 
															     if (chunks.length === 0) continue;
														
 
															     // Pick chunk with most keyword overlap
														
--- a/test-ast-chunking.mjs
+++ b/test-ast-chunking.mjs
@@ -0,0 +1,823 @@
 
															+#!/usr/bin/env npx tsx
														
 
															+/**
														
 
															+ * Thorough integration test + real-collection performance report for
														
 
															+ * AST-aware chunking.
														
 
															+ *
														
 
															+ * Usage:
														
 
															+ *   npx tsx test-ast-chunking.mjs                  # synthetic tests only
														
 
															+ *   npx tsx test-ast-chunking.mjs /path/to/code    # + scan a real directory
														
 
															+ *   npx tsx test-ast-chunking.mjs ~/dev/myproject   # works with ~
														
 
															+ *   npx tsx test-ast-chunking.mjs --help
														
 
															+ *
														
 
															+ * The real-collection scan walks the directory tree, finds supported code
														
 
															+ * files (.ts/.js/.py/.go/.rs) and markdown (.md), chunks each file with
														
 
															+ * both strategies, and prints a comparative performance report.
														
 
															+ */
														
 
															+
														
 
															+import { readFileSync, readdirSync, statSync } from "node:fs";
														
 
															+import { join, relative, extname, resolve } from "node:path";
														
 
															+import { homedir } from "node:os";
														
 
															+import { detectLanguage, getASTBreakPoints } from "./src/ast.js";
														
 
															+import {
														
 
															+  chunkDocument,
														
 
															+  chunkDocumentAsync,
														
 
															+  chunkDocumentWithBreakPoints,
														
 
															+  mergeBreakPoints,
														
 
															+  scanBreakPoints,
														
 
															+  findCodeFences,
														
 
															+  CHUNK_SIZE_CHARS,
														
 
															+} from "./src/store.js";
														
 
															+
														
 
															+// ============================================================================
														
 
															+// Helpers
														
 
															+// ============================================================================
														
 
															+
														
 
															+let passed = 0;
														
 
															+let failed = 0;
														
 
															+
														
 
															+function section(title) {
														
 
															+  console.log(`\n${"=".repeat(70)}`);
														
 
															+  console.log(`  ${title}`);
														
 
															+  console.log("=".repeat(70));
														
 
															+}
														
 
															+
														
 
															+function check(label, condition, detail) {
														
 
															+  if (condition) {
														
 
															+    console.log(`  PASS  ${label}`);
														
 
															+    passed++;
														
 
															+  } else {
														
 
															+    console.log(`  FAIL  ${label}`);
														
 
															+    if (detail) console.log(`        ${detail}`);
														
 
															+    failed++;
														
 
															+  }
														
 
															+}
														
 
															+
														
 
															+function formatBytes(bytes) {
														
 
															+  if (bytes < 1024) return `${bytes} B`;
														
 
															+  if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
														
 
															+  return `${(bytes / 1024 / 1024).toFixed(1)} MB`;
														
 
															+}
														
 
															+
														
 
															+function pct(n, d) {
														
 
															+  if (d === 0) return "N/A";
														
 
															+  return `${((n / d) * 100).toFixed(1)}%`;
														
 
															+}
														
 
															+
														
 
															+const SKIP_DIRS = new Set([
														
 
															+  "node_modules", ".git", ".cache", "vendor", "dist", "build",
														
 
															+  "__pycache__", ".tox", ".venv", "venv", ".mypy_cache", "target",
														
 
															+  ".next", ".nuxt", "coverage", ".turbo",
														
 
															+]);
														
 
															+
														
 
															+const CODE_EXTS = new Set([
														
 
															+  ".ts", ".tsx", ".js", ".jsx", ".mts", ".cts", ".mjs", ".cjs",
														
 
															+  ".py", ".go", ".rs",
														
 
															+]);
														
 
															+
														
 
															+const ALL_EXTS = new Set([...CODE_EXTS, ".md"]);
														
 
															+
														
 
															+function walkDir(dir, maxFiles = 5000) {
														
 
															+  const results = [];
														
 
															+  const queue = [dir];
														
 
															+  while (queue.length > 0 && results.length < maxFiles) {
														
 
															+    const current = queue.shift();
														
 
															+    let entries;
														
 
															+    try {
														
 
															+      entries = readdirSync(current, { withFileTypes: true });
														
 
															+    } catch {
														
 
															+      continue;
														
 
															+    }
														
 
															+    for (const entry of entries) {
														
 
															+      if (results.length >= maxFiles) break;
														
 
															+      if (entry.name.startsWith(".")) continue;
														
 
															+      const full = join(current, entry.name);
														
 
															+      if (entry.isDirectory()) {
														
 
															+        if (!SKIP_DIRS.has(entry.name)) queue.push(full);
														
 
															+      } else if (entry.isFile()) {
														
 
															+        const ext = extname(entry.name).toLowerCase();
														
 
															+        if (ALL_EXTS.has(ext)) results.push(full);
														
 
															+      }
														
 
															+    }
														
 
															+  }
														
 
															+  return results;
														
 
															+}
														
 
															+
														
 
															+// ============================================================================
														
 
															+// Parse CLI args
														
 
															+// ============================================================================
														
 
															+
														
 
															+const args = process.argv.slice(2);
														
 
															+let scanDir = null;
														
 
															+let skipSynthetic = false;
														
 
															+
														
 
															+for (const arg of args) {
														
 
															+  if (arg === "--help" || arg === "-h") {
														
 
															+    console.log(`Usage: npx tsx test-ast-chunking.mjs [options] [directory]
														
 
															+
														
 
															+Options:
														
 
															+  --help, -h          Show this help
														
 
															+  --scan-only         Skip synthetic tests, only scan directory
														
 
															+
														
 
															+Arguments:
														
 
															+  directory           Path to scan for a real-collection performance report.
														
 
															+                      Walks the tree for .ts/.tsx/.js/.jsx/.py/.go/.rs/.md files.
														
 
															+
														
 
															+Examples:
														
 
															+  npx tsx test-ast-chunking.mjs                    # synthetic tests only
														
 
															+  npx tsx test-ast-chunking.mjs ~/dev/myproject     # synthetic + real scan
														
 
															+  npx tsx test-ast-chunking.mjs --scan-only ~/dev   # real scan only
														
 
															+`);
														
 
															+    process.exit(0);
														
 
															+  }
														
 
															+  if (arg === "--scan-only") {
														
 
															+    skipSynthetic = true;
														
 
															+  } else if (!arg.startsWith("-")) {
														
 
															+    scanDir = arg.startsWith("~") ? arg.replace("~", homedir()) : resolve(arg);
														
 
															+  }
														
 
															+}
														
 
															+
														
 
															+// ============================================================================
														
 
															+// PART 1: Synthetic Tests
														
 
															+// ============================================================================
														
 
															+
														
 
															+if (!skipSynthetic) {
														
 
															+
														
 
															+// --------------------------------------------------------------------------
														
 
															+// 1. Language Detection
														
 
															+// --------------------------------------------------------------------------
														
 
															+section("1. Language Detection");
														
 
															+
														
 
															+const langTests = [
														
 
															+  ["src/auth.ts", "typescript"],
														
 
															+  ["src/App.tsx", "tsx"],
														
 
															+  ["src/util.js", "javascript"],
														
 
															+  ["src/App.jsx", "tsx"],
														
 
															+  ["src/auth.mts", "typescript"],
														
 
															+  ["src/auth.cjs", "javascript"],
														
 
															+  ["src/auth.py", "python"],
														
 
															+  ["src/auth.go", "go"],
														
 
															+  ["src/auth.rs", "rust"],
														
 
															+  ["docs/README.md", null],
														
 
															+  ["data/file.csv", null],
														
 
															+  ["Makefile", null],
														
 
															+  ["qmd://myproject/src/auth.ts", "typescript"],
														
 
															+  ["qmd://docs/notes.md", null],
														
 
															+];
														
 
															+
														
 
															+for (const [path, expected] of langTests) {
														
 
															+  const result = detectLanguage(path);
														
 
															+  check(`detectLanguage("${path}") = ${result}`, result === expected,
														
 
															+    `expected ${expected}, got ${result}`);
														
 
															+}
														
 
															+
														
 
															+// --------------------------------------------------------------------------
														
 
															+// 2. AST Break Points - TypeScript
														
 
															+// --------------------------------------------------------------------------
														
 
															+section("2. AST Break Points - TypeScript");
														
 
															+
														
 
															+const TS_SAMPLE = `import { Database } from './db';
														
 
															+import type { User } from './types';
														
 
															+
														
 
															+interface AuthConfig {
														
 
															+  secret: string;
														
 
															+  ttl: number;
														
 
															+}
														
 
															+
														
 
															+type UserId = string;
														
 
															+
														
 
															+export class AuthService {
														
 
															+  constructor(private db: Database) {}
														
 
															+
														
 
															+  async authenticate(user: User, token: string): Promise<boolean> {
														
 
															+    const session = await this.db.findSession(token);
														
 
															+    return session?.userId === user.id;
														
 
															+  }
														
 
															+
														
 
															+  validateToken(token: string): boolean {
														
 
															+    return token.length === 64;
														
 
															+  }
														
 
															+}
														
 
															+
														
 
															+export function hashPassword(password: string): string {
														
 
															+  return crypto.createHash('sha256').update(password).digest('hex');
														
 
															+}
														
 
															+
														
 
															+const helper = (x: number) => x * 2;
														
 
															+`;
														
 
															+
														
 
															+const tsPoints = await getASTBreakPoints(TS_SAMPLE, "auth.ts");
														
 
															+console.log(`\n  TypeScript break points (${tsPoints.length} total):`);
														
 
															+for (const p of tsPoints) {
														
 
															+  const snippet = TS_SAMPLE.slice(p.pos, p.pos + 40).replace(/\n/g, "\\n");
														
 
															+  console.log(`    pos=${String(p.pos).padStart(4)} score=${String(p.score).padStart(3)} type=${p.type.padEnd(15)} text="${snippet}..."`);
														
 
															+}
														
 
															+
														
 
															+check("Has import break points", tsPoints.some(p => p.type === "ast:import"));
														
 
															+check("Has interface break point", tsPoints.some(p => p.type === "ast:iface"));
														
 
															+check("Has type break point", tsPoints.some(p => p.type === "ast:type"));
														
 
															+check("Has export break point (class)", tsPoints.some(p => p.type === "ast:export"));
														
 
															+check("Has method break points", tsPoints.filter(p => p.type === "ast:method").length >= 2);
														
 
															+check("Import scores 60", tsPoints.find(p => p.type === "ast:import")?.score === 60);
														
 
															+check("Interface scores 100", tsPoints.find(p => p.type === "ast:iface")?.score === 100);
														
 
															+check("Method scores 90", tsPoints.find(p => p.type === "ast:method")?.score === 90);
														
 
															+check("Export scores 90", tsPoints.find(p => p.type === "ast:export")?.score === 90);
														
 
															+check("Break points sorted by position", tsPoints.every((p, i) => i === 0 || p.pos >= tsPoints[i-1].pos));
														
 
															+
														
 
															+const firstImport = tsPoints.find(p => p.type === "ast:import");
														
 
															+check("First import position is correct",
														
 
															+  TS_SAMPLE.slice(firstImport.pos, firstImport.pos + 6) === "import",
														
 
															+  `at pos ${firstImport.pos}: "${TS_SAMPLE.slice(firstImport.pos, firstImport.pos + 10)}"`);
														
 
															+
														
 
															+// --------------------------------------------------------------------------
														
 
															+// 3. AST Break Points - Python
														
 
															+// --------------------------------------------------------------------------
														
 
															+section("3. AST Break Points - Python");
														
 
															+
														
 
															+const PY_SAMPLE = `import os
														
 
															+from typing import Optional, List
														
 
															+
														
 
															+class UserService:
														
 
															+    def __init__(self, db):
														
 
															+        self.db = db
														
 
															+
														
 
															+    async def find_user(self, user_id: str) -> Optional[dict]:
														
 
															+        return await self.db.find(user_id)
														
 
															+
														
 
															+    def validate(self, user: dict) -> bool:
														
 
															+        return "id" in user and "name" in user
														
 
															+
														
 
															+def create_user(name: str, email: str) -> dict:
														
 
															+    return {"name": name, "email": email}
														
 
															+
														
 
															+@login_required
														
 
															+def protected_endpoint():
														
 
															+    return "secret"
														
 
															+`;
														
 
															+
														
 
															+const pyPoints = await getASTBreakPoints(PY_SAMPLE, "service.py");
														
 
															+console.log(`\n  Python break points (${pyPoints.length} total):`);
														
 
															+for (const p of pyPoints) {
														
 
															+  const snippet = PY_SAMPLE.slice(p.pos, p.pos + 40).replace(/\n/g, "\\n");
														
 
															+  console.log(`    pos=${String(p.pos).padStart(4)} score=${String(p.score).padStart(3)} type=${p.type.padEnd(15)} text="${snippet}..."`);
														
 
															+}
														
 
															+
														
 
															+check("Has import break points", pyPoints.filter(p => p.type === "ast:import").length >= 2);
														
 
															+check("Has class break point", pyPoints.some(p => p.type === "ast:class"));
														
 
															+check("Has function break points (methods)", pyPoints.filter(p => p.type === "ast:func").length >= 3);
														
 
															+check("Has decorated definition", pyPoints.some(p => p.type === "ast:decorated"));
														
 
															+check("Class scores 100", pyPoints.find(p => p.type === "ast:class")?.score === 100);
														
 
															+
														
 
															+// --------------------------------------------------------------------------
														
 
															+// 4. AST Break Points - Go
														
 
															+// --------------------------------------------------------------------------
														
 
															+section("4. AST Break Points - Go");
														
 
															+
														
 
															+const GO_SAMPLE = `package main
														
 
															+
														
 
															+import (
														
 
															+    "fmt"
														
 
															+    "net/http"
														
 
															+)
														
 
															+
														
 
															+type Server struct {
														
 
															+    port int
														
 
															+    db   *Database
														
 
															+}
														
 
															+
														
 
															+type Config interface {
														
 
															+    GetPort() int
														
 
															+}
														
 
															+
														
 
															+func NewServer(port int) *Server {
														
 
															+    return &Server{port: port}
														
 
															+}
														
 
															+
														
 
															+func (s *Server) Start() error {
														
 
															+    return http.ListenAndServe(fmt.Sprintf(":%d", s.port), nil)
														
 
															+}
														
 
															+
														
 
															+func (s *Server) Stop() {
														
 
															+    fmt.Println("stopping")
														
 
															+}
														
 
															+`;
														
 
															+
														
 
															+const goPoints = await getASTBreakPoints(GO_SAMPLE, "server.go");
														
 
															+console.log(`\n  Go break points (${goPoints.length} total):`);
														
 
															+for (const p of goPoints) {
														
 
															+  const snippet = GO_SAMPLE.slice(p.pos, p.pos + 40).replace(/\n/g, "\\n");
														
 
															+  console.log(`    pos=${String(p.pos).padStart(4)} score=${String(p.score).padStart(3)} type=${p.type.padEnd(15)} text="${snippet}..."`);
														
 
															+}
														
 
															+
														
 
															+check("Has import break point", goPoints.some(p => p.type === "ast:import"));
														
 
															+check("Has type break points", goPoints.filter(p => p.type === "ast:type").length >= 2);
														
 
															+check("Has function break point", goPoints.some(p => p.type === "ast:func"));
														
 
															+check("Has method break points", goPoints.filter(p => p.type === "ast:method").length >= 2);
														
 
															+check("Type scores 80", goPoints.find(p => p.type === "ast:type")?.score === 80);
														
 
															+
														
 
															+// --------------------------------------------------------------------------
														
 
															+// 5. AST Break Points - Rust
														
 
															+// --------------------------------------------------------------------------
														
 
															+section("5. AST Break Points - Rust");
														
 
															+
														
 
															+const RS_SAMPLE = `use std::collections::HashMap;
														
 
															+use std::io;
														
 
															+
														
 
															+pub struct Config {
														
 
															+    port: u16,
														
 
															+    host: String,
														
 
															+}
														
 
															+
														
 
															+impl Config {
														
 
															+    pub fn new(port: u16, host: String) -> Self {
														
 
															+        Config { port, host }
														
 
															+    }
														
 
															+
														
 
															+    pub fn address(&self) -> String {
														
 
															+        format!("{}:{}", self.host, self.port)
														
 
															+    }
														
 
															+}
														
 
															+
														
 
															+pub trait Configurable {
														
 
															+    fn configure(&mut self, config: &Config);
														
 
															+}
														
 
															+
														
 
															+pub enum ServerState {
														
 
															+    Running,
														
 
															+    Stopped,
														
 
															+    Error(String),
														
 
															+}
														
 
															+
														
 
															+pub fn start_server(config: Config) -> io::Result<()> {
														
 
															+    Ok(())
														
 
															+}
														
 
															+`;
														
 
															+
														
 
															+const rsPoints = await getASTBreakPoints(RS_SAMPLE, "config.rs");
														
 
															+console.log(`\n  Rust break points (${rsPoints.length} total):`);
														
 
															+for (const p of rsPoints) {
														
 
															+  const snippet = RS_SAMPLE.slice(p.pos, p.pos + 40).replace(/\n/g, "\\n");
														
 
															+  console.log(`    pos=${String(p.pos).padStart(4)} score=${String(p.score).padStart(3)} type=${p.type.padEnd(15)} text="${snippet}..."`);
														
 
															+}
														
 
															+
														
 
															+check("Has use/import break points", rsPoints.filter(p => p.type === "ast:import").length >= 2);
														
 
															+check("Has struct break point", rsPoints.some(p => p.type === "ast:struct"));
														
 
															+check("Has impl break point", rsPoints.some(p => p.type === "ast:impl"));
														
 
															+check("Has trait break point", rsPoints.some(p => p.type === "ast:trait"));
														
 
															+check("Has enum break point", rsPoints.some(p => p.type === "ast:enum"));
														
 
															+check("Has function break point", rsPoints.some(p => p.type === "ast:func"));
														
 
															+check("Struct scores 100", rsPoints.find(p => p.type === "ast:struct")?.score === 100);
														
 
															+check("Impl scores 100", rsPoints.find(p => p.type === "ast:impl")?.score === 100);
														
 
															+check("Trait scores 100", rsPoints.find(p => p.type === "ast:trait")?.score === 100);
														
 
															+check("Enum scores 80", rsPoints.find(p => p.type === "ast:enum")?.score === 80);
														
 
															+
														
 
															+// --------------------------------------------------------------------------
														
 
															+// 6. Merge Break Points
														
 
															+// --------------------------------------------------------------------------
														
 
															+section("6. mergeBreakPoints");
														
 
															+
														
 
															+const regexPoints = [
														
 
															+  { pos: 10, score: 20, type: "blank" },
														
 
															+  { pos: 50, score: 1, type: "newline" },
														
 
															+  { pos: 100, score: 20, type: "blank" },
														
 
															+];
														
 
															+const astPointsMerge = [
														
 
															+  { pos: 10, score: 90, type: "ast:func" },
														
 
															+  { pos: 75, score: 100, type: "ast:class" },
														
 
															+  { pos: 100, score: 60, type: "ast:import" },
														
 
															+];
														
 
															+
														
 
															+const merged = mergeBreakPoints(regexPoints, astPointsMerge);
														
 
															+console.log(`\n  Merged break points (${merged.length} total):`);
														
 
															+for (const p of merged) {
														
 
															+  console.log(`    pos=${String(p.pos).padStart(4)} score=${String(p.score).padStart(3)} type=${p.type}`);
														
 
															+}
														
 
															+
														
 
															+check("Merge has 4 unique positions", merged.length === 4);
														
 
															+check("pos 10: AST wins (90 > 20)", merged.find(p => p.pos === 10)?.score === 90);
														
 
															+check("pos 50: regex only (1)", merged.find(p => p.pos === 50)?.score === 1);
														
 
															+check("pos 75: AST only (100)", merged.find(p => p.pos === 75)?.score === 100);
														
 
															+check("pos 100: AST wins (60 > 20)", merged.find(p => p.pos === 100)?.score === 60);
														
 
															+check("Sorted by position", merged.every((p, i) => i === 0 || p.pos >= merged[i-1].pos));
														
 
															+
														
 
															+// --------------------------------------------------------------------------
														
 
															+// 7. AST vs Regex Chunking Comparison (Large Synthetic File)
														
 
															+// --------------------------------------------------------------------------
														
 
															+section("7. AST vs Regex Chunking Comparison");
														
 
															+
														
 
															+const largeTSParts = [];
														
 
															+for (let i = 0; i < 30; i++) {
														
 
															+  largeTSParts.push(`
														
 
															+export function handler${i}(req: Request, res: Response): void {
														
 
															+  const startTime = Date.now();
														
 
															+  const userId = req.params.userId;
														
 
															+  const sessionToken = req.headers.authorization;
														
 
															+
														
 
															+  // Validate the incoming request parameters
														
 
															+  if (!userId || !sessionToken) {
														
 
															+    res.status(400).json({ error: "Missing required parameters" });
														
 
															+    return;
														
 
															+  }
														
 
															+
														
 
															+  // Process the request with detailed logging
														
 
															+  console.log(\`Processing request \${i} for user \${userId}\`);
														
 
															+  const result = processBusinessLogic${i}(userId, sessionToken);
														
 
															+
														
 
															+  // Return the response with timing info
														
 
															+  const elapsed = Date.now() - startTime;
														
 
															+  res.json({ data: result, processingTimeMs: elapsed });
														
 
															+}
														
 
															+`);
														
 
															+}
														
 
															+const largeTS = largeTSParts.join("\n");
														
 
															+
														
 
															+console.log(`\n  Large TS file: ${largeTS.length} chars, ${largeTSParts.length} functions`);
														
 
															+
														
 
															+const regexChunks = chunkDocument(largeTS);
														
 
															+const astChunks = await chunkDocumentAsync(largeTS, undefined, undefined, undefined, "handlers.ts", "auto");
														
 
															+
														
 
															+console.log(`  Regex chunks: ${regexChunks.length}`);
														
 
															+console.log(`  AST chunks:   ${astChunks.length}`);
														
 
															+
														
 
															+function countSplitFunctions(chunks, source) {
														
 
															+  let splits = 0;
														
 
															+  for (let i = 0; i < 30; i++) {
														
 
															+    const funcStart = source.indexOf(`function handler${i}(`);
														
 
															+    const nextFunc = source.indexOf(`function handler${i + 1}(`, funcStart + 1);
														
 
															+    const funcEnd = nextFunc > 0 ? nextFunc : source.length;
														
 
															+    const chunkIndices = new Set();
														
 
															+    for (let ci = 0; ci < chunks.length; ci++) {
														
 
															+      const chunkStart = chunks[ci].pos;
														
 
															+      const chunkEnd = chunkStart + chunks[ci].text.length;
														
 
															+      if (chunkStart < funcEnd && chunkEnd > funcStart) {
														
 
															+        chunkIndices.add(ci);
														
 
															+      }
														
 
															+    }
														
 
															+    if (chunkIndices.size > 1) splits++;
														
 
															+  }
														
 
															+  return splits;
														
 
															+}
														
 
															+
														
 
															+const regexSplits = countSplitFunctions(regexChunks, largeTS);
														
 
															+const astSplitsSynth = countSplitFunctions(astChunks, largeTS);
														
 
															+
														
 
															+console.log(`\n  Functions split across chunks:`);
														
 
															+console.log(`    Regex: ${regexSplits} / 30`);
														
 
															+console.log(`    AST:   ${astSplitsSynth} / 30`);
														
 
															+
														
 
															+check("AST splits fewer functions than regex", astSplitsSynth <= regexSplits,
														
 
															+  `AST split ${astSplitsSynth}, regex split ${regexSplits}`);
														
 
															+
														
 
															+// --------------------------------------------------------------------------
														
 
															+// 8. Markdown Files Unchanged
														
 
															+// --------------------------------------------------------------------------
														
 
															+section("8. Markdown Files Unchanged in Auto Mode");
														
 
															+
														
 
															+const mdContent = [];
														
 
															+for (let i = 0; i < 15; i++) {
														
 
															+  mdContent.push(`# Section ${i}\n\n${"Lorem ipsum dolor sit amet. ".repeat(40)}\n`);
														
 
															+}
														
 
															+const largeMD = mdContent.join("\n");
														
 
															+
														
 
															+const mdRegex = chunkDocument(largeMD);
														
 
															+const mdAst = await chunkDocumentAsync(largeMD, undefined, undefined, undefined, "readme.md", "auto");
														
 
															+
														
 
															+check("Same number of chunks", mdRegex.length === mdAst.length,
														
 
															+  `regex=${mdRegex.length}, ast=${mdAst.length}`);
														
 
															+
														
 
															+let mdIdentical = true;
														
 
															+for (let i = 0; i < mdRegex.length; i++) {
														
 
															+  if (mdRegex[i]?.text !== mdAst[i]?.text || mdRegex[i]?.pos !== mdAst[i]?.pos) {
														
 
															+    mdIdentical = false;
														
 
															+    break;
														
 
															+  }
														
 
															+}
														
 
															+check("Chunk content is identical", mdIdentical);
														
 
															+
														
 
															+// --------------------------------------------------------------------------
														
 
															+// 9-11. Strategy bypass, no-filepath fallback, error handling
														
 
															+// --------------------------------------------------------------------------
														
 
															+section("9. Regex Strategy Bypass");
														
 
															+const regexOnly = await chunkDocumentAsync(largeTS, undefined, undefined, undefined, "handlers.ts", "regex");
														
 
															+const syncRegex = chunkDocument(largeTS);
														
 
															+check("Same chunks as sync regex", regexOnly.length === syncRegex.length &&
														
 
															+  regexOnly.every((c, i) => c.text === syncRegex[i]?.text));
														
 
															+
														
 
															+section("10. No Filepath Falls Back to Regex");
														
 
															+const noPathChunks = await chunkDocumentAsync(largeTS, undefined, undefined, undefined, undefined, "auto");
														
 
															+check("Same chunks as regex", noPathChunks.length === syncRegex.length);
														
 
															+
														
 
															+section("11. Error Handling & Edge Cases");
														
 
															+check("Empty file -> []", (await getASTBreakPoints("", "e.ts")).length === 0);
														
 
															+check("Broken syntax doesn't crash", Array.isArray(await getASTBreakPoints("function { %%", "x.ts")));
														
 
															+check("Unknown ext -> []", (await getASTBreakPoints("data", "f.csv")).length === 0);
														
 
															+check("Markdown -> []", (await getASTBreakPoints("# H", "r.md")).length === 0);
														
 
															+const smallChunks = await chunkDocumentAsync("export const x = 1;", undefined, undefined, undefined, "s.ts", "auto");
														
 
															+check("Small file -> 1 chunk", smallChunks.length === 1);
														
 
															+
														
 
															+// --------------------------------------------------------------------------
														
 
															+// 12. chunkDocumentWithBreakPoints Equivalence
														
 
															+// --------------------------------------------------------------------------
														
 
															+section("12. chunkDocumentWithBreakPoints Equivalence");
														
 
															+const eqContent = "a".repeat(5000) + "\n\n" + "b".repeat(5000);
														
 
															+const eqOld = chunkDocument(eqContent);
														
 
															+const eqNew = chunkDocumentWithBreakPoints(eqContent, scanBreakPoints(eqContent), findCodeFences(eqContent));
														
 
															+check("Identical output", eqOld.length === eqNew.length &&
														
 
															+  eqOld.every((c, i) => c.text === eqNew[i]?.text && c.pos === eqNew[i]?.pos));
														
 
															+
														
 
															+// --------------------------------------------------------------------------
														
 
															+// 13. Synthetic performance
														
 
															+// --------------------------------------------------------------------------
														
 
															+section("13. Synthetic Performance");
														
 
															+
														
 
															+const t0 = performance.now();
														
 
															+for (let i = 0; i < 10; i++) await getASTBreakPoints(largeTS, "p.ts");
														
 
															+const astExtractMs = (performance.now() - t0) / 10;
														
 
															+
														
 
															+const t1 = performance.now();
														
 
															+for (let i = 0; i < 10; i++) scanBreakPoints(largeTS);
														
 
															+const regexExtractMs = (performance.now() - t1) / 10;
														
 
															+
														
 
															+const t2 = performance.now();
														
 
															+for (let i = 0; i < 10; i++) await chunkDocumentAsync(largeTS, undefined, undefined, undefined, "p.ts", "auto");
														
 
															+const astFullMs = (performance.now() - t2) / 10;
														
 
															+
														
 
															+const t3 = performance.now();
														
 
															+for (let i = 0; i < 10; i++) chunkDocument(largeTS);
														
 
															+const regexFullMs = (performance.now() - t3) / 10;
														
 
															+
														
 
															+console.log(`\n  File size:                    ${formatBytes(largeTS.length)}`);
														
 
															+console.log(`  AST break point extraction:   ${astExtractMs.toFixed(1)}ms`);
														
 
															+console.log(`  Regex break point extraction: ${regexExtractMs.toFixed(1)}ms`);
														
 
															+console.log(`  Full AST chunking:            ${astFullMs.toFixed(1)}ms`);
														
 
															+console.log(`  Full regex chunking:          ${regexFullMs.toFixed(1)}ms`);
														
 
															+console.log(`  Overhead per file:            ${(astFullMs - regexFullMs).toFixed(1)}ms`);
														
 
															+
														
 
															+check("AST chunking < 50ms per file", astFullMs < 50, `was ${astFullMs.toFixed(1)}ms`);
														
 
															+
														
 
															+// End of synthetic tests
														
 
															+section("Synthetic Test Results");
														
 
															+console.log(`\n  ${passed} passed, ${failed} failed`);
														
 
															+
														
 
															+} // end if (!skipSynthetic)
														
 
															+
														
 
															+
														
 
															+// ============================================================================
														
 
															+// PART 2: Real Collection Scan
														
 
															+// ============================================================================
														
 
															+
														
 
															+if (scanDir) {
														
 
															+
														
 
															+section(`Real Collection Scan: ${scanDir}`);
														
 
															+
														
 
															+console.log(`\n  Discovering files...`);
														
 
															+const realFiles = walkDir(scanDir);
														
 
															+console.log(`  Found ${realFiles.length} files\n`);
														
 
															+
														
 
															+if (realFiles.length === 0) {
														
 
															+  console.log("  No supported files found. Supported: .ts .tsx .js .jsx .py .go .rs .md");
														
 
															+} else {
														
 
															+
														
 
															+  // Classify files
														
 
															+  const byLang = {};
														
 
															+  let totalBytes = 0;
														
 
															+  const fileEntries = [];
														
 
															+
														
 
															+  for (const filepath of realFiles) {
														
 
															+    let content;
														
 
															+    try {
														
 
															+      const stat = statSync(filepath);
														
 
															+      if (stat.size > 500_000) continue; // skip files > 500KB
														
 
															+      content = readFileSync(filepath, "utf-8");
														
 
															+    } catch {
														
 
															+      continue;
														
 
															+    }
														
 
															+    if (!content.trim()) continue;
														
 
															+
														
 
															+    const rel = relative(scanDir, filepath);
														
 
															+    const lang = detectLanguage(filepath);
														
 
															+    const langLabel = lang ?? "markdown";
														
 
															+
														
 
															+    byLang[langLabel] = (byLang[langLabel] || 0) + 1;
														
 
															+    totalBytes += content.length;
														
 
															+    fileEntries.push({ filepath, rel, lang, langLabel, content });
														
 
															+  }
														
 
															+
														
 
															+  // Print file distribution
														
 
															+  console.log("  File distribution:");
														
 
															+  for (const [lang, count] of Object.entries(byLang).sort((a, b) => b[1] - a[1])) {
														
 
															+    console.log(`    ${lang.padEnd(14)} ${count} files`);
														
 
															+  }
														
 
															+  console.log(`    ${"total".padEnd(14)} ${fileEntries.length} files (${formatBytes(totalBytes)})`);
														
 
															+
														
 
															+  // ---- Per-file analysis ----
														
 
															+
														
 
															+  // Accumulators
														
 
															+  const perLang = {};
														
 
															+  let totalRegexChunks = 0;
														
 
															+  let totalAstChunks = 0;
														
 
															+  let totalRegexMs = 0;
														
 
															+  let totalAstMs = 0;
														
 
															+  let filesWithDifference = 0;
														
 
															+  let multiChunkFiles = 0;
														
 
															+  const bigDiffs = [];  // files where AST made the biggest difference
														
 
															+
														
 
															+  console.log(`\n  Analyzing ${fileEntries.length} files...\n`);
														
 
															+
														
 
															+  for (const entry of fileEntries) {
														
 
															+    const { rel, lang, langLabel, content } = entry;
														
 
															+    const isCode = lang !== null;
														
 
															+
														
 
															+    // Regex chunking
														
 
															+    const rt0 = performance.now();
														
 
															+    const rChunks = chunkDocument(content);
														
 
															+    const rMs = performance.now() - rt0;
														
 
															+
														
 
															+    // AST chunking
														
 
															+    const at0 = performance.now();
														
 
															+    const aChunks = await chunkDocumentAsync(content, undefined, undefined, undefined, rel, "auto");
														
 
															+    const aMs = performance.now() - at0;
														
 
															+
														
 
															+    totalRegexChunks += rChunks.length;
														
 
															+    totalAstChunks += aChunks.length;
														
 
															+    totalRegexMs += rMs;
														
 
															+    totalAstMs += aMs;
														
 
															+
														
 
															+    if (rChunks.length > 1 || aChunks.length > 1) multiChunkFiles++;
														
 
															+
														
 
															+    const chunkDiff = aChunks.length - rChunks.length;
														
 
															+    const contentDiffers = rChunks.length !== aChunks.length ||
														
 
															+      rChunks.some((c, i) => c.text !== aChunks[i]?.text);
														
 
															+
														
 
															+    if (contentDiffers) filesWithDifference++;
														
 
															+
														
 
															+    // Per-language stats
														
 
															+    if (!perLang[langLabel]) {
														
 
															+      perLang[langLabel] = {
														
 
															+        files: 0, bytes: 0, regexChunks: 0, astChunks: 0,
														
 
															+        regexMs: 0, astMs: 0, astBreakpoints: 0, diffs: 0,
														
 
															+      };
														
 
															+    }
														
 
															+    const s = perLang[langLabel];
														
 
															+    s.files++;
														
 
															+    s.bytes += content.length;
														
 
															+    s.regexChunks += rChunks.length;
														
 
															+    s.astChunks += aChunks.length;
														
 
															+    s.regexMs += rMs;
														
 
															+    s.astMs += aMs;
														
 
															+    if (contentDiffers) s.diffs++;
														
 
															+
														
 
															+    // Count AST breakpoints for code files
														
 
															+    if (isCode) {
														
 
															+      const bp = await getASTBreakPoints(content, rel);
														
 
															+      s.astBreakpoints += bp.length;
														
 
															+    }
														
 
															+
														
 
															+    // Track big differences for the detailed report
														
 
															+    if (contentDiffers && isCode && (rChunks.length > 1 || aChunks.length > 1)) {
														
 
															+      bigDiffs.push({
														
 
															+        rel, lang: langLabel, bytes: content.length,
														
 
															+        regexN: rChunks.length, astN: aChunks.length,
														
 
															+        diff: chunkDiff, overheadMs: aMs - rMs,
														
 
															+      });
														
 
															+    }
														
 
															+  }
														
 
															+
														
 
															+  // ---- Aggregate report ----
														
 
															+
														
 
															+  section("Per-Language Summary");
														
 
															+
														
 
															+  const langOrder = Object.entries(perLang).sort((a, b) => b[1].files - a[1].files);
														
 
															+  const colW = { lang: 14, files: 7, bytes: 10, rChunks: 9, aChunks: 9, bps: 6, diffs: 6, rMs: 9, aMs: 9 };
														
 
															+
														
 
															+  console.log(
														
 
															+    `\n  ${"Language".padEnd(colW.lang)}${"Files".padStart(colW.files)}${"Size".padStart(colW.bytes)}` +
														
 
															+    `${"Rx Chnk".padStart(colW.rChunks)}${"AST Chnk".padStart(colW.aChunks)}` +
														
 
															+    `${"BPs".padStart(colW.bps)}${"Diffs".padStart(colW.diffs)}` +
														
 
															+    `${"Rx ms".padStart(colW.rMs)}${"AST ms".padStart(colW.aMs)}`
														
 
															+  );
														
 
															+  console.log("  " + "-".repeat(Object.values(colW).reduce((a, b) => a + b, 0)));
														
 
															+
														
 
															+  for (const [lang, s] of langOrder) {
														
 
															+    console.log(
														
 
															+      `  ${lang.padEnd(colW.lang)}` +
														
 
															+      `${String(s.files).padStart(colW.files)}` +
														
 
															+      `${formatBytes(s.bytes).padStart(colW.bytes)}` +
														
 
															+      `${String(s.regexChunks).padStart(colW.rChunks)}` +
														
 
															+      `${String(s.astChunks).padStart(colW.aChunks)}` +
														
 
															+      `${String(s.astBreakpoints).padStart(colW.bps)}` +
														
 
															+      `${String(s.diffs).padStart(colW.diffs)}` +
														
 
															+      `${s.regexMs.toFixed(1).padStart(colW.rMs)}` +
														
 
															+      `${s.astMs.toFixed(1).padStart(colW.aMs)}`
														
 
															+    );
														
 
															+  }
														
 
															+
														
 
															+  console.log("  " + "-".repeat(Object.values(colW).reduce((a, b) => a + b, 0)));
														
 
															+  console.log(
														
 
															+    `  ${"TOTAL".padEnd(colW.lang)}` +
														
 
															+    `${String(fileEntries.length).padStart(colW.files)}` +
														
 
															+    `${formatBytes(totalBytes).padStart(colW.bytes)}` +
														
 
															+    `${String(totalRegexChunks).padStart(colW.rChunks)}` +
														
 
															+    `${String(totalAstChunks).padStart(colW.aChunks)}` +
														
 
															+    `${"".padStart(colW.bps)}` +
														
 
															+    `${String(filesWithDifference).padStart(colW.diffs)}` +
														
 
															+    `${totalRegexMs.toFixed(1).padStart(colW.rMs)}` +
														
 
															+    `${totalAstMs.toFixed(1).padStart(colW.aMs)}`
														
 
															+  );
														
 
															+
														
 
															+  // ---- Headline stats ----
														
 
															+
														
 
															+  section("Headline Stats");
														
 
															+
														
 
															+  const codeFiles = fileEntries.filter(e => e.lang !== null).length;
														
 
															+  const mdFiles = fileEntries.filter(e => e.lang === null).length;
														
 
															+  const avgOverheadMs = codeFiles > 0
														
 
															+    ? (langOrder.filter(([l]) => l !== "markdown").reduce((s, [, v]) => s + v.astMs - v.regexMs, 0)) / codeFiles
														
 
															+    : 0;
														
 
															+
														
 
															+  console.log(`
														
 
															+  Files scanned:         ${fileEntries.length} (${codeFiles} code, ${mdFiles} markdown)
														
 
															+  Multi-chunk files:     ${multiChunkFiles} (files large enough to produce >1 chunk)
														
 
															+  Files where AST differed: ${filesWithDifference} / ${fileEntries.length} (${pct(filesWithDifference, fileEntries.length)})
														
 
															+  Total chunks (regex):  ${totalRegexChunks}
														
 
															+  Total chunks (AST):    ${totalAstChunks}  (${totalAstChunks > totalRegexChunks ? "+" : ""}${totalAstChunks - totalRegexChunks})
														
 
															+  Total time (regex):    ${totalRegexMs.toFixed(1)}ms
														
 
															+  Total time (AST):      ${totalAstMs.toFixed(1)}ms  (+${(totalAstMs - totalRegexMs).toFixed(1)}ms overhead)
														
 
															+  Avg overhead per code file: ${avgOverheadMs.toFixed(2)}ms
														
 
															+  `);
														
 
															+
														
 
															+  // ---- Top differences ----
														
 
															+
														
 
															+  if (bigDiffs.length > 0) {
														
 
															+    section("Top Files Where AST Changed Chunking");
														
 
															+
														
 
															+    bigDiffs.sort((a, b) => Math.abs(b.diff) - Math.abs(a.diff));
														
 
															+    const topN = bigDiffs.slice(0, 20);
														
 
															+
														
 
															+    console.log(
														
 
															+      `\n  ${"File".padEnd(50)} ${"Lang".padEnd(12)} ${"Size".padStart(8)} ` +
														
 
															+      `${"Rx".padStart(4)} ${"AST".padStart(4)} ${"Diff".padStart(5)} ` +
														
 
															+      `${"OH ms".padStart(7)}`
														
 
															+    );
														
 
															+    console.log("  " + "-".repeat(94));
														
 
															+
														
 
															+    for (const d of topN) {
														
 
															+      const sign = d.diff > 0 ? "+" : "";
														
 
															+      console.log(
														
 
															+        `  ${d.rel.slice(0, 49).padEnd(50)} ${d.lang.padEnd(12)} ${formatBytes(d.bytes).padStart(8)} ` +
														
 
															+        `${String(d.regexN).padStart(4)} ${String(d.astN).padStart(4)} ${(sign + d.diff).padStart(5)} ` +
														
 
															+        `${d.overheadMs.toFixed(1).padStart(7)}`
														
 
															+      );
														
 
															+    }
														
 
															+
														
 
															+    if (bigDiffs.length > 20) {
														
 
															+      console.log(`\n  ... and ${bigDiffs.length - 20} more files with differences`);
														
 
															+    }
														
 
															+  }
														
 
															+
														
 
															+  // ---- Markdown regression check ----
														
 
															+
														
 
															+  const mdEntries = fileEntries.filter(e => e.lang === null);
														
 
															+  if (mdEntries.length > 0) {
														
 
															+    section("Markdown Regression Check");
														
 
															+
														
 
															+    let mdRegressions = 0;
														
 
															+    for (const entry of mdEntries) {
														
 
															+      const rChunks = chunkDocument(entry.content);
														
 
															+      const aChunks = await chunkDocumentAsync(entry.content, undefined, undefined, undefined, entry.rel, "auto");
														
 
															+      const same = rChunks.length === aChunks.length &&
														
 
															+        rChunks.every((c, i) => c.text === aChunks[i]?.text);
														
 
															+      if (!same) {
														
 
															+        mdRegressions++;
														
 
															+        console.log(`  REGRESSION: ${entry.rel} (regex=${rChunks.length}, ast=${aChunks.length})`);
														
 
															+      }
														
 
															+    }
														
 
															+
														
 
															+    if (mdRegressions === 0) {
														
 
															+      console.log(`\n  All ${mdEntries.length} markdown files produce identical chunks. No regressions.`);
														
 
															+    } else {
														
 
															+      console.log(`\n  ${mdRegressions} / ${mdEntries.length} markdown files differ (unexpected!)`);
														
 
															+    }
														
 
															+  }
														
 
															+
														
 
															+} // end if realFiles.length > 0
														
 
															+
														
 
															+} // end if scanDir
														
 
															+
														
 
															+// ============================================================================
														
 
															+// Final Summary
														
 
															+// ============================================================================
														
 
															+
														
 
															+console.log(`\n${"=".repeat(70)}`);
														
 
															+if (!skipSynthetic) {
														
 
															+  console.log(`  SYNTHETIC TESTS: ${passed} passed, ${failed} failed`);
														
 
															+}
														
 
															+if (scanDir) {
														
 
															+  console.log(`  COLLECTION SCAN: complete (see report above)`);
														
 
															+}
														
 
															+if (!scanDir && !skipSynthetic) {
														
 
															+  console.log(`\n  Tip: Run with a directory argument to scan real files:`);
														
 
															+  console.log(`    npx tsx test-ast-chunking.mjs ~/dev/my-project`);
														
 
															+}
														
 
															+console.log("=".repeat(70));
														
 
															+
														
 
															+if (failed > 0) process.exit(1);
														
--- a/test/ast.test.ts
+++ b/test/ast.test.ts
@@ -0,0 +1,329 @@
 
															+/**
														
 
															+ * ast.test.ts - Tests for AST-aware chunking support
														
 
															+ *
														
 
															+ * Tests language detection, AST break point extraction for each
														
 
															+ * supported language, and graceful fallback on errors.
														
 
															+ */
														
 
															+
														
 
															+import { describe, test, expect } from "vitest";
														
 
															+import { detectLanguage, getASTBreakPoints, extractSymbols } from "../src/ast.js";
														
 
															+import type { SupportedLanguage } from "../src/ast.js";
														
 
															+
														
 
															+// =============================================================================
														
 
															+// Language Detection
														
 
															+// =============================================================================
														
 
															+
														
 
															+describe("detectLanguage", () => {
														
 
															+  test("recognizes TypeScript extensions", () => {
														
 
															+    expect(detectLanguage("src/auth.ts")).toBe("typescript");
														
 
															+    expect(detectLanguage("src/auth.mts")).toBe("typescript");
														
 
															+    expect(detectLanguage("src/auth.cts")).toBe("typescript");
														
 
															+  });
														
 
															+
														
 
															+  test("recognizes TSX extension", () => {
														
 
															+    expect(detectLanguage("src/App.tsx")).toBe("tsx");
														
 
															+  });
														
 
															+
														
 
															+  test("recognizes JavaScript extensions", () => {
														
 
															+    expect(detectLanguage("src/util.js")).toBe("javascript");
														
 
															+    expect(detectLanguage("src/util.mjs")).toBe("javascript");
														
 
															+    expect(detectLanguage("src/util.cjs")).toBe("javascript");
														
 
															+  });
														
 
															+
														
 
															+  test("recognizes JSX as tsx", () => {
														
 
															+    expect(detectLanguage("src/App.jsx")).toBe("tsx");
														
 
															+  });
														
 
															+
														
 
															+  test("recognizes Python extension", () => {
														
 
															+    expect(detectLanguage("src/auth.py")).toBe("python");
														
 
															+  });
														
 
															+
														
 
															+  test("recognizes Go extension", () => {
														
 
															+    expect(detectLanguage("src/auth.go")).toBe("go");
														
 
															+  });
														
 
															+
														
 
															+  test("recognizes Rust extension", () => {
														
 
															+    expect(detectLanguage("src/auth.rs")).toBe("rust");
														
 
															+  });
														
 
															+
														
 
															+  test("returns null for markdown", () => {
														
 
															+    expect(detectLanguage("docs/README.md")).toBeNull();
														
 
															+  });
														
 
															+
														
 
															+  test("returns null for unknown extensions", () => {
														
 
															+    expect(detectLanguage("data/file.csv")).toBeNull();
														
 
															+    expect(detectLanguage("config.yaml")).toBeNull();
														
 
															+    expect(detectLanguage("Makefile")).toBeNull();
														
 
															+  });
														
 
															+
														
 
															+  test("is case-insensitive for extensions", () => {
														
 
															+    expect(detectLanguage("src/Auth.TS")).toBe("typescript");
														
 
															+    expect(detectLanguage("src/Auth.PY")).toBe("python");
														
 
															+  });
														
 
															+
														
 
															+  test("works with virtual qmd:// paths", () => {
														
 
															+    expect(detectLanguage("qmd://myproject/src/auth.ts")).toBe("typescript");
														
 
															+    expect(detectLanguage("qmd://docs/README.md")).toBeNull();
														
 
															+  });
														
 
															+});
														
 
															+
														
 
															+// =============================================================================
														
 
															+// AST Break Points - TypeScript
														
 
															+// =============================================================================
														
 
															+
														
 
															+describe("getASTBreakPoints - TypeScript", () => {
														
 
															+  const TS_SAMPLE = `import { Database } from './db';
														
 
															+import type { User } from './types';
														
 
															+
														
 
															+interface AuthConfig {
														
 
															+  secret: string;
														
 
															+  ttl: number;
														
 
															+}
														
 
															+
														
 
															+type UserId = string;
														
 
															+
														
 
															+export class AuthService {
														
 
															+  constructor(private db: Database) {}
														
 
															+
														
 
															+  async authenticate(user: User, token: string): Promise<boolean> {
														
 
															+    const session = await this.db.findSession(token);
														
 
															+    return session?.userId === user.id;
														
 
															+  }
														
 
															+
														
 
															+  validateToken(token: string): boolean {
														
 
															+    return token.length === 64;
														
 
															+  }
														
 
															+}
														
 
															+
														
 
															+export function hashPassword(password: string): string {
														
 
															+  return crypto.createHash('sha256').update(password).digest('hex');
														
 
															+}
														
 
															+`;
														
 
															+
														
 
															+  test("produces break points at function, class, and import boundaries", async () => {
														
 
															+    const points = await getASTBreakPoints(TS_SAMPLE, "src/auth.ts");
														
 
															+    expect(points.length).toBeGreaterThan(0);
														
 
															+
														
 
															+    // Should have import, interface, type, class (via export), method, and function break points
														
 
															+    const types = points.map(p => p.type);
														
 
															+    expect(types.some(t => t.includes("import"))).toBe(true);
														
 
															+    expect(types.some(t => t.includes("iface"))).toBe(true);
														
 
															+    expect(types.some(t => t.includes("type"))).toBe(true);
														
 
															+    expect(types.some(t => t.includes("export") || t.includes("class"))).toBe(true);
														
 
															+    expect(types.some(t => t.includes("method"))).toBe(true);
														
 
															+  });
														
 
															+
														
 
															+  test("break points are sorted by position", async () => {
														
 
															+    const points = await getASTBreakPoints(TS_SAMPLE, "src/auth.ts");
														
 
															+    for (let i = 1; i < points.length; i++) {
														
 
															+      expect(points[i]!.pos).toBeGreaterThanOrEqual(points[i - 1]!.pos);
														
 
															+    }
														
 
															+  });
														
 
															+
														
 
															+  test("scores align with expected hierarchy", async () => {
														
 
															+    const points = await getASTBreakPoints(TS_SAMPLE, "src/auth.ts");
														
 
															+
														
 
															+    // Class/interface should score 100
														
 
															+    const ifacePoint = points.find(p => p.type === "ast:iface");
														
 
															+    expect(ifacePoint?.score).toBe(100);
														
 
															+
														
 
															+    // Function/method should score 90
														
 
															+    const methodPoint = points.find(p => p.type === "ast:method");
														
 
															+    expect(methodPoint?.score).toBe(90);
														
 
															+
														
 
															+    // Import should score 60
														
 
															+    const importPoint = points.find(p => p.type === "ast:import");
														
 
															+    expect(importPoint?.score).toBe(60);
														
 
															+  });
														
 
															+
														
 
															+  test("break point positions match actual content positions", async () => {
														
 
															+    const points = await getASTBreakPoints(TS_SAMPLE, "src/auth.ts");
														
 
															+
														
 
															+    // First import should be at position 0
														
 
															+    const firstImport = points.find(p => p.type === "ast:import");
														
 
															+    expect(firstImport).toBeDefined();
														
 
															+    expect(TS_SAMPLE.slice(firstImport!.pos, firstImport!.pos + 6)).toBe("import");
														
 
															+  });
														
 
															+});
														
 
															+
														
 
															+// =============================================================================
														
 
															+// AST Break Points - Python
														
 
															+// =============================================================================
														
 
															+
														
 
															+describe("getASTBreakPoints - Python", () => {
														
 
															+  const PY_SAMPLE = `import os
														
 
															+from typing import Optional
														
 
															+
														
 
															+class AuthService:
														
 
															+    def __init__(self, db):
														
 
															+        self.db = db
														
 
															+
														
 
															+    async def authenticate(self, user, token):
														
 
															+        session = await self.db.find(token)
														
 
															+        return session.user_id == user.id
														
 
															+
														
 
															+    def validate_token(self, token):
														
 
															+        return len(token) == 64
														
 
															+
														
 
															+def hash_password(password: str) -> str:
														
 
															+    return hashlib.sha256(password.encode()).hexdigest()
														
 
															+
														
 
															+@decorator
														
 
															+def decorated_func():
														
 
															+    pass
														
 
															+`;
														
 
															+
														
 
															+  test("produces break points for class, function, import, and decorated definitions", async () => {
														
 
															+    const points = await getASTBreakPoints(PY_SAMPLE, "auth.py");
														
 
															+    const types = points.map(p => p.type);
														
 
															+
														
 
															+    expect(types.some(t => t.includes("import"))).toBe(true);
														
 
															+    expect(types.some(t => t.includes("class"))).toBe(true);
														
 
															+    expect(types.some(t => t.includes("func"))).toBe(true);
														
 
															+    expect(types.some(t => t.includes("decorated"))).toBe(true);
														
 
															+  });
														
 
															+
														
 
															+  test("captures method definitions inside classes", async () => {
														
 
															+    const points = await getASTBreakPoints(PY_SAMPLE, "auth.py");
														
 
															+    // Should capture __init__, authenticate, and validate_token as func
														
 
															+    const funcPoints = points.filter(p => p.type === "ast:func");
														
 
															+    expect(funcPoints.length).toBeGreaterThanOrEqual(3);
														
 
															+  });
														
 
															+});
														
 
															+
														
 
															+// =============================================================================
														
 
															+// AST Break Points - Go
														
 
															+// =============================================================================
														
 
															+
														
 
															+describe("getASTBreakPoints - Go", () => {
														
 
															+  const GO_SAMPLE = `package main
														
 
															+
														
 
															+import "fmt"
														
 
															+
														
 
															+type AuthService struct {
														
 
															+    db *Database
														
 
															+}
														
 
															+
														
 
															+func (s *AuthService) Authenticate(user User) bool {
														
 
															+    return true
														
 
															+}
														
 
															+
														
 
															+func HashPassword(password string) string {
														
 
															+    return "hash"
														
 
															+}
														
 
															+`;
														
 
															+
														
 
															+  test("produces break points for type, function, method, and import", async () => {
														
 
															+    const points = await getASTBreakPoints(GO_SAMPLE, "auth.go");
														
 
															+    const types = points.map(p => p.type);
														
 
															+
														
 
															+    expect(types.some(t => t.includes("import"))).toBe(true);
														
 
															+    expect(types.some(t => t.includes("type"))).toBe(true);
														
 
															+    expect(types.some(t => t.includes("method"))).toBe(true);
														
 
															+    expect(types.some(t => t.includes("func"))).toBe(true);
														
 
															+  });
														
 
															+
														
 
															+  test("function and method both score 90", async () => {
														
 
															+    const points = await getASTBreakPoints(GO_SAMPLE, "auth.go");
														
 
															+    const funcPoint = points.find(p => p.type === "ast:func");
														
 
															+    const methodPoint = points.find(p => p.type === "ast:method");
														
 
															+
														
 
															+    expect(funcPoint?.score).toBe(90);
														
 
															+    expect(methodPoint?.score).toBe(90);
														
 
															+  });
														
 
															+});
														
 
															+
														
 
															+// =============================================================================
														
 
															+// AST Break Points - Rust
														
 
															+// =============================================================================
														
 
															+
														
 
															+describe("getASTBreakPoints - Rust", () => {
														
 
															+  const RS_SAMPLE = `use std::collections::HashMap;
														
 
															+
														
 
															+struct AuthService {
														
 
															+    db: Database,
														
 
															+}
														
 
															+
														
 
															+impl AuthService {
														
 
															+    fn authenticate(&self, user: &User) -> bool {
														
 
															+        true
														
 
															+    }
														
 
															+}
														
 
															+
														
 
															+trait Authenticatable {
														
 
															+    fn validate(&self) -> bool;
														
 
															+}
														
 
															+
														
 
															+enum Role {
														
 
															+    Admin,
														
 
															+    User,
														
 
															+}
														
 
															+
														
 
															+fn hash_password(password: &str) -> String {
														
 
															+    String::new()
														
 
															+}
														
 
															+`;
														
 
															+
														
 
															+  test("produces break points for struct, impl, trait, enum, function, and use", async () => {
														
 
															+    const points = await getASTBreakPoints(RS_SAMPLE, "auth.rs");
														
 
															+    const types = points.map(p => p.type);
														
 
															+
														
 
															+    expect(types.some(t => t.includes("import"))).toBe(true);  // use_declaration -> @import
														
 
															+    expect(types.some(t => t.includes("struct"))).toBe(true);
														
 
															+    expect(types.some(t => t.includes("impl"))).toBe(true);
														
 
															+    expect(types.some(t => t.includes("trait"))).toBe(true);
														
 
															+    expect(types.some(t => t.includes("enum"))).toBe(true);
														
 
															+    expect(types.some(t => t.includes("func"))).toBe(true);
														
 
															+  });
														
 
															+
														
 
															+  test("struct, impl, and trait all score 100", async () => {
														
 
															+    const points = await getASTBreakPoints(RS_SAMPLE, "auth.rs");
														
 
															+    const structPoint = points.find(p => p.type === "ast:struct");
														
 
															+    const implPoint = points.find(p => p.type === "ast:impl");
														
 
															+    const traitPoint = points.find(p => p.type === "ast:trait");
														
 
															+
														
 
															+    expect(structPoint?.score).toBe(100);
														
 
															+    expect(implPoint?.score).toBe(100);
														
 
															+    expect(traitPoint?.score).toBe(100);
														
 
															+  });
														
 
															+});
														
 
															+
														
 
															+// =============================================================================
														
 
															+// Error Handling & Fallback
														
 
															+// =============================================================================
														
 
															+
														
 
															+describe("getASTBreakPoints - error handling", () => {
														
 
															+  test("returns empty array for unsupported file types", async () => {
														
 
															+    const points = await getASTBreakPoints("# Hello World", "readme.md");
														
 
															+    expect(points).toEqual([]);
														
 
															+  });
														
 
															+
														
 
															+  test("returns empty array for unknown extensions", async () => {
														
 
															+    const points = await getASTBreakPoints("data,here", "file.csv");
														
 
															+    expect(points).toEqual([]);
														
 
															+  });
														
 
															+
														
 
															+  test("handles empty content gracefully", async () => {
														
 
															+    const points = await getASTBreakPoints("", "empty.ts");
														
 
															+    expect(points).toEqual([]);
														
 
															+  });
														
 
															+
														
 
															+  test("handles syntactically invalid code gracefully", async () => {
														
 
															+    // Tree-sitter is error-tolerant, so this should still parse (with error nodes)
														
 
															+    // but should not crash
														
 
															+    const points = await getASTBreakPoints("function { broken syntax %%%", "broken.ts");
														
 
															+    // Should either return some partial break points or empty array — not throw
														
 
															+    expect(Array.isArray(points)).toBe(true);
														
 
															+  });
														
 
															+});
														
 
															+
														
 
															+// =============================================================================
														
 
															+// Symbol Extraction Stub (Phase 2)
														
 
															+// =============================================================================
														
 
															+
														
 
															+describe("extractSymbols", () => {
														
 
															+  test("returns empty array (Phase 2 stub)", () => {
														
 
															+    const symbols = extractSymbols("function foo() {}", "typescript", 0, 18);
														
 
															+    expect(symbols).toEqual([]);
														
 
															+  });
														
 
															+});
														
--- a/test/store.test.ts
+++ b/test/store.test.ts
@@ -29,6 +29,9 @@ import {
 
															   formatDocForEmbedding,
														
 
															   chunkDocument,
														
 
															   chunkDocumentByTokens,
														
 
															+  chunkDocumentAsync,
														
 
															+  chunkDocumentWithBreakPoints,
														
 
															+  mergeBreakPoints,
														
 
															   scanBreakPoints,
														
 
															   findCodeFences,
														
 
															   isInsideCodeFence,
														
@@ -1020,6 +1023,127 @@ Final section content.
 
															   });
														
 
															 });
														
 
															+// =============================================================================
														
 
															+// AST-Aware Chunking Integration Tests
														
 
															+// =============================================================================
														
 
															+
														
 
															+describe("mergeBreakPoints", () => {
														
 
															+  test("merges two sets of break points keeping highest score at each position", () => {
														
 
															+    const regexPoints: BreakPoint[] = [
														
 
															+      { pos: 10, score: 20, type: "blank" },
														
 
															+      { pos: 50, score: 1, type: "newline" },
														
 
															+    ];
														
 
															+    const astPoints: BreakPoint[] = [
														
 
															+      { pos: 10, score: 90, type: "ast:func" },
														
 
															+      { pos: 100, score: 100, type: "ast:class" },
														
 
															+    ];
														
 
															+
														
 
															+    const merged = mergeBreakPoints(regexPoints, astPoints);
														
 
															+    expect(merged).toHaveLength(3);
														
 
															+
														
 
															+    // pos 10: AST score (90) wins over regex (20)
														
 
															+    const at10 = merged.find(p => p.pos === 10);
														
 
															+    expect(at10?.score).toBe(90);
														
 
															+    expect(at10?.type).toBe("ast:func");
														
 
															+
														
 
															+    // pos 50: only regex
														
 
															+    expect(merged.find(p => p.pos === 50)?.score).toBe(1);
														
 
															+
														
 
															+    // pos 100: only AST
														
 
															+    expect(merged.find(p => p.pos === 100)?.score).toBe(100);
														
 
															+  });
														
 
															+
														
 
															+  test("returns sorted by position", () => {
														
 
															+    const a: BreakPoint[] = [{ pos: 100, score: 10, type: "a" }];
														
 
															+    const b: BreakPoint[] = [{ pos: 5, score: 20, type: "b" }];
														
 
															+    const merged = mergeBreakPoints(a, b);
														
 
															+    expect(merged[0]!.pos).toBe(5);
														
 
															+    expect(merged[1]!.pos).toBe(100);
														
 
															+  });
														
 
															+});
														
 
															+
														
 
															+describe("chunkDocumentWithBreakPoints", () => {
														
 
															+  test("produces same output as chunkDocument for same input", () => {
														
 
															+    const content = "a".repeat(5000) + "\n\n" + "b".repeat(5000);
														
 
															+    const breakPoints = scanBreakPoints(content);
														
 
															+    const codeFences = findCodeFences(content);
														
 
															+
														
 
															+    const chunksOriginal = chunkDocument(content);
														
 
															+    const chunksNew = chunkDocumentWithBreakPoints(content, breakPoints, codeFences);
														
 
															+
														
 
															+    expect(chunksNew.length).toBe(chunksOriginal.length);
														
 
															+    for (let i = 0; i < chunksNew.length; i++) {
														
 
															+      expect(chunksNew[i]!.text).toBe(chunksOriginal[i]!.text);
														
 
															+      expect(chunksNew[i]!.pos).toBe(chunksOriginal[i]!.pos);
														
 
															+    }
														
 
															+  });
														
 
															+});
														
 
															+
														
 
															+describe("AST-aware chunkDocumentAsync", () => {
														
 
															+  const TS_CODE = `import { Database } from './db';
														
 
															+
														
 
															+export class AuthService {
														
 
															+  constructor(private db: Database) {}
														
 
															+
														
 
															+  async authenticate(user: User, token: string): Promise<boolean> {
														
 
															+    const session = await this.db.findSession(token);
														
 
															+    return session?.userId === user.id;
														
 
															+  }
														
 
															+
														
 
															+  validateToken(token: string): boolean {
														
 
															+    return token.length === 64;
														
 
															+  }
														
 
															+}
														
 
															+
														
 
															+export function hashPassword(password: string): string {
														
 
															+  return crypto.createHash('sha256').update(password).digest('hex');
														
 
															+}
														
 
															+`.repeat(10); // Repeat to make it large enough to trigger chunking
														
 
															+
														
 
															+  test("returns chunks for code files with AST strategy", async () => {
														
 
															+    const chunks = await chunkDocumentAsync(TS_CODE, undefined, undefined, undefined, "auth.ts", "auto");
														
 
															+    expect(chunks.length).toBeGreaterThan(0);
														
 
															+    // Each chunk should have text and pos
														
 
															+    for (const chunk of chunks) {
														
 
															+      expect(typeof chunk.text).toBe("string");
														
 
															+      expect(chunk.text.length).toBeGreaterThan(0);
														
 
															+      expect(chunk.pos).toBeGreaterThanOrEqual(0);
														
 
															+    }
														
 
															+  });
														
 
															+
														
 
															+  test("regex strategy produces same output as chunkDocument for code files", async () => {
														
 
															+    const asyncChunks = await chunkDocumentAsync(TS_CODE, undefined, undefined, undefined, "auth.ts", "regex");
														
 
															+    const syncChunks = chunkDocument(TS_CODE);
														
 
															+
														
 
															+    expect(asyncChunks.length).toBe(syncChunks.length);
														
 
															+    for (let i = 0; i < asyncChunks.length; i++) {
														
 
															+      expect(asyncChunks[i]!.text).toBe(syncChunks[i]!.text);
														
 
															+      expect(asyncChunks[i]!.pos).toBe(syncChunks[i]!.pos);
														
 
															+    }
														
 
															+  });
														
 
															+
														
 
															+  test("markdown files are unchanged in auto mode", async () => {
														
 
															+    const mdContent = ("# Heading\n\n" + "Some text. ".repeat(200) + "\n\n").repeat(10);
														
 
															+    const asyncChunks = await chunkDocumentAsync(mdContent, undefined, undefined, undefined, "readme.md", "auto");
														
 
															+    const syncChunks = chunkDocument(mdContent);
														
 
															+
														
 
															+    expect(asyncChunks.length).toBe(syncChunks.length);
														
 
															+    for (let i = 0; i < asyncChunks.length; i++) {
														
 
															+      expect(asyncChunks[i]!.text).toBe(syncChunks[i]!.text);
														
 
															+    }
														
 
															+  });
														
 
															+
														
 
															+  test("no filepath falls back to regex-only", async () => {
														
 
															+    const asyncChunks = await chunkDocumentAsync(TS_CODE, undefined, undefined, undefined, undefined, "auto");
														
 
															+    const syncChunks = chunkDocument(TS_CODE);
														
 
															+
														
 
															+    expect(asyncChunks.length).toBe(syncChunks.length);
														
 
															+    for (let i = 0; i < asyncChunks.length; i++) {
														
 
															+      expect(asyncChunks[i]!.text).toBe(syncChunks[i]!.text);
														
 
															+    }
														
 
															+  });
														
 
															+});
														
 
															+
														
 
															 // =============================================================================
														
 
															 // Caching Tests
														
 
															 // =============================================================================