3 сар өмнө · 244ddf5ecb
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,18 @@
 
				 
			
 
				 ## [Unreleased]
			
 
				 
			
 
				+### Added
			
 
				+
			
 
				+- AST-aware chunking for code files via `web-tree-sitter`. Supported
			
 
				+  languages: TypeScript/JavaScript, Python, Go, and Rust. Code files
			
 
				+  are chunked at function, class, and import boundaries instead of
			
 
				+  arbitrary text positions. Markdown and unknown file types are unchanged.
			
 
				+- `--chunk-strategy <auto|regex>` flag for `qmd embed` and `qmd query`.
			
 
				+  Default is `regex` (existing behavior). Use `auto` to enable AST-aware
			
 
				+  chunking for code files.
			
 
				+- `qmd status` now shows AST grammar availability.
			
 
				+- SDK: `chunkStrategy` option on `embed()` and `search()` methods.
			
 
				+
			
 
				 ### Fixes
			
 
				 
			
 
				 - Sync stale `bun.lock` (`better-sqlite3` 11.x → 12.x). CI and release
			
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -138,6 +138,7 @@ bun test --preload ./src/test-preload.ts test/
 
				 - node-llama-cpp for embeddings (embeddinggemma), reranking (qwen3-reranker), and query expansion (Qwen3)
			
 
				 - Reciprocal Rank Fusion (RRF) for combining results
			
 
				 - Smart chunking: 900 tokens/chunk with 15% overlap, prefers markdown headings as boundaries
			
 
				+- AST-aware chunking: use `--chunk-strategy auto` to chunk code files (.ts/.js/.py/.go/.rs) at function/class/import boundaries via tree-sitter. Default is `regex` (existing behavior). Markdown and unknown file types always use regex chunking.
			
 
				 
			
 
				 ## Important: Do NOT run automatically
			
 
				 
			
--- a/README.md
+++ b/README.md
@@ -318,6 +318,7 @@ const result = await store.update({
 
				 // Generate vector embeddings
			
 
				 const embedResult = await store.embed({
			
 
				   force: false,           // true to re-embed everything
			
 
				+  chunkStrategy: "auto",  // "regex" (default) or "auto" (AST for code files)
			
 
				   onProgress: ({ current, total, collection }) => {
			
 
				     console.log(`Embedding ${current}/${total}`)
			
 
				   },
			
@@ -564,8 +565,27 @@ qmd embed
 
				 
			
 
				 # Force re-embed everything
			
 
				 qmd embed -f
			
 
				+
			
 
				+# Enable AST-aware chunking for code files (TS, JS, Python, Go, Rust)
			
 
				+qmd embed --chunk-strategy auto
			
 
				+
			
 
				+# Also works with query for consistent chunk selection
			
 
				+qmd query "auth flow" --chunk-strategy auto
			
 
				 ```
			
 
				 
			
 
				+**AST-aware chunking** (`--chunk-strategy auto`) uses tree-sitter to chunk code
			
 
				+files at function, class, and import boundaries instead of arbitrary text
			
 
				+positions. This produces higher-quality chunks and better search results for
			
 
				+codebases. Markdown and other file types always use regex-based chunking
			
 
				+regardless of strategy.
			
 
				+
			
 
				+The default is `regex` (existing behavior). Use `--chunk-strategy auto` to
			
 
				+opt in. Run `qmd status` to verify which grammars are available.
			
 
				+
			
 
				+> **Note:** Tree-sitter grammars are optional dependencies. If they are not
			
 
				+> installed, `--chunk-strategy auto` falls back to regex-only chunking
			
 
				+> automatically. Tested on both Node.js and Bun.
			
 
				+
			
 
				 ### Context Management
			
 
				 
			
 
				 Context adds descriptive metadata to collections and paths, helping search understand your content.
			
@@ -813,6 +833,19 @@ The squared distance decay means a heading 200 tokens back (score ~30) still bea
 
				 
			
 
				 **Code Fence Protection:** Break points inside code blocks are ignored—code stays together. If a code block exceeds the chunk size, it's kept whole when possible.
			
 
				 
			
 
				+**AST-Aware Chunking (Code Files):**
			
 
				+
			
 
				+For supported code files, QMD also parses the source with [tree-sitter](https://tree-sitter.github.io/) and adds AST-derived break points that are merged with the regex scores above:
			
 
				+
			
 
				+| AST Node | Score | Languages |
			
 
				+|----------|-------|-----------|
			
 
				+| Class / interface / struct / impl / trait / mod | 100 | TypeScript/JavaScript, Python, Rust |
			
 
				+| Function / method | 90 | All |
			
 
				+| Type alias / enum / Go type declaration | 80 | TypeScript, Go, Rust |
			
 
				+| Import / use declaration | 60 | All |
			
 
				+
			
 
				+Supported for `.ts`, `.tsx`, `.mts`, `.cts`, `.js`, `.jsx`, `.mjs`, `.cjs`, `.py`, `.go`, and `.rs` files. Enable with `--chunk-strategy auto`. Markdown and other file types always use regex chunking.
			
 
				+
			
 
				 ### Query Flow (Hybrid)
			
 
				 
			
 
				 ```
			
--- a/package.json
+++ b/package.json
@@ -51,6 +51,7 @@
 
				     "node-llama-cpp": "^3.17.1",
			
 
				     "picomatch": "^4.0.0",
			
 
				     "sqlite-vec": "^0.1.7-alpha.2",
			
 
				+    "web-tree-sitter": "0.26.7",
			
 
				     "yaml": "^2.8.2",
			
 
				     "zod": "4.2.1"
			
 
				   },
			
@@ -59,7 +60,11 @@
 
				     "sqlite-vec-darwin-x64": "^0.1.7-alpha.2",
			
 
				     "sqlite-vec-linux-arm64": "^0.1.7-alpha.2",
			
 
				     "sqlite-vec-linux-x64": "^0.1.7-alpha.2",
			
 
				-    "sqlite-vec-windows-x64": "^0.1.7-alpha.2"
			
 
				+    "sqlite-vec-windows-x64": "^0.1.7-alpha.2",
			
 
				+    "tree-sitter-go": "0.23.4",
			
 
				+    "tree-sitter-python": "0.23.4",
			
 
				+    "tree-sitter-rust": "0.24.0",
			
 
				+    "tree-sitter-typescript": "0.23.2"
			
 
				   },
			
 
				   "devDependencies": {
			
 
				     "@types/better-sqlite3": "^7.6.0",
			
--- a/src/ast.ts
+++ b/src/ast.ts
@@ -0,0 +1,391 @@
 
				+/**
			
 
				+ * AST-aware chunking support via web-tree-sitter.
			
 
				+ *
			
 
				+ * Provides language detection, AST break point extraction for supported
			
 
				+ * code file types, and a stub for future symbol extraction.
			
 
				+ *
			
 
				+ * All functions degrade gracefully: parse failures or unsupported languages
			
 
				+ * return empty arrays, falling back to regex-only chunking.
			
 
				+ *
			
 
				+ * ## Dependency Note
			
 
				+ *
			
 
				+ * Grammar packages (tree-sitter-typescript, etc.) are listed as
			
 
				+ * optionalDependencies with pinned versions. They ship native prebuilds
			
 
				+ * and source files (~72 MB total) but QMD only uses the .wasm files
			
 
				+ * (~5 MB). If install size becomes a concern, the .wasm files can be
			
 
				+ * bundled directly in the repo (e.g. assets/grammars/) and resolved
			
 
				+ * via import.meta.url instead of require.resolve(), eliminating the
			
 
				+ * grammar packages entirely.
			
 
				+ */
			
 
				+
			
 
				+import { createRequire } from "node:module";
			
 
				+import { extname } from "node:path";
			
 
				+import type { BreakPoint } from "./store.js";
			
 
				+
			
 
				+// web-tree-sitter types — imported dynamically to avoid top-level WASM init
			
 
				+type ParserType = import("web-tree-sitter").Parser;
			
 
				+type LanguageType = import("web-tree-sitter").Language;
			
 
				+type QueryType = import("web-tree-sitter").Query;
			
 
				+
			
 
				+// =============================================================================
			
 
				+// Language Detection
			
 
				+// =============================================================================
			
 
				+
			
 
				+export type SupportedLanguage = "typescript" | "tsx" | "javascript" | "python" | "go" | "rust";
			
 
				+
			
 
				+const EXTENSION_MAP: Record<string, SupportedLanguage> = {
			
 
				+  ".ts": "typescript",
			
 
				+  ".tsx": "tsx",
			
 
				+  ".js": "javascript",
			
 
				+  ".jsx": "tsx",
			
 
				+  ".mts": "typescript",
			
 
				+  ".cts": "typescript",
			
 
				+  ".mjs": "javascript",
			
 
				+  ".cjs": "javascript",
			
 
				+  ".py": "python",
			
 
				+  ".go": "go",
			
 
				+  ".rs": "rust",
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * Detect language from file path extension.
			
 
				+ * Returns null for unsupported or unknown extensions (including .md).
			
 
				+ */
			
 
				+export function detectLanguage(filepath: string): SupportedLanguage | null {
			
 
				+  const ext = extname(filepath).toLowerCase();
			
 
				+  return EXTENSION_MAP[ext] ?? null;
			
 
				+}
			
 
				+
			
 
				+// =============================================================================
			
 
				+// Grammar Resolution
			
 
				+// =============================================================================
			
 
				+
			
 
				+/**
			
 
				+ * Maps language to the npm package and wasm filename for the grammar.
			
 
				+ */
			
 
				+const GRAMMAR_MAP: Record<SupportedLanguage, { pkg: string; wasm: string }> = {
			
 
				+  typescript: { pkg: "tree-sitter-typescript", wasm: "tree-sitter-typescript.wasm" },
			
 
				+  tsx:        { pkg: "tree-sitter-typescript", wasm: "tree-sitter-tsx.wasm" },
			
 
				+  javascript: { pkg: "tree-sitter-typescript", wasm: "tree-sitter-typescript.wasm" },
			
 
				+  python:     { pkg: "tree-sitter-python",     wasm: "tree-sitter-python.wasm" },
			
 
				+  go:         { pkg: "tree-sitter-go",         wasm: "tree-sitter-go.wasm" },
			
 
				+  rust:       { pkg: "tree-sitter-rust",        wasm: "tree-sitter-rust.wasm" },
			
 
				+};
			
 
				+
			
 
				+// =============================================================================
			
 
				+// Per-Language Query Definitions
			
 
				+// =============================================================================
			
 
				+
			
 
				+/**
			
 
				+ * Tree-sitter S-expression queries for each language.
			
 
				+ * Each capture name maps to a break point score via SCORE_MAP.
			
 
				+ *
			
 
				+ * For TypeScript/JavaScript, we match export_statement wrappers to get the
			
 
				+ * correct start position (before `export`), plus bare declarations for
			
 
				+ * non-exported code.
			
 
				+ */
			
 
				+const LANGUAGE_QUERIES: Record<SupportedLanguage, string> = {
			
 
				+  typescript: `
			
 
				+    (export_statement) @export
			
 
				+    (class_declaration) @class
			
 
				+    (function_declaration) @func
			
 
				+    (method_definition) @method
			
 
				+    (interface_declaration) @iface
			
 
				+    (type_alias_declaration) @type
			
 
				+    (enum_declaration) @enum
			
 
				+    (import_statement) @import
			
 
				+    (lexical_declaration (variable_declarator value: (arrow_function))) @func
			
 
				+    (lexical_declaration (variable_declarator value: (function_expression))) @func
			
 
				+  `,
			
 
				+  tsx: `
			
 
				+    (export_statement) @export
			
 
				+    (class_declaration) @class
			
 
				+    (function_declaration) @func
			
 
				+    (method_definition) @method
			
 
				+    (interface_declaration) @iface
			
 
				+    (type_alias_declaration) @type
			
 
				+    (enum_declaration) @enum
			
 
				+    (import_statement) @import
			
 
				+    (lexical_declaration (variable_declarator value: (arrow_function))) @func
			
 
				+    (lexical_declaration (variable_declarator value: (function_expression))) @func
			
 
				+  `,
			
 
				+  javascript: `
			
 
				+    (export_statement) @export
			
 
				+    (class_declaration) @class
			
 
				+    (function_declaration) @func
			
 
				+    (method_definition) @method
			
 
				+    (import_statement) @import
			
 
				+    (lexical_declaration (variable_declarator value: (arrow_function))) @func
			
 
				+    (lexical_declaration (variable_declarator value: (function_expression))) @func
			
 
				+  `,
			
 
				+  python: `
			
 
				+    (class_definition) @class
			
 
				+    (function_definition) @func
			
 
				+    (decorated_definition) @decorated
			
 
				+    (import_statement) @import
			
 
				+    (import_from_statement) @import
			
 
				+  `,
			
 
				+  go: `
			
 
				+    (type_declaration) @type
			
 
				+    (function_declaration) @func
			
 
				+    (method_declaration) @method
			
 
				+    (import_declaration) @import
			
 
				+  `,
			
 
				+  rust: `
			
 
				+    (struct_item) @struct
			
 
				+    (impl_item) @impl
			
 
				+    (function_item) @func
			
 
				+    (trait_item) @trait
			
 
				+    (enum_item) @enum
			
 
				+    (use_declaration) @import
			
 
				+    (type_item) @type
			
 
				+    (mod_item) @mod
			
 
				+  `,
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * Score mapping from capture names to break point scores.
			
 
				+ * Aligned with the markdown BREAK_PATTERNS scale (h1=100, h2=90, etc.)
			
 
				+ * so findBestCutoff() decay works unchanged.
			
 
				+ */
			
 
				+const SCORE_MAP: Record<string, number> = {
			
 
				+  class:     100,
			
 
				+  iface:     100,
			
 
				+  struct:    100,
			
 
				+  trait:     100,
			
 
				+  impl:      100,
			
 
				+  mod:       100,
			
 
				+  export:     90,
			
 
				+  func:       90,
			
 
				+  method:     90,
			
 
				+  decorated:  90,
			
 
				+  type:       80,
			
 
				+  enum:       80,
			
 
				+  import:     60,
			
 
				+};
			
 
				+
			
 
				+// =============================================================================
			
 
				+// Parser Caching & Initialization
			
 
				+// =============================================================================
			
 
				+
			
 
				+let ParserClass: typeof import("web-tree-sitter").Parser | null = null;
			
 
				+let LanguageClass: typeof import("web-tree-sitter").Language | null = null;
			
 
				+let QueryClass: typeof import("web-tree-sitter").Query | null = null;
			
 
				+let initPromise: Promise<void> | null = null;
			
 
				+
			
 
				+/** Languages that have already failed to load — warn only once per process. */
			
 
				+const failedLanguages = new Set<string>();
			
 
				+
			
 
				+/** Cached grammar load promises. */
			
 
				+const grammarCache = new Map<string, Promise<LanguageType>>();
			
 
				+
			
 
				+/** Cached compiled queries per language. */
			
 
				+const queryCache = new Map<string, QueryType>();
			
 
				+
			
 
				+/**
			
 
				+ * Initialize web-tree-sitter. Called once and cached.
			
 
				+ */
			
 
				+async function ensureInit(): Promise<void> {
			
 
				+  if (!initPromise) {
			
 
				+    initPromise = (async () => {
			
 
				+      const mod = await import("web-tree-sitter");
			
 
				+      ParserClass = mod.Parser;
			
 
				+      LanguageClass = mod.Language;
			
 
				+      QueryClass = mod.Query;
			
 
				+      await ParserClass.init();
			
 
				+    })();
			
 
				+  }
			
 
				+  return initPromise;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * Resolve the filesystem path to a grammar .wasm file.
			
 
				+ * Uses createRequire to resolve from installed dependency packages.
			
 
				+ */
			
 
				+function resolveGrammarPath(language: SupportedLanguage): string {
			
 
				+  const { pkg, wasm } = GRAMMAR_MAP[language];
			
 
				+  const require = createRequire(import.meta.url);
			
 
				+  return require.resolve(`${pkg}/${wasm}`);
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * Load and cache a grammar for the given language.
			
 
				+ * Returns null on failure (logs once per language).
			
 
				+ */
			
 
				+async function loadGrammar(language: SupportedLanguage): Promise<LanguageType | null> {
			
 
				+  if (failedLanguages.has(language)) return null;
			
 
				+
			
 
				+  const wasmKey = GRAMMAR_MAP[language].wasm;
			
 
				+  if (!grammarCache.has(wasmKey)) {
			
 
				+    grammarCache.set(wasmKey, (async () => {
			
 
				+      const path = resolveGrammarPath(language);
			
 
				+      return LanguageClass!.load(path);
			
 
				+    })());
			
 
				+  }
			
 
				+
			
 
				+  try {
			
 
				+    return await grammarCache.get(wasmKey)!;
			
 
				+  } catch (err) {
			
 
				+    failedLanguages.add(language);
			
 
				+    grammarCache.delete(wasmKey);
			
 
				+    console.warn(`[qmd] Failed to load tree-sitter grammar for ${language}: ${err}`);
			
 
				+    return null;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * Get or create a compiled query for the given language.
			
 
				+ */
			
 
				+function getQuery(language: SupportedLanguage, grammar: LanguageType): QueryType {
			
 
				+  if (!queryCache.has(language)) {
			
 
				+    const source = LANGUAGE_QUERIES[language];
			
 
				+    const query = new QueryClass!(grammar, source);
			
 
				+    queryCache.set(language, query);
			
 
				+  }
			
 
				+  return queryCache.get(language)!;
			
 
				+}
			
 
				+
			
 
				+// =============================================================================
			
 
				+// AST Break Point Extraction
			
 
				+// =============================================================================
			
 
				+
			
 
				+/**
			
 
				+ * Parse a source file and return break points at AST node boundaries.
			
 
				+ *
			
 
				+ * Returns an empty array for unsupported languages, parse failures,
			
 
				+ * or grammar loading failures. Never throws.
			
 
				+ *
			
 
				+ * @param content - The file content to parse.
			
 
				+ * @param filepath - The file path (used for language detection).
			
 
				+ * @returns Array of BreakPoint objects suitable for merging with regex break points.
			
 
				+ */
			
 
				+export async function getASTBreakPoints(
			
 
				+  content: string,
			
 
				+  filepath: string,
			
 
				+): Promise<BreakPoint[]> {
			
 
				+  const language = detectLanguage(filepath);
			
 
				+  if (!language) return [];
			
 
				+
			
 
				+  try {
			
 
				+    await ensureInit();
			
 
				+
			
 
				+    const grammar = await loadGrammar(language);
			
 
				+    if (!grammar) return [];
			
 
				+
			
 
				+    const parser = new ParserClass!();
			
 
				+    parser.setLanguage(grammar);
			
 
				+
			
 
				+    const tree = parser.parse(content);
			
 
				+    if (!tree) {
			
 
				+      parser.delete();
			
 
				+      return [];
			
 
				+    }
			
 
				+
			
 
				+    const query = getQuery(language, grammar);
			
 
				+    const captures = query.captures(tree.rootNode);
			
 
				+
			
 
				+    // Deduplicate by start offset: when several captures begin at the same
			
 
				+    // position, keep only the highest-scoring one. Captures that begin at
			
 
				+    // different offsets (e.g. an export_statement and the class it wraps) are all kept.
			
 
				+    const seen = new Map<number, BreakPoint>();
			
 
				+
			
 
				+    for (const cap of captures) {
			
 
				+      const pos = cap.node.startIndex;
			
 
				+      const score = SCORE_MAP[cap.name] ?? 20;
			
 
				+      const type = `ast:${cap.name}`;
			
 
				+
			
 
				+      const existing = seen.get(pos);
			
 
				+      if (!existing || score > existing.score) {
			
 
				+        seen.set(pos, { pos, score, type });
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    tree.delete();
			
 
				+    parser.delete();
			
 
				+
			
 
				+    return Array.from(seen.values()).sort((a, b) => a.pos - b.pos);
			
 
				+  } catch (err) {
			
 
				+    console.warn(`[qmd] AST parse failed for ${filepath}, falling back to regex: ${err instanceof Error ? err.message : err}`);
			
 
				+    return [];
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+// =============================================================================
			
 
				+// Health / Status
			
 
				+// =============================================================================
			
 
				+
			
 
				+/**
			
 
				+ * Check which tree-sitter grammars are available.
			
 
				+ * Returns a status object for each supported language.
			
 
				+ */
			
 
				+export async function getASTStatus(): Promise<{
			
 
				+  available: boolean;
			
 
				+  languages: { language: SupportedLanguage; available: boolean; error?: string }[];
			
 
				+}> {
			
 
				+  const languages: { language: SupportedLanguage; available: boolean; error?: string }[] = [];
			
 
				+
			
 
				+  try {
			
 
				+    await ensureInit();
			
 
				+  } catch (err) {
			
 
				+    return {
			
 
				+      available: false,
			
 
				+      languages: (Object.keys(GRAMMAR_MAP) as SupportedLanguage[]).map(lang => ({
			
 
				+        language: lang,
			
 
				+        available: false,
			
 
				+        error: `web-tree-sitter init failed: ${err instanceof Error ? err.message : err}`,
			
 
				+      })),
			
 
				+    };
			
 
				+  }
			
 
				+
			
 
				+  for (const lang of Object.keys(GRAMMAR_MAP) as SupportedLanguage[]) {
			
 
				+    try {
			
 
				+      const grammar = await loadGrammar(lang);
			
 
				+      if (grammar) {
			
 
				+        // Also verify the query compiles
			
 
				+        getQuery(lang, grammar);
			
 
				+        languages.push({ language: lang, available: true });
			
 
				+      } else {
			
 
				+        languages.push({ language: lang, available: false, error: "grammar failed to load" });
			
 
				+      }
			
 
				+    } catch (err) {
			
 
				+      languages.push({
			
 
				+        language: lang,
			
 
				+        available: false,
			
 
				+        error: err instanceof Error ? err.message : String(err),
			
 
				+      });
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  return {
			
 
				+    available: languages.some(l => l.available),
			
 
				+    languages,
			
 
				+  };
			
 
				+}
			
 
				+
			
 
				+// =============================================================================
			
 
				+// Symbol Extraction (Phase 2 Stub)
			
 
				+// =============================================================================
			
 
				+
			
 
				+/**
			
 
				+ * Metadata about a code symbol within a chunk.
			
 
				+ * Reserved for Phase 2 — `extractSymbols` is a stub that returns an empty array in Phase 1.
			
 
				+ */
			
 
				+export interface SymbolInfo {
			
 
				+  name: string;
			
 
				+  kind: string;
			
 
				+  signature?: string;
			
 
				+  line: number;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * Extract symbol metadata for code within a byte range.
			
 
				+ * Stubbed for Phase 2 — returns empty array.
			
 
				+ */
			
 
				+export function extractSymbols(
			
 
				+  _content: string,
			
 
				+  _language: string,
			
 
				+  _startPos: number,
			
 
				+  _endPos: number,
			
 
				+): SymbolInfo[] {
			
 
				+  return [];
			
 
				+}
			
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@@ -75,6 +75,7 @@ import {
 
				   generateEmbeddings,
			
 
				   syncConfigToDb,
			
 
				   type ReindexResult,
			
 
				+  type ChunkStrategy,
			
 
				 } from "../store.js";
			
 
				 import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
			
 
				 import {
			
@@ -372,6 +373,32 @@ async function showStatus(): Promise<void> {
 
				     });
			
 
				   }
			
 
				 
			
 
				+  // AST chunking status
			
 
				+  try {
			
 
				+    const { getASTStatus } = await import("../ast.js");
			
 
				+    const ast = await getASTStatus();
			
 
				+    console.log(`\n${c.bold}AST Chunking${c.reset}`);
			
 
				+    if (ast.available) {
			
 
				+      const ok = ast.languages.filter(l => l.available).map(l => l.language);
			
 
				+      const fail = ast.languages.filter(l => !l.available);
			
 
				+      console.log(`  Status:   ${c.green}active${c.reset}`);
			
 
				+      console.log(`  Languages: ${ok.join(", ")}`);
			
 
				+      if (fail.length > 0) {
			
 
				+        for (const f of fail) {
			
 
				+          console.log(`  ${c.yellow}Unavailable: ${f.language} (${f.error})${c.reset}`);
			
 
				+        }
			
 
				+      }
			
 
				+    } else {
			
 
				+      console.log(`  Status:   ${c.yellow}unavailable${c.reset} (falling back to regex chunking)`);
			
 
				+      for (const l of ast.languages) {
			
 
				+        if (l.error) console.log(`  ${c.dim}${l.language}: ${l.error}${c.reset}`);
			
 
				+      }
			
 
				+    }
			
 
				+  } catch {
			
 
				+    console.log(`\n${c.bold}AST Chunking${c.reset}`);
			
 
				+    console.log(`  Status:   ${c.dim}not available${c.reset}`);
			
 
				+  }
			
 
				+
			
 
				   if (collections.length > 0) {
			
 
				     console.log(`\n${c.bold}Collections${c.reset}`);
			
 
				     for (const col of collections) {
			
@@ -1617,10 +1644,17 @@ function parseEmbedBatchOption(name: string, value: unknown): number | undefined
 
				   return parsed;
			
 
				 }
			
 
				 
			
 
				+function parseChunkStrategy(value: unknown): ChunkStrategy | undefined {
			
 
				+  if (value === undefined) return undefined;
			
 
				+  const s = String(value);
			
 
				+  if (s === "auto" || s === "regex") return s;
			
 
				+  throw new Error(`--chunk-strategy must be "auto" or "regex" (got "${s}")`);
			
 
				+}
			
 
				+
			
 
				 async function vectorIndex(
			
 
				   model: string = DEFAULT_EMBED_MODEL,
			
 
				   force: boolean = false,
			
 
				-  batchOptions?: { maxDocsPerBatch?: number; maxBatchBytes?: number },
			
 
				+  batchOptions?: { maxDocsPerBatch?: number; maxBatchBytes?: number; chunkStrategy?: ChunkStrategy },
			
 
				 ): Promise<void> {
			
 
				   const storeInstance = getStore();
			
 
				   const db = storeInstance.db;
			
@@ -1653,6 +1687,7 @@ async function vectorIndex(
 
				     model,
			
 
				     maxDocsPerBatch: batchOptions?.maxDocsPerBatch,
			
 
				     maxBatchBytes: batchOptions?.maxBatchBytes,
			
 
				+    chunkStrategy: batchOptions?.chunkStrategy,
			
 
				     onProgress: (info) => {
			
 
				       if (info.totalBytes === 0) return;
			
 
				       const percent = (info.bytesProcessed / info.totalBytes) * 100;
			
@@ -1746,6 +1781,7 @@ type OutputOptions = {
 
				   candidateLimit?: number;  // Max candidates to rerank (default: 40)
			
 
				   intent?: string;       // Domain intent for disambiguation
			
 
				   skipRerank?: boolean;  // Skip LLM reranking, use RRF scores only
			
 
				+  chunkStrategy?: ChunkStrategy;  // "regex" (default) or "auto" (AST for code files)
			
 
				 };
			
 
				 
			
 
				 // Highlight query terms in text (skip short words < 3 chars)
			
@@ -2231,6 +2267,7 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
 
				         skipRerank: opts.skipRerank,
			
 
				         explain: !!opts.explain,
			
 
				         intent,
			
 
				+        chunkStrategy: opts.chunkStrategy,
			
 
				         hooks: {
			
 
				           onEmbedStart: (count) => {
			
 
				             process.stderr.write(`${c.dim}Embedding ${count} ${count === 1 ? 'query' : 'queries'}...${c.reset}`);
			
@@ -2258,6 +2295,7 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
 
				         skipRerank: opts.skipRerank,
			
 
				         explain: !!opts.explain,
			
 
				         intent,
			
 
				+        chunkStrategy: opts.chunkStrategy,
			
 
				         hooks: {
			
 
				           onStrongSignal: (score) => {
			
 
				             process.stderr.write(`${c.dim}Strong BM25 signal (${score.toFixed(2)}) — skipping expansion${c.reset}\n`);
			
@@ -2372,6 +2410,8 @@ function parseCLI() {
 
				       "candidate-limit": { type: "string", short: "C" },
			
 
				       "no-rerank": { type: "boolean", default: false },
			
 
				       intent: { type: "string" },
			
 
				+      // Chunking options
			
 
				+      "chunk-strategy": { type: "string" },  // "regex" (default) or "auto" (AST for code files)
			
 
				       // MCP HTTP transport options
			
 
				       http: { type: "boolean" },
			
 
				       daemon: { type: "boolean" },
			
@@ -2413,6 +2453,7 @@ function parseCLI() {
 
				     skipRerank: !!values["no-rerank"],
			
 
				     explain: !!values.explain,
			
 
				     intent: values.intent as string | undefined,
			
 
				+    chunkStrategy: parseChunkStrategy(values["chunk-strategy"]),
			
 
				   };
			
 
				 
			
 
				   return {
			
@@ -2635,6 +2676,9 @@ function showHelp(): void {
 
				   console.log("  --files | --json | --csv | --md | --xml  - Output format");
			
 
				   console.log("  -c, --collection <name>    - Filter by one or more collections");
			
 
				   console.log("");
			
 
				+  console.log("Embed/query options:");
			
 
				+  console.log("  --chunk-strategy <auto|regex> - Chunking mode (default: regex; auto uses AST for code files)");
			
 
				+  console.log("");
			
 
				   console.log("Multi-get options:");
			
 
				   console.log("  -l <num>                   - Maximum lines per file");
			
 
				   console.log("  --max-bytes <num>          - Skip files larger than N bytes (default 10240)");
			
@@ -2957,9 +3001,11 @@ if (isMain) {
 
				       try {
			
 
				         const maxDocsPerBatch = parseEmbedBatchOption("maxDocsPerBatch", cli.values["max-docs-per-batch"]);
			
 
				         const maxBatchMb = parseEmbedBatchOption("maxBatchBytes", cli.values["max-batch-mb"]);
			
 
				+        const embedChunkStrategy = parseChunkStrategy(cli.values["chunk-strategy"]);
			
 
				         await vectorIndex(DEFAULT_EMBED_MODEL, !!cli.values.force, {
			
 
				           maxDocsPerBatch,
			
 
				           maxBatchBytes: maxBatchMb === undefined ? undefined : maxBatchMb * 1024 * 1024,
			
 
				+          chunkStrategy: embedChunkStrategy,
			
 
				         });
			
 
				       } catch (error) {
			
 
				         console.error(error instanceof Error ? error.message : String(error));
			
--- a/src/index.ts
+++ b/src/index.ts
@@ -62,6 +62,7 @@ import {
 
				   type ReindexResult,
			
 
				   type EmbedProgress,
			
 
				   type EmbedResult,
			
 
				+  type ChunkStrategy,
			
 
				 } from "./store.js";
			
 
				 import {
			
 
				   LlamaCpp,
			
@@ -108,8 +109,9 @@ export type {
 
				 // Re-export the internal Store type for advanced consumers
			
 
				 export type { InternalStore };
			
 
				 
			
 
				-// Re-export utility functions used by frontends
			
 
				+// Re-export utility functions and types used by frontends
			
 
				 export { extractSnippet, addLineNumbers, DEFAULT_MULTI_GET_MAX_BYTES };
			
 
				+export type { ChunkStrategy } from "./store.js";
			
 
				 
			
 
				 // Re-export getDefaultDbPath for CLI/MCP that need the default database location
			
 
				 export { getDefaultDbPath } from "./store.js";
			
@@ -161,6 +163,8 @@ export interface SearchOptions {
 
				   minScore?: number;
			
 
				   /** Include explain traces */
			
 
				   explain?: boolean;
			
 
				+  /** Chunk strategy: "regex" (default) or "auto" (uses AST for code files) */
			
 
				+  chunkStrategy?: ChunkStrategy;
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -288,6 +292,7 @@ export interface QMDStore {
 
				     model?: string;
			
 
				     maxDocsPerBatch?: number;
			
 
				     maxBatchBytes?: number;
			
 
				+    chunkStrategy?: ChunkStrategy;
			
 
				     onProgress?: (info: EmbedProgress) => void;
			
 
				   }): Promise<EmbedResult>;
			
 
				 
			
@@ -391,6 +396,7 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
 
				           explain: opts.explain,
			
 
				           intent: opts.intent,
			
 
				           skipRerank,
			
 
				+          chunkStrategy: opts.chunkStrategy,
			
 
				         });
			
 
				       }
			
 
				 
			
@@ -402,6 +408,7 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
 
				         explain: opts.explain,
			
 
				         intent: opts.intent,
			
 
				         skipRerank,
			
 
				+        chunkStrategy: opts.chunkStrategy,
			
 
				       });
			
 
				     },
			
 
				     searchLex: async (q, opts) => internal.searchFTS(q, opts?.limit, opts?.collection),
			
@@ -506,6 +513,7 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
 
				         model: embedOpts?.model,
			
 
				         maxDocsPerBatch: embedOpts?.maxDocsPerBatch,
			
 
				         maxBatchBytes: embedOpts?.maxBatchBytes,
			
 
				+        chunkStrategy: embedOpts?.chunkStrategy,
			
 
				         onProgress: embedOpts?.onProgress,
			
 
				       });
			
 
				     },
			
--- a/src/store.ts
+++ b/src/store.ts
@@ -223,6 +223,89 @@ export function findBestCutoff(
 
				   return bestPos;
			
 
				 }
			
 
				 
			
 
				+// =============================================================================
			
 
				+// Chunk Strategy
			
 
				+// =============================================================================
			
 
				+
			
 
				+export type ChunkStrategy = "auto" | "regex";
			
 
				+
			
 
				/**
				 * Combine two break point lists (for example regex-derived and AST-derived)
				 * into one list with at most one entry per position.
				 *
				 * When several break points share a position, the one with the strictly
				 * highest score is kept; on a tie the earliest one seen (all of `a` is
				 * visited before `b`) wins.
				 *
				 * @param a First list of break points.
				 * @param b Second list of break points.
				 * @returns The surviving break points, ordered by ascending `pos`.
				 */
				export function mergeBreakPoints(a: BreakPoint[], b: BreakPoint[]): BreakPoint[] {
				  const bestByPos = new Map<number, BreakPoint>();

				  for (const candidate of [...a, ...b]) {
				    const current = bestByPos.get(candidate.pos);
				    // Keep the incumbent unless the candidate scores strictly higher.
				    if (current && !(candidate.score > current.score)) {
				      continue;
				    }
				    bestByPos.set(candidate.pos, candidate);
				  }

				  return [...bestByPos.values()].sort((left, right) => left.pos - right.pos);
				}
			
 
				+
			
 
				/**
				 * Core chunking loop that works on precomputed break points and code fences.
				 * Shared by the regex-only and the AST-aware chunking entry points.
				 *
				 * The document is walked left to right. Each chunk aims for `maxChars`
				 * characters; when that target is not the end of the document,
				 * `findBestCutoff` is asked for a cut position, which is used only if it
				 * lies after the chunk start and at or before the target. The next chunk
				 * starts `overlapChars` before the previous end, unless that would not move
				 * past the previous chunk's start, in which case it starts at the previous end.
				 *
				 * @param content      Full document text.
				 * @param breakPoints  Candidate cut positions, forwarded to `findBestCutoff`.
				 * @param codeFences   Code fence regions, forwarded to `findBestCutoff`.
				 * @param maxChars     Target maximum chunk length in characters.
				 * @param overlapChars Number of characters shared between consecutive chunks.
				 * @param windowChars  Search window forwarded to `findBestCutoff`.
				 * @returns Chunks in document order; `pos` is the chunk's start offset in `content`.
				 */
				export function chunkDocumentWithBreakPoints(
				  content: string,
				  breakPoints: BreakPoint[],
				  codeFences: CodeFenceRegion[],
				  maxChars: number = CHUNK_SIZE_CHARS,
				  overlapChars: number = CHUNK_OVERLAP_CHARS,
				  windowChars: number = CHUNK_WINDOW_CHARS
				): { text: string; pos: number }[] {
				  // Short documents are returned as a single chunk without consulting break points.
				  if (content.length <= maxChars) {
				    return [{ text: content, pos: 0 }];
				  }

				  const chunks: { text: string; pos: number }[] = [];
				  let charPos = 0;

				  while (charPos < content.length) {
				    const targetEndPos = Math.min(charPos + maxChars, content.length);
				    let endPos = targetEndPos;

				    if (endPos < content.length) {
				      // 0.7 is findBestCutoff's fourth argument; its exact meaning is defined
				      // there (NOTE(review): presumably a distance-decay factor — confirm).
				      const bestCutoff = findBestCutoff(
				        breakPoints,
				        targetEndPos,
				        windowChars,
				        0.7,
				        codeFences
				      );

				      // Only accept a cutoff that falls inside the current chunk.
				      if (bestCutoff > charPos && bestCutoff <= targetEndPos) {
				        endPos = bestCutoff;
				      }
				    }

				    // Guarantee forward progress. `endPos <= charPos` can only happen when
				    // maxChars is zero or negative; recomputing `charPos + maxChars` would
				    // yield the same position and loop forever, so advance by at least one
				    // character.
				    if (endPos <= charPos) {
				      endPos = Math.min(Math.max(charPos + maxChars, charPos + 1), content.length);
				    }

				    chunks.push({ text: content.slice(charPos, endPos), pos: charPos });

				    // The final chunk runs to the end of the document; nothing left to overlap.
				    if (endPos >= content.length) {
				      break;
				    }

				    charPos = endPos - overlapChars;
				    const lastChunkPos = chunks.at(-1)!.pos;
				    if (charPos <= lastChunkPos) {
				      // Overlap would restart at or before the previous chunk; drop the overlap.
				      charPos = endPos;
				    }
				  }

				  return chunks;
				}
			
 
				+
			
 
				 // Hybrid query: strong BM25 signal detection thresholds
			
 
				 // Skip expensive LLM expansion when top result is strong AND clearly separated from runner-up
			
 
				 export const STRONG_SIGNAL_MIN_SCORE = 0.85;
			
@@ -1197,6 +1280,7 @@ export type EmbedOptions = {
 
				   model?: string;
			
 
				   maxDocsPerBatch?: number;
			
 
				   maxBatchBytes?: number;
			
 
				+  chunkStrategy?: ChunkStrategy;
			
 
				   onProgress?: (info: EmbedProgress) => void;
			
 
				 };
			
 
				 
			
@@ -1345,7 +1429,12 @@ export async function generateEmbeddings(
 
				         if (!doc.body.trim()) continue;
			
 
				 
			
 
				         const title = extractTitle(doc.body, doc.path);
			
 
				-        const chunks = await chunkDocumentByTokens(doc.body);
			
 
				+        const chunks = await chunkDocumentByTokens(
			
 
				+          doc.body,
			
 
				+          undefined, undefined, undefined,
			
 
				+          doc.path,
			
 
				+          options?.chunkStrategy,
			
 
				+        );
			
 
				 
			
 
				         for (let seq = 0; seq < chunks.length; seq++) {
			
 
				           batchChunks.push({
			
@@ -2021,78 +2110,66 @@ export function getActiveDocumentPaths(db: Database, collectionName: string): st
 
				 
			
 
				 export { formatQueryForEmbedding, formatDocForEmbedding };
			
 
				 
			
 
				+/**
			
 
				+ * Chunk a document using regex-only break point detection.
			
 
				+ * This is the sync, backward-compatible API used by tests and legacy callers.
			
 
				+ */
			
 
				 export function chunkDocument(
			
 
				   content: string,
			
 
				   maxChars: number = CHUNK_SIZE_CHARS,
			
 
				   overlapChars: number = CHUNK_OVERLAP_CHARS,
			
 
				   windowChars: number = CHUNK_WINDOW_CHARS
			
 
				 ): { text: string; pos: number }[] {
			
 
				-  if (content.length <= maxChars) {
			
 
				-    return [{ text: content, pos: 0 }];
			
 
				-  }
			
 
				-
			
 
				-  // Pre-scan all break points and code fences once
			
 
				   const breakPoints = scanBreakPoints(content);
			
 
				   const codeFences = findCodeFences(content);
			
 
				+  return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
			
 
				+}
			
 
				 
			
 
				-  const chunks: { text: string; pos: number }[] = [];
			
 
				-  let charPos = 0;
			
 
				-
			
 
				-  while (charPos < content.length) {
			
 
				-    // Calculate target end position for this chunk
			
 
				-    const targetEndPos = Math.min(charPos + maxChars, content.length);
			
 
				-
			
 
				-    let endPos = targetEndPos;
			
 
				-
			
 
				-    // If not at the end, find the best break point
			
 
				-    if (endPos < content.length) {
			
 
				-      // Find best cutoff using scored algorithm
			
 
				-      const bestCutoff = findBestCutoff(
			
 
				-        breakPoints,
			
 
				-        targetEndPos,
			
 
				-        windowChars,
			
 
				-        0.7,
			
 
				-        codeFences
			
 
				-      );
			
 
				-
			
 
				-      // Only use the cutoff if it's within our current chunk
			
 
				-      if (bestCutoff > charPos && bestCutoff <= targetEndPos) {
			
 
				-        endPos = bestCutoff;
			
 
				-      }
			
 
				-    }
			
 
				-
			
 
				-    // Ensure we make progress
			
 
				-    if (endPos <= charPos) {
			
 
				-      endPos = Math.min(charPos + maxChars, content.length);
			
 
				-    }
			
 
				-
			
 
				-    chunks.push({ text: content.slice(charPos, endPos), pos: charPos });
			
 
				+/**
			
 
				+ * Async AST-aware chunking. Detects language from filepath, computes AST
			
 
				+ * break points for supported code files, merges with regex break points,
			
 
				+ * and delegates to the shared chunk algorithm.
			
 
				+ *
			
 
				+ * Falls back to regex-only when strategy is "regex", filepath is absent,
			
 
				+ * or language is unsupported.
			
 
				+ */
			
 
				+export async function chunkDocumentAsync(
			
 
				+  content: string,
			
 
				+  maxChars: number = CHUNK_SIZE_CHARS,
			
 
				+  overlapChars: number = CHUNK_OVERLAP_CHARS,
			
 
				+  windowChars: number = CHUNK_WINDOW_CHARS,
			
 
				+  filepath?: string,
			
 
				+  chunkStrategy: ChunkStrategy = "regex",
			
 
				+): Promise<{ text: string; pos: number }[]> {
			
 
				+  const regexPoints = scanBreakPoints(content);
			
 
				+  const codeFences = findCodeFences(content);
			
 
				 
			
 
				-    // Move forward, but overlap with previous chunk
			
 
				-    // For last chunk, don't overlap (just go to the end)
			
 
				-    if (endPos >= content.length) {
			
 
				-      break;
			
 
				-    }
			
 
				-    charPos = endPos - overlapChars;
			
 
				-    const lastChunkPos = chunks.at(-1)!.pos;
			
 
				-    if (charPos <= lastChunkPos) {
			
 
				-      // Prevent infinite loop - move forward at least a bit
			
 
				-      charPos = endPos;
			
 
				+  let breakPoints = regexPoints;
			
 
				+  if (chunkStrategy === "auto" && filepath) {
			
 
				+    const { getASTBreakPoints } = await import("./ast.js");
			
 
				+    const astPoints = await getASTBreakPoints(content, filepath);
			
 
				+    if (astPoints.length > 0) {
			
 
				+      breakPoints = mergeBreakPoints(regexPoints, astPoints);
			
 
				     }
			
 
				   }
			
 
				 
			
 
				-  return chunks;
			
 
				+  return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
			
 
				 }
			
 
				 
			
 
				 /**
			
 
				  * Chunk a document by actual token count using the LLM tokenizer.
			
 
				  * More accurate than character-based chunking but requires async.
			
 
				+ *
			
 
				+ * When filepath and chunkStrategy are provided, uses AST-aware break points
			
 
				+ * for supported code files.
			
 
				  */
			
 
				 export async function chunkDocumentByTokens(
			
 
				   content: string,
			
 
				   maxTokens: number = CHUNK_SIZE_TOKENS,
			
 
				   overlapTokens: number = CHUNK_OVERLAP_TOKENS,
			
 
				-  windowTokens: number = CHUNK_WINDOW_TOKENS
			
 
				+  windowTokens: number = CHUNK_WINDOW_TOKENS,
			
 
				+  filepath?: string,
			
 
				+  chunkStrategy: ChunkStrategy = "regex",
			
 
				 ): Promise<{ text: string; pos: number; tokens: number }[]> {
			
 
				   const llm = getDefaultLlamaCpp();
			
 
				 
			
@@ -2104,7 +2181,8 @@ export async function chunkDocumentByTokens(
 
				   const windowChars = windowTokens * avgCharsPerToken;
			
 
				 
			
 
				   // Chunk in character space with conservative estimate
			
 
				-  let charChunks = chunkDocument(content, maxChars, overlapChars, windowChars);
			
 
				+  // Use AST-aware chunking for the first pass when filepath/strategy provided
			
 
				+  let charChunks = await chunkDocumentAsync(content, maxChars, overlapChars, windowChars, filepath, chunkStrategy);
			
 
				 
			
 
				   // Tokenize and split any chunks that still exceed limit
			
 
				   const results: { text: string; pos: number; tokens: number }[] = [];
			
@@ -3674,6 +3752,7 @@ export interface HybridQueryOptions {
 
				   explain?: boolean;        // include backend/RRF/rerank score traces
			
 
				   intent?: string;          // domain intent hint for disambiguation
			
 
				   skipRerank?: boolean;     // skip LLM reranking, use only RRF scores
			
 
				+  chunkStrategy?: ChunkStrategy;
			
 
				   hooks?: SearchHooks;
			
 
				 }
			
 
				 
			
@@ -3841,8 +3920,9 @@ export async function hybridQuery(
 
				   const intentTerms = intent ? extractIntentTerms(intent) : [];
			
 
				   const docChunkMap = new Map<string, { chunks: { text: string; pos: number }[]; bestIdx: number }>();
			
 
				 
			
 
				+  const chunkStrategy = options?.chunkStrategy;
			
 
				   for (const cand of candidates) {
			
 
				-    const chunks = chunkDocument(cand.body);
			
 
				+    const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, chunkStrategy);
			
 
				     if (chunks.length === 0) continue;
			
 
				 
			
 
				     // Pick chunk with most keyword overlap (fallback: first chunk)
			
@@ -4082,6 +4162,7 @@ export interface StructuredSearchOptions {
 
				   intent?: string;
			
 
				   /** Skip LLM reranking, use only RRF scores */
			
 
				   skipRerank?: boolean;
			
 
				+  chunkStrategy?: ChunkStrategy;
			
 
				   hooks?: SearchHooks;
			
 
				 }
			
 
				 
			
@@ -4230,9 +4311,10 @@ export async function structuredSearch(
 
				   const queryTerms = primaryQuery.toLowerCase().split(/\s+/).filter(t => t.length > 2);
			
 
				   const intentTerms = intent ? extractIntentTerms(intent) : [];
			
 
				   const docChunkMap = new Map<string, { chunks: { text: string; pos: number }[]; bestIdx: number }>();
			
 
				+  const ssChunkStrategy = options?.chunkStrategy;
			
 
				 
			
 
				   for (const cand of candidates) {
			
 
				-    const chunks = chunkDocument(cand.body);
			
 
				+    const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, ssChunkStrategy);
			
 
				     if (chunks.length === 0) continue;
			
 
				 
			
 
				     // Pick chunk with most keyword overlap
			
--- a/test-ast-chunking.mjs
+++ b/test-ast-chunking.mjs
@@ -0,0 +1,823 @@
 
				+#!/usr/bin/env npx tsx
			
 
				+/**
			
 
				+ * Thorough integration test + real-collection performance report for
			
 
				+ * AST-aware chunking.
			
 
				+ *
			
 
				+ * Usage:
			
 
				+ *   npx tsx test-ast-chunking.mjs                  # synthetic tests only
			
 
				+ *   npx tsx test-ast-chunking.mjs /path/to/code    # + scan a real directory
			
 
				+ *   npx tsx test-ast-chunking.mjs ~/dev/myproject   # works with ~
			
 
				+ *   npx tsx test-ast-chunking.mjs --help
			
 
				+ *
			
 
				+ * The real-collection scan walks the directory tree, finds supported code
			
 
				+ * files (.ts/.js/.py/.go/.rs) and markdown (.md), chunks each file with
			
 
				+ * both strategies, and prints a comparative performance report.
			
 
				+ */
			
 
				+
			
 
				+import { readFileSync, readdirSync, statSync } from "node:fs";
			
 
				+import { join, relative, extname, resolve } from "node:path";
			
 
				+import { homedir } from "node:os";
			
 
				+import { detectLanguage, getASTBreakPoints } from "./src/ast.js";
			
 
				+import {
			
 
				+  chunkDocument,
			
 
				+  chunkDocumentAsync,
			
 
				+  chunkDocumentWithBreakPoints,
			
 
				+  mergeBreakPoints,
			
 
				+  scanBreakPoints,
			
 
				+  findCodeFences,
			
 
				+  CHUNK_SIZE_CHARS,
			
 
				+} from "./src/store.js";
			
 
				+
			
 
				+// ============================================================================
			
 
				+// Helpers
			
 
				+// ============================================================================
			
 
				+
			
 
				+let passed = 0;
			
 
				+let failed = 0;
			
 
				+
			
 
				/**
				 * Print a section banner: a blank line, a 70-character rule of "=",
				 * the indented title, and a closing rule.
				 *
				 * @param {string} title Heading text shown between the rules.
				 */
				function section(title) {
				  const rule = "=".repeat(70);
				  console.log(`\n${rule}`);
				  console.log(`  ${title}`);
				  console.log(rule);
				}
			
 
				+
			
 
				/**
				 * Record and print the outcome of one assertion.
				 *
				 * A truthy `condition` prints a PASS line and increments the module-level
				 * `passed` counter. Otherwise a FAIL line is printed, followed by `detail`
				 * (when truthy) on its own indented line, and `failed` is incremented.
				 *
				 * @param {string} label     Description of the assertion.
				 * @param {*}      condition Outcome; evaluated for truthiness.
				 * @param {string} [detail]  Extra context shown only on failure.
				 */
				function check(label, condition, detail) {
				  const verdict = condition ? "PASS" : "FAIL";
				  console.log(`  ${verdict}  ${label}`);

				  if (condition) {
				    passed++;
				    return;
				  }

				  if (detail) console.log(`        ${detail}`);
				  failed++;
				}
			
 
				+
			
 
				/**
				 * Render a byte count as a short human-readable string.
				 *
				 * Values below 1024 are shown as whole bytes ("B"); values below
				 * 1024 * 1024 as kilobytes with one decimal ("KB"); everything else as
				 * megabytes with one decimal ("MB").
				 *
				 * @param {number} bytes Size in bytes.
				 * @returns {string} Formatted size, e.g. "1.5 KB".
				 */
				function formatBytes(bytes) {
				  const KIB = 1024;
				  if (bytes < KIB) {
				    return `${bytes} B`;
				  }
				  const [scaled, unit] = bytes < KIB * KIB
				    ? [bytes / KIB, "KB"]
				    : [bytes / KIB / KIB, "MB"];
				  return `${scaled.toFixed(1)} ${unit}`;
				}
			
 
				+
			
 
				/**
				 * Format the ratio n/d as a percentage with one decimal place.
				 *
				 * @param {number} n Numerator.
				 * @param {number} d Denominator.
				 * @returns {string} e.g. "25.0%", or "N/A" when `d` is exactly 0.
				 */
				function pct(n, d) {
				  return d === 0 ? "N/A" : `${((n / d) * 100).toFixed(1)}%`;
				}
			
 
				+
			
 
				// Directory names that walkDir never descends into.
				const SKIP_DIRS = new Set([
				  "node_modules",
				  ".git",
				  ".cache",
				  "vendor",
				  "dist",
				  "build",
				  "__pycache__",
				  ".tox",
				  ".venv",
				  "venv",
				  ".mypy_cache",
				  "target",
				  ".next",
				  ".nuxt",
				  "coverage",
				  ".turbo",
				]);

				// Lower-case extensions of the code files the scan collects.
				const CODE_EXTS = new Set([
				  ".ts",
				  ".tsx",
				  ".js",
				  ".jsx",
				  ".mts",
				  ".cts",
				  ".mjs",
				  ".cjs",
				  ".py",
				  ".go",
				  ".rs",
				]);

				// Everything walkDir collects: the code extensions plus markdown.
				const ALL_EXTS = new Set(CODE_EXTS).add(".md");

				/**
				 * Collect files with a known extension under `dir`, breadth-first.
				 *
				 * Entries whose name starts with "." are ignored, directories listed in
				 * SKIP_DIRS are not descended into, and directories that cannot be read
				 * are skipped silently. The walk stops once `maxFiles` paths are collected.
				 *
				 * @param {string} dir             Root directory to scan.
				 * @param {number} [maxFiles=5000] Upper bound on the number of returned paths.
				 * @returns {string[]} Paths of matching files, in discovery order.
				 */
				function walkDir(dir, maxFiles = 5000) {
				  const found = [];
				  // `pending` only grows; `head` marks the next directory to visit (FIFO order).
				  const pending = [dir];

				  for (let head = 0; head < pending.length && found.length < maxFiles; head++) {
				    const current = pending[head];

				    let entries;
				    try {
				      entries = readdirSync(current, { withFileTypes: true });
				    } catch {
				      // Best effort: unreadable or missing directories are simply skipped.
				      continue;
				    }

				    for (const entry of entries) {
				      if (found.length >= maxFiles) break;

				      const { name } = entry;
				      if (name.startsWith(".")) continue;

				      const fullPath = join(current, name);
				      if (entry.isDirectory()) {
				        if (!SKIP_DIRS.has(name)) pending.push(fullPath);
				        continue;
				      }
				      if (entry.isFile() && ALL_EXTS.has(extname(name).toLowerCase())) {
				        found.push(fullPath);
				      }
				    }
				  }

				  return found;
				}
			
 
				+
			
 
				+// ============================================================================
			
 
				+// Parse CLI args
			
 
				+// ============================================================================
			
 
				+
			
 
				const args = process.argv.slice(2);
				// Absolute directory to scan for the real-collection report, or null for none.
				let scanDir = null;
				// Set by --scan-only: skip the synthetic tests and only scan the directory.
				let skipSynthetic = false;

				// Text printed for --help / -h. Built from joined lines so the emitted
				// text does not depend on how this file is indented.
				const HELP_TEXT = [
				  "Usage: npx tsx test-ast-chunking.mjs [options] [directory]",
				  "",
				  "Options:",
				  "  --help, -h          Show this help",
				  "  --scan-only         Skip synthetic tests, only scan directory",
				  "",
				  "Arguments:",
				  "  directory           Path to scan for a real-collection performance report.",
				  "                      Walks the tree for .ts/.tsx/.js/.jsx/.py/.go/.rs/.md files.",
				  "",
				  "Examples:",
				  "  npx tsx test-ast-chunking.mjs                    # synthetic tests only",
				  "  npx tsx test-ast-chunking.mjs ~/dev/myproject     # synthetic + real scan",
				  "  npx tsx test-ast-chunking.mjs --scan-only ~/dev   # real scan only",
				  "",
				].join("\n");

				/**
				 * Turn a directory argument into an absolute path.
				 *
				 * Only a bare "~" or a leading "~/" is expanded to the current user's home
				 * directory. Anything else, including "~name", is resolved against the
				 * working directory, so "~name" is never glued onto the home path.
				 *
				 * @param {string} rawPath Path as given on the command line.
				 * @returns {string} Absolute path.
				 */
				function expandUserPath(rawPath) {
				  if (rawPath === "~") return homedir();
				  if (rawPath.startsWith("~/")) return resolve(homedir(), rawPath.slice(2));
				  return resolve(rawPath);
				}

				for (const arg of args) {
				  if (arg === "--help" || arg === "-h") {
				    console.log(HELP_TEXT);
				    process.exit(0);
				  }
				  if (arg === "--scan-only") {
				    skipSynthetic = true;
				  } else if (!arg.startsWith("-")) {
				    // Positional argument: the directory to scan (the last one given wins).
				    scanDir = expandUserPath(arg);
				  }
				}
			
 
				+
			
 
				+// ============================================================================
			
 
				+// PART 1: Synthetic Tests
			
 
				+// ============================================================================
			
 
				+
			
 
				+if (!skipSynthetic) {
			
 
				+
			
 
				+// --------------------------------------------------------------------------
			
 
				+// 1. Language Detection
			
 
				+// --------------------------------------------------------------------------
			
 
				+section("1. Language Detection");
			
 
				+
			
 
				+const langTests = [
			
 
				+  ["src/auth.ts", "typescript"],
			
 
				+  ["src/App.tsx", "tsx"],
			
 
				+  ["src/util.js", "javascript"],
			
 
				+  ["src/App.jsx", "tsx"],
			
 
				+  ["src/auth.mts", "typescript"],
			
 
				+  ["src/auth.cjs", "javascript"],
			
 
				+  ["src/auth.py", "python"],
			
 
				+  ["src/auth.go", "go"],
			
 
				+  ["src/auth.rs", "rust"],
			
 
				+  ["docs/README.md", null],
			
 
				+  ["data/file.csv", null],
			
 
				+  ["Makefile", null],
			
 
				+  ["qmd://myproject/src/auth.ts", "typescript"],
			
 
				+  ["qmd://docs/notes.md", null],
			
 
				+];
			
 
				+
			
 
				+for (const [path, expected] of langTests) {
			
 
				+  const result = detectLanguage(path);
			
 
				+  check(`detectLanguage("${path}") = ${result}`, result === expected,
			
 
				+    `expected ${expected}, got ${result}`);
			
 
				+}
			
 
				+
			
 
				+// --------------------------------------------------------------------------
			
 
				+// 2. AST Break Points - TypeScript
			
 
				+// --------------------------------------------------------------------------
			
 
				+section("2. AST Break Points - TypeScript");
			
 
				+
			
 
				+const TS_SAMPLE = `import { Database } from './db';
			
 
				+import type { User } from './types';
			
 
				+
			
 
				+interface AuthConfig {
			
 
				+  secret: string;
			
 
				+  ttl: number;
			
 
				+}
			
 
				+
			
 
				+type UserId = string;
			
 
				+
			
 
				+export class AuthService {
			
 
				+  constructor(private db: Database) {}
			
 
				+
			
 
				+  async authenticate(user: User, token: string): Promise<boolean> {
			
 
				+    const session = await this.db.findSession(token);
			
 
				+    return session?.userId === user.id;
			
 
				+  }
			
 
				+
			
 
				+  validateToken(token: string): boolean {
			
 
				+    return token.length === 64;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+export function hashPassword(password: string): string {
			
 
				+  return crypto.createHash('sha256').update(password).digest('hex');
			
 
				+}
			
 
				+
			
 
				+const helper = (x: number) => x * 2;
			
 
				+`;
			
 
				+
			
 
				+const tsPoints = await getASTBreakPoints(TS_SAMPLE, "auth.ts");
			
 
				+console.log(`\n  TypeScript break points (${tsPoints.length} total):`);
			
 
				+for (const p of tsPoints) {
			
 
				+  const snippet = TS_SAMPLE.slice(p.pos, p.pos + 40).replace(/\n/g, "\\n");
			
 
				+  console.log(`    pos=${String(p.pos).padStart(4)} score=${String(p.score).padStart(3)} type=${p.type.padEnd(15)} text="${snippet}..."`);
			
 
				+}
			
 
				+
			
 
				+check("Has import break points", tsPoints.some(p => p.type === "ast:import"));
			
 
				+check("Has interface break point", tsPoints.some(p => p.type === "ast:iface"));
			
 
				+check("Has type break point", tsPoints.some(p => p.type === "ast:type"));
			
 
				+check("Has export break point (class)", tsPoints.some(p => p.type === "ast:export"));
			
 
				+check("Has method break points", tsPoints.filter(p => p.type === "ast:method").length >= 2);
			
 
				+check("Import scores 60", tsPoints.find(p => p.type === "ast:import")?.score === 60);
			
 
				+check("Interface scores 100", tsPoints.find(p => p.type === "ast:iface")?.score === 100);
			
 
				+check("Method scores 90", tsPoints.find(p => p.type === "ast:method")?.score === 90);
			
 
				+check("Export scores 90", tsPoints.find(p => p.type === "ast:export")?.score === 90);
			
 
				+check("Break points sorted by position", tsPoints.every((p, i) => i === 0 || p.pos >= tsPoints[i-1].pos));
			
 
				+
			
 
				// Verify that the first import break point really sits on the "import" keyword.
				// `find` returns undefined when no ast:import point exists; guard against it
				// so the run records a FAIL instead of crashing with a TypeError.
				const firstImport = tsPoints.find(p => p.type === "ast:import");
				check("First import position is correct",
				  firstImport !== undefined &&
				    TS_SAMPLE.slice(firstImport.pos, firstImport.pos + 6) === "import",
				  firstImport === undefined
				    ? "no ast:import break point found"
				    : `at pos ${firstImport.pos}: "${TS_SAMPLE.slice(firstImport.pos, firstImport.pos + 10)}"`);
			
 
				+
			
 
				+// --------------------------------------------------------------------------
			
 
				+// 3. AST Break Points - Python
			
 
				+// --------------------------------------------------------------------------
			
 
				+section("3. AST Break Points - Python");
			
 
				+
			
 
				+const PY_SAMPLE = `import os
			
 
				+from typing import Optional, List
			
 
				+
			
 
				+class UserService:
			
 
				+    def __init__(self, db):
			
 
				+        self.db = db
			
 
				+
			
 
				+    async def find_user(self, user_id: str) -> Optional[dict]:
			
 
				+        return await self.db.find(user_id)
			
 
				+
			
 
				+    def validate(self, user: dict) -> bool:
			
 
				+        return "id" in user and "name" in user
			
 
				+
			
 
				+def create_user(name: str, email: str) -> dict:
			
 
				+    return {"name": name, "email": email}
			
 
				+
			
 
				+@login_required
			
 
				+def protected_endpoint():
			
 
				+    return "secret"
			
 
				+`;
			
 
				+
			
 
				+const pyPoints = await getASTBreakPoints(PY_SAMPLE, "service.py");
			
 
				+console.log(`\n  Python break points (${pyPoints.length} total):`);
			
 
				+for (const p of pyPoints) {
			
 
				+  const snippet = PY_SAMPLE.slice(p.pos, p.pos + 40).replace(/\n/g, "\\n");
			
 
				+  console.log(`    pos=${String(p.pos).padStart(4)} score=${String(p.score).padStart(3)} type=${p.type.padEnd(15)} text="${snippet}..."`);
			
 
				+}
			
 
				+
			
 
				+check("Has import break points", pyPoints.filter(p => p.type === "ast:import").length >= 2);
			
 
				+check("Has class break point", pyPoints.some(p => p.type === "ast:class"));
			
 
				+check("Has function break points (methods)", pyPoints.filter(p => p.type === "ast:func").length >= 3);
			
 
				+check("Has decorated definition", pyPoints.some(p => p.type === "ast:decorated"));
			
 
				+check("Class scores 100", pyPoints.find(p => p.type === "ast:class")?.score === 100);
			
 
				+
			
 
				+// --------------------------------------------------------------------------
			
 
				+// 4. AST Break Points - Go
			
 
				+// --------------------------------------------------------------------------
			
 
				+section("4. AST Break Points - Go");
			
 
				+
			
 
				+const GO_SAMPLE = `package main
			
 
				+
			
 
				+import (
			
 
				+    "fmt"
			
 
				+    "net/http"
			
 
				+)
			
 
				+
			
 
				+type Server struct {
			
 
				+    port int
			
 
				+    db   *Database
			
 
				+}
			
 
				+
			
 
				+type Config interface {
			
 
				+    GetPort() int
			
 
				+}
			
 
				+
			
 
				+func NewServer(port int) *Server {
			
 
				+    return &Server{port: port}
			
 
				+}
			
 
				+
			
 
				+func (s *Server) Start() error {
			
 
				+    return http.ListenAndServe(fmt.Sprintf(":%d", s.port), nil)
			
 
				+}
			
 
				+
			
 
				+func (s *Server) Stop() {
			
 
				+    fmt.Println("stopping")
			
 
				+}
			
 
				+`;
			
 
				+
			
 
				+const goPoints = await getASTBreakPoints(GO_SAMPLE, "server.go");
			
 
				+console.log(`\n  Go break points (${goPoints.length} total):`);
			
 
				+for (const p of goPoints) {
			
 
				+  const snippet = GO_SAMPLE.slice(p.pos, p.pos + 40).replace(/\n/g, "\\n");
			
 
				+  console.log(`    pos=${String(p.pos).padStart(4)} score=${String(p.score).padStart(3)} type=${p.type.padEnd(15)} text="${snippet}..."`);
			
 
				+}
			
 
				+
			
 
				+check("Has import break point", goPoints.some(p => p.type === "ast:import"));
			
 
				+check("Has type break points", goPoints.filter(p => p.type === "ast:type").length >= 2);
			
 
				+check("Has function break point", goPoints.some(p => p.type === "ast:func"));
			
 
				+check("Has method break points", goPoints.filter(p => p.type === "ast:method").length >= 2);
			
 
				+check("Type scores 80", goPoints.find(p => p.type === "ast:type")?.score === 80);
			
 
				+
			
 
				+// --------------------------------------------------------------------------
			
 
				+// 5. AST Break Points - Rust
			
 
				+// --------------------------------------------------------------------------
			
 
				+section("5. AST Break Points - Rust");
			
 
				+
			
 
				+const RS_SAMPLE = `use std::collections::HashMap;
			
 
				+use std::io;
			
 
				+
			
 
				+pub struct Config {
			
 
				+    port: u16,
			
 
				+    host: String,
			
 
				+}
			
 
				+
			
 
				+impl Config {
			
 
				+    pub fn new(port: u16, host: String) -> Self {
			
 
				+        Config { port, host }
			
 
				+    }
			
 
				+
			
 
				+    pub fn address(&self) -> String {
			
 
				+        format!("{}:{}", self.host, self.port)
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+pub trait Configurable {
			
 
				+    fn configure(&mut self, config: &Config);
			
 
				+}
			
 
				+
			
 
				+pub enum ServerState {
			
 
				+    Running,
			
 
				+    Stopped,
			
 
				+    Error(String),
			
 
				+}
			
 
				+
			
 
				+pub fn start_server(config: Config) -> io::Result<()> {
			
 
				+    Ok(())
			
 
				+}
			
 
				+`;
			
 
				+
			
 
				+const rsPoints = await getASTBreakPoints(RS_SAMPLE, "config.rs");
			
 
				+console.log(`\n  Rust break points (${rsPoints.length} total):`);
			
 
				+for (const p of rsPoints) {
			
 
				+  const snippet = RS_SAMPLE.slice(p.pos, p.pos + 40).replace(/\n/g, "\\n");
			
 
				+  console.log(`    pos=${String(p.pos).padStart(4)} score=${String(p.score).padStart(3)} type=${p.type.padEnd(15)} text="${snippet}..."`);
			
 
				+}
			
 
				+
			
 
				+check("Has use/import break points", rsPoints.filter(p => p.type === "ast:import").length >= 2);
			
 
				+check("Has struct break point", rsPoints.some(p => p.type === "ast:struct"));
			
 
				+check("Has impl break point", rsPoints.some(p => p.type === "ast:impl"));
			
 
				+check("Has trait break point", rsPoints.some(p => p.type === "ast:trait"));
			
 
				+check("Has enum break point", rsPoints.some(p => p.type === "ast:enum"));
			
 
				+check("Has function break point", rsPoints.some(p => p.type === "ast:func"));
			
 
				+check("Struct scores 100", rsPoints.find(p => p.type === "ast:struct")?.score === 100);
			
 
				+check("Impl scores 100", rsPoints.find(p => p.type === "ast:impl")?.score === 100);
			
 
				+check("Trait scores 100", rsPoints.find(p => p.type === "ast:trait")?.score === 100);
			
 
				+check("Enum scores 80", rsPoints.find(p => p.type === "ast:enum")?.score === 80);
			
 
				+
			
 
				+// --------------------------------------------------------------------------
			
 
				+// 6. Merge Break Points
			
 
				+// --------------------------------------------------------------------------
			
 
				+section("6. mergeBreakPoints");
			
 
				+
			
 
				+const regexPoints = [
			
 
				+  { pos: 10, score: 20, type: "blank" },
			
 
				+  { pos: 50, score: 1, type: "newline" },
			
 
				+  { pos: 100, score: 20, type: "blank" },
			
 
				+];
			
 
				+const astPointsMerge = [
			
 
				+  { pos: 10, score: 90, type: "ast:func" },
			
 
				+  { pos: 75, score: 100, type: "ast:class" },
			
 
				+  { pos: 100, score: 60, type: "ast:import" },
			
 
				+];
			
 
				+
			
 
				+const merged = mergeBreakPoints(regexPoints, astPointsMerge);
			
 
				+console.log(`\n  Merged break points (${merged.length} total):`);
			
 
				+for (const p of merged) {
			
 
				+  console.log(`    pos=${String(p.pos).padStart(4)} score=${String(p.score).padStart(3)} type=${p.type}`);
			
 
				+}
			
 
				+
			
 
				+check("Merge has 4 unique positions", merged.length === 4);
			
 
				+check("pos 10: AST wins (90 > 20)", merged.find(p => p.pos === 10)?.score === 90);
			
 
				+check("pos 50: regex only (1)", merged.find(p => p.pos === 50)?.score === 1);
			
 
				+check("pos 75: AST only (100)", merged.find(p => p.pos === 75)?.score === 100);
			
 
				+check("pos 100: AST wins (60 > 20)", merged.find(p => p.pos === 100)?.score === 60);
			
 
				+check("Sorted by position", merged.every((p, i) => i === 0 || p.pos >= merged[i-1].pos));
			
 
				+
			
 
				+// --------------------------------------------------------------------------
			
 
				+// 7. AST vs Regex Chunking Comparison (Large Synthetic File)
			
 
				+// --------------------------------------------------------------------------
			
 
				+section("7. AST vs Regex Chunking Comparison");
			
 
				+
			
 
				+const largeTSParts = [];
			
 
				+for (let i = 0; i < 30; i++) {
			
 
				+  largeTSParts.push(`
			
 
				+export function handler${i}(req: Request, res: Response): void {
			
 
				+  const startTime = Date.now();
			
 
				+  const userId = req.params.userId;
			
 
				+  const sessionToken = req.headers.authorization;
			
 
				+
			
 
				+  // Validate the incoming request parameters
			
 
				+  if (!userId || !sessionToken) {
			
 
				+    res.status(400).json({ error: "Missing required parameters" });
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  // Process the request with detailed logging
			
 
				+  console.log(\`Processing request \${i} for user \${userId}\`);
			
 
				+  const result = processBusinessLogic${i}(userId, sessionToken);
			
 
				+
			
 
				+  // Return the response with timing info
			
 
				+  const elapsed = Date.now() - startTime;
			
 
				+  res.json({ data: result, processingTimeMs: elapsed });
			
 
				+}
			
 
				+`);
			
 
				+}
			
 
				+const largeTS = largeTSParts.join("\n");
			
 
				+
			
 
				+console.log(`\n  Large TS file: ${largeTS.length} chars, ${largeTSParts.length} functions`);
			
 
				+
			
 
				+const regexChunks = chunkDocument(largeTS);
			
 
				+const astChunks = await chunkDocumentAsync(largeTS, undefined, undefined, undefined, "handlers.ts", "auto");
			
 
				+
			
 
				+console.log(`  Regex chunks: ${regexChunks.length}`);
			
 
				+console.log(`  AST chunks:   ${astChunks.length}`);
			
 
				+
			
 
				+function countSplitFunctions(chunks, source) {
			
 
				+  let splits = 0;
			
 
				+  for (let i = 0; i < 30; i++) {
			
 
				+    const funcStart = source.indexOf(`function handler${i}(`);
			
 
				+    const nextFunc = source.indexOf(`function handler${i + 1}(`, funcStart + 1);
			
 
				+    const funcEnd = nextFunc > 0 ? nextFunc : source.length;
			
 
				+    const chunkIndices = new Set();
			
 
				+    for (let ci = 0; ci < chunks.length; ci++) {
			
 
				+      const chunkStart = chunks[ci].pos;
			
 
				+      const chunkEnd = chunkStart + chunks[ci].text.length;
			
 
				+      if (chunkStart < funcEnd && chunkEnd > funcStart) {
			
 
				+        chunkIndices.add(ci);
			
 
				+      }
			
 
				+    }
			
 
				+    if (chunkIndices.size > 1) splits++;
			
 
				+  }
			
 
				+  return splits;
			
 
				+}
			
 
				+
			
 
				+const regexSplits = countSplitFunctions(regexChunks, largeTS);
			
 
				+const astSplitsSynth = countSplitFunctions(astChunks, largeTS);
			
 
				+
			
 
				+console.log(`\n  Functions split across chunks:`);
			
 
				+console.log(`    Regex: ${regexSplits} / 30`);
			
 
				+console.log(`    AST:   ${astSplitsSynth} / 30`);
			
 
				+
			
 
				+check("AST splits fewer functions than regex", astSplitsSynth <= regexSplits,
			
 
				+  `AST split ${astSplitsSynth}, regex split ${regexSplits}`);
			
 
				+
			
 
				+// --------------------------------------------------------------------------
			
 
				+// 8. Markdown Files Unchanged
			
 
				+// --------------------------------------------------------------------------
			
 
				+section("8. Markdown Files Unchanged in Auto Mode");
			
 
				+
			
 
				+const mdContent = [];
			
 
				+for (let i = 0; i < 15; i++) {
			
 
				+  mdContent.push(`# Section ${i}\n\n${"Lorem ipsum dolor sit amet. ".repeat(40)}\n`);
			
 
				+}
			
 
				+const largeMD = mdContent.join("\n");
			
 
				+
			
 
				+const mdRegex = chunkDocument(largeMD);
			
 
				+const mdAst = await chunkDocumentAsync(largeMD, undefined, undefined, undefined, "readme.md", "auto");
			
 
				+
			
 
				+check("Same number of chunks", mdRegex.length === mdAst.length,
			
 
				+  `regex=${mdRegex.length}, ast=${mdAst.length}`);
			
 
				+
			
 
				+let mdIdentical = true;
			
 
				+for (let i = 0; i < mdRegex.length; i++) {
			
 
				+  if (mdRegex[i]?.text !== mdAst[i]?.text || mdRegex[i]?.pos !== mdAst[i]?.pos) {
			
 
				+    mdIdentical = false;
			
 
				+    break;
			
 
				+  }
			
 
				+}
			
 
				+check("Chunk content is identical", mdIdentical);
			
 
				+
			
 
				+// --------------------------------------------------------------------------
			
 
				+// 9-11. Strategy bypass, no-filepath fallback, error handling
			
 
				+// --------------------------------------------------------------------------
			
 
				+section("9. Regex Strategy Bypass");
			
 
				+const regexOnly = await chunkDocumentAsync(largeTS, undefined, undefined, undefined, "handlers.ts", "regex");
			
 
				+const syncRegex = chunkDocument(largeTS);
			
 
				+check("Same chunks as sync regex", regexOnly.length === syncRegex.length &&
			
 
				+  regexOnly.every((c, i) => c.text === syncRegex[i]?.text));
			
 
				+
			
 
				+section("10. No Filepath Falls Back to Regex");
			
 
				+const noPathChunks = await chunkDocumentAsync(largeTS, undefined, undefined, undefined, undefined, "auto");
			
 
				+check("Same chunks as regex", noPathChunks.length === syncRegex.length);
			
 
				+
			
 
				+section("11. Error Handling & Edge Cases");
			
 
				+check("Empty file -> []", (await getASTBreakPoints("", "e.ts")).length === 0);
			
 
				+check("Broken syntax doesn't crash", Array.isArray(await getASTBreakPoints("function { %%", "x.ts")));
			
 
				+check("Unknown ext -> []", (await getASTBreakPoints("data", "f.csv")).length === 0);
			
 
				+check("Markdown -> []", (await getASTBreakPoints("# H", "r.md")).length === 0);
			
 
				+const smallChunks = await chunkDocumentAsync("export const x = 1;", undefined, undefined, undefined, "s.ts", "auto");
			
 
				+check("Small file -> 1 chunk", smallChunks.length === 1);
			
 
				+
			
 
				+// --------------------------------------------------------------------------
			
 
				+// 12. chunkDocumentWithBreakPoints Equivalence
			
 
				+// --------------------------------------------------------------------------
			
 
				+section("12. chunkDocumentWithBreakPoints Equivalence");
			
 
				+const eqContent = "a".repeat(5000) + "\n\n" + "b".repeat(5000);
			
 
				+const eqOld = chunkDocument(eqContent);
			
 
				+const eqNew = chunkDocumentWithBreakPoints(eqContent, scanBreakPoints(eqContent), findCodeFences(eqContent));
			
 
				+check("Identical output", eqOld.length === eqNew.length &&
			
 
				+  eqOld.every((c, i) => c.text === eqNew[i]?.text && c.pos === eqNew[i]?.pos));
			
 
				+
			
 
				+// --------------------------------------------------------------------------
			
 
				+// 13. Synthetic performance
			
 
				+// --------------------------------------------------------------------------
			
 
				+section("13. Synthetic Performance");
			
 
				+
			
 
				+const t0 = performance.now();
			
 
				+for (let i = 0; i < 10; i++) await getASTBreakPoints(largeTS, "p.ts");
			
 
				+const astExtractMs = (performance.now() - t0) / 10;
			
 
				+
			
 
				+const t1 = performance.now();
			
 
				+for (let i = 0; i < 10; i++) scanBreakPoints(largeTS);
			
 
				+const regexExtractMs = (performance.now() - t1) / 10;
			
 
				+
			
 
				+const t2 = performance.now();
			
 
				+for (let i = 0; i < 10; i++) await chunkDocumentAsync(largeTS, undefined, undefined, undefined, "p.ts", "auto");
			
 
				+const astFullMs = (performance.now() - t2) / 10;
			
 
				+
			
 
				+const t3 = performance.now();
			
 
				+for (let i = 0; i < 10; i++) chunkDocument(largeTS);
			
 
				+const regexFullMs = (performance.now() - t3) / 10;
			
 
				+
			
 
				+console.log(`\n  File size:                    ${formatBytes(largeTS.length)}`);
			
 
				+console.log(`  AST break point extraction:   ${astExtractMs.toFixed(1)}ms`);
			
 
				+console.log(`  Regex break point extraction: ${regexExtractMs.toFixed(1)}ms`);
			
 
				+console.log(`  Full AST chunking:            ${astFullMs.toFixed(1)}ms`);
			
 
				+console.log(`  Full regex chunking:          ${regexFullMs.toFixed(1)}ms`);
			
 
				+console.log(`  Overhead per file:            ${(astFullMs - regexFullMs).toFixed(1)}ms`);
			
 
				+
			
 
				+check("AST chunking < 50ms per file", astFullMs < 50, `was ${astFullMs.toFixed(1)}ms`);
			
 
				+
			
 
				+// End of synthetic tests
			
 
				+section("Synthetic Test Results");
			
 
				+console.log(`\n  ${passed} passed, ${failed} failed`);
			
 
				+
			
 
				+} // end if (!skipSynthetic)
			
 
				+
			
 
				+
			
 
				+// ============================================================================
			
 
				+// PART 2: Real Collection Scan
			
 
				+// ============================================================================
			
 
				+
			
 
				+if (scanDir) {
			
 
				+
			
 
				+section(`Real Collection Scan: ${scanDir}`);
			
 
				+
			
 
				+console.log(`\n  Discovering files...`);
			
 
				+const realFiles = walkDir(scanDir);
			
 
				+console.log(`  Found ${realFiles.length} files\n`);
			
 
				+
			
 
				+if (realFiles.length === 0) {
			
 
				+  console.log("  No supported files found. Supported: .ts .tsx .js .jsx .py .go .rs .md");
			
 
				+} else {
			
 
				+
			
 
				+  // Classify files
			
 
				+  const byLang = {};
			
 
				+  let totalBytes = 0;
			
 
				+  const fileEntries = [];
			
 
				+
			
 
				+  for (const filepath of realFiles) {
			
 
				+    let content;
			
 
				+    try {
			
 
				+      const stat = statSync(filepath);
			
 
				+      if (stat.size > 500_000) continue; // skip files > 500KB
			
 
				+      content = readFileSync(filepath, "utf-8");
			
 
				+    } catch {
			
 
				+      continue;
			
 
				+    }
			
 
				+    if (!content.trim()) continue;
			
 
				+
			
 
				+    const rel = relative(scanDir, filepath);
			
 
				+    const lang = detectLanguage(filepath);
			
 
				+    const langLabel = lang ?? "markdown";
			
 
				+
			
 
				+    byLang[langLabel] = (byLang[langLabel] || 0) + 1;
			
 
				+    totalBytes += content.length;
			
 
				+    fileEntries.push({ filepath, rel, lang, langLabel, content });
			
 
				+  }
			
 
				+
			
 
				+  // Print file distribution
			
 
				+  console.log("  File distribution:");
			
 
				+  for (const [lang, count] of Object.entries(byLang).sort((a, b) => b[1] - a[1])) {
			
 
				+    console.log(`    ${lang.padEnd(14)} ${count} files`);
			
 
				+  }
			
 
				+  console.log(`    ${"total".padEnd(14)} ${fileEntries.length} files (${formatBytes(totalBytes)})`);
			
 
				+
			
 
				+  // ---- Per-file analysis ----
			
 
				+
			
 
				+  // Accumulators
			
 
				+  const perLang = {};
			
 
				+  let totalRegexChunks = 0;
			
 
				+  let totalAstChunks = 0;
			
 
				+  let totalRegexMs = 0;
			
 
				+  let totalAstMs = 0;
			
 
				+  let filesWithDifference = 0;
			
 
				+  let multiChunkFiles = 0;
			
 
				+  const bigDiffs = [];  // files where AST made the biggest difference
			
 
				+
			
 
				+  console.log(`\n  Analyzing ${fileEntries.length} files...\n`);
			
 
				+
			
 
				+  for (const entry of fileEntries) {
			
 
				+    const { rel, lang, langLabel, content } = entry;
			
 
				+    const isCode = lang !== null;
			
 
				+
			
 
				+    // Regex chunking
			
 
				+    const rt0 = performance.now();
			
 
				+    const rChunks = chunkDocument(content);
			
 
				+    const rMs = performance.now() - rt0;
			
 
				+
			
 
				+    // AST chunking
			
 
				+    const at0 = performance.now();
			
 
				+    const aChunks = await chunkDocumentAsync(content, undefined, undefined, undefined, rel, "auto");
			
 
				+    const aMs = performance.now() - at0;
			
 
				+
			
 
				+    totalRegexChunks += rChunks.length;
			
 
				+    totalAstChunks += aChunks.length;
			
 
				+    totalRegexMs += rMs;
			
 
				+    totalAstMs += aMs;
			
 
				+
			
 
				+    if (rChunks.length > 1 || aChunks.length > 1) multiChunkFiles++;
			
 
				+
			
 
				+    const chunkDiff = aChunks.length - rChunks.length;
			
 
				+    const contentDiffers = rChunks.length !== aChunks.length ||
			
 
				+      rChunks.some((c, i) => c.text !== aChunks[i]?.text);
			
 
				+
			
 
				+    if (contentDiffers) filesWithDifference++;
			
 
				+
			
 
				+    // Per-language stats
			
 
				+    if (!perLang[langLabel]) {
			
 
				+      perLang[langLabel] = {
			
 
				+        files: 0, bytes: 0, regexChunks: 0, astChunks: 0,
			
 
				+        regexMs: 0, astMs: 0, astBreakpoints: 0, diffs: 0,
			
 
				+      };
			
 
				+    }
			
 
				+    const s = perLang[langLabel];
			
 
				+    s.files++;
			
 
				+    s.bytes += content.length;
			
 
				+    s.regexChunks += rChunks.length;
			
 
				+    s.astChunks += aChunks.length;
			
 
				+    s.regexMs += rMs;
			
 
				+    s.astMs += aMs;
			
 
				+    if (contentDiffers) s.diffs++;
			
 
				+
			
 
				+    // Count AST breakpoints for code files
			
 
				+    if (isCode) {
			
 
				+      const bp = await getASTBreakPoints(content, rel);
			
 
				+      s.astBreakpoints += bp.length;
			
 
				+    }
			
 
				+
			
 
				+    // Track big differences for the detailed report
			
 
				+    if (contentDiffers && isCode && (rChunks.length > 1 || aChunks.length > 1)) {
			
 
				+      bigDiffs.push({
			
 
				+        rel, lang: langLabel, bytes: content.length,
			
 
				+        regexN: rChunks.length, astN: aChunks.length,
			
 
				+        diff: chunkDiff, overheadMs: aMs - rMs,
			
 
				+      });
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // ---- Aggregate report ----
			
 
				+
			
 
				+  section("Per-Language Summary");
			
 
				+
			
 
				+  const langOrder = Object.entries(perLang).sort((a, b) => b[1].files - a[1].files);
			
 
				+  const colW = { lang: 14, files: 7, bytes: 10, rChunks: 9, aChunks: 9, bps: 6, diffs: 6, rMs: 9, aMs: 9 };
			
 
				+
			
 
				+  console.log(
			
 
				+    `\n  ${"Language".padEnd(colW.lang)}${"Files".padStart(colW.files)}${"Size".padStart(colW.bytes)}` +
			
 
				+    `${"Rx Chnk".padStart(colW.rChunks)}${"AST Chnk".padStart(colW.aChunks)}` +
			
 
				+    `${"BPs".padStart(colW.bps)}${"Diffs".padStart(colW.diffs)}` +
			
 
				+    `${"Rx ms".padStart(colW.rMs)}${"AST ms".padStart(colW.aMs)}`
			
 
				+  );
			
 
				+  console.log("  " + "-".repeat(Object.values(colW).reduce((a, b) => a + b, 0)));
			
 
				+
			
 
				+  for (const [lang, s] of langOrder) {
			
 
				+    console.log(
			
 
				+      `  ${lang.padEnd(colW.lang)}` +
			
 
				+      `${String(s.files).padStart(colW.files)}` +
			
 
				+      `${formatBytes(s.bytes).padStart(colW.bytes)}` +
			
 
				+      `${String(s.regexChunks).padStart(colW.rChunks)}` +
			
 
				+      `${String(s.astChunks).padStart(colW.aChunks)}` +
			
 
				+      `${String(s.astBreakpoints).padStart(colW.bps)}` +
			
 
				+      `${String(s.diffs).padStart(colW.diffs)}` +
			
 
				+      `${s.regexMs.toFixed(1).padStart(colW.rMs)}` +
			
 
				+      `${s.astMs.toFixed(1).padStart(colW.aMs)}`
			
 
				+    );
			
 
				+  }
			
 
				+
			
 
				+  console.log("  " + "-".repeat(Object.values(colW).reduce((a, b) => a + b, 0)));
			
 
				+  console.log(
			
 
				+    `  ${"TOTAL".padEnd(colW.lang)}` +
			
 
				+    `${String(fileEntries.length).padStart(colW.files)}` +
			
 
				+    `${formatBytes(totalBytes).padStart(colW.bytes)}` +
			
 
				+    `${String(totalRegexChunks).padStart(colW.rChunks)}` +
			
 
				+    `${String(totalAstChunks).padStart(colW.aChunks)}` +
			
 
				+    `${"".padStart(colW.bps)}` +
			
 
				+    `${String(filesWithDifference).padStart(colW.diffs)}` +
			
 
				+    `${totalRegexMs.toFixed(1).padStart(colW.rMs)}` +
			
 
				+    `${totalAstMs.toFixed(1).padStart(colW.aMs)}`
			
 
				+  );
			
 
				+
			
 
				+  // ---- Headline stats ----
			
 
				+
			
 
				+  section("Headline Stats");
			
 
				+
			
 
				+  const codeFiles = fileEntries.filter(e => e.lang !== null).length;
			
 
				+  const mdFiles = fileEntries.filter(e => e.lang === null).length;
			
 
				+  const avgOverheadMs = codeFiles > 0
			
 
				+    ? (langOrder.filter(([l]) => l !== "markdown").reduce((s, [, v]) => s + v.astMs - v.regexMs, 0)) / codeFiles
			
 
				+    : 0;
			
 
				+
			
 
				+  console.log(`
			
 
				+  Files scanned:         ${fileEntries.length} (${codeFiles} code, ${mdFiles} markdown)
			
 
				+  Multi-chunk files:     ${multiChunkFiles} (files large enough to produce >1 chunk)
			
 
				+  Files where AST differed: ${filesWithDifference} / ${fileEntries.length} (${pct(filesWithDifference, fileEntries.length)})
			
 
				+  Total chunks (regex):  ${totalRegexChunks}
			
 
				+  Total chunks (AST):    ${totalAstChunks}  (${totalAstChunks > totalRegexChunks ? "+" : ""}${totalAstChunks - totalRegexChunks})
			
 
				+  Total time (regex):    ${totalRegexMs.toFixed(1)}ms
			
 
				+  Total time (AST):      ${totalAstMs.toFixed(1)}ms  (+${(totalAstMs - totalRegexMs).toFixed(1)}ms overhead)
			
 
				+  Avg overhead per code file: ${avgOverheadMs.toFixed(2)}ms
			
 
				+  `);
			
 
				+
			
 
				+  // ---- Top differences ----
			
 
				+
			
 
				+  if (bigDiffs.length > 0) {
			
 
				+    section("Top Files Where AST Changed Chunking");
			
 
				+
			
 
				+    bigDiffs.sort((a, b) => Math.abs(b.diff) - Math.abs(a.diff));
			
 
				+    const topN = bigDiffs.slice(0, 20);
			
 
				+
			
 
				+    console.log(
			
 
				+      `\n  ${"File".padEnd(50)} ${"Lang".padEnd(12)} ${"Size".padStart(8)} ` +
			
 
				+      `${"Rx".padStart(4)} ${"AST".padStart(4)} ${"Diff".padStart(5)} ` +
			
 
				+      `${"OH ms".padStart(7)}`
			
 
				+    );
			
 
				+    console.log("  " + "-".repeat(94));
			
 
				+
			
 
				+    for (const d of topN) {
			
 
				+      const sign = d.diff > 0 ? "+" : "";
			
 
				+      console.log(
			
 
				+        `  ${d.rel.slice(0, 49).padEnd(50)} ${d.lang.padEnd(12)} ${formatBytes(d.bytes).padStart(8)} ` +
			
 
				+        `${String(d.regexN).padStart(4)} ${String(d.astN).padStart(4)} ${(sign + d.diff).padStart(5)} ` +
			
 
				+        `${d.overheadMs.toFixed(1).padStart(7)}`
			
 
				+      );
			
 
				+    }
			
 
				+
			
 
				+    if (bigDiffs.length > 20) {
			
 
				+      console.log(`\n  ... and ${bigDiffs.length - 20} more files with differences`);
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // ---- Markdown regression check ----
			
 
				+
			
 
				+  const mdEntries = fileEntries.filter(e => e.lang === null);
			
 
				+  if (mdEntries.length > 0) {
			
 
				+    section("Markdown Regression Check");
			
 
				+
			
 
				+    let mdRegressions = 0;
			
 
				+    for (const entry of mdEntries) {
			
 
				+      const rChunks = chunkDocument(entry.content);
			
 
				+      const aChunks = await chunkDocumentAsync(entry.content, undefined, undefined, undefined, entry.rel, "auto");
			
 
				+      const same = rChunks.length === aChunks.length &&
			
 
				+        rChunks.every((c, i) => c.text === aChunks[i]?.text);
			
 
				+      if (!same) {
			
 
				+        mdRegressions++;
			
 
				+        console.log(`  REGRESSION: ${entry.rel} (regex=${rChunks.length}, ast=${aChunks.length})`);
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    if (mdRegressions === 0) {
			
 
				+      console.log(`\n  All ${mdEntries.length} markdown files produce identical chunks. No regressions.`);
			
 
				+    } else {
			
 
				+      console.log(`\n  ${mdRegressions} / ${mdEntries.length} markdown files differ (unexpected!)`);
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+} // end if realFiles.length > 0
			
 
				+
			
 
				+} // end if scanDir
			
 
				+
			
 
				+// ============================================================================
			
 
				+// Final Summary
			
 
				+// ============================================================================
			
 
				+
			
 
				+console.log(`\n${"=".repeat(70)}`);
			
 
				+if (!skipSynthetic) {
			
 
				+  console.log(`  SYNTHETIC TESTS: ${passed} passed, ${failed} failed`);
			
 
				+}
			
 
				+if (scanDir) {
			
 
				+  console.log(`  COLLECTION SCAN: complete (see report above)`);
			
 
				+}
			
 
				+if (!scanDir && !skipSynthetic) {
			
 
				+  console.log(`\n  Tip: Run with a directory argument to scan real files:`);
			
 
				+  console.log(`    npx tsx test-ast-chunking.mjs ~/dev/my-project`);
			
 
				+}
			
 
				+console.log("=".repeat(70));
			
 
				+
			
 
				+if (failed > 0) process.exit(1);
			
--- a/test/ast.test.ts
+++ b/test/ast.test.ts
@@ -0,0 +1,329 @@
 
				+/**
			
 
				+ * ast.test.ts - Tests for AST-aware chunking support
			
 
				+ *
			
 
				+ * Tests language detection, AST break point extraction for each
			
 
				+ * supported language, and graceful fallback on errors.
			
 
				+ */
			
 
				+
			
 
				+import { describe, test, expect } from "vitest";
			
 
				+import { detectLanguage, getASTBreakPoints, extractSymbols } from "../src/ast.js";
			
 
				+import type { SupportedLanguage } from "../src/ast.js";
			
 
				+
			
 
				+// =============================================================================
			
 
				+// Language Detection
			
 
				+// =============================================================================
			
 
				+
			
 
				+describe("detectLanguage", () => {
			
 
				+  test("recognizes TypeScript extensions", () => {
			
 
				+    expect(detectLanguage("src/auth.ts")).toBe("typescript");
			
 
				+    expect(detectLanguage("src/auth.mts")).toBe("typescript");
			
 
				+    expect(detectLanguage("src/auth.cts")).toBe("typescript");
			
 
				+  });
			
 
				+
			
 
				+  test("recognizes TSX extension", () => {
			
 
				+    expect(detectLanguage("src/App.tsx")).toBe("tsx");
			
 
				+  });
			
 
				+
			
 
				+  test("recognizes JavaScript extensions", () => {
			
 
				+    expect(detectLanguage("src/util.js")).toBe("javascript");
			
 
				+    expect(detectLanguage("src/util.mjs")).toBe("javascript");
			
 
				+    expect(detectLanguage("src/util.cjs")).toBe("javascript");
			
 
				+  });
			
 
				+
			
 
				+  test("recognizes JSX as tsx", () => {
			
 
				+    expect(detectLanguage("src/App.jsx")).toBe("tsx");
			
 
				+  });
			
 
				+
			
 
				+  test("recognizes Python extension", () => {
			
 
				+    expect(detectLanguage("src/auth.py")).toBe("python");
			
 
				+  });
			
 
				+
			
 
				+  test("recognizes Go extension", () => {
			
 
				+    expect(detectLanguage("src/auth.go")).toBe("go");
			
 
				+  });
			
 
				+
			
 
				+  test("recognizes Rust extension", () => {
			
 
				+    expect(detectLanguage("src/auth.rs")).toBe("rust");
			
 
				+  });
			
 
				+
			
 
				+  test("returns null for markdown", () => {
			
 
				+    expect(detectLanguage("docs/README.md")).toBeNull();
			
 
				+  });
			
 
				+
			
 
				+  test("returns null for unknown extensions", () => {
			
 
				+    expect(detectLanguage("data/file.csv")).toBeNull();
			
 
				+    expect(detectLanguage("config.yaml")).toBeNull();
			
 
				+    expect(detectLanguage("Makefile")).toBeNull();
			
 
				+  });
			
 
				+
			
 
				+  test("is case-insensitive for extensions", () => {
			
 
				+    expect(detectLanguage("src/Auth.TS")).toBe("typescript");
			
 
				+    expect(detectLanguage("src/Auth.PY")).toBe("python");
			
 
				+  });
			
 
				+
			
 
				+  test("works with virtual qmd:// paths", () => {
			
 
				+    expect(detectLanguage("qmd://myproject/src/auth.ts")).toBe("typescript");
			
 
				+    expect(detectLanguage("qmd://docs/README.md")).toBeNull();
			
 
				+  });
			
 
				+});
			
 
				+
			
 
				+// =============================================================================
			
 
				+// AST Break Points - TypeScript
			
 
				+// =============================================================================
			
 
				+
			
 
				+describe("getASTBreakPoints - TypeScript", () => {
			
 
				+  const TS_SAMPLE = `import { Database } from './db';
			
 
				+import type { User } from './types';
			
 
				+
			
 
				+interface AuthConfig {
			
 
				+  secret: string;
			
 
				+  ttl: number;
			
 
				+}
			
 
				+
			
 
				+type UserId = string;
			
 
				+
			
 
				+export class AuthService {
			
 
				+  constructor(private db: Database) {}
			
 
				+
			
 
				+  async authenticate(user: User, token: string): Promise<boolean> {
			
 
				+    const session = await this.db.findSession(token);
			
 
				+    return session?.userId === user.id;
			
 
				+  }
			
 
				+
			
 
				+  validateToken(token: string): boolean {
			
 
				+    return token.length === 64;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+export function hashPassword(password: string): string {
			
 
				+  return crypto.createHash('sha256').update(password).digest('hex');
			
 
				+}
			
 
				+`;
			
 
				+
			
 
				+  test("produces break points at function, class, and import boundaries", async () => {
			
 
				+    const points = await getASTBreakPoints(TS_SAMPLE, "src/auth.ts");
			
 
				+    expect(points.length).toBeGreaterThan(0);
			
 
				+
			
 
				+    // Should have import, interface, type, class (via export), method, and function break points
			
 
				+    const types = points.map(p => p.type);
			
 
				+    expect(types.some(t => t.includes("import"))).toBe(true);
			
 
				+    expect(types.some(t => t.includes("iface"))).toBe(true);
			
 
				+    expect(types.some(t => t.includes("type"))).toBe(true);
			
 
				+    expect(types.some(t => t.includes("export") || t.includes("class"))).toBe(true);
			
 
				+    expect(types.some(t => t.includes("method"))).toBe(true);
			
 
				+  });
			
 
				+
			
 
				+  test("break points are sorted by position", async () => {
			
 
				+    const points = await getASTBreakPoints(TS_SAMPLE, "src/auth.ts");
			
 
				+    for (let i = 1; i < points.length; i++) {
			
 
				+      expect(points[i]!.pos).toBeGreaterThanOrEqual(points[i - 1]!.pos);
			
 
				+    }
			
 
				+  });
			
 
				+
			
 
				+  test("scores align with expected hierarchy", async () => {
			
 
				+    const points = await getASTBreakPoints(TS_SAMPLE, "src/auth.ts");
			
 
				+
			
 
				+    // Class/interface should score 100
			
 
				+    const ifacePoint = points.find(p => p.type === "ast:iface");
			
 
				+    expect(ifacePoint?.score).toBe(100);
			
 
				+
			
 
				+    // Function/method should score 90
			
 
				+    const methodPoint = points.find(p => p.type === "ast:method");
			
 
				+    expect(methodPoint?.score).toBe(90);
			
 
				+
			
 
				+    // Import should score 60
			
 
				+    const importPoint = points.find(p => p.type === "ast:import");
			
 
				+    expect(importPoint?.score).toBe(60);
			
 
				+  });
			
 
				+
			
 
				+  test("break point positions match actual content positions", async () => {
			
 
				+    const points = await getASTBreakPoints(TS_SAMPLE, "src/auth.ts");
			
 
				+
			
 
				+    // First import should be at position 0
			
 
				+    const firstImport = points.find(p => p.type === "ast:import");
			
 
				+    expect(firstImport).toBeDefined();
			
 
				+    expect(TS_SAMPLE.slice(firstImport!.pos, firstImport!.pos + 6)).toBe("import");
			
 
				+  });
			
 
				+});
			
 
				+
			
 
				+// =============================================================================
			
 
				+// AST Break Points - Python
			
 
				+// =============================================================================
			
 
				+
			
 
				+describe("getASTBreakPoints - Python", () => {
			
 
				+  const PY_SAMPLE = `import os
			
 
				+from typing import Optional
			
 
				+
			
 
				+class AuthService:
			
 
				+    def __init__(self, db):
			
 
				+        self.db = db
			
 
				+
			
 
				+    async def authenticate(self, user, token):
			
 
				+        session = await self.db.find(token)
			
 
				+        return session.user_id == user.id
			
 
				+
			
 
				+    def validate_token(self, token):
			
 
				+        return len(token) == 64
			
 
				+
			
 
				+def hash_password(password: str) -> str:
			
 
				+    return hashlib.sha256(password.encode()).hexdigest()
			
 
				+
			
 
				+@decorator
			
 
				+def decorated_func():
			
 
				+    pass
			
 
				+`;
			
 
				+
			
 
				+  test("produces break points for class, function, import, and decorated definitions", async () => {
			
 
				+    const points = await getASTBreakPoints(PY_SAMPLE, "auth.py");
			
 
				+    const types = points.map(p => p.type);
			
 
				+
			
 
				+    expect(types.some(t => t.includes("import"))).toBe(true);
			
 
				+    expect(types.some(t => t.includes("class"))).toBe(true);
			
 
				+    expect(types.some(t => t.includes("func"))).toBe(true);
			
 
				+    expect(types.some(t => t.includes("decorated"))).toBe(true);
			
 
				+  });
			
 
				+
			
 
				+  test("captures method definitions inside classes", async () => {
			
 
				+    const points = await getASTBreakPoints(PY_SAMPLE, "auth.py");
			
 
				+    // Should capture __init__, authenticate, and validate_token as func
			
 
				+    const funcPoints = points.filter(p => p.type === "ast:func");
			
 
				+    expect(funcPoints.length).toBeGreaterThanOrEqual(3);
			
 
				+  });
			
 
				+});
			
 
				+
			
 
				+// =============================================================================
			
 
				+// AST Break Points - Go
			
 
				+// =============================================================================
			
 
				+
			
 
				+describe("getASTBreakPoints - Go", () => {
			
 
				+  const GO_SAMPLE = `package main
			
 
				+
			
 
				+import "fmt"
			
 
				+
			
 
				+type AuthService struct {
			
 
				+    db *Database
			
 
				+}
			
 
				+
			
 
				+func (s *AuthService) Authenticate(user User) bool {
			
 
				+    return true
			
 
				+}
			
 
				+
			
 
				+func HashPassword(password string) string {
			
 
				+    return "hash"
			
 
				+}
			
 
				+`;
			
 
				+
			
 
				+  test("produces break points for type, function, method, and import", async () => {
			
 
				+    const points = await getASTBreakPoints(GO_SAMPLE, "auth.go");
			
 
				+    const types = points.map(p => p.type);
			
 
				+
			
 
				+    expect(types.some(t => t.includes("import"))).toBe(true);
			
 
				+    expect(types.some(t => t.includes("type"))).toBe(true);
			
 
				+    expect(types.some(t => t.includes("method"))).toBe(true);
			
 
				+    expect(types.some(t => t.includes("func"))).toBe(true);
			
 
				+  });
			
 
				+
			
 
				+  test("function and method both score 90", async () => {
			
 
				+    const points = await getASTBreakPoints(GO_SAMPLE, "auth.go");
			
 
				+    const funcPoint = points.find(p => p.type === "ast:func");
			
 
				+    const methodPoint = points.find(p => p.type === "ast:method");
			
 
				+
			
 
				+    expect(funcPoint?.score).toBe(90);
			
 
				+    expect(methodPoint?.score).toBe(90);
			
 
				+  });
			
 
				+});
			
 
				+
			
 
				+// =============================================================================
			
 
				+// AST Break Points - Rust
			
 
				+// =============================================================================
			
 
				+
			
 
				+describe("getASTBreakPoints - Rust", () => {
			
 
				+  const RS_SAMPLE = `use std::collections::HashMap;
			
 
				+
			
 
				+struct AuthService {
			
 
				+    db: Database,
			
 
				+}
			
 
				+
			
 
				+impl AuthService {
			
 
				+    fn authenticate(&self, user: &User) -> bool {
			
 
				+        true
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+trait Authenticatable {
			
 
				+    fn validate(&self) -> bool;
			
 
				+}
			
 
				+
			
 
				+enum Role {
			
 
				+    Admin,
			
 
				+    User,
			
 
				+}
			
 
				+
			
 
				+fn hash_password(password: &str) -> String {
			
 
				+    String::new()
			
 
				+}
			
 
				+`;
			
 
				+
			
 
				+  test("produces break points for struct, impl, trait, enum, function, and use", async () => {
			
 
				+    const points = await getASTBreakPoints(RS_SAMPLE, "auth.rs");
			
 
				+    const types = points.map(p => p.type);
			
 
				+
			
 
				+    expect(types.some(t => t.includes("import"))).toBe(true);  // use_declaration -> @import
			
 
				+    expect(types.some(t => t.includes("struct"))).toBe(true);
			
 
				+    expect(types.some(t => t.includes("impl"))).toBe(true);
			
 
				+    expect(types.some(t => t.includes("trait"))).toBe(true);
			
 
				+    expect(types.some(t => t.includes("enum"))).toBe(true);
			
 
				+    expect(types.some(t => t.includes("func"))).toBe(true);
			
 
				+  });
			
 
				+
			
 
				+  test("struct, impl, and trait all score 100", async () => {
			
 
				+    const points = await getASTBreakPoints(RS_SAMPLE, "auth.rs");
			
 
				+    const structPoint = points.find(p => p.type === "ast:struct");
			
 
				+    const implPoint = points.find(p => p.type === "ast:impl");
			
 
				+    const traitPoint = points.find(p => p.type === "ast:trait");
			
 
				+
			
 
				+    expect(structPoint?.score).toBe(100);
			
 
				+    expect(implPoint?.score).toBe(100);
			
 
				+    expect(traitPoint?.score).toBe(100);
			
 
				+  });
			
 
				+});
			
 
				+
			
 
				+// =============================================================================
			
 
				+// Error Handling & Fallback
			
 
				+// =============================================================================
			
 
				+
			
 
				+describe("getASTBreakPoints - error handling", () => {
			
 
				+  test("returns empty array for unsupported file types", async () => {
			
 
				+    const points = await getASTBreakPoints("# Hello World", "readme.md");
			
 
				+    expect(points).toEqual([]);
			
 
				+  });
			
 
				+
			
 
				+  test("returns empty array for unknown extensions", async () => {
			
 
				+    const points = await getASTBreakPoints("data,here", "file.csv");
			
 
				+    expect(points).toEqual([]);
			
 
				+  });
			
 
				+
			
 
				+  test("handles empty content gracefully", async () => {
			
 
				+    const points = await getASTBreakPoints("", "empty.ts");
			
 
				+    expect(points).toEqual([]);
			
 
				+  });
			
 
				+
			
 
				+  test("handles syntactically invalid code gracefully", async () => {
			
 
				+    // Tree-sitter is error-tolerant, so this should still parse (with error nodes)
			
 
				+    // but should not crash
			
 
				+    const points = await getASTBreakPoints("function { broken syntax %%%", "broken.ts");
			
 
				+    // Should either return some partial break points or empty array — not throw
			
 
				+    expect(Array.isArray(points)).toBe(true);
			
 
				+  });
			
 
				+});
			
 
				+
			
 
				+// =============================================================================
			
 
				+// Symbol Extraction Stub (Phase 2)
			
 
				+// =============================================================================
			
 
				+
			
 
				+describe("extractSymbols", () => {
			
 
				+  test("returns empty array (Phase 2 stub)", () => {
			
 
				+    const symbols = extractSymbols("function foo() {}", "typescript", 0, 18);
			
 
				+    expect(symbols).toEqual([]);
			
 
				+  });
			
 
				+});
			
--- a/test/store.test.ts
+++ b/test/store.test.ts
@@ -29,6 +29,9 @@ import {
 
				   formatDocForEmbedding,
			
 
				   chunkDocument,
			
 
				   chunkDocumentByTokens,
			
 
				+  chunkDocumentAsync,
			
 
				+  chunkDocumentWithBreakPoints,
			
 
				+  mergeBreakPoints,
			
 
				   scanBreakPoints,
			
 
				   findCodeFences,
			
 
				   isInsideCodeFence,
			
@@ -1020,6 +1023,127 @@ Final section content.
 
				   });
			
 
				 });
			
 
				 
			
 
				+// =============================================================================
			
 
				+// AST-Aware Chunking Integration Tests
			
 
				+// =============================================================================
			
 
				+
			
 
				+describe("mergeBreakPoints", () => { // Expectations: one entry per position, highest score kept, output ordered by pos
			
 
				+  test("merges two sets of break points keeping highest score at each position", () => {
			
 
				+    const regexPoints: BreakPoint[] = [
			
 
				+      { pos: 10, score: 20, type: "blank" }, // same position as the first AST point below
			
 
				+      { pos: 50, score: 1, type: "newline" },
			
 
				+    ];
			
 
				+    const astPoints: BreakPoint[] = [
			
 
				+      { pos: 10, score: 90, type: "ast:func" },
			
 
				+      { pos: 100, score: 100, type: "ast:class" },
			
 
				+    ];
			
 
				+
			
 
				+    const merged = mergeBreakPoints(regexPoints, astPoints);
			
 
				+    expect(merged).toHaveLength(3); // 4 inputs, two of which share pos 10 -> 3 distinct positions
			
 
				+
			
 
				+    // pos 10 is in both lists: the AST entry (score 90) must win over regex (score 20), type included
			
 
				+    const at10 = merged.find(p => p.pos === 10);
			
 
				+    expect(at10?.score).toBe(90);
			
 
				+    expect(at10?.type).toBe("ast:func");
			
 
				+
			
 
				+    // pos 50: present only in the regex list, score preserved
			
 
				+    expect(merged.find(p => p.pos === 50)?.score).toBe(1);
			
 
				+
			
 
				+    // pos 100: present only in the AST list, score preserved
			
 
				+    expect(merged.find(p => p.pos === 100)?.score).toBe(100);
			
 
				+  });
			
 
				+
			
 
				+  test("returns sorted by position", () => {
			
 
				+    const a: BreakPoint[] = [{ pos: 100, score: 10, type: "a" }]; // larger position is passed first on purpose
			
 
				+    const b: BreakPoint[] = [{ pos: 5, score: 20, type: "b" }];
			
 
				+    const merged = mergeBreakPoints(a, b);
			
 
				+    expect(merged[0]!.pos).toBe(5);
			
 
				+    expect(merged[1]!.pos).toBe(100);
			
 
				+  });
			
 
				+});
			
 
				+
			
 
				+describe("chunkDocumentWithBreakPoints", () => { // Equivalence check against chunkDocument on identical content
			
 
				+  test("produces same output as chunkDocument for same input", () => {
			
 
				+    const content = "a".repeat(5000) + "\n\n" + "b".repeat(5000); // two 5000-char runs separated by one blank line
			
 
				+    const breakPoints = scanBreakPoints(content); // computed up front and passed in explicitly
			
 
				+    const codeFences = findCodeFences(content); // likewise passed in explicitly
			
 
				+
			
 
				+    const chunksOriginal = chunkDocument(content);
			
 
				+    const chunksNew = chunkDocumentWithBreakPoints(content, breakPoints, codeFences);
			
 
				+
			
 
				+    expect(chunksNew.length).toBe(chunksOriginal.length);
			
 
				+    for (let i = 0; i < chunksNew.length; i++) { // chunk-by-chunk comparison of both text and pos
			
 
				+      expect(chunksNew[i]!.text).toBe(chunksOriginal[i]!.text);
			
 
				+      expect(chunksNew[i]!.pos).toBe(chunksOriginal[i]!.pos);
			
 
				+    }
			
 
				+  });
			
 
				+});
			
 
				+
			
 
				+describe("AST-aware chunkDocumentAsync", () => {
			
 
				+  const TS_CODE = `import { Database } from './db';
			
 
				+
			
 
				+export class AuthService {
			
 
				+  constructor(private db: Database) {}
			
 
				+
			
 
				+  async authenticate(user: User, token: string): Promise<boolean> {
			
 
				+    const session = await this.db.findSession(token);
			
 
				+    return session?.userId === user.id;
			
 
				+  }
			
 
				+
			
 
				+  validateToken(token: string): boolean {
			
 
				+    return token.length === 64;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+export function hashPassword(password: string): string {
			
 
				+  return crypto.createHash('sha256').update(password).digest('hex');
			
 
				+}
			
 
				+`.repeat(10); // Repeat to make it large enough to trigger chunking
			
 
				+
			
 
				+  test("returns chunks for code files with AST strategy", async () => {
			
 
				+    const chunks = await chunkDocumentAsync(TS_CODE, undefined, undefined, undefined, "auth.ts", "auto"); // three middle arguments passed as undefined; filepath "auth.ts", strategy "auto"
			
 
				+    expect(chunks.length).toBeGreaterThan(0); // NOTE(review): also true for a single chunk, so this does not prove the fixture was actually split
			
 
				+    // Shape check only: every chunk has non-empty string text and a non-negative pos
			
 
				+    for (const chunk of chunks) {
			
 
				+      expect(typeof chunk.text).toBe("string");
			
 
				+      expect(chunk.text.length).toBeGreaterThan(0);
			
 
				+      expect(chunk.pos).toBeGreaterThanOrEqual(0);
			
 
				+    }
			
 
				+  });
			
 
				+
			
 
				+  test("regex strategy produces same output as chunkDocument for code files", async () => {
			
 
				+    const asyncChunks = await chunkDocumentAsync(TS_CODE, undefined, undefined, undefined, "auth.ts", "regex"); // explicit "regex" strategy even though a .ts filepath is supplied
			
 
				+    const syncChunks = chunkDocument(TS_CODE); // baseline from the synchronous chunker
			
 
				+
			
 
				+    expect(asyncChunks.length).toBe(syncChunks.length);
			
 
				+    for (let i = 0; i < asyncChunks.length; i++) { // both text and pos must match chunk-for-chunk
			
 
				+      expect(asyncChunks[i]!.text).toBe(syncChunks[i]!.text);
			
 
				+      expect(asyncChunks[i]!.pos).toBe(syncChunks[i]!.pos);
			
 
				+    }
			
 
				+  });
			
 
				+
			
 
				+  test("markdown files are unchanged in auto mode", async () => {
			
 
				+    const mdContent = ("# Heading\n\n" + "Some text. ".repeat(200) + "\n\n").repeat(10); // 10 sections: a heading followed by 200 repetitions of "Some text. "
			
 
				+    const asyncChunks = await chunkDocumentAsync(mdContent, undefined, undefined, undefined, "readme.md", "auto"); // "auto" strategy with a .md filepath
			
 
				+    const syncChunks = chunkDocument(mdContent); // baseline from the synchronous chunker
			
 
				+
			
 
				+    expect(asyncChunks.length).toBe(syncChunks.length);
			
 
				+    for (let i = 0; i < asyncChunks.length; i++) {
			
 
				+      expect(asyncChunks[i]!.text).toBe(syncChunks[i]!.text); // NOTE(review): only text is compared here; the regex-strategy test in this suite also compares pos
			
 
				+    }
			
 
				+  });
			
 
				+
			
 
				+  test("no filepath falls back to regex-only", async () => {
			
 
				+    const asyncChunks = await chunkDocumentAsync(TS_CODE, undefined, undefined, undefined, undefined, "auto"); // filepath argument is undefined while the strategy is still "auto"
			
 
				+    const syncChunks = chunkDocument(TS_CODE); // baseline from the synchronous chunker
			
 
				+
			
 
				+    expect(asyncChunks.length).toBe(syncChunks.length);
			
 
				+    for (let i = 0; i < asyncChunks.length; i++) {
			
 
				+      expect(asyncChunks[i]!.text).toBe(syncChunks[i]!.text); // NOTE(review): only text is compared here; the regex-strategy test in this suite also compares pos
			
 
				+    }
			
 
				+  });
			
 
				+});
			
 
				+
			
 
				 // =============================================================================
			
 
				 // Caching Tests
			
 
				 // =============================================================================