2 mesi fa · c464952b1d
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@@ -1654,8 +1654,8 @@ function parseEmbedBatchOption(name: string, value: unknown): number | undefined
 
				+/**
				+ * Validate the raw `--chunk-strategy` CLI value.
				+ *
				+ * Returns `undefined` when the option was not supplied. Any other value is
				+ * converted with `String()` and must be exactly "auto", "regex", or
				+ * "function"; anything else throws an `Error` naming the rejected value.
				+ */
				 function parseChunkStrategy(value: unknown): ChunkStrategy | undefined {
			
 
				   if (value === undefined) return undefined;
			
 
				   const s = String(value);
			
 
				-  if (s === "auto" || s === "regex") return s;
			
 
				-  throw new Error(`--chunk-strategy must be "auto" or "regex" (got "${s}")`);
			
 
				+  if (s === "auto" || s === "regex" || s === "function") return s;
			
 
				+  throw new Error(`--chunk-strategy must be "auto", "regex", or "function" (got "${s}")`);
			
 
				 }
			
 
				 
			
 
				 async function vectorIndex(
			
--- a/src/collections.ts
+++ b/src/collections.ts
@@ -9,6 +9,7 @@ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
 
				 import { join, dirname } from "path";
			
 
				 import { homedir } from "os";
			
 
				 import YAML from "yaml";
			
 
				+import type { ChunkStrategy } from "./store.js";
			
 
				 
			
 
				 // ============================================================================
			
 
				 // Types
			
@@ -31,6 +32,21 @@ export interface Collection {
 
				   context?: ContextMap;      // Optional context definitions
			
 
				   update?: string;           // Optional bash command to run during qmd update
			
 
				   includeByDefault?: boolean; // Include in queries by default (default: true)
			
 
				+  /**
			
 
				+   * Chunking strategy for this collection (Phase 2 — i-bud0h8vu). When
			
 
				+   * unset, qmd falls back to the global CLI `--chunk-strategy` flag.
			
 
				+   *
			
 
				+   *   - "auto"     — char-based chunks with AST break points as hints.
			
 
				+   *   - "regex"    — char-based chunks without AST hints (legacy).
			
 
				+   *   - "function" — one chunk per AST function/class/method range for
			
 
				+   *                  supported code files. Opt-in per collection; files
			
 
				+   *                  with zero detected ranges fall back to "auto".
			
 
				+   *
			
 
				+   * Changing this value requires a per-collection force-reindex
			
 
				+   * (`qmd update --force <collection>`). The `content_hash`-keyed rows
			
 
				+   * replace in-place, so other collections are unaffected.
			
 
				+   */
			
 
				+  chunkStrategy?: ChunkStrategy;
			
 
				 }
			
 
				 
			
 
				 /**
			
--- a/src/store.ts
+++ b/src/store.ts
@@ -227,7 +227,7 @@ export function findBestCutoff(
 
				 // Chunk Strategy
			
 
				 // =============================================================================
			
 
				 
			
 
				-export type ChunkStrategy = "auto" | "regex";
			
 
				+/**
				+ * How a document is split into chunks:
				+ *   - "auto"     — regex break points merged with AST break points.
				+ *   - "regex"    — regex break points only.
				+ *   - "function" — one chunk per AST function/class range, with gaps
				+ *                  char-chunked; falls back to "auto" when no ranges
				+ *                  are detected.
				+ */
				+export type ChunkStrategy = "auto" | "regex" | "function";
			
 
				 
			
 
				 /**
			
 
				  * Merge two sets of break points (e.g. regex + AST), keeping the highest
			
@@ -1298,6 +1298,7 @@ type PendingEmbeddingDoc = {
 
				   hash: string;
			
 
				   path: string;
			
 
				   bytes: number;
			
 
				+  collection: string;
			
 
				 };
			
 
				 
			
 
				 type EmbeddingDoc = PendingEmbeddingDoc & {
			
@@ -1330,8 +1331,13 @@ function resolveEmbedOptions(options?: EmbedOptions): Required<Pick<EmbedOptions
 
				 }
			
 
				 
			
 
				 function getPendingEmbeddingDocs(db: Database): PendingEmbeddingDoc[] {
			
 
				+  // `MIN(d.collection)` picks one collection per hash when the same content
				+  // is indexed in multiple collections: the smallest name under the column's
				+  // collation, so the choice is deterministic. The chunkStrategy lookup
				+  // resolves via that collection's YAML config, so when collections sharing
				+  // a hash configure different strategies, the winning collection decides
				+  // how that content is chunked. Note that `MIN(d.path)` and
				+  // `MIN(d.collection)` are independent aggregates and may come from
				+  // different rows. See Phase 2 design notes (i-bud0h8vu).
			
 
				   return db.prepare(`
			
 
				-    SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
			
 
				+    SELECT d.hash, MIN(d.path) as path, MIN(d.collection) as collection, length(CAST(c.doc AS BLOB)) as bytes
			
 
				     FROM documents d
			
 
				     JOIN content c ON d.hash = c.hash
			
 
				     LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
			
@@ -1417,6 +1423,23 @@ export async function generateEmbeddings(
 
				   const totalDocs = docsToEmbed.length;
			
 
				   const startTime = Date.now();
			
 
				 
			
 
				+  // Per-collection chunkStrategy lookup (Phase 2 — i-bud0h8vu). YAML
			
 
				+  // `chunkStrategy` on a collection wins over `options.chunkStrategy`
			
 
				+  // (global CLI flag); falls back to the global option, then to
			
 
				+  // chunkDocumentByTokens' own "regex" default when neither is set.
			
 
				+  // Opt-in per collection — collections without the field are untouched.
			
 
				+  const collectionStrategies = new Map<string, ChunkStrategy>();
			
 
				+  try {
			
 
				+    const { listCollections: listYamlCollections } = await import("./collections.js");
			
 
				+    for (const c of listYamlCollections()) {
			
 
				+      if (c.chunkStrategy) collectionStrategies.set(c.name, c.chunkStrategy);
			
 
				+    }
			
 
				+  } catch {
			
 
				+    // If YAML config is missing/unreadable, fall back silently to the
			
 
				+    // global strategy — no collection overrides. Keeps SDK/inline
			
 
				+    // callers that never touch ~/.config/qmd working.
			
 
				+  }
			
 
				+
			
 
				   // Use store's LlamaCpp or global singleton, wrapped in a session
			
 
				   const llm = getLlm(store);
			
 
				   const embedModelUri = llm.embedModelName;
			
@@ -1446,11 +1469,13 @@ export async function generateEmbeddings(
 
				         if (!doc.body.trim()) continue;
			
 
				 
			
 
				         const title = extractTitle(doc.body, doc.path);
			
 
				+        const perCollectionStrategy = collectionStrategies.get(doc.collection);
			
 
				+        const chunkStrategy = perCollectionStrategy ?? options?.chunkStrategy;
			
 
				         const chunks = await chunkDocumentByTokens(
			
 
				           doc.body,
			
 
				           undefined, undefined, undefined,
			
 
				           doc.path,
			
 
				-          options?.chunkStrategy,
			
 
				+          chunkStrategy,
			
 
				           session.signal,
			
 
				         );
			
 
				 
			
@@ -2171,8 +2196,12 @@ export function chunkDocument(
 
				  * break points for supported code files, merges with regex break points,
			
 
				  * and delegates to the shared chunk algorithm.
			
 
				  *
			
 
				- * Falls back to regex-only when strategy is "regex", filepath is absent,
			
 
				- * or language is unsupported.
			
 
				+ * Strategies:
			
 
				+ *   - "regex"    (default) — char-based chunking with regex break points only.
			
 
				+ *   - "auto"     — regex break points merged with AST break points (soft hints).
			
 
				+ *   - "function" — one chunk per AST function range (Phase 2); inter-range
			
 
				+ *                  gaps (imports, top-level code) are char-chunked with AST
			
 
				+ *                  hints. Falls back to "auto" when zero ranges are detected.
			
 
				  */
			
 
				 export async function chunkDocumentAsync(
			
 
				   content: string,
			
@@ -2185,6 +2214,29 @@ export async function chunkDocumentAsync(
 
				   const regexPoints = scanBreakPoints(content);
			
 
				   const codeFences = findCodeFences(content);
			
 
				 
			
 
				+  // "function" strategy: delegate to the function-level chunker. If no
			
 
				+  // ranges are detected (markdown, unsupported lang, parse failure), fall
			
 
				+  // back to "auto" behavior (AST-break-point-assisted char chunking).
			
 
				+  if (chunkStrategy === "function" && filepath) {
			
 
				+    const { getASTFunctionRanges, getASTBreakPoints } = await import("./ast.js");
			
 
				+    const ranges = await getASTFunctionRanges(content, filepath);
			
 
				+    if (ranges.length > 0) {
			
 
				+      return chunkByFunctionRanges(
			
 
				+        content,
			
 
				+        ranges,
			
 
				+        regexPoints,
			
 
				+        codeFences,
			
 
				+        maxChars,
			
 
				+        overlapChars,
			
 
				+        windowChars,
			
 
				+      );
			
 
				+    }
			
 
				+    // Zero ranges — fall through to auto behavior so break points still help.
			
 
				+    const astPoints = await getASTBreakPoints(content, filepath);
			
 
				+    const merged = astPoints.length > 0 ? mergeBreakPoints(regexPoints, astPoints) : regexPoints;
			
 
				+    return chunkDocumentWithBreakPoints(content, merged, codeFences, maxChars, overlapChars, windowChars);
			
 
				+  }
			
 
				+
			
 
				   let breakPoints = regexPoints;
			
 
				   if (chunkStrategy === "auto" && filepath) {
			
 
				     const { getASTBreakPoints } = await import("./ast.js");
			
@@ -2197,6 +2249,99 @@ export async function chunkDocumentAsync(
 
				   return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
			
 
				 }
			
 
				 
			
 
				+/**
				+ * Produce one chunk per AST function range, plus char-chunks for the gaps
				+ * between ranges (imports, top-level code). Any slice longer than
				+ * `maxChars` (gap or range) is further split using the existing char-based
				+ * algorithm so we never emit a single oversized chunk.
				+ *
				+ * `ranges` is expected to be non-empty, sorted by `startIndex`, and
				+ * non-overlapping (as produced by `getASTFunctionRanges`). Defensively, a
				+ * range that starts before the end of the previous one, or extends past
				+ * the end of `content`, is clipped to the part not yet emitted, so text is
				+ * never emitted twice and the cursor never moves backwards.
				+ *
				+ * @param content      Full document text.
				+ * @param ranges       Function/class ranges as offsets into `content`.
				+ * @param regexPoints  Break points for the whole document (absolute offsets).
				+ * @param codeFences   Code-fence regions for the whole document (absolute offsets).
				+ * @param maxChars     Slices longer than this are split further.
				+ * @param overlapChars Forwarded to `chunkDocumentWithBreakPoints` for oversized slices.
				+ * @param windowChars  Forwarded to `chunkDocumentWithBreakPoints` for oversized slices.
				+ * @returns Chunks in document order; `pos` is the absolute offset into `content`.
				+ */
				+function chunkByFunctionRanges(
				+  content: string,
				+  ranges: import("./ast.js").FunctionRange[],
				+  regexPoints: BreakPoint[],
				+  codeFences: CodeFenceRegion[],
				+  maxChars: number,
				+  overlapChars: number,
				+  windowChars: number,
				+): { text: string; pos: number }[] {
				+  const out: { text: string; pos: number }[] = [];
				+
				+  // Emit content[start, end) as a single chunk, or as several when it
				+  // exceeds maxChars. Shared by gaps and function ranges so both are
				+  // split by exactly the same rules.
				+  const emitSlice = (start: number, end: number, dropBlank: boolean) => {
				+    if (start >= end) return;
				+    const text = content.slice(start, end);
				+    if (text.length === 0) return;
				+    // Whitespace-only gaps are dropped — they carry no embeddable signal.
				+    if (dropBlank && !text.trim()) return;
				+
				+    if (text.length <= maxChars) {
				+      out.push({ text, pos: start });
				+      return;
				+    }
				+
				+    // Oversized slice: reuse the char-based algorithm. Restrict break
				+    // points and code fences to the slice and rebase their positions so
				+    // chunkDocumentWithBreakPoints operates on a standalone string.
				+    const subPoints = regexPoints
				+      .filter(p => p.pos >= start && p.pos < end)
				+      .map(p => ({ ...p, pos: p.pos - start }));
				+    const subFences = codeFences
				+      .filter(f => f.end > start && f.start < end)
				+      .map(f => ({
				+        start: Math.max(0, f.start - start),
				+        end: Math.max(0, Math.min(end, f.end) - start),
				+      }));
				+    const sub = chunkDocumentWithBreakPoints(text, subPoints, subFences, maxChars, overlapChars, windowChars);
				+    // Translate slice-relative positions back to absolute offsets.
				+    for (const c of sub) out.push({ text: c.text, pos: start + c.pos });
				+  };
				+
				+  let cursor = 0;
				+
				+  for (const range of ranges) {
				+    // Clip the range to the part of the document not yet emitted. For
				+    // well-formed input this is a no-op.
				+    const rangeStart = Math.max(cursor, range.startIndex);
				+    const rangeEnd = Math.min(content.length, range.endIndex);
				+
				+    // Leading / inter-range gap (imports, top-level code).
				+    emitSlice(cursor, rangeStart, true);
				+    // The function/class itself; never dropped for being blank.
				+    emitSlice(rangeStart, rangeEnd, false);
				+
				+    cursor = Math.max(rangeStart, rangeEnd);
				+  }
				+
				+  // Trailing gap after the last range.
				+  emitSlice(cursor, content.length, true);
				+
				+  // Edge case: nothing was emitted (e.g. only whitespace-only gaps).
				+  // Preserve the invariant that non-empty content yields at least one
				+  // chunk.
				+  if (out.length === 0 && content.length > 0) {
				+    return [{ text: content, pos: 0 }];
				+  }
				+
				+  return out;
				+}
			
 
				+
			
 
				 /**
			
 
				  * Chunk a document by actual token count using the LLM tokenizer.
			
 
				  * More accurate than character-based chunking but requires async.
			
--- a/test/ast-chunking.test.ts
+++ b/test/ast-chunking.test.ts
@@ -197,3 +197,71 @@ describe("AST break point scores", () => {
 
				     expect(points.find(p => p.type === "ast:enum")?.score).toBe(80);
			
 
				   });
			
 
				 });
			
 
				+
			
 
				+// ==========================================================================
			
 
				+// Function-level chunk strategy (Phase 2)
			
 
				+// ==========================================================================
			
 
				+
			
 
				+describe("chunkDocumentAsync with chunkStrategy='function'", () => {
				+  // Fixture: one import line (the leading gap) followed by three top-level
				+  // units separated by blank lines.
				+  const TS_CODE = `import { X } from "./x";
				+
				+export function alpha(): number {
				+  const start = Date.now();
				+  return start;
				+}
				+
				+export function beta(): number {
				+  return 42;
				+}
				+
				+export class Gamma {
				+  constructor() {}
				+  run(): void {}
				+}
				+`;
				+
				+  // Marker strings that each occur exactly once in TS_CODE.
				+  const UNITS = ["function alpha", "function beta", "class Gamma"];
				+
				+  test("produces one chunk per top-level code unit + import gap", async () => {
				+    const chunks = await chunkDocumentAsync(TS_CODE, undefined, undefined, undefined, "x.ts", "function");
				+    // import-gap, alpha, beta, Gamma = at least 4 chunks. The blank lines
				+    // between units are whitespace-only gaps and produce no chunk.
				+    expect(chunks.length).toBeGreaterThanOrEqual(4);
				+  });
				+
				+  test("each function chunk contains exactly one function/class body", async () => {
				+    const chunks = await chunkDocumentAsync(TS_CODE, undefined, undefined, undefined, "x.ts", "function");
				+
				+    for (const unit of UNITS) {
				+      // Every unit lives in exactly one chunk...
				+      const owners = chunks.filter(c => c.text.includes(unit));
				+      expect(owners.length).toBe(1);
				+
				+      // ...and that chunk holds none of the other units.
				+      for (const other of UNITS) {
				+        if (other === unit) continue;
				+        expect(owners[0]!.text.includes(other)).toBe(false);
				+      }
				+    }
				+  });
				+
				+  test("pos reflects absolute offset in original content", async () => {
				+    const chunks = await chunkDocumentAsync(TS_CODE, undefined, undefined, undefined, "x.ts", "function");
				+    for (const c of chunks) {
				+      expect(c.pos).toBeGreaterThanOrEqual(0);
				+      // The fixture is far below maxChars, so every chunk is an exact
				+      // slice of the source starting at `pos`.
				+      expect(TS_CODE.slice(c.pos, c.pos + c.text.length)).toBe(c.text);
				+    }
				+  });
				+
				+  test("markdown falls back to auto behavior when chunkStrategy='function'", async () => {
				+    // Markdown → detectLanguage returns null → getASTFunctionRanges returns []
				+    // → fall through to auto behavior → short markdown = 1 chunk.
				+    const md = "# Heading\n\nSome paragraph text.";
				+    const chunks = await chunkDocumentAsync(md, undefined, undefined, undefined, "readme.md", "function");
				+    expect(chunks.length).toBe(1);
				+    expect(chunks[0]!.text).toBe(md);
				+  });
				+
				+  test("code file with only bare statements falls back to auto (no ranges)", async () => {
				+    const bare = "const x = 1;\nconst y = 2;\n";
				+    const chunks = await chunkDocumentAsync(bare, undefined, undefined, undefined, "bare.ts", "function");
				+    expect(chunks.length).toBe(1);
				+  });
				+});
			
--- a/test/ast.test.ts
+++ b/test/ast.test.ts
@@ -6,7 +6,7 @@
 
				  */
			
 
				 
			
 
				 import { describe, test, expect } from "vitest";
			
 
				-import { detectLanguage, getASTBreakPoints, extractSymbols } from "../src/ast.js";
			
 
				+import { detectLanguage, getASTBreakPoints, getASTFunctionRanges, extractSymbols } from "../src/ast.js";
			
 
				 import type { SupportedLanguage } from "../src/ast.js";
			
 
				 
			
 
				 // =============================================================================
			
@@ -317,6 +317,144 @@ describe("getASTBreakPoints - error handling", () => {
 
				   });
			
 
				 });
			
 
				 
			
 
				+// =============================================================================
			
 
				+// Function-Level Range Extraction (Phase 2)
			
 
				+// =============================================================================
			
 
				+
			
 
				+describe("getASTFunctionRanges - TypeScript", () => {
				+  // Fixture with an import, an interface, a type alias, a class with
				+  // methods, an exported function and an arrow-function const.
				+  const TS_SAMPLE = `import { Database } from './db';
				+
				+interface Config {
				+  secret: string;
				+}
				+
				+type UserId = string;
				+
				+export class Service {
				+  constructor(private db: Database) {}
				+
				+  async fetch(id: UserId): Promise<string> {
				+    return this.db.get(id);
				+  }
				+
				+  parse(raw: string): string {
				+    return raw.trim();
				+  }
				+}
				+
				+export function helper(x: string): string {
				+  return x.toUpperCase();
				+}
				+
				+const arrow = (n: number): number => n + 1;
				+`;
				+
				+  // Every test parses the same fixture under the same path.
				+  const loadRanges = () => getASTFunctionRanges(TS_SAMPLE, "src/service.ts");
				+
				+  // Source text covered by a range.
				+  const textOf = (r: { startIndex: number; endIndex: number }) =>
				+    TS_SAMPLE.slice(r.startIndex, r.endIndex);
				+
				+  test("returns one range per top-level code unit", async () => {
				+    const found = await loadRanges();
				+    // Candidate units: interface, type, export class, export function,
				+    // const arrow. Class methods belong to the class range. The assertion
				+    // only requires four of them.
				+    expect(found.length).toBeGreaterThanOrEqual(4);
				+  });
				+
				+  test("ranges are sorted by startIndex", async () => {
				+    const found = await loadRanges();
				+    found.forEach((current, idx) => {
				+      if (idx === 0) return;
				+      expect(current.startIndex).toBeGreaterThanOrEqual(found[idx - 1]!.startIndex);
				+    });
				+  });
				+
				+  test("ranges do not overlap", async () => {
				+    const found = await loadRanges();
				+    found.forEach((current, idx) => {
				+      if (idx === 0) return;
				+      // A range may only begin once the previous one has ended.
				+      expect(current.startIndex).toBeGreaterThanOrEqual(found[idx - 1]!.endIndex);
				+    });
				+  });
				+
				+  test("each range slice is non-empty and starts at a recognizable token", async () => {
				+    const found = await loadRanges();
				+    for (const entry of found) {
				+      const covered = textOf(entry);
				+      expect(covered.length).toBeGreaterThan(0);
				+      const startsWithKeyword = /^(export|class|interface|type|function|const)\b/.test(covered.trimStart());
				+      expect(startsWithKeyword).toBe(true);
				+    }
				+  });
				+
				+  test("export class range is captured as one unit (not split into methods)", async () => {
				+    const found = await loadRanges();
				+    // The class header and its last method must sit in the same range.
				+    const wholeClass = found.find(entry => {
				+      const covered = textOf(entry);
				+      return covered.includes("class Service") && covered.includes("parse(");
				+    });
				+    expect(wholeClass).toBeDefined();
				+  });
				+});
			
 
				+
			
 
				+describe("getASTFunctionRanges - Python", () => {
				+  // Fixture: an import, a class with two methods, a decorated function and
				+  // a plain function.
				+  const PY_SAMPLE = `import os
				+
				+class Service:
				+    def __init__(self):
				+        self.x = 1
				+
				+    def run(self):
				+        return self.x
				+
				+@decorator
				+def decorated_func():
				+    return 42
				+
				+def plain_func():
				+    return 1
				+`;
				+
				+  // Parse the shared fixture as a Python file.
				+  const loadRanges = () => getASTFunctionRanges(PY_SAMPLE, "service.py");
				+
				+  test("captures class and function definitions (including decorated)", async () => {
				+    const found = await loadRanges();
				+    expect(found.length).toBeGreaterThanOrEqual(3);
				+
				+    const hasClass = found.some(entry => entry.type === "ast:class");
				+    const hasFunction = found.some(entry => entry.type === "ast:func" || entry.type === "ast:decorated");
				+    expect(hasClass).toBe(true);
				+    expect(hasFunction).toBe(true);
				+  });
				+
				+  test("decorated function range includes the decorator", async () => {
				+    const found = await loadRanges();
				+    const target = found.find(entry =>
				+      PY_SAMPLE.slice(entry.startIndex, entry.endIndex).includes("decorated_func"),
				+    );
				+    expect(target).toBeDefined();
				+
				+    // The range must begin at the decorator line, not at `def`.
				+    const covered = PY_SAMPLE.slice(target!.startIndex, target!.endIndex);
				+    expect(covered.trimStart().startsWith("@decorator")).toBe(true);
				+  });
				+});
			
 
				+
			
 
				+describe("getASTFunctionRanges - error handling", () => {
				+  // Inputs for which no ranges at all are expected: [test title, source, file path].
				+  const emptyResultCases: Array<[string, string, string]> = [
				+    ["returns empty array for markdown", "# Hello", "README.md"],
				+    ["returns empty array for unknown extension", "noop", "notes.txt"],
				+    ["returns empty array for empty file", "", "empty.ts"],
				+  ];
				+
				+  for (const [title, source, file] of emptyResultCases) {
				+    test(title, async () => {
				+      const found = await getASTFunctionRanges(source, file);
				+      expect(found).toEqual([]);
				+    });
				+  }
				+
				+  test("handles garbage input gracefully (non-throwing)", async () => {
				+    // Only the shape of the result is checked; the call must not reject.
				+    const found = await getASTFunctionRanges("function {{ broken !!", "broken.ts");
				+    expect(Array.isArray(found)).toBe(true);
				+  });
				+
				+  test("returns empty array for content with no top-level units", async () => {
				+    const source = "const x = 1;\nconst y = 2;\n";
				+    const found = await getASTFunctionRanges(source, "vars.ts");
				+    // lexical_declaration only matches when value is arrow_function/function_expression
				+    expect(found).toEqual([]);
				+  });
				+});
			
 
				+
			
 
				 // =============================================================================
			
 
				 // Symbol Extraction Stub (Phase 2)
			
 
				 // =============================================================================
			
--- a/test/collections-config.test.ts
+++ b/test/collections-config.test.ts
@@ -7,8 +7,16 @@
 
				 
			
 
				 import { describe, test, expect, beforeEach, afterEach } from "vitest";
			
 
				 import { join } from "path";
			
 
				-import { homedir } from "os";
			
 
				-import { getConfigPath, setConfigIndexName } from "../src/collections.js";
			
 
				+import { homedir, tmpdir } from "os";
			
 
				+import { mkdtempSync, rmSync, readFileSync } from "fs";
			
 
				+import {
			
 
				+  getConfigPath,
			
 
				+  setConfigIndexName,
			
 
				+  setConfigSource,
			
 
				+  loadConfig,
			
 
				+  saveConfig,
			
 
				+} from "../src/collections.js";
			
 
				+import type { CollectionConfig } from "../src/collections.js";
			
 
				 
			
 
				 // Save/restore env vars around each test
			
 
				 let savedEnv: Record<string, string | undefined>;
			
@@ -72,3 +80,75 @@ describe("getConfigDir via getConfigPath", () => {
 
				     expect(getConfigPath()).toBe(join("/xdg/config", "qmd", "myindex.yml"));
			
 
				   });
			
 
				 });
			
 
				+
			
 
				+// ============================================================================
			
 
				+// chunkStrategy schema round-trip (Phase 2 — i-bud0h8vu)
			
 
				+// ============================================================================
			
 
				+
			
 
				+describe("Collection.chunkStrategy YAML round-trip", () => {
				+  let tmpDir: string;
				+
				+  // Write the config to disk and read it straight back.
				+  const roundTrip = (config: CollectionConfig) => {
				+    saveConfig(config);
				+    return loadConfig();
				+  };
				+
				+  beforeEach(() => {
				+    // Each test gets a private config directory holding "index.yml".
				+    tmpDir = mkdtempSync(join(tmpdir(), "qmd-chunkstrategy-"));
				+    process.env.QMD_CONFIG_DIR = tmpDir;
				+    setConfigIndexName("index");
				+  });
				+
				+  afterEach(() => {
				+    // Reset config source so inline state does not leak between tests.
				+    setConfigSource();
				+    try {
				+      rmSync(tmpDir, { recursive: true, force: true });
				+    } catch {
				+      // Cleanup is best-effort only.
				+    }
				+  });
				+
				+  test("chunkStrategy field persists through save/load cycle", () => {
				+    const reloaded = roundTrip({
				+      collections: {
				+        "oivo-cli": {
				+          path: "/srv/cli/src",
				+          pattern: "**/*.ts",
				+          chunkStrategy: "function",
				+        },
				+        // No chunkStrategy here: it must still be unset after the round-trip.
				+        "oivo-docs": {
				+          path: "/srv/docs",
				+          pattern: "**/*.md",
				+        },
				+      },
				+    });
				+
				+    expect(reloaded.collections["oivo-cli"]?.chunkStrategy).toBe("function");
				+    expect(reloaded.collections["oivo-docs"]?.chunkStrategy).toBeUndefined();
				+  });
				+
				+  test("chunkStrategy 'auto' and 'regex' round-trip", () => {
				+    const reloaded = roundTrip({
				+      collections: {
				+        a: { path: "/a", pattern: "*.ts", chunkStrategy: "auto" },
				+        b: { path: "/b", pattern: "*.ts", chunkStrategy: "regex" },
				+      },
				+    });
				+
				+    expect(reloaded.collections.a?.chunkStrategy).toBe("auto");
				+    expect(reloaded.collections.b?.chunkStrategy).toBe("regex");
				+  });
				+
				+  test("omitted chunkStrategy does not appear in serialized YAML", () => {
				+    const plainOnly: CollectionConfig = {
				+      collections: {
				+        plain: { path: "/p", pattern: "*.md" },
				+      },
				+    };
				+    saveConfig(plainOnly);
				+
				+    // Inspect the raw file rather than the parsed config.
				+    const serialized = readFileSync(join(tmpDir, "index.yml"), "utf-8");
				+    expect(serialized).not.toContain("chunkStrategy");
				+  });
				+});