Sfoglia il codice sorgente

feat(qmd): Phase 2 — function-level chunk strategy (i-bud0h8vu)

Ships the "function" ChunkStrategy: one chunk per AST function/class/
method range instead of char-window chunks. Opt-in per collection via
YAML chunkStrategy: function. Default unchanged for existing YAML.

Changes:
  * src/store.ts
    - Extend ChunkStrategy = "auto" | "regex" | "function".
    - chunkDocumentAsync branches on "function" to chunkByFunctionRanges,
      falling back to "auto" behavior when zero ranges detected.
    - Add chunkByFunctionRanges helper: one chunk per range, inter-range
      gaps char-chunked, oversized ranges re-split via the shared algo.
    - PendingEmbeddingDoc gains "collection" field.
    - getPendingEmbeddingDocs SELECTs MIN(d.collection) per hash.
    - generateEmbeddings resolves per-collection chunkStrategy from YAML
      via listCollections() Map lookup. Precedence:
      collection override > global option > chunkDocumentByTokens'
      built-in default "regex".
  * src/collections.ts
    - Collection interface gains optional chunkStrategy field. Omitted
      on save when unset (no behavior change for existing YAML).
  * src/cli/qmd.ts
    - parseChunkStrategy accepts "function" in addition to auto/regex.

Tests (+21 net):
  * test/ast.test.ts — getASTFunctionRanges unit tests: TS/Python
    (class/func/decorated), error handling.
  * test/ast-chunking.test.ts — chunkDocumentAsync chunkStrategy=
    "function" integration: per-unit chunks, no cross-unit leak, pos
    reflects absolute offset, markdown/bare-stmt fallback to auto.
  * test/collections-config.test.ts — chunkStrategy YAML round-trip.

Note: getASTFunctionRanges, FUNCTION_CAPTURE_NAMES and FunctionRange
already shipped in commit 89267c1: a sibling Phase 3 co-edit absorbed
the Phase 2 ast.ts additions under Session-Id i-76v1j1ld (documented
in a comment on issue i-bud0h8vu).

Verification:
  CI=1 npx vitest run test/ast.test.ts test/ast-chunking.test.ts
    test/collections-config.test.ts  ->  64/64 pass
  npx tsc -p tsconfig.build.json --noEmit  ->  0 errors
  workspace_typecheck({component: "cli"})  ->  0 errors

Session-Id: 71c39606
root 1 mese fa
parent
commit
c464952b1d
6 ha cambiato i file con 457 aggiunte e 10 eliminazioni
  1. 2 2
      src/cli/qmd.ts
  2. 16 0
      src/collections.ts
  3. 150 5
      src/store.ts
  4. 68 0
      test/ast-chunking.test.ts
  5. 139 1
      test/ast.test.ts
  6. 82 2
      test/collections-config.test.ts

+ 2 - 2
src/cli/qmd.ts

@@ -1654,8 +1654,8 @@ function parseEmbedBatchOption(name: string, value: unknown): number | undefined
 function parseChunkStrategy(value: unknown): ChunkStrategy | undefined {
   if (value === undefined) return undefined;
   const s = String(value);
-  if (s === "auto" || s === "regex") return s;
-  throw new Error(`--chunk-strategy must be "auto" or "regex" (got "${s}")`);
+  if (s === "auto" || s === "regex" || s === "function") return s;
+  throw new Error(`--chunk-strategy must be "auto", "regex", or "function" (got "${s}")`);
 }
 
 async function vectorIndex(

+ 16 - 0
src/collections.ts

@@ -9,6 +9,7 @@ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
 import { join, dirname } from "path";
 import { homedir } from "os";
 import YAML from "yaml";
+import type { ChunkStrategy } from "./store.js";
 
 // ============================================================================
 // Types
@@ -31,6 +32,21 @@ export interface Collection {
   context?: ContextMap;      // Optional context definitions
   update?: string;           // Optional bash command to run during qmd update
   includeByDefault?: boolean; // Include in queries by default (default: true)
+  /**
+   * Chunking strategy for this collection (Phase 2 — i-bud0h8vu). When
+   * unset, qmd falls back to the global CLI `--chunk-strategy` flag.
+   *
+   *   - "auto"     — char-based chunks with AST break points as hints.
+   *   - "regex"    — char-based chunks without AST hints (legacy).
+   *   - "function" — one chunk per AST function/class/method range for
+   *                  supported code files. Opt-in per collection; files
+   *                  with zero detected ranges fall back to "auto".
+   *
+   * Changing this value requires a per-collection force-reindex
+   * (`qmd update --force <collection>`). Rows are keyed by content hash:
+   * other collections are unaffected unless they share identical content.
+   */
+  chunkStrategy?: ChunkStrategy;
 }
 
 /**

+ 150 - 5
src/store.ts

@@ -227,7 +227,7 @@ export function findBestCutoff(
 // Chunk Strategy
 // =============================================================================
 
-export type ChunkStrategy = "auto" | "regex";
+export type ChunkStrategy = "auto" | "regex" | "function";
 
 /**
  * Merge two sets of break points (e.g. regex + AST), keeping the highest
@@ -1298,6 +1298,7 @@ type PendingEmbeddingDoc = {
   hash: string;
   path: string;
   bytes: number;
+  collection: string;
 };
 
 type EmbeddingDoc = PendingEmbeddingDoc & {
@@ -1330,8 +1331,13 @@ function resolveEmbedOptions(options?: EmbedOptions): Required<Pick<EmbedOptions
 }
 
 function getPendingEmbeddingDocs(db: Database): PendingEmbeddingDoc[] {
+  // `MIN(d.collection)` deterministically picks one collection per hash when
+  // the same content is indexed in multiple collections (the smallest name in
+  // SQLite's collation order). That collection's YAML chunkStrategy then
+  // applies to the shared content. NOTE: `MIN(d.path)` is aggregated separately
+  // and may come from a different row. See Phase 2 design notes (i-bud0h8vu).
   return db.prepare(`
-    SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
+    SELECT d.hash, MIN(d.path) as path, MIN(d.collection) as collection, length(CAST(c.doc AS BLOB)) as bytes
     FROM documents d
     JOIN content c ON d.hash = c.hash
     LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
@@ -1417,6 +1423,23 @@ export async function generateEmbeddings(
   const totalDocs = docsToEmbed.length;
   const startTime = Date.now();
 
+  // Per-collection chunkStrategy lookup (Phase 2 — i-bud0h8vu). YAML
+  // `chunkStrategy` on a collection wins over `options.chunkStrategy`
+  // (global CLI flag); falls back to the global option, then to
+  // chunkDocumentByTokens' own "regex" default when neither is set.
+  // Opt-in per collection — collections without the field are untouched.
+  const collectionStrategies = new Map<string, ChunkStrategy>();
+  try {
+    const { listCollections: listYamlCollections } = await import("./collections.js");
+    for (const c of listYamlCollections()) {
+      if (c.chunkStrategy) collectionStrategies.set(c.name, c.chunkStrategy);
+    }
+  } catch {
+    // If YAML config is missing/unreadable, fall back silently to the
+    // global strategy — no collection overrides. Keeps SDK/inline
+    // callers that never touch ~/.config/qmd working.
+  }
+
   // Use store's LlamaCpp or global singleton, wrapped in a session
   const llm = getLlm(store);
   const embedModelUri = llm.embedModelName;
@@ -1446,11 +1469,13 @@ export async function generateEmbeddings(
         if (!doc.body.trim()) continue;
 
         const title = extractTitle(doc.body, doc.path);
+        const perCollectionStrategy = collectionStrategies.get(doc.collection);
+        const chunkStrategy = perCollectionStrategy ?? options?.chunkStrategy;
         const chunks = await chunkDocumentByTokens(
           doc.body,
           undefined, undefined, undefined,
           doc.path,
-          options?.chunkStrategy,
+          chunkStrategy,
           session.signal,
         );
 
@@ -2171,8 +2196,12 @@ export function chunkDocument(
  * break points for supported code files, merges with regex break points,
  * and delegates to the shared chunk algorithm.
  *
- * Falls back to regex-only when strategy is "regex", filepath is absent,
- * or language is unsupported.
+ * Strategies:
+ *   - "regex"    (default) — char-based chunking with regex break points only.
+ *   - "auto"     — regex break points merged with AST break points (soft hints).
+ *   - "function" — one chunk per AST function range (Phase 2); inter-range
+ *                  gaps (imports, top-level code) are char-chunked with regex
+ *                  break points. Falls back to "auto" when no ranges are found.
  */
 export async function chunkDocumentAsync(
   content: string,
@@ -2185,6 +2214,29 @@ export async function chunkDocumentAsync(
   const regexPoints = scanBreakPoints(content);
   const codeFences = findCodeFences(content);
 
+  // "function" strategy: delegate to the function-level chunker. If no
+  // ranges are detected (markdown, unsupported lang, parse failure), fall
+  // back to "auto" behavior (AST-break-point-assisted char chunking).
+  if (chunkStrategy === "function" && filepath) {
+    const { getASTFunctionRanges, getASTBreakPoints } = await import("./ast.js");
+    const ranges = await getASTFunctionRanges(content, filepath);
+    if (ranges.length > 0) {
+      return chunkByFunctionRanges(
+        content,
+        ranges,
+        regexPoints,
+        codeFences,
+        maxChars,
+        overlapChars,
+        windowChars,
+      );
+    }
+    // Zero ranges — fall through to auto behavior so break points still help.
+    const astPoints = await getASTBreakPoints(content, filepath);
+    const merged = astPoints.length > 0 ? mergeBreakPoints(regexPoints, astPoints) : regexPoints;
+    return chunkDocumentWithBreakPoints(content, merged, codeFences, maxChars, overlapChars, windowChars);
+  }
+
   let breakPoints = regexPoints;
   if (chunkStrategy === "auto" && filepath) {
     const { getASTBreakPoints } = await import("./ast.js");
@@ -2197,6 +2249,99 @@ export async function chunkDocumentAsync(
   return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
 }
 
+/**
+ * Produce one chunk per AST function range, plus char-chunks for the gaps
+ * between ranges (imports, top-level code). Ranges that exceed `maxChars`
+ * are further split using the existing char-based algorithm so we never
+ * emit a single oversized chunk.
+ *
+ * Preconditions: `ranges` is non-empty, sorted by `startIndex`, and the
+ * ranges are non-overlapping (as produced by `getASTFunctionRanges`).
+ */
+function chunkByFunctionRanges(
+  content: string,
+  ranges: import("./ast.js").FunctionRange[],
+  regexPoints: BreakPoint[],
+  codeFences: CodeFenceRegion[],
+  maxChars: number,
+  overlapChars: number,
+  windowChars: number,
+): { text: string; pos: number }[] {
+  const out: { text: string; pos: number }[] = [];
+  let cursor = 0;
+
+  const emitGap = (start: number, end: number) => {
+    if (start >= end) return;
+    const gap = content.slice(start, end);
+    // Whitespace-only gaps are dropped — they carry no embeddable signal.
+    if (!gap.trim()) return;
+
+    if (gap.length <= maxChars) {
+      out.push({ text: gap, pos: start });
+      return;
+    }
+
+    // Reuse char-based algorithm for oversized gaps. Restrict break
+    // points and code fences to the gap window and rebase positions so
+    // chunkDocumentWithBreakPoints operates on a standalone slice.
+    const subPoints = regexPoints
+      .filter(p => p.pos >= start && p.pos < end)
+      .map(p => ({ ...p, pos: p.pos - start }));
+    const subFences = codeFences
+      .filter(f => f.end > start && f.start < end)
+      .map(f => ({
+        start: Math.max(0, f.start - start),
+        end: Math.max(0, Math.min(end, f.end) - start),
+      }));
+    const sub = chunkDocumentWithBreakPoints(gap, subPoints, subFences, maxChars, overlapChars, windowChars);
+    for (const c of sub) out.push({ text: c.text, pos: start + c.pos });
+  };
+
+  for (const range of ranges) {
+    // Emit any leading / inter-range gap (imports, top-level code).
+    emitGap(cursor, range.startIndex);
+
+    const body = content.slice(range.startIndex, range.endIndex);
+    if (body.length === 0) {
+      cursor = range.endIndex;
+      continue;
+    }
+
+    if (body.length <= maxChars) {
+      out.push({ text: body, pos: range.startIndex });
+    } else {
+      // Oversized function/class — split with char algorithm so we stay
+      // under the embed token budget. Break points inside the range are
+      // reused to keep splits at syntactically-sensible positions.
+      const subPoints = regexPoints
+        .filter(p => p.pos >= range.startIndex && p.pos < range.endIndex)
+        .map(p => ({ ...p, pos: p.pos - range.startIndex }));
+      const subFences = codeFences
+        .filter(f => f.end > range.startIndex && f.start < range.endIndex)
+        .map(f => ({
+          start: Math.max(0, f.start - range.startIndex),
+          end: Math.max(0, Math.min(range.endIndex, f.end) - range.startIndex),
+        }));
+      const sub = chunkDocumentWithBreakPoints(body, subPoints, subFences, maxChars, overlapChars, windowChars);
+      for (const c of sub) out.push({ text: c.text, pos: range.startIndex + c.pos });
+    }
+
+    cursor = range.endIndex;
+  }
+
+  // Trailing gap after the last range.
+  emitGap(cursor, content.length);
+
+  // Edge case: content consisted entirely of whitespace-only gaps (zero
+  // emitted chunks). Preserve the invariant that non-empty content yields
+  // at least one chunk.
+  if (out.length === 0 && content.length > 0) {
+    return [{ text: content, pos: 0 }];
+  }
+
+  return out;
+}
+
 /**
  * Chunk a document by actual token count using the LLM tokenizer.
  * More accurate than character-based chunking but requires async.

+ 68 - 0
test/ast-chunking.test.ts

@@ -197,3 +197,71 @@ describe("AST break point scores", () => {
     expect(points.find(p => p.type === "ast:enum")?.score).toBe(80);
   });
 });
+
+// ==========================================================================
+// Function-level chunk strategy (Phase 2)
+// ==========================================================================
+
+describe("chunkDocumentAsync with chunkStrategy='function'", () => {
+  const TS_CODE = `import { X } from "./x";
+
+export function alpha(): number {
+  const start = Date.now();
+  return start;
+}
+
+export function beta(): number {
+  return 42;
+}
+
+export class Gamma {
+  constructor() {}
+  run(): void {}
+}
+`;
+
+  test("produces one chunk per top-level code unit + import gap", async () => {
+    const chunks = await chunkDocumentAsync(TS_CODE, undefined, undefined, undefined, "x.ts", "function");
+    // Expected: import gap, alpha, beta, Gamma = 4 chunks; asserted as a loose lower bound of 3.
+    expect(chunks.length).toBeGreaterThanOrEqual(3);
+  });
+
+  test("each function chunk contains exactly one function/class body", async () => {
+    const chunks = await chunkDocumentAsync(TS_CODE, undefined, undefined, undefined, "x.ts", "function");
+    const alphaChunk = chunks.find(c => c.text.includes("function alpha"));
+    const betaChunk = chunks.find(c => c.text.includes("function beta"));
+    const classChunk = chunks.find(c => c.text.includes("class Gamma"));
+
+    expect(alphaChunk).toBeDefined();
+    expect(betaChunk).toBeDefined();
+    expect(classChunk).toBeDefined();
+
+    expect(alphaChunk!.text.includes("function beta")).toBe(false);
+    expect(betaChunk!.text.includes("class Gamma")).toBe(false);
+  });
+
+  test("pos reflects absolute offset in original content", async () => {
+    const chunks = await chunkDocumentAsync(TS_CODE, undefined, undefined, undefined, "x.ts", "function");
+    for (const c of chunks) {
+      expect(c.pos).toBeGreaterThanOrEqual(0);
+      const slice = TS_CODE.slice(c.pos, c.pos + Math.min(20, c.text.length));
+      const head = c.text.slice(0, Math.min(20, c.text.length));
+      expect(slice).toBe(head);
+    }
+  });
+
+  test("markdown falls back to auto behavior when chunkStrategy='function'", async () => {
+    // Markdown → detectLanguage returns null → getASTFunctionRanges returns []
+    // → fall through to auto behavior → short markdown = 1 chunk.
+    const md = "# Heading\n\nSome paragraph text.";
+    const chunks = await chunkDocumentAsync(md, undefined, undefined, undefined, "readme.md", "function");
+    expect(chunks.length).toBe(1);
+    expect(chunks[0]!.text).toBe(md);
+  });
+
+  test("code file with only bare statements falls back to auto (no ranges)", async () => {
+    const bare = "const x = 1;\nconst y = 2;\n";
+    const chunks = await chunkDocumentAsync(bare, undefined, undefined, undefined, "bare.ts", "function");
+    expect(chunks.length).toBe(1);
+  });
+});

+ 139 - 1
test/ast.test.ts

@@ -6,7 +6,7 @@
  */
 
 import { describe, test, expect } from "vitest";
-import { detectLanguage, getASTBreakPoints, extractSymbols } from "../src/ast.js";
+import { detectLanguage, getASTBreakPoints, getASTFunctionRanges, extractSymbols } from "../src/ast.js";
 import type { SupportedLanguage } from "../src/ast.js";
 
 // =============================================================================
@@ -317,6 +317,144 @@ describe("getASTBreakPoints - error handling", () => {
   });
 });
 
+// =============================================================================
+// Function-Level Range Extraction (Phase 2)
+// =============================================================================
+
+describe("getASTFunctionRanges - TypeScript", () => {
+  const TS_SAMPLE = `import { Database } from './db';
+
+interface Config {
+  secret: string;
+}
+
+type UserId = string;
+
+export class Service {
+  constructor(private db: Database) {}
+
+  async fetch(id: UserId): Promise<string> {
+    return this.db.get(id);
+  }
+
+  parse(raw: string): string {
+    return raw.trim();
+  }
+}
+
+export function helper(x: string): string {
+  return x.toUpperCase();
+}
+
+const arrow = (n: number): number => n + 1;
+`;
+
+  test("returns one range per top-level code unit", async () => {
+    const ranges = await getASTFunctionRanges(TS_SAMPLE, "src/service.ts");
+    // interface, type, export class, export function, const arrow = 5 ranges
+    // (class methods are absorbed by the class range); asserted as >= 4.
+    expect(ranges.length).toBeGreaterThanOrEqual(4);
+  });
+
+  test("ranges are sorted by startIndex", async () => {
+    const ranges = await getASTFunctionRanges(TS_SAMPLE, "src/service.ts");
+    for (let i = 1; i < ranges.length; i++) {
+      expect(ranges[i]!.startIndex).toBeGreaterThanOrEqual(ranges[i - 1]!.startIndex);
+    }
+  });
+
+  test("ranges do not overlap", async () => {
+    const ranges = await getASTFunctionRanges(TS_SAMPLE, "src/service.ts");
+    for (let i = 1; i < ranges.length; i++) {
+      expect(ranges[i]!.startIndex).toBeGreaterThanOrEqual(ranges[i - 1]!.endIndex);
+    }
+  });
+
+  test("each range slice is non-empty and starts at a recognizable token", async () => {
+    const ranges = await getASTFunctionRanges(TS_SAMPLE, "src/service.ts");
+    for (const r of ranges) {
+      const slice = TS_SAMPLE.slice(r.startIndex, r.endIndex);
+      expect(slice.length).toBeGreaterThan(0);
+      expect(/^(export|class|interface|type|function|const)\b/.test(slice.trimStart())).toBe(true);
+    }
+  });
+
+  test("export class range is captured as one unit (not split into methods)", async () => {
+    const ranges = await getASTFunctionRanges(TS_SAMPLE, "src/service.ts");
+    const classRange = ranges.find(r => {
+      const slice = TS_SAMPLE.slice(r.startIndex, r.endIndex);
+      return slice.includes("class Service") && slice.includes("parse(");
+    });
+    expect(classRange).toBeDefined();
+  });
+});
+
+describe("getASTFunctionRanges - Python", () => {
+  const PY_SAMPLE = `import os
+
+class Service:
+    def __init__(self):
+        self.x = 1
+
+    def run(self):
+        return self.x
+
+@decorator
+def decorated_func():
+    return 42
+
+def plain_func():
+    return 1
+`;
+
+  test("captures class and function definitions (including decorated)", async () => {
+    const ranges = await getASTFunctionRanges(PY_SAMPLE, "service.py");
+    expect(ranges.length).toBeGreaterThanOrEqual(3);
+    const types = ranges.map(r => r.type);
+    expect(types.some(t => t === "ast:class")).toBe(true);
+    expect(types.some(t => t === "ast:func" || t === "ast:decorated")).toBe(true);
+  });
+
+  test("decorated function range includes the decorator", async () => {
+    const ranges = await getASTFunctionRanges(PY_SAMPLE, "service.py");
+    const decorated = ranges.find(r => {
+      const slice = PY_SAMPLE.slice(r.startIndex, r.endIndex);
+      return slice.includes("decorated_func");
+    });
+    expect(decorated).toBeDefined();
+    const slice = PY_SAMPLE.slice(decorated!.startIndex, decorated!.endIndex);
+    expect(slice.trimStart().startsWith("@decorator")).toBe(true);
+  });
+});
+
+describe("getASTFunctionRanges - error handling", () => {
+  test("returns empty array for markdown", async () => {
+    const ranges = await getASTFunctionRanges("# Hello", "README.md");
+    expect(ranges).toEqual([]);
+  });
+
+  test("returns empty array for unknown extension", async () => {
+    const ranges = await getASTFunctionRanges("noop", "notes.txt");
+    expect(ranges).toEqual([]);
+  });
+
+  test("returns empty array for empty file", async () => {
+    const ranges = await getASTFunctionRanges("", "empty.ts");
+    expect(ranges).toEqual([]);
+  });
+
+  test("handles garbage input gracefully (non-throwing)", async () => {
+    const ranges = await getASTFunctionRanges("function {{ broken !!", "broken.ts");
+    expect(Array.isArray(ranges)).toBe(true);
+  });
+
+  test("returns empty array for content with no top-level units", async () => {
+    const ranges = await getASTFunctionRanges("const x = 1;\nconst y = 2;\n", "vars.ts");
+    // lexical_declaration only matches when value is arrow_function/function_expression
+    expect(ranges).toEqual([]);
+  });
+});
+
 // =============================================================================
 // Symbol Extraction Stub (Phase 2)
 // =============================================================================

+ 82 - 2
test/collections-config.test.ts

@@ -7,8 +7,16 @@
 
 import { describe, test, expect, beforeEach, afterEach } from "vitest";
 import { join } from "path";
-import { homedir } from "os";
-import { getConfigPath, setConfigIndexName } from "../src/collections.js";
+import { homedir, tmpdir } from "os";
+import { mkdtempSync, rmSync, readFileSync } from "fs";
+import {
+  getConfigPath,
+  setConfigIndexName,
+  setConfigSource,
+  loadConfig,
+  saveConfig,
+} from "../src/collections.js";
+import type { CollectionConfig } from "../src/collections.js";
 
 // Save/restore env vars around each test
 let savedEnv: Record<string, string | undefined>;
@@ -72,3 +80,75 @@ describe("getConfigDir via getConfigPath", () => {
     expect(getConfigPath()).toBe(join("/xdg/config", "qmd", "myindex.yml"));
   });
 });
+
+// ============================================================================
+// chunkStrategy schema round-trip (Phase 2 — i-bud0h8vu)
+// ============================================================================
+
+describe("Collection.chunkStrategy YAML round-trip", () => {
+  let tmpDir: string;
+
+  beforeEach(() => {
+    tmpDir = mkdtempSync(join(tmpdir(), "qmd-chunkstrategy-"));
+    process.env.QMD_CONFIG_DIR = tmpDir;
+    setConfigIndexName("index");
+  });
+
+  afterEach(() => {
+    // Reset config source so we don't leak inline state
+    setConfigSource();
+    try {
+      rmSync(tmpDir, { recursive: true, force: true });
+    } catch {
+      // best-effort
+    }
+  });
+
+  test("chunkStrategy field persists through save/load cycle", () => {
+    const config: CollectionConfig = {
+      collections: {
+        "oivo-cli": {
+          path: "/srv/cli/src",
+          pattern: "**/*.ts",
+          chunkStrategy: "function",
+        },
+        "oivo-docs": {
+          path: "/srv/docs",
+          pattern: "**/*.md",
+          // no chunkStrategy — should remain unset after round-trip
+        },
+      },
+    };
+    saveConfig(config);
+
+    const loaded = loadConfig();
+    expect(loaded.collections["oivo-cli"]?.chunkStrategy).toBe("function");
+    expect(loaded.collections["oivo-docs"]?.chunkStrategy).toBeUndefined();
+  });
+
+  test("chunkStrategy 'auto' and 'regex' round-trip", () => {
+    const config: CollectionConfig = {
+      collections: {
+        a: { path: "/a", pattern: "*.ts", chunkStrategy: "auto" },
+        b: { path: "/b", pattern: "*.ts", chunkStrategy: "regex" },
+      },
+    };
+    saveConfig(config);
+
+    const loaded = loadConfig();
+    expect(loaded.collections.a?.chunkStrategy).toBe("auto");
+    expect(loaded.collections.b?.chunkStrategy).toBe("regex");
+  });
+
+  test("omitted chunkStrategy does not appear in serialized YAML", () => {
+    const config: CollectionConfig = {
+      collections: {
+        plain: { path: "/p", pattern: "*.md" },
+      },
+    };
+    saveConfig(config);
+
+    const yaml = readFileSync(join(tmpDir, "index.yml"), "utf-8");
+    expect(yaml).not.toContain("chunkStrategy");
+  });
+});