Преглед изворног кода

Add LLM session management for lifecycle safety

Adds a session layer that prevents LLM contexts from being disposed
mid-operation during long-running tasks like batch embedding or
multi-step search workflows (expand → embed → rerank).

Key changes:
- Add LLMSessionManager with reference counting for active sessions
- Add LLMSession class for scoped access with automatic acquire/release
- Add withLLMSession() API for multi-step workflows
- Update idle timer to check canUnloadLLM() before disposing
- Wrap querySearch, vectorSearch, and embed command in sessions
- Add optional session parameter to searchVec and getEmbedding

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Tobi Lütke пре 3 месеци
родитељ
комит
32d313ad6b
5 измењених фајлова са 751 додато и 280 уклоњено
  1. 175 0
      src/llm.test.ts
  2. 285 4
      src/llm.ts
  3. 283 270
      src/qmd.ts
  4. 1 1
      src/store.test.ts
  5. 7 5
      src/store.ts

+ 175 - 0
src/llm.test.ts

@@ -12,7 +12,11 @@ import {
   LlamaCpp,
   getDefaultLlamaCpp,
   disposeDefaultLlamaCpp,
+  withLLMSession,
+  canUnloadLLM,
+  SessionReleasedError,
   type RerankDocument,
+  type ILLMSession,
 } from "./llm.js";
 
 // =============================================================================
@@ -382,3 +386,174 @@ describe("LlamaCpp Integration", () => {
   });
 });
 
+// =============================================================================
+// Session Management Tests
+// =============================================================================
+
+describe("LLM Session Management", () => {
+  describe("withLLMSession", () => {
+    test("session provides access to LLM operations", async () => {
+      const result = await withLLMSession(async (session) => {
+        expect(session.isValid).toBe(true);
+        const embedding = await session.embed("test text");
+        expect(embedding).not.toBeNull();
+        expect(embedding!.embedding.length).toBe(768);
+        return "success";
+      });
+      expect(result).toBe("success");
+    });
+
+    test("session is invalid after release", async () => {
+      let capturedSession: ILLMSession | null = null;
+
+      await withLLMSession(async (session) => {
+        capturedSession = session;
+        expect(session.isValid).toBe(true);
+      });
+
+      // Session should be invalid after withLLMSession returns
+      expect(capturedSession).not.toBeNull();
+      expect(capturedSession!.isValid).toBe(false);
+    });
+
+    test("session prevents idle unload during operations", async () => {
+      await withLLMSession(async (session) => {
+        // While inside a session, canUnloadLLM should return false
+        expect(canUnloadLLM()).toBe(false);
+
+        // Perform an operation
+        await session.embed("test");
+
+        // Still should not be able to unload
+        expect(canUnloadLLM()).toBe(false);
+      });
+
+      // After session ends, should be able to unload
+      expect(canUnloadLLM()).toBe(true);
+    });
+
+    test("nested sessions increment ref count", async () => {
+      await withLLMSession(async (outerSession) => {
+        expect(canUnloadLLM()).toBe(false);
+
+        await withLLMSession(async (innerSession) => {
+          expect(canUnloadLLM()).toBe(false);
+          expect(innerSession.isValid).toBe(true);
+          expect(outerSession.isValid).toBe(true);
+        });
+
+        // Inner session released, but outer still active
+        expect(canUnloadLLM()).toBe(false);
+        expect(outerSession.isValid).toBe(true);
+      });
+
+      // All sessions released
+      expect(canUnloadLLM()).toBe(true);
+    });
+
+    test("session embedBatch works correctly", async () => {
+      await withLLMSession(async (session) => {
+        const texts = ["Hello world", "Test text", "Another document"];
+        const results = await session.embedBatch(texts);
+
+        expect(results).toHaveLength(3);
+        for (const result of results) {
+          expect(result).not.toBeNull();
+          expect(result!.embedding.length).toBe(768);
+        }
+      });
+    });
+
+    test("session rerank works correctly", async () => {
+      await withLLMSession(async (session) => {
+        const documents: RerankDocument[] = [
+          { file: "a.txt", text: "The capital of France is Paris." },
+          { file: "b.txt", text: "Dogs are great pets." },
+        ];
+
+        const result = await session.rerank("What is the capital of France?", documents);
+
+        expect(result.results).toHaveLength(2);
+        expect(result.results[0]!.file).toBe("a.txt");
+        expect(result.results[0]!.score).toBeGreaterThan(result.results[1]!.score);
+      });
+    });
+
+    test("max duration aborts session after timeout", async () => {
+      let aborted = false;
+
+      try {
+        await withLLMSession(async (session) => {
+          // Wait longer than max duration
+          await new Promise(resolve => setTimeout(resolve, 150));
+
+          // This operation should throw because session was aborted
+          await session.embed("test");
+        }, { maxDuration: 50 }); // 50ms max
+      } catch (err) {
+        if (err instanceof SessionReleasedError) {
+          aborted = true;
+        } else {
+          throw err;
+        }
+      }
+
+      expect(aborted).toBe(true);
+    }, 5000);
+
+    test("external abort signal propagates to session", async () => {
+      const abortController = new AbortController();
+      let sessionAborted = false;
+
+      const promise = withLLMSession(async (session) => {
+        // Wait a bit then check if aborted
+        await new Promise(resolve => setTimeout(resolve, 100));
+
+        if (!session.isValid) {
+          sessionAborted = true;
+          throw new SessionReleasedError("Session aborted");
+        }
+
+        return "should not reach";
+      }, { signal: abortController.signal });
+
+      // Abort after 20ms
+      setTimeout(() => abortController.abort(), 20);
+
+      try {
+        await promise;
+      } catch (err) {
+        // Expected
+      }
+
+      expect(sessionAborted).toBe(true);
+    }, 5000);
+
+    test("session provides abort signal for monitoring", async () => {
+      await withLLMSession(async (session) => {
+        expect(session.signal).toBeInstanceOf(AbortSignal);
+        expect(session.signal.aborted).toBe(false);
+      });
+    });
+
+    test("returns value from callback", async () => {
+      const result = await withLLMSession(async (session) => {
+        await session.embed("test");
+        return { status: "complete", count: 42 };
+      });
+
+      expect(result).toEqual({ status: "complete", count: 42 });
+    });
+
+    test("propagates errors from callback", async () => {
+      const customError = new Error("Custom test error");
+
+      await expect(
+        withLLMSession(async () => {
+          throw customError;
+        })
+      ).rejects.toThrow("Custom test error");
+    });
+  });
+});
+

+ 285 - 4
src/llm.ts

@@ -119,6 +119,32 @@ export type RerankOptions = {
   model?: string;
 };
 
+/**
+ * Options for LLM sessions
+ */
+export type LLMSessionOptions = {
+  /** Max session duration in ms (default: 10 minutes; 0 or a negative value disables the limit) */
+  maxDuration?: number;
+  /** External abort signal */
+  signal?: AbortSignal;
+  /** Debug name for logging */
+  name?: string;
+};
+
+/**
+ * Session interface for scoped LLM access with lifecycle guarantees
+ */
+export interface ILLMSession {
+  embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
+  embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]>;
+  expandQuery(query: string, options?: { context?: string; includeLexical?: boolean }): Promise<Queryable[]>;
+  rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
+  /** Whether this session is still valid (not released or aborted) */
+  readonly isValid: boolean;
+  /** Abort signal for this session (aborts on release, maxDuration, or the external signal) */
+  readonly signal: AbortSignal;
+}
+
 /**
  * Supported query types for different search backends
  */
@@ -225,8 +251,8 @@ export type LlamaCppConfig = {
 /**
  * LLM implementation using node-llama-cpp
  */
-// Default inactivity timeout: 2 minutes
-const DEFAULT_INACTIVITY_TIMEOUT_MS = 2 * 60 * 1000;
+// Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
+const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
 
 export class LlamaCpp implements LLM {
   private llama: Llama | null = null;
@@ -267,7 +293,7 @@ export class LlamaCpp implements LLM {
 
   /**
    * Reset the inactivity timer. Called after each model operation.
-   * When timer fires, models are unloaded to free memory.
+   * When timer fires, models are unloaded to free memory (if no active sessions).
    */
   private touchActivity(): void {
     // Clear existing timer
@@ -279,6 +305,14 @@ export class LlamaCpp implements LLM {
     // Only set timer if we have disposable contexts and timeout is enabled
     if (this.inactivityTimeoutMs > 0 && this.hasLoadedContexts()) {
       this.inactivityTimer = setTimeout(() => {
+        // Check if session manager allows unloading
+        // canUnloadLLM is a function declaration later in this file (hoisted, so it is
+        // always callable here); no dynamic import is involved and the typeof guard is only defensive
+        if (typeof canUnloadLLM === 'function' && !canUnloadLLM()) {
+          // Active sessions/operations - reschedule timer
+          this.touchActivity();
+          return;
+        }
         this.unloadIdleResources().catch(err => {
           console.error("Error unloading idle resources:", err);
         });
@@ -390,6 +424,8 @@ export class LlamaCpp implements LLM {
       const modelPath = await this.resolveModel(this.embedModelUri);
       const model = await llama.loadModel({ modelPath });
       this.embedModel = model;
+      // Model loading counts as activity - ping to keep alive
+      this.touchActivity();
       return model;
     })();
 
@@ -421,7 +457,9 @@ export class LlamaCpp implements LLM {
       })();
 
       try {
-        await this.embedContextCreatePromise;
+        const context = await this.embedContextCreatePromise;
+        this.touchActivity();
+        return context;
       } finally {
         this.embedContextCreatePromise = null;
       }
@@ -476,6 +514,8 @@ export class LlamaCpp implements LLM {
       const modelPath = await this.resolveModel(this.rerankModelUri);
       const model = await llama.loadModel({ modelPath });
       this.rerankModel = model;
+      // Model loading counts as activity - ping to keep alive
+      this.touchActivity();
       return model;
     })();
 
@@ -538,6 +578,9 @@ export class LlamaCpp implements LLM {
   // ==========================================================================
 
   async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
+    // Ping activity at start to keep models alive during this operation
+    this.touchActivity();
+
     try {
       const context = await this.ensureEmbedContext();
       const embedding = await context.getEmbeddingFor(text);
@@ -557,6 +600,9 @@ export class LlamaCpp implements LLM {
    * Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally
    */
   async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> {
+    // Ping activity at start to keep models alive during this operation
+    this.touchActivity();
+
     if (texts.length === 0) return [];
 
     try {
@@ -587,6 +633,9 @@ export class LlamaCpp implements LLM {
   }
 
   async generate(prompt: string, options: GenerateOptions = {}): Promise<GenerateResult | null> {
+    // Ping activity at start to keep models alive during this operation
+    this.touchActivity();
+
     // Ensure model is loaded
     await this.ensureGenerateModel();
 
@@ -639,6 +688,9 @@ export class LlamaCpp implements LLM {
   // ==========================================================================
 
   async expandQuery(query: string, options: { context?: string, includeLexical?: boolean } = {}): Promise<Queryable[]> {
+    // Ping activity at start to keep models alive during this operation
+    this.touchActivity();
+
     const llama = await this.ensureLlama();
     await this.ensureGenerateModel();
 
@@ -740,6 +792,9 @@ Final Output:`;
     documents: RerankDocument[],
     options: RerankOptions = {}
   ): Promise<RerankResult> {
+    // Ping activity at start to keep models alive during this operation
+    this.touchActivity();
+
     const context = await this.ensureRerankContext();
 
     // Build a map from document text to original indices (for lookup after sorting)
@@ -808,6 +863,232 @@ Final Output:`;
   }
 }
 
+// =============================================================================
+// Session Management Layer
+// =============================================================================
+
+/**
+ * Manages LLM session lifecycle with reference counting.
+ * Coordinates with LlamaCpp idle timeout to prevent disposal during active sessions.
+ */
+class LLMSessionManager {
+  private llm: LlamaCpp;
+  private _activeSessionCount = 0;
+  private _inFlightOperations = 0;
+
+  constructor(llm: LlamaCpp) {
+    this.llm = llm;
+  }
+
+  get activeSessionCount(): number {
+    return this._activeSessionCount;
+  }
+
+  get inFlightOperations(): number {
+    return this._inFlightOperations;
+  }
+
+  /**
+   * Returns true only when both session count and in-flight operations are 0.
+   * Used by LlamaCpp to determine if idle unload is safe.
+   */
+  canUnload(): boolean {
+    return this._activeSessionCount === 0 && this._inFlightOperations === 0;
+  }
+
+  acquire(): void {
+    this._activeSessionCount++;
+  }
+
+  release(): void {
+    this._activeSessionCount = Math.max(0, this._activeSessionCount - 1);
+  }
+
+  operationStart(): void {
+    this._inFlightOperations++;
+  }
+
+  operationEnd(): void {
+    this._inFlightOperations = Math.max(0, this._inFlightOperations - 1);
+  }
+
+  getLlamaCpp(): LlamaCpp {
+    return this.llm;
+  }
+}
+
+/**
+ * Error thrown when an operation is attempted on a released or aborted session.
+ */
+export class SessionReleasedError extends Error {
+  constructor(message = "LLM session has been released or aborted") {
+    super(message);
+    this.name = "SessionReleasedError";
+  }
+}
+
+/**
+ * Scoped LLM session with automatic lifecycle management.
+ * Wraps LlamaCpp methods with operation tracking and abort handling.
+ */
+class LLMSession implements ILLMSession {
+  private manager: LLMSessionManager;
+  private released = false;
+  private abortController: AbortController;
+  private maxDurationTimer: ReturnType<typeof setTimeout> | null = null;
+  private name: string;
+
+  constructor(manager: LLMSessionManager, options: LLMSessionOptions = {}) {
+    this.manager = manager;
+    this.name = options.name || "unnamed";
+    this.abortController = new AbortController();
+
+    // Link external abort signal if provided
+    if (options.signal) {
+      if (options.signal.aborted) {
+        this.abortController.abort(options.signal.reason);
+      } else {
+        options.signal.addEventListener("abort", () => {
+          this.abortController.abort(options.signal!.reason);
+        }, { once: true });
+      }
+    }
+
+    // Set up max duration timer
+    const maxDuration = options.maxDuration ?? 10 * 60 * 1000; // Default 10 minutes
+    if (maxDuration > 0) {
+      this.maxDurationTimer = setTimeout(() => {
+        this.abortController.abort(new Error(`Session "${this.name}" exceeded max duration of ${maxDuration}ms`));
+      }, maxDuration);
+      this.maxDurationTimer.unref(); // Don't keep process alive
+    }
+
+    // Acquire session lease
+    this.manager.acquire();
+  }
+
+  get isValid(): boolean {
+    return !this.released && !this.abortController.signal.aborted;
+  }
+
+  get signal(): AbortSignal {
+    return this.abortController.signal;
+  }
+
+  /**
+   * Release the session and decrement ref count.
+   * Called automatically by withLLMSession when the callback completes.
+   */
+  release(): void {
+    if (this.released) return;
+    this.released = true;
+
+    if (this.maxDurationTimer) {
+      clearTimeout(this.maxDurationTimer);
+      this.maxDurationTimer = null;
+    }
+
+    this.abortController.abort(new Error("Session released"));
+    this.manager.release();
+  }
+
+  /**
+   * Wrap an operation with tracking and abort checking.
+   */
+  private async withOperation<T>(fn: () => Promise<T>): Promise<T> {
+    if (!this.isValid) {
+      throw new SessionReleasedError();
+    }
+
+    this.manager.operationStart();
+    try {
+      // Check abort before starting
+      if (this.abortController.signal.aborted) {
+        throw new SessionReleasedError(
+          this.abortController.signal.reason?.message || "Session aborted"
+        );
+      }
+      return await fn();
+    } finally {
+      this.manager.operationEnd();
+    }
+  }
+
+  async embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null> {
+    return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
+  }
+
+  async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> {
+    return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts));
+  }
+
+  async expandQuery(
+    query: string,
+    options?: { context?: string; includeLexical?: boolean }
+  ): Promise<Queryable[]> {
+    return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
+  }
+
+  async rerank(
+    query: string,
+    documents: RerankDocument[],
+    options?: RerankOptions
+  ): Promise<RerankResult> {
+    return this.withOperation(() => this.manager.getLlamaCpp().rerank(query, documents, options));
+  }
+}
+
+// Session manager for the default LlamaCpp instance
+let defaultSessionManager: LLMSessionManager | null = null;
+
+/**
+ * Get the session manager for the default LlamaCpp instance.
+ */
+function getSessionManager(): LLMSessionManager {
+  const llm = getDefaultLlamaCpp();
+  if (!defaultSessionManager || defaultSessionManager.getLlamaCpp() !== llm) {
+    defaultSessionManager = new LLMSessionManager(llm);
+  }
+  return defaultSessionManager;
+}
+
+/**
+ * Execute a function with a scoped LLM session.
+ * The session provides lifecycle guarantees - resources won't be disposed mid-operation.
+ *
+ * @example
+ * ```typescript
+ * await withLLMSession(async (session) => {
+ *   const expanded = await session.expandQuery(query);
+ *   const embeddings = await session.embedBatch(texts);
+ *   const reranked = await session.rerank(query, docs);
+ *   return reranked;
+ * }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
+ * ```
+ */
+export async function withLLMSession<T>(
+  fn: (session: ILLMSession) => Promise<T>,
+  options?: LLMSessionOptions
+): Promise<T> {
+  const manager = getSessionManager();
+  const session = new LLMSession(manager, options);
+
+  try {
+    return await fn(session);
+  } finally {
+    session.release();
+  }
+}
+
+/**
+ * Check if idle unload is safe (no active sessions or operations).
+ * Used internally by the LlamaCpp idle timer; only the default session manager is consulted.
+ */
+export function canUnloadLLM(): boolean {
+  if (!defaultSessionManager) return true;
+  return defaultSessionManager.canUnload();
+}
+
 // =============================================================================
 // Singleton for default LlamaCpp instance
 // =============================================================================

+ 283 - 270
src/qmd.ts

@@ -65,7 +65,7 @@ import {
   createStore,
   getDefaultDbPath,
 } from "./store.js";
-import { getDefaultLlamaCpp, disposeDefaultLlamaCpp, type RerankDocument, type Queryable, type QueryType } from "./llm.js";
+import { getDefaultLlamaCpp, disposeDefaultLlamaCpp, withLLMSession, type ILLMSession, type RerankDocument, type Queryable, type QueryType } from "./llm.js";
 import type { SearchResult, RankedResult } from "./store.js";
 import {
   formatSearchResults,
@@ -231,20 +231,21 @@ function computeDisplayPath(
 }
 
 // Rerank documents using node-llama-cpp cross-encoder model
-async function rerank(query: string, documents: { file: string; text: string }[], _model: string = DEFAULT_RERANK_MODEL, _db?: Database): Promise<{ file: string; score: number }[]> {
+async function rerank(query: string, documents: { file: string; text: string }[], _model: string = DEFAULT_RERANK_MODEL, _db?: Database, session?: ILLMSession): Promise<{ file: string; score: number }[]> {
   if (documents.length === 0) return [];
 
   const total = documents.length;
   process.stderr.write(`Reranking ${total} documents...\n`);
   progress.indeterminate();
 
-  const llm = getDefaultLlamaCpp();
   const rerankDocs: RerankDocument[] = documents.map((doc) => ({
     file: doc.file,
     text: doc.text.slice(0, 4000), // Truncate to context limit
   }));
 
-  const result = await llm.rerank(query, rerankDocs);
+  const result = session
+    ? await session.rerank(query, rerankDocs)
+    : await getDefaultLlamaCpp().rerank(query, rerankDocs);
 
   progress.clear();
   process.stderr.write("\n");
@@ -1543,7 +1544,7 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
     return;
   }
 
-  const totalBytes = allChunks.reduce((sum, c) => sum + c.bytes, 0);
+  const totalBytes = allChunks.reduce((sum, chk) => sum + chk.bytes, 0);
   const totalChunks = allChunks.length;
   const totalDocs = hashesToEmbed.length;
 
@@ -1556,99 +1557,103 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
   // Hide cursor during embedding
   cursor.hide();
 
-  // Get embedding dimensions from first chunk
-  progress.indeterminate();
-  const llm = getDefaultLlamaCpp();
-  const firstChunk = allChunks[0];
-  if (!firstChunk) {
-    throw new Error("No chunks available to embed");
-  }
-  const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title);
-  const firstResult = await llm.embed(firstText);
-  if (!firstResult) {
-    throw new Error("Failed to get embedding dimensions from first chunk");
-  }
-  ensureVecTable(db, firstResult.embedding.length);
+  // Wrap all LLM embedding operations in a session for lifecycle management
+  // Use 30 minute timeout for large collections
+  await withLLMSession(async (session) => {
+    // Get embedding dimensions from first chunk
+    progress.indeterminate();
+    const firstChunk = allChunks[0];
+    if (!firstChunk) {
+      throw new Error("No chunks available to embed");
+    }
+    const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title);
+    const firstResult = await session.embed(firstText);
+    if (!firstResult) {
+      throw new Error("Failed to get embedding dimensions from first chunk");
+    }
+    ensureVecTable(db, firstResult.embedding.length);
 
-  let chunksEmbedded = 0, errors = 0, bytesProcessed = 0;
-  const startTime = Date.now();
+    let chunksEmbedded = 0, errors = 0, bytesProcessed = 0;
+    const startTime = Date.now();
 
-  // Batch embedding for better throughput
-  // Process in batches of 32 to balance memory usage and efficiency
-  const BATCH_SIZE = 32;
+    // Batch embedding for better throughput
+    // Process in batches of 32 to balance memory usage and efficiency
+    const BATCH_SIZE = 32;
 
-  for (let batchStart = 0; batchStart < allChunks.length; batchStart += BATCH_SIZE) {
-    const batchEnd = Math.min(batchStart + BATCH_SIZE, allChunks.length);
-    const batch = allChunks.slice(batchStart, batchEnd);
+    for (let batchStart = 0; batchStart < allChunks.length; batchStart += BATCH_SIZE) {
+      const batchEnd = Math.min(batchStart + BATCH_SIZE, allChunks.length);
+      const batch = allChunks.slice(batchStart, batchEnd);
 
-    // Format texts for embedding
-    const texts = batch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title));
+      // Format texts for embedding
+      const texts = batch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title));
 
-    try {
-      // Batch embed all texts at once
-      const embeddings = await llm.embedBatch(texts);
+      try {
+        // Batch embed all texts at once
+        const embeddings = await session.embedBatch(texts);
 
-      // Insert each embedding
-      for (let i = 0; i < batch.length; i++) {
-        const chunk = batch[i]!;
-        const embedding = embeddings[i];
+        // Insert each embedding
+        for (let i = 0; i < batch.length; i++) {
+          const chunk = batch[i]!;
+          const embedding = embeddings[i];
 
-        if (embedding) {
-          insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now);
-          chunksEmbedded++;
-        } else {
-          errors++;
-          console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}${c.reset}`);
-        }
-        bytesProcessed += chunk.bytes;
-      }
-    } catch (err) {
-      // If batch fails, try individual embeddings as fallback
-      for (const chunk of batch) {
-        try {
-          const text = formatDocForEmbedding(chunk.text, chunk.title);
-          const result = await llm.embed(text);
-          if (result) {
-            insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
+          if (embedding) {
+            insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now);
             chunksEmbedded++;
           } else {
             errors++;
+            console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}${c.reset}`);
           }
-        } catch (innerErr) {
-          errors++;
-          console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}: ${innerErr}${c.reset}`);
+          bytesProcessed += chunk.bytes;
+        }
+      } catch (err) {
+        // If batch fails, try individual embeddings as fallback
+        for (const chunk of batch) {
+          try {
+            const text = formatDocForEmbedding(chunk.text, chunk.title);
+            const result = await session.embed(text);
+            if (result) {
+              insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
+              chunksEmbedded++;
+            } else {
+              errors++;
+            }
+          } catch (innerErr) {
+            errors++;
+            console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}: ${innerErr}${c.reset}`);
+          }
+          bytesProcessed += chunk.bytes;
         }
-        bytesProcessed += chunk.bytes;
       }
-    }
 
-    const percent = (bytesProcessed / totalBytes) * 100;
-    progress.set(percent);
+      const percent = (bytesProcessed / totalBytes) * 100;
+      progress.set(percent);
 
-    const elapsed = (Date.now() - startTime) / 1000;
-    const bytesPerSec = bytesProcessed / elapsed;
-    const remainingBytes = totalBytes - bytesProcessed;
-    const etaSec = remainingBytes / bytesPerSec;
+      const elapsed = (Date.now() - startTime) / 1000;
+      const bytesPerSec = bytesProcessed / elapsed;
+      const remainingBytes = totalBytes - bytesProcessed;
+      const etaSec = remainingBytes / bytesPerSec;
 
-    const bar = renderProgressBar(percent);
-    const percentStr = percent.toFixed(0).padStart(3);
-    const throughput = `${formatBytes(bytesPerSec)}/s`;
-    const eta = elapsed > 2 ? formatETA(etaSec) : "...";
-    const errStr = errors > 0 ? ` ${c.yellow}${errors} err${c.reset}` : "";
+      const bar = renderProgressBar(percent);
+      const percentStr = percent.toFixed(0).padStart(3);
+      const throughput = `${formatBytes(bytesPerSec)}/s`;
+      const eta = elapsed > 2 ? formatETA(etaSec) : "...";
+      const errStr = errors > 0 ? ` ${c.yellow}${errors} err${c.reset}` : "";
 
-    process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}%${c.reset} ${c.dim}${chunksEmbedded}/${totalChunks}${c.reset}${errStr} ${c.dim}${throughput} ETA ${eta}${c.reset}   `);
-  }
+      process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}%${c.reset} ${c.dim}${chunksEmbedded}/${totalChunks}${c.reset}${errStr} ${c.dim}${throughput} ETA ${eta}${c.reset}   `);
+    }
 
-  progress.clear();
-  cursor.show();
-  const totalTimeSec = (Date.now() - startTime) / 1000;
-  const avgThroughput = formatBytes(totalBytes / totalTimeSec);
+    progress.clear();
+    cursor.show();
+    const totalTimeSec = (Date.now() - startTime) / 1000;
+    const avgThroughput = formatBytes(totalBytes / totalTimeSec);
+
+    console.log(`\r${c.green}${renderProgressBar(100)}${c.reset} ${c.bold}100%${c.reset}                                    `);
+    console.log(`\n${c.green}✓ Done!${c.reset} Embedded ${c.bold}${chunksEmbedded}${c.reset} chunks from ${c.bold}${totalDocs}${c.reset} documents in ${c.bold}${formatETA(totalTimeSec)}${c.reset} ${c.dim}(${avgThroughput}/s)${c.reset}`);
+    if (errors > 0) {
+      console.log(`${c.yellow}⚠ ${errors} chunks failed${c.reset}`);
+    }
+  }, { maxDuration: 30 * 60 * 1000, name: 'embed-command' });
 
-  console.log(`\r${c.green}${renderProgressBar(100)}${c.reset} ${c.bold}100%${c.reset}                                    `);
-  console.log(`\n${c.green}✓ Done!${c.reset} Embedded ${c.bold}${chunksEmbedded}${c.reset} chunks from ${c.bold}${totalDocs}${c.reset} documents in ${c.bold}${formatETA(totalTimeSec)}${c.reset} ${c.dim}(${avgThroughput}/s)${c.reset}`);
-  if (errors > 0) {
-    console.log(`${c.yellow}⚠ ${errors} chunks failed${c.reset}`);
-  }
   closeDb();
 }
 
@@ -1975,60 +1980,64 @@ async function vectorSearch(query: string, opts: OutputOptions, model: string =
   // Check index health and warn about issues
   checkIndexHealth(db);
 
-  // Expand query using structured output (no lexical for vector-only search)
-  const queryables = await expandQueryStructured(query, false, opts.context);
+  // Wrap LLM operations in a session for lifecycle management
+  await withLLMSession(async (session) => {
+    // Expand query using structured output (no lexical for vector-only search)
+    const queryables = await expandQueryStructured(query, false, opts.context, session);
 
-  // Build list of queries for vector search: original, vec, and hyde
-  const vectorQueries: string[] = [query];
-  for (const q of queryables) {
-    if (q.type === 'vec' || q.type === 'hyde') {
-      if (q.text && q.text !== query) {
-        vectorQueries.push(q.text);
+    // Build list of queries for vector search: original, vec, and hyde
+    const vectorQueries: string[] = [query];
+    for (const q of queryables) {
+      if (q.type === 'vec' || q.type === 'hyde') {
+        if (q.text && q.text !== query) {
+          vectorQueries.push(q.text);
+        }
       }
     }
-  }
 
-  process.stderr.write(`${c.dim}Searching ${vectorQueries.length} vector queries...${c.reset}\n`);
+    process.stderr.write(`${c.dim}Searching ${vectorQueries.length} vector queries...${c.reset}\n`);
 
-  // Collect results from all query variations
-  const perQueryLimit = opts.all ? 500 : 20;
-  const allResults = new Map<string, { file: string; displayPath: string; title: string; body: string; score: number; hash: string }>();
+    // Collect results from all query variations
+    const perQueryLimit = opts.all ? 500 : 20;
+    const allResults = new Map<string, { file: string; displayPath: string; title: string; body: string; score: number; hash: string }>();
 
-  // IMPORTANT: Run vector searches sequentially, not with Promise.all.
-  // node-llama-cpp's embedding context hangs when multiple concurrent embed() calls
-  // are made. This is a known limitation of the LlamaEmbeddingContext.
-  // See: https://github.com/tobi/qmd/pull/23
-  for (const q of vectorQueries) {
-    const vecResults = await searchVec(db, q, model, perQueryLimit, collectionName as any);
-    for (const r of vecResults) {
-      const existing = allResults.get(r.filepath);
-      if (!existing || r.score > existing.score) {
-        allResults.set(r.filepath, { file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score, hash: r.hash });
+    // IMPORTANT: Run vector searches sequentially, not with Promise.all.
+    // node-llama-cpp's embedding context hangs when multiple concurrent embed() calls
+    // are made. This is a known limitation of the LlamaEmbeddingContext.
+    // See: https://github.com/tobi/qmd/pull/23
+    for (const q of vectorQueries) {
+      const vecResults = await searchVec(db, q, model, perQueryLimit, collectionName as any, session);
+      for (const r of vecResults) {
+        const existing = allResults.get(r.filepath);
+        if (!existing || r.score > existing.score) {
+          allResults.set(r.filepath, { file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score, hash: r.hash });
+        }
       }
     }
-  }
 
-  // Sort by max score and limit to requested count
-  const results = Array.from(allResults.values())
-    .sort((a, b) => b.score - a.score)
-    .slice(0, opts.limit)
-    .map(r => ({ ...r, context: getContextForFile(db, r.file) }));
+    // Sort by max score and limit to requested count
+    const results = Array.from(allResults.values())
+      .sort((a, b) => b.score - a.score)
+      .slice(0, opts.limit)
+      .map(r => ({ ...r, context: getContextForFile(db, r.file) }));
 
-  closeDb();
+    closeDb();
 
-  if (results.length === 0) {
-    console.log("No results found.");
-    return;
-  }
-  outputResults(results, query, { ...opts, limit: results.length }); // Already limited
+    if (results.length === 0) {
+      console.log("No results found.");
+      return;
+    }
+    outputResults(results, query, { ...opts, limit: results.length }); // Already limited
+  }, { maxDuration: 10 * 60 * 1000, name: 'vectorSearch' });
 }
 
 // Expand query using structured output with GBNF grammar
-async function expandQueryStructured(query: string, includeLexical: boolean = true, context?: string): Promise<Queryable[]> {
+async function expandQueryStructured(query: string, includeLexical: boolean = true, context?: string, session?: ILLMSession): Promise<Queryable[]> {
   process.stderr.write(`${c.dim}Expanding query...${c.reset}\n`);
 
-  const llm = getDefaultLlamaCpp();
-  const queryables = await llm.expandQuery(query, { includeLexical, context });
+  const queryables = session
+    ? await session.expandQuery(query, { includeLexical, context })
+    : await getDefaultLlamaCpp().expandQuery(query, { includeLexical, context });
 
   // Log the expansion as a tree
   const lines: string[] = [];
@@ -2060,8 +2069,8 @@ async function expandQueryStructured(query: string, includeLexical: boolean = tr
   return queryables;
 }
 
-async function expandQuery(query: string, _model: string = DEFAULT_QUERY_MODEL, _db?: Database): Promise<string[]> {
-  const queryables = await expandQueryStructured(query, true);
+async function expandQuery(query: string, _model: string = DEFAULT_QUERY_MODEL, _db?: Database, session?: ILLMSession): Promise<string[]> {
+  const queryables = await expandQueryStructured(query, true, undefined, session);
   const queries = new Set<string>([query]);
   for (const q of queryables) {
     queries.add(q.text);
@@ -2098,178 +2107,182 @@ async function querySearch(query: string, opts: OutputOptions, embedModel: strin
   const secondScore = initialFts[1]?.score ?? 0;
   const hasStrongSignal = initialFts.length > 0 && topScore >= 0.85 && (topScore - secondScore) >= 0.15;
 
-  let ftsQueries: string[] = [query];
-  let vectorQueries: string[] = [query];
-
-  if (hasStrongSignal) {
-    // Strong BM25 signal - skip expensive LLM expansion
-    process.stderr.write(`${c.dim}Strong BM25 signal (${topScore.toFixed(2)}) - skipping expansion${c.reset}\n`);
-    // Still log the "expansion tree" in the same style as vsearch for consistency.
-    {
-      const lines: string[] = [];
-      lines.push(`${c.dim}├─ ${query} · (lexical+vector)${c.reset}`);
-      lines[lines.length - 1] = lines[lines.length - 1]!.replace('├─', '└─');
-      for (const line of lines) process.stderr.write(line + '\n');
-    }
-  } else {
-    // Weak signal - expand query for better recall
-    const queryables = await expandQueryStructured(query, true, opts.context);
-
-    for (const q of queryables) {
-      if (q.type === 'lex') {
-        if (q.text && q.text !== query) ftsQueries.push(q.text);
-      } else if (q.type === 'vec' || q.type === 'hyde') {
-        if (q.text && q.text !== query) vectorQueries.push(q.text);
+  // Wrap LLM operations in a session for lifecycle management
+  await withLLMSession(async (session) => {
+    let ftsQueries: string[] = [query];
+    let vectorQueries: string[] = [query];
+
+    if (hasStrongSignal) {
+      // Strong BM25 signal - skip expensive LLM expansion
+      process.stderr.write(`${c.dim}Strong BM25 signal (${topScore.toFixed(2)}) - skipping expansion${c.reset}\n`);
+      // Still log the "expansion tree" in the same style as vsearch for consistency.
+      {
+        const lines: string[] = [];
+        lines.push(`${c.dim}├─ ${query} · (lexical+vector)${c.reset}`);
+        lines[lines.length - 1] = lines[lines.length - 1]!.replace('├─', '└─');
+        for (const line of lines) process.stderr.write(line + '\n');
+      }
+    } else {
+      // Weak signal - expand query for better recall
+      const queryables = await expandQueryStructured(query, true, opts.context, session);
+
+      for (const q of queryables) {
+        if (q.type === 'lex') {
+          if (q.text && q.text !== query) ftsQueries.push(q.text);
+        } else if (q.type === 'vec' || q.type === 'hyde') {
+          if (q.text && q.text !== query) vectorQueries.push(q.text);
+        }
       }
     }
-  }
-
-  process.stderr.write(`${c.dim}Searching ${ftsQueries.length} lexical + ${vectorQueries.length} vector queries...${c.reset}\n`);
 
-  // Collect ranked result lists for RRF fusion
-  const rankedLists: RankedResult[][] = [];
+    process.stderr.write(`${c.dim}Searching ${ftsQueries.length} lexical + ${vectorQueries.length} vector queries...${c.reset}\n`);
 
-  // Map to store hash by filepath for final results
-  const hashMap = new Map<string, string>();
+    // Collect ranked result lists for RRF fusion
+    const rankedLists: RankedResult[][] = [];
 
-  // Run all searches concurrently (FTS + Vector)
-  const searchPromises: Promise<void>[] = [];
+    // Map to store hash by filepath for final results
+    const hashMap = new Map<string, string>();
 
-  // FTS searches
-  for (const q of ftsQueries) {
-    if (!q) continue;
-    searchPromises.push((async () => {
-      const ftsResults = searchFTS(db, q, 20, (collectionName || "") as any);
-      if (ftsResults.length > 0) {
-        for (const r of ftsResults) {
-          // Mutex for hashMap is not strictly needed as it's just adding values
-          hashMap.set(r.filepath, r.hash);
-        }
-        rankedLists.push(ftsResults.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score })));
-      }
-    })());
-  }
+    // Run all searches concurrently (FTS + Vector)
+    const searchPromises: Promise<void>[] = [];
 
-  // Vector searches
-  if (hasVectors) {
-    for (const q of vectorQueries) {
+    // FTS searches
+    for (const q of ftsQueries) {
       if (!q) continue;
       searchPromises.push((async () => {
-        const vecResults = await searchVec(db, q, embedModel, 20, (collectionName || "") as any);
-        if (vecResults.length > 0) {
-          for (const r of vecResults) hashMap.set(r.filepath, r.hash);
-          rankedLists.push(vecResults.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score })));
+        const ftsResults = searchFTS(db, q, 20, (collectionName || "") as any);
+        if (ftsResults.length > 0) {
+          for (const r of ftsResults) {
+            // Mutex for hashMap is not strictly needed as it's just adding values
+            hashMap.set(r.filepath, r.hash);
+          }
+          rankedLists.push(ftsResults.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score })));
         }
       })());
     }
-  }
 
-  await Promise.all(searchPromises);
+    // Vector searches (session ensures contexts stay alive)
+    if (hasVectors) {
+      for (const q of vectorQueries) {
+        if (!q) continue;
+        searchPromises.push((async () => {
+          const vecResults = await searchVec(db, q, embedModel, 20, (collectionName || "") as any, session);
+          if (vecResults.length > 0) {
+            for (const r of vecResults) hashMap.set(r.filepath, r.hash);
+            rankedLists.push(vecResults.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score })));
+          }
+        })());
+      }
+    }
 
-  // Apply Reciprocal Rank Fusion to combine all ranked lists
-  // Give 2x weight to original query results (first 2 lists: FTS + vector)
-  const weights = rankedLists.map((_, i) => i < 2 ? 2.0 : 1.0);
-  const fused = reciprocalRankFusion(rankedLists, weights);
-  // Hard cap reranking for latency/cost. We rerank per-document (best chunk only).
-  const RERANK_DOC_LIMIT = 40;
-  const candidates = fused.slice(0, RERANK_DOC_LIMIT);
+    await Promise.all(searchPromises);
 
-  if (candidates.length === 0) {
-    console.log("No results found.");
-    closeDb();
-    return;
-  }
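+    // Apply Reciprocal Rank Fusion to combine all ranked lists
+    // Give 2x weight to the first 2 lists (intended: the original query's FTS + vector results).
+    // NOTE(review): rankedLists is filled in push order; searchFTS is called synchronously, so every
+    // FTS list is pushed before any vector list. With expanded lexical queries index 1 may therefore
+    // be an expansion's FTS list rather than the original query's vector list - verify.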
+    const weights = rankedLists.map((_, i) => i < 2 ? 2.0 : 1.0);
+    const fused = reciprocalRankFusion(rankedLists, weights);
+    // Hard cap reranking for latency/cost. We rerank per-document (best chunk only).
+    const RERANK_DOC_LIMIT = 40;
+    const candidates = fused.slice(0, RERANK_DOC_LIMIT);
 
-  // Rerank multiple chunks per document, then aggregate scores
-  // This improves ranking for long documents where keyword-matched chunk isn't always best
-  // We only rerank ONE chunk per document (best chunk by a simple keyword heuristic),
-  // so we never rerank more than RERANK_DOC_LIMIT items.
-  const chunksToRerank: { file: string; text: string; chunkIdx: number }[] = [];
-  const docChunkMap = new Map<string, { chunks: { text: string; pos: number }[]; bestIdx: number }>();
-
-  const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
-  for (const c of candidates) {
-    const chunks = chunkDocument(c.body);
-    if (chunks.length === 0) continue;
-
-    // Choose best chunk by keyword matches; fall back to first chunk.
-    let bestIdx = 0;
-    let bestScore = -1;
-    for (let i = 0; i < chunks.length; i++) {
-      const chunkLower = chunks[i]!.text.toLowerCase();
-      const score = queryTerms.reduce((acc, term) => acc + (chunkLower.includes(term) ? 1 : 0), 0);
-      if (score > bestScore) {
-        bestScore = score;
-        bestIdx = i;
-      }
+    if (candidates.length === 0) {
+      console.log("No results found.");
+      closeDb();
+      return;
     }
 
-    chunksToRerank.push({ file: c.file, text: chunks[bestIdx]!.text, chunkIdx: bestIdx });
-    docChunkMap.set(c.file, { chunks, bestIdx });
-  }
+    // Rerank one chunk per document and use that chunk's score as the document's score.
+    // For long documents this scores the most query-relevant passage instead of the whole body.
+    // The chunk is picked by a simple keyword heuristic (falling back to the first chunk),
+    // so we never rerank more than RERANK_DOC_LIMIT items.
+    const chunksToRerank: { file: string; text: string; chunkIdx: number }[] = [];
+    const docChunkMap = new Map<string, { chunks: { text: string; pos: number }[]; bestIdx: number }>();
+
+    const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
+    for (const cand of candidates) {
+      const chunks = chunkDocument(cand.body);
+      if (chunks.length === 0) continue;
+
+      // Choose best chunk by keyword matches; fall back to first chunk.
+      let bestIdx = 0;
+      let bestScore = -1;
+      for (let i = 0; i < chunks.length; i++) {
+        const chunkLower = chunks[i]!.text.toLowerCase();
+        const score = queryTerms.reduce((acc, term) => acc + (chunkLower.includes(term) ? 1 : 0), 0);
+        if (score > bestScore) {
+          bestScore = score;
+          bestIdx = i;
+        }
+      }
 
-  // Rerank selected chunks (with caching). One chunk per doc -> one rerank item per doc.
-  const reranked = await rerank(
-    query,
-    chunksToRerank.map(c => ({ file: c.file, text: c.text })),
-    rerankModel,
-    db
-  );
+      chunksToRerank.push({ file: cand.file, text: chunks[bestIdx]!.text, chunkIdx: bestIdx });
+      docChunkMap.set(cand.file, { chunks, bestIdx });
+    }
 
-  const aggregatedScores = new Map<string, { score: number; bestChunkIdx: number }>();
-  for (const r of reranked) {
-    const chunkInfo = docChunkMap.get(r.file);
-    aggregatedScores.set(r.file, { score: r.score, bestChunkIdx: chunkInfo?.bestIdx ?? 0 });
-  }
-
-  // Blend RRF position score with aggregated reranker score using position-aware weights
-  // Top retrieval results get more protection from reranker disagreement
-  const candidateMap = new Map(candidates.map(c => [c.file, { displayPath: c.displayPath, title: c.title, body: c.body }]));
-  const rrfRankMap = new Map(candidates.map((c, i) => [c.file, i + 1])); // 1-indexed rank
-
-  const finalResults = Array.from(aggregatedScores.entries()).map(([file, { score: rerankScore, bestChunkIdx }]) => {
-    const rrfRank = rrfRankMap.get(file) || 30;
-    // Position-aware blending: top retrieval results preserved more
-    // Rank 1-3: 75% RRF, 25% reranker (trust retrieval for exact matches)
-    // Rank 4-10: 60% RRF, 40% reranker
-    // Rank 11+: 40% RRF, 60% reranker (trust reranker for lower-ranked)
-    let rrfWeight: number;
-    if (rrfRank <= 3) {
-      rrfWeight = 0.75;
-    } else if (rrfRank <= 10) {
-      rrfWeight = 0.60;
-    } else {
-      rrfWeight = 0.40;
-    }
-    const rrfScore = 1 / rrfRank;  // Position-based: 1, 0.5, 0.33...
-    const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * rerankScore;
-    const candidate = candidateMap.get(file);
-    // Use the best-scoring chunk's text for the body (better for snippets)
-    const chunkInfo = docChunkMap.get(file);
-    const chunkBody = chunkInfo ? (chunkInfo.chunks[bestChunkIdx]?.text || chunkInfo.chunks[0]!.text) : candidate?.body || "";
-    const chunkPos = chunkInfo ? (chunkInfo.chunks[bestChunkIdx]?.pos || 0) : 0;
-    return {
-      file,
-      displayPath: candidate?.displayPath || "",
-      title: candidate?.title || "",
-      body: chunkBody,
-      chunkPos,
-      score: blendedScore,
-      context: getContextForFile(db, file),
-      hash: hashMap.get(file) || "",
-    };
-  }).sort((a, b) => b.score - a.score);
-
-  // Deduplicate by file (safety net - shouldn't happen but prevents duplicate output)
-  const seenFiles = new Set<string>();
-  const dedupedResults = finalResults.filter(r => {
-    if (seenFiles.has(r.file)) return false;
-    seenFiles.add(r.file);
-    return true;
-  });
+    // Rerank selected chunks (with caching). One chunk per doc -> one rerank item per doc.
+    const reranked = await rerank(
+      query,
+      chunksToRerank.map(ch => ({ file: ch.file, text: ch.text })),
+      rerankModel,
+      db,
+      session
+    );
 
-  closeDb();
-  outputResults(dedupedResults, query, opts);
+    const aggregatedScores = new Map<string, { score: number; bestChunkIdx: number }>();
+    for (const r of reranked) {
+      const chunkInfo = docChunkMap.get(r.file);
+      aggregatedScores.set(r.file, { score: r.score, bestChunkIdx: chunkInfo?.bestIdx ?? 0 });
+    }
+
+    // Blend RRF position score with aggregated reranker score using position-aware weights
+    // Top retrieval results get more protection from reranker disagreement
+    const candidateMap = new Map(candidates.map(cand => [cand.file, { displayPath: cand.displayPath, title: cand.title, body: cand.body }]));
+    const rrfRankMap = new Map(candidates.map((cand, i) => [cand.file, i + 1])); // 1-indexed rank
+
+    const finalResults = Array.from(aggregatedScores.entries()).map(([file, { score: rerankScore, bestChunkIdx }]) => {
+      const rrfRank = rrfRankMap.get(file) || 30;
+      // Position-aware blending: top retrieval results preserved more
+      // Rank 1-3: 75% RRF, 25% reranker (trust retrieval for exact matches)
+      // Rank 4-10: 60% RRF, 40% reranker
+      // Rank 11+: 40% RRF, 60% reranker (trust reranker for lower-ranked)
+      let rrfWeight: number;
+      if (rrfRank <= 3) {
+        rrfWeight = 0.75;
+      } else if (rrfRank <= 10) {
+        rrfWeight = 0.60;
+      } else {
+        rrfWeight = 0.40;
+      }
+      const rrfScore = 1 / rrfRank;  // Position-based: 1, 0.5, 0.33...
+      const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * rerankScore;
+      const candidate = candidateMap.get(file);
+      // Use the best-scoring chunk's text for the body (better for snippets)
+      const chunkInfo = docChunkMap.get(file);
+      const chunkBody = chunkInfo ? (chunkInfo.chunks[bestChunkIdx]?.text || chunkInfo.chunks[0]!.text) : candidate?.body || "";
+      const chunkPos = chunkInfo ? (chunkInfo.chunks[bestChunkIdx]?.pos || 0) : 0;
+      return {
+        file,
+        displayPath: candidate?.displayPath || "",
+        title: candidate?.title || "",
+        body: chunkBody,
+        chunkPos,
+        score: blendedScore,
+        context: getContextForFile(db, file),
+        hash: hashMap.get(file) || "",
+      };
+    }).sort((a, b) => b.score - a.score);
+
+    // Deduplicate by file (safety net - shouldn't happen but prevents duplicate output)
+    const seenFiles = new Set<string>();
+    const dedupedResults = finalResults.filter(r => {
+      if (seenFiles.has(r.file)) return false;
+      seenFiles.add(r.file);
+      return true;
+    });
+
+    closeDb();
+    outputResults(dedupedResults, query, opts);
+  }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
 }
 
 // Parse CLI arguments using util.parseArgs

+ 1 - 1
src/store.test.ts

@@ -1850,7 +1850,7 @@ describe("LlamaCpp Integration", () => {
     expect(allResults).toHaveLength(2);
 
     // Search with collection filter - should return only from collection1
-    const filtered = await store.searchVec("content", "embeddinggemma", 10, collection1 as unknown as number);
+    const filtered = await store.searchVec("content", "embeddinggemma", 10, collection1);
     expect(filtered).toHaveLength(1);
     expect(filtered[0]!.collectionName).toBe(collection1);
 

+ 7 - 5
src/store.ts

@@ -21,6 +21,7 @@ import {
   formatQueryForEmbedding,
   formatDocForEmbedding,
   type RerankDocument,
+  type ILLMSession,
 } from "./llm";
 import {
   findContextForPath as collectionsFindContextForPath,
@@ -1900,11 +1901,11 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle
 // Vector Search
 // =============================================================================
 
-export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string): Promise<SearchResult[]> {
+export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession): Promise<SearchResult[]> {
   const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
   if (!tableExists) return [];
 
-  const embedding = await getEmbedding(query, model, true);
+  const embedding = await getEmbedding(query, model, true, session);
   if (!embedding) return [];
 
   // IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables
@@ -1990,11 +1991,12 @@ export async function searchVec(db: Database, query: string, model: string, limi
 // Embeddings
 // =============================================================================
 
 /**
  * Compute the embedding vector for a piece of text.
  *
  * The text is first wrapped in the query or document prompt template
  * (selected by `isQuery`) and then passed to `embed()`, either on the
  * supplied `session` or, when no session is given, on the default
  * LlamaCpp instance.
  *
  * @param text - Raw text to embed.
  * @param model - Embedding model name, forwarded to `embed()`.
  * @param isQuery - `true` to format the text as a search query, `false` to format it as a document.
  * @param session - Optional LLM session to embed through.
  * @returns The embedding vector, or `null` when `embed()` produces no result or no embedding.
  */
-async function getEmbedding(text: string, model: string, isQuery: boolean): Promise<number[] | null> {
-  const llm = getDefaultLlamaCpp();
+async function getEmbedding(text: string, model: string, isQuery: boolean, session?: ILLMSession): Promise<number[] | null> {
   // Format text using the appropriate prompt template
   const formattedText = isQuery ? formatQueryForEmbedding(text) : formatDocForEmbedding(text);
-  const result = await llm.embed(formattedText, { model, isQuery });
   // Embed through the caller's session when one is supplied; otherwise use the default LlamaCpp instance.
+  const result = session
+    ? await session.embed(formattedText, { model, isQuery })
+    : await getDefaultLlamaCpp().embed(formattedText, { model, isQuery });
   return result?.embedding || null;
 }