5 сар өмнө · 32d313ad6b
--- a/src/llm.test.ts
+++ b/src/llm.test.ts
@@ -12,7 +12,11 @@ import {
 
				   LlamaCpp,
			
 
				   getDefaultLlamaCpp,
			
 
				   disposeDefaultLlamaCpp,
			
 
				+  withLLMSession,
			
 
				+  canUnloadLLM,
			
 
				+  SessionReleasedError,
			
 
				   type RerankDocument,
			
 
				+  type ILLMSession,
			
 
				 } from "./llm.js";
			
 
				 
			
 
				 // =============================================================================
			
@@ -382,3 +386,174 @@ describe("LlamaCpp Integration", () => {
 
				   });
			
 
				 });
			
 
				 
			
 
				+// =============================================================================
			
 
				+// Session Management Tests
			
 
				+// =============================================================================
			
 
				+
			
 
				// Exercises withLLMSession end to end: validity tracking, unload blocking,
				// nesting, delegated LLM operations, abort handling and result/error propagation.
				describe("LLM Session Management", () => {
				  describe("withLLMSession", () => {
				    // Embedding vector length asserted by the embed/embedBatch checks below.
				    const EXPECTED_EMBEDDING_LENGTH = 768;

				    test("session provides access to LLM operations", async () => {
				      const result = await withLLMSession(async (session) => {
				        expect(session.isValid).toBe(true);
				        const embedding = await session.embed("test text");
				        expect(embedding).not.toBeNull();
				        expect(embedding!.embedding.length).toBe(EXPECTED_EMBEDDING_LENGTH);
				        return "success";
				      });
				      expect(result).toBe("success");
				    });

				    test("session is invalid after release", async () => {
				      // Collected in an array rather than a `let x: T | null = null` binding:
				      // the compiler would narrow such a binding to `null` because the only
				      // assignment happens inside the callback.
				      const captured: ILLMSession[] = [];

				      await withLLMSession(async (session) => {
				        captured.push(session);
				        expect(session.isValid).toBe(true);
				      });

				      // Session should be invalid after withLLMSession returns
				      expect(captured).toHaveLength(1);
				      expect(captured[0]!.isValid).toBe(false);
				    });

				    test("session prevents idle unload during operations", async () => {
				      await withLLMSession(async (session) => {
				        // While inside a session, canUnloadLLM should return false
				        expect(canUnloadLLM()).toBe(false);

				        // Perform an operation
				        await session.embed("test");

				        // Still should not be able to unload
				        expect(canUnloadLLM()).toBe(false);
				      });

				      // After session ends, should be able to unload
				      expect(canUnloadLLM()).toBe(true);
				    });

				    test("nested sessions increment ref count", async () => {
				      await withLLMSession(async (outerSession) => {
				        expect(canUnloadLLM()).toBe(false);

				        await withLLMSession(async (innerSession) => {
				          expect(canUnloadLLM()).toBe(false);
				          expect(innerSession.isValid).toBe(true);
				          expect(outerSession.isValid).toBe(true);
				        });

				        // Inner session released, but outer still active
				        expect(canUnloadLLM()).toBe(false);
				        expect(outerSession.isValid).toBe(true);
				      });

				      // All sessions released
				      expect(canUnloadLLM()).toBe(true);
				    });

				    test("session embedBatch works correctly", async () => {
				      await withLLMSession(async (session) => {
				        const texts = ["Hello world", "Test text", "Another document"];
				        const results = await session.embedBatch(texts);

				        expect(results).toHaveLength(3);
				        for (const result of results) {
				          expect(result).not.toBeNull();
				          expect(result!.embedding.length).toBe(EXPECTED_EMBEDDING_LENGTH);
				        }
				      });
				    });

				    test("session rerank works correctly", async () => {
				      await withLLMSession(async (session) => {
				        const documents: RerankDocument[] = [
				          { file: "a.txt", text: "The capital of France is Paris." },
				          { file: "b.txt", text: "Dogs are great pets." },
				        ];

				        const result = await session.rerank("What is the capital of France?", documents);

				        expect(result.results).toHaveLength(2);
				        expect(result.results[0]!.file).toBe("a.txt");
				        expect(result.results[0]!.score).toBeGreaterThan(result.results[1]!.score);
				      });
				    });

				    test("max duration aborts session after timeout", async () => {
				      let aborted = false;

				      try {
				        await withLLMSession(async (session) => {
				          // Wait longer than max duration
				          await new Promise(resolve => setTimeout(resolve, 150));

				          // This operation should throw because session was aborted
				          await session.embed("test");
				        }, { maxDuration: 50 }); // 50ms max
				      } catch (err) {
				        // Only the session-abort error is expected; anything else is a real failure.
				        if (err instanceof SessionReleasedError) {
				          aborted = true;
				        } else {
				          throw err;
				        }
				      }

				      expect(aborted).toBe(true);
				    }, 5000);

				    test("external abort signal propagates to session", async () => {
				      const abortController = new AbortController();
				      let sessionAborted = false;

				      const promise = withLLMSession(async (session) => {
				        // Wait a bit then check if aborted
				        await new Promise(resolve => setTimeout(resolve, 100));

				        if (!session.isValid) {
				          sessionAborted = true;
				          throw new SessionReleasedError("Session aborted");
				        }

				        return "should not reach";
				      }, { signal: abortController.signal });

				      // Abort after 20ms
				      setTimeout(() => abortController.abort(), 20);

				      // Record the rejection instead of discarding it, so an unrelated
				      // failure cannot pass as the expected abort.
				      let rejection: unknown = null;
				      try {
				        await promise;
				      } catch (err) {
				        rejection = err;
				      }

				      expect(rejection).toBeInstanceOf(SessionReleasedError);
				      expect(sessionAborted).toBe(true);
				    }, 5000);

				    test("session provides abort signal for monitoring", async () => {
				      await withLLMSession(async (session) => {
				        expect(session.signal).toBeInstanceOf(AbortSignal);
				        expect(session.signal.aborted).toBe(false);
				      });
				    });

				    test("returns value from callback", async () => {
				      const result = await withLLMSession(async (session) => {
				        await session.embed("test");
				        return { status: "complete", count: 42 };
				      });

				      expect(result).toEqual({ status: "complete", count: 42 });
				    });

				    test("propagates errors from callback", async () => {
				      const customError = new Error("Custom test error");

				      await expect(
				        withLLMSession(async () => {
				          throw customError;
				        })
				      ).rejects.toThrow("Custom test error");
				    });
				  });
				});
			
 
				+
			
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -119,6 +119,32 @@ export type RerankOptions = {
 
				   model?: string;
			
 
				 };
			
 
				 
			
 
				/**
				 * Optional settings accepted when opening an LLM session.
				 * Every field may be omitted.
				 */
				export type LLMSessionOptions = {
				  /** Debug label for the session; it appears in the max-duration abort message. */
				  name?: string;
				  /** Caller-owned abort signal; aborting it aborts the session too. */
				  signal?: AbortSignal;
				  /** Upper bound on the session lifetime, in milliseconds (default: 10 minutes). */
				  maxDuration?: number;
				};
			
 
				+
			
 
				/**
				 * Contract for a scoped LLM handle whose lifetime is bounded by a session.
				 * The four operations share their signatures with the session class that
				 * implements this interface in this file.
				 */
				export interface ILLMSession {
				  /** True until the session has been released or its signal has aborted. */
				  readonly isValid: boolean;
				  /** Signal tied to the session; it aborts on release or when maxDuration elapses. */
				  readonly signal: AbortSignal;

				  /** Embed one text; resolves to null when no embedding is available. */
				  embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
				  /** Embed several texts; the result has one entry (possibly null) per input. */
				  embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]>;
				  /** Expand a query into a list of Queryable entries. */
				  expandQuery(query: string, options?: { context?: string; includeLexical?: boolean }): Promise<Queryable[]>;
				  /** Rerank the given documents against the query. */
				  rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
				}
			
 
				+
			
 
				 /**
			
 
				  * Supported query types for different search backends
			
 
				  */
			
@@ -225,8 +251,8 @@ export type LlamaCppConfig = {
 
				 /**
			
 
				  * LLM implementation using node-llama-cpp
			
 
				  */
			
 
				-// Default inactivity timeout: 2 minutes
			
 
				-const DEFAULT_INACTIVITY_TIMEOUT_MS = 2 * 60 * 1000;
			
 
				+// Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
			
 
				+const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
			
 
				 
			
 
				 export class LlamaCpp implements LLM {
			
 
				   private llama: Llama | null = null;
			
@@ -267,7 +293,7 @@ export class LlamaCpp implements LLM {
 
				 
			
 
				   /**
			
 
				    * Reset the inactivity timer. Called after each model operation.
			
 
				-   * When timer fires, models are unloaded to free memory.
			
 
				+   * When timer fires, models are unloaded to free memory (if no active sessions).
			
 
				    */
			
 
				   private touchActivity(): void {
			
 
				     // Clear existing timer
			
@@ -279,6 +305,14 @@ export class LlamaCpp implements LLM {
 
				     // Only set timer if we have disposable contexts and timeout is enabled
			
 
				     if (this.inactivityTimeoutMs > 0 && this.hasLoadedContexts()) {
			
 
				       this.inactivityTimer = setTimeout(() => {
			
 
				+        // Check if session manager allows unloading
			
 
				+        // canUnloadLLM is defined later in this file - it checks the session manager
			
 
				+        // The typeof check is only a defensive guard (no dynamic import is involved): canUnloadLLM is a function declared in this same file
			
 
				+        if (typeof canUnloadLLM === 'function' && !canUnloadLLM()) {
			
 
				+          // Active sessions/operations - reschedule timer
			
 
				+          this.touchActivity();
			
 
				+          return;
			
 
				+        }
			
 
				         this.unloadIdleResources().catch(err => {
			
 
				           console.error("Error unloading idle resources:", err);
			
 
				         });
			
@@ -390,6 +424,8 @@ export class LlamaCpp implements LLM {
 
				       const modelPath = await this.resolveModel(this.embedModelUri);
			
 
				       const model = await llama.loadModel({ modelPath });
			
 
				       this.embedModel = model;
			
 
				+      // Model loading counts as activity - ping to keep alive
			
 
				+      this.touchActivity();
			
 
				       return model;
			
 
				     })();
			
 
				 
			
@@ -421,7 +457,9 @@ export class LlamaCpp implements LLM {
 
				       })();
			
 
				 
			
 
				       try {
			
 
				-        await this.embedContextCreatePromise;
			
 
				+        const context = await this.embedContextCreatePromise;
			
 
				+        this.touchActivity();
			
 
				+        return context;
			
 
				       } finally {
			
 
				         this.embedContextCreatePromise = null;
			
 
				       }
			
@@ -476,6 +514,8 @@ export class LlamaCpp implements LLM {
 
				       const modelPath = await this.resolveModel(this.rerankModelUri);
			
 
				       const model = await llama.loadModel({ modelPath });
			
 
				       this.rerankModel = model;
			
 
				+      // Model loading counts as activity - ping to keep alive
			
 
				+      this.touchActivity();
			
 
				       return model;
			
 
				     })();
			
 
				 
			
@@ -538,6 +578,9 @@ export class LlamaCpp implements LLM {
 
				   // ==========================================================================
			
 
				 
			
 
				   async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
			
 
				+    // Ping activity at start to keep models alive during this operation
			
 
				+    this.touchActivity();
			
 
				+
			
 
				     try {
			
 
				       const context = await this.ensureEmbedContext();
			
 
				       const embedding = await context.getEmbeddingFor(text);
			
@@ -557,6 +600,9 @@ export class LlamaCpp implements LLM {
 
				    * Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally
			
 
				    */
			
 
				   async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> {
			
 
				+    // Ping activity at start to keep models alive during this operation
			
 
				+    this.touchActivity();
			
 
				+
			
 
				     if (texts.length === 0) return [];
			
 
				 
			
 
				     try {
			
@@ -587,6 +633,9 @@ export class LlamaCpp implements LLM {
 
				   }
			
 
				 
			
 
				   async generate(prompt: string, options: GenerateOptions = {}): Promise<GenerateResult | null> {
			
 
				+    // Ping activity at start to keep models alive during this operation
			
 
				+    this.touchActivity();
			
 
				+
			
 
				     // Ensure model is loaded
			
 
				     await this.ensureGenerateModel();
			
 
				 
			
@@ -639,6 +688,9 @@ export class LlamaCpp implements LLM {
 
				   // ==========================================================================
			
 
				 
			
 
				   async expandQuery(query: string, options: { context?: string, includeLexical?: boolean } = {}): Promise<Queryable[]> {
			
 
				+    // Ping activity at start to keep models alive during this operation
			
 
				+    this.touchActivity();
			
 
				+
			
 
				     const llama = await this.ensureLlama();
			
 
				     await this.ensureGenerateModel();
			
 
				 
			
@@ -740,6 +792,9 @@ Final Output:`;
 
				     documents: RerankDocument[],
			
 
				     options: RerankOptions = {}
			
 
				   ): Promise<RerankResult> {
			
 
				+    // Ping activity at start to keep models alive during this operation
			
 
				+    this.touchActivity();
			
 
				+
			
 
				     const context = await this.ensureRerankContext();
			
 
				 
			
 
				     // Build a map from document text to original indices (for lookup after sorting)
			
@@ -808,6 +863,232 @@ Final Output:`;
 
				   }
			
 
				 }
			
 
				 
			
 
				+// =============================================================================
			
 
				+// Session Management Layer
			
 
				+// =============================================================================
			
 
				+
			
 
				/**
				 * Reference-counting bookkeeper for the sessions and operations that use one
				 * LlamaCpp instance. It holds two counters: open sessions and operations
				 * currently running. Idle unloading is reported as safe only while both are zero.
				 */
				class LLMSessionManager {
				  private sessionsOpen = 0;
				  private operationsRunning = 0;

				  /** @param llm The LlamaCpp instance whose sessions are tracked. */
				  constructor(private readonly llm: LlamaCpp) {}

				  /** Number of sessions acquired and not yet released. */
				  get activeSessionCount(): number {
				    return this.sessionsOpen;
				  }

				  /** Number of operations started and not yet ended. */
				  get inFlightOperations(): number {
				    return this.operationsRunning;
				  }

				  /**
				   * Whether unloading is safe right now.
				   * @returns false while any session is open or any operation is running.
				   */
				  canUnload(): boolean {
				    return !(this.sessionsOpen > 0 || this.operationsRunning > 0);
				  }

				  /** Register one more open session. */
				  acquire(): void {
				    this.sessionsOpen += 1;
				  }

				  /** Drop one open session; the counter never goes below zero. */
				  release(): void {
				    if (this.sessionsOpen > 0) {
				      this.sessionsOpen -= 1;
				    }
				  }

				  /** Register one more running operation. */
				  operationStart(): void {
				    this.operationsRunning += 1;
				  }

				  /** Mark one running operation as finished; the counter never goes below zero. */
				  operationEnd(): void {
				    if (this.operationsRunning > 0) {
				      this.operationsRunning -= 1;
				    }
				  }

				  /** The LlamaCpp instance this manager was constructed with. */
				  getLlamaCpp(): LlamaCpp {
				    return this.llm;
				  }
				}
			
 
				+
			
 
				/**
				 * Raised when an LLM session is used after it has been released or aborted.
				 * Callers can tell it apart from other failures with `instanceof`.
				 */
				export class SessionReleasedError extends Error {
				  /** Error name, set per instance. */
				  override name = "SessionReleasedError";

				  /** @param message Optional detail; a generic description is used when omitted. */
				  constructor(message = "LLM session has been released or aborted") {
				    super(message);
				  }
				}
			
 
				+
			
 
				/**
				 * Scoped LLM session with automatic lifecycle management.
				 *
				 * Constructing a session acquires a lease on the manager; release() returns it.
				 * Each operation is delegated to the manager's LlamaCpp instance and counted as
				 * in-flight for as long as it runs. The session's signal aborts when the
				 * caller's signal aborts, when maxDuration elapses, or on release().
				 *
				 * Abort is checked only before an operation starts: work that is already
				 * running when the session aborts is not interrupted.
				 */
				class LLMSession implements ILLMSession {
				  private readonly manager: LLMSessionManager;
				  private readonly name: string;
				  private readonly abortController = new AbortController();
				  private released = false;
				  private maxDurationTimer: ReturnType<typeof setTimeout> | null = null;
				  /** Removes the listener attached to the caller's signal; null when none is attached. */
				  private detachExternalSignal: (() => void) | null = null;

				  /**
				   * @param manager Session manager that tracks the lease and in-flight operations.
				   * @param options Optional name, external abort signal and max duration.
				   */
				  constructor(manager: LLMSessionManager, options: LLMSessionOptions = {}) {
				    this.manager = manager;
				    this.name = options.name || "unnamed";

				    // Link external abort signal if provided
				    const external = options.signal;
				    if (external) {
				      if (external.aborted) {
				        this.abortController.abort(external.reason);
				      } else {
				        const onExternalAbort = (): void => {
				          this.detachExternalSignal = null;
				          this.abortController.abort(external.reason);
				        };
				        external.addEventListener("abort", onExternalAbort, { once: true });
				        // Kept so release() can unhook the listener; otherwise a long-lived
				        // caller signal would keep every finished session reachable.
				        this.detachExternalSignal = () => {
				          external.removeEventListener("abort", onExternalAbort);
				        };
				      }
				    }

				    // Set up max duration timer (a value <= 0 disables it)
				    const maxDuration = options.maxDuration ?? 10 * 60 * 1000; // Default 10 minutes
				    if (maxDuration > 0) {
				      this.maxDurationTimer = setTimeout(() => {
				        this.abortController.abort(new Error(`Session "${this.name}" exceeded max duration of ${maxDuration}ms`));
				      }, maxDuration);
				      this.maxDurationTimer.unref(); // Don't keep process alive
				    }

				    // Acquire session lease
				    this.manager.acquire();
				  }

				  /** True while the session has been neither released nor aborted. */
				  get isValid(): boolean {
				    return !this.released && !this.abortController.signal.aborted;
				  }

				  /** Abort signal owned by this session. */
				  get signal(): AbortSignal {
				    return this.abortController.signal;
				  }

				  /**
				   * Release the session and decrement ref count.
				   * Called automatically by withLLMSession when the callback completes.
				   * Calling it again is a no-op.
				   */
				  release(): void {
				    if (this.released) return;
				    this.released = true;

				    if (this.maxDurationTimer) {
				      clearTimeout(this.maxDurationTimer);
				      this.maxDurationTimer = null;
				    }

				    if (this.detachExternalSignal) {
				      this.detachExternalSignal();
				      this.detachExternalSignal = null;
				    }

				    // No-op if the signal already aborted; the earlier reason is kept.
				    this.abortController.abort(new Error("Session released"));
				    this.manager.release();
				  }

				  /**
				   * Best-effort description of why the session signal aborted.
				   * @returns The reason's message when it has a non-empty one, otherwise "Session aborted".
				   */
				  private abortMessage(): string {
				    const reason: unknown = this.abortController.signal.reason;
				    if (typeof reason === "object" && reason !== null && "message" in reason) {
				      const { message } = reason as { message: unknown };
				      if (typeof message === "string" && message.length > 0) {
				        return message;
				      }
				    }
				    return "Session aborted";
				  }

				  /**
				   * Wrap an operation with tracking and abort checking.
				   *
				   * @param fn The operation to run.
				   * @returns Whatever fn resolves to.
				   * @throws SessionReleasedError if the session was released (generic message)
				   *   or aborted (message taken from the abort reason) before the operation starts.
				   */
				  private async withOperation<T>(fn: () => Promise<T>): Promise<T> {
				    if (this.released) {
				      throw new SessionReleasedError();
				    }
				    if (this.abortController.signal.aborted) {
				      // Surface why the session ended (max duration, external abort, ...).
				      throw new SessionReleasedError(this.abortMessage());
				    }

				    this.manager.operationStart();
				    try {
				      return await fn();
				    } finally {
				      this.manager.operationEnd();
				    }
				  }

				  /** Embed one text through the session. */
				  async embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null> {
				    return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
				  }

				  /** Embed several texts through the session. */
				  async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> {
				    return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts));
				  }

				  /** Expand a query through the session. */
				  async expandQuery(
				    query: string,
				    options?: { context?: string; includeLexical?: boolean }
				  ): Promise<Queryable[]> {
				    return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
				  }

				  /** Rerank documents through the session. */
				  async rerank(
				    query: string,
				    documents: RerankDocument[],
				    options?: RerankOptions
				  ): Promise<RerankResult> {
				    return this.withOperation(() => this.manager.getLlamaCpp().rerank(query, documents, options));
				  }
				}
			
 
				+
			
 
				// Cached manager paired with the default LlamaCpp instance; created on first use.
				let defaultSessionManager: LLMSessionManager | null = null;

				/**
				 * Return the manager bound to the current default LlamaCpp instance.
				 * A new manager is created (and cached) when none exists yet or when the cached
				 * one wraps a different instance than getDefaultLlamaCpp() now returns.
				 */
				function getSessionManager(): LLMSessionManager {
				  const currentLlm = getDefaultLlamaCpp();
				  const cached = defaultSessionManager;
				  if (cached && cached.getLlamaCpp() === currentLlm) {
				    return cached;
				  }

				  const fresh = new LLMSessionManager(currentLlm);
				  defaultSessionManager = fresh;
				  return fresh;
				}
			
 
				+
			
 
				/**
				 * Run a callback inside a scoped LLM session.
				 *
				 * A session is opened on the default session manager before the callback runs
				 * and is always released afterwards, whether the callback resolves or throws.
				 * The callback's result, or its error, is passed through unchanged.
				 *
				 * @param fn Callback that receives the session.
				 * @param options Optional session settings (name, external signal, max duration).
				 * @returns The value the callback resolves to.
				 *
				 * @example
				 * ```typescript
				 * await withLLMSession(async (session) => {
				 *   const expanded = await session.expandQuery(query);
				 *   const embeddings = await session.embedBatch(texts);
				 *   const reranked = await session.rerank(query, docs);
				 *   return reranked;
				 * }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
				 * ```
				 */
				export async function withLLMSession<T>(
				  fn: (session: ILLMSession) => Promise<T>,
				  options?: LLMSessionOptions
				): Promise<T> {
				  const scoped = new LLMSession(getSessionManager(), options);

				  try {
				    const outcome = await fn(scoped);
				    return outcome;
				  } finally {
				    // Runs on success and on failure alike.
				    scoped.release();
				  }
				}
			
 
				+
			
 
				/**
				 * Report whether idle unloading is currently safe.
				 * With no session manager created yet the answer is true; otherwise the
				 * manager decides (no open sessions and no running operations).
				 * Used internally by the LlamaCpp idle timer.
				 */
				export function canUnloadLLM(): boolean {
				  return defaultSessionManager?.canUnload() ?? true;
				}
			
 
				+
			
 
				 // =============================================================================
			
 
				 // Singleton for default LlamaCpp instance
			
 
				 // =============================================================================
			
--- a/src/qmd.ts
+++ b/src/qmd.ts
@@ -65,7 +65,7 @@ import {
 
				   createStore,
			
 
				   getDefaultDbPath,
			
 
				 } from "./store.js";
			
 
				-import { getDefaultLlamaCpp, disposeDefaultLlamaCpp, type RerankDocument, type Queryable, type QueryType } from "./llm.js";
			
 
				+import { getDefaultLlamaCpp, disposeDefaultLlamaCpp, withLLMSession, type ILLMSession, type RerankDocument, type Queryable, type QueryType } from "./llm.js";
			
 
				 import type { SearchResult, RankedResult } from "./store.js";
			
 
				 import {
			
 
				   formatSearchResults,
			
@@ -231,20 +231,21 @@ function computeDisplayPath(
 
				 }
			
 
				 
			
 
				 // Rerank documents using node-llama-cpp cross-encoder model
			
 
				-async function rerank(query: string, documents: { file: string; text: string }[], _model: string = DEFAULT_RERANK_MODEL, _db?: Database): Promise<{ file: string; score: number }[]> {
			
 
				+async function rerank(query: string, documents: { file: string; text: string }[], _model: string = DEFAULT_RERANK_MODEL, _db?: Database, session?: ILLMSession): Promise<{ file: string; score: number }[]> {
			
 
				   if (documents.length === 0) return [];
			
 
				 
			
 
				   const total = documents.length;
			
 
				   process.stderr.write(`Reranking ${total} documents...\n`);
			
 
				   progress.indeterminate();
			
 
				 
			
 
				-  const llm = getDefaultLlamaCpp();
			
 
				   const rerankDocs: RerankDocument[] = documents.map((doc) => ({
			
 
				     file: doc.file,
			
 
				     text: doc.text.slice(0, 4000), // Truncate to context limit
			
 
				   }));
			
 
				 
			
 
				-  const result = await llm.rerank(query, rerankDocs);
			
 
				+  const result = session
			
 
				+    ? await session.rerank(query, rerankDocs)
			
 
				+    : await getDefaultLlamaCpp().rerank(query, rerankDocs);
			
 
				 
			
 
				   progress.clear();
			
 
				   process.stderr.write("\n");
			
@@ -1543,7 +1544,7 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
 
				     return;
			
 
				   }
			
 
				 
			
 
				-  const totalBytes = allChunks.reduce((sum, c) => sum + c.bytes, 0);
			
 
				+  const totalBytes = allChunks.reduce((sum, chk) => sum + chk.bytes, 0);
			
 
				   const totalChunks = allChunks.length;
			
 
				   const totalDocs = hashesToEmbed.length;
			
 
				 
			
@@ -1556,99 +1557,103 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
 
				   // Hide cursor during embedding
			
 
				   cursor.hide();
			
 
				 
			
 
				-  // Get embedding dimensions from first chunk
			
 
				-  progress.indeterminate();
			
 
				-  const llm = getDefaultLlamaCpp();
			
 
				-  const firstChunk = allChunks[0];
			
 
				-  if (!firstChunk) {
			
 
				-    throw new Error("No chunks available to embed");
			
 
				-  }
			
 
				-  const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title);
			
 
				-  const firstResult = await llm.embed(firstText);
			
 
				-  if (!firstResult) {
			
 
				-    throw new Error("Failed to get embedding dimensions from first chunk");
			
 
				-  }
			
 
				-  ensureVecTable(db, firstResult.embedding.length);
			
 
				+  // Wrap all LLM embedding operations in a session for lifecycle management
			
 
				+  // Use 30 minute timeout for large collections
			
 
				+  await withLLMSession(async (session) => {
			
 
				+    // Get embedding dimensions from first chunk
			
 
				+    progress.indeterminate();
			
 
				+    const firstChunk = allChunks[0];
			
 
				+    if (!firstChunk) {
			
 
				+      throw new Error("No chunks available to embed");
			
 
				+    }
			
 
				+    const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title);
			
 
				+    const firstResult = await session.embed(firstText);
			
 
				+    if (!firstResult) {
			
 
				+      throw new Error("Failed to get embedding dimensions from first chunk");
			
 
				+    }
			
 
				+    ensureVecTable(db, firstResult.embedding.length);
			
 
				 
			
 
				-  let chunksEmbedded = 0, errors = 0, bytesProcessed = 0;
			
 
				-  const startTime = Date.now();
			
 
				+    let chunksEmbedded = 0, errors = 0, bytesProcessed = 0;
			
 
				+    const startTime = Date.now();
			
 
				 
			
 
				-  // Batch embedding for better throughput
			
 
				-  // Process in batches of 32 to balance memory usage and efficiency
			
 
				-  const BATCH_SIZE = 32;
			
 
				+    // Batch embedding for better throughput
			
 
				+    // Process in batches of 32 to balance memory usage and efficiency
			
 
				+    const BATCH_SIZE = 32;
			
 
				 
			
 
				-  for (let batchStart = 0; batchStart < allChunks.length; batchStart += BATCH_SIZE) {
			
 
				-    const batchEnd = Math.min(batchStart + BATCH_SIZE, allChunks.length);
			
 
				-    const batch = allChunks.slice(batchStart, batchEnd);
			
 
				+    for (let batchStart = 0; batchStart < allChunks.length; batchStart += BATCH_SIZE) {
			
 
				+      const batchEnd = Math.min(batchStart + BATCH_SIZE, allChunks.length);
			
 
				+      const batch = allChunks.slice(batchStart, batchEnd);
			
 
				 
			
 
				-    // Format texts for embedding
			
 
				-    const texts = batch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title));
			
 
				+      // Format texts for embedding
			
 
				+      const texts = batch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title));
			
 
				 
			
 
				-    try {
			
 
				-      // Batch embed all texts at once
			
 
				-      const embeddings = await llm.embedBatch(texts);
			
 
				+      try {
			
 
				+        // Batch embed all texts at once
			
 
				+        const embeddings = await session.embedBatch(texts);
			
 
				 
			
 
				-      // Insert each embedding
			
 
				-      for (let i = 0; i < batch.length; i++) {
			
 
				-        const chunk = batch[i]!;
			
 
				-        const embedding = embeddings[i];
			
 
				+        // Insert each embedding
			
 
				+        for (let i = 0; i < batch.length; i++) {
			
 
				+          const chunk = batch[i]!;
			
 
				+          const embedding = embeddings[i];
			
 
				 
			
 
				-        if (embedding) {
			
 
				-          insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now);
			
 
				-          chunksEmbedded++;
			
 
				-        } else {
			
 
				-          errors++;
			
 
				-          console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}${c.reset}`);
			
 
				-        }
			
 
				-        bytesProcessed += chunk.bytes;
			
 
				-      }
			
 
				-    } catch (err) {
			
 
				-      // If batch fails, try individual embeddings as fallback
			
 
				-      for (const chunk of batch) {
			
 
				-        try {
			
 
				-          const text = formatDocForEmbedding(chunk.text, chunk.title);
			
 
				-          const result = await llm.embed(text);
			
 
				-          if (result) {
			
 
				-            insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
			
 
				+          if (embedding) {
			
 
				+            insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now);
			
 
				             chunksEmbedded++;
			
 
				           } else {
			
 
				             errors++;
			
 
				+            console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}${c.reset}`);
			
 
				           }
			
 
				-        } catch (innerErr) {
			
 
				-          errors++;
			
 
				-          console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}: ${innerErr}${c.reset}`);
			
 
				+          bytesProcessed += chunk.bytes;
			
 
				+        }
			
 
				+      } catch (err) {
			
 
				+        // If batch fails, try individual embeddings as fallback
			
 
				+        for (const chunk of batch) {
			
 
				+          try {
			
 
				+            const text = formatDocForEmbedding(chunk.text, chunk.title);
			
 
				+            const result = await session.embed(text);
			
 
				+            if (result) {
			
 
				+              insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
			
 
				+              chunksEmbedded++;
			
 
				+            } else {
			
 
				+              errors++;
			
 
				+            }
			
 
				+          } catch (innerErr) {
			
 
				+            errors++;
			
 
				+            console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}: ${innerErr}${c.reset}`);
			
 
				+          }
			
 
				+          bytesProcessed += chunk.bytes;
			
 
				         }
			
 
				-        bytesProcessed += chunk.bytes;
			
 
				       }
			
 
				-    }
			
 
				 
			
 
				-    const percent = (bytesProcessed / totalBytes) * 100;
			
 
				-    progress.set(percent);
			
 
				+      const percent = (bytesProcessed / totalBytes) * 100;
			
 
				+      progress.set(percent);
			
 
				 
			
 
				-    const elapsed = (Date.now() - startTime) / 1000;
			
 
				-    const bytesPerSec = bytesProcessed / elapsed;
			
 
				-    const remainingBytes = totalBytes - bytesProcessed;
			
 
				-    const etaSec = remainingBytes / bytesPerSec;
			
 
				+      const elapsed = (Date.now() - startTime) / 1000;
			
 
				+      const bytesPerSec = bytesProcessed / elapsed;
			
 
				+      const remainingBytes = totalBytes - bytesProcessed;
			
 
				+      const etaSec = remainingBytes / bytesPerSec;
			
 
				 
			
 
				-    const bar = renderProgressBar(percent);
			
 
				-    const percentStr = percent.toFixed(0).padStart(3);
			
 
				-    const throughput = `${formatBytes(bytesPerSec)}/s`;
			
 
				-    const eta = elapsed > 2 ? formatETA(etaSec) : "...";
			
 
				-    const errStr = errors > 0 ? ` ${c.yellow}${errors} err${c.reset}` : "";
			
 
				+      const bar = renderProgressBar(percent);
			
 
				+      const percentStr = percent.toFixed(0).padStart(3);
			
 
				+      const throughput = `${formatBytes(bytesPerSec)}/s`;
			
 
				+      const eta = elapsed > 2 ? formatETA(etaSec) : "...";
			
 
				+      const errStr = errors > 0 ? ` ${c.yellow}${errors} err${c.reset}` : "";
			
 
				 
			
 
				-    process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}%${c.reset} ${c.dim}${chunksEmbedded}/${totalChunks}${c.reset}${errStr} ${c.dim}${throughput} ETA ${eta}${c.reset}   `);
			
 
				-  }
			
 
				+      process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}%${c.reset} ${c.dim}${chunksEmbedded}/${totalChunks}${c.reset}${errStr} ${c.dim}${throughput} ETA ${eta}${c.reset}   `);
			
 
				+    }
			
 
				 
			
 
				-  progress.clear();
			
 
				-  cursor.show();
			
 
				-  const totalTimeSec = (Date.now() - startTime) / 1000;
			
 
				-  const avgThroughput = formatBytes(totalBytes / totalTimeSec);
			
 
				+    progress.clear();
			
 
				+    cursor.show();
			
 
				+    const totalTimeSec = (Date.now() - startTime) / 1000;
			
 
				+    const avgThroughput = formatBytes(totalBytes / totalTimeSec);
			
 
				+
			
 
				+    console.log(`\r${c.green}${renderProgressBar(100)}${c.reset} ${c.bold}100%${c.reset}                                    `);
			
 
				+    console.log(`\n${c.green}✓ Done!${c.reset} Embedded ${c.bold}${chunksEmbedded}${c.reset} chunks from ${c.bold}${totalDocs}${c.reset} documents in ${c.bold}${formatETA(totalTimeSec)}${c.reset} ${c.dim}(${avgThroughput}/s)${c.reset}`);
			
 
				+    if (errors > 0) {
			
 
				+      console.log(`${c.yellow}⚠ ${errors} chunks failed${c.reset}`);
			
 
				+    }
			
 
				+  }, { maxDuration: 30 * 60 * 1000, name: 'embed-command' });
			
 
				 
			
 
				-  console.log(`\r${c.green}${renderProgressBar(100)}${c.reset} ${c.bold}100%${c.reset}                                    `);
			
 
				-  console.log(`\n${c.green}✓ Done!${c.reset} Embedded ${c.bold}${chunksEmbedded}${c.reset} chunks from ${c.bold}${totalDocs}${c.reset} documents in ${c.bold}${formatETA(totalTimeSec)}${c.reset} ${c.dim}(${avgThroughput}/s)${c.reset}`);
			
 
				-  if (errors > 0) {
			
 
				-    console.log(`${c.yellow}⚠ ${errors} chunks failed${c.reset}`);
			
 
				-  }
			
 
				   closeDb();
			
 
				 }
			
 
				 
			
@@ -1975,60 +1980,64 @@ async function vectorSearch(query: string, opts: OutputOptions, model: string =
 
				   // Check index health and warn about issues
			
 
				   checkIndexHealth(db);
			
 
				 
			
 
				-  // Expand query using structured output (no lexical for vector-only search)
			
 
				-  const queryables = await expandQueryStructured(query, false, opts.context);
			
 
				+  // Wrap LLM operations in a session for lifecycle management
			
 
				+  await withLLMSession(async (session) => {
			
 
				+    // Expand query using structured output (no lexical for vector-only search)
			
 
				+    const queryables = await expandQueryStructured(query, false, opts.context, session);
			
 
				 
			
 
				-  // Build list of queries for vector search: original, vec, and hyde
			
 
				-  const vectorQueries: string[] = [query];
			
 
				-  for (const q of queryables) {
			
 
				-    if (q.type === 'vec' || q.type === 'hyde') {
			
 
				-      if (q.text && q.text !== query) {
			
 
				-        vectorQueries.push(q.text);
			
 
				+    // Build list of queries for vector search: original, vec, and hyde
			
 
				+    const vectorQueries: string[] = [query];
			
 
				+    for (const q of queryables) {
			
 
				+      if (q.type === 'vec' || q.type === 'hyde') {
			
 
				+        if (q.text && q.text !== query) {
			
 
				+          vectorQueries.push(q.text);
			
 
				+        }
			
 
				       }
			
 
				     }
			
 
				-  }
			
 
				 
			
 
				-  process.stderr.write(`${c.dim}Searching ${vectorQueries.length} vector queries...${c.reset}\n`);
			
 
				+    process.stderr.write(`${c.dim}Searching ${vectorQueries.length} vector queries...${c.reset}\n`);
			
 
				 
			
 
				-  // Collect results from all query variations
			
 
				-  const perQueryLimit = opts.all ? 500 : 20;
			
 
				-  const allResults = new Map<string, { file: string; displayPath: string; title: string; body: string; score: number; hash: string }>();
			
 
				+    // Collect results from all query variations
			
 
				+    const perQueryLimit = opts.all ? 500 : 20;
			
 
				+    const allResults = new Map<string, { file: string; displayPath: string; title: string; body: string; score: number; hash: string }>();
			
 
				 
			
 
				-  // IMPORTANT: Run vector searches sequentially, not with Promise.all.
			
 
				-  // node-llama-cpp's embedding context hangs when multiple concurrent embed() calls
			
 
				-  // are made. This is a known limitation of the LlamaEmbeddingContext.
			
 
				-  // See: https://github.com/tobi/qmd/pull/23
			
 
				-  for (const q of vectorQueries) {
			
 
				-    const vecResults = await searchVec(db, q, model, perQueryLimit, collectionName as any);
			
 
				-    for (const r of vecResults) {
			
 
				-      const existing = allResults.get(r.filepath);
			
 
				-      if (!existing || r.score > existing.score) {
			
 
				-        allResults.set(r.filepath, { file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score, hash: r.hash });
			
 
				+    // IMPORTANT: Run vector searches sequentially, not with Promise.all.
			
 
				+    // node-llama-cpp's embedding context hangs when multiple concurrent embed() calls
			
 
				+    // are made. This is a known limitation of the LlamaEmbeddingContext.
			
 
				+    // See: https://github.com/tobi/qmd/pull/23
			
 
				+    for (const q of vectorQueries) {
			
 
				+      const vecResults = await searchVec(db, q, model, perQueryLimit, collectionName as any, session);
			
 
				+      for (const r of vecResults) {
			
 
				+        const existing = allResults.get(r.filepath);
			
 
				+        if (!existing || r.score > existing.score) {
			
 
				+          allResults.set(r.filepath, { file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score, hash: r.hash });
			
 
				+        }
			
 
				       }
			
 
				     }
			
 
				-  }
			
 
				 
			
 
				-  // Sort by max score and limit to requested count
			
 
				-  const results = Array.from(allResults.values())
			
 
				-    .sort((a, b) => b.score - a.score)
			
 
				-    .slice(0, opts.limit)
			
 
				-    .map(r => ({ ...r, context: getContextForFile(db, r.file) }));
			
 
				+    // Sort by max score and limit to requested count
			
 
				+    const results = Array.from(allResults.values())
			
 
				+      .sort((a, b) => b.score - a.score)
			
 
				+      .slice(0, opts.limit)
			
 
				+      .map(r => ({ ...r, context: getContextForFile(db, r.file) }));
			
 
				 
			
 
				-  closeDb();
			
 
				+    closeDb();
			
 
				 
			
 
				-  if (results.length === 0) {
			
 
				-    console.log("No results found.");
			
 
				-    return;
			
 
				-  }
			
 
				-  outputResults(results, query, { ...opts, limit: results.length }); // Already limited
			
 
				+    if (results.length === 0) {
			
 
				+      console.log("No results found.");
			
 
				+      return;
			
 
				+    }
			
 
				+    outputResults(results, query, { ...opts, limit: results.length }); // Already limited
			
 
				+  }, { maxDuration: 10 * 60 * 1000, name: 'vectorSearch' });
			
 
				 }
			
 
				 
			
 
				 // Expand query using structured output with GBNF grammar
			
 
				-async function expandQueryStructured(query: string, includeLexical: boolean = true, context?: string): Promise<Queryable[]> {
			
 
				+async function expandQueryStructured(query: string, includeLexical: boolean = true, context?: string, session?: ILLMSession): Promise<Queryable[]> {
			
 
				   process.stderr.write(`${c.dim}Expanding query...${c.reset}\n`);
			
 
				 
			
 
				-  const llm = getDefaultLlamaCpp();
			
 
				-  const queryables = await llm.expandQuery(query, { includeLexical, context });
			
 
				+  const queryables = session
			
 
				+    ? await session.expandQuery(query, { includeLexical, context })
			
 
				+    : await getDefaultLlamaCpp().expandQuery(query, { includeLexical, context });
			
 
				 
			
 
				   // Log the expansion as a tree
			
 
				   const lines: string[] = [];
			
@@ -2060,8 +2069,8 @@ async function expandQueryStructured(query: string, includeLexical: boolean = tr
 
				   return queryables;
			
 
				 }
			
 
				 
			
 
				-async function expandQuery(query: string, _model: string = DEFAULT_QUERY_MODEL, _db?: Database): Promise<string[]> {
			
 
				-  const queryables = await expandQueryStructured(query, true);
			
 
				+async function expandQuery(query: string, _model: string = DEFAULT_QUERY_MODEL, _db?: Database, session?: ILLMSession): Promise<string[]> {
			
 
				+  const queryables = await expandQueryStructured(query, true, undefined, session);
			
 
				   const queries = new Set<string>([query]);
			
 
				   for (const q of queryables) {
			
 
				     queries.add(q.text);
			
@@ -2098,178 +2107,182 @@ async function querySearch(query: string, opts: OutputOptions, embedModel: strin
 
				   const secondScore = initialFts[1]?.score ?? 0;
			
 
				   const hasStrongSignal = initialFts.length > 0 && topScore >= 0.85 && (topScore - secondScore) >= 0.15;
			
 
				 
			
 
				-  let ftsQueries: string[] = [query];
			
 
				-  let vectorQueries: string[] = [query];
			
 
				-
			
 
				-  if (hasStrongSignal) {
			
 
				-    // Strong BM25 signal - skip expensive LLM expansion
			
 
				-    process.stderr.write(`${c.dim}Strong BM25 signal (${topScore.toFixed(2)}) - skipping expansion${c.reset}\n`);
			
 
				-    // Still log the "expansion tree" in the same style as vsearch for consistency.
			
 
				-    {
			
 
				-      const lines: string[] = [];
			
 
				-      lines.push(`${c.dim}├─ ${query} · (lexical+vector)${c.reset}`);
			
 
				-      lines[lines.length - 1] = lines[lines.length - 1]!.replace('├─', '└─');
			
 
				-      for (const line of lines) process.stderr.write(line + '\n');
			
 
				-    }
			
 
				-  } else {
			
 
				-    // Weak signal - expand query for better recall
			
 
				-    const queryables = await expandQueryStructured(query, true, opts.context);
			
 
				-
			
 
				-    for (const q of queryables) {
			
 
				-      if (q.type === 'lex') {
			
 
				-        if (q.text && q.text !== query) ftsQueries.push(q.text);
			
 
				-      } else if (q.type === 'vec' || q.type === 'hyde') {
			
 
				-        if (q.text && q.text !== query) vectorQueries.push(q.text);
			
 
				+  // Wrap LLM operations in a session for lifecycle management
			
 
				+  await withLLMSession(async (session) => {
			
 
				+    let ftsQueries: string[] = [query];
			
 
				+    let vectorQueries: string[] = [query];
			
 
				+
			
 
				+    if (hasStrongSignal) {
			
 
				+      // Strong BM25 signal - skip expensive LLM expansion
			
 
				+      process.stderr.write(`${c.dim}Strong BM25 signal (${topScore.toFixed(2)}) - skipping expansion${c.reset}\n`);
			
 
				+      // Still log the "expansion tree" in the same style as vsearch for consistency.
			
 
				+      {
			
 
				+        const lines: string[] = [];
			
 
				+        lines.push(`${c.dim}├─ ${query} · (lexical+vector)${c.reset}`);
			
 
				+        lines[lines.length - 1] = lines[lines.length - 1]!.replace('├─', '└─');
			
 
				+        for (const line of lines) process.stderr.write(line + '\n');
			
 
				+      }
			
 
				+    } else {
			
 
				+      // Weak signal - expand query for better recall
			
 
				+      const queryables = await expandQueryStructured(query, true, opts.context, session);
			
 
				+
			
 
				+      for (const q of queryables) {
			
 
				+        if (q.type === 'lex') {
			
 
				+          if (q.text && q.text !== query) ftsQueries.push(q.text);
			
 
				+        } else if (q.type === 'vec' || q.type === 'hyde') {
			
 
				+          if (q.text && q.text !== query) vectorQueries.push(q.text);
			
 
				+        }
			
 
				       }
			
 
				     }
			
 
				-  }
			
 
				-
			
 
				-  process.stderr.write(`${c.dim}Searching ${ftsQueries.length} lexical + ${vectorQueries.length} vector queries...${c.reset}\n`);
			
 
				 
			
 
				-  // Collect ranked result lists for RRF fusion
			
 
				-  const rankedLists: RankedResult[][] = [];
			
 
				+    process.stderr.write(`${c.dim}Searching ${ftsQueries.length} lexical + ${vectorQueries.length} vector queries...${c.reset}\n`);
			
 
				 
			
 
				-  // Map to store hash by filepath for final results
			
 
				-  const hashMap = new Map<string, string>();
			
 
				+    // Collect ranked result lists for RRF fusion
			
 
				+    const rankedLists: RankedResult[][] = [];
			
 
				 
			
 
				-  // Run all searches concurrently (FTS + Vector)
			
 
				-  const searchPromises: Promise<void>[] = [];
			
 
				+    // Map to store hash by filepath for final results
			
 
				+    const hashMap = new Map<string, string>();
			
 
				 
			
 
				-  // FTS searches
			
 
				-  for (const q of ftsQueries) {
			
 
				-    if (!q) continue;
			
 
				-    searchPromises.push((async () => {
			
 
				-      const ftsResults = searchFTS(db, q, 20, (collectionName || "") as any);
			
 
				-      if (ftsResults.length > 0) {
			
 
				-        for (const r of ftsResults) {
			
 
				-          // Mutex for hashMap is not strictly needed as it's just adding values
			
 
				-          hashMap.set(r.filepath, r.hash);
			
 
				-        }
			
 
				-        rankedLists.push(ftsResults.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score })));
			
 
				-      }
			
 
				-    })());
			
 
				-  }
			
 
				+    // Run all searches concurrently (FTS + Vector)
			
 
				+    const searchPromises: Promise<void>[] = [];
			
 
				 
			
 
				-  // Vector searches
			
 
				-  if (hasVectors) {
			
 
				-    for (const q of vectorQueries) {
			
 
				+    // FTS searches
			
 
				+    for (const q of ftsQueries) {
			
 
				       if (!q) continue;
			
 
				       searchPromises.push((async () => {
			
 
				-        const vecResults = await searchVec(db, q, embedModel, 20, (collectionName || "") as any);
			
 
				-        if (vecResults.length > 0) {
			
 
				-          for (const r of vecResults) hashMap.set(r.filepath, r.hash);
			
 
				-          rankedLists.push(vecResults.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score })));
			
 
				+        const ftsResults = searchFTS(db, q, 20, (collectionName || "") as any);
			
 
				+        if (ftsResults.length > 0) {
			
 
				+          for (const r of ftsResults) {
			
 
				+            // Mutex for hashMap is not strictly needed as it's just adding values
			
 
				+            hashMap.set(r.filepath, r.hash);
			
 
				+          }
			
 
				+          rankedLists.push(ftsResults.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score })));
			
 
				         }
			
 
				       })());
			
 
				     }
			
 
				-  }
			
 
				 
			
 
				-  await Promise.all(searchPromises);
			
 
				+    // Vector searches (session ensures contexts stay alive)
			
 
				+    if (hasVectors) {
			
 
				+      for (const q of vectorQueries) {
			
 
				+        if (!q) continue;
			
 
				+        searchPromises.push((async () => {
			
 
				+          const vecResults = await searchVec(db, q, embedModel, 20, (collectionName || "") as any, session);
			
 
				+          if (vecResults.length > 0) {
			
 
				+            for (const r of vecResults) hashMap.set(r.filepath, r.hash);
			
 
				+            rankedLists.push(vecResults.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score })));
			
 
				+          }
			
 
				+        })());
			
 
				+      }
			
 
				+    }
			
 
				 
			
 
				-  // Apply Reciprocal Rank Fusion to combine all ranked lists
			
 
				-  // Give 2x weight to original query results (first 2 lists: FTS + vector)
			
 
				-  const weights = rankedLists.map((_, i) => i < 2 ? 2.0 : 1.0);
			
 
				-  const fused = reciprocalRankFusion(rankedLists, weights);
			
 
				-  // Hard cap reranking for latency/cost. We rerank per-document (best chunk only).
			
 
				-  const RERANK_DOC_LIMIT = 40;
			
 
				-  const candidates = fused.slice(0, RERANK_DOC_LIMIT);
			
 
				+    await Promise.all(searchPromises);
			
 
				 
			
 
				-  if (candidates.length === 0) {
			
 
				-    console.log("No results found.");
			
 
				-    closeDb();
			
 
				-    return;
			
 
				-  }
			
 
				+    // Apply Reciprocal Rank Fusion to combine all ranked lists
			
 
				+    // Give 2x weight to original query results (first 2 lists: FTS + vector)
			
 
				+    const weights = rankedLists.map((_, i) => i < 2 ? 2.0 : 1.0);
			
 
				+    const fused = reciprocalRankFusion(rankedLists, weights);
			
 
				+    // Hard cap reranking for latency/cost. We rerank per-document (best chunk only).
			
 
				+    const RERANK_DOC_LIMIT = 40;
			
 
				+    const candidates = fused.slice(0, RERANK_DOC_LIMIT);
			
 
				 
			
 
				-  // Rerank multiple chunks per document, then aggregate scores
			
 
				-  // This improves ranking for long documents where keyword-matched chunk isn't always best
			
 
				-  // We only rerank ONE chunk per document (best chunk by a simple keyword heuristic),
			
 
				-  // so we never rerank more than RERANK_DOC_LIMIT items.
			
 
				-  const chunksToRerank: { file: string; text: string; chunkIdx: number }[] = [];
			
 
				-  const docChunkMap = new Map<string, { chunks: { text: string; pos: number }[]; bestIdx: number }>();
			
 
				-
			
 
				-  const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
			
 
				-  for (const c of candidates) {
			
 
				-    const chunks = chunkDocument(c.body);
			
 
				-    if (chunks.length === 0) continue;
			
 
				-
			
 
				-    // Choose best chunk by keyword matches; fall back to first chunk.
			
 
				-    let bestIdx = 0;
			
 
				-    let bestScore = -1;
			
 
				-    for (let i = 0; i < chunks.length; i++) {
			
 
				-      const chunkLower = chunks[i]!.text.toLowerCase();
			
 
				-      const score = queryTerms.reduce((acc, term) => acc + (chunkLower.includes(term) ? 1 : 0), 0);
			
 
				-      if (score > bestScore) {
			
 
				-        bestScore = score;
			
 
				-        bestIdx = i;
			
 
				-      }
			
 
				+    if (candidates.length === 0) {
			
 
				+      console.log("No results found.");
			
 
				+      closeDb();
			
 
				+      return;
			
 
				     }
			
 
				 
			
 
				-    chunksToRerank.push({ file: c.file, text: chunks[bestIdx]!.text, chunkIdx: bestIdx });
			
 
				-    docChunkMap.set(c.file, { chunks, bestIdx });
			
 
				-  }
			
 
				+    // Rerank multiple chunks per document, then aggregate scores
			
 
				+    // This improves ranking for long documents where keyword-matched chunk isn't always best
			
 
				+    // We only rerank ONE chunk per document (best chunk by a simple keyword heuristic),
			
 
				+    // so we never rerank more than RERANK_DOC_LIMIT items.
			
 
				+    const chunksToRerank: { file: string; text: string; chunkIdx: number }[] = [];
			
 
				+    const docChunkMap = new Map<string, { chunks: { text: string; pos: number }[]; bestIdx: number }>();
			
 
				+
			
 
				+    const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
			
 
				+    for (const cand of candidates) {
			
 
				+      const chunks = chunkDocument(cand.body);
			
 
				+      if (chunks.length === 0) continue;
			
 
				+
			
 
				+      // Choose best chunk by keyword matches; fall back to first chunk.
			
 
				+      let bestIdx = 0;
			
 
				+      let bestScore = -1;
			
 
				+      for (let i = 0; i < chunks.length; i++) {
			
 
				+        const chunkLower = chunks[i]!.text.toLowerCase();
			
 
				+        const score = queryTerms.reduce((acc, term) => acc + (chunkLower.includes(term) ? 1 : 0), 0);
			
 
				+        if (score > bestScore) {
			
 
				+          bestScore = score;
			
 
				+          bestIdx = i;
			
 
				+        }
			
 
				+      }
			
 
				 
			
 
				-  // Rerank selected chunks (with caching). One chunk per doc -> one rerank item per doc.
			
 
				-  const reranked = await rerank(
			
 
				-    query,
			
 
				-    chunksToRerank.map(c => ({ file: c.file, text: c.text })),
			
 
				-    rerankModel,
			
 
				-    db
			
 
				-  );
			
 
				+      chunksToRerank.push({ file: cand.file, text: chunks[bestIdx]!.text, chunkIdx: bestIdx });
			
 
				+      docChunkMap.set(cand.file, { chunks, bestIdx });
			
 
				+    }
			
 
				 
			
 
				-  const aggregatedScores = new Map<string, { score: number; bestChunkIdx: number }>();
			
 
				-  for (const r of reranked) {
			
 
				-    const chunkInfo = docChunkMap.get(r.file);
			
 
				-    aggregatedScores.set(r.file, { score: r.score, bestChunkIdx: chunkInfo?.bestIdx ?? 0 });
			
 
				-  }
			
 
				-
			
 
				-  // Blend RRF position score with aggregated reranker score using position-aware weights
			
 
				-  // Top retrieval results get more protection from reranker disagreement
			
 
				-  const candidateMap = new Map(candidates.map(c => [c.file, { displayPath: c.displayPath, title: c.title, body: c.body }]));
			
 
				-  const rrfRankMap = new Map(candidates.map((c, i) => [c.file, i + 1])); // 1-indexed rank
			
 
				-
			
 
				-  const finalResults = Array.from(aggregatedScores.entries()).map(([file, { score: rerankScore, bestChunkIdx }]) => {
			
 
				-    const rrfRank = rrfRankMap.get(file) || 30;
			
 
				-    // Position-aware blending: top retrieval results preserved more
			
 
				-    // Rank 1-3: 75% RRF, 25% reranker (trust retrieval for exact matches)
			
 
				-    // Rank 4-10: 60% RRF, 40% reranker
			
 
				-    // Rank 11+: 40% RRF, 60% reranker (trust reranker for lower-ranked)
			
 
				-    let rrfWeight: number;
			
 
				-    if (rrfRank <= 3) {
			
 
				-      rrfWeight = 0.75;
			
 
				-    } else if (rrfRank <= 10) {
			
 
				-      rrfWeight = 0.60;
			
 
				-    } else {
			
 
				-      rrfWeight = 0.40;
			
 
				-    }
			
 
				-    const rrfScore = 1 / rrfRank;  // Position-based: 1, 0.5, 0.33...
			
 
				-    const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * rerankScore;
			
 
				-    const candidate = candidateMap.get(file);
			
 
				-    // Use the best-scoring chunk's text for the body (better for snippets)
			
 
				-    const chunkInfo = docChunkMap.get(file);
			
 
				-    const chunkBody = chunkInfo ? (chunkInfo.chunks[bestChunkIdx]?.text || chunkInfo.chunks[0]!.text) : candidate?.body || "";
			
 
				-    const chunkPos = chunkInfo ? (chunkInfo.chunks[bestChunkIdx]?.pos || 0) : 0;
			
 
				-    return {
			
 
				-      file,
			
 
				-      displayPath: candidate?.displayPath || "",
			
 
				-      title: candidate?.title || "",
			
 
				-      body: chunkBody,
			
 
				-      chunkPos,
			
 
				-      score: blendedScore,
			
 
				-      context: getContextForFile(db, file),
			
 
				-      hash: hashMap.get(file) || "",
			
 
				-    };
			
 
				-  }).sort((a, b) => b.score - a.score);
			
 
				-
			
 
				-  // Deduplicate by file (safety net - shouldn't happen but prevents duplicate output)
			
 
				-  const seenFiles = new Set<string>();
			
 
				-  const dedupedResults = finalResults.filter(r => {
			
 
				-    if (seenFiles.has(r.file)) return false;
			
 
				-    seenFiles.add(r.file);
			
 
				-    return true;
			
 
				-  });
			
 
				+    // Rerank selected chunks (with caching). One chunk per doc -> one rerank item per doc.
			
 
				+    const reranked = await rerank(
			
 
				+      query,
			
 
				+      chunksToRerank.map(ch => ({ file: ch.file, text: ch.text })),
			
 
				+      rerankModel,
			
 
				+      db,
			
 
				+      session
			
 
				+    );
			
 
				 
			
 
				-  closeDb();
			
 
				-  outputResults(dedupedResults, query, opts);
			
 
				+    const aggregatedScores = new Map<string, { score: number; bestChunkIdx: number }>();
			
 
				+    for (const r of reranked) {
			
 
				+      const chunkInfo = docChunkMap.get(r.file);
			
 
				+      aggregatedScores.set(r.file, { score: r.score, bestChunkIdx: chunkInfo?.bestIdx ?? 0 });
			
 
				+    }
			
 
				+
			
 
				+    // Blend RRF position score with aggregated reranker score using position-aware weights
			
 
				+    // Top retrieval results get more protection from reranker disagreement
			
 
				+    const candidateMap = new Map(candidates.map(cand => [cand.file, { displayPath: cand.displayPath, title: cand.title, body: cand.body }]));
			
 
				+    const rrfRankMap = new Map(candidates.map((cand, i) => [cand.file, i + 1])); // 1-indexed rank
			
 
				+
			
 
				+    const finalResults = Array.from(aggregatedScores.entries()).map(([file, { score: rerankScore, bestChunkIdx }]) => {
			
 
				+      const rrfRank = rrfRankMap.get(file) || 30;
			
 
				+      // Position-aware blending: top retrieval results preserved more
			
 
				+      // Rank 1-3: 75% RRF, 25% reranker (trust retrieval for exact matches)
			
 
				+      // Rank 4-10: 60% RRF, 40% reranker
			
 
				+      // Rank 11+: 40% RRF, 60% reranker (trust reranker for lower-ranked)
			
 
				+      let rrfWeight: number;
			
 
				+      if (rrfRank <= 3) {
			
 
				+        rrfWeight = 0.75;
			
 
				+      } else if (rrfRank <= 10) {
			
 
				+        rrfWeight = 0.60;
			
 
				+      } else {
			
 
				+        rrfWeight = 0.40;
			
 
				+      }
			
 
				+      const rrfScore = 1 / rrfRank;  // Position-based: 1, 0.5, 0.33...
			
 
				+      const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * rerankScore;
			
 
				+      const candidate = candidateMap.get(file);
			
 
				+      // Use the best-scoring chunk's text for the body (better for snippets)
			
 
				+      const chunkInfo = docChunkMap.get(file);
			
 
				+      const chunkBody = chunkInfo ? (chunkInfo.chunks[bestChunkIdx]?.text || chunkInfo.chunks[0]!.text) : candidate?.body || "";
			
 
				+      const chunkPos = chunkInfo ? (chunkInfo.chunks[bestChunkIdx]?.pos || 0) : 0;
			
 
				+      return {
			
 
				+        file,
			
 
				+        displayPath: candidate?.displayPath || "",
			
 
				+        title: candidate?.title || "",
			
 
				+        body: chunkBody,
			
 
				+        chunkPos,
			
 
				+        score: blendedScore,
			
 
				+        context: getContextForFile(db, file),
			
 
				+        hash: hashMap.get(file) || "",
			
 
				+      };
			
 
				+    }).sort((a, b) => b.score - a.score);
			
 
				+
			
 
				+    // Deduplicate by file (safety net - shouldn't happen but prevents duplicate output)
			
 
				+    const seenFiles = new Set<string>();
			
 
				+    const dedupedResults = finalResults.filter(r => {
			
 
				+      if (seenFiles.has(r.file)) return false;
			
 
				+      seenFiles.add(r.file);
			
 
				+      return true;
			
 
				+    });
			
 
				+
			
 
				+    closeDb();
			
 
				+    outputResults(dedupedResults, query, opts);
			
 
				+  }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
			
 
				 }
			
 
				 
			
 
				 // Parse CLI arguments using util.parseArgs
			
--- a/src/store.test.ts
+++ b/src/store.test.ts
@@ -1850,7 +1850,7 @@ describe("LlamaCpp Integration", () => {
 
				     expect(allResults).toHaveLength(2);
			
 
				 
			
 
				     // Search with collection filter - should return only from collection1
			
 
				-    const filtered = await store.searchVec("content", "embeddinggemma", 10, collection1 as unknown as number);
			
 
				+    const filtered = await store.searchVec("content", "embeddinggemma", 10, collection1);
			
 
				     expect(filtered).toHaveLength(1);
			
 
				     expect(filtered[0]!.collectionName).toBe(collection1);
			
 
				 
			
--- a/src/store.ts
+++ b/src/store.ts
@@ -21,6 +21,7 @@ import {
 
				   formatQueryForEmbedding,
			
 
				   formatDocForEmbedding,
			
 
				   type RerankDocument,
			
 
				+  type ILLMSession,
			
 
				 } from "./llm";
			
 
				 import {
			
 
				   findContextForPath as collectionsFindContextForPath,
			
@@ -1900,11 +1901,11 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle
 
				 // Vector Search
			
 
				 // =============================================================================
			
 
				 
			
 
				-export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string): Promise<SearchResult[]> {
			
 
				+export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession): Promise<SearchResult[]> {
			
 
				   const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
			
 
				   if (!tableExists) return [];
			
 
				 
			
 
				-  const embedding = await getEmbedding(query, model, true);
			
 
				+  const embedding = await getEmbedding(query, model, true, session);
			
 
				   if (!embedding) return [];
			
 
				 
			
 
				   // IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables
			
@@ -1990,11 +1991,12 @@ export async function searchVec(db: Database, query: string, model: string, limi
 
				 // Embeddings
			
 
				 // =============================================================================
			
 
				 
			
 
				-async function getEmbedding(text: string, model: string, isQuery: boolean): Promise<number[] | null> {
			
 
				-  const llm = getDefaultLlamaCpp();
			
 
				+async function getEmbedding(text: string, model: string, isQuery: boolean, session?: ILLMSession): Promise<number[] | null> {
			
 
				   // Format text using the appropriate prompt template
			
 
				   const formattedText = isQuery ? formatQueryForEmbedding(text) : formatDocForEmbedding(text);
			
 
				-  const result = await llm.embed(formattedText, { model, isQuery });
			
 
				+  const result = session
			
 
				+    ? await session.embed(formattedText, { model, isQuery })
			
 
				+    : await getDefaultLlamaCpp().embed(formattedText, { model, isQuery });
			
 
				   return result?.embedding || null;
			
 
				 }