4 ay önce · 0dec1df047
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -335,6 +335,11 @@ export type LlamaCppConfig = {
 
				   generateModel?: string;
			
 
				   rerankModel?: string;
			
 
				   modelCacheDir?: string;
			
 
				+  /**
			
 
				+   * Context size used for query expansion generation contexts.
			
 
				+   * Default: 2048. Can also be set via QMD_EXPAND_CONTEXT_SIZE.
			
 
				+   */
			
 
				+  expandContextSize?: number;
			
 
				   /**
			
 
				    * Inactivity timeout in ms before unloading contexts (default: 5 minutes, 0 to disable).
			
 
				    *
			
@@ -357,6 +362,28 @@ export type LlamaCppConfig = {
 
				  */
			
 
				 // Fallback inactivity timeout, in milliseconds: five minutes, so models
				 // stay warm during typical search sessions.
				 const DEFAULT_INACTIVITY_TIMEOUT_MS = 1000 * 60 * 5;
			
 
				/** Context size used for query expansion when neither config nor env provides one. */
				const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;

				/**
				 * Resolves the context size for query expansion generation contexts.
				 *
				 * Precedence: explicit config value, then the QMD_EXPAND_CONTEXT_SIZE
				 * environment variable, then DEFAULT_EXPAND_CONTEXT_SIZE.
				 *
				 * @param configValue - Value supplied through the config object, if any.
				 * @returns A positive integer context size.
				 * @throws Error when `configValue` is provided but is not a positive integer.
				 *   An invalid environment value never throws: it writes a warning to
				 *   stderr and falls back to the default.
				 */
				function resolveExpandContextSize(configValue?: number): number {
				  if (configValue !== undefined) {
				    // A bad config value is a programming error, so fail loudly.
				    if (!Number.isInteger(configValue) || configValue <= 0) {
				      throw new Error(`Invalid expandContextSize: ${configValue}. Must be a positive integer.`);
				    }
				    return configValue;
				  }

				  const envValue = process.env.QMD_EXPAND_CONTEXT_SIZE?.trim();
				  if (!envValue) return DEFAULT_EXPAND_CONTEXT_SIZE;

				  // Require the whole value to be decimal digits. Number.parseInt stops at
				  // the first non-digit, so it would silently accept "4096abc" as 4096 and
				  // turn "1.5" or "1e3" into 1 instead of reporting the typo.
				  const parsed = /^\+?\d+$/.test(envValue) ? Number(envValue) : Number.NaN;
				  if (!Number.isSafeInteger(parsed) || parsed <= 0) {
				    process.stderr.write(
				      `QMD Warning: invalid QMD_EXPAND_CONTEXT_SIZE="${envValue}", using default ${DEFAULT_EXPAND_CONTEXT_SIZE}.\n`
				    );
				    return DEFAULT_EXPAND_CONTEXT_SIZE;
				  }
				  return parsed;
				}
			
 
				 
			
 
				 export class LlamaCpp implements LLM {
			
 
				   private llama: Llama | null = null;
			
@@ -370,6 +397,7 @@ export class LlamaCpp implements LLM {
 
				   private generateModelUri: string;
			
 
				   private rerankModelUri: string;
			
 
				   private modelCacheDir: string;
			
 
				+  private expandContextSize: number;
			
 
				 
			
 
				   // Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
			
 
				   private embedModelLoadPromise: Promise<LlamaModel> | null = null;
			
@@ -390,6 +418,7 @@ export class LlamaCpp implements LLM {
 
				     this.generateModelUri = config.generateModel || DEFAULT_GENERATE_MODEL;
			
 
				     this.rerankModelUri = config.rerankModel || DEFAULT_RERANK_MODEL;
			
 
				     this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
			
 
				+    this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
			
 
				     this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
			
 
				     this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
			
 
				   }
			
@@ -727,7 +756,6 @@ export class LlamaCpp implements LLM {
 
				   // Chunks are max 800 tokens, so 800 + 200 + query ≈ 1100 tokens typical.
			
 
				   // Use 2048 for safety margin. Still 17× less than auto (40960).
			
 
				   private static readonly RERANK_CONTEXT_SIZE = 2048;
			
 
				-
			
 
				   private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
			
 
				     if (this.rerankContexts.length === 0) {
			
 
				       const model = await this.ensureRerankModel();
			
@@ -960,8 +988,10 @@ export class LlamaCpp implements LLM {
 
				 
			
 
				     const prompt = `/no_think Expand this search query: ${query}`;
			
 
				 
			
 
				-    // Create fresh context for each call
			
 
				-    const genContext = await this.generateModel!.createContext();
			
 
				+    // Create a bounded context for expansion to prevent large default VRAM allocations.
			
 
				+    const genContext = await this.generateModel!.createContext({
			
 
				+      contextSize: this.expandContextSize,
			
 
				+    });
			
 
				     const sequence = genContext.getSequence();
			
 
				     const session = new LlamaChatSession({ contextSequence: sequence });
			
 
				 
			
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@@ -7,7 +7,7 @@
 
				  * rerank functions first to trigger model downloads.
			
 
				  */
			
 
				 
			
 
				-import { describe, test, expect, beforeAll, afterAll } from "vitest";
			
 
				+import { describe, test, expect, beforeAll, afterAll, vi } from "vitest";
			
 
				 import {
			
 
				   LlamaCpp,
			
 
				   getDefaultLlamaCpp,
			
@@ -55,6 +55,68 @@ describe("LlamaCpp.modelExists", () => {
 
				   });
			
 
				 });
			
 
				 
			
 
				// Unit tests for how LlamaCpp resolves its expansion context size from the
				// config object and the QMD_EXPAND_CONTEXT_SIZE environment variable.
				describe("LlamaCpp expand context size config", () => {
				  const defaultExpandContextSize = 2048;
				  const envKey = "QMD_EXPAND_CONTEXT_SIZE";

				  /**
				   * Runs `body` with QMD_EXPAND_CONTEXT_SIZE set to `value` (or removed
				   * when `value` is undefined), then restores the previous state even if
				   * `body` throws.
				   */
				  function withExpandContextEnv(value: string | undefined, body: () => void): void {
				    const prev = process.env[envKey];
				    if (value === undefined) delete process.env[envKey];
				    else process.env[envKey] = value;
				    try {
				      body();
				    } finally {
				      if (prev === undefined) delete process.env[envKey];
				      else process.env[envKey] = prev;
				    }
				  }

				  test("uses default expand context size when no config or env is set", () => {
				    withExpandContextEnv(undefined, () => {
				      // Bracket access reads the private field without casting to `any`.
				      const llm = new LlamaCpp({});
				      expect(llm["expandContextSize"]).toBe(defaultExpandContextSize);
				    });
				  });

				  test("uses QMD_EXPAND_CONTEXT_SIZE when set to a positive integer", () => {
				    withExpandContextEnv("3072", () => {
				      const llm = new LlamaCpp({});
				      expect(llm["expandContextSize"]).toBe(3072);
				    });
				  });

				  test("config value overrides QMD_EXPAND_CONTEXT_SIZE", () => {
				    withExpandContextEnv("4096", () => {
				      const llm = new LlamaCpp({ expandContextSize: 1536 });
				      expect(llm["expandContextSize"]).toBe(1536);
				    });
				  });

				  test("falls back to default and warns when QMD_EXPAND_CONTEXT_SIZE is invalid", () => {
				    withExpandContextEnv("bad", () => {
				      const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
				      try {
				        const llm = new LlamaCpp({});
				        expect(llm["expandContextSize"]).toBe(defaultExpandContextSize);
				        // Scan every stderr write so the assertion does not depend on
				        // the warning being the first thing written.
				        const warned = stderrSpy.mock.calls.some(([chunk]) => String(chunk).includes(envKey));
				        expect(warned).toBe(true);
				      } finally {
				        stderrSpy.mockRestore();
				      }
				    });
				  });

				  test("throws when config expandContextSize is invalid", () => {
				    expect(() => new LlamaCpp({ expandContextSize: 0 })).toThrow(
				      "Invalid expandContextSize: 0. Must be a positive integer."
				    );
				  });
				});
			
 
				+
			
 
				 // =============================================================================
			
 
				 // Integration Tests (require actual models)
			
 
				 // =============================================================================
			
@@ -600,4 +662,3 @@ describe.skipIf(!!process.env.CI)("LLM Session Management", () => {
 
				     });
			
 
				   });
			
 
				 });
			
 
				-