4 сар өмнө · ee08997f23
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -334,6 +334,11 @@ export type LlamaCppConfig = {
 
															   generateModel?: string;
														
 
															   rerankModel?: string;
														
 
															   modelCacheDir?: string;
														
 
															+  /**
														
 
															+   * Context size used for query expansion generation contexts.
														
 
															+   * Default: 2048. Can also be set via QMD_EXPAND_CONTEXT_SIZE.
														
 
															+   */
														
 
															+  expandContextSize?: number;
														
 
															   /**
														
 
															    * Inactivity timeout in ms before unloading contexts (default: 5 minutes, 0 to disable).
														
 
															    *
														
@@ -356,6 +361,28 @@ export type LlamaCppConfig = {
 
															  */
														
 
															 // Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
														
 
															 const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
														
 
															+/** Context size used for query expansion when neither config nor environment overrides it. */
															+const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
															+
															+/**
															+ * Resolves the context size for query expansion generation contexts.
															+ *
															+ * Precedence: explicit config value, then the QMD_EXPAND_CONTEXT_SIZE
															+ * environment variable, then DEFAULT_EXPAND_CONTEXT_SIZE.
															+ *
															+ * An invalid config value is treated as a programming error and throws.
															+ * An invalid environment value only produces a warning on stderr and falls
															+ * back to the default.
															+ *
															+ * @param configValue - Optional context size taken from the caller's config.
															+ * @returns A positive integer context size.
															+ * @throws Error when `configValue` is provided but is not a positive integer.
															+ */
															+function resolveExpandContextSize(configValue?: number): number {
															+  if (configValue !== undefined) {
															+    if (!Number.isInteger(configValue) || configValue <= 0) {
															+      throw new Error(`Invalid expandContextSize: ${configValue}. Must be a positive integer.`);
															+    }
															+    return configValue;
															+  }
															+
															+  const envValue = process.env.QMD_EXPAND_CONTEXT_SIZE?.trim();
															+  if (!envValue) return DEFAULT_EXPAND_CONTEXT_SIZE;
															+
															+  // Number.parseInt stops at the first non-digit, so "4096abc", "12.5" and "1e3"
															+  // would be accepted as 4096, 12 and 1. Require a plain decimal integer instead
															+  // (an optional leading "+" is still allowed, as parseInt allowed it).
															+  const parsed = /^\+?\d+$/.test(envValue) ? Number(envValue) : Number.NaN;
															+  // isSafeInteger also rejects digit strings too large to represent exactly.
															+  if (!Number.isSafeInteger(parsed) || parsed <= 0) {
															+    process.stderr.write(
															+      `QMD Warning: invalid QMD_EXPAND_CONTEXT_SIZE="${envValue}", using default ${DEFAULT_EXPAND_CONTEXT_SIZE}.\n`
															+    );
															+    return DEFAULT_EXPAND_CONTEXT_SIZE;
															+  }
															+  return parsed;
															+}
														
 
															 export class LlamaCpp implements LLM {
														
 
															   private llama: Llama | null = null;
														
@@ -369,6 +396,7 @@ export class LlamaCpp implements LLM {
 
															   private generateModelUri: string;
														
 
															   private rerankModelUri: string;
														
 
															   private modelCacheDir: string;
														
 
															+  private expandContextSize: number;
														
 
															   // Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
														
 
															   private embedModelLoadPromise: Promise<LlamaModel> | null = null;
														
@@ -389,6 +417,7 @@ export class LlamaCpp implements LLM {
 
															     this.generateModelUri = config.generateModel || DEFAULT_GENERATE_MODEL;
														
 
															     this.rerankModelUri = config.rerankModel || DEFAULT_RERANK_MODEL;
														
 
															     this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
														
 
															+    this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
														
 
															     this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
														
 
															     this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
														
 
															   }
														
@@ -710,7 +739,6 @@ export class LlamaCpp implements LLM {
 
															   // Chunks are max 800 tokens, so 800 + 200 + query ≈ 1100 tokens typical.
														
 
															   // Use 2048 for safety margin. Still 17× less than auto (40960).
														
 
															   private static readonly RERANK_CONTEXT_SIZE = 2048;
														
 
															-
														
 
															   private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
														
 
															     if (this.rerankContexts.length === 0) {
														
 
															       const model = await this.ensureRerankModel();
														
@@ -943,8 +971,10 @@ export class LlamaCpp implements LLM {
 
															     const prompt = `/no_think Expand this search query: ${query}`;
														
 
															-    // Create fresh context for each call
														
 
															-    const genContext = await this.generateModel!.createContext();
														
 
															+    // Create a bounded context for expansion to prevent large default VRAM allocations.
														
 
															+    const genContext = await this.generateModel!.createContext({
														
 
															+      contextSize: this.expandContextSize,
														
 
															+    });
														
 
															     const sequence = genContext.getSequence();
														
 
															     const session = new LlamaChatSession({ contextSequence: sequence });
														
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@@ -7,7 +7,7 @@
 
															  * rerank functions first to trigger model downloads.
														
 
															  */
														
 
															-import { describe, test, expect, beforeAll, afterAll } from "vitest";
														
 
															+import { describe, test, expect, beforeAll, afterAll, vi } from "vitest";
														
 
															 import {
														
 
															   LlamaCpp,
														
 
															   getDefaultLlamaCpp,
														
@@ -55,6 +55,68 @@ describe("LlamaCpp.modelExists", () => {
 
															   });
														
 
															 });
														
 
															+describe("LlamaCpp expand context size config", () => {
															+  const ENV_KEY = "QMD_EXPAND_CONTEXT_SIZE";
															+  const defaultExpandContextSize = 2048;
															+
															+  /**
															+   * Runs `body` with QMD_EXPAND_CONTEXT_SIZE set to `value` (unset when
															+   * `value` is undefined) and always restores the previous value afterwards,
															+   * so a test cannot leak environment state into the next one.
															+   */
															+  function withExpandContextEnv(value: string | undefined, body: () => void): void {
															+    const prev = process.env[ENV_KEY];
															+    if (value === undefined) delete process.env[ENV_KEY];
															+    else process.env[ENV_KEY] = value;
															+    try {
															+      body();
															+    } finally {
															+      if (prev === undefined) delete process.env[ENV_KEY];
															+      else process.env[ENV_KEY] = prev;
															+    }
															+  }
															+
															+  /** Constructs a LlamaCpp with `config` and reads its private `expandContextSize` field. */
															+  function resolvedSize(config: ConstructorParameters<typeof LlamaCpp>[0]): number {
															+    // TypeScript permits element access to private members, which avoids an `any` cast.
															+    return new LlamaCpp(config)["expandContextSize"];
															+  }
															+
															+  test("uses default expand context size when no config or env is set", () => {
															+    withExpandContextEnv(undefined, () => {
															+      expect(resolvedSize({})).toBe(defaultExpandContextSize);
															+    });
															+  });
															+
															+  test("uses QMD_EXPAND_CONTEXT_SIZE when set to a positive integer", () => {
															+    withExpandContextEnv("3072", () => {
															+      expect(resolvedSize({})).toBe(3072);
															+    });
															+  });
															+
															+  test("config value overrides QMD_EXPAND_CONTEXT_SIZE", () => {
															+    withExpandContextEnv("4096", () => {
															+      expect(resolvedSize({ expandContextSize: 1536 })).toBe(1536);
															+    });
															+  });
															+
															+  test("falls back to default and warns when QMD_EXPAND_CONTEXT_SIZE is invalid", () => {
															+    withExpandContextEnv("bad", () => {
															+      const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
															+      try {
															+        expect(resolvedSize({})).toBe(defaultExpandContextSize);
															+        // Accept the warning on any stderr write so the test does not depend
															+        // on it being the very first one.
															+        const warned = stderrSpy.mock.calls.some(([chunk]) => String(chunk).includes(ENV_KEY));
															+        expect(warned).toBe(true);
															+      } finally {
															+        stderrSpy.mockRestore();
															+      }
															+    });
															+  });
															+
															+  test("throws when config expandContextSize is invalid", () => {
															+    expect(() => new LlamaCpp({ expandContextSize: 0 })).toThrow(
															+      "Invalid expandContextSize: 0. Must be a positive integer."
															+    );
															+  });
															+});
														
 
															+
														
 
															 // =============================================================================
														
 
															 // Integration Tests (require actual models)
														
 
															 // =============================================================================
														
@@ -600,4 +662,3 @@ describe.skipIf(!!process.env.CI)("LLM Session Management", () => {
 
															     });
														
 
															   });
														
 
															 });
														
 
															-