Kaynağa Gözat

fix(llm): make expansion context size configurable

Brian Le 2 ay önce
ebeveyn
işleme
0dec1df047
2 değiştirilmiş dosya ile 96 ekleme ve 5 silme
  1. 33 3
      src/llm.ts
  2. 63 2
      test/llm.test.ts

+ 33 - 3
src/llm.ts

@@ -335,6 +335,11 @@ export type LlamaCppConfig = {
   generateModel?: string;
   rerankModel?: string;
   modelCacheDir?: string;
+  /**
+   * Context size used for query expansion generation contexts.
+   * Default: 2048. Can also be set via QMD_EXPAND_CONTEXT_SIZE.
+   */
+  expandContextSize?: number;
   /**
    * Inactivity timeout in ms before unloading contexts (default: 2 minutes, 0 to disable).
    *
@@ -357,6 +362,28 @@ export type LlamaCppConfig = {
  */
 // Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
 const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
+const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
+
+/**
+ * Resolve the context size used for query expansion generation contexts.
+ *
+ * Precedence: an explicit config value, then the QMD_EXPAND_CONTEXT_SIZE
+ * environment variable, then DEFAULT_EXPAND_CONTEXT_SIZE.
+ *
+ * An invalid config value throws; an invalid environment value only writes a
+ * warning to stderr and falls back to the default.
+ *
+ * @param configValue - Context size passed through the constructor config, if any.
+ * @returns A positive integer context size.
+ * @throws Error when `configValue` is provided but is not a positive integer.
+ */
+function resolveExpandContextSize(configValue?: number): number {
+  if (configValue !== undefined) {
+    if (!Number.isInteger(configValue) || configValue <= 0) {
+      throw new Error(`Invalid expandContextSize: ${configValue}. Must be a positive integer.`);
+    }
+    return configValue;
+  }
+
+  const envValue = process.env.QMD_EXPAND_CONTEXT_SIZE?.trim();
+  if (!envValue) return DEFAULT_EXPAND_CONTEXT_SIZE;
+
+  // Number.parseInt stops at the first non-digit, so on its own it would accept
+  // "4096abc" as 4096 and "1.5" as 1. Require the whole string to be decimal
+  // digits (an optional leading "+" is still allowed) before parsing.
+  const parsed = /^\+?\d+$/.test(envValue) ? Number.parseInt(envValue, 10) : Number.NaN;
+  // isSafeInteger also rejects digit strings too large to be represented exactly.
+  if (!Number.isSafeInteger(parsed) || parsed <= 0) {
+    process.stderr.write(
+      `QMD Warning: invalid QMD_EXPAND_CONTEXT_SIZE="${envValue}", using default ${DEFAULT_EXPAND_CONTEXT_SIZE}.\n`
+    );
+    return DEFAULT_EXPAND_CONTEXT_SIZE;
+  }
+  return parsed;
+}
 
 export class LlamaCpp implements LLM {
   private llama: Llama | null = null;
@@ -370,6 +397,7 @@ export class LlamaCpp implements LLM {
   private generateModelUri: string;
   private rerankModelUri: string;
   private modelCacheDir: string;
+  private expandContextSize: number;
 
   // Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
   private embedModelLoadPromise: Promise<LlamaModel> | null = null;
@@ -390,6 +418,7 @@ export class LlamaCpp implements LLM {
     this.generateModelUri = config.generateModel || DEFAULT_GENERATE_MODEL;
     this.rerankModelUri = config.rerankModel || DEFAULT_RERANK_MODEL;
     this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
+    this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
     this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
     this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
   }
@@ -727,7 +756,6 @@ export class LlamaCpp implements LLM {
   // Chunks are max 800 tokens, so 800 + 200 + query ≈ 1100 tokens typical.
   // Use 2048 for safety margin. Still 17× less than auto (40960).
   private static readonly RERANK_CONTEXT_SIZE = 2048;
-
   private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
     if (this.rerankContexts.length === 0) {
       const model = await this.ensureRerankModel();
@@ -960,8 +988,10 @@ export class LlamaCpp implements LLM {
 
     const prompt = `/no_think Expand this search query: ${query}`;
 
-    // Create fresh context for each call
-    const genContext = await this.generateModel!.createContext();
+    // Create a bounded context for expansion to prevent large default VRAM allocations.
+    const genContext = await this.generateModel!.createContext({
+      contextSize: this.expandContextSize,
+    });
     const sequence = genContext.getSequence();
     const session = new LlamaChatSession({ contextSequence: sequence });
 

+ 63 - 2
test/llm.test.ts

@@ -7,7 +7,7 @@
  * rerank functions first to trigger model downloads.
  */
 
-import { describe, test, expect, beforeAll, afterAll } from "vitest";
+import { describe, test, expect, beforeAll, afterAll, vi } from "vitest";
 import {
   LlamaCpp,
   getDefaultLlamaCpp,
@@ -55,6 +55,68 @@ describe("LlamaCpp.modelExists", () => {
   });
 });
 
+describe("LlamaCpp expand context size config", () => {
+  const ENV_KEY = "QMD_EXPAND_CONTEXT_SIZE";
+  const defaultExpandContextSize = 2048;
+
+  /**
+   * Run `body` with QMD_EXPAND_CONTEXT_SIZE set to `value` (or unset when
+   * `value` is undefined), then put the variable back the way it was, even if
+   * `body` throws.
+   */
+  function withExpandContextEnv(value: string | undefined, body: () => void): void {
+    const previous = process.env[ENV_KEY];
+    if (value === undefined) delete process.env[ENV_KEY];
+    else process.env[ENV_KEY] = value;
+    try {
+      body();
+    } finally {
+      if (previous === undefined) delete process.env[ENV_KEY];
+      else process.env[ENV_KEY] = previous;
+    }
+  }
+
+  /** Construct a LlamaCpp and read the private `expandContextSize` it resolved. */
+  function resolvedSize(config: { expandContextSize?: number }): number {
+    // `expandContextSize` is private, so the cast is needed to inspect it.
+    return (new LlamaCpp(config) as any).expandContextSize;
+  }
+
+  test("uses default expand context size when no config or env is set", () => {
+    withExpandContextEnv(undefined, () => {
+      expect(resolvedSize({})).toBe(defaultExpandContextSize);
+    });
+  });
+
+  test("uses QMD_EXPAND_CONTEXT_SIZE when set to a positive integer", () => {
+    withExpandContextEnv("3072", () => {
+      expect(resolvedSize({})).toBe(3072);
+    });
+  });
+
+  test("config value overrides QMD_EXPAND_CONTEXT_SIZE", () => {
+    withExpandContextEnv("4096", () => {
+      expect(resolvedSize({ expandContextSize: 1536 })).toBe(1536);
+    });
+  });
+
+  test("falls back to default and warns when QMD_EXPAND_CONTEXT_SIZE is invalid", () => {
+    withExpandContextEnv("bad", () => {
+      // Silence the warning and capture it for the assertions below.
+      const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
+      try {
+        expect(resolvedSize({})).toBe(defaultExpandContextSize);
+        expect(stderrSpy).toHaveBeenCalled();
+        expect(String(stderrSpy.mock.calls[0]?.[0] ?? "")).toContain("QMD_EXPAND_CONTEXT_SIZE");
+      } finally {
+        stderrSpy.mockRestore();
+      }
+    });
+  });
+
+  test("throws when config expandContextSize is invalid", () => {
+    expect(() => new LlamaCpp({ expandContextSize: 0 })).toThrow(
+      "Invalid expandContextSize: 0. Must be a positive integer."
+    );
+  });
+});
+
 // =============================================================================
 // Integration Tests (require actual models)
 // =============================================================================
@@ -600,4 +662,3 @@ describe.skipIf(!!process.env.CI)("LLM Session Management", () => {
     });
   });
 });
-