llm.ts 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539
  1. /**
  2. * llm.ts - LLM abstraction layer for QMD
  3. *
  4. * Provides a clean interface for LLM operations with an Ollama implementation.
  5. * All raw fetch calls to LLM APIs should go through this module.
  6. */
  7. // =============================================================================
  8. // Types
  9. // =============================================================================
  /**
   * A single generated token paired with its log probability.
   */
  export type TokenLogProb = {
    /** Token text as reported by the backend. */
    token: string;
    /** Log probability of the token; the reranker turns it into a probability with `Math.exp`. */
    logprob: number;
  };
  /**
   * Result of a successful embedding request.
   */
  export type EmbeddingResult = {
    /** Embedding vector for the (formatted) input text. */
    embedding: number[];
    /** Name of the model the request was sent to. */
    model: string;
  };
  /**
   * Result of a text generation request, with optional per-token logprobs.
   */
  export type GenerateResult = {
    /** Generated text; the Ollama implementation uses an empty string when the backend sent none. */
    text: string;
    /** Name of the model the request was sent to. */
    model: string;
    /** Per-token log probabilities; only set when the backend response included them. */
    logprobs?: TokenLogProb[];
    /** Completion flag from the backend; the Ollama implementation defaults it to `true` when absent. */
    done: boolean;
  };
  /**
   * Rerank verdict for a single document.
   */
  export type RerankDocumentResult = {
    /** The `file` value of the document that was judged. */
    file: string;
    /** `true` only when the model's answer starts with "yes". */
    relevant: boolean;
    /** `Math.exp(logprob)` of the first answer token (0 when generation failed). */
    confidence: number;
    /**
     * Sortable relevance score: `0.5 + 0.5 * confidence` for "yes",
     * `0.5 * (1 - confidence)` for "no", 0.3 for any other answer,
     * and 0 when generation failed.
     */
    score: number;
    /** First token reported in the logprobs, or the normalized answer text when logprobs are absent. */
    rawToken: string;
    /** Log probability of the first answer token; 0 when unavailable. */
    logprob: number;
  };
  /**
   * Result of reranking a batch of documents.
   */
  export type RerankResult = {
    /** One entry per input document; `Ollama.rerank` orders them by descending `score`. */
    results: RerankDocumentResult[];
    /** Name of the model used for reranking. */
    model: string;
  };
  /**
   * Availability information for a model.
   */
  export type ModelInfo = {
    /** Model name that was queried. */
    name: string;
    /** Whether the backend reported the model as available. */
    exists: boolean;
    /** `size` field of the backend response, when present. */
    size?: number;
    /** `modified_at` field of the backend response, when present. */
    modifiedAt?: string;
  };
  /**
   * Options for an embedding request.
   */
  export type EmbedOptions = {
    /** Model to use; the Ollama implementation falls back to its default embed model when empty. */
    model: string;
    /** When truthy the text is formatted as a search query, otherwise as a document. */
    isQuery?: boolean;
    /** Document title included in the document format; ignored for queries. */
    title?: string;
  };
  /**
   * Options for a text generation request.
   */
  export type GenerateOptions = {
    /** Model to use; the Ollama implementation falls back to its default generate model when empty. */
    model: string;
    /** Maximum number of tokens to generate (sent as `num_predict`; Ollama implementation default 150). */
    maxTokens?: number;
    /** Sampling temperature (Ollama implementation default 0). */
    temperature?: number;
    /** Request per-token log probabilities from the backend. */
    logprobs?: boolean;
    /** Sets the `raw` flag on the request (used by the reranker, which supplies a fully formatted prompt). */
    raw?: boolean;
    /** Stop sequences forwarded to the backend. */
    stop?: string[];
  };
  /**
   * Options for a rerank request.
   */
  export type RerankOptions = {
    /** Model to use; the Ollama implementation falls back to its default rerank model when empty. */
    model: string;
    /** Number of documents judged concurrently per batch (Ollama implementation default 5). */
    batchSize?: number;
  };
  /**
   * A document submitted for reranking.
   */
  export type RerankDocument = {
    /** Identifier (path) of the document; echoed back in the result. */
    file: string;
    /** Document body; the Ollama reranker only puts the first 4000 characters in its prompt. */
    text: string;
    /** Optional title; the Ollama reranker derives one from the file name (minus `.md`) when missing. */
    title?: string;
  };
  94. // =============================================================================
  95. // LLM Interface
  96. // =============================================================================
  /**
   * Backend-agnostic LLM contract - implement this for each backend.
   *
   * The low-level methods (`embed`, `generate`, `modelExists`, `pullModel`)
   * map to backend calls; the high-level ones (`expandQuery`, `rerank`,
   * `rerankerLogprobsCheck`) are search helpers.
   */
  export interface LLM {
    /**
     * Get an embedding for a piece of text.
     *
     * @param text - Raw query or document text.
     * @param options - Model and query/document formatting options.
     * @returns The embedding, or `null` when none could be produced.
     */
    embed(text: string, options: EmbedOptions): Promise<EmbeddingResult | null>;
    /**
     * Generate a text completion for a prompt.
     *
     * @param prompt - Prompt text sent to the model.
     * @param options - Model and sampling options.
     * @returns The completion, or `null` when generation failed.
     */
    generate(prompt: string, options: GenerateOptions): Promise<GenerateResult | null>;
    /**
     * Check whether a model exists on the backend.
     *
     * @param model - Model name to look up.
     * @returns Model info whose `exists` flag carries the answer.
     */
    modelExists(model: string): Promise<ModelInfo>;
    /**
     * Pull a model (download if not available).
     *
     * @param model - Model name to pull.
     * @param onProgress - Optional progress callback.
     * @returns `true` when the pull succeeded.
     */
    pullModel(model: string, onProgress?: (progress: number) => void): Promise<boolean>;
    // ==========================================================================
    // High-level abstractions
    // ==========================================================================
    /**
     * Expand a search query into multiple variations.
     *
     * @param query - Original search query.
     * @param model - Generation model to use.
     * @param numVariations - Number of alternative queries to request.
     * @returns A list of queries.
     */
    expandQuery(query: string, model: string, numVariations?: number): Promise<string[]>;
    /**
     * Rerank documents by relevance to a query.
     * Returns the documents with relevance scores and boolean judgments.
     *
     * @param query - Search query the documents are judged against.
     * @param documents - Candidate documents.
     * @param options - Model and batching options.
     */
    rerank(query: string, documents: RerankDocument[], options: RerankOptions): Promise<RerankResult>;
    /**
     * Quick relevance check - returns per-document judgments with logprobs,
     * without the `RerankResult` wrapper that `rerank` adds.
     *
     * @param query - Search query the documents are judged against.
     * @param documents - Candidate documents.
     * @param options - Model and batching options.
     */
    rerankerLogprobsCheck(query: string, documents: RerankDocument[], options: RerankOptions): Promise<RerankDocumentResult[]>;
  }
  135. // =============================================================================
  136. // Ollama Implementation
  137. // =============================================================================
  /**
   * Constructor options for the Ollama client. Every field is optional.
   */
  export type OllamaConfig = {
    /** Server base URL; falls back to the `OLLAMA_URL` environment variable, then to the built-in default. */
    baseUrl?: string;
    /** Model used by `embed` when the call does not name one. */
    defaultEmbedModel?: string;
    /** Model used by `generate` and `expandQuery` when the call does not name one. */
    defaultGenerateModel?: string;
    /** Model used by the rerank methods when the call does not name one. */
    defaultRerankModel?: string;
  };
  // Built-in fallbacks used by the Ollama constructor when neither the config
  // (nor, for the URL, the OLLAMA_URL environment variable) supplies a value.
  const DEFAULT_OLLAMA_URL = "http://localhost:11434";
  const DEFAULT_EMBED_MODEL = "embeddinggemma";
  const DEFAULT_GENERATE_MODEL = "qwen3:0.6b";
  const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
  /**
   * Wrap a search query in the prompt prefix used for query embeddings.
   *
   * @param query - Raw search query.
   * @returns The query prefixed with the "search result" task header.
   */
  export function formatQueryForEmbedding(query: string): string {
    const prefix = "task: search result | query: ";
    return prefix + query;
  }
  /**
   * Wrap document text in the prompt layout used for document embeddings.
   *
   * @param text - Document body.
   * @param title - Optional title; a missing or empty title is rendered as "none".
   * @returns The title/text pair in the embedding prompt layout.
   */
  export function formatDocForEmbedding(text: string, title?: string): string {
    const heading = title ? title : "none";
    return "title: " + heading + " | text: " + text;
  }
  /**
   * Ollama-backed implementation of the LLM interface.
   *
   * All HTTP traffic goes through one private helper that POSTs JSON and
   * reports any failure (network error, non-2xx status, unparsable body) as
   * `undefined`. The public methods translate that into their documented
   * "soft failure" values (`null`, `false`, `exists: false`) and never throw
   * for transport problems.
   */
  export class Ollama implements LLM {
    private readonly baseUrl: string;
    private readonly defaultEmbedModel: string;
    private readonly defaultGenerateModel: string;
    private readonly defaultRerankModel: string;

    /**
     * @param config - Optional overrides. The base URL falls back to the
     *   `OLLAMA_URL` environment variable and then to the built-in default;
     *   each model name falls back to its built-in default.
     */
    constructor(config: OllamaConfig = {}) {
      this.baseUrl = config.baseUrl || process.env.OLLAMA_URL || DEFAULT_OLLAMA_URL;
      this.defaultEmbedModel = config.defaultEmbedModel || DEFAULT_EMBED_MODEL;
      this.defaultGenerateModel = config.defaultGenerateModel || DEFAULT_GENERATE_MODEL;
      this.defaultRerankModel = config.defaultRerankModel || DEFAULT_RERANK_MODEL;
    }

    /**
     * Get the base URL for this Ollama instance, exactly as configured.
     */
    getBaseUrl(): string {
      return this.baseUrl;
    }

    // ==========================================================================
    // Internal helpers
    // ==========================================================================

    /**
     * Build the full URL for an API path. Trailing slashes on the configured
     * base URL are dropped so "http://host/" does not produce "//api/...".
     */
    private endpoint(path: string): string {
      return `${this.baseUrl.replace(/\/+$/, "")}${path}`;
    }

    /**
     * POST a JSON body and return the parsed JSON response.
     *
     * @returns The parsed body, or `undefined` on any failure. `undefined` is
     *   used as the failure marker because JSON can never parse to it.
     */
    private async postJson(path: string, body: unknown): Promise<unknown> {
      try {
        const response = await fetch(this.endpoint(path), {
          method: "POST",
          headers: { "Content-Type": "application/json" },
          body: JSON.stringify(body),
        });
        if (!response.ok) {
          return undefined;
        }
        return await response.json();
      } catch {
        // Deliberate best effort: callers map this to their soft-failure value.
        return undefined;
      }
    }

    /**
     * Normalize the `logprobs` field of a generate response.
     *
     * Two layouts are accepted: parallel arrays
     * (`{ tokens: [...], token_logprobs: [...] }`) and an array of
     * `{ token, logprob }` entries. Anything else yields `undefined`.
     */
    private parseLogprobs(raw: unknown): TokenLogProb[] | undefined {
      if (Array.isArray(raw)) {
        const entries: TokenLogProb[] = [];
        for (const item of raw as unknown[]) {
          const token = (item as { token?: unknown } | null)?.token;
          const logprob = (item as { logprob?: unknown } | null)?.logprob;
          if (typeof token !== "string" || typeof logprob !== "number") {
            // A malformed entry would misalign tokens and values; give up.
            return undefined;
          }
          entries.push({ token, logprob });
        }
        return entries;
      }
      if (typeof raw === "object" && raw !== null) {
        const { tokens, token_logprobs: values } = raw as {
          tokens?: unknown;
          token_logprobs?: unknown;
        };
        if (Array.isArray(tokens) && Array.isArray(values)) {
          // Pair only as many entries as both arrays provide.
          const count = Math.min(tokens.length, values.length);
          return Array.from({ length: count }, (_, i) => ({
            token: String(tokens[i]),
            logprob: Number(values[i]),
          }));
        }
      }
      return undefined;
    }

    // ==========================================================================
    // Core API methods
    // ==========================================================================

    /**
     * Embed a query or document via `/api/embed`.
     *
     * @param text - Raw text; it is wrapped in the query or document prompt
     *   layout depending on `options.isQuery`.
     * @param options - Model (falls back to the default embed model) and
     *   formatting options.
     * @returns The first embedding of the response, or `null` on failure or
     *   when the response carries no embedding.
     */
    async embed(text: string, options: EmbedOptions): Promise<EmbeddingResult | null> {
      const model = options.model || this.defaultEmbedModel;
      const input = options.isQuery
        ? formatQueryForEmbedding(text)
        : formatDocForEmbedding(text, options.title);
      const data = (await this.postJson("/api/embed", { model, input })) as
        | { embeddings?: number[][] }
        | null
        | undefined;
      const embedding = data?.embeddings?.[0];
      if (!embedding) {
        return null;
      }
      return { embedding, model };
    }

    /**
     * Run a non-streaming completion via `/api/generate`.
     *
     * @param prompt - Prompt text.
     * @param options - Model (falls back to the default generate model),
     *   `maxTokens` (default 150), `temperature` (default 0) and the optional
     *   `logprobs`, `raw` and `stop` settings.
     * @returns The completion, or `null` on failure.
     */
    async generate(prompt: string, options: GenerateOptions): Promise<GenerateResult | null> {
      const model = options.model || this.defaultGenerateModel;
      const modelOptions: Record<string, unknown> = {
        num_predict: options.maxTokens ?? 150,
        temperature: options.temperature ?? 0,
      };
      if (options.stop) {
        modelOptions.stop = options.stop;
      }
      const requestBody: Record<string, unknown> = {
        model,
        prompt,
        stream: false,
        options: modelOptions,
      };
      if (options.logprobs) {
        requestBody.logprobs = true;
      }
      if (options.raw) {
        requestBody.raw = true;
      }

      const data = (await this.postJson("/api/generate", requestBody)) as
        | { response?: string; done?: boolean; logprobs?: unknown }
        | null
        | undefined;
      if (data == null) {
        return null;
      }
      return {
        text: data.response || "",
        model,
        logprobs: this.parseLogprobs(data.logprobs),
        done: data.done ?? true,
      };
    }

    /**
     * Look a model up via `/api/show`.
     *
     * @param model - Model name.
     * @returns Model info; `exists` is `false` on any failure.
     */
    async modelExists(model: string): Promise<ModelInfo> {
      const data = (await this.postJson("/api/show", { name: model })) as
        | { size?: number; modified_at?: string }
        | null
        | undefined;
      if (data == null) {
        return { name: model, exists: false };
      }
      return {
        name: model,
        exists: true,
        size: data.size,
        modifiedAt: data.modified_at,
      };
    }

    /**
     * Pull a model via a non-streaming `/api/pull` request.
     *
     * @param model - Model name.
     * @param onProgress - Called once with `100` after the pull completed
     *   (no intermediate progress is available in non-streaming mode).
     * @returns `true` on success, `false` on any failure, including an
     *   exception thrown by `onProgress`.
     */
    async pullModel(model: string, onProgress?: (progress: number) => void): Promise<boolean> {
      const data = await this.postJson("/api/pull", { name: model, stream: false });
      if (data === undefined) {
        return false;
      }
      try {
        onProgress?.(100);
      } catch {
        return false;
      }
      return true;
    }

    // ==========================================================================
    // High-level abstractions
    // ==========================================================================

    /**
     * Expand a search query into alternative phrasings.
     *
     * @param query - Original query; always the first element of the result.
     * @param model - Generation model (falls back to the default generate model).
     * @param numVariations - Number of alternatives to request (default 2);
     *   floored, and values below 1 produce no alternatives.
     * @returns `[query, ...variations]` with at most `numVariations`
     *   variations and no case-insensitive duplicates; just `[query]` when
     *   generation failed or nothing usable came back.
     */
    async expandQuery(query: string, model?: string, numVariations: number = 2): Promise<string[]> {
      const limit = Math.max(0, Math.floor(numVariations)) || 0;
      if (limit === 0) {
        return [query];
      }
      const useModel = model || this.defaultGenerateModel;
      const prompt = `You are a search query expander. Given a search query, generate ${limit} alternative queries that would help find relevant documents.
Rules:
- Use synonyms and related terminology (e.g., "craft" → "craftsmanship", "quality", "excellence")
- Rephrase to capture different angles (e.g., "engineering culture" → "technical excellence", "developer practices")
- Keep proper nouns and named concepts exactly as written (e.g., "Build a Business", "Stripe", "Shopify")
- Each variation should be 3-8 words, natural search terms
- Do NOT just append words like "search" or "find" or "documents"
Query: "${query}"
Output exactly ${limit} variations, one per line, no numbering or bullets:`;
      const result = await this.generate(prompt, {
        model: useModel,
        maxTokens: 150,
        temperature: 0,
      });
      if (!result) {
        return [query];
      }

      // Drop reasoning: closed <think> blocks first, then a block that was cut
      // off by the token limit and never closed (everything after it is
      // reasoning text, not variations).
      const cleanText = result.text
        .replace(/<think>[\s\S]*?<\/think>/g, "")
        .replace(/<think>[\s\S]*$/, "")
        .trim();
      const candidates = cleanText
        .split("\n")
        .map((l) => l.trim())
        .filter((l) => l.length > 2 && l.length < 100 && !l.startsWith("<"));

      // Skip repeats of the query or of an earlier variation.
      const seen = new Set<string>([query.trim().toLowerCase()]);
      const variations: string[] = [];
      for (const line of candidates) {
        const key = line.toLowerCase();
        if (seen.has(key)) {
          continue;
        }
        seen.add(key);
        variations.push(line);
        if (variations.length >= limit) {
          break;
        }
      }
      return [query, ...variations];
    }

    /**
     * Judge every document and return the verdicts ordered by descending score.
     *
     * @param query - Search query.
     * @param documents - Candidate documents.
     * @param options - Model (falls back to the default rerank model) and batch size.
     */
    async rerank(
      query: string,
      documents: RerankDocument[],
      options: RerankOptions
    ): Promise<RerankResult> {
      const results = await this.rerankerLogprobsCheck(query, documents, options);
      return {
        results: [...results].sort((a, b) => b.score - a.score),
        model: options.model || this.defaultRerankModel,
      };
    }

    /**
     * Judge every document, in input order, running `batchSize` requests at a time.
     *
     * @param query - Search query.
     * @param documents - Candidate documents.
     * @param options - Model (falls back to the default rerank model) and
     *   batch size; a missing, zero, negative or NaN batch size means 5.
     * @returns One verdict per document, in the same order as `documents`.
     */
    async rerankerLogprobsCheck(
      query: string,
      documents: RerankDocument[],
      options: RerankOptions
    ): Promise<RerankDocumentResult[]> {
      const model = options.model || this.defaultRerankModel;
      // A negative step would move the loop index backwards forever, so only
      // whole sizes >= 1 are accepted.
      const requested = Math.floor(options.batchSize ?? 5);
      const batchSize = requested >= 1 ? requested : 5;
      const results: RerankDocumentResult[] = [];
      // Process in batches
      for (let i = 0; i < documents.length; i += batchSize) {
        const batch = documents.slice(i, i + batchSize);
        const batchResults = await Promise.all(
          batch.map((doc) => this.rerankSingle(query, doc, model))
        );
        results.push(...batchResults);
      }
      return results;
    }

    /**
     * Rerank a single document - internal helper.
     *
     * Builds a raw chat-format prompt and asks for exactly one token with
     * logprobs. A failed generation yields an all-zero, not-relevant verdict.
     */
    private async rerankSingle(
      query: string,
      doc: RerankDocument,
      model: string
    ): Promise<RerankDocumentResult> {
      const systemPrompt = `Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".`;
      const instruct = `Given a search query, determine if the following document is relevant to the query. Consider both direct matches and related concepts.`;
      // Title fallback: file name without directory and ".md" suffix.
      const docTitle = doc.title || doc.file.split("/").pop()?.replace(/\.md$/, "") || doc.file;
      // Only the first 4000 characters of the text go into the prompt.
      const docPreview = doc.text.length > 4000 ? doc.text.substring(0, 4000) + "..." : doc.text;
      // Qwen3-reranker prompt format with empty think tags
      const prompt = `<|im_start|>system
${systemPrompt}<|im_end|>
<|im_start|>user
<Instruct>: ${instruct}
<Query>: ${query}
<Document Title>: ${docTitle}
<Document>: ${docPreview}<|im_end|>
<|im_start|>assistant
<think>
</think>
`;
      const result = await this.generate(prompt, {
        model,
        maxTokens: 1,
        temperature: 0,
        logprobs: true,
        raw: true,
      });
      if (!result) {
        return {
          file: doc.file,
          relevant: false,
          confidence: 0,
          score: 0,
          rawToken: "",
          logprob: 0,
        };
      }
      return this.parseRerankResponse(doc.file, result);
    }

    /**
     * Parse rerank response into structured result.
     *
     * The first token's logprob gives `confidence = exp(logprob)`. When no
     * logprob is available it counts as 0, i.e. confidence 1.
     */
    private parseRerankResponse(file: string, result: GenerateResult): RerankDocumentResult {
      const token = result.text.toLowerCase().trim();
      const first = result.logprobs?.[0];
      const logprob = first?.logprob ?? 0;
      const confidence = Math.exp(logprob);
      let relevant: boolean;
      let score: number;
      if (token.startsWith("yes")) {
        relevant = true;
        // Score: 0.5 base + up to 0.5 from confidence
        score = 0.5 + 0.5 * confidence;
      } else if (token.startsWith("no")) {
        relevant = false;
        // Score: up to 0.5 based on uncertainty (1 - confidence)
        score = 0.5 * (1 - confidence);
      } else {
        // Unknown token - neutral score
        relevant = false;
        score = 0.3;
      }
      return {
        file,
        relevant,
        confidence,
        score,
        rawToken: first?.token ?? token,
        logprob,
      };
    }
  }
  437. // =============================================================================
  438. // Singleton for default Ollama instance
  439. // =============================================================================
  let defaultOllama: Ollama | null = null;
  /**
   * Return the shared Ollama instance, constructing it with default
   * configuration on first use (or after it has been reset to null).
   */
  export function getDefaultOllama(): Ollama {
    if (defaultOllama !== null) {
      return defaultOllama;
    }
    const created = new Ollama();
    defaultOllama = created;
    return created;
  }
  /**
   * Replace the shared Ollama instance (useful for testing).
   *
   * @param ollama - Instance to install, or `null` to clear the current one.
   */
  export function setDefaultOllama(ollama: Ollama | null): void {
    const replacement: Ollama | null = ollama;
    defaultOllama = replacement;
  }
  455. }