store.ts 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239
  1. /**
  2. * QMD Store - Core data access and retrieval functions
  3. *
  4. * This module provides all database operations, search functions, and document
  5. * retrieval for QMD. It returns raw data structures that can be formatted by
  6. * CLI or MCP consumers.
  7. *
  8. * Usage:
  9. * const store = createStore("/path/to/db.sqlite");
  10. * // or use default path:
  11. * const store = createStore();
  12. */
  13. import { Database } from "bun:sqlite";
  14. import { Glob } from "bun";
  15. import * as sqliteVec from "sqlite-vec";
  16. import {
  17. Ollama,
  18. getDefaultOllama,
  19. formatQueryForEmbedding,
  20. formatDocForEmbedding,
  21. type RerankDocument,
  22. } from "./llm";
  23. // =============================================================================
  24. // Configuration
  25. // =============================================================================
  /** Home directory from the environment; "/tmp" is used when HOME is unset or empty. */
  const HOME = Bun.env.HOME || "/tmp";

  /** Default embedding model name. */
  export const DEFAULT_EMBED_MODEL = "embeddinggemma";

  /** Default reranker model name. */
  export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";

  /** Default query model name. */
  export const DEFAULT_QUERY_MODEL = "qwen3:0.6b";

  /** Default glob pattern (all Markdown files, recursively). */
  export const DEFAULT_GLOB = "**/*.md";

  /** Default byte budget for multi-get operations: 10KB. */
  export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024;

  /** Base URL of the default Ollama client, re-exported for backwards compatibility. */
  export const OLLAMA_URL = getDefaultOllama().getBaseUrl();

  /** Chunk size in bytes: ~2000 tokens per chunk at ~3 bytes/token = 6KB. */
  const CHUNK_BYTE_SIZE = 6 * 1024;
  36. // =============================================================================
  37. // Path utilities
  38. // =============================================================================
  /**
   * Returns the home directory held in the module-level `HOME` constant.
   *
   * @returns The value of `HOME`.
   */
  export function homedir(): string {
    const home: string = HOME;
    return home;
  }
  /**
   * Joins path segments into a normalized absolute POSIX-style path.
   *
   * Segments are applied left to right: a segment starting with `/` replaces
   * everything before it, any other segment is appended. When the first segment
   * is relative (or no segments are given) the result is anchored at the current
   * working directory (`PWD` from the environment, else `process.cwd()`).
   * Empty segments and `.` are dropped; `..` removes the previous component and
   * is a no-op at the root.
   *
   * @param paths - Path segments to combine; may be empty.
   * @returns An absolute path with a leading `/` and no trailing slash (except `/` itself).
   */
  export function resolve(...paths: string[]): string {
    const first = paths[0];
    // Only consult the working directory when it is actually needed, so an
    // absolute first segment never touches process.cwd().
    let result = first !== undefined && first.startsWith('/')
      ? ''
      : process.env.PWD || process.cwd();

    for (const segment of paths) {
      result = segment.startsWith('/') ? segment : `${result}/${segment}`;
    }

    const normalized: string[] = [];
    for (const part of result.split('/')) {
      if (part === '' || part === '.') continue;
      if (part === '..') normalized.pop();
      else normalized.push(part);
    }
    return '/' + normalized.join('/');
  }
  /**
   * Computes the default SQLite database path for a named index.
   *
   * `INDEX_PATH` from the environment, when set, is returned verbatim (testing
   * override). Otherwise the file lives in `<cache>/qmd/<indexName>.sqlite`,
   * where `<cache>` is `XDG_CACHE_HOME` or `<home>/.cache`. The `qmd` directory
   * is created on a best-effort basis via `mkdir -p`.
   *
   * @param indexName - Base name of the index file (without extension).
   * @returns Path of the database file.
   */
  export function getDefaultDbPath(indexName: string = "index"): string {
    const override = Bun.env.INDEX_PATH;
    if (override) return override;

    const cacheRoot = Bun.env.XDG_CACHE_HOME || resolve(homedir(), ".cache");
    const qmdDir = resolve(cacheRoot, "qmd");
    try {
      Bun.spawnSync(["mkdir", "-p", qmdDir]);
    } catch {
      // Best effort only: failures to create the directory are ignored here.
    }
    return resolve(qmdDir, `${indexName}.sqlite`);
  }
  /**
   * Returns the current working directory, preferring the `PWD` environment
   * variable and falling back to `process.cwd()` when it is unset or empty.
   */
  export function getPwd(): string {
    const fromEnv = process.env.PWD;
    return fromEnv ? fromEnv : process.cwd();
  }
  /**
   * Resolves `path` through the external `realpath` command.
   *
   * @param path - Path to canonicalize.
   * @returns The trimmed stdout of `realpath` when the command succeeds;
   *          otherwise (non-zero exit or a thrown error) `resolve(path)`.
   */
  export function getRealPath(path: string): string {
    let canonical: string | null = null;
    try {
      const proc = Bun.spawnSync(["realpath", path]);
      if (proc.success) {
        canonical = proc.stdout.toString().trim();
      }
    } catch {
      // Fall through to the pure-string fallback below.
    }
    return canonical !== null ? canonical : resolve(path);
  }
  81. // =============================================================================
  82. // Database initialization
  83. // =============================================================================
  // macOS: switch to Homebrew's SQLite build (which supports extensions) when
  // its dylib is present. Any error while probing or switching is ignored.
  if (process.platform === "darwin") {
    const brewSqliteLib = "/opt/homebrew/opt/sqlite/lib/libsqlite3.dylib";
    try {
      const libPresent = Bun.file(brewSqliteLib).size > 0;
      if (libPresent) {
        Database.setCustomSQLite(brewSqliteLib);
      }
    } catch {
      // Keep the bundled SQLite.
    }
  }
  /**
   * Loads the sqlite-vec extension into `db`, enables WAL journaling, and creates
   * or migrates the schema: `collections`, `path_contexts`, `ollama_cache`,
   * `documents`, `content_vectors`, the `documents_fts` FTS5 index with its
   * sync triggers, and the supporting indexes.
   *
   * @param db - Open database connection to prepare.
   */
  function initializeDatabase(db: Database): void {
    // Column names reported by PRAGMA table_info; the migrations below treat an
    // empty list as "table not present".
    const columnNames = (table: string): string[] => {
      const info = db.prepare(`PRAGMA table_info(${table})`).all() as { name: string }[];
      return info.map(col => col.name);
    };

    sqliteVec.load(db);
    db.exec("PRAGMA journal_mode = WAL");

    // Collections table
    db.exec(`
      CREATE TABLE IF NOT EXISTS collections (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        pwd TEXT NOT NULL,
        glob_pattern TEXT NOT NULL,
        created_at TEXT NOT NULL,
        context TEXT,
        UNIQUE(pwd, glob_pattern)
      )
    `);

    // Path-based context
    db.exec(`
      CREATE TABLE IF NOT EXISTS path_contexts (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        path_prefix TEXT NOT NULL UNIQUE,
        context TEXT NOT NULL,
        created_at TEXT NOT NULL
      )
    `);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_path_contexts_prefix ON path_contexts(path_prefix)`);

    // Cache table for Ollama API calls
    db.exec(`
      CREATE TABLE IF NOT EXISTS ollama_cache (
        hash TEXT PRIMARY KEY,
        result TEXT NOT NULL,
        created_at TEXT NOT NULL
      )
    `);

    // Documents table
    db.exec(`
      CREATE TABLE IF NOT EXISTS documents (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        collection_id INTEGER NOT NULL,
        name TEXT NOT NULL,
        title TEXT NOT NULL,
        hash TEXT NOT NULL,
        filepath TEXT NOT NULL,
        display_path TEXT NOT NULL DEFAULT '',
        body TEXT NOT NULL,
        created_at TEXT NOT NULL,
        modified_at TEXT NOT NULL,
        active INTEGER NOT NULL DEFAULT 1,
        FOREIGN KEY (collection_id) REFERENCES collections(id)
      )
    `);

    // Migration: databases created before display_path existed get the column added.
    if (!columnNames("documents").includes('display_path')) {
      db.exec(`ALTER TABLE documents ADD COLUMN display_path TEXT NOT NULL DEFAULT ''`);
    }
    db.exec(`CREATE UNIQUE INDEX IF NOT EXISTS idx_documents_display_path ON documents(display_path) WHERE display_path != '' AND active = 1`);

    // Content vectors: a pre-existing table without the seq column is dropped
    // (together with vectors_vec) and recreated below.
    const vectorColumns = columnNames("content_vectors");
    if (vectorColumns.length > 0 && !vectorColumns.includes('seq')) {
      db.exec(`DROP TABLE IF EXISTS content_vectors`);
      db.exec(`DROP TABLE IF EXISTS vectors_vec`);
    }
    db.exec(`
      CREATE TABLE IF NOT EXISTS content_vectors (
        hash TEXT NOT NULL,
        seq INTEGER NOT NULL DEFAULT 0,
        pos INTEGER NOT NULL DEFAULT 0,
        model TEXT NOT NULL,
        embedded_at TEXT NOT NULL,
        PRIMARY KEY (hash, seq)
      )
    `);

    // FTS: external-content FTS5 index over documents(name, body)
    db.exec(`
      CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
        name, body,
        content='documents',
        content_rowid='id',
        tokenize='porter unicode61'
      )
    `);

    // Triggers keeping documents_fts in step with inserts, deletes and updates.
    db.exec(`
      CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents BEGIN
        INSERT INTO documents_fts(rowid, name, body) VALUES (new.id, new.name, new.body);
      END
    `);
    db.exec(`
      CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
        INSERT INTO documents_fts(documents_fts, rowid, name, body) VALUES('delete', old.id, old.name, old.body);
      END
    `);
    db.exec(`
      CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents BEGIN
        INSERT INTO documents_fts(documents_fts, rowid, name, body) VALUES('delete', old.id, old.name, old.body);
        INSERT INTO documents_fts(rowid, name, body) VALUES (new.id, new.name, new.body);
      END
    `);

    // Lookup indexes on documents
    db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents(collection_id, active)`);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(hash)`);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_filepath ON documents(filepath, active)`);
    db.exec(`CREATE UNIQUE INDEX IF NOT EXISTS idx_documents_filepath_active ON documents(filepath) WHERE active = 1`);
  }
  /**
   * Ensures the `vectors_vec` virtual table exists with the requested embedding
   * width and a `hash_seq` primary key.
   *
   * If a table already exists whose stored SQL declares `float[dimensions]` and
   * mentions `hash_seq`, nothing happens. Otherwise the table is dropped and
   * recreated.
   *
   * @param db - Database connection.
   * @param dimensions - Embedding width; must be a positive integer.
   * @throws RangeError when `dimensions` is not a positive integer. The check
   *         runs before any DROP so a bad value cannot destroy an existing index.
   */
  function ensureVecTableInternal(db: Database, dimensions: number): void {
    if (!Number.isInteger(dimensions) || dimensions <= 0) {
      throw new RangeError(`Invalid embedding dimensions: ${dimensions}`);
    }

    const tableInfo = db.prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get() as { sql: string } | null;
    if (tableInfo) {
      const declared = /float\[(\d+)\]/.exec(tableInfo.sql)?.[1];
      const existingDimensions = declared !== undefined ? Number.parseInt(declared, 10) : NaN;
      const hasHashSeq = tableInfo.sql.includes('hash_seq');
      if (existingDimensions === dimensions && hasHashSeq) return;
      db.exec("DROP TABLE IF EXISTS vectors_vec");
    }
    db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}])`);
  }
  206. // =============================================================================
  207. // Store Factory
  208. // =============================================================================
  /**
   * Handle returned by `createStore()`: the open database plus every store
   * operation pre-bound to it.
   */
  export type Store = {
    /** Underlying database connection. */
    db: Database;
    /** Path the database was opened from. */
    dbPath: string;
    /** Closes the database connection. */
    close: () => void;
    /** Creates or recreates the vector table for the given embedding width. */
    ensureVecTable: (dimensions: number) => void;

    // ---- Index health ----
    getHashesNeedingEmbedding: () => number;
    getIndexHealth: () => IndexHealthInfo;
    getStatus: () => IndexStatus;

    // ---- Caching ----
    getCacheKey: typeof getCacheKey;
    getCachedResult: (cacheKey: string) => string | null;
    setCachedResult: (cacheKey: string, result: string) => void;
    clearCache: () => void;

    // ---- Context ----
    getContextForFile: (filepath: string) => string | null;
    getCollectionIdByName: (name: string) => number | null;

    // ---- Search ----
    searchFTS: (query: string, limit?: number, collectionId?: number) => SearchResult[];
    searchVec: (query: string, model: string, limit?: number, collectionId?: number) => Promise<SearchResult[]>;

    // ---- Query expansion & reranking ----
    expandQuery: (query: string, model?: string) => Promise<string[]>;
    rerank: (query: string, documents: { file: string; text: string }[], model?: string) => Promise<{ file: string; score: number }[]>;

    // ---- Document retrieval ----
    findDocument: (filename: string, options?: { includeBody?: boolean }) => DocumentResult | DocumentNotFound;
    getDocumentBody: (doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number) => string | null;
    findDocuments: (pattern: string, options?: { includeBody?: boolean; maxBytes?: number }) => { docs: MultiGetResult[]; errors: string[] };

    // ---- Legacy compatibility ----
    getDocument: (filename: string, fromLine?: number, maxLines?: number) => (DocumentResult & { body: string }) | DocumentNotFound;
    getMultipleDocuments: (pattern: string, maxLines?: number, maxBytes?: number) => { files: MultiGetFile[]; errors: string[] };

    // ---- Fuzzy matching ----
    findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => string[];
    matchFilesByGlob: (pattern: string) => { filepath: string; displayPath: string; bodyLength: number }[];
  };
  /**
   * Opens (and initializes) a database and returns a `Store` whose methods are
   * all bound to that connection.
   *
   * @param dbPath - Path to the SQLite database file; when omitted or empty,
   *                 the path from `getDefaultDbPath()` is used.
   * @returns Store instance with all methods bound to the database.
   */
  export function createStore(dbPath?: string): Store {
    const resolvedPath = dbPath || getDefaultDbPath();
    const db = new Database(resolvedPath);
    initializeDatabase(db);

    const store: Store = {
      db,
      dbPath: resolvedPath,
      close: () => db.close(),
      ensureVecTable: (dimensions) => ensureVecTableInternal(db, dimensions),

      // Index health
      getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db),
      getIndexHealth: () => getIndexHealth(db),
      getStatus: () => getStatus(db),

      // Caching
      getCacheKey,
      getCachedResult: (cacheKey) => getCachedResult(db, cacheKey),
      setCachedResult: (cacheKey, result) => setCachedResult(db, cacheKey, result),
      clearCache: () => clearCache(db),

      // Context
      getContextForFile: (filepath) => getContextForFile(db, filepath),
      getCollectionIdByName: (name) => getCollectionIdByName(db, name),

      // Search
      searchFTS: (query, limit, collectionId) => searchFTS(db, query, limit, collectionId),
      searchVec: (query, model, limit, collectionId) => searchVec(db, query, model, limit, collectionId),

      // Query expansion & reranking
      expandQuery: (query, model) => expandQuery(query, model, db),
      rerank: (query, documents, model) => rerank(query, documents, model, db),

      // Document retrieval
      findDocument: (filename, options) => findDocument(db, filename, options),
      getDocumentBody: (doc, fromLine, maxLines) => getDocumentBody(db, doc, fromLine, maxLines),
      findDocuments: (pattern, options) => findDocuments(db, pattern, options),

      // Legacy compatibility
      getDocument: (filename, fromLine, maxLines) => getDocument(db, filename, fromLine, maxLines),
      getMultipleDocuments: (pattern, maxLines, maxBytes) => getMultipleDocuments(db, pattern, maxLines, maxBytes),

      // Fuzzy matching
      findSimilarFiles: (query, maxDistance, limit) => findSimilarFiles(db, query, maxDistance, limit),
      matchFilesByGlob: (pattern) => matchFilesByGlob(db, pattern),
    };
    return store;
  }
  289. // =============================================================================
  290. // Legacy compatibility - will be removed
  291. // =============================================================================
  /** Lazily opened connection shared by the deprecated module-level API. */
  let _legacyDb: Database | null = null;
  /** Explicit database path selected via setCustomIndexName(), or null for the default. */
  let _legacyDbPath: string | null = null;

  /**
   * Selects which named index the deprecated module-level API uses.
   *
   * Any connection previously opened by `getDb()` is closed so its handle is not
   * leaked; the next `getDb()` call opens a fresh connection at the new path.
   *
   * @param name - Index name, or null/empty to go back to the default index.
   * @deprecated Use createStore() instead
   */
  export function setCustomIndexName(name: string | null): void {
    _legacyDbPath = name ? getDefaultDbPath(name) : null;
    if (_legacyDb) {
      // Close before dropping the reference; merely nulling it would leak the handle.
      _legacyDb.close();
      _legacyDb = null;
    }
  }
  /**
   * Returns the database path used by the deprecated module-level API: the
   * custom path if one was selected, otherwise the default path.
   *
   * @deprecated Use createStore() instead
   */
  export function getDbPath(): string {
    if (_legacyDbPath) {
      return _legacyDbPath;
    }
    return getDefaultDbPath();
  }
  /**
   * Returns the shared legacy connection, opening and initializing it on first use.
   *
   * @deprecated Use createStore() instead
   */
  export function getDb(): Database {
    if (_legacyDb) {
      return _legacyDb;
    }
    const opened = new Database(getDbPath());
    // The singleton is published before initialization runs, as before.
    _legacyDb = opened;
    initializeDatabase(opened);
    return opened;
  }
  /**
   * Closes the legacy connection, if one is open, and resets the singleton.
   *
   * @deprecated Use store.db.close() instead.
   */
  export function closeDb(): void {
    if (!_legacyDb) return;
    _legacyDb.close();
    _legacyDb = null;
  }
  /**
   * Thin wrapper that forwards to `ensureVecTableInternal`.
   *
   * @param db - Database connection.
   * @param dimensions - Embedding width for the vector table.
   * @deprecated Use store.ensureVecTable() instead
   */
  export function ensureVecTable(db: Database, dimensions: number): void {
    return ensureVecTableInternal(db, dimensions);
  }
  322. // =============================================================================
  323. // Core Document Type
  324. // =============================================================================
  /**
   * Unified document result type with all metadata.
   * The body is optional; load it separately with getDocumentBody() when needed.
   */
  export type DocumentResult = {
    /** Full filesystem path. */
    filepath: string;
    /** Short display path (e.g., "docs/readme.md"). */
    displayPath: string;
    /** Document title (from first heading or filename). */
    title: string;
    /** Folder context description if configured. */
    context: string | null;
    /** Content hash for caching/change detection. */
    hash: string;
    /** Parent collection ID. */
    collectionId: number;
    /** Last modification timestamp. */
    modifiedAt: string;
    /**
     * Body length, documented as bytes (useful before loading).
     * NOTE(review): matchFilesByGlob in this file fills it from SQL LENGTH(body),
     * which counts characters for TEXT values — confirm the intended unit.
     */
    bodyLength: number;
    /** Document body (optional, load with getDocumentBody). */
    body?: string;
  };
  /**
   * A DocumentResult annotated with its relevance score and the search
   * backend that produced it.
   */
  export type SearchResult = DocumentResult & {
    /** Relevance score (0-1). */
    score: number;
    /** Search source (full-text or vector). */
    source: "fts" | "vec";
    /** Character position of matching chunk (for vector search). */
    chunkPos?: number;
  };
  /**
   * Simplified ranked result used internally for RRF fusion.
   */
  export type RankedResult = {
    /** File identifier of the result. */
    file: string;
    /** Short display path. */
    displayPath: string;
    /** Document title. */
    title: string;
    /** Document body. */
    body: string;
    /** Score assigned to the result. */
    score: number;
  };
  /**
   * Error result returned when a document is not found.
   */
  export type DocumentNotFound = {
    /** Discriminator: always "not_found". */
    error: "not_found";
    /** The query that produced no match. */
    query: string;
    /** Suggested similar files. */
    similarFiles: string[];
  };
  /**
   * Result from multi-get operations, discriminated on `skipped`:
   * either a full document, or a skipped entry with only its paths and a reason.
   */
  export type MultiGetResult =
    | {
        doc: DocumentResult;
        skipped: false;
      }
    | {
        doc: Pick<DocumentResult, "filepath" | "displayPath">;
        skipped: true;
        skipReason: string;
      };
  /** Summary of one collection. */
  export type CollectionInfo = {
    /** Collection ID. */
    id: number;
    /** Collection path. */
    path: string;
    /** Collection pattern. */
    pattern: string;
    /** Document count for the collection. */
    documents: number;
    /** Last-updated timestamp string. */
    lastUpdated: string;
  };
  /** Overall index status. */
  export type IndexStatus = {
    /** Total number of documents. */
    totalDocuments: number;
    /** Count of items still needing embedding. */
    needsEmbedding: number;
    /** Whether a vector index is present. */
    hasVectorIndex: boolean;
    /** Per-collection summaries. */
    collections: CollectionInfo[];
  };
  390. // =============================================================================
  391. // Index health
  392. // =============================================================================
  /**
   * Counts the distinct content hashes of active documents that have no
   * `content_vectors` row with `seq = 0`.
   *
   * @param db - Database connection.
   * @returns Number of distinct hashes without a first-chunk vector row.
   */
  export function getHashesNeedingEmbedding(db: Database): number {
    const statement = db.prepare(`
      SELECT COUNT(DISTINCT d.hash) as count
      FROM documents d
      LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
      WHERE d.active = 1 AND v.hash IS NULL
    `);
    const { count } = statement.get() as { count: number };
    return count;
  }
  /** Health summary produced by getIndexHealth(). */
  export type IndexHealthInfo = {
    /** Distinct hashes of active documents still lacking an embedding. */
    needsEmbedding: number;
    /** Number of active documents. */
    totalDocs: number;
    /** Whole days since the newest active document was modified; null when there is none. */
    daysStale: number | null;
  };
  /**
   * Gathers index health figures: pending embeddings, active document count,
   * and how many whole days have passed since the most recent `modified_at`
   * among active documents.
   *
   * @param db - Database connection.
   * @returns Health info; `daysStale` is null when no active document has a timestamp.
   */
  export function getIndexHealth(db: Database): IndexHealthInfo {
    const MS_PER_DAY = 24 * 60 * 60 * 1000;

    const needsEmbedding = getHashesNeedingEmbedding(db);
    const countRow = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number };
    const totalDocs = countRow.count;
    const latestRow = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };

    const latest = latestRow?.latest;
    const daysStale = latest
      ? Math.floor((Date.now() - new Date(latest).getTime()) / MS_PER_DAY)
      : null;

    return { needsEmbedding, totalDocs, daysStale };
  }
  418. // =============================================================================
  419. // Caching
  420. // =============================================================================
  /**
   * Builds a cache key: the SHA-256 hex digest of `url` followed by the JSON
   * serialization of `body`.
   *
   * @param url - Request URL.
   * @param body - Request payload; serialized with JSON.stringify.
   * @returns Hex-encoded digest.
   */
  export function getCacheKey(url: string, body: object): string {
    const hasher = new Bun.CryptoHasher("sha256");
    const serializedBody = JSON.stringify(body);
    hasher.update(url);
    hasher.update(serializedBody);
    return hasher.digest("hex");
  }
  /**
   * Looks up a cached result by key in `ollama_cache`.
   *
   * @param db - Database connection.
   * @param cacheKey - Key produced by getCacheKey().
   * @returns The stored result, or null when there is no row or the stored
   *          result is an empty string.
   */
  export function getCachedResult(db: Database, cacheKey: string): string | null {
    const lookup = db.prepare(`SELECT result FROM ollama_cache WHERE hash = ?`);
    const hit = lookup.get(cacheKey) as { result: string } | null;
    if (!hit || !hit.result) {
      return null;
    }
    return hit.result;
  }
  /**
   * Stores (or replaces) a cached result under `cacheKey`, stamped with the
   * current ISO time. Roughly one call in a hundred also prunes the cache down
   * to the 1000 most recently created entries.
   *
   * @param db - Database connection.
   * @param cacheKey - Key produced by getCacheKey().
   * @param result - Value to cache.
   */
  export function setCachedResult(db: Database, cacheKey: string, result: string): void {
    const createdAt = new Date().toISOString();
    const upsert = db.prepare(`INSERT OR REPLACE INTO ollama_cache (hash, result, created_at) VALUES (?, ?, ?)`);
    upsert.run(cacheKey, result, createdAt);

    // Probabilistic pruning keeps the table bounded without paying the cost on every write.
    const shouldPrune = Math.random() < 0.01;
    if (!shouldPrune) return;
    db.exec(`DELETE FROM ollama_cache WHERE hash NOT IN (SELECT hash FROM ollama_cache ORDER BY created_at DESC LIMIT 1000)`);
  }
  /**
   * Deletes every row from `ollama_cache`.
   *
   * @param db - Database connection.
   */
  export function clearCache(db: Database): void {
    const wipeCacheSql = `DELETE FROM ollama_cache`;
    db.exec(wipeCacheSql);
  }
  441. // =============================================================================
  442. // Document helpers
  443. // =============================================================================
  /**
   * Computes the SHA-256 hex digest of `content`.
   *
   * @param content - Text to hash.
   * @returns Promise resolving to the hex-encoded digest.
   */
  export async function hashContent(content: string): Promise<string> {
    const hasher = new Bun.CryptoHasher("sha256");
    hasher.update(content);
    const hexDigest = hasher.digest("hex");
    return hexDigest;
  }
  /**
   * Derives a document title from Markdown content.
   *
   * The first `#` or `##` heading wins. If that heading is the generic
   * "Notes" / "📝 Notes", the next `##` heading *after it* is used instead
   * (when there is one). Without any heading, the title is the file's base name
   * with a trailing `.md` removed.
   *
   * @param content - Markdown source of the document.
   * @param filename - File name or path used as the fallback title.
   * @returns The extracted title.
   */
  export function extractTitle(content: string, filename: string): string {
    const heading = /^##?\s+(.+)$/m.exec(content);
    if (heading) {
      const title = (heading[1] ?? "").trim();
      if (title === "📝 Notes" || title === "Notes") {
        // Search only past the matched heading; scanning from the top would
        // find a generic "## Notes" heading itself again.
        const rest = content.slice(heading.index + heading[0].length);
        const next = /^##\s+(.+)$/m.exec(rest);
        if (next) return (next[1] ?? "").trim();
      }
      return title;
    }
    return filename.replace(/\.md$/, "").split("/").pop() || filename;
  }
  461. // Re-export from llm.ts for backwards compatibility
  462. export { formatQueryForEmbedding, formatDocForEmbedding };
  /**
   * Splits `content` into chunks of at most `maxBytes` UTF-8 bytes each.
   *
   * Content that already fits is returned as a single chunk. Otherwise each
   * chunk is filled code point by code point (so a surrogate pair is never
   * split and astral characters count as 4 bytes), then shortened to a natural
   * boundary when one lies far enough into the chunk: a paragraph break or
   * sentence end in the second half, else a line break or space past the first
   * 30%. A code point larger than `maxBytes` is emitted alone so the loop
   * always advances.
   *
   * @param content - Text to split.
   * @param maxBytes - Upper bound on the UTF-8 size of a chunk.
   * @returns Chunks in order; `pos` is the chunk's start offset in `content`
   *          (UTF-16 code units, i.e. a string index).
   */
  export function chunkDocument(content: string, maxBytes: number = CHUNK_BYTE_SIZE): { text: string; pos: number }[] {
    const totalBytes = new TextEncoder().encode(content).length;
    if (totalBytes <= maxBytes) {
      return [{ text: content, pos: 0 }];
    }

    // UTF-8 size of one code point. Lone surrogates are encoded as U+FFFD (3 bytes).
    const utf8Length = (codePoint: number): number =>
      codePoint < 0x80 ? 1 : codePoint < 0x800 ? 2 : codePoint < 0x10000 ? 3 : 4;
    // Number of UTF-16 code units the code point at `index` occupies.
    const unitsAt = (index: number): number =>
      (content.codePointAt(index) ?? 0) > 0xffff ? 2 : 1;

    const chunks: { text: string; pos: number }[] = [];
    let charPos = 0;
    while (charPos < content.length) {
      // Greedily take whole code points while they fit in the byte budget.
      let endPos = charPos;
      let byteCount = 0;
      while (endPos < content.length) {
        const charBytes = utf8Length(content.codePointAt(endPos) ?? 0);
        if (byteCount + charBytes > maxBytes) break;
        byteCount += charBytes;
        endPos += unitsAt(endPos);
      }

      // Not the final chunk: try to pull the cut back to a natural boundary.
      if (endPos < content.length && endPos > charPos) {
        const slice = content.slice(charPos, endPos);
        const paragraphBreak = slice.lastIndexOf('\n\n');
        const sentenceEnd = Math.max(
          slice.lastIndexOf('. '),
          slice.lastIndexOf('.\n'),
          slice.lastIndexOf('? '),
          slice.lastIndexOf('?\n'),
          slice.lastIndexOf('! '),
          slice.lastIndexOf('!\n')
        );
        const lineBreak = slice.lastIndexOf('\n');
        const spaceBreak = slice.lastIndexOf(' ');

        let breakPoint = -1;
        if (paragraphBreak > slice.length * 0.5) {
          breakPoint = paragraphBreak + 2;
        } else if (sentenceEnd > slice.length * 0.5) {
          breakPoint = sentenceEnd + 2;
        } else if (lineBreak > slice.length * 0.3) {
          breakPoint = lineBreak + 1;
        } else if (spaceBreak > slice.length * 0.3) {
          breakPoint = spaceBreak + 1;
        }
        if (breakPoint > 0) {
          endPos = charPos + breakPoint;
        }
      }

      // Guarantee progress when even one code point exceeds the budget.
      if (endPos <= charPos) {
        endPos = charPos + unitsAt(charPos);
      }

      chunks.push({ text: content.slice(charPos, endPos), pos: charPos });
      charPos = endPos;
    }
    return chunks;
  }
  515. // =============================================================================
  516. // Fuzzy matching
  517. // =============================================================================
  /**
   * Computes the Levenshtein edit distance between two strings (unit cost for
   * insertion, deletion and substitution), comparing UTF-16 code units.
   *
   * @param a - First string.
   * @param b - Second string.
   * @returns Minimum number of single-unit edits turning `a` into `b`.
   */
  function levenshtein(a: string, b: string): number {
    if (a.length === 0) return b.length;
    if (b.length === 0) return a.length;

    // Only the previous row of the DP table is needed to compute the next one.
    let previousRow: number[] = Array.from({ length: b.length + 1 }, (_, col) => col);
    for (let row = 1; row <= a.length; row++) {
      const currentRow: number[] = [row];
      for (let col = 1; col <= b.length; col++) {
        const substitution = previousRow[col - 1] + (a[row - 1] === b[col - 1] ? 0 : 1);
        const deletion = previousRow[col] + 1;
        const insertion = currentRow[col - 1] + 1;
        currentRow[col] = Math.min(deletion, insertion, substitution);
      }
      previousRow = currentRow;
    }
    return previousRow[b.length];
  }
  /**
   * Suggests display paths of active documents that are close to `query` by
   * case-insensitive Levenshtein distance.
   *
   * @param db - Database connection.
   * @param query - Path the caller was looking for.
   * @param maxDistance - Largest edit distance still considered similar.
   * @param limit - Maximum number of suggestions.
   * @returns Display paths ordered from closest to farthest.
   */
  export function findSimilarFiles(db: Database, query: string, maxDistance: number = 3, limit: number = 5): string[] {
    const rows = db.prepare(`SELECT display_path FROM documents WHERE active = 1`).all() as { display_path: string }[];
    const needle = query.toLowerCase();

    const candidates: { path: string; dist: number }[] = [];
    for (const row of rows) {
      const dist = levenshtein(row.display_path.toLowerCase(), needle);
      if (dist <= maxDistance) {
        candidates.push({ path: row.display_path, dist });
      }
    }

    candidates.sort((left, right) => left.dist - right.dist);
    return candidates.slice(0, limit).map(candidate => candidate.path);
  }
  /**
   * Lists active documents whose display path matches a glob pattern.
   *
   * @param db - Database connection.
   * @param pattern - Glob pattern tested against each document's display path.
   * @returns Matching documents with `bodyLength` taken from SQL `LENGTH(body)`.
   */
  export function matchFilesByGlob(db: Database, pattern: string): { filepath: string; displayPath: string; bodyLength: number }[] {
    const rows = db.prepare(`SELECT filepath, display_path, LENGTH(body) as body_length FROM documents WHERE active = 1`).all() as { filepath: string; display_path: string; body_length: number }[];
    const matcher = new Glob(pattern);

    const matches: { filepath: string; displayPath: string; bodyLength: number }[] = [];
    for (const row of rows) {
      if (!matcher.match(row.display_path)) continue;
      matches.push({
        filepath: row.filepath,
        displayPath: row.display_path,
        bodyLength: row.body_length,
      });
    }
    return matches;
  }
  549. // =============================================================================
  550. // Context
  551. // =============================================================================
  /**
   * Finds the context attached to the longest `path_contexts.path_prefix` that
   * `filepath` matches (via SQL `LIKE prefix || '%'`).
   *
   * @param db - Database connection.
   * @param filepath - Path to look up.
   * @returns The context text, or null when no prefix matches or the stored
   *          context is empty.
   */
  export function getContextForFile(db: Database, filepath: string): string | null {
    const lookup = db.prepare(`
      SELECT context FROM path_contexts
      WHERE ? LIKE path_prefix || '%'
      ORDER BY LENGTH(path_prefix) DESC
      LIMIT 1
    `);
    const match = lookup.get(filepath) as { context: string } | null;
    if (!match || !match.context) {
      return null;
    }
    return match.context;
  }
  /**
   * Finds a collection whose `pwd` or `glob_pattern` contains `name` as a
   * literal substring, preferring the collection with the longest `pwd`.
   *
   * LIKE metacharacters in `name` (`%`, `_`) and the escape character itself are
   * escaped, so a name such as `my_docs` no longer matches `myXdocs`.
   *
   * @param db - Database connection.
   * @param name - Substring to search for.
   * @returns The collection id, or null when nothing matches.
   */
  export function getCollectionIdByName(db: Database, name: string): number | null {
    const escapedName = name.replace(/[\\%_]/g, ch => `\\${ch}`);
    const pattern = `%${escapedName}%`;
    // Search both pwd and glob_pattern columns for the name
    const result = db.prepare(`
      SELECT id FROM collections
      WHERE pwd LIKE ? ESCAPE '\\' OR glob_pattern LIKE ? ESCAPE '\\'
      ORDER BY LENGTH(pwd) DESC
      LIMIT 1
    `).get(pattern, pattern) as { id: number } | null;
    return result?.id ?? null;
  }
  571. // =============================================================================
  572. // FTS Search
  573. // =============================================================================
  /**
   * Reduces a search term to Unicode letters, digits and apostrophes, lowercased.
   *
   * @param term - Raw term from the user's query.
   * @returns The cleaned term; may be empty.
   */
  function sanitizeFTS5Term(term: string): string {
    const allowed = /[\p{L}\p{N}']/u;
    // Array.from walks the string by code point, like the /u replace it mirrors.
    const kept = Array.from(term).filter(ch => allowed.test(ch));
    return kept.join('').toLowerCase();
  }
  /**
   * Turns free text into an FTS5 MATCH expression: each whitespace-separated
   * term is sanitized, quoted and given a `*` prefix-match suffix, and the
   * terms are AND-ed together.
   *
   * @param query - Raw user query.
   * @returns The MATCH expression, or null when no usable term remains.
   */
  function buildFTS5Query(query: string): string | null {
    const prefixTerms: string[] = [];
    for (const raw of query.split(/\s+/)) {
      const cleaned = sanitizeFTS5Term(raw);
      if (cleaned.length > 0) {
        prefixTerms.push(`"${cleaned}"*`);
      }
    }
    // A single term joins to just itself, so no special case is needed.
    return prefixTerms.length === 0 ? null : prefixTerms.join(' AND ');
  }
  /**
   * Full-text search over active documents using the `documents_fts` FTS5 index.
   *
   * Rows are ranked by `bm25(documents_fts, 10.0, 1.0)` in ascending order. The
   * returned `score` is the absolute bm25 value divided by the largest absolute
   * value in the result set.
   *
   * Each result is a complete `SearchResult`. The pre-existing `file` property
   * (same value as `filepath`) is still included for consumers that read it.
   *
   * @param db - Database connection.
   * @param query - Free-text query; returns `[]` when it contains no usable term.
   * @param limit - Maximum number of rows.
   * @param collectionId - When given, restricts the search to one collection.
   * @returns Matching documents in rank order.
   */
  export function searchFTS(db: Database, query: string, limit: number = 20, collectionId?: number): SearchResult[] {
    const ftsQuery = buildFTS5Query(query);
    if (!ftsQuery) return [];

    let sql = `
      SELECT d.filepath, d.display_path, d.title, d.body, d.hash, d.collection_id, d.modified_at,
             LENGTH(d.body) as body_length, bm25(documents_fts, 10.0, 1.0) as score
      FROM documents_fts f
      JOIN documents d ON d.id = f.rowid
      WHERE documents_fts MATCH ? AND d.active = 1
    `;
    const params: (string | number)[] = [ftsQuery];
    if (collectionId !== undefined) {
      sql += ` AND d.collection_id = ?`;
      params.push(collectionId);
    }
    sql += ` ORDER BY score LIMIT ?`;
    params.push(limit);

    const rows = db.prepare(sql).all(...params) as {
      filepath: string;
      display_path: string;
      title: string;
      body: string;
      hash: string;
      collection_id: number;
      modified_at: string;
      body_length: number;
      score: number;
    }[];

    // Fall back to 1 when every score is 0 so normalization never yields NaN.
    const maxScore = rows.reduce((max, row) => Math.max(max, Math.abs(row.score)), 0) || 1;

    return rows.map(row => {
      const result: SearchResult & { file: string } = {
        file: row.filepath, // legacy alias of filepath
        filepath: row.filepath,
        displayPath: row.display_path,
        title: row.title,
        context: getContextForFile(db, row.filepath),
        hash: row.hash,
        collectionId: row.collection_id,
        modifiedAt: row.modified_at,
        bodyLength: row.body_length,
        body: row.body,
        score: Math.abs(row.score) / maxScore,
        source: "fts",
      };
      return result;
    });
  }
  612. // =============================================================================
  613. // Vector Search
  614. // =============================================================================
  /**
   * Vector (KNN) search over embedded document chunks.
   *
   * Embeds `query`, asks `vectors_vec` for the `limit * 3` nearest chunks, keeps
   * the closest chunk per file, and returns up to `limit` files ordered by
   * distance. `score` is `1 / (1 + distance)`; `chunkPos` is the stored `pos`
   * of the winning chunk.
   *
   * Each result is a complete `SearchResult`. The pre-existing `file` property
   * (same value as `filepath`) is still included for consumers that read it.
   *
   * @param db - Database connection.
   * @param query - Text to embed and search for.
   * @param model - Embedding model name passed to getEmbedding().
   * @param limit - Maximum number of files returned.
   * @param collectionId - When given, restricts results to one collection.
   * @returns Matching documents, or `[]` when the vector table is missing or
   *          no embedding could be obtained.
   */
  export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionId?: number): Promise<SearchResult[]> {
    const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (!tableExists) return [];

    const embedding = await getEmbedding(query, model, true);
    if (!embedding) return [];

    // sqlite-vec requires "k = ?" for KNN queries
    let sql = `
      SELECT v.hash_seq, v.distance, d.filepath, d.display_path, d.title, d.body,
             d.hash, d.collection_id, d.modified_at, LENGTH(d.body) as body_length, cv.pos
      FROM vectors_vec v
      JOIN content_vectors cv ON cv.hash || '_' || cv.seq = v.hash_seq
      JOIN documents d ON d.hash = cv.hash AND d.active = 1
      WHERE v.embedding MATCH ? AND k = ?
    `;
    const params: (Float32Array | number)[] = [new Float32Array(embedding), limit * 3];
    if (collectionId !== undefined) {
      // Bound as a parameter, matching searchFTS, instead of interpolated into the SQL text.
      sql += ` AND d.collection_id = ?`;
      params.push(collectionId);
    }
    sql += ` ORDER BY v.distance`;

    const rows = db.prepare(sql).all(...params) as {
      hash_seq: string;
      distance: number;
      filepath: string;
      display_path: string;
      title: string;
      body: string;
      hash: string;
      collection_id: number;
      modified_at: string;
      body_length: number;
      pos: number;
    }[];

    // Several chunks of one file can match; keep only the closest chunk per file.
    const bestByFile = new Map<string, typeof rows[0]>();
    for (const row of rows) {
      const current = bestByFile.get(row.filepath);
      if (!current || row.distance < current.distance) {
        bestByFile.set(row.filepath, row);
      }
    }

    return Array.from(bestByFile.values())
      .sort((a, b) => a.distance - b.distance)
      .slice(0, limit)
      .map(row => {
        const result: SearchResult & { file: string } = {
          file: row.filepath, // legacy alias of filepath
          filepath: row.filepath,
          displayPath: row.display_path,
          title: row.title,
          context: getContextForFile(db, row.filepath),
          hash: row.hash,
          collectionId: row.collection_id,
          modifiedAt: row.modified_at,
          bodyLength: row.body_length,
          body: row.body,
          score: 1 / (1 + row.distance),
          source: "vec",
          chunkPos: row.pos,
        };
        return result;
      });
  }
  653. // =============================================================================
  654. // Embeddings
  655. // =============================================================================
  /**
   * Request an embedding for `text` from the default Ollama client.
   *
   * @param text - Text to embed.
   * @param model - Embedding model name.
   * @param isQuery - Forwarded to the client's `embed` options.
   * @returns The embedding, or `null` when the client returned no result or a
   *          result without an embedding.
   */
  async function getEmbedding(text: string, model: string, isQuery: boolean): Promise<number[] | null> {
    const response = await getDefaultOllama().embed(text, { model, isQuery });
    const vector = response?.embedding;
    return vector ? vector : null;
  }
  661. // =============================================================================
  662. // Query expansion
  663. // =============================================================================
  /**
   * Produce alternative phrasings of a query, using the result cache when possible.
   *
   * On a cache hit the stored text is split into non-empty trimmed lines and the
   * result is the original query followed by at most two of those lines. On a
   * miss the default Ollama client is asked for expansions; everything after
   * the first element of its answer is cached (newline-separated) and the answer
   * is returned unchanged.
   *
   * @param query - Original user query.
   * @param model - Expansion model; defaults to `DEFAULT_QUERY_MODEL`.
   * @param db - Database handle used for the cache. It is required even though
   *             it follows a defaulted parameter.
   * @returns The list of queries to run.
   */
  export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database): Promise<string[]> {
    const cacheKey = getCacheKey("expandQuery", { query, model });

    const cachedText = getCachedResult(db, cacheKey);
    if (cachedText) {
      const variants: string[] = [];
      for (const rawLine of cachedText.split('\n')) {
        const line = rawLine.trim();
        if (line.length > 0) variants.push(line);
      }
      return [query, ...variants.slice(0, 2)];
    }

    const expanded = await getDefaultOllama().expandQuery(query, model, 2);

    // Only store something when there is more than the first entry
    // (the original query is excluded from the cached text).
    if (expanded.length > 1) {
      const alternatives = expanded.slice(1);
      setCachedResult(db, cacheKey, alternatives.join('\n'));
    }
    return expanded;
  }
  680. // =============================================================================
  681. // Reranking
  682. // =============================================================================
  /**
   * Score documents against a query with the reranking model, caching per file.
   *
   * Each document is first looked up in the cache under a key built from the
   * query, the file and the model. Documents without a cached score are sent to
   * the default Ollama client in one call, and the returned scores are written
   * back to the cache.
   *
   * NOTE(review): the cache key does not include the document text, so a cached
   * score is reused even when `text` differs for the same file — confirm intent.
   *
   * @param query - Query to rank against.
   * @param documents - Candidate documents (`file` identifier plus `text`).
   * @param model - Rerank model; defaults to `DEFAULT_RERANK_MODEL`.
   * @param db - Database handle used for the cache.
   * @returns One entry per input document, highest score first; documents with
   *          no (or a falsy) score get 0.
   */
  export async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db: Database): Promise<{ file: string; score: number }[]> {
    const scoreByFile = new Map<string, number>();
    const pending: RerankDocument[] = [];
    const keyFor = (file: string) => getCacheKey("rerank", { query, file, model });

    // Split the input into already-scored and still-to-score documents.
    for (const { file, text } of documents) {
      const stored = getCachedResult(db, keyFor(file));
      if (stored === null) {
        pending.push({ file, text });
      } else {
        scoreByFile.set(file, parseFloat(stored));
      }
    }

    // Score whatever the cache could not answer and remember the outcome.
    if (pending.length > 0) {
      const { results: fresh } = await getDefaultOllama().rerank(query, pending, { model });
      for (const { file, score } of fresh) {
        setCachedResult(db, keyFor(file), score.toString());
        scoreByFile.set(file, score);
      }
    }

    const ranked = documents.map(({ file }) => ({ file, score: scoreByFile.get(file) || 0 }));
    ranked.sort((a, b) => b.score - a.score);
    return ranked;
  }
  712. // =============================================================================
  713. // Reciprocal Rank Fusion
  714. // =============================================================================
  /**
   * Merge several ranked result lists into one using reciprocal rank fusion.
   *
   * Each appearance of a file at zero-based position `rank` in list `i`
   * contributes `weights[i] / (k + rank + 1)` to that file's fused score
   * (missing weights count as 1.0). After summing, a file whose best position
   * in any list was first receives +0.05, and one whose best position was
   * second or third receives +0.02.
   *
   * @param resultLists - Ranked lists, best result first.
   * @param weights - Optional per-list weights, aligned with `resultLists`.
   * @param k - Rank damping constant (default 60).
   * @returns One entry per distinct file, highest fused score first. Each entry
   *          is a copy of the first occurrence seen for that file with `score`
   *          replaced by the fused score.
   */
  export function reciprocalRankFusion(
    resultLists: RankedResult[][],
    weights: number[] = [],
    k: number = 60
  ): RankedResult[] {
    type Fused = { result: RankedResult; rrfScore: number; topRank: number };
    const fusedByFile = new Map<string, Fused>();

    for (const [listIdx, ranking] of resultLists.entries()) {
      const listWeight = weights[listIdx] ?? 1.0;
      for (const [position, item] of ranking.entries()) {
        const share = listWeight / (k + position + 1);
        const entry = fusedByFile.get(item.file);
        if (!entry) {
          fusedByFile.set(item.file, { result: item, rrfScore: share, topRank: position });
          continue;
        }
        entry.rrfScore += share;
        entry.topRank = Math.min(entry.topRank, position);
      }
    }

    const fused = [...fusedByFile.values()];

    // Reward files that reached the very top of at least one list.
    for (const entry of fused) {
      if (entry.topRank === 0) {
        entry.rrfScore += 0.05;
      } else if (entry.topRank <= 2) {
        entry.rrfScore += 0.02;
      }
    }

    fused.sort((left, right) => right.rrfScore - left.rrfScore);
    return fused.map(({ result, rrfScore }) => ({ ...result, score: rrfScore }));
  }
  752. // =============================================================================
  753. // Document retrieval
  754. // =============================================================================
  /**
   * Shape of a `documents` row as selected by the lookup functions in this
   * section. Field names mirror the SQL column names / aliases.
   */
  type DbDocRow = {
    /** `filepath` column; used as the key for follow-up lookups. */
    filepath: string;
    /** `display_path` column. */
    display_path: string;
    /** `title` column. */
    title: string;
    /** `hash` column. */
    hash: string;
    /** `collection_id` column. */
    collection_id: number;
    /** `modified_at` column. */
    modified_at: string;
    /** Selected as `LENGTH(body) as body_length`. */
    body_length: number;
    /** Only present when the query also selected the `body` column. */
    body?: string;
  };
  /**
   * Find a single active document by filename or path.
   *
   * A trailing `:<digits>` suffix is stripped and ignored, and a leading `~/`
   * is expanded with `homedir()`. Lookups are then tried in order and the first
   * hit wins:
   *   1. exact `filepath`
   *   2. exact `display_path`
   *   3. `filepath` ending with the given text
   *   4. `display_path` ending with the given text
   *
   * The suffix lookups treat `%`, `_` and `\` in the given text literally, and
   * are skipped for an empty path (which would otherwise match every row).
   *
   * @param db - Database handle.
   * @param filename - File name or path, optionally with a `:line` suffix.
   * @param options - `includeBody: true` also loads the document body.
   * @returns Document metadata (without body unless requested), or a
   *          `not_found` result carrying suggestions from `findSimilarFiles`.
   */
  export function findDocument(db: Database, filename: string, options: { includeBody?: boolean } = {}): DocumentResult | DocumentNotFound {
    let filepath = filename;
    const colonMatch = filepath.match(/:(\d+)$/);
    if (colonMatch) {
      filepath = filepath.slice(0, -colonMatch[0].length);
    }
    if (filepath.startsWith('~/')) {
      filepath = homedir() + filepath.slice(1);
    }

    const selectCols = options.includeBody
      ? `filepath, display_path, title, hash, collection_id, modified_at, LENGTH(body) as body_length, body`
      : `filepath, display_path, title, hash, collection_id, modified_at, LENGTH(body) as body_length`;

    // Match strategies as [WHERE condition, bound value], most specific first.
    const strategies: [string, string][] = [
      [`filepath = ?`, filepath],
      [`display_path = ?`, filepath],
    ];
    if (filepath.length > 0) {
      // Escape LIKE metacharacters so that e.g. "_" in a file name is not a wildcard.
      const suffixPattern = `%${filepath.replace(/[\\%_]/g, '\\$&')}`;
      strategies.push(
        [`filepath LIKE ? ESCAPE '\\'`, suffixPattern],
        [`display_path LIKE ? ESCAPE '\\'`, suffixPattern],
      );
    }

    let doc: DbDocRow | null = null;
    for (const [condition, value] of strategies) {
      doc = db.prepare(`SELECT ${selectCols} FROM documents WHERE ${condition} AND active = 1 LIMIT 1`).get(value) as DbDocRow | null;
      if (doc) break;
    }

    if (!doc) {
      const similar = findSimilarFiles(db, filepath, 5, 5);
      return { error: "not_found", query: filename, similarFiles: similar };
    }

    const context = getContextForFile(db, doc.filepath);
    return {
      filepath: doc.filepath,
      displayPath: doc.display_path,
      title: doc.title,
      context,
      hash: doc.hash,
      collectionId: doc.collection_id,
      modifiedAt: doc.modified_at,
      bodyLength: doc.body_length,
      ...(options.includeBody && doc.body !== undefined && { body: doc.body }),
    };
  }
  /**
   * Load the body of an active document, optionally restricted to a line range.
   *
   * @param db - Database handle.
   * @param doc - A document result, or any object carrying its `filepath`.
   * @param fromLine - 1-based first line to return; omitted or 0 means line 1.
   * @param maxLines - Maximum number of lines to return; omitted means "to the end".
   * @returns The (possibly sliced) body, or `null` when no active document has
   *          that `filepath`.
   */
  export function getDocumentBody(db: Database, doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number): string | null {
    // Both members of the `doc` union expose `filepath`, so it can be read directly.
    const row = db.prepare(`SELECT body FROM documents WHERE filepath = ? AND active = 1`).get(doc.filepath) as { body: string } | null;
    if (!row) return null;

    if (fromLine === undefined && maxLines === undefined) {
      return row.body;
    }

    const lines = row.body.split('\n');
    const start = (fromLine || 1) - 1; // convert to a 0-based index
    const end = maxLines !== undefined ? start + maxLines : lines.length;
    return lines.slice(start, end).join('\n');
  }
  /**
   * Legacy entry point kept for backwards compatibility: looks a document up
   * with `findDocument` (body included) and applies line slicing.
   *
   * A `:<digits>` suffix on `filename` supplies the first line only when
   * `fromLine` is not given (or is 0).
   *
   * @param db - Database handle.
   * @param filename - File name or path, optionally ending in `:line`.
   * @param fromLine - 1-based first line to return.
   * @param maxLines - Maximum number of lines to return.
   * @returns The document with its (possibly sliced) body, or the not-found
   *          result from `findDocument`.
   */
  export function getDocument(db: Database, filename: string, fromLine?: number, maxLines?: number): (DocumentResult & { body: string }) | DocumentNotFound {
    let lookupName = filename;
    let firstLine = fromLine;

    // Parse :line suffix
    const lineSuffix = /:(\d+)$/.exec(filename);
    if (lineSuffix && !firstLine) {
      firstLine = parseInt(lineSuffix[1], 10);
      lookupName = filename.slice(0, -lineSuffix[0].length);
    }

    const found = findDocument(db, lookupName, { includeBody: true });
    if ("error" in found) return found;

    const fullBody = found.body || "";
    const wantsSlice = firstLine !== undefined || maxLines !== undefined;
    if (!wantsSlice) {
      return { ...found, body: fullBody };
    }

    const allLines = fullBody.split('\n');
    const from = (firstLine || 1) - 1;
    const to = maxLines === undefined ? allLines.length : from + maxLines;
    return { ...found, body: allLines.slice(from, to).join('\n') };
  }
  /**
   * Find multiple active documents by glob pattern or comma-separated list.
   *
   * A pattern containing a comma and neither `*` nor `?` is treated as a list of
   * names; each name is matched against `display_path` exactly and then as a
   * suffix (with `%`, `_` and `\` taken literally). Anything else is handed to
   * `matchFilesByGlob`.
   *
   * Documents whose `body_length` exceeds `maxBytes` are reported as skipped
   * instead of being returned in full.
   * NOTE(review): `body_length` comes from SQLite `LENGTH(body)`, which counts
   * characters for text values, so the comparison with `maxBytes` is approximate
   * for non-ASCII content — confirm this is acceptable.
   *
   * @param db - Database handle.
   * @param pattern - Glob pattern or comma-separated names.
   * @param options - `includeBody` loads bodies (otherwise use `getDocumentBody`);
   *                  `maxBytes` overrides `DEFAULT_MULTI_GET_MAX_BYTES`.
   * @returns The found documents plus human-readable errors for names or
   *          patterns that matched nothing.
   */
  export function findDocuments(
    db: Database,
    pattern: string,
    options: { includeBody?: boolean; maxBytes?: number } = {}
  ): { docs: MultiGetResult[]; errors: string[] } {
    const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?');
    const errors: string[] = [];
    const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES;

    const selectCols = options.includeBody
      ? `filepath, display_path, title, hash, collection_id, modified_at, LENGTH(body) as body_length, body`
      : `filepath, display_path, title, hash, collection_id, modified_at, LENGTH(body) as body_length`;

    let fileRows: DbDocRow[];
    if (isCommaSeparated) {
      const names = pattern.split(',').map(s => s.trim()).filter(Boolean);
      fileRows = [];
      for (const name of names) {
        let doc = db.prepare(`SELECT ${selectCols} FROM documents WHERE display_path = ? AND active = 1`).get(name) as DbDocRow | null;
        if (!doc) {
          // Escape LIKE metacharacters so that e.g. "_" in a file name is not a wildcard.
          const suffixPattern = `%${name.replace(/[\\%_]/g, '\\$&')}`;
          doc = db.prepare(`SELECT ${selectCols} FROM documents WHERE display_path LIKE ? ESCAPE '\\' AND active = 1 LIMIT 1`).get(suffixPattern) as DbDocRow | null;
        }
        if (doc) {
          fileRows.push(doc);
        } else {
          const similar = findSimilarFiles(db, name, 5, 3);
          let msg = `File not found: ${name}`;
          if (similar.length > 0) {
            msg += ` (did you mean: ${similar.join(', ')}?)`;
          }
          errors.push(msg);
        }
      }
    } else {
      // Glob pattern match
      const matched = matchFilesByGlob(db, pattern);
      if (matched.length === 0) {
        errors.push(`No files matched pattern: ${pattern}`);
        return { docs: [], errors };
      }
      const filepaths = matched.map(m => m.filepath);
      const placeholders = filepaths.map(() => '?').join(',');
      fileRows = db.prepare(`SELECT ${selectCols} FROM documents WHERE filepath IN (${placeholders}) AND active = 1`).all(...filepaths) as DbDocRow[];
    }

    const results: MultiGetResult[] = [];
    for (const row of fileRows) {
      if (row.body_length > maxBytes) {
        results.push({
          doc: { filepath: row.filepath, displayPath: row.display_path },
          skipped: true,
          skipReason: `File too large (${Math.round(row.body_length / 1024)}KB > ${Math.round(maxBytes / 1024)}KB)`,
        });
        continue;
      }

      // Context is only part of fully returned entries, so look it up after the size check.
      const context = getContextForFile(db, row.filepath);
      results.push({
        doc: {
          filepath: row.filepath,
          displayPath: row.display_path,
          // Fall back to the last path segment when the row has no title.
          title: row.title || row.display_path.split('/').pop() || row.display_path,
          context,
          hash: row.hash,
          collectionId: row.collection_id,
          modifiedAt: row.modified_at,
          bodyLength: row.body_length,
          ...(options.includeBody && row.body !== undefined && { body: row.body }),
        },
        skipped: false,
      });
    }
    return { docs: results, errors };
  }
  /**
   * Legacy entry point kept for backwards compatibility: runs `findDocuments`
   * with bodies included and flattens each entry into a `MultiGetFile`.
   *
   * Skipped entries keep their paths and skip reason but get an empty title and
   * body and a `null` context. For the others, the body is cut to `maxLines`
   * lines when given, with a trailing note stating how many lines were dropped.
   *
   * @param db - Database handle.
   * @param pattern - Glob pattern or comma-separated names.
   * @param maxLines - Optional per-file line cap.
   * @param maxBytes - Size cap forwarded to `findDocuments`.
   * @returns Flattened files plus the errors reported by `findDocuments`.
   */
  export function getMultipleDocuments(db: Database, pattern: string, maxLines?: number, maxBytes: number = DEFAULT_MULTI_GET_MAX_BYTES): { files: MultiGetFile[]; errors: string[] } {
    const found = findDocuments(db, pattern, { includeBody: true, maxBytes });

    const files: MultiGetFile[] = [];
    for (const entry of found.docs) {
      if (entry.skipped) {
        files.push({
          filepath: entry.doc.filepath,
          displayPath: entry.doc.displayPath,
          title: "",
          body: "",
          context: null,
          skipped: true as const,
          skipReason: entry.skipReason,
        });
        continue;
      }

      let text = entry.doc.body || "";
      if (maxLines !== undefined) {
        const allLines = text.split('\n');
        const hidden = allLines.length - maxLines;
        text = allLines.slice(0, maxLines).join('\n');
        if (hidden > 0) {
          text += `\n\n[... truncated ${hidden} more lines]`;
        }
      }

      files.push({
        filepath: entry.doc.filepath,
        displayPath: entry.doc.displayPath,
        title: entry.doc.title,
        body: text,
        context: entry.doc.context,
        skipped: false as const,
      });
    }

    return { files, errors: found.errors };
  }
  /**
   * Flattened per-file result returned by `getMultipleDocuments`.
   * Kept for backwards compatibility with the old multi-get API.
   *
   * Discriminated on `skipped`: only skipped entries carry a `skipReason`.
   */
  export type MultiGetFile =
    | {
        filepath: string;
        displayPath: string;
        title: string;
        body: string;
        context: string | null;
        /** The file was returned in full (subject to any line cap). */
        skipped: false;
      }
    | {
        filepath: string;
        displayPath: string;
        title: string;
        body: string;
        context: string | null;
        /** The file was not returned; see `skipReason`. */
        skipped: true;
        skipReason: string;
      };
  977. // =============================================================================
  978. // Status
  979. // =============================================================================
  /**
   * Summarise the state of the index.
   *
   * Collects, per collection, the number of active documents and the most
   * recent document `modified_at` (falling back to the collection's
   * `created_at` when it has none), along with the total number of active
   * documents, the value reported by `getHashesNeedingEmbedding`, and whether
   * the `vectors_vec` table exists.
   *
   * @param db - Database handle.
   * @returns The index status; collections are ordered by `last_doc_update`
   *          descending.
   */
  export function getStatus(db: Database): IndexStatus {
    type CollectionRow = {
      id: number;
      pwd: string;
      glob_pattern: string;
      created_at: string;
      active_count: number;
      last_doc_update: string | null;
    };

    const collectionSql = [
      '',
      'SELECT c.id, c.pwd, c.glob_pattern, c.created_at,',
      'COUNT(d.id) as active_count,',
      'MAX(d.modified_at) as last_doc_update',
      'FROM collections c',
      'LEFT JOIN documents d ON d.collection_id = c.id AND d.active = 1',
      'GROUP BY c.id',
      'ORDER BY last_doc_update DESC',
      '',
    ].join('\n');
    const collectionRows = db.prepare(collectionSql).all() as CollectionRow[];

    const activeTotal = db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get() as { c: number };
    const needsEmbedding = getHashesNeedingEmbedding(db);
    const vecTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();

    const collections = collectionRows.map(row => ({
      id: row.id,
      path: row.pwd,
      pattern: row.glob_pattern,
      documents: row.active_count,
      lastUpdated: row.last_doc_update || row.created_at,
    }));

    return {
      totalDocuments: activeTotal.c,
      needsEmbedding,
      hasVectorIndex: Boolean(vecTable),
      collections,
    };
  }
  1006. // =============================================================================
  1007. // Snippet extraction
  1008. // =============================================================================
  /** Result of `extractSnippet`: the chosen excerpt plus its position in the document. */
  export type SnippetResult = {
    /** 1-indexed line number of best match */
    line: number;
    /** The snippet text with diff-style header */
    snippet: string;
    /** Lines in document before snippet */
    linesBefore: number;
    /** Lines in document after snippet */
    linesAfter: number;
    /** Number of lines in snippet */
    snippetLines: number;
  };
  /**
   * Pick a short excerpt of `body` around the line that best matches `query`.
   *
   * The query is lower-cased and split on whitespace; each line is scored by
   * how many of those terms it contains (case-insensitive substring match), and
   * the first line with the highest score wins. The excerpt covers one line
   * before through two lines after that line, and is cut to `maxLen` characters
   * (ending in "...") when longer.
   *
   * When `chunkPos` is a positive offset, only the window from 100 characters
   * before it to `maxLen + 100` characters after it is searched; reported line
   * numbers are still relative to the whole document.
   *
   * @param body - Full document text.
   * @param query - Raw user query.
   * @param maxLen - Maximum excerpt length, excluding the header line.
   * @param chunkPos - Optional character offset into `body` to search around.
   * @returns The excerpt prefixed with a diff-style header, plus line counts.
   */
  export function extractSnippet(body: string, query: string, maxLen = 500, chunkPos?: number): SnippetResult {
    const totalLines = body.split('\n').length;

    // Narrow the search area when a chunk position is known.
    let windowText = body;
    let skippedLines = 0;
    if (chunkPos && chunkPos > 0) {
      const windowStart = Math.max(0, chunkPos - 100);
      const windowEnd = Math.min(body.length, chunkPos + maxLen + 100);
      windowText = body.slice(windowStart, windowEnd);
      if (windowStart > 0) {
        // Number of line breaks that precede the window.
        skippedLines = body.slice(0, windowStart).split('\n').length - 1;
      }
    }

    const candidateLines = windowText.split('\n');
    const needles = query.toLowerCase().split(/\s+/).filter(term => term !== '');

    // First line with the highest number of matching terms.
    let bestIdx = 0;
    let bestHits = -1;
    candidateLines.forEach((line, idx) => {
      const haystack = line.toLowerCase();
      const hits = needles.filter(needle => haystack.includes(needle)).length;
      if (hits > bestHits) {
        bestHits = hits;
        bestIdx = idx;
      }
    });

    const firstIdx = Math.max(0, bestIdx - 1);
    const endIdx = Math.min(candidateLines.length, bestIdx + 3);
    const picked = candidateLines.slice(firstIdx, endIdx);

    const joined = picked.join('\n');
    const excerpt = joined.length > maxLen ? joined.substring(0, maxLen - 3) + "..." : joined;

    const firstLineNo = skippedLines + firstIdx + 1; // 1-indexed
    const linesBefore = firstLineNo - 1;
    const linesAfter = totalLines - (firstLineNo + picked.length - 1);

    // Format with diff-style header: @@ -start,count @@ (linesBefore before, linesAfter after)
    const header = `@@ -${firstLineNo},${picked.length} @@ (${linesBefore} before, ${linesAfter} after)`;

    return {
      line: skippedLines + bestIdx + 1,
      snippet: `${header}\n${excerpt}`,
      linesBefore,
      linesAfter,
      snippetLines: picked.length,
    };
  }