store.ts 56 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662
  1. /**
  2. * QMD Store - Core data access and retrieval functions
  3. *
  4. * This module provides all database operations, search functions, and document
  5. * retrieval for QMD. It returns raw data structures that can be formatted by
  6. * CLI or MCP consumers.
  7. *
  8. * Usage:
  9. * const store = createStore("/path/to/db.sqlite");
  10. * // or use default path:
  11. * const store = createStore();
  12. */
  13. import { Database } from "bun:sqlite";
  14. import { Glob } from "bun";
  15. import * as sqliteVec from "sqlite-vec";
  16. import {
  17. Ollama,
  18. getDefaultOllama,
  19. formatQueryForEmbedding,
  20. formatDocForEmbedding,
  21. type RerankDocument,
  22. } from "./llm";
  23. // =============================================================================
  24. // Configuration
  25. // =============================================================================
  26. const HOME = Bun.env.HOME || "/tmp";
  27. export const DEFAULT_EMBED_MODEL = "embeddinggemma";
  28. export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
  29. export const DEFAULT_QUERY_MODEL = "qwen3:0.6b";
  30. export const DEFAULT_GLOB = "**/*.md";
  31. export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB
  32. // Re-export OLLAMA_URL for backwards compatibility
  33. export const OLLAMA_URL = getDefaultOllama().getBaseUrl();
  34. // Chunking: ~2000 tokens per chunk, ~3 bytes/token = 6KB
  35. const CHUNK_BYTE_SIZE = 6 * 1024;
  36. // =============================================================================
  37. // Path utilities
  38. // =============================================================================
  39. export function homedir(): string {
  40. return HOME;
  41. }
  42. export function resolve(...paths: string[]): string {
  43. let result = paths[0].startsWith('/') ? '' : Bun.env.PWD || process.cwd();
  44. for (const p of paths) {
  45. if (p.startsWith('/')) {
  46. result = p;
  47. } else {
  48. result = result + '/' + p;
  49. }
  50. }
  51. const parts = result.split('/').filter(Boolean);
  52. const normalized: string[] = [];
  53. for (const part of parts) {
  54. if (part === '..') normalized.pop();
  55. else if (part !== '.') normalized.push(part);
  56. }
  57. return '/' + normalized.join('/');
  58. }
  59. export function getDefaultDbPath(indexName: string = "index"): string {
  60. // Allow override via INDEX_PATH for testing
  61. if (Bun.env.INDEX_PATH) {
  62. return Bun.env.INDEX_PATH;
  63. }
  64. const cacheDir = Bun.env.XDG_CACHE_HOME || resolve(homedir(), ".cache");
  65. const qmdCacheDir = resolve(cacheDir, "qmd");
  66. try { Bun.spawnSync(["mkdir", "-p", qmdCacheDir]); } catch {}
  67. return resolve(qmdCacheDir, `${indexName}.sqlite`);
  68. }
  69. export function getPwd(): string {
  70. return process.env.PWD || process.cwd();
  71. }
  72. export function getRealPath(path: string): string {
  73. try {
  74. const result = Bun.spawnSync(["realpath", path]);
  75. if (result.success) {
  76. return result.stdout.toString().trim();
  77. }
  78. } catch {}
  79. return resolve(path);
  80. }
  81. // =============================================================================
  82. // Virtual Path Utilities (qmd://)
  83. // =============================================================================
  84. export type VirtualPath = {
  85. collectionName: string;
  86. path: string; // relative path within collection
  87. };
  88. /**
  89. * Parse a virtual path like "qmd://collection-name/path/to/file.md"
  90. * into its components.
  91. */
  92. export function parseVirtualPath(virtualPath: string): VirtualPath | null {
  93. const match = virtualPath.match(/^qmd:\/\/([^\/]+)\/(.+)$/);
  94. if (!match) return null;
  95. return {
  96. collectionName: match[1],
  97. path: match[2],
  98. };
  99. }
  100. /**
  101. * Build a virtual path from collection name and relative path.
  102. */
  103. export function buildVirtualPath(collectionName: string, path: string): string {
  104. return `qmd://${collectionName}/${path}`;
  105. }
  106. /**
  107. * Check if a path is a virtual path (starts with qmd://).
  108. */
  109. export function isVirtualPath(path: string): boolean {
  110. return path.startsWith('qmd://');
  111. }
  112. /**
  113. * Resolve a virtual path to absolute filesystem path.
  114. */
  115. export function resolveVirtualPath(db: Database, virtualPath: string): string | null {
  116. const parsed = parseVirtualPath(virtualPath);
  117. if (!parsed) return null;
  118. const coll = getCollectionByName(db, parsed.collectionName);
  119. if (!coll) return null;
  120. return resolve(coll.pwd, parsed.path);
  121. }
  122. /**
  123. * Convert an absolute filesystem path to a virtual path.
  124. * Returns null if the file is not in any indexed collection.
  125. */
  126. export function toVirtualPath(db: Database, absolutePath: string): string | null {
  127. const doc = db.prepare(`
  128. SELECT c.name, d.path
  129. FROM documents d
  130. JOIN collections c ON c.id = d.collection_id
  131. WHERE c.pwd || '/' || d.path = ? AND d.active = 1
  132. LIMIT 1
  133. `).get(absolutePath) as { name: string; path: string } | null;
  134. if (!doc) return null;
  135. return buildVirtualPath(doc.name, doc.path);
  136. }
  137. // =============================================================================
  138. // Database initialization
  139. // =============================================================================
  140. // On macOS, use Homebrew's SQLite which supports extensions
  141. if (process.platform === "darwin") {
  142. const homebrewSqlitePath = "/opt/homebrew/opt/sqlite/lib/libsqlite3.dylib";
  143. try {
  144. if (Bun.file(homebrewSqlitePath).size > 0) {
  145. Database.setCustomSQLite(homebrewSqlitePath);
  146. }
  147. } catch {}
  148. }
  149. function initializeDatabase(db: Database): void {
  150. sqliteVec.load(db);
  151. db.exec("PRAGMA journal_mode = WAL");
  152. db.exec("PRAGMA foreign_keys = ON");
  153. // Check if we need to migrate from old schema
  154. const tables = db.prepare(`SELECT name FROM sqlite_master WHERE type='table'`).all() as { name: string }[];
  155. const tableNames = tables.map(t => t.name);
  156. const needsMigration = tableNames.includes('documents') && !tableNames.includes('content');
  157. if (needsMigration) {
  158. migrateToContentAddressable(db);
  159. return; // Migration will call initializeDatabase again
  160. }
  161. // Content-addressable storage - the source of truth for document content
  162. db.exec(`
  163. CREATE TABLE IF NOT EXISTS content (
  164. hash TEXT PRIMARY KEY,
  165. doc TEXT NOT NULL,
  166. created_at TEXT NOT NULL
  167. )
  168. `);
  169. // Collections table with name field
  170. db.exec(`
  171. CREATE TABLE IF NOT EXISTS collections (
  172. id INTEGER PRIMARY KEY AUTOINCREMENT,
  173. name TEXT NOT NULL UNIQUE,
  174. pwd TEXT NOT NULL,
  175. glob_pattern TEXT NOT NULL,
  176. created_at TEXT NOT NULL,
  177. updated_at TEXT NOT NULL,
  178. UNIQUE(pwd, glob_pattern)
  179. )
  180. `);
  181. // Documents table - file system layer mapping virtual paths to content hashes
  182. db.exec(`
  183. CREATE TABLE IF NOT EXISTS documents (
  184. id INTEGER PRIMARY KEY AUTOINCREMENT,
  185. collection_id INTEGER NOT NULL,
  186. path TEXT NOT NULL,
  187. title TEXT NOT NULL,
  188. hash TEXT NOT NULL,
  189. created_at TEXT NOT NULL,
  190. modified_at TEXT NOT NULL,
  191. active INTEGER NOT NULL DEFAULT 1,
  192. FOREIGN KEY (collection_id) REFERENCES collections(id) ON DELETE CASCADE,
  193. FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
  194. UNIQUE(collection_id, path)
  195. )
  196. `);
  197. db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents(collection_id, active)`);
  198. db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(hash)`);
  199. db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_path ON documents(path, active)`);
  200. // Path-based context (collection-scoped, hierarchical)
  201. db.exec(`
  202. CREATE TABLE IF NOT EXISTS path_contexts (
  203. id INTEGER PRIMARY KEY AUTOINCREMENT,
  204. collection_id INTEGER NOT NULL,
  205. path_prefix TEXT NOT NULL,
  206. context TEXT NOT NULL,
  207. created_at TEXT NOT NULL,
  208. FOREIGN KEY (collection_id) REFERENCES collections(id) ON DELETE CASCADE,
  209. UNIQUE(collection_id, path_prefix)
  210. )
  211. `);
  212. db.exec(`CREATE INDEX IF NOT EXISTS idx_path_contexts_collection ON path_contexts(collection_id, path_prefix)`);
  213. // Cache table for Ollama API calls
  214. db.exec(`
  215. CREATE TABLE IF NOT EXISTS ollama_cache (
  216. hash TEXT PRIMARY KEY,
  217. result TEXT NOT NULL,
  218. created_at TEXT NOT NULL
  219. )
  220. `);
  221. // Content vectors
  222. const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
  223. const hasSeqColumn = cvInfo.some(col => col.name === 'seq');
  224. if (cvInfo.length > 0 && !hasSeqColumn) {
  225. db.exec(`DROP TABLE IF EXISTS content_vectors`);
  226. db.exec(`DROP TABLE IF EXISTS vectors_vec`);
  227. }
  228. db.exec(`
  229. CREATE TABLE IF NOT EXISTS content_vectors (
  230. hash TEXT NOT NULL,
  231. seq INTEGER NOT NULL DEFAULT 0,
  232. pos INTEGER NOT NULL DEFAULT 0,
  233. model TEXT NOT NULL,
  234. embedded_at TEXT NOT NULL,
  235. PRIMARY KEY (hash, seq)
  236. )
  237. `);
  238. // FTS - index path and content (joined from content table)
  239. db.exec(`
  240. CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
  241. path, body,
  242. tokenize='porter unicode61'
  243. )
  244. `);
  245. // Triggers to keep FTS in sync
  246. db.exec(`
  247. CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents BEGIN
  248. INSERT INTO documents_fts(rowid, path, body)
  249. SELECT new.id, new.path, c.doc
  250. FROM content c
  251. WHERE c.hash = new.hash;
  252. END
  253. `);
  254. db.exec(`
  255. CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
  256. DELETE FROM documents_fts WHERE rowid = old.id;
  257. END
  258. `);
  259. db.exec(`
  260. CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents BEGIN
  261. UPDATE documents_fts
  262. SET path = new.path,
  263. body = (SELECT doc FROM content WHERE hash = new.hash)
  264. WHERE rowid = new.id;
  265. END
  266. `);
  267. }
  268. function migrateToContentAddressable(db: Database): void {
  269. console.log("Migrating database to content-addressable schema...");
  270. // Start transaction
  271. db.exec("BEGIN TRANSACTION");
  272. try {
  273. // Rename old tables
  274. db.exec("ALTER TABLE documents RENAME TO documents_old");
  275. db.exec("ALTER TABLE collections RENAME TO collections_old");
  276. db.exec("ALTER TABLE path_contexts RENAME TO path_contexts_old");
  277. db.exec("DROP TABLE IF EXISTS documents_fts");
  278. db.exec("DROP TRIGGER IF EXISTS documents_ai");
  279. db.exec("DROP TRIGGER IF EXISTS documents_ad");
  280. db.exec("DROP TRIGGER IF EXISTS documents_au");
  281. // Create new schema
  282. db.exec(`
  283. CREATE TABLE content (
  284. hash TEXT PRIMARY KEY,
  285. doc TEXT NOT NULL,
  286. created_at TEXT NOT NULL
  287. )
  288. `);
  289. db.exec(`
  290. CREATE TABLE collections (
  291. id INTEGER PRIMARY KEY AUTOINCREMENT,
  292. name TEXT NOT NULL UNIQUE,
  293. pwd TEXT NOT NULL,
  294. glob_pattern TEXT NOT NULL,
  295. created_at TEXT NOT NULL,
  296. updated_at TEXT NOT NULL,
  297. UNIQUE(pwd, glob_pattern)
  298. )
  299. `);
  300. db.exec(`
  301. CREATE TABLE documents (
  302. id INTEGER PRIMARY KEY AUTOINCREMENT,
  303. collection_id INTEGER NOT NULL,
  304. path TEXT NOT NULL,
  305. title TEXT NOT NULL,
  306. hash TEXT NOT NULL,
  307. created_at TEXT NOT NULL,
  308. modified_at TEXT NOT NULL,
  309. active INTEGER NOT NULL DEFAULT 1,
  310. FOREIGN KEY (collection_id) REFERENCES collections(id) ON DELETE CASCADE,
  311. FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
  312. UNIQUE(collection_id, path)
  313. )
  314. `);
  315. db.exec(`
  316. CREATE TABLE path_contexts (
  317. id INTEGER PRIMARY KEY AUTOINCREMENT,
  318. collection_id INTEGER NOT NULL,
  319. path_prefix TEXT NOT NULL,
  320. context TEXT NOT NULL,
  321. created_at TEXT NOT NULL,
  322. FOREIGN KEY (collection_id) REFERENCES collections(id) ON DELETE CASCADE,
  323. UNIQUE(collection_id, path_prefix)
  324. )
  325. `);
  326. // Migrate data: Extract unique content hashes
  327. console.log("Migrating content...");
  328. db.exec(`
  329. INSERT INTO content (hash, doc, created_at)
  330. SELECT hash, body, MIN(created_at) as created_at
  331. FROM documents_old
  332. WHERE active = 1
  333. GROUP BY hash
  334. `);
  335. // Migrate collections: generate names from pwd basename
  336. console.log("Migrating collections...");
  337. db.exec(`
  338. INSERT INTO collections (id, name, pwd, glob_pattern, created_at, updated_at)
  339. SELECT
  340. id,
  341. CASE
  342. WHEN INSTR(RTRIM(pwd, '/'), '/') > 0
  343. THEN SUBSTR(RTRIM(pwd, '/'), INSTR(RTRIM(pwd, '/'), '/') + 1)
  344. ELSE RTRIM(pwd, '/')
  345. END as name,
  346. pwd,
  347. glob_pattern,
  348. created_at,
  349. created_at as updated_at
  350. FROM collections_old
  351. `);
  352. // Handle duplicate collection names by appending collection_id
  353. const duplicates = db.prepare(`
  354. SELECT name, COUNT(*) as cnt
  355. FROM collections
  356. GROUP BY name
  357. HAVING cnt > 1
  358. `).all() as { name: string; cnt: number }[];
  359. for (const dup of duplicates) {
  360. const rows = db.prepare(`SELECT id FROM collections WHERE name = ? ORDER BY id`).all(dup.name) as { id: number }[];
  361. for (let i = 1; i < rows.length; i++) {
  362. db.prepare(`UPDATE collections SET name = ? WHERE id = ?`).run(`${dup.name}-${rows[i].id}`, rows[i].id);
  363. }
  364. }
  365. // Migrate documents: convert filepath to relative path within collection
  366. console.log("Migrating documents...");
  367. const oldDocs = db.prepare(`
  368. SELECT d.id, d.collection_id, d.filepath, d.title, d.hash, d.created_at, d.modified_at, c.pwd
  369. FROM documents_old d
  370. JOIN collections c ON c.id = d.collection_id
  371. WHERE d.active = 1
  372. `).all() as Array<{
  373. id: number;
  374. collection_id: number;
  375. filepath: string;
  376. title: string;
  377. hash: string;
  378. created_at: string;
  379. modified_at: string;
  380. pwd: string;
  381. }>;
  382. const insertDoc = db.prepare(`
  383. INSERT INTO documents (collection_id, path, title, hash, created_at, modified_at, active)
  384. VALUES (?, ?, ?, ?, ?, ?, 1)
  385. `);
  386. for (const doc of oldDocs) {
  387. // Convert absolute filepath to relative path within collection
  388. let path = doc.filepath;
  389. if (path.startsWith(doc.pwd + '/')) {
  390. path = path.slice(doc.pwd.length + 1);
  391. } else if (path.startsWith(doc.pwd)) {
  392. path = path.slice(doc.pwd.length);
  393. }
  394. // Remove leading slash if present
  395. path = path.replace(/^\/+/, '');
  396. try {
  397. insertDoc.run(doc.collection_id, path, doc.title, doc.hash, doc.created_at, doc.modified_at);
  398. } catch (e) {
  399. console.warn(`Skipping duplicate path: ${path} in collection ${doc.collection_id}`);
  400. }
  401. }
  402. // Migrate path_contexts: associate with collections based on path prefix
  403. console.log("Migrating path contexts...");
  404. const oldContexts = db.prepare(`SELECT * FROM path_contexts_old`).all() as Array<{
  405. path_prefix: string;
  406. context: string;
  407. created_at: string;
  408. }>;
  409. const insertContext = db.prepare(`
  410. INSERT INTO path_contexts (collection_id, path_prefix, context, created_at)
  411. VALUES (?, ?, ?, ?)
  412. `);
  413. const allCollections = db.prepare(`SELECT id, pwd FROM collections`).all() as Array<{ id: number; pwd: string }>;
  414. for (const ctx of oldContexts) {
  415. // Find collection(s) that match this path prefix
  416. for (const coll of allCollections) {
  417. if (ctx.path_prefix.startsWith(coll.pwd)) {
  418. // Convert absolute path_prefix to relative within collection
  419. let relPath = ctx.path_prefix;
  420. if (relPath.startsWith(coll.pwd + '/')) {
  421. relPath = relPath.slice(coll.pwd.length + 1);
  422. } else if (relPath.startsWith(coll.pwd)) {
  423. relPath = relPath.slice(coll.pwd.length);
  424. }
  425. relPath = relPath.replace(/^\/+/, '');
  426. try {
  427. insertContext.run(coll.id, relPath, ctx.context, ctx.created_at);
  428. } catch (e) {
  429. // Ignore duplicates
  430. }
  431. }
  432. }
  433. }
  434. // Drop old tables
  435. db.exec("DROP TABLE documents_old");
  436. db.exec("DROP TABLE collections_old");
  437. db.exec("DROP TABLE path_contexts_old");
  438. // Recreate FTS and triggers
  439. db.exec(`
  440. CREATE VIRTUAL TABLE documents_fts USING fts5(
  441. path, body,
  442. tokenize='porter unicode61'
  443. )
  444. `);
  445. db.exec(`
  446. CREATE TRIGGER documents_ai AFTER INSERT ON documents BEGIN
  447. INSERT INTO documents_fts(rowid, path, body)
  448. SELECT new.id, new.path, c.doc
  449. FROM content c
  450. WHERE c.hash = new.hash;
  451. END
  452. `);
  453. db.exec(`
  454. CREATE TRIGGER documents_ad AFTER DELETE ON documents BEGIN
  455. DELETE FROM documents_fts WHERE rowid = old.id;
  456. END
  457. `);
  458. db.exec(`
  459. CREATE TRIGGER documents_au AFTER UPDATE ON documents BEGIN
  460. UPDATE documents_fts
  461. SET path = new.path,
  462. body = (SELECT doc FROM content WHERE hash = new.hash)
  463. WHERE rowid = new.id;
  464. END
  465. `);
  466. // Populate FTS from migrated data
  467. console.log("Rebuilding full-text search index...");
  468. db.exec(`
  469. INSERT INTO documents_fts(rowid, path, body)
  470. SELECT d.id, d.path, c.doc
  471. FROM documents d
  472. JOIN content c ON c.hash = d.hash
  473. WHERE d.active = 1
  474. `);
  475. // Create indexes
  476. db.exec(`CREATE INDEX idx_documents_collection ON documents(collection_id, active)`);
  477. db.exec(`CREATE INDEX idx_documents_hash ON documents(hash)`);
  478. db.exec(`CREATE INDEX idx_documents_path ON documents(path, active)`);
  479. db.exec(`CREATE INDEX idx_path_contexts_collection ON path_contexts(collection_id, path_prefix)`);
  480. db.exec("COMMIT");
  481. console.log("Migration complete!");
  482. } catch (e) {
  483. db.exec("ROLLBACK");
  484. console.error("Migration failed:", e);
  485. throw e;
  486. }
  487. }
  488. function ensureVecTableInternal(db: Database, dimensions: number): void {
  489. const tableInfo = db.prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get() as { sql: string } | null;
  490. if (tableInfo) {
  491. const match = tableInfo.sql.match(/float\[(\d+)\]/);
  492. const hasHashSeq = tableInfo.sql.includes('hash_seq');
  493. if (match && parseInt(match[1]) === dimensions && hasHashSeq) return;
  494. db.exec("DROP TABLE IF EXISTS vectors_vec");
  495. }
  496. db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}])`);
  497. }
  498. // =============================================================================
  499. // Store Factory
  500. // =============================================================================
  501. export type Store = {
  502. db: Database;
  503. dbPath: string;
  504. close: () => void;
  505. ensureVecTable: (dimensions: number) => void;
  506. // Index health
  507. getHashesNeedingEmbedding: () => number;
  508. getIndexHealth: () => IndexHealthInfo;
  509. getStatus: () => IndexStatus;
  510. // Caching
  511. getCacheKey: typeof getCacheKey;
  512. getCachedResult: (cacheKey: string) => string | null;
  513. setCachedResult: (cacheKey: string, result: string) => void;
  514. clearCache: () => void;
  515. // Context
  516. getContextForFile: (filepath: string) => string | null;
  517. getContextForPath: (collectionId: number, path: string) => string | null;
  518. getCollectionIdByName: (name: string) => number | null;
  519. getCollectionByName: (name: string) => { id: number; name: string; pwd: string; glob_pattern: string } | null;
  520. // Virtual paths
  521. parseVirtualPath: typeof parseVirtualPath;
  522. buildVirtualPath: typeof buildVirtualPath;
  523. isVirtualPath: typeof isVirtualPath;
  524. resolveVirtualPath: (virtualPath: string) => string | null;
  525. toVirtualPath: (absolutePath: string) => string | null;
  526. // Search
  527. searchFTS: (query: string, limit?: number, collectionId?: number) => SearchResult[];
  528. searchVec: (query: string, model: string, limit?: number, collectionId?: number) => Promise<SearchResult[]>;
  529. // Query expansion & reranking
  530. expandQuery: (query: string, model?: string) => Promise<string[]>;
  531. rerank: (query: string, documents: { file: string; text: string }[], model?: string) => Promise<{ file: string; score: number }[]>;
  532. // Document retrieval
  533. findDocument: (filename: string, options?: { includeBody?: boolean }) => DocumentResult | DocumentNotFound;
  534. getDocumentBody: (doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number) => string | null;
  535. findDocuments: (pattern: string, options?: { includeBody?: boolean; maxBytes?: number }) => { docs: MultiGetResult[]; errors: string[] };
  536. // Legacy compatibility
  537. getDocument: (filename: string, fromLine?: number, maxLines?: number) => (DocumentResult & { body: string }) | DocumentNotFound;
  538. getMultipleDocuments: (pattern: string, maxLines?: number, maxBytes?: number) => { files: MultiGetFile[]; errors: string[] };
  539. // Fuzzy matching
  540. findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => string[];
  541. matchFilesByGlob: (pattern: string) => { filepath: string; displayPath: string; bodyLength: number }[];
  542. };
  543. /**
  544. * Create a new store instance with the given database path.
  545. * If no path is provided, uses the default path (~/.cache/qmd/index.sqlite).
  546. *
  547. * @param dbPath - Path to the SQLite database file
  548. * @returns Store instance with all methods bound to the database
  549. */
  550. export function createStore(dbPath?: string): Store {
  551. const resolvedPath = dbPath || getDefaultDbPath();
  552. const db = new Database(resolvedPath);
  553. initializeDatabase(db);
  554. return {
  555. db,
  556. dbPath: resolvedPath,
  557. close: () => db.close(),
  558. ensureVecTable: (dimensions: number) => ensureVecTableInternal(db, dimensions),
  559. // Index health
  560. getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db),
  561. getIndexHealth: () => getIndexHealth(db),
  562. getStatus: () => getStatus(db),
  563. // Caching
  564. getCacheKey,
  565. getCachedResult: (cacheKey: string) => getCachedResult(db, cacheKey),
  566. setCachedResult: (cacheKey: string, result: string) => setCachedResult(db, cacheKey, result),
  567. clearCache: () => clearCache(db),
  568. // Context
  569. getContextForFile: (filepath: string) => getContextForFile(db, filepath),
  570. getContextForPath: (collectionId: number, path: string) => getContextForPath(db, collectionId, path),
  571. getCollectionIdByName: (name: string) => getCollectionIdByName(db, name),
  572. getCollectionByName: (name: string) => getCollectionByName(db, name),
  573. // Virtual paths
  574. parseVirtualPath,
  575. buildVirtualPath,
  576. isVirtualPath,
  577. resolveVirtualPath: (virtualPath: string) => resolveVirtualPath(db, virtualPath),
  578. toVirtualPath: (absolutePath: string) => toVirtualPath(db, absolutePath),
  579. // Search
  580. searchFTS: (query: string, limit?: number, collectionId?: number) => searchFTS(db, query, limit, collectionId),
  581. searchVec: (query: string, model: string, limit?: number, collectionId?: number) => searchVec(db, query, model, limit, collectionId),
  582. // Query expansion & reranking
  583. expandQuery: (query: string, model?: string) => expandQuery(query, model, db),
  584. rerank: (query: string, documents: { file: string; text: string }[], model?: string) => rerank(query, documents, model, db),
  585. // Document retrieval
  586. findDocument: (filename: string, options?: { includeBody?: boolean }) => findDocument(db, filename, options),
  587. getDocumentBody: (doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number) => getDocumentBody(db, doc, fromLine, maxLines),
  588. findDocuments: (pattern: string, options?: { includeBody?: boolean; maxBytes?: number }) => findDocuments(db, pattern, options),
  589. // Legacy compatibility
  590. getDocument: (filename: string, fromLine?: number, maxLines?: number) => getDocument(db, filename, fromLine, maxLines),
  591. getMultipleDocuments: (pattern: string, maxLines?: number, maxBytes?: number) => getMultipleDocuments(db, pattern, maxLines, maxBytes),
  592. // Fuzzy matching
  593. findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => findSimilarFiles(db, query, maxDistance, limit),
  594. matchFilesByGlob: (pattern: string) => matchFilesByGlob(db, pattern),
  595. };
  596. }
  597. // =============================================================================
  598. // Legacy compatibility - will be removed
  599. // =============================================================================
  600. let _legacyDb: Database | null = null;
  601. let _legacyDbPath: string | null = null;
  602. /** @deprecated Use createStore() instead */
  603. export function setCustomIndexName(name: string | null): void {
  604. _legacyDbPath = name ? getDefaultDbPath(name) : null;
  605. _legacyDb = null; // Reset so next getDb() creates new connection
  606. }
  607. /** @deprecated Use createStore() instead */
  608. export function getDbPath(): string {
  609. return _legacyDbPath || getDefaultDbPath();
  610. }
  611. /** @deprecated Use createStore() instead */
  612. export function getDb(): Database {
  613. if (!_legacyDb) {
  614. _legacyDb = new Database(getDbPath());
  615. initializeDatabase(_legacyDb);
  616. }
  617. return _legacyDb;
  618. }
  619. /** @deprecated Use store.db.close() instead. Closes the legacy db and resets singleton. */
  620. export function closeDb(): void {
  621. if (_legacyDb) {
  622. _legacyDb.close();
  623. _legacyDb = null;
  624. }
  625. }
  626. /** @deprecated Use store.ensureVecTable() instead */
  627. export function ensureVecTable(db: Database, dimensions: number): void {
  628. ensureVecTableInternal(db, dimensions);
  629. }
  630. // =============================================================================
  631. // Core Document Type
  632. // =============================================================================
  633. /**
  634. * Unified document result type with all metadata.
  635. * Body is optional - use getDocumentBody() to load it separately if needed.
  636. */
  637. export type DocumentResult = {
  638. filepath: string; // Full filesystem path
  639. displayPath: string; // Short display path (e.g., "docs/readme.md")
  640. title: string; // Document title (from first heading or filename)
  641. context: string | null; // Folder context description if configured
  642. hash: string; // Content hash for caching/change detection
  643. collectionId: number; // Parent collection ID
  644. modifiedAt: string; // Last modification timestamp
  645. bodyLength: number; // Body length in bytes (useful before loading)
  646. body?: string; // Document body (optional, load with getDocumentBody)
  647. };
  648. /**
  649. * Search result extends DocumentResult with score and source info
  650. */
  651. export type SearchResult = DocumentResult & {
  652. score: number; // Relevance score (0-1)
  653. source: "fts" | "vec"; // Search source (full-text or vector)
  654. chunkPos?: number; // Character position of matching chunk (for vector search)
  655. };
  656. /**
  657. * Ranked result for RRF fusion (simplified, used internally)
  658. */
  659. export type RankedResult = {
  660. file: string;
  661. displayPath: string;
  662. title: string;
  663. body: string;
  664. score: number;
  665. };
  666. /**
  667. * Error result when document is not found
  668. */
  669. export type DocumentNotFound = {
  670. error: "not_found";
  671. query: string;
  672. similarFiles: string[];
  673. };
  674. /**
  675. * Result from multi-get operations
  676. */
  677. export type MultiGetResult = {
  678. doc: DocumentResult;
  679. skipped: false;
  680. } | {
  681. doc: Pick<DocumentResult, "filepath" | "displayPath">;
  682. skipped: true;
  683. skipReason: string;
  684. };
  685. export type CollectionInfo = {
  686. id: number;
  687. path: string;
  688. pattern: string;
  689. documents: number;
  690. lastUpdated: string;
  691. };
  692. export type IndexStatus = {
  693. totalDocuments: number;
  694. needsEmbedding: number;
  695. hasVectorIndex: boolean;
  696. collections: CollectionInfo[];
  697. };
  698. // =============================================================================
  699. // Index health
  700. // =============================================================================
  701. export function getHashesNeedingEmbedding(db: Database): number {
  702. const result = db.prepare(`
  703. SELECT COUNT(DISTINCT d.hash) as count
  704. FROM documents d
  705. LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
  706. WHERE d.active = 1 AND v.hash IS NULL
  707. `).get() as { count: number };
  708. return result.count;
  709. }
  710. export type IndexHealthInfo = {
  711. needsEmbedding: number;
  712. totalDocs: number;
  713. daysStale: number | null;
  714. };
  715. export function getIndexHealth(db: Database): IndexHealthInfo {
  716. const needsEmbedding = getHashesNeedingEmbedding(db);
  717. const totalDocs = (db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number }).count;
  718. const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };
  719. let daysStale: number | null = null;
  720. if (mostRecent?.latest) {
  721. const lastUpdate = new Date(mostRecent.latest);
  722. daysStale = Math.floor((Date.now() - lastUpdate.getTime()) / (24 * 60 * 60 * 1000));
  723. }
  724. return { needsEmbedding, totalDocs, daysStale };
  725. }
  726. // =============================================================================
  727. // Caching
  728. // =============================================================================
  729. export function getCacheKey(url: string, body: object): string {
  730. const hash = new Bun.CryptoHasher("sha256");
  731. hash.update(url);
  732. hash.update(JSON.stringify(body));
  733. return hash.digest("hex");
  734. }
  735. export function getCachedResult(db: Database, cacheKey: string): string | null {
  736. const row = db.prepare(`SELECT result FROM ollama_cache WHERE hash = ?`).get(cacheKey) as { result: string } | null;
  737. return row?.result || null;
  738. }
  739. export function setCachedResult(db: Database, cacheKey: string, result: string): void {
  740. const now = new Date().toISOString();
  741. db.prepare(`INSERT OR REPLACE INTO ollama_cache (hash, result, created_at) VALUES (?, ?, ?)`).run(cacheKey, result, now);
  742. if (Math.random() < 0.01) {
  743. db.exec(`DELETE FROM ollama_cache WHERE hash NOT IN (SELECT hash FROM ollama_cache ORDER BY created_at DESC LIMIT 1000)`);
  744. }
  745. }
  746. export function clearCache(db: Database): void {
  747. db.exec(`DELETE FROM ollama_cache`);
  748. }
  749. // =============================================================================
  750. // Document helpers
  751. // =============================================================================
  752. export async function hashContent(content: string): Promise<string> {
  753. const hash = new Bun.CryptoHasher("sha256");
  754. hash.update(content);
  755. return hash.digest("hex");
  756. }
  757. export function extractTitle(content: string, filename: string): string {
  758. const match = content.match(/^##?\s+(.+)$/m);
  759. if (match) {
  760. const title = match[1].trim();
  761. if (title === "📝 Notes" || title === "Notes") {
  762. const nextMatch = content.match(/^##\s+(.+)$/m);
  763. if (nextMatch) return nextMatch[1].trim();
  764. }
  765. return title;
  766. }
  767. return filename.replace(/\.md$/, "").split("/").pop() || filename;
  768. }
  769. // Re-export from llm.ts for backwards compatibility
  770. export { formatQueryForEmbedding, formatDocForEmbedding };
  771. export function chunkDocument(content: string, maxBytes: number = CHUNK_BYTE_SIZE): { text: string; pos: number }[] {
  772. const encoder = new TextEncoder();
  773. const totalBytes = encoder.encode(content).length;
  774. if (totalBytes <= maxBytes) {
  775. return [{ text: content, pos: 0 }];
  776. }
  777. const chunks: { text: string; pos: number }[] = [];
  778. let charPos = 0;
  779. while (charPos < content.length) {
  780. let endPos = charPos;
  781. let byteCount = 0;
  782. while (endPos < content.length && byteCount < maxBytes) {
  783. const charBytes = encoder.encode(content[endPos]).length;
  784. if (byteCount + charBytes > maxBytes) break;
  785. byteCount += charBytes;
  786. endPos++;
  787. }
  788. if (endPos < content.length && endPos > charPos) {
  789. const slice = content.slice(charPos, endPos);
  790. const paragraphBreak = slice.lastIndexOf('\n\n');
  791. const sentenceEnd = Math.max(
  792. slice.lastIndexOf('. '),
  793. slice.lastIndexOf('.\n'),
  794. slice.lastIndexOf('? '),
  795. slice.lastIndexOf('?\n'),
  796. slice.lastIndexOf('! '),
  797. slice.lastIndexOf('!\n')
  798. );
  799. const lineBreak = slice.lastIndexOf('\n');
  800. const spaceBreak = slice.lastIndexOf(' ');
  801. let breakPoint = -1;
  802. if (paragraphBreak > slice.length * 0.5) {
  803. breakPoint = paragraphBreak + 2;
  804. } else if (sentenceEnd > slice.length * 0.5) {
  805. breakPoint = sentenceEnd + 2;
  806. } else if (lineBreak > slice.length * 0.3) {
  807. breakPoint = lineBreak + 1;
  808. } else if (spaceBreak > slice.length * 0.3) {
  809. breakPoint = spaceBreak + 1;
  810. }
  811. if (breakPoint > 0) {
  812. endPos = charPos + breakPoint;
  813. }
  814. }
  815. if (endPos <= charPos) {
  816. endPos = charPos + 1;
  817. }
  818. chunks.push({ text: content.slice(charPos, endPos), pos: charPos });
  819. charPos = endPos;
  820. }
  821. return chunks;
  822. }
  823. // =============================================================================
  824. // Fuzzy matching
  825. // =============================================================================
  826. function levenshtein(a: string, b: string): number {
  827. const m = a.length, n = b.length;
  828. if (m === 0) return n;
  829. if (n === 0) return m;
  830. const dp: number[][] = Array.from({ length: m + 1 }, (_, i) => [i]);
  831. for (let j = 1; j <= n; j++) dp[0][j] = j;
  832. for (let i = 1; i <= m; i++) {
  833. for (let j = 1; j <= n; j++) {
  834. const cost = a[i - 1] === b[j - 1] ? 0 : 1;
  835. dp[i][j] = Math.min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost);
  836. }
  837. }
  838. return dp[m][n];
  839. }
  840. export function findSimilarFiles(db: Database, query: string, maxDistance: number = 3, limit: number = 5): string[] {
  841. const allFiles = db.prepare(`SELECT display_path FROM documents WHERE active = 1`).all() as { display_path: string }[];
  842. const queryLower = query.toLowerCase();
  843. const scored = allFiles
  844. .map(f => ({ path: f.display_path, dist: levenshtein(f.display_path.toLowerCase(), queryLower) }))
  845. .filter(f => f.dist <= maxDistance)
  846. .sort((a, b) => a.dist - b.dist)
  847. .slice(0, limit);
  848. return scored.map(f => f.path);
  849. }
  850. export function matchFilesByGlob(db: Database, pattern: string): { filepath: string; displayPath: string; bodyLength: number }[] {
  851. const allFiles = db.prepare(`
  852. SELECT
  853. 'qmd://' || c.name || '/' || d.path as virtual_path,
  854. LENGTH(content.doc) as body_length,
  855. d.collection_id,
  856. d.path
  857. FROM documents d
  858. JOIN collections c ON c.id = d.collection_id
  859. JOIN content ON content.hash = d.hash
  860. WHERE d.active = 1
  861. `).all() as { virtual_path: string; body_length: number; collection_id: number; path: string }[];
  862. const glob = new Glob(pattern);
  863. return allFiles
  864. .filter(f => glob.match(f.virtual_path) || glob.match(f.path))
  865. .map(f => ({
  866. filepath: f.virtual_path, // Use virtual path as filepath
  867. displayPath: f.virtual_path,
  868. bodyLength: f.body_length
  869. }));
  870. }
  871. // =============================================================================
  872. // Context
  873. // =============================================================================
  874. /**
  875. * Get context for a file path using hierarchical inheritance.
  876. * Contexts are collection-scoped and inherit from parent directories.
  877. * For example, context at "/talks" applies to "/talks/2024/keynote.md".
  878. *
  879. * @param db Database instance
  880. * @param collectionId Collection ID
  881. * @param path Relative path within the collection
  882. * @returns Context string or null if no context is defined
  883. */
  884. export function getContextForPath(db: Database, collectionId: number, path: string): string | null {
  885. // Find the most specific (longest) matching path prefix for this collection
  886. const result = db.prepare(`
  887. SELECT context FROM path_contexts
  888. WHERE collection_id = ?
  889. AND (? LIKE path_prefix || '/%' OR ? = path_prefix OR path_prefix = '')
  890. ORDER BY LENGTH(path_prefix) DESC
  891. LIMIT 1
  892. `).get(collectionId, path, path) as { context: string } | null;
  893. return result?.context || null;
  894. }
  895. /**
  896. * Legacy function for backward compatibility - resolves filepath to collection+path first
  897. */
  898. export function getContextForFile(db: Database, filepath: string): string | null {
  899. // Try to find the document to get its collection_id and path
  900. const doc = db.prepare(`
  901. SELECT d.collection_id, d.path
  902. FROM documents d
  903. JOIN collections c ON c.id = d.collection_id
  904. WHERE c.pwd || '/' || d.path = ? AND d.active = 1
  905. LIMIT 1
  906. `).get(filepath) as { collection_id: number; path: string } | null;
  907. if (!doc) return null;
  908. return getContextForPath(db, doc.collection_id, doc.path);
  909. }
  910. /**
  911. * Get collection ID by its name (exact match).
  912. */
  913. export function getCollectionIdByName(db: Database, name: string): number | null {
  914. const result = db.prepare(`
  915. SELECT id FROM collections
  916. WHERE name = ?
  917. LIMIT 1
  918. `).get(name) as { id: number } | null;
  919. return result?.id || null;
  920. }
  921. /**
  922. * Get collection by name.
  923. */
  924. export function getCollectionByName(db: Database, name: string): { id: number; name: string; pwd: string; glob_pattern: string } | null {
  925. const result = db.prepare(`
  926. SELECT id, name, pwd, glob_pattern FROM collections
  927. WHERE name = ?
  928. LIMIT 1
  929. `).get(name) as { id: number; name: string; pwd: string; glob_pattern: string } | null;
  930. return result;
  931. }
  932. // =============================================================================
  933. // FTS Search
  934. // =============================================================================
  935. function sanitizeFTS5Term(term: string): string {
  936. return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();
  937. }
  938. function buildFTS5Query(query: string): string | null {
  939. const terms = query.split(/\s+/)
  940. .map(t => sanitizeFTS5Term(t))
  941. .filter(t => t.length > 0);
  942. if (terms.length === 0) return null;
  943. if (terms.length === 1) return `"${terms[0]}"*`;
  944. return terms.map(t => `"${t}"*`).join(' AND ');
  945. }
  946. export function searchFTS(db: Database, query: string, limit: number = 20, collectionId?: number): SearchResult[] {
  947. const ftsQuery = buildFTS5Query(query);
  948. if (!ftsQuery) return [];
  949. let sql = `
  950. SELECT
  951. 'qmd://' || c.name || '/' || d.path as filepath,
  952. 'qmd://' || c.name || '/' || d.path as display_path,
  953. d.title,
  954. content.doc as body,
  955. bm25(documents_fts, 10.0, 1.0) as score
  956. FROM documents_fts f
  957. JOIN documents d ON d.id = f.rowid
  958. JOIN collections c ON c.id = d.collection_id
  959. JOIN content ON content.hash = d.hash
  960. WHERE documents_fts MATCH ? AND d.active = 1
  961. `;
  962. const params: (string | number)[] = [ftsQuery];
  963. if (collectionId !== undefined) {
  964. sql += ` AND d.collection_id = ?`;
  965. params.push(collectionId);
  966. }
  967. sql += ` ORDER BY score LIMIT ?`;
  968. params.push(limit);
  969. const rows = db.prepare(sql).all(...params) as { filepath: string; display_path: string; title: string; body: string; score: number }[];
  970. const maxScore = rows.length > 0 ? Math.max(...rows.map(r => Math.abs(r.score))) : 1;
  971. return rows.map(row => ({
  972. file: row.filepath,
  973. displayPath: row.display_path,
  974. title: row.title,
  975. body: row.body,
  976. score: Math.abs(row.score) / maxScore,
  977. source: "fts" as const,
  978. }));
  979. }
  980. // =============================================================================
  981. // Vector Search
  982. // =============================================================================
  983. export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionId?: number): Promise<SearchResult[]> {
  984. const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
  985. if (!tableExists) return [];
  986. const embedding = await getEmbedding(query, model, true);
  987. if (!embedding) return [];
  988. // sqlite-vec requires "k = ?" for KNN queries
  989. let sql = `
  990. SELECT
  991. v.hash_seq,
  992. v.distance,
  993. 'qmd://' || c.name || '/' || d.path as filepath,
  994. 'qmd://' || c.name || '/' || d.path as display_path,
  995. d.title,
  996. content.doc as body,
  997. cv.pos
  998. FROM vectors_vec v
  999. JOIN content_vectors cv ON cv.hash || '_' || cv.seq = v.hash_seq
  1000. JOIN documents d ON d.hash = cv.hash AND d.active = 1
  1001. JOIN collections c ON c.id = d.collection_id
  1002. JOIN content ON content.hash = d.hash
  1003. WHERE v.embedding MATCH ? AND k = ?
  1004. `;
  1005. if (collectionId !== undefined) {
  1006. sql += ` AND d.collection_id = ${collectionId}`;
  1007. }
  1008. sql += ` ORDER BY v.distance`;
  1009. const rows = db.prepare(sql).all(new Float32Array(embedding), limit * 3) as { hash_seq: string; distance: number; filepath: string; display_path: string; title: string; body: string; pos: number }[];
  1010. const seen = new Map<string, { row: typeof rows[0]; bestDist: number }>();
  1011. for (const row of rows) {
  1012. const existing = seen.get(row.filepath);
  1013. if (!existing || row.distance < existing.bestDist) {
  1014. seen.set(row.filepath, { row, bestDist: row.distance });
  1015. }
  1016. }
  1017. return Array.from(seen.values())
  1018. .sort((a, b) => a.bestDist - b.bestDist)
  1019. .slice(0, limit)
  1020. .map(({ row }) => ({
  1021. file: row.filepath,
  1022. displayPath: row.display_path,
  1023. title: row.title,
  1024. body: row.body,
  1025. score: 1 / (1 + row.distance),
  1026. source: "vec" as const,
  1027. chunkPos: row.pos,
  1028. }));
  1029. }
  1030. // =============================================================================
  1031. // Embeddings
  1032. // =============================================================================
  1033. async function getEmbedding(text: string, model: string, isQuery: boolean): Promise<number[] | null> {
  1034. const ollama = getDefaultOllama();
  1035. const result = await ollama.embed(text, { model, isQuery });
  1036. return result?.embedding || null;
  1037. }
  1038. // =============================================================================
  1039. // Query expansion
  1040. // =============================================================================
  1041. export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database): Promise<string[]> {
  1042. // Check cache first
  1043. const cacheKey = getCacheKey("expandQuery", { query, model });
  1044. const cached = getCachedResult(db, cacheKey);
  1045. if (cached) {
  1046. const lines = cached.split('\n').map(l => l.trim()).filter(l => l.length > 0);
  1047. return [query, ...lines.slice(0, 2)];
  1048. }
  1049. const ollama = getDefaultOllama();
  1050. const results = await ollama.expandQuery(query, model, 2);
  1051. // Cache the expanded queries (excluding original)
  1052. if (results.length > 1) {
  1053. setCachedResult(db, cacheKey, results.slice(1).join('\n'));
  1054. }
  1055. return results;
  1056. }
  1057. // =============================================================================
  1058. // Reranking
  1059. // =============================================================================
  1060. export async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db: Database): Promise<{ file: string; score: number }[]> {
  1061. const cachedResults: Map<string, number> = new Map();
  1062. const uncachedDocs: RerankDocument[] = [];
  1063. // Check cache for each document
  1064. for (const doc of documents) {
  1065. const cacheKey = getCacheKey("rerank", { query, file: doc.file, model });
  1066. const cached = getCachedResult(db, cacheKey);
  1067. if (cached !== null) {
  1068. cachedResults.set(doc.file, parseFloat(cached));
  1069. } else {
  1070. uncachedDocs.push({ file: doc.file, text: doc.text });
  1071. }
  1072. }
  1073. // Rerank uncached documents using Ollama
  1074. if (uncachedDocs.length > 0) {
  1075. const ollama = getDefaultOllama();
  1076. const rerankResult = await ollama.rerank(query, uncachedDocs, { model });
  1077. // Cache results
  1078. for (const result of rerankResult.results) {
  1079. const cacheKey = getCacheKey("rerank", { query, file: result.file, model });
  1080. setCachedResult(db, cacheKey, result.score.toString());
  1081. cachedResults.set(result.file, result.score);
  1082. }
  1083. }
  1084. // Return all results sorted by score
  1085. return documents
  1086. .map(doc => ({ file: doc.file, score: cachedResults.get(doc.file) || 0 }))
  1087. .sort((a, b) => b.score - a.score);
  1088. }
  1089. // =============================================================================
  1090. // Reciprocal Rank Fusion
  1091. // =============================================================================
  1092. export function reciprocalRankFusion(
  1093. resultLists: RankedResult[][],
  1094. weights: number[] = [],
  1095. k: number = 60
  1096. ): RankedResult[] {
  1097. const scores = new Map<string, { result: RankedResult; rrfScore: number; topRank: number }>();
  1098. for (let listIdx = 0; listIdx < resultLists.length; listIdx++) {
  1099. const list = resultLists[listIdx];
  1100. const weight = weights[listIdx] ?? 1.0;
  1101. for (let rank = 0; rank < list.length; rank++) {
  1102. const result = list[rank];
  1103. const rrfContribution = weight / (k + rank + 1);
  1104. const existing = scores.get(result.file);
  1105. if (existing) {
  1106. existing.rrfScore += rrfContribution;
  1107. existing.topRank = Math.min(existing.topRank, rank);
  1108. } else {
  1109. scores.set(result.file, {
  1110. result,
  1111. rrfScore: rrfContribution,
  1112. topRank: rank,
  1113. });
  1114. }
  1115. }
  1116. }
  1117. // Top-rank bonus
  1118. for (const entry of scores.values()) {
  1119. if (entry.topRank === 0) {
  1120. entry.rrfScore += 0.05;
  1121. } else if (entry.topRank <= 2) {
  1122. entry.rrfScore += 0.02;
  1123. }
  1124. }
  1125. return Array.from(scores.values())
  1126. .sort((a, b) => b.rrfScore - a.rrfScore)
  1127. .map(e => ({ ...e.result, score: e.rrfScore }));
  1128. }
  1129. // =============================================================================
  1130. // Document retrieval
  1131. // =============================================================================
  1132. type DbDocRow = {
  1133. filepath: string;
  1134. display_path: string;
  1135. title: string;
  1136. hash: string;
  1137. collection_id: number;
  1138. modified_at: string;
  1139. body_length: number;
  1140. body?: string;
  1141. };
  1142. /**
  1143. * Find a document by filename/path (with fuzzy matching)
  1144. * Returns document metadata without body by default
  1145. */
  1146. export function findDocument(db: Database, filename: string, options: { includeBody?: boolean } = {}): DocumentResult | DocumentNotFound {
  1147. let filepath = filename;
  1148. const colonMatch = filepath.match(/:(\d+)$/);
  1149. if (colonMatch) {
  1150. filepath = filepath.slice(0, -colonMatch[0].length);
  1151. }
  1152. if (filepath.startsWith('~/')) {
  1153. filepath = homedir() + filepath.slice(1);
  1154. }
  1155. const selectCols = options.includeBody
  1156. ? `filepath, display_path, title, hash, collection_id, modified_at, LENGTH(body) as body_length, body`
  1157. : `filepath, display_path, title, hash, collection_id, modified_at, LENGTH(body) as body_length`;
  1158. // Try various match strategies
  1159. let doc = db.prepare(`SELECT ${selectCols} FROM documents WHERE filepath = ? AND active = 1`).get(filepath) as DbDocRow | null;
  1160. if (!doc) {
  1161. doc = db.prepare(`SELECT ${selectCols} FROM documents WHERE display_path = ? AND active = 1`).get(filepath) as DbDocRow | null;
  1162. }
  1163. if (!doc) {
  1164. doc = db.prepare(`SELECT ${selectCols} FROM documents WHERE filepath LIKE ? AND active = 1 LIMIT 1`).get(`%${filepath}`) as DbDocRow | null;
  1165. }
  1166. if (!doc) {
  1167. doc = db.prepare(`SELECT ${selectCols} FROM documents WHERE display_path LIKE ? AND active = 1 LIMIT 1`).get(`%${filepath}`) as DbDocRow | null;
  1168. }
  1169. if (!doc) {
  1170. const similar = findSimilarFiles(db, filepath, 5, 5);
  1171. return { error: "not_found", query: filename, similarFiles: similar };
  1172. }
  1173. const context = getContextForFile(db, doc.filepath);
  1174. return {
  1175. filepath: doc.filepath,
  1176. displayPath: doc.display_path,
  1177. title: doc.title,
  1178. context,
  1179. hash: doc.hash,
  1180. collectionId: doc.collection_id,
  1181. modifiedAt: doc.modified_at,
  1182. bodyLength: doc.body_length,
  1183. ...(options.includeBody && doc.body !== undefined && { body: doc.body }),
  1184. };
  1185. }
  1186. /**
  1187. * Get the body content for a document
  1188. * Optionally slice by line range
  1189. */
  1190. export function getDocumentBody(db: Database, doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number): string | null {
  1191. const filepath = 'filepath' in doc ? doc.filepath : doc.filepath;
  1192. const row = db.prepare(`SELECT body FROM documents WHERE filepath = ? AND active = 1`).get(filepath) as { body: string } | null;
  1193. if (!row) return null;
  1194. let body = row.body;
  1195. if (fromLine !== undefined || maxLines !== undefined) {
  1196. const lines = body.split('\n');
  1197. const start = (fromLine || 1) - 1;
  1198. const end = maxLines !== undefined ? start + maxLines : lines.length;
  1199. body = lines.slice(start, end).join('\n');
  1200. }
  1201. return body;
  1202. }
  1203. /**
  1204. * Legacy function for backwards compatibility
  1205. * Combines findDocument + getDocumentBody with line slicing
  1206. */
  1207. export function getDocument(db: Database, filename: string, fromLine?: number, maxLines?: number): (DocumentResult & { body: string }) | DocumentNotFound {
  1208. // Parse :line suffix
  1209. let parsedFromLine = fromLine;
  1210. let filepath = filename;
  1211. const colonMatch = filepath.match(/:(\d+)$/);
  1212. if (colonMatch && !parsedFromLine) {
  1213. parsedFromLine = parseInt(colonMatch[1], 10);
  1214. filepath = filepath.slice(0, -colonMatch[0].length);
  1215. }
  1216. const result = findDocument(db, filepath, { includeBody: true });
  1217. if ("error" in result) return result;
  1218. let body = result.body || "";
  1219. if (parsedFromLine !== undefined || maxLines !== undefined) {
  1220. const lines = body.split('\n');
  1221. const start = (parsedFromLine || 1) - 1;
  1222. const end = maxLines !== undefined ? start + maxLines : lines.length;
  1223. body = lines.slice(start, end).join('\n');
  1224. }
  1225. return { ...result, body };
  1226. }
  1227. /**
  1228. * Find multiple documents by glob pattern or comma-separated list
  1229. * Returns documents without body by default (use getDocumentBody to load)
  1230. */
  1231. export function findDocuments(
  1232. db: Database,
  1233. pattern: string,
  1234. options: { includeBody?: boolean; maxBytes?: number } = {}
  1235. ): { docs: MultiGetResult[]; errors: string[] } {
  1236. const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?');
  1237. const errors: string[] = [];
  1238. const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES;
  1239. const selectCols = options.includeBody
  1240. ? `filepath, display_path, title, hash, collection_id, modified_at, LENGTH(body) as body_length, body`
  1241. : `filepath, display_path, title, hash, collection_id, modified_at, LENGTH(body) as body_length`;
  1242. let fileRows: DbDocRow[];
  1243. if (isCommaSeparated) {
  1244. const names = pattern.split(',').map(s => s.trim()).filter(Boolean);
  1245. fileRows = [];
  1246. for (const name of names) {
  1247. let doc = db.prepare(`SELECT ${selectCols} FROM documents WHERE display_path = ? AND active = 1`).get(name) as DbDocRow | null;
  1248. if (!doc) {
  1249. doc = db.prepare(`SELECT ${selectCols} FROM documents WHERE display_path LIKE ? AND active = 1 LIMIT 1`).get(`%${name}`) as DbDocRow | null;
  1250. }
  1251. if (doc) {
  1252. fileRows.push(doc);
  1253. } else {
  1254. const similar = findSimilarFiles(db, name, 5, 3);
  1255. let msg = `File not found: ${name}`;
  1256. if (similar.length > 0) {
  1257. msg += ` (did you mean: ${similar.join(', ')}?)`;
  1258. }
  1259. errors.push(msg);
  1260. }
  1261. }
  1262. } else {
  1263. // Glob pattern match
  1264. const matched = matchFilesByGlob(db, pattern);
  1265. if (matched.length === 0) {
  1266. errors.push(`No files matched pattern: ${pattern}`);
  1267. return { docs: [], errors };
  1268. }
  1269. const filepaths = matched.map(m => m.filepath);
  1270. const placeholders = filepaths.map(() => '?').join(',');
  1271. fileRows = db.prepare(`SELECT ${selectCols} FROM documents WHERE filepath IN (${placeholders}) AND active = 1`).all(...filepaths) as DbDocRow[];
  1272. }
  1273. const results: MultiGetResult[] = [];
  1274. for (const row of fileRows) {
  1275. const context = getContextForFile(db, row.filepath);
  1276. if (row.body_length > maxBytes) {
  1277. results.push({
  1278. doc: { filepath: row.filepath, displayPath: row.display_path },
  1279. skipped: true,
  1280. skipReason: `File too large (${Math.round(row.body_length / 1024)}KB > ${Math.round(maxBytes / 1024)}KB)`,
  1281. });
  1282. continue;
  1283. }
  1284. results.push({
  1285. doc: {
  1286. filepath: row.filepath,
  1287. displayPath: row.display_path,
  1288. title: row.title || row.display_path.split('/').pop() || row.display_path,
  1289. context,
  1290. hash: row.hash,
  1291. collectionId: row.collection_id,
  1292. modifiedAt: row.modified_at,
  1293. bodyLength: row.body_length,
  1294. ...(options.includeBody && row.body !== undefined && { body: row.body }),
  1295. },
  1296. skipped: false,
  1297. });
  1298. }
  1299. return { docs: results, errors };
  1300. }
  1301. /**
  1302. * Legacy function for backwards compatibility
  1303. */
  1304. export function getMultipleDocuments(db: Database, pattern: string, maxLines?: number, maxBytes: number = DEFAULT_MULTI_GET_MAX_BYTES): { files: MultiGetFile[]; errors: string[] } {
  1305. const { docs, errors } = findDocuments(db, pattern, { includeBody: true, maxBytes });
  1306. const files: MultiGetFile[] = docs.map(result => {
  1307. if (result.skipped) {
  1308. return {
  1309. filepath: result.doc.filepath,
  1310. displayPath: result.doc.displayPath,
  1311. title: "",
  1312. body: "",
  1313. context: null,
  1314. skipped: true as const,
  1315. skipReason: result.skipReason,
  1316. };
  1317. }
  1318. let body = result.doc.body || "";
  1319. if (maxLines !== undefined) {
  1320. const lines = body.split('\n');
  1321. body = lines.slice(0, maxLines).join('\n');
  1322. if (lines.length > maxLines) {
  1323. body += `\n\n[... truncated ${lines.length - maxLines} more lines]`;
  1324. }
  1325. }
  1326. return {
  1327. filepath: result.doc.filepath,
  1328. displayPath: result.doc.displayPath,
  1329. title: result.doc.title,
  1330. body,
  1331. context: result.doc.context,
  1332. skipped: false as const,
  1333. };
  1334. });
  1335. return { files, errors };
  1336. }
  1337. // Keep the old MultiGetFile type for backwards compatibility
  1338. export type MultiGetFile = {
  1339. filepath: string;
  1340. displayPath: string;
  1341. title: string;
  1342. body: string;
  1343. context: string | null;
  1344. skipped: false;
  1345. } | {
  1346. filepath: string;
  1347. displayPath: string;
  1348. title: string;
  1349. body: string;
  1350. context: string | null;
  1351. skipped: true;
  1352. skipReason: string;
  1353. };
  1354. // =============================================================================
  1355. // Status
  1356. // =============================================================================
  1357. export function getStatus(db: Database): IndexStatus {
  1358. const collections = db.prepare(`
  1359. SELECT c.id, c.pwd, c.glob_pattern, c.created_at,
  1360. COUNT(d.id) as active_count,
  1361. MAX(d.modified_at) as last_doc_update
  1362. FROM collections c
  1363. LEFT JOIN documents d ON d.collection_id = c.id AND d.active = 1
  1364. GROUP BY c.id
  1365. ORDER BY last_doc_update DESC
  1366. `).all() as { id: number; pwd: string; glob_pattern: string; created_at: string; active_count: number; last_doc_update: string | null }[];
  1367. const totalDocs = (db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get() as { c: number }).c;
  1368. const needsEmbedding = getHashesNeedingEmbedding(db);
  1369. const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
  1370. return {
  1371. totalDocuments: totalDocs,
  1372. needsEmbedding,
  1373. hasVectorIndex: hasVectors,
  1374. collections: collections.map(col => ({
  1375. id: col.id,
  1376. path: col.pwd,
  1377. pattern: col.glob_pattern,
  1378. documents: col.active_count,
  1379. lastUpdated: col.last_doc_update || col.created_at,
  1380. })),
  1381. };
  1382. }
  1383. // =============================================================================
  1384. // Snippet extraction
  1385. // =============================================================================
  1386. export type SnippetResult = {
  1387. line: number; // 1-indexed line number of best match
  1388. snippet: string; // The snippet text with diff-style header
  1389. linesBefore: number; // Lines in document before snippet
  1390. linesAfter: number; // Lines in document after snippet
  1391. snippetLines: number; // Number of lines in snippet
  1392. };
  1393. export function extractSnippet(body: string, query: string, maxLen = 500, chunkPos?: number): SnippetResult {
  1394. const totalLines = body.split('\n').length;
  1395. let searchBody = body;
  1396. let lineOffset = 0;
  1397. if (chunkPos && chunkPos > 0) {
  1398. const contextStart = Math.max(0, chunkPos - 100);
  1399. const contextEnd = Math.min(body.length, chunkPos + maxLen + 100);
  1400. searchBody = body.slice(contextStart, contextEnd);
  1401. if (contextStart > 0) {
  1402. lineOffset = body.slice(0, contextStart).split('\n').length - 1;
  1403. }
  1404. }
  1405. const lines = searchBody.split('\n');
  1406. const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 0);
  1407. let bestLine = 0, bestScore = -1;
  1408. for (let i = 0; i < lines.length; i++) {
  1409. const lineLower = lines[i].toLowerCase();
  1410. let score = 0;
  1411. for (const term of queryTerms) {
  1412. if (lineLower.includes(term)) score++;
  1413. }
  1414. if (score > bestScore) {
  1415. bestScore = score;
  1416. bestLine = i;
  1417. }
  1418. }
  1419. const start = Math.max(0, bestLine - 1);
  1420. const end = Math.min(lines.length, bestLine + 3);
  1421. const snippetLines = lines.slice(start, end);
  1422. let snippetText = snippetLines.join('\n');
  1423. if (snippetText.length > maxLen) snippetText = snippetText.substring(0, maxLen - 3) + "...";
  1424. const absoluteStart = lineOffset + start + 1; // 1-indexed
  1425. const snippetLineCount = snippetLines.length;
  1426. const linesBefore = absoluteStart - 1;
  1427. const linesAfter = totalLines - (absoluteStart + snippetLineCount - 1);
  1428. // Format with diff-style header: @@ -start,count @@ (linesBefore before, linesAfter after)
  1429. const header = `@@ -${absoluteStart},${snippetLineCount} @@ (${linesBefore} before, ${linesAfter} after)`;
  1430. const snippet = `${header}\n${snippetText}`;
  1431. return {
  1432. line: lineOffset + bestLine + 1,
  1433. snippet,
  1434. linesBefore,
  1435. linesAfter,
  1436. snippetLines: snippetLineCount,
  1437. };
  1438. }