store.ts 70 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056
  1. /**
  2. * QMD Store - Core data access and retrieval functions
  3. *
  4. * This module provides all database operations, search functions, and document
  5. * retrieval for QMD. It returns raw data structures that can be formatted by
  6. * CLI or MCP consumers.
  7. *
  8. * Usage:
  9. * const store = createStore("/path/to/db.sqlite");
  10. * // or use default path:
  11. * const store = createStore();
  12. */
  import { mkdirSync, realpathSync } from "node:fs";
  import { Database } from "bun:sqlite";
  import { Glob } from "bun";
  import * as sqliteVec from "sqlite-vec";
  import {
    Ollama,
    getDefaultOllama,
    formatQueryForEmbedding,
    formatDocForEmbedding,
    type RerankDocument,
  } from "./llm";
  23. // =============================================================================
  24. // Configuration
  25. // =============================================================================
  // Home directory taken from the environment; "/tmp" stands in when HOME is
  // unset or empty.
  const HOME = Bun.env.HOME ? Bun.env.HOME : "/tmp";

  // Default model identifiers exported for consumers of this module.
  export const DEFAULT_EMBED_MODEL = "embeddinggemma";
  export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
  export const DEFAULT_QUERY_MODEL = "qwen3:0.6b";

  // Default glob pattern (all Markdown files, recursively).
  export const DEFAULT_GLOB = "**/*.md";

  // Default byte budget for multi-get: 10KB.
  export const DEFAULT_MULTI_GET_MAX_BYTES = 1024 * 10;

  // OLLAMA_URL is re-exported here for backwards compatibility. Note that it is
  // read from the default Ollama client once, when this module is loaded.
  export const OLLAMA_URL = getDefaultOllama().getBaseUrl();

  // Chunk size in bytes: ~2000 tokens per chunk at ~3 bytes/token = 6KB.
  const CHUNK_BYTE_SIZE = 1024 * 6;
  36. // =============================================================================
  37. // Path utilities
  38. // =============================================================================
  /**
   * Return the home directory captured in the module-level HOME constant.
   *
   * @returns The value of HOME as computed when the module was loaded.
   */
  export function homedir(): string {
    const home: string = HOME;
    return home;
  }
  /**
   * Join path segments into a normalized absolute POSIX-style path.
   *
   * Segments are applied left to right on top of the current working directory
   * ($PWD, falling back to process.cwd()). An absolute segment (leading '/')
   * discards everything accumulated before it. Empty segments and '.' are
   * dropped, and '..' removes the previous component (never climbing above '/').
   * The resolution is purely lexical: symlinks are not followed and the file
   * system is never touched.
   *
   * Calling it with no arguments returns the normalized working directory
   * (previously this threw a TypeError on `paths[0]`).
   *
   * @param paths - Path segments, absolute or relative.
   * @returns Absolute path starting with '/', without a trailing slash
   *          (except for the root itself).
   */
  export function resolve(...paths: string[]): string {
    // Start from the working directory; the first absolute segment replaces it.
    // process.env is used (as in getPwd); Bun exposes it as an alias of Bun.env.
    let result = process.env.PWD || process.cwd();
    for (const p of paths) {
      result = p.startsWith('/') ? p : `${result}/${p}`;
    }

    const normalized: string[] = [];
    for (const part of result.split('/')) {
      if (part === '' || part === '.') continue;
      if (part === '..') normalized.pop();
      else normalized.push(part);
    }
    return `/${normalized.join('/')}`;
  }
  /**
   * Compute the default SQLite database path for an index.
   *
   * If the INDEX_PATH environment variable is set it is returned verbatim
   * (an override intended for testing). Otherwise the path is
   * `<cache>/qmd/<indexName>.sqlite`, where `<cache>` is $XDG_CACHE_HOME or
   * `~/.cache`. The `qmd` cache directory is created if it is missing.
   *
   * @param indexName - Base name of the index file (without extension).
   * @returns Absolute path of the database file.
   */
  export function getDefaultDbPath(indexName: string = "index"): string {
    // Allow override via INDEX_PATH for testing
    const override = process.env.INDEX_PATH;
    if (override) {
      return override;
    }

    const cacheDir = process.env.XDG_CACHE_HOME || resolve(homedir(), ".cache");
    const qmdCacheDir = resolve(cacheDir, "qmd");
    try {
      // Create the directory in-process instead of spawning `mkdir -p`.
      mkdirSync(qmdCacheDir, { recursive: true });
    } catch {
      // Deliberately best-effort: if the directory cannot be created, opening
      // the database at the returned path will surface the real error.
    }
    return resolve(qmdCacheDir, `${indexName}.sqlite`);
  }
  /**
   * Return the current working directory, preferring the PWD environment
   * variable and falling back to process.cwd() when PWD is unset or empty.
   */
  export function getPwd(): string {
    const fromEnv = process.env.PWD;
    return fromEnv ? fromEnv : process.cwd();
  }
  /**
   * Return the canonical form of `path` with symlinks resolved.
   *
   * Uses the in-process realpath call rather than spawning the `realpath`
   * binary: no process is started per call, a path beginning with '-' is not
   * mistaken for a command-line option, and leading/trailing whitespace in the
   * path is preserved (the old stdout `.trim()` stripped it).
   *
   * @param path - Path to canonicalize.
   * @returns The canonical path, or the lexically normalized `resolve(path)`
   *          when the path cannot be canonicalized (e.g. it does not exist).
   */
  export function getRealPath(path: string): string {
    try {
      return realpathSync(path);
    } catch {
      // Missing or inaccessible path: fall back to lexical normalization,
      // which is what the original did when `realpath` failed.
      return resolve(path);
    }
  }
  81. // =============================================================================
  82. // Virtual Path Utilities (qmd://)
  83. // =============================================================================
  /** Components of a `qmd://` virtual path. */
  export type VirtualPath = {
    /** Collection name: the segment right after `qmd://`. */
    collectionName: string;
    /** Relative path within the collection. */
    path: string;
  };

  /**
   * Split a virtual path such as "qmd://collection-name/path/to/file.md" into
   * its collection name and collection-relative path.
   *
   * @param virtualPath - String to parse.
   * @returns The parsed components, or null when the string does not have the
   *          form `qmd://<collection>/<non-empty path>`.
   */
  export function parseVirtualPath(virtualPath: string): VirtualPath | null {
    const found = /^qmd:\/\/([^\/]+)\/(.+)$/.exec(virtualPath);
    if (found === null) {
      return null;
    }
    const [, collectionName, path] = found;
    return { collectionName, path };
  }
  /**
   * Compose a virtual path of the form `qmd://<collectionName>/<path>`.
   *
   * The two parts are joined as given; no normalization or escaping is applied.
   *
   * @param collectionName - Name of the collection.
   * @param path - Path relative to the collection.
   * @returns The assembled virtual path.
   */
  export function buildVirtualPath(collectionName: string, path: string): string {
    const scheme = "qmd://";
    return scheme.concat(collectionName, "/", path);
  }
  /**
   * Tell whether `path` uses the virtual-path scheme, i.e. begins with `qmd://`.
   *
   * Only the prefix is checked; the rest of the string is not validated.
   */
  export function isVirtualPath(path: string): boolean {
    const scheme = 'qmd://';
    // lastIndexOf anchored at position 0 is a pure prefix test.
    return path.lastIndexOf(scheme, 0) === 0;
  }
  /**
   * Map a `qmd://` virtual path to an absolute filesystem path.
   *
   * The collection named in the virtual path is looked up in the database and
   * the collection-relative path is resolved against that collection's `pwd`.
   *
   * @param db - Open database handle.
   * @param virtualPath - Virtual path to resolve.
   * @returns The absolute path, or null when the string is not a valid virtual
   *          path or the collection is not found.
   */
  export function resolveVirtualPath(db: Database, virtualPath: string): string | null {
    const parsed = parseVirtualPath(virtualPath);
    // Only query the database once the virtual path has parsed successfully.
    const coll = parsed ? getCollectionByName(db, parsed.collectionName) : null;
    return parsed && coll ? resolve(coll.pwd, parsed.path) : null;
  }
  /**
   * Translate an absolute filesystem path into its `qmd://` virtual path.
   *
   * An active document matches when its collection's `pwd`, a '/', and the
   * document's relative path concatenate to exactly `absolutePath`.
   *
   * @param db - Open database handle.
   * @param absolutePath - Absolute path of the file on disk.
   * @returns The virtual path, or null if no active document in any collection
   *          corresponds to that file.
   */
  export function toVirtualPath(db: Database, absolutePath: string): string | null {
    type Row = { name: string; path: string };
    const lookup = db.prepare(`
      SELECT c.name, d.path
      FROM documents d
      JOIN collections c ON c.id = d.collection_id
      WHERE c.pwd || '/' || d.path = ? AND d.active = 1
      LIMIT 1
    `);
    const row = lookup.get(absolutePath) as Row | null;
    return row ? buildVirtualPath(row.name, row.path) : null;
  }
  137. // =============================================================================
  138. // Database initialization
  139. // =============================================================================
  // On macOS, use Homebrew's SQLite which supports extensions (sqlite-vec is
  // loaded as an extension in initializeDatabase). Homebrew installs under
  // /opt/homebrew on Apple Silicon and under /usr/local on Intel Macs, so both
  // prefixes are probed; the first library found is used. This runs at module
  // load, before any Database is opened.
  if (process.platform === "darwin") {
    const homebrewSqliteCandidates = [
      "/opt/homebrew/opt/sqlite/lib/libsqlite3.dylib", // Apple Silicon prefix
      "/usr/local/opt/sqlite/lib/libsqlite3.dylib", // Intel prefix
    ];
    for (const candidate of homebrewSqliteCandidates) {
      try {
        if (Bun.file(candidate).size > 0) {
          Database.setCustomSQLite(candidate);
          break;
        }
      } catch {
        // Best effort, as before: ignore the failure and try the next
        // candidate; with none usable the bundled SQLite stays in effect.
      }
    }
  }
  /**
   * Prepare a freshly opened database for use by the store.
   *
   * Steps, in order:
   *  1. Load the sqlite-vec extension and set WAL journaling / foreign keys.
   *  2. If the database has a `documents` table but no `content` table, run
   *     the one-time migration to the content-addressable schema.
   *  3. Create every table, index, FTS table and trigger that is missing
   *     (all statements are `IF NOT EXISTS`, so this is safe to repeat).
   *  4. Drop `content_vectors` / `vectors_vec` when `content_vectors` exists
   *     without the `seq` column, then recreate `content_vectors`.
   *
   * Fix: the function used to `return` right after the migration on the
   * assumption that the migration re-enters this function. It does not, so on
   * the first open of a migrated database steps 3-4 (e.g. `ollama_cache`, the
   * `content_vectors` upgrade) were skipped. Execution now falls through.
   *
   * @param db - Open database handle.
   * @throws Re-throws any error raised by the migration or by SQLite.
   */
  function initializeDatabase(db: Database): void {
    sqliteVec.load(db);
    db.exec("PRAGMA journal_mode = WAL");
    db.exec("PRAGMA foreign_keys = ON");

    // Check if we need to migrate from old schema
    const tables = db.prepare(`SELECT name FROM sqlite_master WHERE type='table'`).all() as { name: string }[];
    const tableNames = tables.map(t => t.name);
    const needsMigration = tableNames.includes('documents') && !tableNames.includes('content');
    if (needsMigration) {
      migrateToContentAddressable(db);
      // Fall through: everything below is idempotent and creates whatever the
      // migration does not (ollama_cache, content_vectors, ...).
    }

    // Content-addressable storage - the source of truth for document content
    db.exec(`
      CREATE TABLE IF NOT EXISTS content (
        hash TEXT PRIMARY KEY,
        doc TEXT NOT NULL,
        created_at TEXT NOT NULL
      )
    `);

    // Collections table with name field
    db.exec(`
      CREATE TABLE IF NOT EXISTS collections (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT NOT NULL UNIQUE,
        pwd TEXT NOT NULL,
        glob_pattern TEXT NOT NULL,
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL,
        UNIQUE(pwd, glob_pattern)
      )
    `);

    // Documents table - file system layer mapping virtual paths to content hashes
    db.exec(`
      CREATE TABLE IF NOT EXISTS documents (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        collection_id INTEGER NOT NULL,
        path TEXT NOT NULL,
        title TEXT NOT NULL,
        hash TEXT NOT NULL,
        created_at TEXT NOT NULL,
        modified_at TEXT NOT NULL,
        active INTEGER NOT NULL DEFAULT 1,
        FOREIGN KEY (collection_id) REFERENCES collections(id) ON DELETE CASCADE,
        FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
        UNIQUE(collection_id, path)
      )
    `);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents(collection_id, active)`);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(hash)`);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_path ON documents(path, active)`);

    // Path-based context (collection-scoped, hierarchical)
    db.exec(`
      CREATE TABLE IF NOT EXISTS path_contexts (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        collection_id INTEGER NOT NULL,
        path_prefix TEXT NOT NULL,
        context TEXT NOT NULL,
        created_at TEXT NOT NULL,
        FOREIGN KEY (collection_id) REFERENCES collections(id) ON DELETE CASCADE,
        UNIQUE(collection_id, path_prefix)
      )
    `);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_path_contexts_collection ON path_contexts(collection_id, path_prefix)`);

    // Cache table for Ollama API calls
    db.exec(`
      CREATE TABLE IF NOT EXISTS ollama_cache (
        hash TEXT PRIMARY KEY,
        result TEXT NOT NULL,
        created_at TEXT NOT NULL
      )
    `);

    // Content vectors. table_info returns no rows for a missing table, so the
    // drop only fires for an existing table that lacks the `seq` column.
    const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
    const hasSeqColumn = cvInfo.some(col => col.name === 'seq');
    if (cvInfo.length > 0 && !hasSeqColumn) {
      db.exec(`DROP TABLE IF EXISTS content_vectors`);
      db.exec(`DROP TABLE IF EXISTS vectors_vec`);
    }
    db.exec(`
      CREATE TABLE IF NOT EXISTS content_vectors (
        hash TEXT NOT NULL,
        seq INTEGER NOT NULL DEFAULT 0,
        pos INTEGER NOT NULL DEFAULT 0,
        model TEXT NOT NULL,
        embedded_at TEXT NOT NULL,
        PRIMARY KEY (hash, seq)
      )
    `);

    // FTS - index path and content (joined from content table)
    db.exec(`
      CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
        path, body,
        tokenize='porter unicode61'
      )
    `);

    // Triggers to keep FTS in sync
    db.exec(`
      CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents BEGIN
        INSERT INTO documents_fts(rowid, path, body)
        SELECT new.id, new.path, c.doc
        FROM content c
        WHERE c.hash = new.hash;
      END
    `);
    db.exec(`
      CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
        DELETE FROM documents_fts WHERE rowid = old.id;
      END
    `);
    db.exec(`
      CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents BEGIN
        UPDATE documents_fts
        SET path = new.path,
            body = (SELECT doc FROM content WHERE hash = new.hash)
        WHERE rowid = new.id;
      END
    `);
  }
  /**
   * Express `absolutePath` relative to the collection root `root`.
   *
   * The match is made on whole path segments, so "/a/foobar" is NOT considered
   * to be inside "/a/foo". Trailing slashes on `root` are ignored.
   *
   * @returns The relative path ('' for the root itself), or null when
   *          `absolutePath` is not the root or located beneath it.
   */
  function relativeToCollectionRoot(absolutePath: string, root: string): string | null {
    const base = root.replace(/\/+$/, '');
    if (absolutePath === base) return '';
    if (!absolutePath.startsWith(`${base}/`)) return null;
    return absolutePath.slice(base.length + 1).replace(/^\/+/, '');
  }

  /**
   * One-time migration from the old schema (documents carrying `filepath` and
   * `body`) to the content-addressable schema (content / collections /
   * documents / path_contexts).
   *
   * Everything runs inside a single transaction: the old tables are renamed,
   * the new schema is created, data is copied across, the old tables are
   * dropped and the FTS table, triggers and indexes are rebuilt. On any error
   * the transaction is rolled back and the error is re-thrown.
   *
   * Fixes relative to the previous version:
   *  - Collections are inserted under a per-id temporary name instead of their
   *    `pwd`; two old collections sharing a `pwd` (with different globs) used
   *    to violate UNIQUE(name) and abort the migration.
   *  - Final names are de-duplicated BEFORE they are written. Previously the
   *    basename was written first, which hit UNIQUE(name) as soon as two
   *    collections shared a basename, so the duplicate-handling code that
   *    followed could never run.
   *  - Paths are matched against a collection root on segment boundaries, so a
   *    context for "/a/foobar" is no longer attached to collection "/a/foo".
   *
   * @param db - Open database handle containing the old schema.
   * @throws Re-throws whatever caused the migration to fail (after ROLLBACK).
   */
  function migrateToContentAddressable(db: Database): void {
    console.log("Migrating database to content-addressable schema...");

    // Start transaction
    db.exec("BEGIN TRANSACTION");
    try {
      // Rename old tables
      db.exec("ALTER TABLE documents RENAME TO documents_old");
      db.exec("ALTER TABLE collections RENAME TO collections_old");
      db.exec("ALTER TABLE path_contexts RENAME TO path_contexts_old");
      db.exec("DROP TABLE IF EXISTS documents_fts");
      db.exec("DROP TRIGGER IF EXISTS documents_ai");
      db.exec("DROP TRIGGER IF EXISTS documents_ad");
      db.exec("DROP TRIGGER IF EXISTS documents_au");

      // Create new schema
      db.exec(`
        CREATE TABLE content (
          hash TEXT PRIMARY KEY,
          doc TEXT NOT NULL,
          created_at TEXT NOT NULL
        )
      `);
      db.exec(`
        CREATE TABLE collections (
          id INTEGER PRIMARY KEY AUTOINCREMENT,
          name TEXT NOT NULL UNIQUE,
          pwd TEXT NOT NULL,
          glob_pattern TEXT NOT NULL,
          created_at TEXT NOT NULL,
          updated_at TEXT NOT NULL,
          UNIQUE(pwd, glob_pattern)
        )
      `);
      db.exec(`
        CREATE TABLE documents (
          id INTEGER PRIMARY KEY AUTOINCREMENT,
          collection_id INTEGER NOT NULL,
          path TEXT NOT NULL,
          title TEXT NOT NULL,
          hash TEXT NOT NULL,
          created_at TEXT NOT NULL,
          modified_at TEXT NOT NULL,
          active INTEGER NOT NULL DEFAULT 1,
          FOREIGN KEY (collection_id) REFERENCES collections(id) ON DELETE CASCADE,
          FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
          UNIQUE(collection_id, path)
        )
      `);
      db.exec(`
        CREATE TABLE path_contexts (
          id INTEGER PRIMARY KEY AUTOINCREMENT,
          collection_id INTEGER NOT NULL,
          path_prefix TEXT NOT NULL,
          context TEXT NOT NULL,
          created_at TEXT NOT NULL,
          FOREIGN KEY (collection_id) REFERENCES collections(id) ON DELETE CASCADE,
          UNIQUE(collection_id, path_prefix)
        )
      `);

      // Migrate data: Extract unique content hashes
      console.log("Migrating content...");
      db.exec(`
        INSERT INTO content (hash, doc, created_at)
        SELECT hash, body, MIN(created_at) as created_at
        FROM documents_old
        WHERE active = 1
        GROUP BY hash
      `);

      // Migrate collections: generate names from pwd basename
      console.log("Migrating collections...");
      // First insert under a temporary name that is unique per row (derived
      // from the id), so UNIQUE(name) cannot be violated at this stage.
      db.exec(`
        INSERT INTO collections (id, name, pwd, glob_pattern, created_at, updated_at)
        SELECT
          id,
          '__qmd_migrating_' || id as name,
          pwd,
          glob_pattern,
          created_at,
          created_at as updated_at
        FROM collections_old
      `);

      // Then assign final names: the basename of pwd for the first (lowest id)
      // collection using it, and `<basename>-<id>` for later ones. Uniqueness
      // is decided in memory before each UPDATE is issued.
      const collections = db.prepare(`SELECT id, pwd FROM collections ORDER BY id`).all() as { id: number; pwd: string }[];
      const renameCollection = db.prepare(`UPDATE collections SET name = ? WHERE id = ?`);
      const usedNames = new Set<string>();
      for (const coll of collections) {
        const base = coll.pwd.split('/').filter(Boolean).pop() || 'root';
        let name = base;
        if (usedNames.has(name)) name = `${base}-${coll.id}`;
        // Extremely unlikely, but `<basename>-<id>` may itself be taken.
        for (let n = 2; usedNames.has(name); n++) name = `${base}-${coll.id}-${n}`;
        usedNames.add(name);
        renameCollection.run(name, coll.id);
      }

      // Migrate documents: convert filepath to relative path within collection
      console.log("Migrating documents...");
      const oldDocs = db.prepare(`
        SELECT d.id, d.collection_id, d.filepath, d.title, d.hash, d.created_at, d.modified_at, c.pwd
        FROM documents_old d
        JOIN collections c ON c.id = d.collection_id
        WHERE d.active = 1
      `).all() as Array<{
        id: number;
        collection_id: number;
        filepath: string;
        title: string;
        hash: string;
        created_at: string;
        modified_at: string;
        pwd: string;
      }>;
      const insertDoc = db.prepare(`
        INSERT INTO documents (collection_id, path, title, hash, created_at, modified_at, active)
        VALUES (?, ?, ?, ?, ?, ?, 1)
      `);
      for (const doc of oldDocs) {
        // Relative path within the collection; a file that is not under the
        // collection root keeps its full path minus the leading slash(es).
        const relative = relativeToCollectionRoot(doc.filepath, doc.pwd);
        const path = relative ?? doc.filepath.replace(/^\/+/, '');
        try {
          insertDoc.run(doc.collection_id, path, doc.title, doc.hash, doc.created_at, doc.modified_at);
        } catch (e) {
          console.warn(`Skipping duplicate path: ${path} in collection ${doc.collection_id}`);
        }
      }

      // Migrate path_contexts: associate with collections based on path prefix
      console.log("Migrating path contexts...");
      const oldContexts = db.prepare(`SELECT * FROM path_contexts_old`).all() as Array<{
        path_prefix: string;
        context: string;
        created_at: string;
      }>;
      const insertContext = db.prepare(`
        INSERT INTO path_contexts (collection_id, path_prefix, context, created_at)
        VALUES (?, ?, ?, ?)
      `);
      const allCollections = db.prepare(`SELECT id, pwd FROM collections`).all() as Array<{ id: number; pwd: string }>;
      for (const ctx of oldContexts) {
        // A context is copied into every collection whose root contains it.
        for (const coll of allCollections) {
          const relPath = relativeToCollectionRoot(ctx.path_prefix, coll.pwd);
          if (relPath === null) continue;
          try {
            insertContext.run(coll.id, relPath, ctx.context, ctx.created_at);
          } catch (e) {
            // Ignore duplicates
          }
        }
      }

      // Drop old tables
      db.exec("DROP TABLE documents_old");
      db.exec("DROP TABLE collections_old");
      db.exec("DROP TABLE path_contexts_old");

      // Recreate FTS and triggers
      db.exec(`
        CREATE VIRTUAL TABLE documents_fts USING fts5(
          path, body,
          tokenize='porter unicode61'
        )
      `);
      db.exec(`
        CREATE TRIGGER documents_ai AFTER INSERT ON documents BEGIN
          INSERT INTO documents_fts(rowid, path, body)
          SELECT new.id, new.path, c.doc
          FROM content c
          WHERE c.hash = new.hash;
        END
      `);
      db.exec(`
        CREATE TRIGGER documents_ad AFTER DELETE ON documents BEGIN
          DELETE FROM documents_fts WHERE rowid = old.id;
        END
      `);
      db.exec(`
        CREATE TRIGGER documents_au AFTER UPDATE ON documents BEGIN
          UPDATE documents_fts
          SET path = new.path,
              body = (SELECT doc FROM content WHERE hash = new.hash)
          WHERE rowid = new.id;
        END
      `);

      // Populate FTS from migrated data (the insert trigger did not exist yet
      // when the documents were copied above).
      console.log("Rebuilding full-text search index...");
      db.exec(`
        INSERT INTO documents_fts(rowid, path, body)
        SELECT d.id, d.path, c.doc
        FROM documents d
        JOIN content c ON c.hash = d.hash
        WHERE d.active = 1
      `);

      // Create indexes
      db.exec(`CREATE INDEX idx_documents_collection ON documents(collection_id, active)`);
      db.exec(`CREATE INDEX idx_documents_hash ON documents(hash)`);
      db.exec(`CREATE INDEX idx_documents_path ON documents(path, active)`);
      db.exec(`CREATE INDEX idx_path_contexts_collection ON path_contexts(collection_id, path_prefix)`);

      db.exec("COMMIT");
      console.log("Migration complete!");
    } catch (e) {
      db.exec("ROLLBACK");
      console.error("Migration failed:", e);
      throw e;
    }
  }
  /**
   * Make sure the `vectors_vec` virtual table exists with the requested
   * embedding width.
   *
   * The stored CREATE statement of an existing table is inspected: when it
   * already declares `float[<dimensions>]` and a `hash_seq` column the table is
   * kept; otherwise it is dropped and recreated (its stored vectors are lost).
   *
   * @param db - Open database handle.
   * @param dimensions - Embedding length; must be a positive integer because it
   *                     is interpolated into the CREATE statement.
   * @throws Error when `dimensions` is not a positive integer.
   */
  function ensureVecTableInternal(db: Database, dimensions: number): void {
    // Validate before building DDL: a NaN/fractional/negative value would yield
    // malformed SQL (and could never match the integer parsed from the schema).
    if (!Number.isInteger(dimensions) || dimensions <= 0) {
      throw new Error(`Invalid embedding dimensions: ${dimensions}`);
    }

    const tableInfo = db.prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get() as { sql: string } | null;
    if (tableInfo) {
      const match = tableInfo.sql.match(/float\[(\d+)\]/);
      const hasHashSeq = tableInfo.sql.includes('hash_seq');
      if (match && Number.parseInt(match[1], 10) === dimensions && hasHashSeq) return;
      db.exec("DROP TABLE IF EXISTS vectors_vec");
    }
    db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}])`);
  }
  502. // =============================================================================
  503. // Store Factory
  504. // =============================================================================
  /**
   * Handle returned by createStore(): the open database plus the store's
   * operations. In createStore each function member is a thin wrapper that
   * forwards to the module-level function of the same name, passing `db`
   * (the virtual-path helpers and getCacheKey are passed through unchanged).
   */
  export type Store = {
    /** The underlying database handle. */
    db: Database;
    /** Path the database was opened with (explicit or default). */
    dbPath: string;
    /** Close the database handle. */
    close: () => void;
    /** Ensure the vector table exists with the given embedding width. */
    ensureVecTable: (dimensions: number) => void;
    // Index health
    getHashesNeedingEmbedding: () => number;
    getIndexHealth: () => IndexHealthInfo;
    getStatus: () => IndexStatus;
    // Caching
    getCacheKey: typeof getCacheKey;
    getCachedResult: (cacheKey: string) => string | null;
    setCachedResult: (cacheKey: string, result: string) => void;
    clearCache: () => void;
    // Cleanup and maintenance
    // NOTE(review): the numeric returns are presumably affected-row counts — confirm
    // against the implementations, which are outside this view.
    deleteOllamaCache: () => number;
    deleteInactiveDocuments: () => number;
    cleanupOrphanedContent: () => number;
    cleanupOrphanedVectors: () => number;
    cleanupDuplicateCollections: () => number;
    vacuumDatabase: () => void;
    // Context
    getContextForFile: (filepath: string) => string | null;
    getContextForPath: (collectionId: number, path: string) => string | null;
    getCollectionIdByName: (name: string) => number | null;
    getCollectionByName: (name: string) => { id: number; name: string; pwd: string; glob_pattern: string } | null;
    // Virtual paths (qmd://collection/path)
    parseVirtualPath: typeof parseVirtualPath;
    buildVirtualPath: typeof buildVirtualPath;
    isVirtualPath: typeof isVirtualPath;
    resolveVirtualPath: (virtualPath: string) => string | null;
    toVirtualPath: (absolutePath: string) => string | null;
    // Search
    searchFTS: (query: string, limit?: number, collectionId?: number) => SearchResult[];
    searchVec: (query: string, model: string, limit?: number, collectionId?: number) => Promise<SearchResult[]>;
    // Query expansion & reranking
    expandQuery: (query: string, model?: string) => Promise<string[]>;
    rerank: (query: string, documents: { file: string; text: string }[], model?: string) => Promise<{ file: string; score: number }[]>;
    // Document retrieval
    findDocument: (filename: string, options?: { includeBody?: boolean }) => DocumentResult | DocumentNotFound;
    getDocumentBody: (doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number) => string | null;
    findDocuments: (pattern: string, options?: { includeBody?: boolean; maxBytes?: number }) => { docs: MultiGetResult[]; errors: string[] };
    // Legacy compatibility
    getDocument: (filename: string, fromLine?: number, maxLines?: number) => (DocumentResult & { body: string }) | DocumentNotFound;
    getMultipleDocuments: (pattern: string, maxLines?: number, maxBytes?: number) => { files: MultiGetFile[]; errors: string[] };
    // Fuzzy matching
    findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => string[];
    matchFilesByGlob: (pattern: string) => { filepath: string; displayPath: string; bodyLength: number }[];
    // Document indexing operations
    insertContent: (hash: string, content: string, createdAt: string) => void;
    insertDocument: (collectionId: number, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => void;
    findActiveDocument: (collectionId: number, path: string) => { id: number; hash: string; title: string } | null;
    updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => void;
    deactivateDocument: (collectionId: number, path: string) => void;
    getActiveDocumentPaths: (collectionId: number) => string[];
    // Vector/embedding operations
    getHashesForEmbedding: () => { hash: string; body: string; path: string }[];
    clearAllEmbeddings: () => void;
    insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => void;
  };
  565. /**
  566. * Create a new store instance with the given database path.
  567. * If no path is provided, uses the default path (~/.cache/qmd/index.sqlite).
  568. *
  569. * @param dbPath - Path to the SQLite database file
  570. * @returns Store instance with all methods bound to the database
  571. */
  572. export function createStore(dbPath?: string): Store {
  573. const resolvedPath = dbPath || getDefaultDbPath();
  574. const db = new Database(resolvedPath);
  575. initializeDatabase(db);
  576. return {
  577. db,
  578. dbPath: resolvedPath,
  579. close: () => db.close(),
  580. ensureVecTable: (dimensions: number) => ensureVecTableInternal(db, dimensions),
  581. // Index health
  582. getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db),
  583. getIndexHealth: () => getIndexHealth(db),
  584. getStatus: () => getStatus(db),
  585. // Caching
  586. getCacheKey,
  587. getCachedResult: (cacheKey: string) => getCachedResult(db, cacheKey),
  588. setCachedResult: (cacheKey: string, result: string) => setCachedResult(db, cacheKey, result),
  589. clearCache: () => clearCache(db),
  590. // Cleanup and maintenance
  591. deleteOllamaCache: () => deleteOllamaCache(db),
  592. deleteInactiveDocuments: () => deleteInactiveDocuments(db),
  593. cleanupOrphanedContent: () => cleanupOrphanedContent(db),
  594. cleanupOrphanedVectors: () => cleanupOrphanedVectors(db),
  595. cleanupDuplicateCollections: () => cleanupDuplicateCollections(db),
  596. vacuumDatabase: () => vacuumDatabase(db),
  597. // Context
  598. getContextForFile: (filepath: string) => getContextForFile(db, filepath),
  599. getContextForPath: (collectionId: number, path: string) => getContextForPath(db, collectionId, path),
  600. getCollectionIdByName: (name: string) => getCollectionIdByName(db, name),
  601. getCollectionByName: (name: string) => getCollectionByName(db, name),
  602. // Virtual paths
  603. parseVirtualPath,
  604. buildVirtualPath,
  605. isVirtualPath,
  606. resolveVirtualPath: (virtualPath: string) => resolveVirtualPath(db, virtualPath),
  607. toVirtualPath: (absolutePath: string) => toVirtualPath(db, absolutePath),
  608. // Search
  609. searchFTS: (query: string, limit?: number, collectionId?: number) => searchFTS(db, query, limit, collectionId),
  610. searchVec: (query: string, model: string, limit?: number, collectionId?: number) => searchVec(db, query, model, limit, collectionId),
  611. // Query expansion & reranking
  612. expandQuery: (query: string, model?: string) => expandQuery(query, model, db),
  613. rerank: (query: string, documents: { file: string; text: string }[], model?: string) => rerank(query, documents, model, db),
  614. // Document retrieval
  615. findDocument: (filename: string, options?: { includeBody?: boolean }) => findDocument(db, filename, options),
  616. getDocumentBody: (doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number) => getDocumentBody(db, doc, fromLine, maxLines),
  617. findDocuments: (pattern: string, options?: { includeBody?: boolean; maxBytes?: number }) => findDocuments(db, pattern, options),
  618. // Legacy compatibility
  619. getDocument: (filename: string, fromLine?: number, maxLines?: number) => getDocument(db, filename, fromLine, maxLines),
  620. getMultipleDocuments: (pattern: string, maxLines?: number, maxBytes?: number) => getMultipleDocuments(db, pattern, maxLines, maxBytes),
  621. // Fuzzy matching
  622. findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => findSimilarFiles(db, query, maxDistance, limit),
  623. matchFilesByGlob: (pattern: string) => matchFilesByGlob(db, pattern),
  624. // Document indexing operations
  625. insertContent: (hash: string, content: string, createdAt: string) => insertContent(db, hash, content, createdAt),
  626. insertDocument: (collectionId: number, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => insertDocument(db, collectionId, path, title, hash, createdAt, modifiedAt),
  627. findActiveDocument: (collectionId: number, path: string) => findActiveDocument(db, collectionId, path),
  628. updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => updateDocumentTitle(db, documentId, title, modifiedAt),
  629. deactivateDocument: (collectionId: number, path: string) => deactivateDocument(db, collectionId, path),
  630. getActiveDocumentPaths: (collectionId: number) => getActiveDocumentPaths(db, collectionId),
  631. // Vector/embedding operations
  632. getHashesForEmbedding: () => getHashesForEmbedding(db),
  633. clearAllEmbeddings: () => clearAllEmbeddings(db),
  634. insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt),
  635. };
  636. }
  637. // =============================================================================
  638. // Legacy compatibility - will be removed
  639. // =============================================================================
// Module-level state backing the deprecated singleton API below.
// _legacyDb is opened lazily by getDb() and reset to null by closeDb() and
// setCustomIndexName().
let _legacyDb: Database | null = null;
// Path override installed by setCustomIndexName(); null means getDbPath()
// falls back to getDefaultDbPath().
let _legacyDbPath: string | null = null;
  642. /** @deprecated Use createStore() instead */
  643. export function setCustomIndexName(name: string | null): void {
  644. _legacyDbPath = name ? getDefaultDbPath(name) : null;
  645. _legacyDb = null; // Reset so next getDb() creates new connection
  646. }
  647. /** @deprecated Use createStore() instead */
  648. export function getDbPath(): string {
  649. return _legacyDbPath || getDefaultDbPath();
  650. }
  651. /** @deprecated Use createStore() instead */
  652. export function getDb(): Database {
  653. if (!_legacyDb) {
  654. _legacyDb = new Database(getDbPath());
  655. initializeDatabase(_legacyDb);
  656. }
  657. return _legacyDb;
  658. }
  659. /** @deprecated Use store.db.close() instead. Closes the legacy db and resets singleton. */
  660. export function closeDb(): void {
  661. if (_legacyDb) {
  662. _legacyDb.close();
  663. _legacyDb = null;
  664. }
  665. }
/**
 * Thin exported wrapper that forwards to ensureVecTableInternal().
 *
 * @param db Database handle to operate on
 * @param dimensions Passed through unchanged to ensureVecTableInternal()
 * @deprecated Use store.ensureVecTable() instead
 */
export function ensureVecTable(db: Database, dimensions: number): void {
  ensureVecTableInternal(db, dimensions);
}
  670. // =============================================================================
  671. // Core Document Type
  672. // =============================================================================
/**
 * Unified document result type with all metadata.
 * Body is optional - use getDocumentBody() to load it separately if needed.
 * SearchResult is declared as this shape plus score/source fields.
 */
export type DocumentResult = {
  filepath: string; // Full filesystem path
  displayPath: string; // Short display path (e.g., "docs/readme.md")
  title: string; // Document title (from first heading or filename)
  context: string | null; // Folder context description if configured
  hash: string; // Content hash for caching/change detection
  collectionId: number; // Parent collection ID
  modifiedAt: string; // Last modification timestamp
  bodyLength: number; // Body length in bytes (useful before loading)
  body?: string; // Document body (optional, load with getDocumentBody)
};
/**
 * Search result extends DocumentResult with score and source info.
 *
 * NOTE(review): searchFTS() in this file returns objects keyed by `file`
 * (not `filepath`) and without the remaining DocumentResult metadata, while
 * declaring SearchResult[] as its return type — confirm which shape callers
 * actually rely on.
 */
export type SearchResult = DocumentResult & {
  score: number; // Relevance score (0-1)
  source: "fts" | "vec"; // Search source (full-text or vector)
  chunkPos?: number; // Character position of matching chunk (for vector search)
};
/**
 * Ranked result for RRF fusion (simplified, used internally).
 * Unlike DocumentResult, `body` is required here and the path field is
 * named `file`.
 */
export type RankedResult = {
  file: string;
  displayPath: string;
  title: string;
  body: string;
  score: number;
};
/**
 * Error result when document is not found.
 * The literal `error: "not_found"` value lets callers tell this apart from a
 * successful result object.
 */
export type DocumentNotFound = {
  error: "not_found";
  query: string; // presumably the lookup string that failed — confirm at the producer
  similarFiles: string[]; // presumably suggestions such as findSimilarFiles() output — confirm
};
/**
 * Result from multi-get operations.
 * Union discriminated on `skipped`: when false the full DocumentResult is
 * present; when true only the two path fields are present together with a
 * `skipReason` string.
 */
export type MultiGetResult = {
  doc: DocumentResult;
  skipped: false;
} | {
  doc: Pick<DocumentResult, "filepath" | "displayPath">;
  skipped: true;
  skipReason: string;
};
/**
 * Per-collection summary entry; used as the element type of
 * IndexStatus.collections.
 * NOTE(review): field meanings below are inferred from their names — the
 * code that fills this type is not in this part of the file.
 */
export type CollectionInfo = {
  id: number;
  path: string;
  pattern: string;
  documents: number; // presumably a document count — confirm
  lastUpdated: string; // presumably a timestamp string — confirm format
};
/**
 * Overall index status: aggregate counters plus one CollectionInfo per
 * collection.
 * NOTE(review): presumably what getStatus() returns — confirm, its
 * definition is not in this part of the file.
 */
export type IndexStatus = {
  totalDocuments: number;
  needsEmbedding: number;
  hasVectorIndex: boolean;
  collections: CollectionInfo[];
};
  738. // =============================================================================
  739. // Index health
  740. // =============================================================================
  741. export function getHashesNeedingEmbedding(db: Database): number {
  742. const result = db.prepare(`
  743. SELECT COUNT(DISTINCT d.hash) as count
  744. FROM documents d
  745. LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
  746. WHERE d.active = 1 AND v.hash IS NULL
  747. `).get() as { count: number };
  748. return result.count;
  749. }
/**
 * Health summary returned by getIndexHealth().
 */
export type IndexHealthInfo = {
  needsEmbedding: number; // result of getHashesNeedingEmbedding()
  totalDocs: number; // count of documents rows with active = 1
  daysStale: number | null; // whole days since the newest modified_at among active documents; null when there is none
};
  755. export function getIndexHealth(db: Database): IndexHealthInfo {
  756. const needsEmbedding = getHashesNeedingEmbedding(db);
  757. const totalDocs = (db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number }).count;
  758. const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };
  759. let daysStale: number | null = null;
  760. if (mostRecent?.latest) {
  761. const lastUpdate = new Date(mostRecent.latest);
  762. daysStale = Math.floor((Date.now() - lastUpdate.getTime()) / (24 * 60 * 60 * 1000));
  763. }
  764. return { needsEmbedding, totalDocs, daysStale };
  765. }
  766. // =============================================================================
  767. // Caching
  768. // =============================================================================
  769. export function getCacheKey(url: string, body: object): string {
  770. const hash = new Bun.CryptoHasher("sha256");
  771. hash.update(url);
  772. hash.update(JSON.stringify(body));
  773. return hash.digest("hex");
  774. }
  775. export function getCachedResult(db: Database, cacheKey: string): string | null {
  776. const row = db.prepare(`SELECT result FROM ollama_cache WHERE hash = ?`).get(cacheKey) as { result: string } | null;
  777. return row?.result || null;
  778. }
  779. export function setCachedResult(db: Database, cacheKey: string, result: string): void {
  780. const now = new Date().toISOString();
  781. db.prepare(`INSERT OR REPLACE INTO ollama_cache (hash, result, created_at) VALUES (?, ?, ?)`).run(cacheKey, result, now);
  782. if (Math.random() < 0.01) {
  783. db.exec(`DELETE FROM ollama_cache WHERE hash NOT IN (SELECT hash FROM ollama_cache ORDER BY created_at DESC LIMIT 1000)`);
  784. }
  785. }
/**
 * Delete every row from the ollama_cache table.
 *
 * @param db Database handle
 */
export function clearCache(db: Database): void {
  db.exec(`DELETE FROM ollama_cache`);
}
  789. // =============================================================================
  790. // Cleanup and maintenance operations
  791. // =============================================================================
  792. /**
  793. * Delete cached Ollama API responses.
  794. * Returns the number of cached responses deleted.
  795. */
  796. export function deleteOllamaCache(db: Database): number {
  797. const result = db.prepare(`DELETE FROM ollama_cache`).run();
  798. return result.changes;
  799. }
  800. /**
  801. * Remove inactive document records (active = 0).
  802. * Returns the number of inactive documents deleted.
  803. */
  804. export function deleteInactiveDocuments(db: Database): number {
  805. const result = db.prepare(`DELETE FROM documents WHERE active = 0`).run();
  806. return result.changes;
  807. }
  808. /**
  809. * Remove orphaned content hashes that are not referenced by any active document.
  810. * Returns the number of orphaned content hashes deleted.
  811. */
  812. export function cleanupOrphanedContent(db: Database): number {
  813. const result = db.prepare(`
  814. DELETE FROM content
  815. WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
  816. `).run();
  817. return result.changes;
  818. }
  819. /**
  820. * Remove orphaned vector embeddings that are not referenced by any active document.
  821. * Returns the number of orphaned embedding chunks deleted.
  822. */
  823. export function cleanupOrphanedVectors(db: Database): number {
  824. // Check if vectors_vec table exists
  825. const tableExists = db.prepare(`
  826. SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'
  827. `).get();
  828. if (!tableExists) {
  829. return 0;
  830. }
  831. // Count orphaned vectors first
  832. const countResult = db.prepare(`
  833. SELECT COUNT(*) as c FROM content_vectors cv
  834. WHERE NOT EXISTS (
  835. SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
  836. )
  837. `).get() as { c: number };
  838. if (countResult.c === 0) {
  839. return 0;
  840. }
  841. // Delete from vectors_vec first
  842. db.exec(`
  843. DELETE FROM vectors_vec WHERE hash_seq IN (
  844. SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
  845. WHERE NOT EXISTS (
  846. SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
  847. )
  848. )
  849. `);
  850. // Delete from content_vectors
  851. db.exec(`
  852. DELETE FROM content_vectors WHERE hash NOT IN (
  853. SELECT hash FROM documents WHERE active = 1
  854. )
  855. `);
  856. return countResult.c;
  857. }
  858. /**
  859. * Remove duplicate collections, keeping the oldest one per (pwd, glob_pattern).
  860. * Also removes bogus "." glob pattern entries.
  861. * Returns the number of duplicate collections removed.
  862. */
  863. export function cleanupDuplicateCollections(db: Database): number {
  864. // Count duplicates before removal
  865. const beforeCount = (db.prepare(`SELECT COUNT(*) as c FROM collections`).get() as { c: number }).c;
  866. // Remove duplicates keeping the oldest one
  867. db.exec(`
  868. DELETE FROM collections WHERE id NOT IN (
  869. SELECT MIN(id) FROM collections GROUP BY pwd, glob_pattern
  870. )
  871. `);
  872. // Remove bogus "." glob pattern entries (from earlier bug)
  873. db.exec(`DELETE FROM collections WHERE glob_pattern = '.'`);
  874. const afterCount = (db.prepare(`SELECT COUNT(*) as c FROM collections`).get() as { c: number }).c;
  875. return beforeCount - afterCount;
  876. }
/**
 * Run VACUUM to reclaim unused space in the database.
 * This operation rebuilds the database file to eliminate fragmentation.
 *
 * @param db Database handle on which the VACUUM statement is executed
 */
export function vacuumDatabase(db: Database): void {
  db.exec(`VACUUM`);
}
  884. // =============================================================================
  885. // Document helpers
  886. // =============================================================================
  887. export async function hashContent(content: string): Promise<string> {
  888. const hash = new Bun.CryptoHasher("sha256");
  889. hash.update(content);
  890. return hash.digest("hex");
  891. }
  892. export function extractTitle(content: string, filename: string): string {
  893. const match = content.match(/^##?\s+(.+)$/m);
  894. if (match) {
  895. const title = match[1].trim();
  896. if (title === "📝 Notes" || title === "Notes") {
  897. const nextMatch = content.match(/^##\s+(.+)$/m);
  898. if (nextMatch) return nextMatch[1].trim();
  899. }
  900. return title;
  901. }
  902. return filename.replace(/\.md$/, "").split("/").pop() || filename;
  903. }
  904. // =============================================================================
  905. // Document indexing operations
  906. // =============================================================================
  907. /**
  908. * Insert content into the content table (content-addressable storage).
  909. * Uses INSERT OR IGNORE so duplicate hashes are skipped.
  910. */
  911. export function insertContent(db: Database, hash: string, content: string, createdAt: string): void {
  912. db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
  913. .run(hash, content, createdAt);
  914. }
  915. /**
  916. * Insert a new document into the documents table.
  917. */
  918. export function insertDocument(
  919. db: Database,
  920. collectionId: number,
  921. path: string,
  922. title: string,
  923. hash: string,
  924. createdAt: string,
  925. modifiedAt: string
  926. ): void {
  927. db.prepare(`
  928. INSERT INTO documents (collection_id, path, title, hash, created_at, modified_at, active)
  929. VALUES (?, ?, ?, ?, ?, ?, 1)
  930. `).run(collectionId, path, title, hash, createdAt, modifiedAt);
  931. }
  932. /**
  933. * Find an active document by collection ID and path.
  934. */
  935. export function findActiveDocument(
  936. db: Database,
  937. collectionId: number,
  938. path: string
  939. ): { id: number; hash: string; title: string } | null {
  940. return db.prepare(`
  941. SELECT id, hash, title FROM documents
  942. WHERE collection_id = ? AND path = ? AND active = 1
  943. `).get(collectionId, path) as { id: number; hash: string; title: string } | null;
  944. }
  945. /**
  946. * Update the title and modified_at timestamp for a document.
  947. */
  948. export function updateDocumentTitle(
  949. db: Database,
  950. documentId: number,
  951. title: string,
  952. modifiedAt: string
  953. ): void {
  954. db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`)
  955. .run(title, modifiedAt, documentId);
  956. }
  957. /**
  958. * Update an existing document's hash, title, and modified_at timestamp.
  959. * Used when content changes but the file path stays the same.
  960. */
  961. export function updateDocument(
  962. db: Database,
  963. documentId: number,
  964. title: string,
  965. hash: string,
  966. modifiedAt: string
  967. ): void {
  968. db.prepare(`UPDATE documents SET title = ?, hash = ?, modified_at = ? WHERE id = ?`)
  969. .run(title, hash, modifiedAt, documentId);
  970. }
  971. /**
  972. * Deactivate a document (mark as inactive but don't delete).
  973. */
  974. export function deactivateDocument(db: Database, collectionId: number, path: string): void {
  975. db.prepare(`UPDATE documents SET active = 0 WHERE collection_id = ? AND path = ? AND active = 1`)
  976. .run(collectionId, path);
  977. }
  978. /**
  979. * Get all active document paths for a collection.
  980. */
  981. export function getActiveDocumentPaths(db: Database, collectionId: number): string[] {
  982. const rows = db.prepare(`
  983. SELECT path FROM documents WHERE collection_id = ? AND active = 1
  984. `).all(collectionId) as { path: string }[];
  985. return rows.map(r => r.path);
  986. }
  987. // Re-export from llm.ts for backwards compatibility
  988. export { formatQueryForEmbedding, formatDocForEmbedding };
  989. export function chunkDocument(content: string, maxBytes: number = CHUNK_BYTE_SIZE): { text: string; pos: number }[] {
  990. const encoder = new TextEncoder();
  991. const totalBytes = encoder.encode(content).length;
  992. if (totalBytes <= maxBytes) {
  993. return [{ text: content, pos: 0 }];
  994. }
  995. const chunks: { text: string; pos: number }[] = [];
  996. let charPos = 0;
  997. while (charPos < content.length) {
  998. let endPos = charPos;
  999. let byteCount = 0;
  1000. while (endPos < content.length && byteCount < maxBytes) {
  1001. const charBytes = encoder.encode(content[endPos]).length;
  1002. if (byteCount + charBytes > maxBytes) break;
  1003. byteCount += charBytes;
  1004. endPos++;
  1005. }
  1006. if (endPos < content.length && endPos > charPos) {
  1007. const slice = content.slice(charPos, endPos);
  1008. const paragraphBreak = slice.lastIndexOf('\n\n');
  1009. const sentenceEnd = Math.max(
  1010. slice.lastIndexOf('. '),
  1011. slice.lastIndexOf('.\n'),
  1012. slice.lastIndexOf('? '),
  1013. slice.lastIndexOf('?\n'),
  1014. slice.lastIndexOf('! '),
  1015. slice.lastIndexOf('!\n')
  1016. );
  1017. const lineBreak = slice.lastIndexOf('\n');
  1018. const spaceBreak = slice.lastIndexOf(' ');
  1019. let breakPoint = -1;
  1020. if (paragraphBreak > slice.length * 0.5) {
  1021. breakPoint = paragraphBreak + 2;
  1022. } else if (sentenceEnd > slice.length * 0.5) {
  1023. breakPoint = sentenceEnd + 2;
  1024. } else if (lineBreak > slice.length * 0.3) {
  1025. breakPoint = lineBreak + 1;
  1026. } else if (spaceBreak > slice.length * 0.3) {
  1027. breakPoint = spaceBreak + 1;
  1028. }
  1029. if (breakPoint > 0) {
  1030. endPos = charPos + breakPoint;
  1031. }
  1032. }
  1033. if (endPos <= charPos) {
  1034. endPos = charPos + 1;
  1035. }
  1036. chunks.push({ text: content.slice(charPos, endPos), pos: charPos });
  1037. charPos = endPos;
  1038. }
  1039. return chunks;
  1040. }
  1041. // =============================================================================
  1042. // Fuzzy matching
  1043. // =============================================================================
  1044. function levenshtein(a: string, b: string): number {
  1045. const m = a.length, n = b.length;
  1046. if (m === 0) return n;
  1047. if (n === 0) return m;
  1048. const dp: number[][] = Array.from({ length: m + 1 }, (_, i) => [i]);
  1049. for (let j = 1; j <= n; j++) dp[0][j] = j;
  1050. for (let i = 1; i <= m; i++) {
  1051. for (let j = 1; j <= n; j++) {
  1052. const cost = a[i - 1] === b[j - 1] ? 0 : 1;
  1053. dp[i][j] = Math.min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost);
  1054. }
  1055. }
  1056. return dp[m][n];
  1057. }
  1058. export function findSimilarFiles(db: Database, query: string, maxDistance: number = 3, limit: number = 5): string[] {
  1059. const allFiles = db.prepare(`SELECT display_path FROM documents WHERE active = 1`).all() as { display_path: string }[];
  1060. const queryLower = query.toLowerCase();
  1061. const scored = allFiles
  1062. .map(f => ({ path: f.display_path, dist: levenshtein(f.display_path.toLowerCase(), queryLower) }))
  1063. .filter(f => f.dist <= maxDistance)
  1064. .sort((a, b) => a.dist - b.dist)
  1065. .slice(0, limit);
  1066. return scored.map(f => f.path);
  1067. }
  1068. export function matchFilesByGlob(db: Database, pattern: string): { filepath: string; displayPath: string; bodyLength: number }[] {
  1069. const allFiles = db.prepare(`
  1070. SELECT
  1071. 'qmd://' || c.name || '/' || d.path as virtual_path,
  1072. LENGTH(content.doc) as body_length,
  1073. d.collection_id,
  1074. d.path
  1075. FROM documents d
  1076. JOIN collections c ON c.id = d.collection_id
  1077. JOIN content ON content.hash = d.hash
  1078. WHERE d.active = 1
  1079. `).all() as { virtual_path: string; body_length: number; collection_id: number; path: string }[];
  1080. const glob = new Glob(pattern);
  1081. return allFiles
  1082. .filter(f => glob.match(f.virtual_path) || glob.match(f.path))
  1083. .map(f => ({
  1084. filepath: f.virtual_path, // Use virtual path as filepath
  1085. displayPath: f.virtual_path,
  1086. bodyLength: f.body_length
  1087. }));
  1088. }
  1089. // =============================================================================
  1090. // Context
  1091. // =============================================================================
  1092. /**
  1093. * Get context for a file path using hierarchical inheritance.
  1094. * Contexts are collection-scoped and inherit from parent directories.
  1095. * For example, context at "/talks" applies to "/talks/2024/keynote.md".
  1096. *
  1097. * @param db Database instance
  1098. * @param collectionId Collection ID
  1099. * @param path Relative path within the collection
  1100. * @returns Context string or null if no context is defined
  1101. */
  1102. export function getContextForPath(db: Database, collectionId: number, path: string): string | null {
  1103. // Find the most specific (longest) matching path prefix for this collection
  1104. const result = db.prepare(`
  1105. SELECT context FROM path_contexts
  1106. WHERE collection_id = ?
  1107. AND (? LIKE path_prefix || '/%' OR ? = path_prefix OR path_prefix = '')
  1108. ORDER BY LENGTH(path_prefix) DESC
  1109. LIMIT 1
  1110. `).get(collectionId, path, path) as { context: string } | null;
  1111. return result?.context || null;
  1112. }
  1113. /**
  1114. * Legacy function for backward compatibility - resolves filepath to collection+path first
  1115. */
  1116. export function getContextForFile(db: Database, filepath: string): string | null {
  1117. // Try to find the document to get its collection_id and path
  1118. const doc = db.prepare(`
  1119. SELECT d.collection_id, d.path
  1120. FROM documents d
  1121. JOIN collections c ON c.id = d.collection_id
  1122. WHERE c.pwd || '/' || d.path = ? AND d.active = 1
  1123. LIMIT 1
  1124. `).get(filepath) as { collection_id: number; path: string } | null;
  1125. if (!doc) return null;
  1126. return getContextForPath(db, doc.collection_id, doc.path);
  1127. }
  1128. /**
  1129. * Get collection ID by its name (exact match).
  1130. */
  1131. export function getCollectionIdByName(db: Database, name: string): number | null {
  1132. const result = db.prepare(`
  1133. SELECT id FROM collections
  1134. WHERE name = ?
  1135. LIMIT 1
  1136. `).get(name) as { id: number } | null;
  1137. return result?.id || null;
  1138. }
  1139. /**
  1140. * Get collection by name.
  1141. */
  1142. export function getCollectionByName(db: Database, name: string): { id: number; name: string; pwd: string; glob_pattern: string } | null {
  1143. const result = db.prepare(`
  1144. SELECT id, name, pwd, glob_pattern FROM collections
  1145. WHERE name = ?
  1146. LIMIT 1
  1147. `).get(name) as { id: number; name: string; pwd: string; glob_pattern: string } | null;
  1148. return result;
  1149. }
  1150. export function listCollections(db: Database): { id: number; name: string; pwd: string; glob_pattern: string; created_at: string; updated_at: string; doc_count: number; active_count: number; last_modified: string | null }[] {
  1151. const collections = db.prepare(`
  1152. SELECT c.id, c.name, c.pwd, c.glob_pattern, c.created_at, c.updated_at,
  1153. COUNT(d.id) as doc_count,
  1154. SUM(CASE WHEN d.active = 1 THEN 1 ELSE 0 END) as active_count,
  1155. MAX(d.modified_at) as last_modified
  1156. FROM collections c
  1157. LEFT JOIN documents d ON d.collection_id = c.id
  1158. GROUP BY c.id
  1159. ORDER BY c.name
  1160. `).all() as { id: number; name: string; pwd: string; glob_pattern: string; created_at: string; updated_at: string; doc_count: number; active_count: number; last_modified: string | null }[];
  1161. return collections;
  1162. }
  1163. export function removeCollection(db: Database, collectionId: number): { deletedDocs: number; cleanedHashes: number } {
  1164. // Delete documents
  1165. const docResult = db.prepare(`DELETE FROM documents WHERE collection_id = ?`).run(collectionId);
  1166. // Delete contexts
  1167. db.prepare(`DELETE FROM path_contexts WHERE collection_id = ?`).run(collectionId);
  1168. // Delete collection
  1169. db.prepare(`DELETE FROM collections WHERE id = ?`).run(collectionId);
  1170. // Clean up orphaned content hashes
  1171. const cleanupResult = db.prepare(`
  1172. DELETE FROM content
  1173. WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
  1174. `).run();
  1175. return {
  1176. deletedDocs: docResult.changes,
  1177. cleanedHashes: cleanupResult.changes
  1178. };
  1179. }
  1180. export function renameCollection(db: Database, collectionId: number, newName: string): void {
  1181. const now = new Date().toISOString();
  1182. db.prepare(`UPDATE collections SET name = ?, updated_at = ? WHERE id = ?`)
  1183. .run(newName, now, collectionId);
  1184. }
  1185. // =============================================================================
  1186. // Context Management Operations
  1187. // =============================================================================
  1188. /**
  1189. * Insert or update a context for a specific collection and path prefix.
  1190. */
  1191. export function insertContext(db: Database, collectionId: number, pathPrefix: string, context: string): void {
  1192. const now = new Date().toISOString();
  1193. db.prepare(`
  1194. INSERT INTO path_contexts (collection_id, path_prefix, context, created_at)
  1195. VALUES (?, ?, ?, ?)
  1196. ON CONFLICT(collection_id, path_prefix) DO UPDATE SET context = excluded.context
  1197. `).run(collectionId, pathPrefix, context, now);
  1198. }
  1199. /**
  1200. * Delete a context for a specific collection and path prefix.
  1201. * Returns the number of contexts deleted.
  1202. */
  1203. export function deleteContext(db: Database, collectionId: number, pathPrefix: string): number {
  1204. const result = db.prepare(`
  1205. DELETE FROM path_contexts
  1206. WHERE collection_id = ? AND path_prefix = ?
  1207. `).run(collectionId, pathPrefix);
  1208. return result.changes;
  1209. }
  1210. /**
  1211. * Delete all global contexts (contexts with empty path_prefix).
  1212. * Returns the number of contexts deleted.
  1213. */
  1214. export function deleteGlobalContexts(db: Database): number {
  1215. const result = db.prepare(`DELETE FROM path_contexts WHERE path_prefix = ''`).run();
  1216. return result.changes;
  1217. }
  1218. /**
  1219. * List all contexts, grouped by collection.
  1220. * Returns contexts ordered by collection name, then by path prefix length (longest first).
  1221. */
  1222. export function listPathContexts(db: Database): { collection_name: string; path_prefix: string; context: string }[] {
  1223. const contexts = db.prepare(`
  1224. SELECT c.name as collection_name, pc.path_prefix, pc.context
  1225. FROM path_contexts pc
  1226. JOIN collections c ON c.id = pc.collection_id
  1227. ORDER BY c.name, LENGTH(pc.path_prefix) DESC, pc.path_prefix
  1228. `).all() as { collection_name: string; path_prefix: string; context: string }[];
  1229. return contexts;
  1230. }
  1231. /**
  1232. * Get all collections (id and name).
  1233. */
  1234. export function getAllCollections(db: Database): { id: number; name: string }[] {
  1235. return db.prepare(`SELECT id, name FROM collections`).all() as { id: number; name: string }[];
  1236. }
  1237. // =============================================================================
  1238. // FTS Search
  1239. // =============================================================================
  1240. function sanitizeFTS5Term(term: string): string {
  1241. return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();
  1242. }
  1243. function buildFTS5Query(query: string): string | null {
  1244. const terms = query.split(/\s+/)
  1245. .map(t => sanitizeFTS5Term(t))
  1246. .filter(t => t.length > 0);
  1247. if (terms.length === 0) return null;
  1248. if (terms.length === 1) return `"${terms[0]}"*`;
  1249. return terms.map(t => `"${t}"*`).join(' AND ');
  1250. }
  1251. export function searchFTS(db: Database, query: string, limit: number = 20, collectionId?: number): SearchResult[] {
  1252. const ftsQuery = buildFTS5Query(query);
  1253. if (!ftsQuery) return [];
  1254. let sql = `
  1255. SELECT
  1256. 'qmd://' || c.name || '/' || d.path as filepath,
  1257. 'qmd://' || c.name || '/' || d.path as display_path,
  1258. d.title,
  1259. content.doc as body,
  1260. bm25(documents_fts, 10.0, 1.0) as score
  1261. FROM documents_fts f
  1262. JOIN documents d ON d.id = f.rowid
  1263. JOIN collections c ON c.id = d.collection_id
  1264. JOIN content ON content.hash = d.hash
  1265. WHERE documents_fts MATCH ? AND d.active = 1
  1266. `;
  1267. const params: (string | number)[] = [ftsQuery];
  1268. if (collectionId !== undefined) {
  1269. sql += ` AND d.collection_id = ?`;
  1270. params.push(collectionId);
  1271. }
  1272. sql += ` ORDER BY score LIMIT ?`;
  1273. params.push(limit);
  1274. const rows = db.prepare(sql).all(...params) as { filepath: string; display_path: string; title: string; body: string; score: number }[];
  1275. const maxScore = rows.length > 0 ? Math.max(...rows.map(r => Math.abs(r.score))) : 1;
  1276. return rows.map(row => ({
  1277. file: row.filepath,
  1278. displayPath: row.display_path,
  1279. title: row.title,
  1280. body: row.body,
  1281. score: Math.abs(row.score) / maxScore,
  1282. source: "fts" as const,
  1283. }));
  1284. }
  1285. // =============================================================================
  1286. // Vector Search
  1287. // =============================================================================
  1288. export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionId?: number): Promise<SearchResult[]> {
  1289. const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
  1290. if (!tableExists) return [];
  1291. const embedding = await getEmbedding(query, model, true);
  1292. if (!embedding) return [];
  1293. // sqlite-vec requires "k = ?" for KNN queries
  1294. let sql = `
  1295. SELECT
  1296. v.hash_seq,
  1297. v.distance,
  1298. 'qmd://' || c.name || '/' || d.path as filepath,
  1299. 'qmd://' || c.name || '/' || d.path as display_path,
  1300. d.title,
  1301. content.doc as body,
  1302. cv.pos
  1303. FROM vectors_vec v
  1304. JOIN content_vectors cv ON cv.hash || '_' || cv.seq = v.hash_seq
  1305. JOIN documents d ON d.hash = cv.hash AND d.active = 1
  1306. JOIN collections c ON c.id = d.collection_id
  1307. JOIN content ON content.hash = d.hash
  1308. WHERE v.embedding MATCH ? AND k = ?
  1309. `;
  1310. if (collectionId !== undefined) {
  1311. sql += ` AND d.collection_id = ${collectionId}`;
  1312. }
  1313. sql += ` ORDER BY v.distance`;
  1314. const rows = db.prepare(sql).all(new Float32Array(embedding), limit * 3) as { hash_seq: string; distance: number; filepath: string; display_path: string; title: string; body: string; pos: number }[];
  1315. const seen = new Map<string, { row: typeof rows[0]; bestDist: number }>();
  1316. for (const row of rows) {
  1317. const existing = seen.get(row.filepath);
  1318. if (!existing || row.distance < existing.bestDist) {
  1319. seen.set(row.filepath, { row, bestDist: row.distance });
  1320. }
  1321. }
  1322. return Array.from(seen.values())
  1323. .sort((a, b) => a.bestDist - b.bestDist)
  1324. .slice(0, limit)
  1325. .map(({ row }) => ({
  1326. file: row.filepath,
  1327. displayPath: row.display_path,
  1328. title: row.title,
  1329. body: row.body,
  1330. score: 1 / (1 + row.distance),
  1331. source: "vec" as const,
  1332. chunkPos: row.pos,
  1333. }));
  1334. }
  1335. // =============================================================================
  1336. // Embeddings
  1337. // =============================================================================
  1338. async function getEmbedding(text: string, model: string, isQuery: boolean): Promise<number[] | null> {
  1339. const ollama = getDefaultOllama();
  1340. const result = await ollama.embed(text, { model, isQuery });
  1341. return result?.embedding || null;
  1342. }
  1343. /**
  1344. * Get all unique content hashes that need embeddings (from active documents).
  1345. * Returns hash, document body, and a sample path for display purposes.
  1346. */
  1347. export function getHashesForEmbedding(db: Database): { hash: string; body: string; path: string }[] {
  1348. return db.prepare(`
  1349. SELECT d.hash, c.doc as body, MIN(d.path) as path
  1350. FROM documents d
  1351. JOIN content c ON d.hash = c.hash
  1352. LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
  1353. WHERE d.active = 1 AND v.hash IS NULL
  1354. GROUP BY d.hash
  1355. `).all() as { hash: string; body: string; path: string }[];
  1356. }
  1357. /**
  1358. * Clear all embeddings from the database (force re-index).
  1359. * Deletes all rows from content_vectors and drops the vectors_vec table.
  1360. */
  1361. export function clearAllEmbeddings(db: Database): void {
  1362. db.exec(`DELETE FROM content_vectors`);
  1363. db.exec(`DROP TABLE IF EXISTS vectors_vec`);
  1364. }
  1365. /**
  1366. * Insert a single embedding into both content_vectors and vectors_vec tables.
  1367. * The hash_seq key is formatted as "hash_seq" for the vectors_vec table.
  1368. */
  1369. export function insertEmbedding(
  1370. db: Database,
  1371. hash: string,
  1372. seq: number,
  1373. pos: number,
  1374. embedding: Float32Array,
  1375. model: string,
  1376. embeddedAt: string
  1377. ): void {
  1378. const hashSeq = `${hash}_${seq}`;
  1379. const insertVecStmt = db.prepare(`INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
  1380. const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`);
  1381. insertVecStmt.run(hashSeq, embedding);
  1382. insertContentVectorStmt.run(hash, seq, pos, model, embeddedAt);
  1383. }
  1384. // =============================================================================
  1385. // Query expansion
  1386. // =============================================================================
  1387. export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database): Promise<string[]> {
  1388. // Check cache first
  1389. const cacheKey = getCacheKey("expandQuery", { query, model });
  1390. const cached = getCachedResult(db, cacheKey);
  1391. if (cached) {
  1392. const lines = cached.split('\n').map(l => l.trim()).filter(l => l.length > 0);
  1393. return [query, ...lines.slice(0, 2)];
  1394. }
  1395. const ollama = getDefaultOllama();
  1396. const results = await ollama.expandQuery(query, model, 2);
  1397. // Cache the expanded queries (excluding original)
  1398. if (results.length > 1) {
  1399. setCachedResult(db, cacheKey, results.slice(1).join('\n'));
  1400. }
  1401. return results;
  1402. }
  1403. // =============================================================================
  1404. // Reranking
  1405. // =============================================================================
  1406. export async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db: Database): Promise<{ file: string; score: number }[]> {
  1407. const cachedResults: Map<string, number> = new Map();
  1408. const uncachedDocs: RerankDocument[] = [];
  1409. // Check cache for each document
  1410. for (const doc of documents) {
  1411. const cacheKey = getCacheKey("rerank", { query, file: doc.file, model });
  1412. const cached = getCachedResult(db, cacheKey);
  1413. if (cached !== null) {
  1414. cachedResults.set(doc.file, parseFloat(cached));
  1415. } else {
  1416. uncachedDocs.push({ file: doc.file, text: doc.text });
  1417. }
  1418. }
  1419. // Rerank uncached documents using Ollama
  1420. if (uncachedDocs.length > 0) {
  1421. const ollama = getDefaultOllama();
  1422. const rerankResult = await ollama.rerank(query, uncachedDocs, { model });
  1423. // Cache results
  1424. for (const result of rerankResult.results) {
  1425. const cacheKey = getCacheKey("rerank", { query, file: result.file, model });
  1426. setCachedResult(db, cacheKey, result.score.toString());
  1427. cachedResults.set(result.file, result.score);
  1428. }
  1429. }
  1430. // Return all results sorted by score
  1431. return documents
  1432. .map(doc => ({ file: doc.file, score: cachedResults.get(doc.file) || 0 }))
  1433. .sort((a, b) => b.score - a.score);
  1434. }
  1435. // =============================================================================
  1436. // Reciprocal Rank Fusion
  1437. // =============================================================================
  1438. export function reciprocalRankFusion(
  1439. resultLists: RankedResult[][],
  1440. weights: number[] = [],
  1441. k: number = 60
  1442. ): RankedResult[] {
  1443. const scores = new Map<string, { result: RankedResult; rrfScore: number; topRank: number }>();
  1444. for (let listIdx = 0; listIdx < resultLists.length; listIdx++) {
  1445. const list = resultLists[listIdx];
  1446. const weight = weights[listIdx] ?? 1.0;
  1447. for (let rank = 0; rank < list.length; rank++) {
  1448. const result = list[rank];
  1449. const rrfContribution = weight / (k + rank + 1);
  1450. const existing = scores.get(result.file);
  1451. if (existing) {
  1452. existing.rrfScore += rrfContribution;
  1453. existing.topRank = Math.min(existing.topRank, rank);
  1454. } else {
  1455. scores.set(result.file, {
  1456. result,
  1457. rrfScore: rrfContribution,
  1458. topRank: rank,
  1459. });
  1460. }
  1461. }
  1462. }
  1463. // Top-rank bonus
  1464. for (const entry of scores.values()) {
  1465. if (entry.topRank === 0) {
  1466. entry.rrfScore += 0.05;
  1467. } else if (entry.topRank <= 2) {
  1468. entry.rrfScore += 0.02;
  1469. }
  1470. }
  1471. return Array.from(scores.values())
  1472. .sort((a, b) => b.rrfScore - a.rrfScore)
  1473. .map(e => ({ ...e.result, score: e.rrfScore }));
  1474. }
  1475. // =============================================================================
  1476. // Document retrieval
  1477. // =============================================================================
  1478. type DbDocRow = {
  1479. filepath: string;
  1480. display_path: string;
  1481. title: string;
  1482. hash: string;
  1483. collection_id: number;
  1484. modified_at: string;
  1485. body_length: number;
  1486. body?: string;
  1487. };
  1488. /**
  1489. * Find a document by filename/path (with fuzzy matching)
  1490. * Returns document metadata without body by default
  1491. */
  1492. export function findDocument(db: Database, filename: string, options: { includeBody?: boolean } = {}): DocumentResult | DocumentNotFound {
  1493. let filepath = filename;
  1494. const colonMatch = filepath.match(/:(\d+)$/);
  1495. if (colonMatch) {
  1496. filepath = filepath.slice(0, -colonMatch[0].length);
  1497. }
  1498. if (filepath.startsWith('~/')) {
  1499. filepath = homedir() + filepath.slice(1);
  1500. }
  1501. const selectCols = options.includeBody
  1502. ? `filepath, display_path, title, hash, collection_id, modified_at, LENGTH(body) as body_length, body`
  1503. : `filepath, display_path, title, hash, collection_id, modified_at, LENGTH(body) as body_length`;
  1504. // Try various match strategies
  1505. let doc = db.prepare(`SELECT ${selectCols} FROM documents WHERE filepath = ? AND active = 1`).get(filepath) as DbDocRow | null;
  1506. if (!doc) {
  1507. doc = db.prepare(`SELECT ${selectCols} FROM documents WHERE display_path = ? AND active = 1`).get(filepath) as DbDocRow | null;
  1508. }
  1509. if (!doc) {
  1510. doc = db.prepare(`SELECT ${selectCols} FROM documents WHERE filepath LIKE ? AND active = 1 LIMIT 1`).get(`%${filepath}`) as DbDocRow | null;
  1511. }
  1512. if (!doc) {
  1513. doc = db.prepare(`SELECT ${selectCols} FROM documents WHERE display_path LIKE ? AND active = 1 LIMIT 1`).get(`%${filepath}`) as DbDocRow | null;
  1514. }
  1515. if (!doc) {
  1516. const similar = findSimilarFiles(db, filepath, 5, 5);
  1517. return { error: "not_found", query: filename, similarFiles: similar };
  1518. }
  1519. const context = getContextForFile(db, doc.filepath);
  1520. return {
  1521. filepath: doc.filepath,
  1522. displayPath: doc.display_path,
  1523. title: doc.title,
  1524. context,
  1525. hash: doc.hash,
  1526. collectionId: doc.collection_id,
  1527. modifiedAt: doc.modified_at,
  1528. bodyLength: doc.body_length,
  1529. ...(options.includeBody && doc.body !== undefined && { body: doc.body }),
  1530. };
  1531. }
  1532. /**
  1533. * Get the body content for a document
  1534. * Optionally slice by line range
  1535. */
  1536. export function getDocumentBody(db: Database, doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number): string | null {
  1537. const filepath = 'filepath' in doc ? doc.filepath : doc.filepath;
  1538. const row = db.prepare(`SELECT body FROM documents WHERE filepath = ? AND active = 1`).get(filepath) as { body: string } | null;
  1539. if (!row) return null;
  1540. let body = row.body;
  1541. if (fromLine !== undefined || maxLines !== undefined) {
  1542. const lines = body.split('\n');
  1543. const start = (fromLine || 1) - 1;
  1544. const end = maxLines !== undefined ? start + maxLines : lines.length;
  1545. body = lines.slice(start, end).join('\n');
  1546. }
  1547. return body;
  1548. }
  1549. /**
  1550. * Legacy function for backwards compatibility
  1551. * Combines findDocument + getDocumentBody with line slicing
  1552. */
  1553. export function getDocument(db: Database, filename: string, fromLine?: number, maxLines?: number): (DocumentResult & { body: string }) | DocumentNotFound {
  1554. // Parse :line suffix
  1555. let parsedFromLine = fromLine;
  1556. let filepath = filename;
  1557. const colonMatch = filepath.match(/:(\d+)$/);
  1558. if (colonMatch && !parsedFromLine) {
  1559. parsedFromLine = parseInt(colonMatch[1], 10);
  1560. filepath = filepath.slice(0, -colonMatch[0].length);
  1561. }
  1562. const result = findDocument(db, filepath, { includeBody: true });
  1563. if ("error" in result) return result;
  1564. let body = result.body || "";
  1565. if (parsedFromLine !== undefined || maxLines !== undefined) {
  1566. const lines = body.split('\n');
  1567. const start = (parsedFromLine || 1) - 1;
  1568. const end = maxLines !== undefined ? start + maxLines : lines.length;
  1569. body = lines.slice(start, end).join('\n');
  1570. }
  1571. return { ...result, body };
  1572. }
  1573. /**
  1574. * Find multiple documents by glob pattern or comma-separated list
  1575. * Returns documents without body by default (use getDocumentBody to load)
  1576. */
  1577. export function findDocuments(
  1578. db: Database,
  1579. pattern: string,
  1580. options: { includeBody?: boolean; maxBytes?: number } = {}
  1581. ): { docs: MultiGetResult[]; errors: string[] } {
  1582. const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?');
  1583. const errors: string[] = [];
  1584. const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES;
  1585. const selectCols = options.includeBody
  1586. ? `filepath, display_path, title, hash, collection_id, modified_at, LENGTH(body) as body_length, body`
  1587. : `filepath, display_path, title, hash, collection_id, modified_at, LENGTH(body) as body_length`;
  1588. let fileRows: DbDocRow[];
  1589. if (isCommaSeparated) {
  1590. const names = pattern.split(',').map(s => s.trim()).filter(Boolean);
  1591. fileRows = [];
  1592. for (const name of names) {
  1593. let doc = db.prepare(`SELECT ${selectCols} FROM documents WHERE display_path = ? AND active = 1`).get(name) as DbDocRow | null;
  1594. if (!doc) {
  1595. doc = db.prepare(`SELECT ${selectCols} FROM documents WHERE display_path LIKE ? AND active = 1 LIMIT 1`).get(`%${name}`) as DbDocRow | null;
  1596. }
  1597. if (doc) {
  1598. fileRows.push(doc);
  1599. } else {
  1600. const similar = findSimilarFiles(db, name, 5, 3);
  1601. let msg = `File not found: ${name}`;
  1602. if (similar.length > 0) {
  1603. msg += ` (did you mean: ${similar.join(', ')}?)`;
  1604. }
  1605. errors.push(msg);
  1606. }
  1607. }
  1608. } else {
  1609. // Glob pattern match
  1610. const matched = matchFilesByGlob(db, pattern);
  1611. if (matched.length === 0) {
  1612. errors.push(`No files matched pattern: ${pattern}`);
  1613. return { docs: [], errors };
  1614. }
  1615. const filepaths = matched.map(m => m.filepath);
  1616. const placeholders = filepaths.map(() => '?').join(',');
  1617. fileRows = db.prepare(`SELECT ${selectCols} FROM documents WHERE filepath IN (${placeholders}) AND active = 1`).all(...filepaths) as DbDocRow[];
  1618. }
  1619. const results: MultiGetResult[] = [];
  1620. for (const row of fileRows) {
  1621. const context = getContextForFile(db, row.filepath);
  1622. if (row.body_length > maxBytes) {
  1623. results.push({
  1624. doc: { filepath: row.filepath, displayPath: row.display_path },
  1625. skipped: true,
  1626. skipReason: `File too large (${Math.round(row.body_length / 1024)}KB > ${Math.round(maxBytes / 1024)}KB)`,
  1627. });
  1628. continue;
  1629. }
  1630. results.push({
  1631. doc: {
  1632. filepath: row.filepath,
  1633. displayPath: row.display_path,
  1634. title: row.title || row.display_path.split('/').pop() || row.display_path,
  1635. context,
  1636. hash: row.hash,
  1637. collectionId: row.collection_id,
  1638. modifiedAt: row.modified_at,
  1639. bodyLength: row.body_length,
  1640. ...(options.includeBody && row.body !== undefined && { body: row.body }),
  1641. },
  1642. skipped: false,
  1643. });
  1644. }
  1645. return { docs: results, errors };
  1646. }
  1647. /**
  1648. * Legacy function for backwards compatibility
  1649. */
  1650. export function getMultipleDocuments(db: Database, pattern: string, maxLines?: number, maxBytes: number = DEFAULT_MULTI_GET_MAX_BYTES): { files: MultiGetFile[]; errors: string[] } {
  1651. const { docs, errors } = findDocuments(db, pattern, { includeBody: true, maxBytes });
  1652. const files: MultiGetFile[] = docs.map(result => {
  1653. if (result.skipped) {
  1654. return {
  1655. filepath: result.doc.filepath,
  1656. displayPath: result.doc.displayPath,
  1657. title: "",
  1658. body: "",
  1659. context: null,
  1660. skipped: true as const,
  1661. skipReason: result.skipReason,
  1662. };
  1663. }
  1664. let body = result.doc.body || "";
  1665. if (maxLines !== undefined) {
  1666. const lines = body.split('\n');
  1667. body = lines.slice(0, maxLines).join('\n');
  1668. if (lines.length > maxLines) {
  1669. body += `\n\n[... truncated ${lines.length - maxLines} more lines]`;
  1670. }
  1671. }
  1672. return {
  1673. filepath: result.doc.filepath,
  1674. displayPath: result.doc.displayPath,
  1675. title: result.doc.title,
  1676. body,
  1677. context: result.doc.context,
  1678. skipped: false as const,
  1679. };
  1680. });
  1681. return { files, errors };
  1682. }
  1683. // Keep the old MultiGetFile type for backwards compatibility
  1684. export type MultiGetFile = {
  1685. filepath: string;
  1686. displayPath: string;
  1687. title: string;
  1688. body: string;
  1689. context: string | null;
  1690. skipped: false;
  1691. } | {
  1692. filepath: string;
  1693. displayPath: string;
  1694. title: string;
  1695. body: string;
  1696. context: string | null;
  1697. skipped: true;
  1698. skipReason: string;
  1699. };
  1700. // =============================================================================
  1701. // Status
  1702. // =============================================================================
  1703. export function getStatus(db: Database): IndexStatus {
  1704. const collections = db.prepare(`
  1705. SELECT c.id, c.pwd, c.glob_pattern, c.created_at,
  1706. COUNT(d.id) as active_count,
  1707. MAX(d.modified_at) as last_doc_update
  1708. FROM collections c
  1709. LEFT JOIN documents d ON d.collection_id = c.id AND d.active = 1
  1710. GROUP BY c.id
  1711. ORDER BY last_doc_update DESC
  1712. `).all() as { id: number; pwd: string; glob_pattern: string; created_at: string; active_count: number; last_doc_update: string | null }[];
  1713. const totalDocs = (db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get() as { c: number }).c;
  1714. const needsEmbedding = getHashesNeedingEmbedding(db);
  1715. const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
  1716. return {
  1717. totalDocuments: totalDocs,
  1718. needsEmbedding,
  1719. hasVectorIndex: hasVectors,
  1720. collections: collections.map(col => ({
  1721. id: col.id,
  1722. path: col.pwd,
  1723. pattern: col.glob_pattern,
  1724. documents: col.active_count,
  1725. lastUpdated: col.last_doc_update || col.created_at,
  1726. })),
  1727. };
  1728. }
  1729. // =============================================================================
  1730. // Snippet extraction
  1731. // =============================================================================
  1732. export type SnippetResult = {
  1733. line: number; // 1-indexed line number of best match
  1734. snippet: string; // The snippet text with diff-style header
  1735. linesBefore: number; // Lines in document before snippet
  1736. linesAfter: number; // Lines in document after snippet
  1737. snippetLines: number; // Number of lines in snippet
  1738. };
  1739. export function extractSnippet(body: string, query: string, maxLen = 500, chunkPos?: number): SnippetResult {
  1740. const totalLines = body.split('\n').length;
  1741. let searchBody = body;
  1742. let lineOffset = 0;
  1743. if (chunkPos && chunkPos > 0) {
  1744. const contextStart = Math.max(0, chunkPos - 100);
  1745. const contextEnd = Math.min(body.length, chunkPos + maxLen + 100);
  1746. searchBody = body.slice(contextStart, contextEnd);
  1747. if (contextStart > 0) {
  1748. lineOffset = body.slice(0, contextStart).split('\n').length - 1;
  1749. }
  1750. }
  1751. const lines = searchBody.split('\n');
  1752. const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 0);
  1753. let bestLine = 0, bestScore = -1;
  1754. for (let i = 0; i < lines.length; i++) {
  1755. const lineLower = lines[i].toLowerCase();
  1756. let score = 0;
  1757. for (const term of queryTerms) {
  1758. if (lineLower.includes(term)) score++;
  1759. }
  1760. if (score > bestScore) {
  1761. bestScore = score;
  1762. bestLine = i;
  1763. }
  1764. }
  1765. const start = Math.max(0, bestLine - 1);
  1766. const end = Math.min(lines.length, bestLine + 3);
  1767. const snippetLines = lines.slice(start, end);
  1768. let snippetText = snippetLines.join('\n');
  1769. if (snippetText.length > maxLen) snippetText = snippetText.substring(0, maxLen - 3) + "...";
  1770. const absoluteStart = lineOffset + start + 1; // 1-indexed
  1771. const snippetLineCount = snippetLines.length;
  1772. const linesBefore = absoluteStart - 1;
  1773. const linesAfter = totalLines - (absoluteStart + snippetLineCount - 1);
  1774. // Format with diff-style header: @@ -start,count @@ (linesBefore before, linesAfter after)
  1775. const header = `@@ -${absoluteStart},${snippetLineCount} @@ (${linesBefore} before, ${linesAfter} after)`;
  1776. const snippet = `${header}\n${snippetText}`;
  1777. return {
  1778. line: lineOffset + bestLine + 1,
  1779. snippet,
  1780. linesBefore,
  1781. linesAfter,
  1782. snippetLines: snippetLineCount,
  1783. };
  1784. }