store.ts 77 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313
  1. /**
  2. * QMD Store - Core data access and retrieval functions
  3. *
  4. * This module provides all database operations, search functions, and document
  5. * retrieval for QMD. It returns raw data structures that can be formatted by
  6. * CLI or MCP consumers.
  7. *
  8. * Usage:
  9. * const store = createStore("/path/to/db.sqlite");
  10. * // or use default path:
  11. * const store = createStore();
  12. */
  13. import { Database } from "bun:sqlite";
  14. import { Glob } from "bun";
  15. import * as sqliteVec from "sqlite-vec";
  16. import {
  17. Ollama,
  18. getDefaultOllama,
  19. formatQueryForEmbedding,
  20. formatDocForEmbedding,
  21. type RerankDocument,
  22. } from "./llm";
  23. import {
  24. findContextForPath as collectionsFindContextForPath,
  25. addContext as collectionsAddContext,
  26. removeContext as collectionsRemoveContext,
  27. listAllContexts as collectionsListAllContexts,
  28. getCollection,
  29. listCollections as collectionsListCollections,
  30. addCollection as collectionsAddCollection,
  31. removeCollection as collectionsRemoveCollection,
  32. renameCollection as collectionsRenameCollection,
  33. setGlobalContext,
  34. type NamedCollection,
  35. } from "./collections";
  36. // =============================================================================
  37. // Configuration
  38. // =============================================================================
  // Home directory from the environment; "/tmp" is used when HOME is unset or empty.
  const HOME = Bun.env.HOME || "/tmp";

  // Default model names, glob pattern and size limit exported for consumers.
  export const DEFAULT_EMBED_MODEL = "embeddinggemma";
  export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
  export const DEFAULT_QUERY_MODEL = "qwen3:0.6b";
  export const DEFAULT_GLOB = "**/*.md";
  export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB

  // Kept as a re-export for backwards compatibility. The value is read once,
  // at module load, from the default Ollama client.
  export const OLLAMA_URL = getDefaultOllama().getBaseUrl();

  // Chunk size in bytes: roughly 2000 tokens at about 3 bytes per token (6KB).
  const CHUNK_BYTE_SIZE = 6 * 1024;
  49. // =============================================================================
  50. // Path utilities
  51. // =============================================================================
  /**
   * Return the home directory captured in the module-level HOME constant.
   *
   * @returns The value of HOME as computed when this module was loaded.
   */
  export function homedir(): string {
    return HOME;
  }
  /**
   * Resolve a sequence of path segments into a normalized absolute path.
   *
   * Segments are applied left to right: an absolute segment (leading "/")
   * replaces everything before it, a relative segment is appended. When the
   * first segment is relative, or no segments are given, resolution starts
   * from PWD (falling back to process.cwd()). "." and empty components are
   * dropped and ".." removes the preceding component; ".." at the root is a
   * no-op.
   *
   * @param paths - Path segments to combine; may be empty.
   * @returns A normalized absolute path that always starts with "/".
   */
  export function resolve(...paths: string[]): string {
    // Guard against a missing first segment: calling resolve() with no
    // arguments previously threw on paths[0].startsWith.
    const first = paths[0];
    const startsAbsolute = first !== undefined && first.startsWith('/');
    // The environment is only consulted when a base directory is needed.
    let result = startsAbsolute ? '' : (Bun.env.PWD || process.cwd());

    for (const p of paths) {
      result = p.startsWith('/') ? p : result + '/' + p;
    }

    const normalized: string[] = [];
    for (const part of result.split('/')) {
      if (part === '' || part === '.') continue;
      if (part === '..') normalized.pop();
      else normalized.push(part);
    }
    return '/' + normalized.join('/');
  }
  /**
   * Compute the default location of the SQLite index file.
   *
   * INDEX_PATH, when set, is returned verbatim (used to override the path in
   * tests). Otherwise the file lives in a "qmd" directory under
   * XDG_CACHE_HOME, or under "<home>/.cache" when that variable is not set.
   * The directory is created with `mkdir -p` on a best-effort basis.
   *
   * @param indexName - Base name of the index file, without extension.
   * @returns Absolute path of "<indexName>.sqlite".
   */
  export function getDefaultDbPath(indexName: string = "index"): string {
    const override = Bun.env.INDEX_PATH;
    if (override) return override;

    const cacheRoot = Bun.env.XDG_CACHE_HOME || resolve(homedir(), ".cache");
    const qmdDir = resolve(cacheRoot, "qmd");
    try {
      Bun.spawnSync(["mkdir", "-p", qmdDir]);
    } catch {
      // Best effort: a failure to create the directory is ignored here.
    }
    return resolve(qmdDir, `${indexName}.sqlite`);
  }
  /**
   * Return the current working directory.
   *
   * @returns The PWD environment variable when it is set and non-empty,
   *          otherwise process.cwd().
   */
  export function getPwd(): string {
    const fromEnv = process.env.PWD;
    return fromEnv ? fromEnv : process.cwd();
  }
  /**
   * Canonicalize a path by running the external `realpath` command.
   *
   * @param path - Path to canonicalize.
   * @returns The trimmed stdout of `realpath` when the command succeeds;
   *          otherwise the result of resolve(path).
   */
  export function getRealPath(path: string): string {
    try {
      const proc = Bun.spawnSync(["realpath", path]);
      if (proc.success) return proc.stdout.toString().trim();
    } catch {
      // Spawning failed; fall back to resolve() below.
    }
    return resolve(path);
  }
  94. // =============================================================================
  95. // Virtual Path Utilities (qmd://)
  96. // =============================================================================
  /**
   * The two components of a "qmd://collection-name/path" virtual path.
   */
  export type VirtualPath = {
    /** Name of the collection (the part right after "qmd://"). */
    collectionName: string;
    /** Path relative to the collection; empty string for the collection root. */
    path: string;
  };
  /**
   * Split a virtual path such as "qmd://collection-name/path/to/file.md" into
   * its collection name and collection-relative path.
   *
   * The collection root is accepted both with and without a trailing slash
   * ("qmd://collection-name/" and "qmd://collection-name"); in both cases the
   * returned path is the empty string.
   *
   * @param virtualPath - Candidate virtual path.
   * @returns The parsed components, or null when the input is not a virtual path.
   */
  export function parseVirtualPath(virtualPath: string): VirtualPath | null {
    // qmd://<collection-name>[/<optional-path>]
    const parts = /^qmd:\/\/([^\/]+)\/?(.*)$/.exec(virtualPath);
    if (parts === null) return null;

    const [, collectionName, rest] = parts;
    return { collectionName, path: rest || '' };
  }
  /**
   * Compose a virtual path from a collection name and a path inside it.
   *
   * @param collectionName - Name of the collection.
   * @param path - Path relative to the collection.
   * @returns The string "qmd://<collectionName>/<path>".
   */
  export function buildVirtualPath(collectionName: string, path: string): string {
    const scheme = 'qmd://';
    return scheme + collectionName + '/' + path;
  }
  /**
   * Tell whether a path uses the virtual "qmd://" scheme.
   *
   * @param path - Path to inspect.
   * @returns true when the path begins with "qmd://".
   */
  export function isVirtualPath(path: string): boolean {
    const scheme = 'qmd://';
    return path.slice(0, scheme.length) === scheme;
  }
  /**
   * Translate a virtual path into an absolute filesystem path.
   *
   * @param db - Database handle passed to getCollectionByName.
   * @param virtualPath - Path of the form "qmd://collection/path".
   * @returns The collection's pwd joined with the relative path, or null when
   *          the input cannot be parsed or the collection is not found.
   */
  export function resolveVirtualPath(db: Database, virtualPath: string): string | null {
    const target = parseVirtualPath(virtualPath);
    const collection = target ? getCollectionByName(db, target.collectionName) : null;
    return target && collection ? resolve(collection.pwd, target.path) : null;
  }
  /**
   * Map an absolute filesystem path to its virtual "qmd://" path.
   *
   * Collections come from collectionsListCollections(). A collection is a
   * candidate when the path equals its root or lies below it. The candidate
   * is accepted only when an active row for that collection and relative
   * path exists in the documents table.
   *
   * @param db - Database used to confirm the document is indexed.
   * @param absolutePath - Absolute filesystem path of the file.
   * @returns The virtual path, or null when no collection holds an active
   *          document at that location.
   */
  export function toVirtualPath(db: Database, absolutePath: string): string | null {
    for (const coll of collectionsListCollections()) {
      const rootWithSlash = coll.path + '/';
      const isBelowRoot = absolutePath.startsWith(rootWithSlash);
      if (!isBelowRoot && absolutePath !== coll.path) continue;

      // The collection root itself maps to the empty relative path.
      const relativePath = isBelowRoot ? absolutePath.slice(rootWithSlash.length) : '';

      const row = db.prepare(`
        SELECT d.path
        FROM documents d
        WHERE d.collection = ? AND d.path = ? AND d.active = 1
        LIMIT 1
      `).get(coll.name, relativePath) as { path: string } | null;

      if (row) return buildVirtualPath(coll.name, relativePath);
    }
    return null;
  }
  166. // =============================================================================
  167. // Database initialization
  168. // =============================================================================
  // macOS only: switch to Homebrew's SQLite build, which supports loading
  // extensions. This runs at module load and is best-effort; any error while
  // probing or switching libraries is ignored.
  if (process.platform === "darwin") {
    const candidateLibrary = "/opt/homebrew/opt/sqlite/lib/libsqlite3.dylib";
    try {
      const hasLibrary = Bun.file(candidateLibrary).size > 0;
      if (hasLibrary) {
        Database.setCustomSQLite(candidateLibrary);
      }
    } catch {
      // Ignored: the default SQLite library stays in use.
    }
  }
  /**
   * Prepare a freshly opened database: load sqlite-vec, set pragmas, migrate
   * a legacy schema if one is detected, and create every table, index,
   * virtual table and trigger that does not exist yet.
   *
   * A legacy database is one that has a `documents` table but no `content`
   * table. After migrating, execution continues with the rest of this
   * function instead of returning early. The migration does not create
   * `ollama_cache` or `content_vectors` and does not run the `seq` column
   * check, so returning early left those steps undone until the next open.
   * Every statement below is idempotent (IF NOT EXISTS), which makes the
   * fall-through safe.
   *
   * @param db - Open database handle to initialize.
   */
  function initializeDatabase(db: Database): void {
    sqliteVec.load(db);
    db.exec("PRAGMA journal_mode = WAL");
    db.exec("PRAGMA foreign_keys = ON");

    // Detect the old schema and migrate it before creating anything else.
    const tables = db.prepare(`SELECT name FROM sqlite_master WHERE type='table'`).all() as { name: string }[];
    const tableNames = tables.map(t => t.name);
    const needsMigration = tableNames.includes('documents') && !tableNames.includes('content');
    if (needsMigration) {
      migrateToContentAddressable(db);
      // Fall through: the statements below complete the schema.
    }

    // Content-addressable storage - the source of truth for document content
    db.exec(`
      CREATE TABLE IF NOT EXISTS content (
        hash TEXT PRIMARY KEY,
        doc TEXT NOT NULL,
        created_at TEXT NOT NULL
      )
    `);

    // Collections table with name field
    db.exec(`
      CREATE TABLE IF NOT EXISTS collections (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT NOT NULL UNIQUE,
        pwd TEXT NOT NULL,
        glob_pattern TEXT NOT NULL,
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL,
        UNIQUE(pwd, glob_pattern)
      )
    `);

    // Documents table - file system layer mapping virtual paths to content hashes
    db.exec(`
      CREATE TABLE IF NOT EXISTS documents (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        collection TEXT NOT NULL,
        path TEXT NOT NULL,
        title TEXT NOT NULL,
        hash TEXT NOT NULL,
        created_at TEXT NOT NULL,
        modified_at TEXT NOT NULL,
        active INTEGER NOT NULL DEFAULT 1,
        FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
        UNIQUE(collection, path)
      )
    `);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents(collection, active)`);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(hash)`);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_path ON documents(path, active)`);

    // Path-based context (collection-scoped, hierarchical)
    db.exec(`
      CREATE TABLE IF NOT EXISTS path_contexts (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        collection_id INTEGER NOT NULL,
        path_prefix TEXT NOT NULL,
        context TEXT NOT NULL,
        created_at TEXT NOT NULL,
        FOREIGN KEY (collection_id) REFERENCES collections(id) ON DELETE CASCADE,
        UNIQUE(collection_id, path_prefix)
      )
    `);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_path_contexts_collection ON path_contexts(collection_id, path_prefix)`);

    // Cache table for Ollama API calls
    db.exec(`
      CREATE TABLE IF NOT EXISTS ollama_cache (
        hash TEXT PRIMARY KEY,
        result TEXT NOT NULL,
        created_at TEXT NOT NULL
      )
    `);

    // Content vectors: a table that exists without the `seq` column is
    // dropped, together with vectors_vec, so it can be recreated below.
    const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
    const hasSeqColumn = cvInfo.some(col => col.name === 'seq');
    if (cvInfo.length > 0 && !hasSeqColumn) {
      db.exec(`DROP TABLE IF EXISTS content_vectors`);
      db.exec(`DROP TABLE IF EXISTS vectors_vec`);
    }
    db.exec(`
      CREATE TABLE IF NOT EXISTS content_vectors (
        hash TEXT NOT NULL,
        seq INTEGER NOT NULL DEFAULT 0,
        pos INTEGER NOT NULL DEFAULT 0,
        model TEXT NOT NULL,
        embedded_at TEXT NOT NULL,
        PRIMARY KEY (hash, seq)
      )
    `);

    // FTS - index path and content (joined from content table)
    db.exec(`
      CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
        path, body,
        tokenize='porter unicode61'
      )
    `);

    // Triggers to keep FTS in sync with the documents table
    db.exec(`
      CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents BEGIN
        INSERT INTO documents_fts(rowid, path, body)
        SELECT new.id, new.path, c.doc
        FROM content c
        WHERE c.hash = new.hash;
      END
    `);
    db.exec(`
      CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
        DELETE FROM documents_fts WHERE rowid = old.id;
      END
    `);
    db.exec(`
      CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents BEGIN
        UPDATE documents_fts
        SET path = new.path,
            body = (SELECT doc FROM content WHERE hash = new.hash)
        WHERE rowid = new.id;
      END
    `);
  }
  /**
   * Migrate a legacy database to the content-addressable schema.
   *
   * Runs in a single transaction:
   *   1. Rename the old documents/collections/path_contexts tables and drop
   *      the old FTS table and triggers.
   *   2. Create the new content, collections, documents and path_contexts tables.
   *   3. Copy the bodies of active documents into `content`, keyed by hash.
   *   4. Copy collections, naming each after the last component of its pwd
   *      and appending "-<id>" to later duplicates.
   *   5. Copy active documents with paths made relative to their collection.
   *   6. Re-attach path contexts to every collection whose pwd contains the
   *      context's path prefix.
   *   7. Drop the old tables, recreate FTS plus triggers, repopulate FTS and
   *      create the indexes.
   *
   * Path containment is checked on component boundaries, so "/x/notes-old"
   * is not treated as being inside "/x/notes".
   *
   * @param db - Database handle holding the legacy schema.
   * @throws Re-throws the error that caused the migration to fail, after
   *         attempting a ROLLBACK.
   */
  function migrateToContentAddressable(db: Database): void {
    // Returns absolutePath relative to root, or null when absolutePath is
    // neither root itself nor located below it.
    const relativeWithin = (absolutePath: string, root: string): string | null => {
      if (absolutePath === root) return '';
      const prefix = root.endsWith('/') ? root : root + '/';
      if (!absolutePath.startsWith(prefix)) return null;
      return absolutePath.slice(prefix.length).replace(/^\/+/, '');
    };

    console.log("Migrating database to content-addressable schema...");
    db.exec("BEGIN TRANSACTION");
    try {
      // Rename old tables and remove the old FTS objects
      db.exec("ALTER TABLE documents RENAME TO documents_old");
      db.exec("ALTER TABLE collections RENAME TO collections_old");
      db.exec("ALTER TABLE path_contexts RENAME TO path_contexts_old");
      db.exec("DROP TABLE IF EXISTS documents_fts");
      db.exec("DROP TRIGGER IF EXISTS documents_ai");
      db.exec("DROP TRIGGER IF EXISTS documents_ad");
      db.exec("DROP TRIGGER IF EXISTS documents_au");

      // Create new schema
      db.exec(`
        CREATE TABLE content (
          hash TEXT PRIMARY KEY,
          doc TEXT NOT NULL,
          created_at TEXT NOT NULL
        )
      `);
      db.exec(`
        CREATE TABLE collections (
          id INTEGER PRIMARY KEY AUTOINCREMENT,
          name TEXT NOT NULL UNIQUE,
          pwd TEXT NOT NULL,
          glob_pattern TEXT NOT NULL,
          created_at TEXT NOT NULL,
          updated_at TEXT NOT NULL,
          UNIQUE(pwd, glob_pattern)
        )
      `);
      db.exec(`
        CREATE TABLE documents (
          id INTEGER PRIMARY KEY AUTOINCREMENT,
          collection TEXT NOT NULL,
          path TEXT NOT NULL,
          title TEXT NOT NULL,
          hash TEXT NOT NULL,
          created_at TEXT NOT NULL,
          modified_at TEXT NOT NULL,
          active INTEGER NOT NULL DEFAULT 1,
          FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
          UNIQUE(collection, path)
        )
      `);
      db.exec(`
        CREATE TABLE path_contexts (
          id INTEGER PRIMARY KEY AUTOINCREMENT,
          collection_id INTEGER NOT NULL,
          path_prefix TEXT NOT NULL,
          context TEXT NOT NULL,
          created_at TEXT NOT NULL,
          FOREIGN KEY (collection_id) REFERENCES collections(id) ON DELETE CASCADE,
          UNIQUE(collection_id, path_prefix)
        )
      `);

      // Migrate data: one content row per unique hash of an active document
      console.log("Migrating content...");
      db.exec(`
        INSERT INTO content (hash, doc, created_at)
        SELECT hash, body, MIN(created_at) as created_at
        FROM documents_old
        WHERE active = 1
        GROUP BY hash
      `);

      // Migrate collections: generate names from pwd basename
      console.log("Migrating collections...");
      // First insert with pwd as a temporary (unique) name
      db.exec(`
        INSERT INTO collections (id, name, pwd, glob_pattern, created_at, updated_at)
        SELECT
          id,
          pwd as name,
          pwd,
          glob_pattern,
          created_at,
          created_at as updated_at
        FROM collections_old
      `);

      // Prepared once and reused for both rename passes below.
      const renameCollectionStmt = db.prepare(`UPDATE collections SET name = ? WHERE id = ?`);

      // Then update names to basenames using application logic
      const collections = db.prepare(`SELECT id, pwd FROM collections`).all() as { id: number; pwd: string }[];
      for (const coll of collections) {
        const parts = coll.pwd.split('/').filter(Boolean);
        const name = parts[parts.length - 1] || 'root';
        renameCollectionStmt.run(name, coll.id);
      }

      // Handle duplicate collection names by appending collection_id
      const duplicates = db.prepare(`
        SELECT name, COUNT(*) as cnt
        FROM collections
        GROUP BY name
        HAVING cnt > 1
      `).all() as { name: string; cnt: number }[];
      for (const dup of duplicates) {
        const rows = db.prepare(`SELECT id FROM collections WHERE name = ? ORDER BY id`).all(dup.name) as { id: number }[];
        // The first (lowest id) collection keeps the plain name.
        for (let i = 1; i < rows.length; i++) {
          renameCollectionStmt.run(`${dup.name}-${rows[i].id}`, rows[i].id);
        }
      }

      // Migrate documents: convert filepath to relative path within collection
      console.log("Migrating documents...");
      const oldDocs = db.prepare(`
        SELECT d.id, d.collection_id, d.filepath, d.title, d.hash, d.created_at, d.modified_at, c.pwd, c.name
        FROM documents_old d
        JOIN collections c ON c.id = d.collection_id
        WHERE d.active = 1
      `).all() as Array<{
        id: number;
        collection_id: number;
        filepath: string;
        title: string;
        hash: string;
        created_at: string;
        modified_at: string;
        pwd: string;
        name: string;
      }>;
      const insertDoc = db.prepare(`
        INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active)
        VALUES (?, ?, ?, ?, ?, ?, 1)
      `);
      for (const doc of oldDocs) {
        // A filepath outside the collection's pwd is kept as-is, minus any
        // leading slashes.
        const relative = relativeWithin(doc.filepath, doc.pwd);
        const path = relative !== null ? relative : doc.filepath.replace(/^\/+/, '');
        try {
          insertDoc.run(doc.name, path, doc.title, doc.hash, doc.created_at, doc.modified_at);
        } catch (e) {
          console.warn(`Skipping duplicate path: ${path} in collection ${doc.name}`);
        }
      }

      // Migrate path_contexts: associate with collections based on path prefix
      console.log("Migrating path contexts...");
      const oldContexts = db.prepare(`SELECT * FROM path_contexts_old`).all() as Array<{
        path_prefix: string;
        context: string;
        created_at: string;
      }>;
      const insertContext = db.prepare(`
        INSERT INTO path_contexts (collection_id, path_prefix, context, created_at)
        VALUES (?, ?, ?, ?)
      `);
      const allCollections = db.prepare(`SELECT id, pwd FROM collections`).all() as Array<{ id: number; pwd: string }>;
      for (const ctx of oldContexts) {
        // A context is attached to every collection that contains its prefix.
        for (const coll of allCollections) {
          const relPath = relativeWithin(ctx.path_prefix, coll.pwd);
          if (relPath === null) continue;
          try {
            insertContext.run(coll.id, relPath, ctx.context, ctx.created_at);
          } catch (e) {
            // Ignore duplicates
          }
        }
      }

      // Drop old tables
      db.exec("DROP TABLE documents_old");
      db.exec("DROP TABLE collections_old");
      db.exec("DROP TABLE path_contexts_old");

      // Recreate FTS and triggers
      db.exec(`
        CREATE VIRTUAL TABLE documents_fts USING fts5(
          path, body,
          tokenize='porter unicode61'
        )
      `);
      db.exec(`
        CREATE TRIGGER documents_ai AFTER INSERT ON documents BEGIN
          INSERT INTO documents_fts(rowid, path, body)
          SELECT new.id, new.path, c.doc
          FROM content c
          WHERE c.hash = new.hash;
        END
      `);
      db.exec(`
        CREATE TRIGGER documents_ad AFTER DELETE ON documents BEGIN
          DELETE FROM documents_fts WHERE rowid = old.id;
        END
      `);
      db.exec(`
        CREATE TRIGGER documents_au AFTER UPDATE ON documents BEGIN
          UPDATE documents_fts
          SET path = new.path,
              body = (SELECT doc FROM content WHERE hash = new.hash)
          WHERE rowid = new.id;
        END
      `);

      // Populate FTS from migrated data; the triggers were created after the
      // documents were inserted, so they did not index these rows.
      console.log("Rebuilding full-text search index...");
      db.exec(`
        INSERT INTO documents_fts(rowid, path, body)
        SELECT d.id, d.path, c.doc
        FROM documents d
        JOIN content c ON c.hash = d.hash
        WHERE d.active = 1
      `);

      // Create indexes
      db.exec(`CREATE INDEX idx_documents_collection ON documents(collection, active)`);
      db.exec(`CREATE INDEX idx_documents_hash ON documents(hash)`);
      db.exec(`CREATE INDEX idx_documents_path ON documents(path, active)`);
      db.exec(`CREATE INDEX idx_path_contexts_collection ON path_contexts(collection_id, path_prefix)`);

      db.exec("COMMIT");
      console.log("Migration complete!");
    } catch (e) {
      try {
        db.exec("ROLLBACK");
      } catch {
        // A failing ROLLBACK must not hide the error that caused the
        // migration to fail; that original error is re-thrown below.
      }
      console.error("Migration failed:", e);
      throw e;
    }
  }
  /**
   * Make sure the `vectors_vec` virtual table exists with the requested
   * embedding width.
   *
   * An existing table is kept only when its stored SQL declares
   * `float[<dimensions>]` and contains a `hash_seq` column; otherwise it is
   * dropped and recreated. `dimensions` is validated first because it is
   * interpolated into the CREATE statement: an invalid value would
   * otherwise drop the existing table and then fail to recreate it.
   *
   * @param db - Database handle.
   * @param dimensions - Embedding width; must be a positive integer.
   * @throws RangeError when `dimensions` is not a positive integer.
   */
  function ensureVecTableInternal(db: Database, dimensions: number): void {
    if (!Number.isInteger(dimensions) || dimensions <= 0) {
      throw new RangeError(`Invalid vector dimensions: ${dimensions}`);
    }

    const tableInfo = db.prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get() as { sql: string } | null;
    if (tableInfo) {
      const match = tableInfo.sql.match(/float\[(\d+)\]/);
      const hasHashSeq = tableInfo.sql.includes('hash_seq');
      const sameWidth = match !== null && Number.parseInt(match[1], 10) === dimensions;
      if (sameWidth && hasHashSeq) return;
      db.exec("DROP TABLE IF EXISTS vectors_vec");
    }
    db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}])`);
  }
  530. // =============================================================================
  531. // Store Factory
  532. // =============================================================================
  /**
   * Shape of the object returned by createStore: the underlying database
   * handle plus the store's operations, grouped by concern.
   */
  export type Store = {
    /** Underlying database handle. */
    db: Database;
    /** Path of the database file this store was opened with. */
    dbPath: string;
    close: () => void;
    ensureVecTable: (dimensions: number) => void;

    // --- Index health ---
    getHashesNeedingEmbedding: () => number;
    getIndexHealth: () => IndexHealthInfo;
    getStatus: () => IndexStatus;

    // --- Caching ---
    getCacheKey: typeof getCacheKey;
    getCachedResult: (cacheKey: string) => string | null;
    setCachedResult: (cacheKey: string, result: string) => void;
    clearCache: () => void;

    // --- Cleanup and maintenance ---
    deleteOllamaCache: () => number;
    deleteInactiveDocuments: () => number;
    cleanupOrphanedContent: () => number;
    cleanupOrphanedVectors: () => number;
    cleanupDuplicateCollections: () => number;
    vacuumDatabase: () => void;

    // --- Context ---
    getContextForFile: (filepath: string) => string | null;
    getContextForPath: (collectionName: string, path: string) => string | null;
    getCollectionByName: (name: string) => { name: string; pwd: string; glob_pattern: string } | null;
    getCollectionsWithoutContext: () => { name: string; pwd: string; doc_count: number }[];
    getTopLevelPathsWithoutContext: (collectionName: string) => string[];

    // --- Virtual paths (qmd://) ---
    parseVirtualPath: typeof parseVirtualPath;
    buildVirtualPath: typeof buildVirtualPath;
    isVirtualPath: typeof isVirtualPath;
    resolveVirtualPath: (virtualPath: string) => string | null;
    toVirtualPath: (absolutePath: string) => string | null;

    // --- Search ---
    searchFTS: (query: string, limit?: number, collectionId?: number) => SearchResult[];
    searchVec: (query: string, model: string, limit?: number, collectionId?: number) => Promise<SearchResult[]>;

    // --- Query expansion & reranking ---
    expandQuery: (query: string, model?: string) => Promise<string[]>;
    rerank: (query: string, documents: { file: string; text: string }[], model?: string) => Promise<{ file: string; score: number }[]>;

    // --- Document retrieval ---
    findDocument: (filename: string, options?: { includeBody?: boolean }) => DocumentResult | DocumentNotFound;
    getDocumentBody: (doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number) => string | null;
    findDocuments: (pattern: string, options?: { includeBody?: boolean; maxBytes?: number }) => { docs: MultiGetResult[]; errors: string[] };

    // --- Legacy compatibility ---
    getDocument: (filename: string, fromLine?: number, maxLines?: number) => (DocumentResult & { body: string }) | DocumentNotFound;
    getMultipleDocuments: (pattern: string, maxLines?: number, maxBytes?: number) => { files: MultiGetFile[]; errors: string[] };

    // --- Fuzzy matching ---
    findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => string[];
    matchFilesByGlob: (pattern: string) => { filepath: string; displayPath: string; bodyLength: number }[];

    // --- Document indexing operations ---
    insertContent: (hash: string, content: string, createdAt: string) => void;
    insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => void;
    findActiveDocument: (collectionName: string, path: string) => { id: number; hash: string; title: string } | null;
    updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => void;
    updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => void;
    deactivateDocument: (collectionName: string, path: string) => void;
    getActiveDocumentPaths: (collectionName: string) => string[];

    // --- Vector/embedding operations ---
    getHashesForEmbedding: () => { hash: string; body: string; path: string }[];
    clearAllEmbeddings: () => void;
    insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => void;
  };
  595. /**
  596. * Create a new store instance with the given database path.
  597. * If no path is provided, uses the default path (~/.cache/qmd/index.sqlite).
  598. *
  599. * @param dbPath - Path to the SQLite database file
  600. * @returns Store instance with all methods bound to the database
  601. */
  602. export function createStore(dbPath?: string): Store {
  603. const resolvedPath = dbPath || getDefaultDbPath();
  604. const db = new Database(resolvedPath);
  605. initializeDatabase(db);
  606. return {
  607. db,
  608. dbPath: resolvedPath,
  609. close: () => db.close(),
  610. ensureVecTable: (dimensions: number) => ensureVecTableInternal(db, dimensions),
  611. // Index health
  612. getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db),
  613. getIndexHealth: () => getIndexHealth(db),
  614. getStatus: () => getStatus(db),
  615. // Caching
  616. getCacheKey,
  617. getCachedResult: (cacheKey: string) => getCachedResult(db, cacheKey),
  618. setCachedResult: (cacheKey: string, result: string) => setCachedResult(db, cacheKey, result),
  619. clearCache: () => clearCache(db),
  620. // Cleanup and maintenance
  621. deleteOllamaCache: () => deleteOllamaCache(db),
  622. deleteInactiveDocuments: () => deleteInactiveDocuments(db),
  623. cleanupOrphanedContent: () => cleanupOrphanedContent(db),
  624. cleanupOrphanedVectors: () => cleanupOrphanedVectors(db),
  625. cleanupDuplicateCollections: () => cleanupDuplicateCollections(db),
  626. vacuumDatabase: () => vacuumDatabase(db),
  627. // Context
  628. getContextForFile: (filepath: string) => getContextForFile(db, filepath),
  629. getContextForPath: (collectionName: string, path: string) => getContextForPath(db, collectionName, path),
  630. getCollectionByName: (name: string) => getCollectionByName(db, name),
  631. getCollectionsWithoutContext: () => getCollectionsWithoutContext(db),
  632. getTopLevelPathsWithoutContext: (collectionName: string) => getTopLevelPathsWithoutContext(db, collectionName),
  633. // Virtual paths
  634. parseVirtualPath,
  635. buildVirtualPath,
  636. isVirtualPath,
  637. resolveVirtualPath: (virtualPath: string) => resolveVirtualPath(db, virtualPath),
  638. toVirtualPath: (absolutePath: string) => toVirtualPath(db, absolutePath),
  639. // Search
  640. searchFTS: (query: string, limit?: number, collectionId?: number) => searchFTS(db, query, limit, collectionId),
  641. searchVec: (query: string, model: string, limit?: number, collectionId?: number) => searchVec(db, query, model, limit, collectionId),
  642. // Query expansion & reranking
  643. expandQuery: (query: string, model?: string) => expandQuery(query, model, db),
  644. rerank: (query: string, documents: { file: string; text: string }[], model?: string) => rerank(query, documents, model, db),
  645. // Document retrieval
  646. findDocument: (filename: string, options?: { includeBody?: boolean }) => findDocument(db, filename, options),
  647. getDocumentBody: (doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number) => getDocumentBody(db, doc, fromLine, maxLines),
  648. findDocuments: (pattern: string, options?: { includeBody?: boolean; maxBytes?: number }) => findDocuments(db, pattern, options),
  649. // Legacy compatibility
  650. getDocument: (filename: string, fromLine?: number, maxLines?: number) => getDocument(db, filename, fromLine, maxLines),
  651. getMultipleDocuments: (pattern: string, maxLines?: number, maxBytes?: number) => getMultipleDocuments(db, pattern, maxLines, maxBytes),
  652. // Fuzzy matching
  653. findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => findSimilarFiles(db, query, maxDistance, limit),
  654. matchFilesByGlob: (pattern: string) => matchFilesByGlob(db, pattern),
  655. // Document indexing operations
  656. insertContent: (hash: string, content: string, createdAt: string) => insertContent(db, hash, content, createdAt),
  657. insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt),
  658. findActiveDocument: (collectionName: string, path: string) => findActiveDocument(db, collectionName, path),
  659. updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => updateDocumentTitle(db, documentId, title, modifiedAt),
  660. updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => updateDocument(db, documentId, title, hash, modifiedAt),
  661. deactivateDocument: (collectionName: string, path: string) => deactivateDocument(db, collectionName, path),
  662. getActiveDocumentPaths: (collectionName: string) => getActiveDocumentPaths(db, collectionName),
  663. // Vector/embedding operations
  664. getHashesForEmbedding: () => getHashesForEmbedding(db),
  665. clearAllEmbeddings: () => clearAllEmbeddings(db),
  666. insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt),
  667. };
  668. }
  669. // =============================================================================
  670. // Legacy compatibility - will be removed
  671. // =============================================================================
// Connection handed out by getDb(). It is created lazily on the first getDb()
// call and reset to null by closeDb() and setCustomIndexName().
let _legacyDb: Database | null = null;
// Path override set by setCustomIndexName(); while null, getDbPath() falls
// back to getDefaultDbPath().
let _legacyDbPath: string | null = null;
  674. /** @deprecated Use createStore() instead */
  675. export function setCustomIndexName(name: string | null): void {
  676. _legacyDbPath = name ? getDefaultDbPath(name) : null;
  677. _legacyDb = null; // Reset so next getDb() creates new connection
  678. }
  679. /** @deprecated Use createStore() instead */
  680. export function getDbPath(): string {
  681. return _legacyDbPath || getDefaultDbPath();
  682. }
  683. /** @deprecated Use createStore() instead */
  684. export function getDb(): Database {
  685. if (!_legacyDb) {
  686. _legacyDb = new Database(getDbPath());
  687. initializeDatabase(_legacyDb);
  688. }
  689. return _legacyDb;
  690. }
  691. /** @deprecated Use store.db.close() instead. Closes the legacy db and resets singleton. */
  692. export function closeDb(): void {
  693. if (_legacyDb) {
  694. _legacyDb.close();
  695. _legacyDb = null;
  696. }
  697. }
/**
 * Legacy entry point that forwards unchanged to ensureVecTableInternal().
 *
 * @param db Database to operate on
 * @param dimensions Passed straight through to ensureVecTableInternal()
 * @deprecated Use store.ensureVecTable() instead
 */
export function ensureVecTable(db: Database, dimensions: number): void {
  ensureVecTableInternal(db, dimensions);
}
  702. // =============================================================================
  703. // Core Document Type
  704. // =============================================================================
/**
 * Unified document result type with all metadata.
 * Body is optional - use getDocumentBody() to load it separately if needed.
 * SearchResult extends this shape, and MultiGetResult embeds either the whole
 * shape or just its filepath/displayPath fields.
 */
export type DocumentResult = {
  filepath: string; // Full filesystem path
  displayPath: string; // Short display path (e.g., "docs/readme.md")
  title: string; // Document title (from first heading or filename)
  context: string | null; // Folder context description if configured
  hash: string; // Content hash for caching/change detection
  collectionName: string; // Parent collection name
  modifiedAt: string; // Last modification timestamp
  bodyLength: number; // Body length in bytes (useful before loading)
  body?: string; // Document body (optional, load with getDocumentBody)
};
/**
 * Search result: every DocumentResult field plus the relevance score and the
 * search backend that produced the hit.
 */
export type SearchResult = DocumentResult & {
  score: number; // Relevance score (0-1)
  source: "fts" | "vec"; // Search source (full-text or vector)
  chunkPos?: number; // Character position of matching chunk (for vector search)
};
/**
 * Ranked result for RRF fusion (simplified, used internally).
 * Unlike DocumentResult, the path field is named `file` and `body` is required.
 */
export type RankedResult = {
  file: string; // Identifier of the ranked document
  displayPath: string; // Path to show to the user
  title: string; // Document title
  body: string; // Document text
  score: number; // Ranking score
};
/**
 * Error result when document is not found.
 * The literal `error` field distinguishes this shape from DocumentResult in
 * the unions returned by findDocument() and getDocument().
 */
export type DocumentNotFound = {
  error: "not_found"; // Fixed discriminant
  query: string; // The lookup string that failed to resolve
  similarFiles: string[]; // Suggested alternatives - presumably from findSimilarFiles(); confirm in findDocument
};
/**
 * Result from multi-get operations, discriminated on `skipped`:
 * - skipped: false - `doc` is a full DocumentResult.
 * - skipped: true  - `doc` carries only filepath and displayPath, and
 *   `skipReason` explains why the document was left out.
 */
export type MultiGetResult = {
  doc: DocumentResult;
  skipped: false;
} | {
  doc: Pick<DocumentResult, "filepath" | "displayPath">;
  skipped: true;
  skipReason: string;
};
/**
 * Per-collection summary entry; IndexStatus.collections is a list of these.
 * NOTE(review): the code that populates this type is outside this section,
 * so the field notes below are read from the names only - confirm there.
 */
export type CollectionInfo = {
  id: number;
  path: string; // presumably the collection's root directory
  pattern: string; // presumably the collection's glob pattern
  documents: number; // presumably a document count
  lastUpdated: string; // presumably a timestamp string
};
/**
 * Overall index summary with one CollectionInfo entry per collection.
 * NOTE(review): presumably what getStatus() returns - the producer is outside
 * this section, so confirm the exact meaning of each counter there.
 */
export type IndexStatus = {
  totalDocuments: number;
  needsEmbedding: number;
  hasVectorIndex: boolean;
  collections: CollectionInfo[];
};
  770. // =============================================================================
  771. // Index health
  772. // =============================================================================
  773. export function getHashesNeedingEmbedding(db: Database): number {
  774. const result = db.prepare(`
  775. SELECT COUNT(DISTINCT d.hash) as count
  776. FROM documents d
  777. LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
  778. WHERE d.active = 1 AND v.hash IS NULL
  779. `).get() as { count: number };
  780. return result.count;
  781. }
/**
 * Snapshot returned by getIndexHealth().
 */
export type IndexHealthInfo = {
  needsEmbedding: number; // Result of getHashesNeedingEmbedding()
  totalDocs: number; // Number of active documents
  daysStale: number | null; // Whole days since the newest modified_at among active documents; null when there is none
};
  787. export function getIndexHealth(db: Database): IndexHealthInfo {
  788. const needsEmbedding = getHashesNeedingEmbedding(db);
  789. const totalDocs = (db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number }).count;
  790. const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };
  791. let daysStale: number | null = null;
  792. if (mostRecent?.latest) {
  793. const lastUpdate = new Date(mostRecent.latest);
  794. daysStale = Math.floor((Date.now() - lastUpdate.getTime()) / (24 * 60 * 60 * 1000));
  795. }
  796. return { needsEmbedding, totalDocs, daysStale };
  797. }
  798. // =============================================================================
  799. // Caching
  800. // =============================================================================
  801. export function getCacheKey(url: string, body: object): string {
  802. const hash = new Bun.CryptoHasher("sha256");
  803. hash.update(url);
  804. hash.update(JSON.stringify(body));
  805. return hash.digest("hex");
  806. }
  807. export function getCachedResult(db: Database, cacheKey: string): string | null {
  808. const row = db.prepare(`SELECT result FROM ollama_cache WHERE hash = ?`).get(cacheKey) as { result: string } | null;
  809. return row?.result || null;
  810. }
  811. export function setCachedResult(db: Database, cacheKey: string, result: string): void {
  812. const now = new Date().toISOString();
  813. db.prepare(`INSERT OR REPLACE INTO ollama_cache (hash, result, created_at) VALUES (?, ?, ?)`).run(cacheKey, result, now);
  814. if (Math.random() < 0.01) {
  815. db.exec(`DELETE FROM ollama_cache WHERE hash NOT IN (SELECT hash FROM ollama_cache ORDER BY created_at DESC LIMIT 1000)`);
  816. }
  817. }
  818. export function clearCache(db: Database): void {
  819. db.exec(`DELETE FROM ollama_cache`);
  820. }
  821. // =============================================================================
  822. // Cleanup and maintenance operations
  823. // =============================================================================
  824. /**
  825. * Delete cached Ollama API responses.
  826. * Returns the number of cached responses deleted.
  827. */
  828. export function deleteOllamaCache(db: Database): number {
  829. const result = db.prepare(`DELETE FROM ollama_cache`).run();
  830. return result.changes;
  831. }
  832. /**
  833. * Remove inactive document records (active = 0).
  834. * Returns the number of inactive documents deleted.
  835. */
  836. export function deleteInactiveDocuments(db: Database): number {
  837. const result = db.prepare(`DELETE FROM documents WHERE active = 0`).run();
  838. return result.changes;
  839. }
  840. /**
  841. * Remove orphaned content hashes that are not referenced by any active document.
  842. * Returns the number of orphaned content hashes deleted.
  843. */
  844. export function cleanupOrphanedContent(db: Database): number {
  845. const result = db.prepare(`
  846. DELETE FROM content
  847. WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
  848. `).run();
  849. return result.changes;
  850. }
  851. /**
  852. * Remove orphaned vector embeddings that are not referenced by any active document.
  853. * Returns the number of orphaned embedding chunks deleted.
  854. */
  855. export function cleanupOrphanedVectors(db: Database): number {
  856. // Check if vectors_vec table exists
  857. const tableExists = db.prepare(`
  858. SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'
  859. `).get();
  860. if (!tableExists) {
  861. return 0;
  862. }
  863. // Count orphaned vectors first
  864. const countResult = db.prepare(`
  865. SELECT COUNT(*) as c FROM content_vectors cv
  866. WHERE NOT EXISTS (
  867. SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
  868. )
  869. `).get() as { c: number };
  870. if (countResult.c === 0) {
  871. return 0;
  872. }
  873. // Delete from vectors_vec first
  874. db.exec(`
  875. DELETE FROM vectors_vec WHERE hash_seq IN (
  876. SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
  877. WHERE NOT EXISTS (
  878. SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
  879. )
  880. )
  881. `);
  882. // Delete from content_vectors
  883. db.exec(`
  884. DELETE FROM content_vectors WHERE hash NOT IN (
  885. SELECT hash FROM documents WHERE active = 1
  886. )
  887. `);
  888. return countResult.c;
  889. }
  890. /**
  891. * Remove duplicate collections, keeping the oldest one per (pwd, glob_pattern).
  892. * Also removes bogus "." glob pattern entries.
  893. * Returns the number of duplicate collections removed.
  894. */
  895. export function cleanupDuplicateCollections(db: Database): number {
  896. // Count duplicates before removal
  897. const beforeCount = (db.prepare(`SELECT COUNT(*) as c FROM collections`).get() as { c: number }).c;
  898. // Remove duplicates keeping the oldest one
  899. db.exec(`
  900. DELETE FROM collections WHERE id NOT IN (
  901. SELECT MIN(id) FROM collections GROUP BY pwd, glob_pattern
  902. )
  903. `);
  904. // Remove bogus "." glob pattern entries (from earlier bug)
  905. db.exec(`DELETE FROM collections WHERE glob_pattern = '.'`);
  906. const afterCount = (db.prepare(`SELECT COUNT(*) as c FROM collections`).get() as { c: number }).c;
  907. return beforeCount - afterCount;
  908. }
  909. /**
  910. * Run VACUUM to reclaim unused space in the database.
  911. * This operation rebuilds the database file to eliminate fragmentation.
  912. */
  913. export function vacuumDatabase(db: Database): void {
  914. db.exec(`VACUUM`);
  915. }
  916. // =============================================================================
  917. // Document helpers
  918. // =============================================================================
  919. export async function hashContent(content: string): Promise<string> {
  920. const hash = new Bun.CryptoHasher("sha256");
  921. hash.update(content);
  922. return hash.digest("hex");
  923. }
  924. export function extractTitle(content: string, filename: string): string {
  925. const match = content.match(/^##?\s+(.+)$/m);
  926. if (match) {
  927. const title = match[1].trim();
  928. if (title === "📝 Notes" || title === "Notes") {
  929. const nextMatch = content.match(/^##\s+(.+)$/m);
  930. if (nextMatch) return nextMatch[1].trim();
  931. }
  932. return title;
  933. }
  934. return filename.replace(/\.md$/, "").split("/").pop() || filename;
  935. }
  936. // =============================================================================
  937. // Document indexing operations
  938. // =============================================================================
  939. /**
  940. * Insert content into the content table (content-addressable storage).
  941. * Uses INSERT OR IGNORE so duplicate hashes are skipped.
  942. */
  943. export function insertContent(db: Database, hash: string, content: string, createdAt: string): void {
  944. db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
  945. .run(hash, content, createdAt);
  946. }
  947. /**
  948. * Insert a new document into the documents table.
  949. */
  950. export function insertDocument(
  951. db: Database,
  952. collectionName: string,
  953. path: string,
  954. title: string,
  955. hash: string,
  956. createdAt: string,
  957. modifiedAt: string
  958. ): void {
  959. db.prepare(`
  960. INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active)
  961. VALUES (?, ?, ?, ?, ?, ?, 1)
  962. `).run(collectionName, path, title, hash, createdAt, modifiedAt);
  963. }
  964. /**
  965. * Find an active document by collection name and path.
  966. */
  967. export function findActiveDocument(
  968. db: Database,
  969. collectionName: string,
  970. path: string
  971. ): { id: number; hash: string; title: string } | null {
  972. return db.prepare(`
  973. SELECT id, hash, title FROM documents
  974. WHERE collection = ? AND path = ? AND active = 1
  975. `).get(collectionName, path) as { id: number; hash: string; title: string } | null;
  976. }
  977. /**
  978. * Update the title and modified_at timestamp for a document.
  979. */
  980. export function updateDocumentTitle(
  981. db: Database,
  982. documentId: number,
  983. title: string,
  984. modifiedAt: string
  985. ): void {
  986. db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`)
  987. .run(title, modifiedAt, documentId);
  988. }
  989. /**
  990. * Update an existing document's hash, title, and modified_at timestamp.
  991. * Used when content changes but the file path stays the same.
  992. */
  993. export function updateDocument(
  994. db: Database,
  995. documentId: number,
  996. title: string,
  997. hash: string,
  998. modifiedAt: string
  999. ): void {
  1000. db.prepare(`UPDATE documents SET title = ?, hash = ?, modified_at = ? WHERE id = ?`)
  1001. .run(title, hash, modifiedAt, documentId);
  1002. }
  1003. /**
  1004. * Deactivate a document (mark as inactive but don't delete).
  1005. */
  1006. export function deactivateDocument(db: Database, collectionName: string, path: string): void {
  1007. db.prepare(`UPDATE documents SET active = 0 WHERE collection = ? AND path = ? AND active = 1`)
  1008. .run(collectionName, path);
  1009. }
  1010. /**
  1011. * Get all active document paths for a collection.
  1012. */
  1013. export function getActiveDocumentPaths(db: Database, collectionName: string): string[] {
  1014. const rows = db.prepare(`
  1015. SELECT path FROM documents WHERE collection = ? AND active = 1
  1016. `).all(collectionName) as { path: string }[];
  1017. return rows.map(r => r.path);
  1018. }
  1019. // Re-export from llm.ts for backwards compatibility
  1020. export { formatQueryForEmbedding, formatDocForEmbedding };
  1021. export function chunkDocument(content: string, maxBytes: number = CHUNK_BYTE_SIZE): { text: string; pos: number }[] {
  1022. const encoder = new TextEncoder();
  1023. const totalBytes = encoder.encode(content).length;
  1024. if (totalBytes <= maxBytes) {
  1025. return [{ text: content, pos: 0 }];
  1026. }
  1027. const chunks: { text: string; pos: number }[] = [];
  1028. let charPos = 0;
  1029. while (charPos < content.length) {
  1030. let endPos = charPos;
  1031. let byteCount = 0;
  1032. while (endPos < content.length && byteCount < maxBytes) {
  1033. const charBytes = encoder.encode(content[endPos]).length;
  1034. if (byteCount + charBytes > maxBytes) break;
  1035. byteCount += charBytes;
  1036. endPos++;
  1037. }
  1038. if (endPos < content.length && endPos > charPos) {
  1039. const slice = content.slice(charPos, endPos);
  1040. const paragraphBreak = slice.lastIndexOf('\n\n');
  1041. const sentenceEnd = Math.max(
  1042. slice.lastIndexOf('. '),
  1043. slice.lastIndexOf('.\n'),
  1044. slice.lastIndexOf('? '),
  1045. slice.lastIndexOf('?\n'),
  1046. slice.lastIndexOf('! '),
  1047. slice.lastIndexOf('!\n')
  1048. );
  1049. const lineBreak = slice.lastIndexOf('\n');
  1050. const spaceBreak = slice.lastIndexOf(' ');
  1051. let breakPoint = -1;
  1052. if (paragraphBreak > slice.length * 0.5) {
  1053. breakPoint = paragraphBreak + 2;
  1054. } else if (sentenceEnd > slice.length * 0.5) {
  1055. breakPoint = sentenceEnd + 2;
  1056. } else if (lineBreak > slice.length * 0.3) {
  1057. breakPoint = lineBreak + 1;
  1058. } else if (spaceBreak > slice.length * 0.3) {
  1059. breakPoint = spaceBreak + 1;
  1060. }
  1061. if (breakPoint > 0) {
  1062. endPos = charPos + breakPoint;
  1063. }
  1064. }
  1065. if (endPos <= charPos) {
  1066. endPos = charPos + 1;
  1067. }
  1068. chunks.push({ text: content.slice(charPos, endPos), pos: charPos });
  1069. charPos = endPos;
  1070. }
  1071. return chunks;
  1072. }
  1073. // =============================================================================
  1074. // Fuzzy matching
  1075. // =============================================================================
  1076. function levenshtein(a: string, b: string): number {
  1077. const m = a.length, n = b.length;
  1078. if (m === 0) return n;
  1079. if (n === 0) return m;
  1080. const dp: number[][] = Array.from({ length: m + 1 }, (_, i) => [i]);
  1081. for (let j = 1; j <= n; j++) dp[0][j] = j;
  1082. for (let i = 1; i <= m; i++) {
  1083. for (let j = 1; j <= n; j++) {
  1084. const cost = a[i - 1] === b[j - 1] ? 0 : 1;
  1085. dp[i][j] = Math.min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost);
  1086. }
  1087. }
  1088. return dp[m][n];
  1089. }
  1090. export function findSimilarFiles(db: Database, query: string, maxDistance: number = 3, limit: number = 5): string[] {
  1091. const allFiles = db.prepare(`
  1092. SELECT 'qmd://' || d.collection || '/' || d.path as display_path
  1093. FROM documents d
  1094. WHERE d.active = 1
  1095. `).all() as { display_path: string }[];
  1096. const queryLower = query.toLowerCase();
  1097. const scored = allFiles
  1098. .map(f => ({ path: f.display_path, dist: levenshtein(f.display_path.toLowerCase(), queryLower) }))
  1099. .filter(f => f.dist <= maxDistance)
  1100. .sort((a, b) => a.dist - b.dist)
  1101. .slice(0, limit);
  1102. return scored.map(f => f.path);
  1103. }
  1104. export function matchFilesByGlob(db: Database, pattern: string): { filepath: string; displayPath: string; bodyLength: number }[] {
  1105. const allFiles = db.prepare(`
  1106. SELECT
  1107. 'qmd://' || d.collection || '/' || d.path as virtual_path,
  1108. LENGTH(content.doc) as body_length,
  1109. d.path
  1110. FROM documents d
  1111. JOIN content ON content.hash = d.hash
  1112. WHERE d.active = 1
  1113. `).all() as { virtual_path: string; body_length: number; path: string }[];
  1114. const glob = new Glob(pattern);
  1115. return allFiles
  1116. .filter(f => glob.match(f.virtual_path) || glob.match(f.path))
  1117. .map(f => ({
  1118. filepath: f.virtual_path, // Use virtual path as filepath
  1119. displayPath: f.virtual_path,
  1120. bodyLength: f.body_length
  1121. }));
  1122. }
  1123. // =============================================================================
  1124. // Context
  1125. // =============================================================================
  1126. /**
  1127. * Get context for a file path using hierarchical inheritance.
  1128. * Contexts are collection-scoped and inherit from parent directories.
  1129. * For example, context at "/talks" applies to "/talks/2024/keynote.md".
  1130. *
  1131. * @param db Database instance (unused - kept for compatibility)
  1132. * @param collectionName Collection name
  1133. * @param path Relative path within the collection
  1134. * @returns Context string or null if no context is defined
  1135. */
  1136. export function getContextForPath(db: Database, collectionName: string, path: string): string | null {
  1137. const context = collectionsFindContextForPath(collectionName, path);
  1138. return context || null;
  1139. }
  1140. /**
  1141. * Legacy function for backward compatibility - resolves filepath to collection+path first
  1142. */
  1143. export function getContextForFile(db: Database, filepath: string): string | null {
  1144. // Get all collections from YAML config
  1145. const collections = collectionsListCollections();
  1146. // Find which collection this absolute path belongs to
  1147. for (const coll of collections) {
  1148. if (filepath.startsWith(coll.path + '/') || filepath === coll.path) {
  1149. // Extract relative path
  1150. const relativePath = filepath.startsWith(coll.path + '/')
  1151. ? filepath.slice(coll.path.length + 1)
  1152. : '';
  1153. // Verify this document exists in the database
  1154. const doc = db.prepare(`
  1155. SELECT d.path
  1156. FROM documents d
  1157. WHERE d.collection = ? AND d.path = ? AND d.active = 1
  1158. LIMIT 1
  1159. `).get(coll.name, relativePath) as { path: string } | null;
  1160. if (doc) {
  1161. // Use collections.ts to find context
  1162. const context = collectionsFindContextForPath(coll.name, relativePath);
  1163. return context || null;
  1164. }
  1165. }
  1166. }
  1167. return null;
  1168. }
  1169. /**
  1170. * Get collection by name from YAML config.
  1171. * Returns collection metadata from ~/.config/qmd/index.yml
  1172. */
  1173. export function getCollectionByName(db: Database, name: string): { name: string; pwd: string; glob_pattern: string } | null {
  1174. const collection = getCollection(name);
  1175. if (!collection) return null;
  1176. return {
  1177. name: collection.name,
  1178. pwd: collection.path,
  1179. glob_pattern: collection.pattern,
  1180. };
  1181. }
  1182. /**
  1183. * List all collections with document counts from database.
  1184. * Merges YAML config with database statistics.
  1185. */
  1186. export function listCollections(db: Database): { name: string; pwd: string; glob_pattern: string; doc_count: number; active_count: number; last_modified: string | null }[] {
  1187. const collections = collectionsListCollections();
  1188. // Get document counts from database for each collection
  1189. const result = collections.map(coll => {
  1190. const stats = db.prepare(`
  1191. SELECT
  1192. COUNT(d.id) as doc_count,
  1193. SUM(CASE WHEN d.active = 1 THEN 1 ELSE 0 END) as active_count,
  1194. MAX(d.modified_at) as last_modified
  1195. FROM documents d
  1196. WHERE d.collection = ?
  1197. `).get(coll.name) as { doc_count: number; active_count: number; last_modified: string | null } | null;
  1198. return {
  1199. name: coll.name,
  1200. pwd: coll.path,
  1201. glob_pattern: coll.pattern,
  1202. doc_count: stats?.doc_count || 0,
  1203. active_count: stats?.active_count || 0,
  1204. last_modified: stats?.last_modified || null,
  1205. };
  1206. });
  1207. return result;
  1208. }
  1209. /**
  1210. * Remove a collection and clean up its documents.
  1211. * Uses collections.ts to remove from YAML config and cleans up database.
  1212. */
  1213. export function removeCollection(db: Database, collectionName: string): { deletedDocs: number; cleanedHashes: number } {
  1214. // Delete documents from database
  1215. const docResult = db.prepare(`DELETE FROM documents WHERE collection = ?`).run(collectionName);
  1216. // Clean up orphaned content hashes
  1217. const cleanupResult = db.prepare(`
  1218. DELETE FROM content
  1219. WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
  1220. `).run();
  1221. // Remove from YAML config (returns true if found and removed)
  1222. collectionsRemoveCollection(collectionName);
  1223. return {
  1224. deletedDocs: docResult.changes,
  1225. cleanedHashes: cleanupResult.changes
  1226. };
  1227. }
  1228. /**
  1229. * Rename a collection.
  1230. * Updates both YAML config and database documents table.
  1231. */
  1232. export function renameCollection(db: Database, oldName: string, newName: string): void {
  1233. // Update all documents with the new collection name in database
  1234. db.prepare(`UPDATE documents SET collection = ? WHERE collection = ?`)
  1235. .run(newName, oldName);
  1236. // Rename in YAML config
  1237. collectionsRenameCollection(oldName, newName);
  1238. }
  1239. // =============================================================================
  1240. // Context Management Operations
  1241. // =============================================================================
  1242. /**
  1243. * Insert or update a context for a specific collection and path prefix.
  1244. */
  1245. export function insertContext(db: Database, collectionId: number, pathPrefix: string, context: string): void {
  1246. // Get collection name from ID
  1247. const coll = db.prepare(`SELECT name FROM collections WHERE id = ?`).get(collectionId) as { name: string } | null;
  1248. if (!coll) {
  1249. throw new Error(`Collection with id ${collectionId} not found`);
  1250. }
  1251. // Use collections.ts to add context
  1252. collectionsAddContext(coll.name, pathPrefix, context);
  1253. }
  1254. /**
  1255. * Delete a context for a specific collection and path prefix.
  1256. * Returns the number of contexts deleted.
  1257. */
  1258. export function deleteContext(db: Database, collectionId: number, pathPrefix: string): number {
  1259. // Get collection name from ID
  1260. const coll = db.prepare(`SELECT name FROM collections WHERE id = ?`).get(collectionId) as { name: string } | null;
  1261. if (!coll) {
  1262. return 0;
  1263. }
  1264. // Use collections.ts to remove context
  1265. const success = collectionsRemoveContext(coll.name, pathPrefix);
  1266. return success ? 1 : 0;
  1267. }
  /**
   * Delete all global contexts: the global context itself plus the root
   * (empty path prefix) context of every configured collection.
   *
   * @param db - Not used by this function; kept for interface symmetry.
   * @returns Number of contexts counted as deleted. Clearing the global context
   *   always counts as one, whether or not one was set; each collection whose
   *   root context removal succeeds adds one more.
   */
  export function deleteGlobalContexts(db: Database): number {
    // Clearing the global context is unconditionally counted as one deletion.
    setGlobalContext(undefined);
    let removed = 1;

    for (const { name } of collectionsListCollections()) {
      if (collectionsRemoveContext(name, '')) {
        removed += 1;
      }
    }

    return removed;
  }
  /**
   * List every configured context as a flat, sorted array.
   *
   * Sort order: collection name, then longest path prefix first, then path
   * prefix alphabetically.
   *
   * @param db - Not used by this function; contexts come from `collectionsListAllContexts`.
   * @returns One entry per context with its collection name, path prefix and text.
   */
  export function listPathContexts(db: Database): { collection_name: string; path_prefix: string; context: string }[] {
    type Entry = { collection_name: string; path_prefix: string; context: string };

    const compareEntries = (left: Entry, right: Entry): number => {
      if (left.collection_name === right.collection_name) {
        // Same collection: most specific (longest) prefix first, ties alphabetical.
        const lengthGap = right.path_prefix.length - left.path_prefix.length;
        return lengthGap !== 0 ? lengthGap : left.path_prefix.localeCompare(right.path_prefix);
      }
      return left.collection_name.localeCompare(right.collection_name);
    };

    const entries: Entry[] = collectionsListAllContexts().map(ctx => ({
      collection_name: ctx.collection,
      path_prefix: ctx.path,
      context: ctx.context,
    }));
    return entries.sort(compareEntries);
  }
  /**
   * Get the names of all collections returned by `collectionsListCollections`.
   *
   * @param db - Not used by this function.
   * @returns One `{ name }` object per collection, in the order they are listed.
   */
  export function getAllCollections(db: Database): { name: string }[] {
    const names: { name: string }[] = [];
    for (const { name } of collectionsListCollections()) {
      names.push({ name });
    }
    return names;
  }
  /**
   * Find collections that have no context entries at all (not even a root context).
   *
   * Collections come from `collectionsListCollections`; the active-document
   * count for each reported collection is read from the `documents` table.
   *
   * @param db - Database used for the per-collection document count.
   * @returns Context-less collections sorted by name, each with its path (`pwd`)
   *   and active document count.
   */
  export function getCollectionsWithoutContext(db: Database): { name: string; pwd: string; doc_count: number }[] {
    const countSql = `
      SELECT COUNT(d.id) as doc_count
      FROM documents d
      WHERE d.collection = ? AND d.active = 1
    `;

    return collectionsListCollections()
      .filter(coll => !coll.context || Object.keys(coll.context).length === 0)
      .map(coll => {
        const stats = db.prepare(countSql).get(coll.name) as { doc_count: number } | null;
        return {
          name: coll.name,
          pwd: coll.path,
          doc_count: stats?.doc_count || 0,
        };
      })
      .sort((left, right) => left.name.localeCompare(right.name));
  }
  /**
   * Get the top-level directories of a collection that are not covered by a context.
   * Useful for suggesting where context might be needed.
   *
   * A directory counts as covered when a context exists for exactly that
   * directory, or when the collection has a root context (empty prefix).
   * Leading and trailing slashes on configured prefixes are ignored, so
   * `docs`, `/docs` and `docs/` all cover the `docs` directory. Contexts on
   * deeper paths (e.g. `docs/api`) do not cover the top-level directory.
   *
   * @param db - Database used to list the collection's active document paths.
   * @param collectionName - Name of the collection to inspect.
   * @returns Sorted top-level directory names without context; `[]` when the
   *   collection is not known to `getCollection` or has a root context.
   */
  export function getTopLevelPathsWithoutContext(db: Database, collectionName: string): string[] {
    // Check the config first so an unknown collection costs no database query.
    const yamlColl = getCollection(collectionName);
    if (!yamlColl) return [];

    const coveredPrefixes = new Set<string>();
    for (const prefix of Object.keys(yamlColl.context ?? {})) {
      coveredPrefixes.add(prefix.replace(/^\/+|\/+$/g, ''));
    }
    // A root context ('' once slashes are stripped) covers every directory.
    if (coveredPrefixes.has('')) return [];

    const paths = db.prepare(`
      SELECT DISTINCT path FROM documents
      WHERE collection = ? AND active = 1
    `).all(collectionName) as { path: string }[];

    // Only paths with at least one directory component contribute a top-level dir;
    // files sitting directly in the collection root are ignored.
    const topLevelDirs = new Set<string>();
    for (const { path } of paths) {
      const parts = path.split('/').filter(Boolean);
      if (parts.length > 1) {
        topLevelDirs.add(parts[0]);
      }
    }

    return [...topLevelDirs].filter(dir => !coveredPrefixes.has(dir)).sort();
  }
  // =============================================================================
  // FTS Search
  // =============================================================================
  /**
   * Reduce a raw search term to characters that are safe to embed in a quoted
   * FTS5 string: Unicode letters, Unicode digits and apostrophes. Everything
   * else (including double quotes) is removed, and the result is lower-cased.
   *
   * @param term - Raw whitespace-delimited term from the user's query.
   * @returns The cleaned term; may be empty.
   */
  function sanitizeFTS5Term(term: string): string {
    const disallowed = /[^\p{L}\p{N}']/gu;
    const kept = term.replace(disallowed, '');
    return kept.toLowerCase();
  }
  /**
   * Turn a free-form query into an FTS5 MATCH expression.
   *
   * The query is split on whitespace, each term is sanitized, and every
   * surviving term becomes a quoted prefix match (`"term"*`). Multiple terms are
   * combined with AND.
   *
   * @param query - Raw user query.
   * @returns The MATCH expression, or `null` when no term survives sanitizing.
   */
  function buildFTS5Query(query: string): string | null {
    const prefixClauses: string[] = [];
    for (const rawTerm of query.split(/\s+/)) {
      const term = sanitizeFTS5Term(rawTerm);
      if (term.length > 0) {
        prefixClauses.push(`"${term}"*`);
      }
    }
    // Joining a single clause yields that clause unchanged, so one path covers both cases.
    return prefixClauses.length > 0 ? prefixClauses.join(' AND ') : null;
  }
  /**
   * Full-text search over active documents using the `documents_fts` FTS5 table.
   *
   * Rows are ranked with `bm25(documents_fts, 10.0, 1.0)` and ordered by that
   * value ascending. Returned scores are |bm25| divided by the largest |bm25| in
   * the result set, so they fall in the range [0, 1].
   *
   * @param db - Index database.
   * @param query - Free-form query; converted to an FTS5 expression by `buildFTS5Query`.
   * @param limit - Maximum number of rows to return (default 20).
   * @param collectionId - Optional id in the `collections` table; restricts
   *   results to that collection.
   * @returns Matching documents in rank order. `[]` when the query has no usable
   *   terms, or when `collectionId` is given but matches no collection (the
   *   filter is never silently dropped).
   */
  export function searchFTS(db: Database, query: string, limit: number = 20, collectionId?: number): SearchResult[] {
    const ftsQuery = buildFTS5Query(query);
    if (!ftsQuery) return [];

    let sql = `
      SELECT
        'qmd://' || d.collection || '/' || d.path as filepath,
        'qmd://' || d.collection || '/' || d.path as display_path,
        d.title,
        content.doc as body,
        bm25(documents_fts, 10.0, 1.0) as score
      FROM documents_fts f
      JOIN documents d ON d.id = f.rowid
      JOIN content ON content.hash = d.hash
      WHERE documents_fts MATCH ? AND d.active = 1
    `;
    const params: (string | number)[] = [ftsQuery];

    if (collectionId !== undefined) {
      // Documents reference their collection by name, so translate the id first.
      const coll = db.prepare(`SELECT name FROM collections WHERE id = ?`).get(collectionId) as { name: string } | null;
      // A filter that cannot be resolved must yield nothing rather than everything.
      if (!coll) return [];
      sql += ` AND d.collection = ?`;
      params.push(coll.name);
    }

    sql += ` ORDER BY score LIMIT ?`;
    params.push(limit);

    const rows = db.prepare(sql).all(...params) as { filepath: string; display_path: string; title: string; body: string; score: number }[];

    // Fall back to 1 when there are no rows or every score is 0, so the
    // normalization below never divides by zero.
    const maxScore = rows.reduce((peak, row) => Math.max(peak, Math.abs(row.score)), 0) || 1;

    return rows.map(row => ({
      filepath: row.filepath,
      displayPath: row.display_path,
      title: row.title,
      hash: "", // Not available in FTS query
      collectionName: row.filepath.split('//')[1]?.split('/')[0] || "", // Extract from virtual path
      modifiedAt: "", // Not available in FTS query
      bodyLength: row.body.length,
      body: row.body,
      context: null, // Not loaded in FTS
      score: Math.abs(row.score) / maxScore,
      source: "fts" as const,
    }));
  }
  // =============================================================================
  // Vector Search
  // =============================================================================
  /**
   * Vector similarity search over embedded document chunks (`vectors_vec`).
   *
   * The query text is embedded with `getEmbedding`, the nearest chunks are
   * fetched, and chunks are collapsed to one result per file, keeping the
   * closest chunk of each file.
   *
   * @param db - Index database.
   * @param query - Text to embed and search for.
   * @param model - Embedding model name passed to `getEmbedding`.
   * @param limit - Maximum number of files to return (default 20).
   * @param collectionId - Optional id in the `collections` table; restricts
   *   results to that collection.
   * @returns Up to `limit` files ordered by ascending distance, scored as
   *   `1 / (1 + distance)`, with `chunkPos` set to the best chunk's position.
   *   `[]` when the `vectors_vec` table does not exist, when no embedding could
   *   be produced, or when `collectionId` matches no collection.
   */
  export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionId?: number): Promise<SearchResult[]> {
    const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (!tableExists) return [];

    // Resolve the optional collection filter before paying for an embedding call.
    let collectionName: string | undefined;
    if (collectionId !== undefined) {
      const coll = db.prepare(`SELECT name FROM collections WHERE id = ?`).get(collectionId) as { name: string } | null;
      // A filter that cannot be resolved must yield nothing rather than everything.
      if (!coll) return [];
      collectionName = coll.name;
    }

    const embedding = await getEmbedding(query, model, true);
    if (!embedding) return [];

    // sqlite-vec requires "k = ?" for KNN queries
    let sql = `
      SELECT
        v.hash_seq,
        v.distance,
        'qmd://' || d.collection || '/' || d.path as filepath,
        'qmd://' || d.collection || '/' || d.path as display_path,
        d.title,
        content.doc as body,
        cv.pos
      FROM vectors_vec v
      JOIN content_vectors cv ON cv.hash || '_' || cv.seq = v.hash_seq
      JOIN documents d ON d.hash = cv.hash AND d.active = 1
      JOIN content ON content.hash = d.hash
      WHERE v.embedding MATCH ? AND k = ?
    `;
    // k is limit * 3 — presumably to leave room for several chunks per file
    // being collapsed into one result below.
    const params: (Float32Array | number | string)[] = [new Float32Array(embedding), limit * 3];

    if (collectionName !== undefined) {
      // Bound as a parameter: names containing quotes must not alter the SQL.
      sql += ` AND d.collection = ?`;
      params.push(collectionName);
    }
    sql += ` ORDER BY v.distance`;

    const rows = db.prepare(sql).all(...params) as { hash_seq: string; distance: number; filepath: string; display_path: string; title: string; body: string; pos: number }[];

    // Keep only the closest chunk for each file.
    const seen = new Map<string, { row: typeof rows[0]; bestDist: number }>();
    for (const row of rows) {
      const existing = seen.get(row.filepath);
      if (!existing || row.distance < existing.bestDist) {
        seen.set(row.filepath, { row, bestDist: row.distance });
      }
    }

    return Array.from(seen.values())
      .sort((a, b) => a.bestDist - b.bestDist)
      .slice(0, limit)
      .map(({ row }) => ({
        filepath: row.filepath,
        displayPath: row.display_path,
        title: row.title,
        hash: "", // Not available in vec query
        collectionName: row.filepath.split('//')[1]?.split('/')[0] || "", // Extract from virtual path
        modifiedAt: "", // Not available in vec query
        bodyLength: row.body.length,
        body: row.body,
        context: null, // Not loaded in vec
        score: 1 / (1 + row.distance),
        source: "vec" as const,
        chunkPos: row.pos,
      }));
  }
  // =============================================================================
  // Embeddings
  // =============================================================================
  /**
   * Embed a piece of text with the default Ollama client.
   *
   * @param text - Text to embed.
   * @param model - Embedding model name.
   * @param isQuery - Forwarded to the client's `embed` options.
   * @returns The embedding vector, or `null` when the client returns no result
   *   or a result without an embedding.
   */
  async function getEmbedding(text: string, model: string, isQuery: boolean): Promise<number[] | null> {
    const response = await getDefaultOllama().embed(text, { model, isQuery });
    const vector = response?.embedding;
    return vector ? vector : null;
  }
  /**
   * Get every unique content hash of an active document that still needs an
   * embedding, i.e. has no `content_vectors` row with `seq = 0`.
   *
   * @param db - Index database.
   * @returns One row per hash with the document body and a sample path (the
   *   smallest path among documents sharing the hash) for display purposes.
   */
  export function getHashesForEmbedding(db: Database): { hash: string; body: string; path: string }[] {
    const pendingHashes = db.prepare(`
      SELECT d.hash, c.doc as body, MIN(d.path) as path
      FROM documents d
      JOIN content c ON d.hash = c.hash
      LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
      WHERE d.active = 1 AND v.hash IS NULL
      GROUP BY d.hash
    `);
    const rows = pendingHashes.all();
    return rows as { hash: string; body: string; path: string }[];
  }
  /**
   * Clear all embeddings from the database (forces a full re-index).
   *
   * Deletes every row of `content_vectors`, then drops the `vectors_vec` table
   * if it exists.
   *
   * @param db - Index database.
   */
  export function clearAllEmbeddings(db: Database): void {
    const statements = [
      `DELETE FROM content_vectors`,
      `DROP TABLE IF EXISTS vectors_vec`,
    ];
    for (const statement of statements) {
      db.exec(statement);
    }
  }
  /**
   * Store one chunk embedding in both `vectors_vec` and `content_vectors`.
   *
   * The `vectors_vec` row is keyed by `"<hash>_<seq>"`. Both writes use
   * INSERT OR REPLACE, so re-inserting the same chunk overwrites it.
   *
   * @param db - Index database.
   * @param hash - Content hash the chunk belongs to.
   * @param seq - Sequence number of the chunk within that content.
   * @param pos - Position of the chunk, stored in `content_vectors.pos`.
   * @param embedding - Embedding vector for the chunk.
   * @param model - Name of the model that produced the embedding.
   * @param embeddedAt - Timestamp stored in `content_vectors.embedded_at`.
   */
  export function insertEmbedding(
    db: Database,
    hash: string,
    seq: number,
    pos: number,
    embedding: Float32Array,
    model: string,
    embeddedAt: string
  ): void {
    // Prepare both statements up front, then write the vector before its metadata.
    const writers = {
      vector: db.prepare(`INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`),
      metadata: db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`),
    };

    writers.vector.run(`${hash}_${seq}`, embedding);
    writers.metadata.run(hash, seq, pos, model, embeddedAt);
  }
  // =============================================================================
  // Query expansion
  // =============================================================================
  /**
   * Expand a query into alternative phrasings, using the result cache when possible.
   *
   * On a cache hit the cached text is split into non-empty trimmed lines and at
   * most two of them are returned after the original query. On a miss the
   * default Ollama client is asked for 2 expansions; when it returns more than
   * one entry, everything after the first entry is cached, one per line.
   *
   * @param query - Original user query.
   * @param model - Expansion model name (defaults to `DEFAULT_QUERY_MODEL`).
   * @param db - Database backing the result cache.
   * @returns On a cache hit, the original query followed by up to two cached
   *   variants; otherwise the client's result as-is.
   */
  export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database): Promise<string[]> {
    const cacheKey = getCacheKey("expandQuery", { query, model });
    const cachedText = getCachedResult(db, cacheKey);

    if (!cachedText) {
      const expanded = await getDefaultOllama().expandQuery(query, model, 2);
      // Only the variants are cached; the first entry is excluded.
      if (expanded.length > 1) {
        setCachedResult(db, cacheKey, expanded.slice(1).join('\n'));
      }
      return expanded;
    }

    const variants: string[] = [];
    for (const line of cachedText.split('\n')) {
      const trimmed = line.trim();
      if (trimmed.length > 0) {
        variants.push(trimmed);
      }
    }
    return [query, ...variants.slice(0, 2)];
  }
  // =============================================================================
  // Reranking
  // =============================================================================
  /**
   * Score documents against a query with the reranking model, caching each score.
   *
   * Scores are cached per (query, file, model). Only documents without a cached
   * score are sent to the default Ollama client; the scores it returns are
   * written back to the cache.
   *
   * @param query - Query the documents are ranked against.
   * @param documents - Candidate documents (`file` identifier and `text`).
   * @param model - Rerank model name (defaults to `DEFAULT_RERANK_MODEL`).
   * @param db - Database backing the result cache.
   * @returns One `{ file, score }` per input document, highest score first.
   *   Documents with no score, or a falsy one, get 0.
   */
  export async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db: Database): Promise<{ file: string; score: number }[]> {
    const keyFor = (file: string) => getCacheKey("rerank", { query, file, model });

    const scoreByFile: Map<string, number> = new Map();
    const pending: RerankDocument[] = [];

    // Split the input into cache hits and documents that still need scoring.
    for (const { file, text } of documents) {
      const hit = getCachedResult(db, keyFor(file));
      if (hit === null) {
        pending.push({ file, text });
        continue;
      }
      scoreByFile.set(file, parseFloat(hit));
    }

    if (pending.length > 0) {
      const { results } = await getDefaultOllama().rerank(query, pending, { model });
      for (const { file, score } of results) {
        setCachedResult(db, keyFor(file), score.toString());
        scoreByFile.set(file, score);
      }
    }

    const scored = documents.map(({ file }) => ({ file, score: scoreByFile.get(file) || 0 }));
    return scored.sort((left, right) => right.score - left.score);
  }
  // =============================================================================
  // Reciprocal Rank Fusion
  // =============================================================================
  /**
   * Merge several ranked result lists into one using Reciprocal Rank Fusion.
   *
   * Each appearance of a file at 0-based position `rank` in list `i` contributes
   * `weights[i] / (k + rank + 1)` to that file's fused score (weight 1.0 when
   * the list has no entry in `weights`). After summing, a file whose best
   * position in any list was first gets +0.05, and one whose best position was
   * second or third gets +0.02.
   *
   * @param resultLists - Ranked lists, best result first; entries are keyed by `file`.
   * @param weights - Optional per-list weights, matched to `resultLists` by index.
   * @param k - Rank-smoothing constant (default 60).
   * @returns One entry per distinct file, highest fused score first. Each entry
   *   copies the first-seen result for that file with `score` replaced by the
   *   fused score.
   */
  export function reciprocalRankFusion(
    resultLists: RankedResult[][],
    weights: number[] = [],
    k: number = 60
  ): RankedResult[] {
    const fused = new Map<string, { result: RankedResult; rrfScore: number; topRank: number }>();

    resultLists.forEach((list, listIdx) => {
      const weight = weights[listIdx] ?? 1.0;
      list.forEach((result, rank) => {
        const contribution = weight / (k + rank + 1);
        const entry = fused.get(result.file);
        if (!entry) {
          fused.set(result.file, { result, rrfScore: contribution, topRank: rank });
          return;
        }
        entry.rrfScore += contribution;
        entry.topRank = Math.min(entry.topRank, rank);
      });
    });

    const entries = [...fused.values()];

    // Bonus for results that reached the top of at least one list.
    for (const entry of entries) {
      if (entry.topRank === 0) {
        entry.rrfScore += 0.05;
      } else if (entry.topRank <= 2) {
        entry.rrfScore += 0.02;
      }
    }

    entries.sort((left, right) => right.rrfScore - left.rrfScore);
    return entries.map(({ result, rrfScore }) => ({ ...result, score: rrfScore }));
  }
  // =============================================================================
  // Document retrieval
  // =============================================================================
  /**
   * Row shape produced by the document lookup queries in this section
   * (`findDocument`, `findDocuments`).
   */
  type DbDocRow = {
    /** Filesystem path: `collections.pwd || '/' || documents.path`. */
    filepath: string;
    /** Virtual path: `'qmd://' || documents.collection || '/' || documents.path`. */
    display_path: string;
    /** Name of the collection the document belongs to (`documents.collection`). */
    collection: string;
    /** Document title (`documents.title`). */
    title: string;
    /** Content hash (`documents.hash`). */
    hash: string;
    /** Value of `documents.modified_at`. */
    modified_at: string;
    /** `LENGTH(content.doc)` for the document's content. */
    body_length: number;
    /** Document body (`content.doc`); selected only when the caller asks for it. */
    body?: string;
  };
  /**
   * Find a single active document by filename/path, trying progressively looser matches.
   *
   * Match order:
   *   1. exact filesystem path (`collections.pwd` + '/' + `documents.path`)
   *   2. exact virtual path (`qmd://<collection>/<path>`)
   *   3. suffix match against either form (first row returned by the database)
   *
   * A trailing `:<digits>` suffix (e.g. `notes.md:42`) is ignored for matching,
   * and a leading `~/` is expanded to the user's home directory. In the suffix
   * match, `%`, `_` and `\` in the name are matched literally rather than as
   * LIKE wildcards.
   *
   * @param db - Index database.
   * @param filename - Path, virtual path, or path suffix to look up.
   * @param options - `includeBody: true` also loads the document body.
   * @returns Document metadata (and body, when requested), or a `not_found`
   *   result carrying the original `filename` and similar file names from
   *   `findSimilarFiles`.
   */
  export function findDocument(db: Database, filename: string, options: { includeBody?: boolean } = {}): DocumentResult | DocumentNotFound {
    // Drop a trailing ":<line>" suffix; it is not part of the path.
    let filepath = filename.replace(/:\d+$/, '');
    if (filepath.startsWith('~/')) {
      filepath = homedir() + filepath.slice(1);
    }

    const bodyCol = options.includeBody ? `, content.doc as body` : ``;

    // Shared SELECT/JOIN part - always join content for body_length.
    const selectFrom = `
      SELECT
        c.pwd || '/' || d.path as filepath,
        'qmd://' || d.collection || '/' || d.path as display_path,
        d.title,
        d.hash,
        d.collection,
        d.modified_at,
        LENGTH(content.doc) as body_length
        ${bodyCol}
      FROM documents d
      JOIN collections c ON c.name = d.collection
      JOIN content ON content.hash = d.hash
    `;
    const findOne = (where: string, ...params: string[]): DbDocRow | null =>
      (db.prepare(`${selectFrom} WHERE ${where}`).get(...params) as DbDocRow | null) ?? null;

    // Escape LIKE metacharacters so e.g. "my_file.md" cannot match "myXfile.md".
    const suffixPattern = `%${filepath.replace(/[\\%_]/g, '\\$&')}`;

    const doc =
      findOne(`c.pwd || '/' || d.path = ? AND d.active = 1`, filepath)
      ?? findOne(`'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1`, filepath)
      ?? findOne(
        `(c.pwd || '/' || d.path LIKE ? ESCAPE '\\' OR 'qmd://' || d.collection || '/' || d.path LIKE ? ESCAPE '\\') AND d.active = 1 LIMIT 1`,
        suffixPattern,
        suffixPattern
      );

    if (!doc) {
      const similar = findSimilarFiles(db, filepath, 5, 5);
      return { error: "not_found", query: filename, similarFiles: similar };
    }

    const context = getContextForFile(db, doc.filepath);

    return {
      filepath: doc.filepath,
      displayPath: doc.display_path,
      title: doc.title,
      context,
      hash: doc.hash,
      collectionName: doc.collection,
      modifiedAt: doc.modified_at,
      bodyLength: doc.body_length,
      ...(options.includeBody && doc.body !== undefined && { body: doc.body }),
    };
  }
  /**
   * Get the body content of an active document, optionally sliced by line range.
   *
   * The document is looked up by exact filesystem path
   * (`collections.pwd` + '/' + `documents.path`).
   *
   * @param db - Index database.
   * @param doc - A document result, or any object carrying its `filepath`.
   * @param fromLine - 1-indexed first line to return. Values below 1 (and 0 or
   *   undefined) start at the first line.
   * @param maxLines - Maximum number of lines to return. Negative values are
   *   treated as 0.
   * @returns The (possibly sliced) body, or `null` when no active document has
   *   that path.
   */
  export function getDocumentBody(db: Database, doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number): string | null {
    const row = db.prepare(`
      SELECT content.doc as body
      FROM documents d
      JOIN collections c ON c.name = d.collection
      JOIN content ON content.hash = d.hash
      WHERE c.pwd || '/' || d.path = ? AND d.active = 1
    `).get(doc.filepath) as { body: string } | null;

    if (!row) return null;
    if (fromLine === undefined && maxLines === undefined) return row.body;

    const lines = row.body.split('\n');
    // Clamp both bounds: a negative index would make slice() count from the end.
    const start = Math.max(0, (fromLine || 1) - 1);
    const end = maxLines !== undefined ? start + Math.max(0, maxLines) : lines.length;
    return lines.slice(start, end).join('\n');
  }
  /**
   * Legacy function for backwards compatibility.
   * Combines `findDocument` (with body) and line slicing.
   *
   * A trailing `:<digits>` suffix on `filename` (e.g. `notes.md:42`) is used as
   * the starting line when `fromLine` is not given (or is 0).
   *
   * @param db - Index database.
   * @param filename - Path, virtual path, or path suffix, optionally with a `:<line>` suffix.
   * @param fromLine - 1-indexed first line to return. Values below 1 start at the first line.
   * @param maxLines - Maximum number of lines to return. Negative values are treated as 0.
   * @returns The document with its (possibly sliced) body, or the `not_found`
   *   result from `findDocument`.
   */
  export function getDocument(db: Database, filename: string, fromLine?: number, maxLines?: number): (DocumentResult & { body: string }) | DocumentNotFound {
    let startLine = fromLine;
    let filepath = filename;

    // Parse :line suffix, unless the caller already chose a starting line.
    const lineSuffix = filepath.match(/:(\d+)$/);
    if (lineSuffix && !startLine) {
      startLine = parseInt(lineSuffix[1], 10);
      filepath = filepath.slice(0, -lineSuffix[0].length);
    }

    const result = findDocument(db, filepath, { includeBody: true });
    if ("error" in result) return result;

    let body = result.body || "";
    if (startLine !== undefined || maxLines !== undefined) {
      const lines = body.split('\n');
      // Clamp both bounds: a negative index would make slice() count from the end.
      const start = Math.max(0, (startLine || 1) - 1);
      const end = maxLines !== undefined ? start + Math.max(0, maxLines) : lines.length;
      body = lines.slice(start, end).join('\n');
    }

    return { ...result, body };
  }
  /**
   * Find multiple active documents by glob pattern or comma-separated list.
   * Returns documents without body by default (use `getDocumentBody` to load it).
   *
   * A pattern containing a comma and no `*` or `?` is treated as a list of
   * names. Each name is matched against the virtual path
   * (`qmd://<collection>/<path>`), first exactly and then as a suffix; in the
   * suffix match `%`, `_` and `\` are matched literally rather than as LIKE
   * wildcards. Any other pattern is resolved through `matchFilesByGlob`.
   *
   * @param db - Index database.
   * @param pattern - Glob pattern or comma-separated list of names.
   * @param options - `includeBody` also loads document bodies; `maxBytes`
   *   (default `DEFAULT_MULTI_GET_MAX_BYTES`) is the size limit above which a
   *   document is reported as skipped. The limit is compared against
   *   `LENGTH(content.doc)`.
   * @returns `docs` holds one entry per matched document (either the document or
   *   a skipped marker with a reason); `errors` holds one message per name that
   *   could not be found, or a single message when a glob matched nothing.
   */
  export function findDocuments(
    db: Database,
    pattern: string,
    options: { includeBody?: boolean; maxBytes?: number } = {}
  ): { docs: MultiGetResult[]; errors: string[] } {
    const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?');
    const errors: string[] = [];
    const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES;

    const bodyCol = options.includeBody ? `, content.doc as body` : ``;
    const selectFrom = `
      SELECT
        c.pwd || '/' || d.path as filepath,
        'qmd://' || d.collection || '/' || d.path as display_path,
        d.title,
        d.hash,
        d.collection,
        d.modified_at,
        LENGTH(content.doc) as body_length
        ${bodyCol}
      FROM documents d
      JOIN collections c ON c.name = d.collection
      JOIN content ON content.hash = d.hash
    `;

    let fileRows: DbDocRow[];

    if (isCommaSeparated) {
      const names = pattern.split(',').map(s => s.trim()).filter(Boolean);
      // Prepared once and reused for every name in the list.
      const exactStmt = db.prepare(`
        ${selectFrom}
        WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
      `);
      const suffixStmt = db.prepare(`
        ${selectFrom}
        WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? ESCAPE '\\' AND d.active = 1
        LIMIT 1
      `);

      fileRows = [];
      for (const name of names) {
        let doc = exactStmt.get(name) as DbDocRow | null;
        if (!doc) {
          // Escape LIKE metacharacters so e.g. "my_file.md" cannot match "myXfile.md".
          const suffixPattern = `%${name.replace(/[\\%_]/g, '\\$&')}`;
          doc = suffixStmt.get(suffixPattern) as DbDocRow | null;
        }

        if (doc) {
          fileRows.push(doc);
          continue;
        }

        const similar = findSimilarFiles(db, name, 5, 3);
        let msg = `File not found: ${name}`;
        if (similar.length > 0) {
          msg += ` (did you mean: ${similar.join(', ')}?)`;
        }
        errors.push(msg);
      }
    } else {
      // Glob pattern match
      const matched = matchFilesByGlob(db, pattern);
      if (matched.length === 0) {
        errors.push(`No files matched pattern: ${pattern}`);
        return { docs: [], errors };
      }
      const virtualPaths = matched.map(m => m.filepath);
      const placeholders = virtualPaths.map(() => '?').join(',');
      fileRows = db.prepare(`
        ${selectFrom}
        WHERE 'qmd://' || d.collection || '/' || d.path IN (${placeholders}) AND d.active = 1
      `).all(...virtualPaths) as DbDocRow[];
    }

    const results: MultiGetResult[] = [];

    for (const row of fileRows) {
      if (row.body_length > maxBytes) {
        results.push({
          doc: { filepath: row.filepath, displayPath: row.display_path },
          skipped: true,
          skipReason: `File too large (${Math.round(row.body_length / 1024)}KB > ${Math.round(maxBytes / 1024)}KB)`,
        });
        continue;
      }

      // Context is only needed for documents that are actually returned.
      const context = getContextForFile(db, row.filepath);

      results.push({
        doc: {
          filepath: row.filepath,
          displayPath: row.display_path,
          // Fall back to the file name when the document has no title.
          title: row.title || row.display_path.split('/').pop() || row.display_path,
          context,
          hash: row.hash,
          collectionName: row.collection,
          modifiedAt: row.modified_at,
          bodyLength: row.body_length,
          ...(options.includeBody && row.body !== undefined && { body: row.body }),
        },
        skipped: false,
      });
    }

    return { docs: results, errors };
  }
  /**
   * Legacy function for backwards compatibility.
   *
   * Wraps `findDocuments` (always loading bodies) and flattens its results into
   * `MultiGetFile` entries, optionally cutting each body to `maxLines` lines.
   *
   * @param db - Index database.
   * @param pattern - Glob pattern or comma-separated list, as for `findDocuments`.
   * @param maxLines - When given, each body keeps only its first `maxLines`
   *   lines; if lines were dropped, a `[... truncated N more lines]` note is appended.
   * @param maxBytes - Size limit forwarded to `findDocuments`
   *   (default `DEFAULT_MULTI_GET_MAX_BYTES`).
   * @returns The flattened files plus the errors reported by `findDocuments`.
   */
  export function getMultipleDocuments(db: Database, pattern: string, maxLines?: number, maxBytes: number = DEFAULT_MULTI_GET_MAX_BYTES): { files: MultiGetFile[]; errors: string[] } {
    const { docs, errors } = findDocuments(db, pattern, { includeBody: true, maxBytes });

    const files: MultiGetFile[] = [];
    for (const entry of docs) {
      if (entry.skipped) {
        // Skipped entries carry only their paths and the reason.
        files.push({
          filepath: entry.doc.filepath,
          displayPath: entry.doc.displayPath,
          title: "",
          body: "",
          context: null,
          skipped: true as const,
          skipReason: entry.skipReason,
        });
        continue;
      }

      let text = entry.doc.body || "";
      if (maxLines !== undefined) {
        const allLines = text.split('\n');
        const hiddenCount = allLines.length - maxLines;
        text = allLines.slice(0, maxLines).join('\n');
        if (hiddenCount > 0) {
          text += `\n\n[... truncated ${hiddenCount} more lines]`;
        }
      }

      files.push({
        filepath: entry.doc.filepath,
        displayPath: entry.doc.displayPath,
        title: entry.doc.title,
        body: text,
        context: entry.doc.context,
        skipped: false as const,
      });
    }

    return { files, errors };
  }
  /**
   * Flattened per-file result returned by `getMultipleDocuments`.
   * Kept for backwards compatibility with the old multi-get API.
   *
   * Every entry carries the same core fields; `skipped` discriminates between a
   * loaded file and one that was skipped, in which case `skipReason` says why.
   */
  export type MultiGetFile = {
    filepath: string;
    displayPath: string;
    title: string;
    body: string;
    context: string | null;
  } & (
    | { skipped: false }
    | { skipped: true; skipReason: string }
  );
  // =============================================================================
  // Status
  // =============================================================================
  /**
   * Summarize the state of the index.
   *
   * @param db - Index database.
   * @returns Total number of active documents, the value reported by
   *   `getHashesNeedingEmbedding`, whether the `vectors_vec` table exists, and
   *   one entry per row of the `collections` table (most recently updated
   *   first) with its active document count. A collection's `lastUpdated` is
   *   its newest active document's `modified_at`, falling back to the
   *   collection's `created_at`.
   */
  export function getStatus(db: Database): IndexStatus {
    type CollectionRow = { id: number; pwd: string; glob_pattern: string; created_at: string; active_count: number; last_doc_update: string | null };

    const collectionRows = db.prepare(`
      SELECT c.id, c.pwd, c.glob_pattern, c.created_at,
        COUNT(d.id) as active_count,
        MAX(d.modified_at) as last_doc_update
      FROM collections c
      LEFT JOIN documents d ON d.collection = c.name AND d.active = 1
      GROUP BY c.id
      ORDER BY last_doc_update DESC
    `).all() as CollectionRow[];

    const { c: totalDocuments } = db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get() as { c: number };
    const needsEmbedding = getHashesNeedingEmbedding(db);
    const vectorTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();

    return {
      totalDocuments,
      needsEmbedding,
      hasVectorIndex: Boolean(vectorTable),
      collections: collectionRows.map(({ id, pwd, glob_pattern, created_at, active_count, last_doc_update }) => ({
        id,
        path: pwd,
        pattern: glob_pattern,
        documents: active_count,
        lastUpdated: last_doc_update || created_at,
      })),
    };
  }
  // =============================================================================
  // Snippet extraction
  // =============================================================================
  /** Snippet selected by `extractSnippet`, with its position inside the full document. */
  export type SnippetResult = {
    /** 1-indexed line number of the best match. */
    line: number;
    /** The snippet text, preceded by a diff-style `@@ -start,count @@` header line. */
    snippet: string;
    /** Number of lines in the snippet. */
    snippetLines: number;
    /** Lines in the document before the snippet. */
    linesBefore: number;
    /** Lines in the document after the snippet. */
    linesAfter: number;
  };
  /**
   * Pick a short snippet of `body` around the line that best matches `query`.
   *
   * Each line is scored by how many whitespace-separated query terms it
   * contains (case-insensitive substring match); the first line with the
   * highest score wins. The snippet spans from one line before the best line
   * to two lines after it, and is cut to `maxLen` characters (ending in `...`)
   * when longer.
   *
   * When `chunkPos` is a positive character offset, only a window of the body
   * is searched: from 100 characters before `chunkPos` to `maxLen + 100`
   * characters after it. Reported line numbers still refer to the whole body.
   *
   * @param body - Full document text.
   * @param query - Search query whose terms are matched against each line.
   * @param maxLen - Maximum snippet length in characters (default 500).
   * @param chunkPos - Optional character offset to centre the search on.
   * @returns The snippet (with a `@@ -start,count @@ (N before, M after)` header
   *   line) and its line statistics.
   */
  export function extractSnippet(body: string, query: string, maxLen = 500, chunkPos?: number): SnippetResult {
    const totalLines = body.split('\n').length;

    // Optionally narrow the search to a window around the chunk position.
    let windowText = body;
    let lineOffset = 0;
    if (chunkPos && chunkPos > 0) {
      const windowStart = Math.max(0, chunkPos - 100);
      const windowEnd = Math.min(body.length, chunkPos + maxLen + 100);
      windowText = body.slice(windowStart, windowEnd);
      // Number of newlines before the window = lines skipped in front of it.
      lineOffset = windowStart > 0 ? body.slice(0, windowStart).split('\n').length - 1 : 0;
    }

    const lines = windowText.split('\n');
    const terms = query.toLowerCase().split(/\s+/).filter(Boolean);

    // First line with the highest number of matching terms wins.
    let bestLine = 0;
    let bestHits = -1;
    lines.forEach((line, idx) => {
      const lowered = line.toLowerCase();
      const hits = terms.reduce((count, term) => count + (lowered.includes(term) ? 1 : 0), 0);
      if (hits > bestHits) {
        bestHits = hits;
        bestLine = idx;
      }
    });

    const first = Math.max(0, bestLine - 1);
    const last = Math.min(lines.length, bestLine + 3);
    const picked = lines.slice(first, last);

    const joined = picked.join('\n');
    const snippetText = joined.length > maxLen ? joined.substring(0, maxLen - 3) + "..." : joined;

    const absoluteStart = lineOffset + first + 1; // 1-indexed
    const linesBefore = absoluteStart - 1;
    const linesAfter = totalLines - (absoluteStart + picked.length - 1);

    // Format with diff-style header: @@ -start,count @@ (linesBefore before, linesAfter after)
    const header = `@@ -${absoluteStart},${picked.length} @@ (${linesBefore} before, ${linesAfter} after)`;

    return {
      line: lineOffset + bestLine + 1,
      snippet: `${header}\n${snippetText}`,
      linesBefore,
      linesAfter,
      snippetLines: picked.length,
    };
  }