// store.ts (78 KB)
  1. /**
  2. * QMD Store - Core data access and retrieval functions
  3. *
  4. * This module provides all database operations, search functions, and document
  5. * retrieval for QMD. It returns raw data structures that can be formatted by
  6. * CLI or MCP consumers.
  7. *
  8. * Usage:
  9. * const store = createStore("/path/to/db.sqlite");
  10. * // or use default path:
  11. * const store = createStore();
  12. */
  13. import { Database } from "bun:sqlite";
  14. import { Glob } from "bun";
  15. import * as sqliteVec from "sqlite-vec";
  16. import {
  17. LlamaCpp,
  18. getDefaultLlamaCpp,
  19. formatQueryForEmbedding,
  20. formatDocForEmbedding,
  21. type RerankDocument,
  22. } from "./llm";
  23. import {
  24. findContextForPath as collectionsFindContextForPath,
  25. addContext as collectionsAddContext,
  26. removeContext as collectionsRemoveContext,
  27. listAllContexts as collectionsListAllContexts,
  28. getCollection,
  29. listCollections as collectionsListCollections,
  30. addCollection as collectionsAddCollection,
  31. removeCollection as collectionsRemoveCollection,
  32. renameCollection as collectionsRenameCollection,
  33. setGlobalContext,
  34. loadConfig as collectionsLoadConfig,
  35. type NamedCollection,
  36. } from "./collections";
  37. // =============================================================================
  38. // Configuration
  39. // =============================================================================
// Home directory taken from the HOME environment variable; "/tmp" is used
// when HOME is unset or empty.
const HOME = Bun.env.HOME || "/tmp";
// Default model identifiers (embedding, reranking, query), exported for callers.
export const DEFAULT_EMBED_MODEL = "embeddinggemma";
export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
export const DEFAULT_QUERY_MODEL = "qwen3:0.6b";
// Default file pattern: every Markdown file at any directory depth.
export const DEFAULT_GLOB = "**/*.md";
// Default byte budget for multi-get operations.
export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB
// Chunking: 800 tokens per chunk with 15% overlap
export const CHUNK_SIZE_TOKENS = 800;
export const CHUNK_OVERLAP_TOKENS = Math.floor(CHUNK_SIZE_TOKENS * 0.15); // 120 tokens (15% overlap)
// Fallback char-based approximation for sync chunking (~4 chars per token)
export const CHUNK_SIZE_CHARS = CHUNK_SIZE_TOKENS * 4; // 3200 chars
export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4; // 480 chars
  52. // =============================================================================
  53. // Path utilities
  54. // =============================================================================
  55. export function homedir(): string {
  56. return HOME;
  57. }
  58. export function resolve(...paths: string[]): string {
  59. if (paths.length === 0) {
  60. throw new Error("resolve: at least one path segment is required");
  61. }
  62. let result = paths[0]!.startsWith('/') ? '' : Bun.env.PWD || process.cwd();
  63. for (const p of paths) {
  64. if (p.startsWith('/')) {
  65. result = p;
  66. } else {
  67. result = result + '/' + p;
  68. }
  69. }
  70. const parts = result.split('/').filter(Boolean);
  71. const normalized: string[] = [];
  72. for (const part of parts) {
  73. if (part === '..') normalized.pop();
  74. else if (part !== '.') normalized.push(part);
  75. }
  76. return '/' + normalized.join('/');
  77. }
  78. // Flag to indicate production mode (set by qmd.ts at startup)
  79. let _productionMode = false;
  80. export function enableProductionMode(): void {
  81. _productionMode = true;
  82. }
  83. export function getDefaultDbPath(indexName: string = "index"): string {
  84. // Always allow override via INDEX_PATH (for testing)
  85. if (Bun.env.INDEX_PATH) {
  86. return Bun.env.INDEX_PATH;
  87. }
  88. // In non-production mode (tests), require explicit path
  89. if (!_productionMode) {
  90. throw new Error(
  91. "Database path not set. Tests must set INDEX_PATH env var or use createStore() with explicit path. " +
  92. "This prevents tests from accidentally writing to the global index."
  93. );
  94. }
  95. const cacheDir = Bun.env.XDG_CACHE_HOME || resolve(homedir(), ".cache");
  96. const qmdCacheDir = resolve(cacheDir, "qmd");
  97. try { Bun.spawnSync(["mkdir", "-p", qmdCacheDir]); } catch {}
  98. return resolve(qmdCacheDir, `${indexName}.sqlite`);
  99. }
  100. export function getPwd(): string {
  101. return process.env.PWD || process.cwd();
  102. }
  103. export function getRealPath(path: string): string {
  104. try {
  105. const result = Bun.spawnSync(["realpath", path]);
  106. if (result.success) {
  107. return result.stdout.toString().trim();
  108. }
  109. } catch {}
  110. return resolve(path);
  111. }
  112. // =============================================================================
  113. // Virtual Path Utilities (qmd://)
  114. // =============================================================================
/**
 * Components of a qmd:// virtual path, as produced by parseVirtualPath().
 */
export type VirtualPath = {
  // Name taken from the first component after "qmd://"
  collectionName: string;
  path: string; // relative path within collection
};
  119. /**
  120. * Normalize explicit virtual path formats to standard qmd:// format.
  121. * Only handles paths that are already explicitly virtual:
  122. * - qmd://collection/path.md (already normalized)
  123. * - qmd:////collection/path.md (extra slashes - normalize)
  124. * - //collection/path.md (missing qmd: prefix - add it)
  125. *
  126. * Does NOT handle:
  127. * - collection/path.md (bare paths - could be filesystem relative)
  128. * - :linenum suffix (should be parsed separately before calling this)
  129. */
  130. export function normalizeVirtualPath(input: string): string {
  131. let path = input.trim();
  132. // Handle qmd:// with extra slashes: qmd:////collection/path -> qmd://collection/path
  133. if (path.startsWith('qmd:')) {
  134. // Remove qmd: prefix and normalize slashes
  135. path = path.slice(4);
  136. // Remove leading slashes and re-add exactly two
  137. path = path.replace(/^\/+/, '');
  138. return `qmd://${path}`;
  139. }
  140. // Handle //collection/path (missing qmd: prefix)
  141. if (path.startsWith('//')) {
  142. path = path.replace(/^\/+/, '');
  143. return `qmd://${path}`;
  144. }
  145. // Return as-is for other cases (filesystem paths, docids, bare collection/path, etc.)
  146. return path;
  147. }
  148. /**
  149. * Parse a virtual path like "qmd://collection-name/path/to/file.md"
  150. * into its components.
  151. * Also supports collection root: "qmd://collection-name/" or "qmd://collection-name"
  152. */
  153. export function parseVirtualPath(virtualPath: string): VirtualPath | null {
  154. // Normalize the path first
  155. const normalized = normalizeVirtualPath(virtualPath);
  156. // Match: qmd://collection-name[/optional-path]
  157. // Allows: qmd://name, qmd://name/, qmd://name/path
  158. const match = normalized.match(/^qmd:\/\/([^\/]+)\/?(.*)$/);
  159. if (!match?.[1]) return null;
  160. return {
  161. collectionName: match[1],
  162. path: match[2] ?? '', // Empty string for collection root
  163. };
  164. }
  165. /**
  166. * Build a virtual path from collection name and relative path.
  167. */
  168. export function buildVirtualPath(collectionName: string, path: string): string {
  169. return `qmd://${collectionName}/${path}`;
  170. }
  171. /**
  172. * Check if a path is explicitly a virtual path.
  173. * Only recognizes explicit virtual path formats:
  174. * - qmd://collection/path.md
  175. * - //collection/path.md
  176. *
  177. * Does NOT consider bare collection/path.md as virtual - that should be
  178. * handled separately by checking if the first component is a collection name.
  179. */
  180. export function isVirtualPath(path: string): boolean {
  181. const trimmed = path.trim();
  182. // Explicit qmd:// prefix (with any number of slashes)
  183. if (trimmed.startsWith('qmd:')) return true;
  184. // //collection/path format (missing qmd: prefix)
  185. if (trimmed.startsWith('//')) return true;
  186. return false;
  187. }
  188. /**
  189. * Resolve a virtual path to absolute filesystem path.
  190. */
  191. export function resolveVirtualPath(db: Database, virtualPath: string): string | null {
  192. const parsed = parseVirtualPath(virtualPath);
  193. if (!parsed) return null;
  194. const coll = getCollectionByName(db, parsed.collectionName);
  195. if (!coll) return null;
  196. return resolve(coll.pwd, parsed.path);
  197. }
  198. /**
  199. * Convert an absolute filesystem path to a virtual path.
  200. * Returns null if the file is not in any indexed collection.
  201. */
  202. export function toVirtualPath(db: Database, absolutePath: string): string | null {
  203. // Get all collections from YAML config
  204. const collections = collectionsListCollections();
  205. // Find which collection this absolute path belongs to
  206. for (const coll of collections) {
  207. if (absolutePath.startsWith(coll.path + '/') || absolutePath === coll.path) {
  208. // Extract relative path
  209. const relativePath = absolutePath.startsWith(coll.path + '/')
  210. ? absolutePath.slice(coll.path.length + 1)
  211. : '';
  212. // Verify this document exists in the database
  213. const doc = db.prepare(`
  214. SELECT d.path
  215. FROM documents d
  216. WHERE d.collection = ? AND d.path = ? AND d.active = 1
  217. LIMIT 1
  218. `).get(coll.name, relativePath) as { path: string } | null;
  219. if (doc) {
  220. return buildVirtualPath(coll.name, relativePath);
  221. }
  222. }
  223. }
  224. return null;
  225. }
  226. // =============================================================================
  227. // Database initialization
  228. // =============================================================================
  229. // On macOS, use Homebrew's SQLite which supports extensions
  230. if (process.platform === "darwin") {
  231. const homebrewSqlitePath = "/opt/homebrew/opt/sqlite/lib/libsqlite3.dylib";
  232. try {
  233. if (Bun.file(homebrewSqlitePath).size > 0) {
  234. Database.setCustomSQLite(homebrewSqlitePath);
  235. }
  236. } catch {}
  237. }
  238. function initializeDatabase(db: Database): void {
  239. sqliteVec.load(db);
  240. db.exec("PRAGMA journal_mode = WAL");
  241. db.exec("PRAGMA foreign_keys = ON");
  242. // Drop legacy tables that are now managed in YAML
  243. db.exec(`DROP TABLE IF EXISTS path_contexts`);
  244. db.exec(`DROP TABLE IF EXISTS collections`);
  245. // Content-addressable storage - the source of truth for document content
  246. db.exec(`
  247. CREATE TABLE IF NOT EXISTS content (
  248. hash TEXT PRIMARY KEY,
  249. doc TEXT NOT NULL,
  250. created_at TEXT NOT NULL
  251. )
  252. `);
  253. // Documents table - file system layer mapping virtual paths to content hashes
  254. // Collections are now managed in ~/.config/qmd/index.yml
  255. db.exec(`
  256. CREATE TABLE IF NOT EXISTS documents (
  257. id INTEGER PRIMARY KEY AUTOINCREMENT,
  258. collection TEXT NOT NULL,
  259. path TEXT NOT NULL,
  260. title TEXT NOT NULL,
  261. hash TEXT NOT NULL,
  262. created_at TEXT NOT NULL,
  263. modified_at TEXT NOT NULL,
  264. active INTEGER NOT NULL DEFAULT 1,
  265. FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
  266. UNIQUE(collection, path)
  267. )
  268. `);
  269. db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents(collection, active)`);
  270. db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(hash)`);
  271. db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_path ON documents(path, active)`);
  272. // Cache table for LLM API calls
  273. db.exec(`
  274. CREATE TABLE IF NOT EXISTS llm_cache (
  275. hash TEXT PRIMARY KEY,
  276. result TEXT NOT NULL,
  277. created_at TEXT NOT NULL
  278. )
  279. `);
  280. // Content vectors
  281. const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
  282. const hasSeqColumn = cvInfo.some(col => col.name === 'seq');
  283. if (cvInfo.length > 0 && !hasSeqColumn) {
  284. db.exec(`DROP TABLE IF EXISTS content_vectors`);
  285. db.exec(`DROP TABLE IF EXISTS vectors_vec`);
  286. }
  287. db.exec(`
  288. CREATE TABLE IF NOT EXISTS content_vectors (
  289. hash TEXT NOT NULL,
  290. seq INTEGER NOT NULL DEFAULT 0,
  291. pos INTEGER NOT NULL DEFAULT 0,
  292. model TEXT NOT NULL,
  293. embedded_at TEXT NOT NULL,
  294. PRIMARY KEY (hash, seq)
  295. )
  296. `);
  297. // FTS - index filepath (collection/path), title, and content
  298. db.exec(`
  299. CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
  300. filepath, title, body,
  301. tokenize='porter unicode61'
  302. )
  303. `);
  304. // Triggers to keep FTS in sync
  305. db.exec(`
  306. CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents
  307. WHEN new.active = 1
  308. BEGIN
  309. INSERT INTO documents_fts(rowid, filepath, title, body)
  310. SELECT
  311. new.id,
  312. new.collection || '/' || new.path,
  313. new.title,
  314. (SELECT doc FROM content WHERE hash = new.hash)
  315. WHERE new.active = 1;
  316. END
  317. `);
  318. db.exec(`
  319. CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
  320. DELETE FROM documents_fts WHERE rowid = old.id;
  321. END
  322. `);
  323. db.exec(`
  324. CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents
  325. BEGIN
  326. -- Delete from FTS if no longer active
  327. DELETE FROM documents_fts WHERE rowid = old.id AND new.active = 0;
  328. -- Update FTS if still/newly active
  329. INSERT OR REPLACE INTO documents_fts(rowid, filepath, title, body)
  330. SELECT
  331. new.id,
  332. new.collection || '/' || new.path,
  333. new.title,
  334. (SELECT doc FROM content WHERE hash = new.hash)
  335. WHERE new.active = 1;
  336. END
  337. `);
  338. }
  339. function ensureVecTableInternal(db: Database, dimensions: number): void {
  340. const tableInfo = db.prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get() as { sql: string } | null;
  341. if (tableInfo) {
  342. const match = tableInfo.sql.match(/float\[(\d+)\]/);
  343. const hasHashSeq = tableInfo.sql.includes('hash_seq');
  344. const hasCosine = tableInfo.sql.includes('distance_metric=cosine');
  345. const existingDims = match?.[1] ? parseInt(match[1], 10) : null;
  346. if (existingDims === dimensions && hasHashSeq && hasCosine) return;
  347. // Table exists but wrong schema - need to rebuild
  348. db.exec("DROP TABLE IF EXISTS vectors_vec");
  349. }
  350. db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}] distance_metric=cosine)`);
  351. }
  352. // =============================================================================
  353. // Store Factory
  354. // =============================================================================
/**
 * Handle returned by createStore().
 *
 * It exposes the open database, the path it was opened from, and the
 * module's operations with the database argument already bound, so callers
 * do not pass `db` themselves.
 */
export type Store = {
  // Underlying database handle and the path it was opened from
  db: Database;
  dbPath: string;
  // Closes the underlying database handle
  close: () => void;
  // Creates or rebuilds the vectors_vec table for the given embedding width
  ensureVecTable: (dimensions: number) => void;
  // Index health
  getHashesNeedingEmbedding: () => number;
  getIndexHealth: () => IndexHealthInfo;
  getStatus: () => IndexStatus;
  // Caching
  getCacheKey: typeof getCacheKey;
  getCachedResult: (cacheKey: string) => string | null;
  setCachedResult: (cacheKey: string, result: string) => void;
  clearCache: () => void;
  // Cleanup and maintenance
  deleteLLMCache: () => number;
  deleteInactiveDocuments: () => number;
  cleanupOrphanedContent: () => number;
  cleanupOrphanedVectors: () => number;
  vacuumDatabase: () => void;
  // Context
  getContextForFile: (filepath: string) => string | null;
  getContextForPath: (collectionName: string, path: string) => string | null;
  getCollectionByName: (name: string) => { name: string; pwd: string; glob_pattern: string } | null;
  getCollectionsWithoutContext: () => { name: string; pwd: string; doc_count: number }[];
  getTopLevelPathsWithoutContext: (collectionName: string) => string[];
  // Virtual paths (the first three are the module-level functions, unbound)
  parseVirtualPath: typeof parseVirtualPath;
  buildVirtualPath: typeof buildVirtualPath;
  isVirtualPath: typeof isVirtualPath;
  resolveVirtualPath: (virtualPath: string) => string | null;
  toVirtualPath: (absolutePath: string) => string | null;
  // Search
  searchFTS: (query: string, limit?: number, collectionId?: number) => SearchResult[];
  searchVec: (query: string, model: string, limit?: number, collectionId?: number) => Promise<SearchResult[]>;
  // Query expansion & reranking
  expandQuery: (query: string, model?: string) => Promise<string[]>;
  rerank: (query: string, documents: { file: string; text: string }[], model?: string) => Promise<{ file: string; score: number }[]>;
  // Document retrieval
  findDocument: (filename: string, options?: { includeBody?: boolean }) => DocumentResult | DocumentNotFound;
  getDocumentBody: (doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number) => string | null;
  findDocuments: (pattern: string, options?: { includeBody?: boolean; maxBytes?: number }) => { docs: MultiGetResult[]; errors: string[] };
  // Fuzzy matching and docid lookup
  findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => string[];
  matchFilesByGlob: (pattern: string) => { filepath: string; displayPath: string; bodyLength: number }[];
  findDocumentByDocid: (docid: string) => { filepath: string; hash: string } | null;
  // Document indexing operations
  insertContent: (hash: string, content: string, createdAt: string) => void;
  insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => void;
  findActiveDocument: (collectionName: string, path: string) => { id: number; hash: string; title: string } | null;
  updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => void;
  updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => void;
  deactivateDocument: (collectionName: string, path: string) => void;
  getActiveDocumentPaths: (collectionName: string) => string[];
  // Vector/embedding operations
  getHashesForEmbedding: () => { hash: string; body: string; path: string }[];
  clearAllEmbeddings: () => void;
  insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => void;
};
  414. /**
  415. * Create a new store instance with the given database path.
  416. * If no path is provided, uses the default path (~/.cache/qmd/index.sqlite).
  417. *
  418. * @param dbPath - Path to the SQLite database file
  419. * @returns Store instance with all methods bound to the database
  420. */
  421. export function createStore(dbPath?: string): Store {
  422. const resolvedPath = dbPath || getDefaultDbPath();
  423. const db = new Database(resolvedPath);
  424. initializeDatabase(db);
  425. return {
  426. db,
  427. dbPath: resolvedPath,
  428. close: () => db.close(),
  429. ensureVecTable: (dimensions: number) => ensureVecTableInternal(db, dimensions),
  430. // Index health
  431. getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db),
  432. getIndexHealth: () => getIndexHealth(db),
  433. getStatus: () => getStatus(db),
  434. // Caching
  435. getCacheKey,
  436. getCachedResult: (cacheKey: string) => getCachedResult(db, cacheKey),
  437. setCachedResult: (cacheKey: string, result: string) => setCachedResult(db, cacheKey, result),
  438. clearCache: () => clearCache(db),
  439. // Cleanup and maintenance
  440. deleteLLMCache: () => deleteLLMCache(db),
  441. deleteInactiveDocuments: () => deleteInactiveDocuments(db),
  442. cleanupOrphanedContent: () => cleanupOrphanedContent(db),
  443. cleanupOrphanedVectors: () => cleanupOrphanedVectors(db),
  444. vacuumDatabase: () => vacuumDatabase(db),
  445. // Context
  446. getContextForFile: (filepath: string) => getContextForFile(db, filepath),
  447. getContextForPath: (collectionName: string, path: string) => getContextForPath(db, collectionName, path),
  448. getCollectionByName: (name: string) => getCollectionByName(db, name),
  449. getCollectionsWithoutContext: () => getCollectionsWithoutContext(db),
  450. getTopLevelPathsWithoutContext: (collectionName: string) => getTopLevelPathsWithoutContext(db, collectionName),
  451. // Virtual paths
  452. parseVirtualPath,
  453. buildVirtualPath,
  454. isVirtualPath,
  455. resolveVirtualPath: (virtualPath: string) => resolveVirtualPath(db, virtualPath),
  456. toVirtualPath: (absolutePath: string) => toVirtualPath(db, absolutePath),
  457. // Search
  458. searchFTS: (query: string, limit?: number, collectionId?: number) => searchFTS(db, query, limit, collectionId),
  459. searchVec: (query: string, model: string, limit?: number, collectionId?: number) => searchVec(db, query, model, limit, collectionId),
  460. // Query expansion & reranking
  461. expandQuery: (query: string, model?: string) => expandQuery(query, model, db),
  462. rerank: (query: string, documents: { file: string; text: string }[], model?: string) => rerank(query, documents, model, db),
  463. // Document retrieval
  464. findDocument: (filename: string, options?: { includeBody?: boolean }) => findDocument(db, filename, options),
  465. getDocumentBody: (doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number) => getDocumentBody(db, doc, fromLine, maxLines),
  466. findDocuments: (pattern: string, options?: { includeBody?: boolean; maxBytes?: number }) => findDocuments(db, pattern, options),
  467. // Fuzzy matching and docid lookup
  468. findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => findSimilarFiles(db, query, maxDistance, limit),
  469. matchFilesByGlob: (pattern: string) => matchFilesByGlob(db, pattern),
  470. findDocumentByDocid: (docid: string) => findDocumentByDocid(db, docid),
  471. // Document indexing operations
  472. insertContent: (hash: string, content: string, createdAt: string) => insertContent(db, hash, content, createdAt),
  473. insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt),
  474. findActiveDocument: (collectionName: string, path: string) => findActiveDocument(db, collectionName, path),
  475. updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => updateDocumentTitle(db, documentId, title, modifiedAt),
  476. updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => updateDocument(db, documentId, title, hash, modifiedAt),
  477. deactivateDocument: (collectionName: string, path: string) => deactivateDocument(db, collectionName, path),
  478. getActiveDocumentPaths: (collectionName: string) => getActiveDocumentPaths(db, collectionName),
  479. // Vector/embedding operations
  480. getHashesForEmbedding: () => getHashesForEmbedding(db),
  481. clearAllEmbeddings: () => clearAllEmbeddings(db),
  482. insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt),
  483. };
  484. }
  485. // =============================================================================
  486. // Core Document Type
  487. // =============================================================================
/**
 * Unified document result type with all metadata.
 * Body is optional - use getDocumentBody() to load it separately if needed.
 * The short `docid` matches what getDocid() derives from `hash`.
 */
export type DocumentResult = {
  filepath: string; // Full filesystem path
  displayPath: string; // Short display path (e.g., "docs/readme.md")
  title: string; // Document title (from first heading or filename)
  context: string | null; // Folder context description if configured
  hash: string; // Content hash for caching/change detection
  docid: string; // Short docid (first 6 chars of hash) for quick reference
  collectionName: string; // Parent collection name
  modifiedAt: string; // Last modification timestamp
  bodyLength: number; // Body length in bytes (useful before loading)
  body?: string; // Document body (optional, load with getDocumentBody)
};
  /**
   * Derive the short docid for a content hash.
   *
   * The docid is the leading six characters of the hash; a hash shorter than
   * six characters is returned unchanged.
   */
  export function getDocid(hash: string): string {
    const DOCID_LENGTH = 6;
    return hash.substring(0, DOCID_LENGTH);
  }
  /**
   * Handelize a filename to be more token-friendly.
   * - Convert triple underscore `___` to `/` (folder separator)
   * - Convert to lowercase
   * - Replace each run of characters that are not letters, digits or combining
   *   marks (in any script) with a single dash; `/` is kept as the separator
   * - Remove leading/trailing dashes from path segments
   * - Preserve folder structure (a/b/c/d.md stays structured)
   * - Preserve file extension (ASCII letters/digits after the last dot)
   *
   * Matching is Unicode-aware so that non-ASCII names such as "日本語.md" keep
   * their letters instead of being rejected or collapsing onto the same handle.
   * For ASCII-only paths the output is the same as with ASCII `\W`/`_` matching.
   *
   * @param path Path or filename to convert
   * @returns The normalized handle
   * @throws Error if the path is empty/blank, if the filename (without its
   *         extension) contains no letter or digit, or if nothing is left
   *         after cleaning
   */
  export function handelize(path: string): string {
    if (!path || path.trim() === '') {
      throw new Error('handelize: path cannot be empty');
    }

    // Reject paths whose filename is only an extension or only dots/punctuation:
    // the filename must contain at least one letter or digit before processing.
    const segments = path.split('/').filter(Boolean);
    const lastSegment = segments[segments.length - 1] || '';
    const filenameWithoutExt = lastSegment.replace(/\.[^.]+$/, '');
    if (!/[\p{L}\p{N}]/u.test(filenameWithoutExt)) {
      throw new Error(`handelize: path "${path}" has no valid filename content`);
    }

    // Collapse separator runs to one dash and trim dashes at both ends.
    // \p{M} is kept so decomposed accents stay attached to their base letter.
    const cleanSegment = (segment: string): string =>
      segment
        .replace(/[^\p{L}\p{N}\p{M}]+/gu, '-')
        .replace(/^-+|-+$/g, '');

    const result = path
      .replace(/___/g, '/') // Triple underscore becomes folder separator
      .toLowerCase()
      .split('/')
      .map((segment, idx, arr) => {
        if (idx !== arr.length - 1) {
          // Directory segment: clean the whole segment
          return cleanSegment(segment);
        }
        // Filename (last segment): clean the name but keep the extension as-is
        const ext = segment.match(/(\.[a-z0-9]+)$/i)?.[1] ?? '';
        const nameWithoutExt = ext ? segment.slice(0, -ext.length) : segment;
        return cleanSegment(nameWithoutExt) + ext;
      })
      .filter(Boolean)
      .join('/');

    if (!result) {
      throw new Error(`handelize: path "${path}" resulted in empty string after processing`);
    }
    return result;
  }
  /**
   * Search result extends DocumentResult with score and source info.
   * Every DocumentResult field is present; `chunkPos` is optional.
   */
  export type SearchResult = DocumentResult & {
    score: number; // Relevance score (0-1)
    source: "fts" | "vec"; // Search source (full-text or vector)
    chunkPos?: number; // Character position of matching chunk (for vector search)
  };
  /**
   * Ranked result for RRF fusion (simplified, used internally).
   * Unlike DocumentResult, `body` is required here.
   * NOTE(review): `file` presumably identifies the document by path - confirm against the fusion code.
   */
  export type RankedResult = {
    file: string;
    displayPath: string;
    title: string;
    body: string;
    score: number;
  };
  /**
   * Error result when document is not found.
   * `error` is always the literal "not_found", which lets callers tell this
   * shape apart from a successful result.
   * NOTE(review): `similarFiles` presumably holds suggestions such as those from findSimilarFiles - confirm at the call site.
   */
  export type DocumentNotFound = {
    error: "not_found";
    query: string;
    similarFiles: string[];
  };
  /**
   * Result from multi-get operations.
   *
   * Discriminated on `skipped`:
   * - `skipped: false` - `doc` is a full DocumentResult.
   * - `skipped: true`  - `doc` carries only `filepath` and `displayPath`, and
   *   `skipReason` explains why the document was left out.
   */
  export type MultiGetResult = {
    doc: DocumentResult;
    skipped: false;
  } | {
    doc: Pick<DocumentResult, "filepath" | "displayPath">;
    skipped: true;
    skipReason: string;
  };
  /**
   * Summary record for a single collection.
   * NOTE(review): `name`/`path`/`pattern` presumably mirror the collection's
   * configuration and `documents`/`lastUpdated` its indexed-document stats -
   * confirm against the code that builds this record.
   */
  export type CollectionInfo = {
    name: string;
    path: string;
    pattern: string;
    documents: number;
    lastUpdated: string;
  };
  /**
   * Overall index status, including one CollectionInfo per collection.
   * NOTE(review): `needsEmbedding` presumably matches the count returned by
   * getHashesNeedingEmbedding - confirm against the code that builds this record.
   */
  export type IndexStatus = {
    totalDocuments: number;
    needsEmbedding: number;
    hasVectorIndex: boolean;
    collections: CollectionInfo[];
  };
  611. // =============================================================================
  612. // Index health
  613. // =============================================================================
  /**
   * Count the distinct content hashes of active documents that have no
   * content_vectors row with seq = 0.
   *
   * @param db Database handle
   * @returns Number of distinct hashes without such a row
   */
  export function getHashesNeedingEmbedding(db: Database): number {
    const pendingQuery = db.prepare(`
      SELECT COUNT(DISTINCT d.hash) as count
      FROM documents d
      LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
      WHERE d.active = 1 AND v.hash IS NULL
    `);
    const { count } = pendingQuery.get() as { count: number };
    return count;
  }
  /**
   * Index health summary as returned by getIndexHealth.
   */
  export type IndexHealthInfo = {
    needsEmbedding: number; // Count reported by getHashesNeedingEmbedding
    totalDocs: number; // Number of active documents
    daysStale: number | null; // Whole days since the newest modified_at of an active document; null when there is none
  };
  /**
   * Collect basic health figures for the index.
   *
   * @param db Database handle
   * @returns `needsEmbedding` (from getHashesNeedingEmbedding), `totalDocs`
   *          (count of active documents) and `daysStale` (whole days between
   *          now and the latest modified_at among active documents, or null
   *          when no such timestamp exists)
   */
  export function getIndexHealth(db: Database): IndexHealthInfo {
    const MS_PER_DAY = 24 * 60 * 60 * 1000;

    const needsEmbedding = getHashesNeedingEmbedding(db);
    const activeRow = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number };
    const newestRow = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };

    const daysStale: number | null = newestRow?.latest
      ? Math.floor((Date.now() - new Date(newestRow.latest).getTime()) / MS_PER_DAY)
      : null;

    return { needsEmbedding, totalDocs: activeRow.count, daysStale };
  }
  639. // =============================================================================
  640. // Caching
  641. // =============================================================================
  /**
   * Build the cache key for a request: the hex SHA-256 digest of the URL
   * followed by the JSON-serialized body.
   *
   * @param url Request URL
   * @param body Request payload (serialized with JSON.stringify)
   * @returns Hex digest string
   */
  export function getCacheKey(url: string, body: object): string {
    const hasher = new Bun.CryptoHasher("sha256");
    const serializedBody = JSON.stringify(body);
    hasher.update(url);
    hasher.update(serializedBody);
    return hasher.digest("hex");
  }
  /**
   * Look up a cached result in llm_cache by its key.
   *
   * @param db Database handle
   * @param cacheKey Key to look up (matched against the `hash` column)
   * @returns The stored result, or null when there is no row or the stored
   *          result is empty
   */
  export function getCachedResult(db: Database, cacheKey: string): string | null {
    const lookup = db.prepare(`SELECT result FROM llm_cache WHERE hash = ?`);
    const hit = lookup.get(cacheKey) as { result: string } | null;
    if (!hit || !hit.result) {
      return null;
    }
    return hit.result;
  }
  /**
   * Store (or overwrite) a result in llm_cache under the given key, stamped
   * with the current time in ISO format.
   *
   * On roughly 1% of calls (decided by Math.random) the table is also pruned
   * to the 1000 most recently created entries.
   */
  export function setCachedResult(db: Database, cacheKey: string, result: string): void {
    const createdAt = new Date().toISOString();
    const upsert = db.prepare(`INSERT OR REPLACE INTO llm_cache (hash, result, created_at) VALUES (?, ?, ?)`);
    upsert.run(cacheKey, result, createdAt);

    const shouldPrune = Math.random() < 0.01;
    if (!shouldPrune) {
      return;
    }
    db.exec(`DELETE FROM llm_cache WHERE hash NOT IN (SELECT hash FROM llm_cache ORDER BY created_at DESC LIMIT 1000)`);
  }
  /**
   * Remove every row from the llm_cache table.
   */
  export function clearCache(db: Database): void {
    const wipeCacheSql = `DELETE FROM llm_cache`;
    db.exec(wipeCacheSql);
  }
  662. // =============================================================================
  663. // Cleanup and maintenance operations
  664. // =============================================================================
  /**
   * Delete all cached LLM API responses from llm_cache.
   *
   * @returns The number of rows the DELETE statement reports as changed
   */
  export function deleteLLMCache(db: Database): number {
    const { changes } = db.prepare(`DELETE FROM llm_cache`).run();
    return changes;
  }
  /**
   * Permanently delete document rows that are marked inactive (active = 0).
   *
   * @returns The number of rows the DELETE statement reports as changed
   */
  export function deleteInactiveDocuments(db: Database): number {
    const purgeInactive = db.prepare(`DELETE FROM documents WHERE active = 0`);
    const { changes } = purgeInactive.run();
    return changes;
  }
  /**
   * Delete content rows whose hash is not used by any active document.
   *
   * @returns The number of rows the DELETE statement reports as changed
   */
  export function cleanupOrphanedContent(db: Database): number {
    const purgeOrphans = db.prepare(`
      DELETE FROM content
      WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
    `);
    const { changes } = purgeOrphans.run();
    return changes;
  }
  /**
   * Delete embeddings whose hash is not used by any active document.
   *
   * Does nothing (returns 0) when the vectors_vec table does not exist or when
   * no content_vectors row is orphaned. Otherwise the matching vectors_vec rows
   * (keyed as `hash_seq`) are removed first - that statement still needs the
   * content_vectors rows to find them - followed by the content_vectors rows.
   *
   * @returns The number of orphaned content_vectors rows counted before deletion
   */
  export function cleanupOrphanedVectors(db: Database): number {
    const vecTable = db.prepare(`
      SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'
    `).get();
    if (!vecTable) return 0;

    const { c: orphanCount } = db.prepare(`
      SELECT COUNT(*) as c FROM content_vectors cv
      WHERE NOT EXISTS (
        SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
      )
    `).get() as { c: number };
    if (orphanCount === 0) return 0;

    // Step 1: vector rows, located through the still-present content_vectors rows
    db.exec(`
      DELETE FROM vectors_vec WHERE hash_seq IN (
        SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
        WHERE NOT EXISTS (
          SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
        )
      )
    `);

    // Step 2: the content_vectors rows themselves
    db.exec(`
      DELETE FROM content_vectors WHERE hash NOT IN (
        SELECT hash FROM documents WHERE active = 1
      )
    `);

    return orphanCount;
  }
  /**
   * Run SQLite's VACUUM on the database, which rebuilds the database file and
   * reclaims unused space.
   */
  export function vacuumDatabase(db: Database): void {
    const vacuumSql = `VACUUM`;
    db.exec(vacuumSql);
  }
  738. // =============================================================================
  739. // Document helpers
  740. // =============================================================================
  /**
   * Compute the hex SHA-256 digest of a document's content.
   *
   * @param content Text to hash
   * @returns Promise resolving to the hex digest
   */
  export async function hashContent(content: string): Promise<string> {
    const hasher = new Bun.CryptoHasher("sha256");
    hasher.update(content);
    const hexDigest = hasher.digest("hex");
    return hexDigest;
  }
  /**
   * Derive a document title from its content, falling back to the filename.
   *
   * The first `#` or `##` heading is used as the title. If that heading is the
   * generic "Notes" / "📝 Notes", the first `##` heading that appears AFTER it
   * is used instead, when there is one. The search starts after the matched
   * heading so that a generic `## Notes` heading cannot match itself.
   *
   * @param content Document text
   * @param filename Path or filename used when the content has no heading
   * @returns The heading text, or the last path segment of `filename` with a
   *          trailing ".md" removed
   */
  export function extractTitle(content: string, filename: string): string {
    const heading = /^##?\s+(.+)$/m.exec(content);
    if (!heading) {
      return filename.replace(/\.md$/, "").split("/").pop() || filename;
    }

    const title = (heading[1] ?? "").trim();
    if (title === "📝 Notes" || title === "Notes") {
      // Only look past the generic heading; otherwise "## Notes" would be found again.
      const afterHeading = content.slice(heading.index + heading[0].length);
      const nextHeading = /^##\s+(.+)$/m.exec(afterHeading);
      if (nextHeading?.[1]) return nextHeading[1].trim();
    }
    return title;
  }
  758. // =============================================================================
  759. // Document indexing operations
  760. // =============================================================================
  /**
   * Store a document body in the content table, keyed by its hash.
   * The statement is INSERT OR IGNORE, so a hash that is already present is
   * left untouched.
   *
   * @param db Database handle
   * @param hash Content hash (the row key)
   * @param content Document text, stored in the `doc` column
   * @param createdAt Creation timestamp string
   */
  export function insertContent(db: Database, hash: string, content: string, createdAt: string): void {
    const insertIfMissing = db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`);
    insertIfMissing.run(hash, content, createdAt);
  }
  /**
   * Add a new row to the documents table; the row is created as active.
   *
   * @param db Database handle
   * @param collectionName Collection the document belongs to
   * @param path Document path within the collection
   * @param title Document title
   * @param hash Content hash the document points at
   * @param createdAt Creation timestamp string
   * @param modifiedAt Modification timestamp string
   */
  export function insertDocument(
    db: Database,
    collectionName: string,
    path: string,
    title: string,
    hash: string,
    createdAt: string,
    modifiedAt: string
  ): void {
    const insertActiveDoc = db.prepare(`
      INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active)
      VALUES (?, ?, ?, ?, ?, ?, 1)
    `);
    insertActiveDoc.run(collectionName, path, title, hash, createdAt, modifiedAt);
  }
  /**
   * Look up the active document stored under a collection and path.
   *
   * @param db Database handle
   * @param collectionName Collection to search in
   * @param path Document path within the collection
   * @returns The row's id, hash and title, or whatever the driver returns for
   *          "no row" (typed as null)
   */
  export function findActiveDocument(
    db: Database,
    collectionName: string,
    path: string
  ): { id: number; hash: string; title: string } | null {
    const lookup = db.prepare(`
      SELECT id, hash, title FROM documents
      WHERE collection = ? AND path = ? AND active = 1
    `);
    const row = lookup.get(collectionName, path) as { id: number; hash: string; title: string } | null;
    return row;
  }
  /**
   * Set a new title and modified_at timestamp on the document with the given id.
   *
   * @param db Database handle
   * @param documentId Row id of the document
   * @param title New title
   * @param modifiedAt New modification timestamp string
   */
  export function updateDocumentTitle(
    db: Database,
    documentId: number,
    title: string,
    modifiedAt: string
  ): void {
    const retitle = db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`);
    retitle.run(title, modifiedAt, documentId);
  }
  /**
   * Point an existing document at new content: sets its title, hash and
   * modified_at timestamp in one statement. The row is selected by id, so its
   * collection and path are left as they are.
   *
   * @param db Database handle
   * @param documentId Row id of the document
   * @param title New title
   * @param hash New content hash
   * @param modifiedAt New modification timestamp string
   */
  export function updateDocument(
    db: Database,
    documentId: number,
    title: string,
    hash: string,
    modifiedAt: string
  ): void {
    const repoint = db.prepare(`UPDATE documents SET title = ?, hash = ?, modified_at = ? WHERE id = ?`);
    repoint.run(title, hash, modifiedAt, documentId);
  }
  /**
   * Mark the active document at a collection/path as inactive (active = 0).
   * The row is kept; only its active flag changes.
   *
   * @param db Database handle
   * @param collectionName Collection the document belongs to
   * @param path Document path within the collection
   */
  export function deactivateDocument(db: Database, collectionName: string, path: string): void {
    const deactivate = db.prepare(`UPDATE documents SET active = 0 WHERE collection = ? AND path = ? AND active = 1`);
    deactivate.run(collectionName, path);
  }
  /**
   * List the paths of all active documents in a collection.
   *
   * @param db Database handle
   * @param collectionName Collection to list
   * @returns Paths in the order the query returns them
   */
  export function getActiveDocumentPaths(db: Database, collectionName: string): string[] {
    const activePaths = db.prepare(`
      SELECT path FROM documents WHERE collection = ? AND active = 1
    `);
    const rows = activePaths.all(collectionName) as { path: string }[];

    const paths: string[] = [];
    for (const row of rows) {
      paths.push(row.path);
    }
    return paths;
  }
  841. export { formatQueryForEmbedding, formatDocForEmbedding };
  /**
   * Find a natural break point in the last 30% of a chunk candidate.
   * Priority: paragraph break > sentence end > line break > space.
   * Returns the offset (within `slice`) just past the break, or -1 if none.
   */
  function findCharChunkBreak(slice: string): number {
    const searchStart = Math.floor(slice.length * 0.7);
    const tail = slice.slice(searchStart);

    const paragraphBreak = tail.lastIndexOf('\n\n');
    if (paragraphBreak >= 0) return searchStart + paragraphBreak + 2;

    const sentenceEnd = Math.max(
      ...['. ', '.\n', '? ', '?\n', '! ', '!\n'].map(mark => tail.lastIndexOf(mark))
    );
    if (sentenceEnd >= 0) return searchStart + sentenceEnd + 2;

    const lineBreak = tail.lastIndexOf('\n');
    if (lineBreak >= 0) return searchStart + lineBreak + 1;

    const spaceBreak = tail.lastIndexOf(' ');
    if (spaceBreak >= 0) return searchStart + spaceBreak + 1;

    return -1;
  }

  /**
   * Split a document into overlapping character-based chunks.
   *
   * Content that fits in `maxChars` is returned as a single chunk. Otherwise
   * each chunk spans at most `maxChars` characters and, when possible, ends at
   * a natural break found in its last 30% (see findCharChunkBreak). The next
   * chunk starts `overlapChars` before the end of the previous one; if that
   * would not move past the previous chunk's start, the overlap is dropped for
   * that step so the loop always makes progress.
   *
   * @param content Text to split
   * @param maxChars Maximum chunk length in characters (must be at least 1
   *                 when the content does not fit in a single chunk)
   * @param overlapChars Characters shared between consecutive chunks; negative
   *                     or NaN values are treated as no overlap
   * @returns Chunks with their text and start position (`pos`) in `content`
   * @throws RangeError if the content needs splitting and `maxChars` is not at
   *         least 1 (previously this looped forever)
   */
  export function chunkDocument(content: string, maxChars: number = CHUNK_SIZE_CHARS, overlapChars: number = CHUNK_OVERLAP_CHARS): { text: string; pos: number }[] {
    if (content.length <= maxChars) {
      return [{ text: content, pos: 0 }];
    }
    if (!(maxChars >= 1)) {
      throw new RangeError(`chunkDocument: maxChars must be at least 1, got ${maxChars}`);
    }

    const overlap = Math.max(0, overlapChars);
    const chunks: { text: string; pos: number }[] = [];
    let charPos = 0;

    while (charPos < content.length) {
      let endPos = Math.min(charPos + maxChars, content.length);

      // If not at the end, try to pull the end back to a natural break point
      if (endPos < content.length) {
        const breakOffset = findCharChunkBreak(content.slice(charPos, endPos));
        if (breakOffset > 0) {
          endPos = charPos + breakOffset;
        }
      }

      chunks.push({ text: content.slice(charPos, endPos), pos: charPos });

      // The last chunk runs to the end of the content; nothing left to overlap
      if (endPos >= content.length) {
        break;
      }

      // Step back by the overlap, unless that would not advance past this chunk's start
      const overlappedStart = endPos - overlap;
      charPos = overlappedStart > charPos ? overlappedStart : endPos;
    }

    return chunks;
  }
  /**
   * Chunk a document by actual token count using the LLM tokenizer.
   * More accurate than character-based chunking but requires async.
   *
   * The content is tokenized once; each chunk is a window of up to `maxTokens`
   * tokens that is detokenized back to text. For every window except the last,
   * the text is trimmed to a natural break (paragraph > sentence > line) found
   * in its last 30%, when one exists. Windows advance by
   * `maxTokens - overlapTokens` tokens.
   *
   * An overlap that cannot be honoured (negative, NaN, or not smaller than
   * `maxTokens`) is treated as no overlap, so the loop always advances instead
   * of spinning forever or skipping tokens.
   *
   * @param content Text to split
   * @param maxTokens Maximum tokens per chunk (must be at least 1 when the
   *                  content does not fit in a single chunk)
   * @param overlapTokens Tokens shared between consecutive windows
   * @returns Chunks with `text`, an approximate character position `pos`
   *          (token position scaled by the average characters per token), and
   *          `tokens` - the size of the token window BEFORE break-point
   *          trimming, so it can exceed the tokens actually left in `text`
   * @throws RangeError if the content needs splitting and `maxTokens` is not
   *         at least 1
   */
  export async function chunkDocumentByTokens(
    content: string,
    maxTokens: number = CHUNK_SIZE_TOKENS,
    overlapTokens: number = CHUNK_OVERLAP_TOKENS
  ): Promise<{ text: string; pos: number; tokens: number }[]> {
    const llm = getDefaultLlamaCpp();

    // Tokenize once upfront
    const allTokens = await llm.tokenize(content);
    const totalTokens = allTokens.length;

    if (totalTokens <= maxTokens) {
      return [{ text: content, pos: 0, tokens: totalTokens }];
    }
    if (!(maxTokens >= 1)) {
      throw new RangeError(`chunkDocumentByTokens: maxTokens must be at least 1, got ${maxTokens}`);
    }

    const chunks: { text: string; pos: number; tokens: number }[] = [];
    // A step outside [1, maxTokens] would stall the loop or leave gaps between windows
    const requestedStep = maxTokens - overlapTokens;
    const step = requestedStep >= 1 && requestedStep <= maxTokens ? requestedStep : maxTokens;
    const avgCharsPerToken = content.length / totalTokens;
    let tokenPos = 0;

    while (tokenPos < totalTokens) {
      const chunkEnd = Math.min(tokenPos + maxTokens, totalTokens);
      const chunkTokens = allTokens.slice(tokenPos, chunkEnd);
      let chunkText = await llm.detokenize(chunkTokens);

      // Find a good break point if not at end of document
      if (chunkEnd < totalTokens) {
        const searchStart = Math.floor(chunkText.length * 0.7);
        const searchSlice = chunkText.slice(searchStart);

        let breakOffset = -1;
        const paragraphBreak = searchSlice.lastIndexOf('\n\n');
        if (paragraphBreak >= 0) {
          breakOffset = paragraphBreak + 2;
        } else {
          const sentenceEnd = Math.max(
            searchSlice.lastIndexOf('. '),
            searchSlice.lastIndexOf('.\n'),
            searchSlice.lastIndexOf('? '),
            searchSlice.lastIndexOf('?\n'),
            searchSlice.lastIndexOf('! '),
            searchSlice.lastIndexOf('!\n')
          );
          if (sentenceEnd >= 0) {
            breakOffset = sentenceEnd + 2;
          } else {
            const lineBreak = searchSlice.lastIndexOf('\n');
            if (lineBreak >= 0) {
              breakOffset = lineBreak + 1;
            }
          }
        }

        if (breakOffset >= 0) {
          chunkText = chunkText.slice(0, searchStart + breakOffset);
        }
      }

      // Approximate character position based on token position
      const charPos = Math.floor(tokenPos * avgCharsPerToken);
      chunks.push({ text: chunkText, pos: charPos, tokens: chunkTokens.length });

      // Last window reached the end of the token stream
      if (chunkEnd >= totalTokens) break;

      // Advance by step tokens (maxTokens - overlap)
      tokenPos += step;
    }

    return chunks;
  }
  972. // =============================================================================
  973. // Fuzzy matching
  974. // =============================================================================
  /**
   * Levenshtein edit distance between two strings: the minimum number of
   * single-character insertions, deletions and substitutions needed to turn
   * `a` into `b`. Characters are compared by string index.
   *
   * Only the previous and the current row of the distance table are kept.
   */
  function levenshtein(a: string, b: string): number {
    if (a.length === 0) return b.length;
    if (b.length === 0) return a.length;

    // Row 0: turning the empty prefix of `a` into the first j characters of `b`
    let prevRow: number[] = Array.from({ length: b.length + 1 }, (_, j) => j);

    for (let i = 1; i <= a.length; i++) {
      const currRow: number[] = [i];
      for (let j = 1; j <= b.length; j++) {
        const deletion = prevRow[j]! + 1;
        const insertion = currRow[j - 1]! + 1;
        const substitution = prevRow[j - 1]! + (a[i - 1] === b[j - 1] ? 0 : 1);
        currRow.push(Math.min(deletion, insertion, substitution));
      }
      prevRow = currRow;
    }

    return prevRow[b.length]!;
  }
  /**
   * Find a document by its short docid (a prefix of its content hash).
   * Returns the document's virtual path (`qmd://collection/path`) and full hash
   * if found, null otherwise.
   * If multiple active documents share the prefix (collision), the first row
   * the query yields is returned.
   *
   * The docid is matched as a literal prefix: LIKE wildcards (`%`, `_`) and the
   * escape character itself are escaped, so an input such as "%" can no longer
   * match arbitrary documents.
   *
   * @param db Database handle
   * @param docid Short hash, with or without a leading `#`
   * @returns `{ filepath, hash }` for the matching active document, or null
   */
  export function findDocumentByDocid(db: Database, docid: string): { filepath: string; hash: string } | null {
    // Normalize: remove leading # if present
    const shortHash = docid.startsWith('#') ? docid.slice(1) : docid;
    if (shortHash.length < 1) return null;

    // Neutralize LIKE metacharacters; '\' is declared as the escape character below
    const escapedPrefix = shortHash.replace(/[\\%_]/g, '\\$&');

    // Look up active documents whose hash starts with the short hash
    const doc = db.prepare(`
      SELECT 'qmd://' || d.collection || '/' || d.path as filepath, d.hash
      FROM documents d
      WHERE d.hash LIKE ? ESCAPE '\\' AND d.active = 1
      LIMIT 1
    `).get(`${escapedPrefix}%`) as { filepath: string; hash: string } | null;

    return doc ?? null;
  }
  /**
   * Suggest active document paths that are close to a query string.
   *
   * Every active document path is compared with the query by case-insensitive
   * Levenshtein distance; paths within `maxDistance` are returned closest
   * first, capped at `limit` entries.
   *
   * @param db Database handle
   * @param query Text to compare against document paths
   * @param maxDistance Largest edit distance still considered similar
   * @param limit Maximum number of paths to return
   * @returns Matching paths, closest first
   */
  export function findSimilarFiles(db: Database, query: string, maxDistance: number = 3, limit: number = 5): string[] {
    const rows = db.prepare(`
      SELECT d.path
      FROM documents d
      WHERE d.active = 1
    `).all() as { path: string }[];

    const needle = query.toLowerCase();
    const candidates: { path: string; dist: number }[] = [];
    for (const { path } of rows) {
      const dist = levenshtein(path.toLowerCase(), needle);
      if (dist <= maxDistance) {
        candidates.push({ path, dist });
      }
    }

    candidates.sort((left, right) => left.dist - right.dist);
    return candidates.slice(0, limit).map(candidate => candidate.path);
  }
  /**
   * Find active documents whose virtual path (`qmd://collection/path`) or
   * collection-relative path matches a glob pattern.
   *
   * @param db Database handle
   * @param pattern Glob pattern
   * @returns One entry per match: `filepath` is the virtual path, `displayPath`
   *          the relative path, and `bodyLength` the value of SQL LENGTH() on
   *          the stored document body
   */
  export function matchFilesByGlob(db: Database, pattern: string): { filepath: string; displayPath: string; bodyLength: number }[] {
    const rows = db.prepare(`
      SELECT
        'qmd://' || d.collection || '/' || d.path as virtual_path,
        LENGTH(content.doc) as body_length,
        d.path,
        d.collection
      FROM documents d
      JOIN content ON content.hash = d.hash
      WHERE d.active = 1
    `).all() as { virtual_path: string; body_length: number; path: string; collection: string }[];

    const matcher = new Glob(pattern);
    const matches: { filepath: string; displayPath: string; bodyLength: number }[] = [];
    for (const row of rows) {
      const isMatch = matcher.match(row.virtual_path) || matcher.match(row.path);
      if (!isMatch) continue;
      matches.push({
        filepath: row.virtual_path, // virtual path, precise enough for a later lookup
        displayPath: row.path, // relative path, shown to the user
        bodyLength: row.body_length,
      });
    }
    return matches;
  }
  1046. // =============================================================================
  1047. // Context
  1048. // =============================================================================
  /**
   * Resolve the context text that applies to a path inside a collection.
   *
   * The result is built from, in order: the global context (if configured) and
   * every collection context whose prefix is a string prefix of the path,
   * shortest prefix first. Both the path and the prefixes are compared with a
   * leading "/" added when missing. For example, a context at "/talks" applies
   * to "/talks/2024/keynote.md". The pieces are joined with a blank line.
   *
   * @param db Database instance (unused - kept for compatibility)
   * @param collectionName Collection name
   * @param path Relative path within the collection
   * @returns Combined context string, or null if the collection is unknown or
   *          no context applies
   */
  export function getContextForPath(db: Database, collectionName: string, path: string): string | null {
    const config = collectionsLoadConfig();
    const coll = getCollection(collectionName);
    if (!coll) return null;

    const withLeadingSlash = (value: string): string => (value.startsWith("/") ? value : `/${value}`);

    // Start with the global context, when there is one
    const collected: string[] = config.global_context ? [config.global_context] : [];

    if (coll.context) {
      const target = withLeadingSlash(path);
      const inherited = Object.entries(coll.context)
        .map(([prefix, context]) => ({ prefix: withLeadingSlash(prefix), context }))
        .filter(entry => target.startsWith(entry.prefix))
        // most general (shortest prefix) first, most specific last
        .sort((left, right) => left.prefix.length - right.prefix.length);

      for (const entry of inherited) {
        collected.push(entry.context);
      }
    }

    return collected.length > 0 ? collected.join('\n\n') : null;
  }
  /**
   * Resolve the context text for a file given either a virtual path
   * (`qmd://collection/path`) or a filesystem path.
   *
   * The collection and relative path are worked out first: virtual paths are
   * parsed, filesystem paths are matched against the configured collection
   * roots (first collection whose path equals the file path or is its parent
   * directory prefix). The document must exist as an active row in the
   * database. The context is then assembled from the global context followed
   * by every matching collection prefix context, shortest prefix first, joined
   * with a blank line.
   *
   * @param db Database handle
   * @param filepath Virtual or filesystem path
   * @returns Combined context string, or null when the path is empty, cannot
   *          be mapped to a known collection, is not an active document, or no
   *          context applies
   */
  export function getContextForFile(db: Database, filepath: string): string | null {
    if (!filepath) return null;

    const collections = collectionsListCollections();
    const config = collectionsLoadConfig();

    let collectionName: string | null = null;
    let relativePath: string | null = null;

    const virtual = filepath.startsWith('qmd://') ? parseVirtualPath(filepath) : null;
    if (virtual) {
      collectionName = virtual.collectionName;
      relativePath = virtual.path;
    } else {
      // Filesystem path: pick the first collection (with a path) that contains it
      const owner = collections.find(candidate =>
        Boolean(candidate && candidate.path) &&
        (filepath.startsWith(candidate.path + '/') || filepath === candidate.path)
      );
      if (owner) {
        const rootWithSlash = owner.path + '/';
        collectionName = owner.name;
        relativePath = filepath.startsWith(rootWithSlash)
          ? filepath.slice(owner.path.length + 1)
          : '';
      }
      if (!collectionName || relativePath === null) return null;
    }

    const coll = getCollection(collectionName);
    if (!coll) return null;

    // Only documents that are indexed and active get a context
    const indexedDoc = db.prepare(`
      SELECT d.path
      FROM documents d
      WHERE d.collection = ? AND d.path = ? AND d.active = 1
      LIMIT 1
    `).get(collectionName, relativePath) as { path: string } | null;
    if (!indexedDoc) return null;

    const withLeadingSlash = (value: string): string => (value.startsWith("/") ? value : `/${value}`);

    // Start with the global context, when there is one
    const collected: string[] = config.global_context ? [config.global_context] : [];

    if (coll.context) {
      const target = withLeadingSlash(relativePath);
      const inherited = Object.entries(coll.context)
        .map(([prefix, context]) => ({ prefix: withLeadingSlash(prefix), context }))
        .filter(entry => target.startsWith(entry.prefix))
        // most general (shortest prefix) first, most specific last
        .sort((left, right) => left.prefix.length - right.prefix.length);

      for (const entry of inherited) {
        collected.push(entry.context);
      }
    }

    return collected.length > 0 ? collected.join('\n\n') : null;
  }
  /**
   * Look up a collection by name via getCollection and return it in the
   * `{ name, pwd, glob_pattern }` shape (`pwd` is the collection's path,
   * `glob_pattern` its pattern).
   *
   * @param db Database handle (not used by this function)
   * @param name Collection name
   * @returns The collection record, or null if no such collection exists
   */
  export function getCollectionByName(db: Database, name: string): { name: string; pwd: string; glob_pattern: string } | null {
    const found = getCollection(name);
    if (found) {
      const { name: collectionName, path: pwd, pattern: glob_pattern } = found;
      return { name: collectionName, pwd, glob_pattern };
    }
    return null;
  }
  /**
   * List every configured collection together with document statistics from
   * the database.
   *
   * For each collection returned by the collections config, one query counts
   * its document rows (`doc_count`), its active rows (`active_count`) and reads
   * the latest modified_at (`last_modified`). Missing values fall back to 0 /
   * null.
   *
   * @param db Database handle
   * @returns One summary per configured collection, in config order
   */
  export function listCollections(db: Database): { name: string; pwd: string; glob_pattern: string; doc_count: number; active_count: number; last_modified: string | null }[] {
    const summaries: { name: string; pwd: string; glob_pattern: string; doc_count: number; active_count: number; last_modified: string | null }[] = [];

    for (const coll of collectionsListCollections()) {
      const stats = db.prepare(`
        SELECT
          COUNT(d.id) as doc_count,
          SUM(CASE WHEN d.active = 1 THEN 1 ELSE 0 END) as active_count,
          MAX(d.modified_at) as last_modified
        FROM documents d
        WHERE d.collection = ?
      `).get(coll.name) as { doc_count: number; active_count: number; last_modified: string | null } | null;

      summaries.push({
        name: coll.name,
        pwd: coll.path,
        glob_pattern: coll.pattern,
        doc_count: stats?.doc_count || 0,
        active_count: stats?.active_count || 0,
        last_modified: stats?.last_modified || null,
      });
    }

    return summaries;
  }
  /**
   * Remove a collection: delete its document rows, delete content rows that no
   * active document references any more, then remove the collection from the
   * collections config.
   *
   * @param db Database handle
   * @param collectionName Name of the collection to remove
   * @returns `deletedDocs` - document rows deleted; `cleanedHashes` - content
   *          rows deleted by the orphan cleanup
   */
  export function removeCollection(db: Database, collectionName: string): { deletedDocs: number; cleanedHashes: number } {
    // 1. Drop the collection's documents
    const { changes: deletedDocs } = db
      .prepare(`DELETE FROM documents WHERE collection = ?`)
      .run(collectionName);

    // 2. Drop content that is no longer referenced by an active document
    const { changes: cleanedHashes } = db.prepare(`
      DELETE FROM content
      WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
    `).run();

    // 3. Drop the collection from the config (its return value is not used)
    collectionsRemoveCollection(collectionName);

    return { deletedDocs, cleanedHashes };
  }
  /**
   * Rename a collection in both places it is recorded: the `collection` column
   * of its document rows, then the collections config.
   *
   * @param db Database handle
   * @param oldName Current collection name
   * @param newName New collection name
   */
  export function renameCollection(db: Database, oldName: string, newName: string): void {
    const relabelDocuments = db.prepare(`UPDATE documents SET collection = ? WHERE collection = ?`);
    relabelDocuments.run(newName, oldName);

    collectionsRenameCollection(oldName, newName);
  }
  1231. // =============================================================================
  1232. // Context Management Operations
  1233. // =============================================================================
  /**
   * Add a context for a path prefix in the collection identified by a numeric
   * id. The id is resolved to a collection name through the `collections`
   * table, and the context is then added via collectionsAddContext.
   *
   * @param db Database handle
   * @param collectionId Row id in the `collections` table
   * @param pathPrefix Path prefix the context applies to
   * @param context Context text
   * @throws Error if no collection with that id exists
   */
  export function insertContext(db: Database, collectionId: number, pathPrefix: string, context: string): void {
    const nameLookup = db.prepare(`SELECT name FROM collections WHERE id = ?`);
    const owner = nameLookup.get(collectionId) as { name: string } | null;

    if (owner) {
      collectionsAddContext(owner.name, pathPrefix, context);
      return;
    }
    throw new Error(`Collection with id ${collectionId} not found`);
  }
  /**
   * Remove the context stored for a collection and path prefix, delegating to
   * collectionsRemoveContext.
   *
   * @param db Database handle (not used by this function)
   * @param collectionName Collection the context belongs to
   * @param pathPrefix Path prefix of the context to remove
   * @returns 1 if the removal reported success, otherwise 0
   */
  export function deleteContext(db: Database, collectionName: string, pathPrefix: string): number {
    if (collectionsRemoveContext(collectionName, pathPrefix)) {
      return 1;
    }
    return 0;
  }
  /**
   * Delete all global contexts: the global context itself plus the root
   * (empty-string prefix) context of every collection.
   *
   * @param db Database handle (unused here).
   * @returns The number of contexts counted as deleted. The global context
   *          reset is always counted once; each collection adds one when the
   *          removal of its root context reports success.
   */
  export function deleteGlobalContexts(db: Database): number {
    // Clear the global context; this reset is counted unconditionally.
    setGlobalContext(undefined);
    let removed = 1;

    // Then drop the '' (root) context from each collection that has one.
    for (const { name } of collectionsListCollections()) {
      if (collectionsRemoveContext(name, '')) {
        removed += 1;
      }
    }

    return removed;
  }
  /**
   * List every context known to the collections config.
   *
   * Ordering: collection name (locale compare), then longer path prefixes
   * before shorter ones, then path prefix (locale compare) as the tie-breaker.
   *
   * @param db Database handle (unused here).
   * @returns One `{ collection_name, path_prefix, context }` record per context.
   */
  export function listPathContexts(db: Database): { collection_name: string; path_prefix: string; context: string }[] {
    const records = collectionsListAllContexts().map(({ collection, path, context }) => ({
      collection_name: collection,
      path_prefix: path,
      context,
    }));

    return records.sort((left, right) => {
      // Different collections: the collection name alone decides.
      if (left.collection_name !== right.collection_name) {
        return left.collection_name.localeCompare(right.collection_name);
      }
      // Same collection: longest prefix first, alphabetical among equal lengths.
      const lengthGap = right.path_prefix.length - left.path_prefix.length;
      return lengthGap !== 0 ? lengthGap : left.path_prefix.localeCompare(right.path_prefix);
    });
  }
  /**
   * Return the name of every collection listed by the YAML config.
   *
   * @param db Database handle (unused here).
   * @returns One `{ name }` record per configured collection, in config order.
   */
  export function getAllCollections(db: Database): { name: string }[] {
    const names: { name: string }[] = [];
    for (const entry of collectionsListCollections()) {
      names.push({ name: entry.name });
    }
    return names;
  }
  /**
   * Find the configured collections that have no context entries at all
   * (not even a root context).
   *
   * @param db Database handle used to count each collection's active documents.
   * @returns `{ name, pwd, doc_count }` for every context-less collection,
   *          sorted by name. `pwd` is the collection's configured path and
   *          `doc_count` is its number of active rows in `documents`.
   */
  export function getCollectionsWithoutContext(db: Database): { name: string; pwd: string; doc_count: number }[] {
    const uncovered = collectionsListCollections()
      // A missing or empty `context` map both count as "no context".
      .filter(coll => !coll.context || Object.keys(coll.context).length === 0)
      .map(coll => {
        const counted = db.prepare(`
          SELECT COUNT(d.id) as doc_count
          FROM documents d
          WHERE d.collection = ? AND d.active = 1
        `).get(coll.name) as { doc_count: number } | null;

        return {
          name: coll.name,
          pwd: coll.path,
          doc_count: counted?.doc_count || 0,
        };
      });

    return uncovered.sort((left, right) => left.name.localeCompare(right.name));
  }
  /**
   * List the top-level directories of a collection that no context covers.
   * Useful for suggesting where a context might still be needed.
   *
   * A directory counts as covered when the collection has a root ('') context,
   * a context whose prefix equals the directory name, or a context prefix the
   * directory name starts with (followed by '/').
   *
   * @param db             Database handle used to read the active document paths.
   * @param collectionName Collection to inspect.
   * @returns Sorted directory names; empty when the collection is not in the
   *          YAML config.
   */
  export function getTopLevelPathsWithoutContext(db: Database, collectionName: string): string[] {
    const documentRows = db.prepare(`
      SELECT DISTINCT path FROM documents
      WHERE collection = ? AND active = 1
    `).all(collectionName) as { path: string }[];

    const configured = getCollection(collectionName);
    if (!configured) return [];

    const knownPrefixes = Array.from(
      new Set<string>(configured.context ? Object.keys(configured.context) : [])
    );

    // Only paths with at least two components contribute a directory; a file
    // sitting directly in the collection root has no top-level directory.
    const topLevel = new Set<string>();
    for (const row of documentRows) {
      const [first, ...rest] = row.path.split('/').filter(Boolean);
      if (rest.length > 0 && first) {
        topLevel.add(first);
      }
    }

    const isCovered = (dir: string): boolean =>
      knownPrefixes.some(prefix => prefix === '' || prefix === dir || dir.startsWith(prefix + '/'));

    return Array.from(topLevel)
      .filter(dir => !isCovered(dir))
      .sort();
  }
  1377. // =============================================================================
  1378. // FTS Search
  1379. // =============================================================================
  /**
   * Reduce a raw search term to the characters allowed in an FTS5 query term:
   * Unicode letters, Unicode digits and the apostrophe. Everything else is
   * dropped and the remainder is lower-cased.
   *
   * @param term Raw whitespace-delimited token from the user's query.
   * @returns The cleaned term; may be the empty string.
   */
  function sanitizeFTS5Term(term: string): string {
    // Keep the runs of allowed characters rather than deleting the others.
    const allowedRuns = term.match(/[\p{L}\p{N}']+/gu) ?? [];
    return allowedRuns.join('').toLowerCase();
  }
  /**
   * Turn free-form user input into an FTS5 MATCH expression.
   *
   * The input is split on whitespace, each token is cleaned with
   * `sanitizeFTS5Term`, and every surviving token becomes a quoted prefix
   * clause (`"term"*`). Multiple clauses are combined with `AND`.
   *
   * @param query Raw user query.
   * @returns The MATCH expression, or `null` when no usable term remains.
   */
  function buildFTS5Query(query: string): string | null {
    const prefixClauses: string[] = [];
    for (const token of query.split(/\s+/)) {
      const cleaned = sanitizeFTS5Term(token);
      if (cleaned.length > 0) {
        prefixClauses.push(`"${cleaned}"*`);
      }
    }
    // Joining a single clause yields just that clause, so one path covers both cases.
    return prefixClauses.length === 0 ? null : prefixClauses.join(' AND ');
  }
  /**
   * Full-text search over `documents_fts`, ranked by BM25.
   *
   * @param db           Database handle.
   * @param query        Raw user query; converted with `buildFTS5Query`.
   * @param limit        Maximum number of rows to return (default 20).
   * @param collectionId Legacy filter. Collections are managed in YAML, so the
   *                     value is stringified and compared against
   *                     `documents.collection` as a collection name.
   * @returns Matching active documents, best match first. `score` lies in
   *          [0, 1), higher is better, and does not depend on the other rows
   *          of the same query. Empty when the query has no usable term.
   */
  export function searchFTS(db: Database, query: string, limit: number = 20, collectionId?: number): SearchResult[] {
    const ftsQuery = buildFTS5Query(query);
    if (!ftsQuery) return [];

    let sql = `
      SELECT
        'qmd://' || d.collection || '/' || d.path as filepath,
        d.collection || '/' || d.path as display_path,
        d.title,
        content.doc as body,
        d.hash,
        bm25(documents_fts, 10.0, 1.0) as bm25_score
      FROM documents_fts f
      JOIN documents d ON d.id = f.rowid
      JOIN content ON content.hash = d.hash
      WHERE documents_fts MATCH ? AND d.active = 1
    `;
    const params: (string | number)[] = [ftsQuery];

    if (collectionId !== undefined) {
      // Legacy parameter: interpreted as a collection-name filter.
      // This code path is likely unused; filtering should happen at CLI level.
      sql += ` AND d.collection = ?`;
      params.push(String(collectionId));
    }

    // bm25 lower is better; sort ascending.
    sql += ` ORDER BY bm25_score ASC LIMIT ?`;
    params.push(limit);

    const rows = db.prepare(sql).all(...params) as { filepath: string; display_path: string; title: string; body: string; hash: string; bm25_score: number }[];

    return rows.map(row => {
      const collectionName = row.filepath.split('//')[1]?.split('/')[0] || "";

      // FTS5's bm25() returns the relevance multiplied by -1, so real matches
      // have NEGATIVE values (more negative = stronger). Clamping the raw value
      // at 0 would therefore collapse every hit to the same score. Flip the
      // sign to get a non-negative strength and squash it into [0, 1) with
      // x / (1 + x): monotonic, and stable across queries (no per-query
      // normalization), so "strong signal" heuristics keep working.
      const strength = Math.max(0, -row.bm25_score);
      const score = strength / (1 + strength);

      return {
        filepath: row.filepath,
        displayPath: row.display_path,
        title: row.title,
        hash: row.hash,
        docid: getDocid(row.hash),
        collectionName,
        modifiedAt: "", // Not available in FTS query
        bodyLength: row.body.length,
        body: row.body,
        context: getContextForFile(db, row.filepath),
        score,
        source: "fts" as const,
      };
    });
  }
  1440. // =============================================================================
  1441. // Vector Search
  1442. // =============================================================================
  /**
   * Vector (KNN) search over `vectors_vec`.
   *
   * The query text is embedded, the nearest chunks are fetched, and the chunks
   * are collapsed to one result per document (keeping the closest chunk).
   *
   * @param db           Database handle.
   * @param query        Query text to embed.
   * @param model        Embedding model name passed to `getEmbedding`.
   * @param limit        Maximum number of documents to return (default 20).
   * @param collectionId Legacy filter. Collections are managed in YAML, so the
   *                     value is stringified and compared against
   *                     `documents.collection` as a collection name.
   * @returns Documents ordered by ascending distance; `score` is
   *          `1 - distance` and `chunkPos` is the position of the best chunk.
   *          Empty when the `vectors_vec` table does not exist or no embedding
   *          could be produced.
   */
  export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionId?: number): Promise<SearchResult[]> {
    const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (!tableExists) return [];

    const embedding = await getEmbedding(query, model, true);
    if (!embedding) return [];

    // sqlite-vec requires "k = ?" for KNN queries
    let sql = `
      SELECT
        v.hash_seq,
        v.distance,
        'qmd://' || d.collection || '/' || d.path as filepath,
        d.collection || '/' || d.path as display_path,
        d.title,
        content.doc as body,
        cv.hash,
        cv.pos
      FROM vectors_vec v
      JOIN content_vectors cv ON cv.hash || '_' || cv.seq = v.hash_seq
      JOIN documents d ON d.hash = cv.hash AND d.active = 1
      JOIN content ON content.hash = d.hash
      WHERE v.embedding MATCH ? AND k = ?
    `;
    // Over-fetch (3x) because several chunks of one document can be among the
    // nearest neighbours and are collapsed below.
    const params: (Float32Array | number | string)[] = [new Float32Array(embedding), limit * 3];

    if (collectionId !== undefined) {
      // Bind the filter as a real parameter. Textually substituting into the
      // SQL would hit the first "?" (the embedding placeholder) and leave the
      // remaining placeholders misaligned with the bound values.
      sql += ` AND d.collection = ?`;
      params.push(String(collectionId));
    }

    sql += ` ORDER BY v.distance`;

    const rows = db.prepare(sql).all(...params) as { hash_seq: string; distance: number; filepath: string; display_path: string; title: string; body: string; hash: string; pos: number }[];

    // Keep only the closest chunk per document.
    const seen = new Map<string, { row: typeof rows[0]; bestDist: number }>();
    for (const row of rows) {
      const existing = seen.get(row.filepath);
      if (!existing || row.distance < existing.bestDist) {
        seen.set(row.filepath, { row, bestDist: row.distance });
      }
    }

    return Array.from(seen.values())
      .sort((a, b) => a.bestDist - b.bestDist)
      .slice(0, limit)
      .map(({ row }) => {
        const collectionName = row.filepath.split('//')[1]?.split('/')[0] || "";
        return {
          filepath: row.filepath,
          displayPath: row.display_path,
          title: row.title,
          hash: row.hash,
          docid: getDocid(row.hash),
          collectionName,
          modifiedAt: "", // Not available in vec query
          bodyLength: row.body.length,
          body: row.body,
          context: getContextForFile(db, row.filepath),
          score: 1 - row.distance, // Cosine similarity = 1 - cosine distance
          source: "vec" as const,
          chunkPos: row.pos,
        };
      });
  }
  1502. // =============================================================================
  1503. // Embeddings
  1504. // =============================================================================
  /**
   * Embed a piece of text with the default LlamaCpp instance.
   *
   * @param text    Raw query or document text.
   * @param model   Model name forwarded to `llm.embed`.
   * @param isQuery Selects the query prompt template when true, the document
   *                template otherwise; also forwarded to `llm.embed`.
   * @returns The embedding vector, or `null` when none was produced.
   */
  async function getEmbedding(text: string, model: string, isQuery: boolean): Promise<number[] | null> {
    const llm = getDefaultLlamaCpp();

    // Queries and documents use different prompt templates.
    const applyTemplate = isQuery ? formatQueryForEmbedding : formatDocForEmbedding;
    const prompt = applyTemplate(text);

    const response = await llm.embed(prompt, { model, isQuery });
    return response?.embedding || null;
  }
  /**
   * List the unique content hashes of active documents that still need
   * embeddings.
   *
   * A hash is considered pending when `content_vectors` has no row for it
   * with `seq = 0`.
   *
   * @param db Database handle.
   * @returns One row per pending hash: the hash, the document body, and a
   *          sample path (the smallest path sharing that hash) for display.
   */
  export function getHashesForEmbedding(db: Database): { hash: string; body: string; path: string }[] {
    const pendingSql = `
      SELECT d.hash, c.doc as body, MIN(d.path) as path
      FROM documents d
      JOIN content c ON d.hash = c.hash
      LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
      WHERE d.active = 1 AND v.hash IS NULL
      GROUP BY d.hash
    `;
    const pending = db.prepare(pendingSql).all();
    return pending as { hash: string; body: string; path: string }[];
  }
  /**
   * Remove every stored embedding so the index can be rebuilt from scratch.
   *
   * Empties `content_vectors` and then drops the `vectors_vec` table if it
   * exists.
   *
   * @param db Database handle.
   */
  export function clearAllEmbeddings(db: Database): void {
    const resetStatements = [
      `DELETE FROM content_vectors`,
      `DROP TABLE IF EXISTS vectors_vec`,
    ];
    for (const statement of resetStatements) {
      db.exec(statement);
    }
  }
  /**
   * Store one chunk embedding in both `vectors_vec` and `content_vectors`.
   *
   * @param db         Database handle.
   * @param hash       Content hash the chunk belongs to.
   * @param seq        Chunk sequence number within that content.
   * @param pos        Position of the chunk, stored in `content_vectors.pos`.
   * @param embedding  Embedding vector for the chunk.
   * @param model      Name of the model that produced the embedding.
   * @param embeddedAt Timestamp string stored in `content_vectors.embedded_at`.
   */
  export function insertEmbedding(
    db: Database,
    hash: string,
    seq: number,
    pos: number,
    embedding: Float32Array,
    model: string,
    embeddedAt: string
  ): void {
    // Both statements are prepared before either one runs.
    const statements = {
      vector: db.prepare(`INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`),
      metadata: db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`),
    };

    // vectors_vec is keyed by "<hash>_<seq>".
    const vectorKey = `${hash}_${seq}`;
    statements.vector.run(vectorKey, embedding);
    statements.metadata.run(hash, seq, pos, model, embeddedAt);
  }
  1553. // =============================================================================
  1554. // Query expansion
  1555. // =============================================================================
  /**
   * Expand a query into the original plus alternative phrasings, with caching.
   *
   * On a cache hit the result is the original query followed by at most two
   * cached variants. On a miss the default LlamaCpp instance is asked for two
   * expansions and its result is returned as-is; everything after the first
   * entry is cached (newline-separated) when there is more than one entry.
   *
   * @param query Query to expand.
   * @param model Model name; only used as part of the cache key (LlamaCpp uses
   *              a hardcoded model, so the parameter is otherwise ignored).
   * @param db    Database handle backing the result cache.
   * @returns The list of queries to run.
   */
  export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database): Promise<string[]> {
    const cacheKey = getCacheKey("expandQuery", { query, model });
    const cachedText = getCachedResult(db, cacheKey);

    if (cachedText) {
      const variants: string[] = [];
      for (const rawLine of cachedText.split('\n')) {
        const line = rawLine.trim();
        if (line.length > 0) {
          variants.push(line);
        }
      }
      return [query, ...variants.slice(0, 2)];
    }

    const expansions = await getDefaultLlamaCpp().expandQuery(query, 2);

    // Only the variants are cached; the first entry is excluded.
    if (expansions.length > 1) {
      setCachedResult(db, cacheKey, expansions.slice(1).join('\n'));
    }
    return expansions;
  }
  1573. // =============================================================================
  1574. // Reranking
  1575. // =============================================================================
  /**
   * Score documents against a query with the reranker, using cached scores
   * where available.
   *
   * Scores are cached per (query, file, model). Only documents without a
   * cached score are sent to the default LlamaCpp instance; fresh scores are
   * written back to the cache.
   *
   * @param query     Query to rank against.
   * @param documents Candidates as `{ file, text }`.
   * @param model     Rerank model name (cache key and `llm.rerank` option).
   * @param db        Database handle backing the result cache.
   * @returns `{ file, score }` for every input document, highest score first.
   *          A document without a usable score gets 0.
   */
  export async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db: Database): Promise<{ file: string; score: number }[]> {
    const scoreByFile: Map<string, number> = new Map();
    const pending: RerankDocument[] = [];

    // Split the candidates into cache hits and documents that still need scoring.
    for (const { file, text } of documents) {
      const hit = getCachedResult(db, getCacheKey("rerank", { query, file, model }));
      if (hit === null) {
        pending.push({ file, text });
        continue;
      }
      scoreByFile.set(file, parseFloat(hit));
    }

    if (pending.length > 0) {
      const { results } = await getDefaultLlamaCpp().rerank(query, pending, { model });
      for (const { file, score } of results) {
        setCachedResult(db, getCacheKey("rerank", { query, file, model }), score.toString());
        scoreByFile.set(file, score);
      }
    }

    const ranked = documents.map(({ file }) => ({ file, score: scoreByFile.get(file) || 0 }));
    ranked.sort((left, right) => right.score - left.score);
    return ranked;
  }
  1605. // =============================================================================
  1606. // Reciprocal Rank Fusion
  1607. // =============================================================================
  /**
   * Merge several ranked result lists with (weighted) Reciprocal Rank Fusion.
   *
   * A result at 0-based position `rank` in list `i` contributes
   * `weights[i] / (k + rank + 1)` to the fused score of its `file`. On top of
   * that, a file whose best position in any list is 0 gets +0.05, and one
   * whose best position is 1 or 2 gets +0.02.
   *
   * @param resultLists Ranked lists, best result first in each.
   * @param weights     Per-list weights; a missing weight defaults to 1.0.
   * @param k           RRF damping constant (default 60).
   * @returns One entry per distinct `file` (the first occurrence's fields with
   *          `score` replaced by the fused score), highest score first.
   */
  export function reciprocalRankFusion(
    resultLists: RankedResult[][],
    weights: number[] = [],
    k: number = 60
  ): RankedResult[] {
    const fused = new Map<string, { result: RankedResult; rrfScore: number; topRank: number }>();

    resultLists.forEach((list, listIdx) => {
      if (!list) return;
      const weight = weights[listIdx] ?? 1.0;

      list.forEach((result, rank) => {
        if (!result) return;
        const share = weight / (k + rank + 1);
        const entry = fused.get(result.file);
        if (!entry) {
          fused.set(result.file, { result, rrfScore: share, topRank: rank });
          return;
        }
        entry.rrfScore += share;
        if (rank < entry.topRank) {
          entry.topRank = rank;
        }
      });
    });

    const entries = Array.from(fused.values());

    // Top-rank bonus: reward files that reached the podium in any list.
    for (const entry of entries) {
      if (entry.topRank === 0) {
        entry.rrfScore += 0.05;
      } else if (entry.topRank <= 2) {
        entry.rrfScore += 0.02;
      }
    }

    entries.sort((left, right) => right.rrfScore - left.rrfScore);
    return entries.map(entry => ({ ...entry.result, score: entry.rrfScore }));
  }
  1647. // =============================================================================
  1648. // Document retrieval
  1649. // =============================================================================
  /**
   * Shape of a row returned by the document lookup queries in this section.
   */
  type DbDocRow = {
    /** `'qmd://' || collection || '/' || path`. */
    virtual_path: string;
    /** `collection || '/' || path`. */
    display_path: string;
    /** Document title column. */
    title: string;
    /** Content hash of the document. */
    hash: string;
    /** Name of the collection the document belongs to. */
    collection: string;
    /**
     * Path of the document.
     * NOTE(review): the SELECT lists visible in this section do not select
     * `d.path`, so this field may be absent at runtime - confirm.
     */
    path: string;
    /** `modified_at` column of the document. */
    modified_at: string;
    /** `LENGTH(content.doc)` as computed by the query. */
    body_length: number;
    /** Document body; only selected when the caller asks for it. */
    body?: string;
  };
  /**
   * Find a document by filename/path, docid (#hash), or with fuzzy matching.
   * Returns document metadata without body by default.
   *
   * Supports:
   * - Virtual paths: qmd://collection/path/to/file.md
   * - Absolute paths: /path/to/file.md
   * - Relative paths: path/to/file.md
   * - Short docid: #abc123 (first 6 chars of hash)
   *
   * A trailing `:<digits>` suffix (e.g. `notes.md:42`) is ignored for the
   * lookup, and a leading `~/` is expanded to the home directory.
   *
   * Lookup order: exact virtual path, virtual path ending with the given text,
   * then (for non-`qmd://` input) collection-relative match against every
   * configured collection.
   *
   * @param db       Database handle.
   * @param filename What the user typed.
   * @param options  `includeBody` adds the document body to the result.
   * @returns The document, or a `not_found` record carrying similar file names
   *          (none for a failed docid lookup).
   */
  export function findDocument(db: Database, filename: string, options: { includeBody?: boolean } = {}): DocumentResult | DocumentNotFound {
    let filepath = filename;

    // Strip a trailing ":<line>" suffix before resolving the path.
    const colonMatch = filepath.match(/:(\d+)$/);
    if (colonMatch) {
      filepath = filepath.slice(0, -colonMatch[0].length);
    }

    // Check if this is a docid lookup (#hash or just 6-char hex)
    if (filepath.startsWith('#') || /^[a-f0-9]{6}$/i.test(filepath)) {
      const docidMatch = findDocumentByDocid(db, filepath);
      if (!docidMatch) {
        return { error: "not_found", query: filename, similarFiles: [] };
      }
      filepath = docidMatch.filepath;
    }

    if (filepath.startsWith('~/')) {
      filepath = homedir() + filepath.slice(1);
    }

    const bodyCol = options.includeBody ? `, content.doc as body` : ``;

    // Build computed columns
    // Note: absoluteFilepath is computed from YAML collections after query
    const selectCols = `
      'qmd://' || d.collection || '/' || d.path as virtual_path,
      d.collection || '/' || d.path as display_path,
      d.title,
      d.hash,
      d.collection,
      d.modified_at,
      LENGTH(content.doc) as body_length
      ${bodyCol}
    `;

    // Try to match by virtual path first
    let doc = db.prepare(`
      SELECT ${selectCols}
      FROM documents d
      JOIN content ON content.hash = d.hash
      WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
    `).get(filepath) as DbDocRow | null;

    // Try fuzzy (suffix) match by virtual path. The user-supplied text is
    // escaped so that "%" and "_" in a file name are matched literally instead
    // of acting as LIKE wildcards (e.g. "my_notes.md" must not match
    // "my-notes.md").
    if (!doc) {
      const literalSuffix = filepath.replace(/[\\%_]/g, '\\$&');
      doc = db.prepare(`
        SELECT ${selectCols}
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? ESCAPE '\\' AND d.active = 1
        LIMIT 1
      `).get(`%${literalSuffix}`) as DbDocRow | null;
    }

    // Try to match by absolute path (requires looking up collection paths from YAML)
    if (!doc && !filepath.startsWith('qmd://')) {
      const collections = collectionsListCollections();
      for (const coll of collections) {
        let relativePath: string | null = null;

        if (filepath.startsWith(coll.path + '/')) {
          // Absolute path inside this collection: keep the part below its root.
          relativePath = filepath.slice(coll.path.length + 1);
        } else if (!filepath.startsWith('/')) {
          // Otherwise treat filepath as relative to collection
          relativePath = filepath;
        }

        if (relativePath) {
          doc = db.prepare(`
            SELECT ${selectCols}
            FROM documents d
            JOIN content ON content.hash = d.hash
            WHERE d.collection = ? AND d.path = ? AND d.active = 1
          `).get(coll.name, relativePath) as DbDocRow | null;
          if (doc) break;
        }
      }
    }

    if (!doc) {
      const similar = findSimilarFiles(db, filepath, 5, 5);
      return { error: "not_found", query: filename, similarFiles: similar };
    }

    // Get context using virtual path. display_path already starts with the
    // collection name, so the fallback must not prepend it a second time.
    const virtualPath = doc.virtual_path || `qmd://${doc.display_path}`;
    const context = getContextForFile(db, virtualPath);

    return {
      filepath: virtualPath,
      displayPath: doc.display_path,
      title: doc.title,
      context,
      hash: doc.hash,
      docid: getDocid(doc.hash),
      collectionName: doc.collection,
      modifiedAt: doc.modified_at,
      bodyLength: doc.body_length,
      ...(options.includeBody && doc.body !== undefined && { body: doc.body }),
    };
  }
  /**
   * Load the body of a document, optionally restricted to a range of lines.
   *
   * The document is resolved from `doc.filepath`: a `qmd://` virtual path is
   * matched directly; otherwise (or if that finds nothing) the path is treated
   * as an absolute path inside one of the configured collections.
   *
   * @param db       Database handle.
   * @param doc      Anything carrying a `filepath`.
   * @param fromLine 1-based first line to return (defaults to 1).
   * @param maxLines Maximum number of lines to return (defaults to the rest).
   * @returns The (possibly sliced) body, or `null` when the document cannot
   *          be resolved.
   */
  export function getDocumentBody(db: Database, doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number): string | null {
    const target = doc.filepath;
    let found: { body: string } | null = null;

    if (target.startsWith('qmd://')) {
      found = db.prepare(`
        SELECT content.doc as body
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
      `).get(target) as { body: string } | null;
    }

    if (!found) {
      // Fall back to an absolute path below one of the YAML collections.
      for (const coll of collectionsListCollections()) {
        const root = coll.path + '/';
        if (!target.startsWith(root)) continue;

        found = db.prepare(`
          SELECT content.doc as body
          FROM documents d
          JOIN content ON content.hash = d.hash
          WHERE d.collection = ? AND d.path = ? AND d.active = 1
        `).get(coll.name, target.slice(root.length)) as { body: string } | null;
        if (found) break;
      }
    }

    if (!found) return null;

    // No range requested: hand back the whole body untouched.
    if (fromLine === undefined && maxLines === undefined) {
      return found.body;
    }

    const lines = found.body.split('\n');
    const first = (fromLine || 1) - 1;
    const last = maxLines === undefined ? lines.length : first + maxLines;
    return lines.slice(first, last).join('\n');
  }
  /**
   * Find multiple documents by glob pattern or comma-separated list.
   * Returns documents without body by default (use getDocumentBody to load).
   *
   * A pattern containing a comma and neither `*` nor `?` is treated as a
   * comma-separated list of names; each name is resolved by exact virtual path
   * first and by virtual-path suffix second. Anything else is handed to
   * `matchFilesByGlob`.
   *
   * @param db      Database handle.
   * @param pattern Glob pattern or comma-separated list of names.
   * @param options `includeBody` adds bodies to the results; `maxBytes` is the
   *                size limit above which a file is reported as skipped
   *                (defaults to `DEFAULT_MULTI_GET_MAX_BYTES`).
   * @returns `docs` (found documents, oversized ones marked `skipped`) and
   *          `errors` (one message per name or pattern that matched nothing).
   */
  export function findDocuments(
    db: Database,
    pattern: string,
    options: { includeBody?: boolean; maxBytes?: number } = {}
  ): { docs: MultiGetResult[]; errors: string[] } {
    const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?');
    const errors: string[] = [];
    const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES;

    const bodyCol = options.includeBody ? `, content.doc as body` : ``;
    const selectCols = `
      'qmd://' || d.collection || '/' || d.path as virtual_path,
      d.collection || '/' || d.path as display_path,
      d.title,
      d.hash,
      d.collection,
      d.modified_at,
      LENGTH(content.doc) as body_length
      ${bodyCol}
    `;

    let fileRows: DbDocRow[];

    if (isCommaSeparated) {
      const names = pattern.split(',').map(s => s.trim()).filter(Boolean);
      fileRows = [];

      // Prepared once and reused for every name in the list.
      const exactStmt = db.prepare(`
        SELECT ${selectCols}
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
      `);
      const suffixStmt = db.prepare(`
        SELECT ${selectCols}
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? ESCAPE '\\' AND d.active = 1
        LIMIT 1
      `);

      for (const name of names) {
        let doc = exactStmt.get(name) as DbDocRow | null;
        if (!doc) {
          // Escape "%" and "_" so they match literally rather than acting as
          // LIKE wildcards (underscores are common in file names).
          const literalSuffix = name.replace(/[\\%_]/g, '\\$&');
          doc = suffixStmt.get(`%${literalSuffix}`) as DbDocRow | null;
        }

        if (doc) {
          fileRows.push(doc);
        } else {
          const similar = findSimilarFiles(db, name, 5, 3);
          let msg = `File not found: ${name}`;
          if (similar.length > 0) {
            msg += ` (did you mean: ${similar.join(', ')}?)`;
          }
          errors.push(msg);
        }
      }
    } else {
      // Glob pattern match
      const matched = matchFilesByGlob(db, pattern);
      if (matched.length === 0) {
        errors.push(`No files matched pattern: ${pattern}`);
        return { docs: [], errors };
      }

      const virtualPaths = matched.map(m => m.filepath);
      const placeholders = virtualPaths.map(() => '?').join(',');
      fileRows = db.prepare(`
        SELECT ${selectCols}
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path IN (${placeholders}) AND d.active = 1
      `).all(...virtualPaths) as DbDocRow[];
    }

    const results: MultiGetResult[] = [];
    for (const row of fileRows) {
      // display_path already starts with the collection name, so the fallback
      // must not prepend it a second time.
      const virtualPath = row.virtual_path || `qmd://${row.display_path}`;

      if (row.body_length > maxBytes) {
        results.push({
          doc: { filepath: virtualPath, displayPath: row.display_path },
          skipped: true,
          skipReason: `File too large (${Math.round(row.body_length / 1024)}KB > ${Math.round(maxBytes / 1024)}KB)`,
        });
        continue;
      }

      // Context is only needed for documents that are actually returned.
      const context = getContextForFile(db, virtualPath);

      results.push({
        doc: {
          filepath: virtualPath,
          displayPath: row.display_path,
          title: row.title || row.display_path.split('/').pop() || row.display_path,
          context,
          hash: row.hash,
          docid: getDocid(row.hash),
          collectionName: row.collection,
          modifiedAt: row.modified_at,
          bodyLength: row.body_length,
          ...(options.includeBody && row.body !== undefined && { body: row.body }),
        },
        skipped: false,
      });
    }

    return { docs: results, errors };
  }
  1907. // =============================================================================
  1908. // Status
  1909. // =============================================================================
  /**
   * Summarize the state of the index.
   *
   * For every collection in the YAML config this reports its path, pattern,
   * number of active documents and most recent `modified_at` (the current
   * time is used when the collection has no active documents). Collections
   * are ordered by that timestamp, most recent first.
   *
   * @param db Database handle.
   * @returns Totals (`totalDocuments`, `needsEmbedding`), whether the
   *          `vectors_vec` table exists, and the per-collection entries.
   */
  export function getStatus(db: Database): IndexStatus {
    const collections = collectionsListCollections().map(col => {
      const { active_count, last_doc_update } = db.prepare(`
        SELECT
          COUNT(*) as active_count,
          MAX(modified_at) as last_doc_update
        FROM documents
        WHERE collection = ? AND active = 1
      `).get(col.name) as { active_count: number; last_doc_update: string | null };

      return {
        name: col.name,
        path: col.path,
        pattern: col.pattern,
        documents: active_count,
        lastUpdated: last_doc_update || new Date().toISOString(),
      };
    });

    // Most recently updated collection first.
    const asMillis = (stamp: string): number => new Date(stamp).getTime();
    collections.sort((left, right) => {
      if (!left.lastUpdated) return 1;
      if (!right.lastUpdated) return -1;
      return asMillis(right.lastUpdated) - asMillis(left.lastUpdated);
    });

    const activeTotal = db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get() as { c: number };
    const needsEmbedding = getHashesNeedingEmbedding(db);
    const vectorTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();

    return {
      totalDocuments: activeTotal.c,
      needsEmbedding,
      hasVectorIndex: !!vectorTable,
      collections,
    };
  }
  1946. // =============================================================================
  1947. // Snippet extraction
  1948. // =============================================================================
  /**
   * Result of extracting a display snippet from a document body.
   */
  export type SnippetResult = {
    /** 1-indexed line number of the best-matching line. */
    line: number;
    /** Snippet text, preceded by a diff-style `@@ -start,count @@` header line. */
    snippet: string;
    /** Number of document lines before the snippet. */
    linesBefore: number;
    /** Number of document lines after the snippet. */
    linesAfter: number;
    /** Number of lines in the snippet itself. */
    snippetLines: number;
  };
  /**
   * Pick a short snippet of `body` around the line that best matches `query`.
   *
   * Each line is scored by how many whitespace-separated query terms it
   * contains (case-insensitive substring match); the first line with the top
   * score wins. The snippet runs from one line before it to two lines after
   * it and is cut to `maxLen` characters (ending in "...") when longer.
   *
   * With a positive `chunkPos`, only a window around that character offset
   * (100 characters before, `maxLen + 100` after) is searched. If that window
   * yields a whitespace-only snippet, the whole body is searched instead.
   *
   * @param body     Full document text.
   * @param query    User query.
   * @param maxLen   Maximum snippet length in characters (default 500).
   * @param chunkPos Optional character offset of the matching chunk.
   * @returns The snippet with a diff-style header plus line bookkeeping.
   */
  export function extractSnippet(body: string, query: string, maxLen = 500, chunkPos?: number): SnippetResult {
    // 0 means "no chunk focus"; any positive value is the chunk's offset.
    const anchor = chunkPos && chunkPos > 0 ? chunkPos : 0;
    const windowStart = anchor > 0 ? Math.max(0, anchor - 100) : 0;
    const haystack = anchor > 0
      ? body.slice(windowStart, Math.min(body.length, anchor + maxLen + 100))
      : body;
    // Number of line breaks in front of the window = 0-based line of its first line.
    const lineOffset = windowStart > 0 ? body.slice(0, windowStart).split('\n').length - 1 : 0;

    const terms = query.toLowerCase().split(/\s+/).filter(term => term.length > 0);
    const lines = haystack.split('\n');

    let bestLine = 0;
    let bestHits = -1;
    lines.forEach((line, idx) => {
      const lowered = line.toLowerCase();
      const hits = terms.reduce((count, term) => (lowered.includes(term) ? count + 1 : count), 0);
      if (hits > bestHits) {
        bestHits = hits;
        bestLine = idx;
      }
    });

    const from = Math.max(0, bestLine - 1);
    const to = Math.min(lines.length, bestLine + 3);
    const picked = lines.slice(from, to);
    const joined = picked.join('\n');

    // A chunk window that produced nothing visible: retry on the full document
    // so there is always something useful to show.
    if (anchor > 0 && joined.trim().length === 0) {
      return extractSnippet(body, query, maxLen, undefined);
    }

    const text = joined.length > maxLen ? joined.substring(0, maxLen - 3) + "..." : joined;

    const firstLine = lineOffset + from + 1; // 1-indexed
    const pickedCount = picked.length;
    const before = firstLine - 1;
    const after = body.split('\n').length - (firstLine + pickedCount - 1);

    // Diff-style header: @@ -start,count @@ (linesBefore before, linesAfter after)
    return {
      line: lineOffset + bestLine + 1,
      snippet: `@@ -${firstLine},${pickedCount} @@ (${before} before, ${after} after)\n${text}`,
      linesBefore: before,
      linesAfter: after,
      snippetLines: pickedCount,
    };
  }