store.ts 78 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322
  1. /**
  2. * QMD Store - Core data access and retrieval functions
  3. *
  4. * This module provides all database operations, search functions, and document
  5. * retrieval for QMD. It returns raw data structures that can be formatted by
  6. * CLI or MCP consumers.
  7. *
  8. * Usage:
  9. * const store = createStore("/path/to/db.sqlite");
  10. * // or use default path:
  11. * const store = createStore();
  12. */
  13. import { Database } from "bun:sqlite";
  14. import { Glob } from "bun";
  15. import { realpathSync } from "node:fs";
  16. import * as sqliteVec from "sqlite-vec";
  17. import {
  18. LlamaCpp,
  19. getDefaultLlamaCpp,
  20. formatQueryForEmbedding,
  21. formatDocForEmbedding,
  22. type RerankDocument,
  23. } from "./llm";
  24. import {
  25. findContextForPath as collectionsFindContextForPath,
  26. addContext as collectionsAddContext,
  27. removeContext as collectionsRemoveContext,
  28. listAllContexts as collectionsListAllContexts,
  29. getCollection,
  30. listCollections as collectionsListCollections,
  31. addCollection as collectionsAddCollection,
  32. removeCollection as collectionsRemoveCollection,
  33. renameCollection as collectionsRenameCollection,
  34. setGlobalContext,
  35. loadConfig as collectionsLoadConfig,
  36. type NamedCollection,
  37. } from "./collections";
  38. // =============================================================================
  39. // Configuration
  40. // =============================================================================
  /** Home directory taken from the environment; "/tmp" is used when HOME is unset or empty. */
  const HOME = Bun.env.HOME || "/tmp";

  /** Default model names for embedding, reranking and query expansion. */
  export const DEFAULT_EMBED_MODEL = "embeddinggemma";
  export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
  export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";

  /** Default file glob: every Markdown file at any depth. */
  export const DEFAULT_GLOB = "**/*.md";

  /** Default byte limit for multi-get (10 * 1024 bytes = 10KB). */
  export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024;

  /**
   * Token-based chunking parameters: 800 tokens per chunk, with an overlap of
   * 15% of the chunk size (120 tokens).
   */
  export const CHUNK_SIZE_TOKENS = 800;
  export const CHUNK_OVERLAP_TOKENS = Math.floor(CHUNK_SIZE_TOKENS * 0.15);

  /**
   * Character-based fallback for sync chunking, using an approximation of
   * ~4 characters per token (3200 chars per chunk, 480 chars of overlap).
   */
  export const CHUNK_SIZE_CHARS = CHUNK_SIZE_TOKENS * 4;
  export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4;
  53. // =============================================================================
  54. // Path utilities
  55. // =============================================================================
  /**
   * Return the home directory captured in the module-level HOME constant.
   *
   * @returns The value of HOME.
   */
  export function homedir(): string {
    return HOME;
  }
  /**
   * Join path segments into a normalized absolute path using "/" separators.
   *
   * A segment that starts with "/" discards everything accumulated before it.
   * When the first segment is relative, the result is anchored at
   * `Bun.env.PWD` (or `process.cwd()` if that is unset). Empty segments and
   * "." are dropped; ".." removes the previous component, and is a no-op at
   * the root.
   *
   * @param paths - One or more path segments.
   * @returns The normalized absolute path, always starting with "/".
   * @throws Error when called with no segments.
   */
  export function resolve(...paths: string[]): string {
    if (paths.length === 0) {
      throw new Error("resolve: at least one path segment is required");
    }

    // Only look up the working directory when the first segment is relative.
    const firstIsAbsolute = paths[0]!.startsWith('/');
    let joined = firstIsAbsolute ? '' : Bun.env.PWD || process.cwd();
    for (const segment of paths) {
      joined = segment.startsWith('/') ? segment : joined + '/' + segment;
    }

    const stack: string[] = [];
    for (const piece of joined.split('/')) {
      if (piece === '' || piece === '.') {
        continue;
      }
      if (piece === '..') {
        stack.pop();
        continue;
      }
      stack.push(piece);
    }
    return '/' + stack.join('/');
  }
  /**
   * Production-mode switch. Starts off; the original comment notes that
   * qmd.ts turns it on at startup.
   */
  let _productionMode = false;

  /**
   * Turn production mode on. There is no function in view that turns it
   * back off.
   */
  export function enableProductionMode(): void {
    _productionMode = true;
  }
  /**
   * Work out the default SQLite database path.
   *
   * Resolution order:
   * 1. `INDEX_PATH` from the environment, returned as-is (used by tests).
   * 2. Outside production mode, throw, so tests cannot write to the global
   *    index by accident.
   * 3. `<cache>/qmd/<indexName>.sqlite`, where `<cache>` is `XDG_CACHE_HOME`
   *    or `<home>/.cache`.
   *
   * @param indexName - Base name of the index file (defaults to "index").
   * @returns Absolute path of the database file.
   * @throws Error when no override is set and production mode is off.
   */
  export function getDefaultDbPath(indexName: string = "index"): string {
    const override = Bun.env.INDEX_PATH;
    if (override) {
      return override;
    }

    if (!_productionMode) {
      throw new Error(
        "Database path not set. Tests must set INDEX_PATH env var or use createStore() with explicit path. " +
        "This prevents tests from accidentally writing to the global index."
      );
    }

    const cacheRoot = Bun.env.XDG_CACHE_HOME || resolve(homedir(), ".cache");
    const qmdCacheDir = resolve(cacheRoot, "qmd");

    // Best-effort directory creation: an exception thrown by the spawn call is ignored.
    try {
      Bun.spawnSync(["mkdir", "-p", qmdCacheDir]);
    } catch { }

    return resolve(qmdCacheDir, `${indexName}.sqlite`);
  }
  /**
   * Return the current working directory, preferring the `PWD` environment
   * variable over `process.cwd()`.
   *
   * @returns `process.env.PWD` when set and non-empty, otherwise `process.cwd()`.
   */
  export function getPwd(): string {
    const fromEnv = process.env.PWD;
    return fromEnv ? fromEnv : process.cwd();
  }
  /**
   * Return the canonical path from `realpathSync`. If that call throws, fall
   * back to the normalized path from `resolve()`.
   *
   * @param path - Path to canonicalize.
   * @returns The real path, or the `resolve()`d path if `realpathSync` throws.
   */
  export function getRealPath(path: string): string {
    let canonical: string;
    try {
      canonical = realpathSync(path);
    } catch {
      canonical = resolve(path);
    }
    return canonical;
  }
  111. // =============================================================================
  112. // Virtual Path Utilities (qmd://)
  113. // =============================================================================
  /** Components of a parsed `qmd://collection/path` virtual path. */
  export type VirtualPath = {
    /** Collection segment that follows `qmd://`. */
    collectionName: string;
    /** Path relative to the collection. */
    path: string;
  };
  /**
   * Bring explicitly-virtual paths into the canonical `qmd://...` form.
   *
   * Recognized inputs (after trimming surrounding whitespace):
   * - `qmd://collection/path.md`   -> unchanged
   * - `qmd:////collection/path.md` -> extra slashes collapsed to exactly two
   * - `//collection/path.md`       -> `qmd:` prefix added
   *
   * Anything else (filesystem paths, docids, bare `collection/path.md`) is
   * returned trimmed but otherwise untouched. A `:linenum` suffix is not
   * handled here and must be stripped by the caller beforehand.
   *
   * @param input - Raw path string.
   * @returns Normalized virtual path, or the trimmed input if it is not explicitly virtual.
   */
  export function normalizeVirtualPath(input: string): string {
    const trimmed = input.trim();
    const hasScheme = trimmed.startsWith('qmd:');

    if (!hasScheme && !trimmed.startsWith('//')) {
      return trimmed;
    }

    // Drop the "qmd:" scheme if present, then strip every leading slash so
    // exactly two can be re-added.
    const afterScheme = hasScheme ? trimmed.slice(4) : trimmed;
    const withoutLeadingSlashes = afterScheme.replace(/^\/+/, '');
    return `qmd://${withoutLeadingSlashes}`;
  }
  /**
   * Split a virtual path such as "qmd://collection-name/path/to/file.md" into
   * its collection name and collection-relative path.
   *
   * The input is normalized first. Collection roots are accepted in both
   * forms ("qmd://name" and "qmd://name/"), yielding an empty `path`.
   *
   * @param virtualPath - Virtual path in any form accepted by normalizeVirtualPath.
   * @returns The parsed components, or null when the input is not a virtual path
   *          or has no collection name.
   */
  export function parseVirtualPath(virtualPath: string): VirtualPath | null {
    const canonical = normalizeVirtualPath(virtualPath);

    // Group 1: collection name (no slashes). Group 2: the rest, after an optional slash.
    const parts = /^qmd:\/\/([^\/]+)\/?(.*)$/.exec(canonical);
    const collectionName = parts?.[1];
    if (!collectionName) {
      return null;
    }

    return {
      collectionName,
      path: parts?.[2] ?? '',
    };
  }
  /**
   * Compose a `qmd://` virtual path from a collection name and a path
   * relative to that collection.
   *
   * @param collectionName - Name of the collection.
   * @param path - Path within the collection; an empty string yields "qmd://<name>/".
   * @returns The virtual path `qmd://<collectionName>/<path>`.
   */
  export function buildVirtualPath(collectionName: string, path: string): string {
    const collectionRoot = `qmd://${collectionName}`;
    return `${collectionRoot}/${path}`;
  }
  /**
   * Tell whether a path is written in an explicit virtual-path form.
   *
   * Only two spellings count (surrounding whitespace is ignored):
   * - anything starting with `qmd:` (any number of slashes may follow)
   * - anything starting with `//` (the `qmd:` prefix was omitted)
   *
   * A bare `collection/path.md` is NOT treated as virtual; deciding whether
   * its first component names a collection is left to the caller.
   *
   * @param path - Path string to classify.
   * @returns true for an explicit virtual path, false otherwise.
   */
  export function isVirtualPath(path: string): boolean {
    const candidate = path.trim();
    return candidate.startsWith('qmd:') || candidate.startsWith('//');
  }
  /**
   * Map a virtual path to an absolute filesystem path.
   *
   * @param db - Database handle passed through to getCollectionByName.
   * @param virtualPath - Virtual path to resolve.
   * @returns The collection's `pwd` joined with the collection-relative path,
   *          or null when the path does not parse or the collection is not found.
   */
  export function resolveVirtualPath(db: Database, virtualPath: string): string | null {
    const target = parseVirtualPath(virtualPath);
    // The collection is only looked up once the path has parsed.
    const collection = target ? getCollectionByName(db, target.collectionName) : null;
    if (!target || !collection) {
      return null;
    }
    return resolve(collection.pwd, target.path);
  }
  /**
   * Translate an absolute filesystem path into its `qmd://` virtual path.
   *
   * Each configured collection is checked in turn. A collection matches when
   * the path equals its root or lies beneath it, and a virtual path is only
   * returned if an active row for that collection and relative path exists in
   * the `documents` table.
   *
   * @param db - Database used to confirm the document is indexed.
   * @param absolutePath - Absolute filesystem path of the file.
   * @returns The virtual path, or null if no collection holds an active
   *          document at that path.
   */
  export function toVirtualPath(db: Database, absolutePath: string): string | null {
    // Collections come from the YAML config, not from the database.
    for (const coll of collectionsListCollections()) {
      const rootPrefix = coll.path + '/';
      const isBelowRoot = absolutePath.startsWith(rootPrefix);
      if (!isBelowRoot && absolutePath !== coll.path) {
        continue;
      }

      // The collection root itself maps to an empty relative path.
      const relativePath = isBelowRoot ? absolutePath.slice(rootPrefix.length) : '';

      const indexedRow = db.prepare(`
        SELECT d.path
        FROM documents d
        WHERE d.collection = ? AND d.path = ? AND d.active = 1
        LIMIT 1
      `).get(coll.name, relativePath) as { path: string } | null;

      if (indexedRow) {
        return buildVirtualPath(coll.name, relativePath);
      }
    }
    return null;
  }
  225. // =============================================================================
  226. // Database initialization
  227. // =============================================================================
  // On macOS, use Homebrew's SQLite, which supports extensions.
  // Both default Homebrew prefixes are probed: /opt/homebrew (Apple Silicon)
  // first, then /usr/local (Intel). The first library found with a non-zero
  // size is used; if none is found the default SQLite stays in place.
  if (process.platform === "darwin") {
    const homebrewSqliteCandidates = [
      "/opt/homebrew/opt/sqlite/lib/libsqlite3.dylib",
      "/usr/local/opt/sqlite/lib/libsqlite3.dylib",
    ];
    for (const candidate of homebrewSqliteCandidates) {
      try {
        if (Bun.file(candidate).size > 0) {
          Database.setCustomSQLite(candidate);
          break;
        }
      } catch {
        // Best effort: any failure while probing or loading this candidate is
        // ignored and the next one is tried.
      }
    }
  }
  /**
   * Prepare a freshly opened database: load the sqlite-vec extension, set
   * pragmas, drop legacy tables, and create the schema (tables, indexes, the
   * FTS5 table and the triggers that keep it in sync) if it does not exist.
   *
   * @param db - Open database handle to initialize.
   */
  function initializeDatabase(db: Database): void {
    sqliteVec.load(db);

    const run = (sql: string): void => {
      db.exec(sql);
    };

    run("PRAGMA journal_mode = WAL");
    run("PRAGMA foreign_keys = ON");

    // Legacy tables whose data is now managed in YAML.
    for (const legacyTable of ["path_contexts", "collections"]) {
      run(`DROP TABLE IF EXISTS ${legacyTable}`);
    }

    // Content-addressable storage: the source of truth for document content.
    run(`
      CREATE TABLE IF NOT EXISTS content (
        hash TEXT PRIMARY KEY,
        doc TEXT NOT NULL,
        created_at TEXT NOT NULL
      )
    `);

    // Documents: file-system layer mapping (collection, path) to a content hash.
    // Collections themselves are managed in ~/.config/qmd/index.yml.
    run(`
      CREATE TABLE IF NOT EXISTS documents (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        collection TEXT NOT NULL,
        path TEXT NOT NULL,
        title TEXT NOT NULL,
        hash TEXT NOT NULL,
        created_at TEXT NOT NULL,
        modified_at TEXT NOT NULL,
        active INTEGER NOT NULL DEFAULT 1,
        FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
        UNIQUE(collection, path)
      )
    `);

    const documentIndexes: Array<[string, string]> = [
      ["idx_documents_collection", "collection, active"],
      ["idx_documents_hash", "hash"],
      ["idx_documents_path", "path, active"],
    ];
    for (const [indexName, columns] of documentIndexes) {
      run(`CREATE INDEX IF NOT EXISTS ${indexName} ON documents(${columns})`);
    }

    // Cache table for LLM API calls.
    run(`
      CREATE TABLE IF NOT EXISTS llm_cache (
        hash TEXT PRIMARY KEY,
        result TEXT NOT NULL,
        created_at TEXT NOT NULL
      )
    `);

    // Content vectors: a pre-existing table without a `seq` column uses the
    // old layout, so it is dropped together with the vector table.
    const vectorColumns = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
    const lacksSeqColumn = !vectorColumns.some(col => col.name === 'seq');
    if (vectorColumns.length > 0 && lacksSeqColumn) {
      run(`DROP TABLE IF EXISTS content_vectors`);
      run(`DROP TABLE IF EXISTS vectors_vec`);
    }
    run(`
      CREATE TABLE IF NOT EXISTS content_vectors (
        hash TEXT NOT NULL,
        seq INTEGER NOT NULL DEFAULT 0,
        pos INTEGER NOT NULL DEFAULT 0,
        model TEXT NOT NULL,
        embedded_at TEXT NOT NULL,
        PRIMARY KEY (hash, seq)
      )
    `);

    // Full-text search over filepath (collection/path), title and body.
    run(`
      CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
        filepath, title, body,
        tokenize='porter unicode61'
      )
    `);

    // Triggers that mirror inserts, deletes and updates on documents into documents_fts.
    run(`
      CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents
      WHEN new.active = 1
      BEGIN
        INSERT INTO documents_fts(rowid, filepath, title, body)
        SELECT
          new.id,
          new.collection || '/' || new.path,
          new.title,
          (SELECT doc FROM content WHERE hash = new.hash)
        WHERE new.active = 1;
      END
    `);
    run(`
      CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
        DELETE FROM documents_fts WHERE rowid = old.id;
      END
    `);
    run(`
      CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents
      BEGIN
        -- Delete from FTS if no longer active
        DELETE FROM documents_fts WHERE rowid = old.id AND new.active = 0;
        -- Update FTS if still/newly active
        INSERT OR REPLACE INTO documents_fts(rowid, filepath, title, body)
        SELECT
          new.id,
          new.collection || '/' || new.path,
          new.title,
          (SELECT doc FROM content WHERE hash = new.hash)
        WHERE new.active = 1;
      END
    `);
  }
  /**
   * Make sure the `vectors_vec` virtual table exists with the expected schema.
   *
   * The stored CREATE statement is inspected. If the table already has the
   * requested dimension count, a `hash_seq` column and the cosine distance
   * metric, nothing happens. Otherwise the table is dropped and recreated
   * empty.
   *
   * @param db - Open database handle.
   * @param dimensions - Embedding vector length; must be a positive integer.
   * @throws Error when `dimensions` is not a positive integer. This is checked
   *         before anything is dropped, so a bad value cannot destroy an
   *         existing table and then fail to recreate it.
   */
  function ensureVecTableInternal(db: Database, dimensions: number): void {
    // `dimensions` is interpolated into DDL below, so validate it first.
    if (!Number.isInteger(dimensions) || dimensions <= 0) {
      throw new Error(`ensureVecTable: dimensions must be a positive integer, got ${dimensions}`);
    }

    const tableInfo = db.prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get() as { sql: string } | null;
    if (tableInfo) {
      const match = tableInfo.sql.match(/float\[(\d+)\]/);
      const hasHashSeq = tableInfo.sql.includes('hash_seq');
      const hasCosine = tableInfo.sql.includes('distance_metric=cosine');
      const existingDims = match?.[1] ? parseInt(match[1], 10) : null;
      if (existingDims === dimensions && hasHashSeq && hasCosine) return;
      // Table exists but with the wrong schema: rebuild it.
      db.exec("DROP TABLE IF EXISTS vectors_vec");
    }
    db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}] distance_metric=cosine)`);
  }
  351. // =============================================================================
  352. // Store Factory
  353. // =============================================================================
  /**
   * Handle returned by createStore(): the open database plus the module's
   * operations, exposed as properties grouped by area.
   */
  export type Store = {
    /** Open database handle. */
    db: Database;
    /** Path the database was opened from. */
    dbPath: string;
    close: () => void;
    ensureVecTable: (dimensions: number) => void;

    // ---- Index health ----
    getHashesNeedingEmbedding: () => number;
    getIndexHealth: () => IndexHealthInfo;
    getStatus: () => IndexStatus;

    // ---- Caching ----
    getCacheKey: typeof getCacheKey;
    getCachedResult: (cacheKey: string) => string | null;
    setCachedResult: (cacheKey: string, result: string) => void;
    clearCache: () => void;

    // ---- Cleanup and maintenance ----
    deleteLLMCache: () => number;
    deleteInactiveDocuments: () => number;
    cleanupOrphanedContent: () => number;
    cleanupOrphanedVectors: () => number;
    vacuumDatabase: () => void;

    // ---- Context ----
    getContextForFile: (filepath: string) => string | null;
    getContextForPath: (collectionName: string, path: string) => string | null;
    getCollectionByName: (
      name: string
    ) => { name: string; pwd: string; glob_pattern: string } | null;
    getCollectionsWithoutContext: () => Array<{ name: string; pwd: string; doc_count: number }>;
    getTopLevelPathsWithoutContext: (collectionName: string) => Array<string>;

    // ---- Virtual paths ----
    parseVirtualPath: typeof parseVirtualPath;
    buildVirtualPath: typeof buildVirtualPath;
    isVirtualPath: typeof isVirtualPath;
    resolveVirtualPath: (virtualPath: string) => string | null;
    toVirtualPath: (absolutePath: string) => string | null;

    // ---- Search ----
    searchFTS: (
      query: string,
      limit?: number,
      collectionId?: number
    ) => Array<SearchResult>;
    searchVec: (
      query: string,
      model: string,
      limit?: number,
      collectionId?: number
    ) => Promise<Array<SearchResult>>;

    // ---- Query expansion & reranking ----
    expandQuery: (query: string, model?: string) => Promise<Array<string>>;
    rerank: (
      query: string,
      documents: Array<{ file: string; text: string }>,
      model?: string
    ) => Promise<Array<{ file: string; score: number }>>;

    // ---- Document retrieval ----
    findDocument: (
      filename: string,
      options?: { includeBody?: boolean }
    ) => DocumentResult | DocumentNotFound;
    getDocumentBody: (
      doc: DocumentResult | { filepath: string },
      fromLine?: number,
      maxLines?: number
    ) => string | null;
    findDocuments: (
      pattern: string,
      options?: { includeBody?: boolean; maxBytes?: number }
    ) => { docs: Array<MultiGetResult>; errors: Array<string> };

    // ---- Fuzzy matching and docid lookup ----
    findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => Array<string>;
    matchFilesByGlob: (
      pattern: string
    ) => Array<{ filepath: string; displayPath: string; bodyLength: number }>;
    findDocumentByDocid: (docid: string) => { filepath: string; hash: string } | null;

    // ---- Document indexing operations ----
    insertContent: (hash: string, content: string, createdAt: string) => void;
    insertDocument: (
      collectionName: string,
      path: string,
      title: string,
      hash: string,
      createdAt: string,
      modifiedAt: string
    ) => void;
    findActiveDocument: (
      collectionName: string,
      path: string
    ) => { id: number; hash: string; title: string } | null;
    updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => void;
    updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => void;
    deactivateDocument: (collectionName: string, path: string) => void;
    getActiveDocumentPaths: (collectionName: string) => Array<string>;

    // ---- Vector/embedding operations ----
    getHashesForEmbedding: () => Array<{ hash: string; body: string; path: string }>;
    clearAllEmbeddings: () => void;
    insertEmbedding: (
      hash: string,
      seq: number,
      pos: number,
      embedding: Float32Array,
      model: string,
      embeddedAt: string
    ) => void;
  };
  /**
   * Open (or create) the SQLite database at `dbPath`, initialize its schema,
   * and return a Store whose methods are bound to that database.
   *
   * If no path is given, getDefaultDbPath() supplies one; note that it throws
   * outside production mode unless INDEX_PATH is set.
   *
   * If schema initialization fails, the newly opened database handle is closed
   * before the error is rethrown, so a failed call does not leak a connection.
   *
   * @param dbPath - Path to the SQLite database file (optional).
   * @returns Store instance with all methods bound to the database.
   * @throws Whatever getDefaultDbPath, the Database constructor or
   *         initializeDatabase throws.
   */
  export function createStore(dbPath?: string): Store {
    const resolvedPath = dbPath || getDefaultDbPath();
    const db = new Database(resolvedPath);

    try {
      initializeDatabase(db);
    } catch (err) {
      // Release the handle; a secondary failure while closing must not mask
      // the original initialization error.
      try {
        db.close();
      } catch { }
      throw err;
    }

    return {
      db,
      dbPath: resolvedPath,
      close: () => db.close(),
      ensureVecTable: (dimensions: number) => ensureVecTableInternal(db, dimensions),

      // Index health
      getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db),
      getIndexHealth: () => getIndexHealth(db),
      getStatus: () => getStatus(db),

      // Caching
      getCacheKey,
      getCachedResult: (cacheKey: string) => getCachedResult(db, cacheKey),
      setCachedResult: (cacheKey: string, result: string) => setCachedResult(db, cacheKey, result),
      clearCache: () => clearCache(db),

      // Cleanup and maintenance
      deleteLLMCache: () => deleteLLMCache(db),
      deleteInactiveDocuments: () => deleteInactiveDocuments(db),
      cleanupOrphanedContent: () => cleanupOrphanedContent(db),
      cleanupOrphanedVectors: () => cleanupOrphanedVectors(db),
      vacuumDatabase: () => vacuumDatabase(db),

      // Context
      getContextForFile: (filepath: string) => getContextForFile(db, filepath),
      getContextForPath: (collectionName: string, path: string) => getContextForPath(db, collectionName, path),
      getCollectionByName: (name: string) => getCollectionByName(db, name),
      getCollectionsWithoutContext: () => getCollectionsWithoutContext(db),
      getTopLevelPathsWithoutContext: (collectionName: string) => getTopLevelPathsWithoutContext(db, collectionName),

      // Virtual paths
      parseVirtualPath,
      buildVirtualPath,
      isVirtualPath,
      resolveVirtualPath: (virtualPath: string) => resolveVirtualPath(db, virtualPath),
      toVirtualPath: (absolutePath: string) => toVirtualPath(db, absolutePath),

      // Search
      searchFTS: (query: string, limit?: number, collectionId?: number) => searchFTS(db, query, limit, collectionId),
      searchVec: (query: string, model: string, limit?: number, collectionId?: number) => searchVec(db, query, model, limit, collectionId),

      // Query expansion & reranking
      expandQuery: (query: string, model?: string) => expandQuery(query, model, db),
      rerank: (query: string, documents: { file: string; text: string }[], model?: string) => rerank(query, documents, model, db),

      // Document retrieval
      findDocument: (filename: string, options?: { includeBody?: boolean }) => findDocument(db, filename, options),
      getDocumentBody: (doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number) => getDocumentBody(db, doc, fromLine, maxLines),
      findDocuments: (pattern: string, options?: { includeBody?: boolean; maxBytes?: number }) => findDocuments(db, pattern, options),

      // Fuzzy matching and docid lookup
      findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => findSimilarFiles(db, query, maxDistance, limit),
      matchFilesByGlob: (pattern: string) => matchFilesByGlob(db, pattern),
      findDocumentByDocid: (docid: string) => findDocumentByDocid(db, docid),

      // Document indexing operations
      insertContent: (hash: string, content: string, createdAt: string) => insertContent(db, hash, content, createdAt),
      insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt),
      findActiveDocument: (collectionName: string, path: string) => findActiveDocument(db, collectionName, path),
      updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => updateDocumentTitle(db, documentId, title, modifiedAt),
      updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => updateDocument(db, documentId, title, hash, modifiedAt),
      deactivateDocument: (collectionName: string, path: string) => deactivateDocument(db, collectionName, path),
      getActiveDocumentPaths: (collectionName: string) => getActiveDocumentPaths(db, collectionName),

      // Vector/embedding operations
      getHashesForEmbedding: () => getHashesForEmbedding(db),
      clearAllEmbeddings: () => clearAllEmbeddings(db),
      insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt),
    };
  }
  484. // =============================================================================
  485. // Core Document Type
  486. // =============================================================================
  /**
   * Single document record carrying all of its metadata.
   * `body` is optional; call getDocumentBody() to load it separately when needed.
   */
  export type DocumentResult = {
    /** Full filesystem path. */
    filepath: string;
    /** Short display path (e.g., "docs/readme.md"). */
    displayPath: string;
    /** Document title (from first heading or filename). */
    title: string;
    /** Folder context description, if one is configured. */
    context: string | null;
    /** Content hash for caching/change detection. */
    hash: string;
    /** Short docid (first 6 chars of hash) for quick reference. */
    docid: string;
    /** Parent collection name. */
    collectionName: string;
    /** Last modification timestamp. */
    modifiedAt: string;
    /** Body length in bytes (useful before loading the body). */
    bodyLength: number;
    /** Document body (optional, load with getDocumentBody). */
    body?: string;
  };
  /**
   * Derive the short docid for a content hash: its first six characters.
   * A hash shorter than six characters is returned unchanged.
   */
  export function getDocid(hash: string): string {
    const DOCID_LENGTH = 6;
    return hash.substring(0, DOCID_LENGTH);
  }
  /**
   * Turn a file path into a token-friendly handle.
   *
   * - `___` is treated as a folder separator (`/`)
   * - everything is lowercased
   * - each run of characters that are neither letters nor digits (Unicode-aware)
   *   collapses into a single dash, and dashes at the edges of a segment are dropped
   * - folder structure is kept; segments that end up empty are removed
   * - a trailing `.ext` (ASCII letters/digits) on the last segment is preserved
   *
   * @throws Error if the path is empty/blank, if the filename (without extension)
   *         contains no letter or digit, or if nothing is left after cleaning.
   */
  export function handelize(path: string): string {
    if (!path || path.trim() === '') {
      throw new Error('handelize: path cannot be empty');
    }

    // Reject inputs whose filename is only an extension, dots, or punctuation.
    const rawSegments = path.split('/').filter(Boolean);
    const rawFilename = rawSegments[rawSegments.length - 1] || '';
    const rawStem = rawFilename.replace(/\.[^.]+$/, '');
    if (!/[\p{L}\p{N}]/u.test(rawStem)) {
      throw new Error(`handelize: path "${path}" has no valid filename content`);
    }

    // Collapse non letter/digit runs into dashes and trim dashes at both ends.
    const slug = (text: string): string =>
      text.replace(/[^\p{L}\p{N}]+/gu, '-').replace(/^-+|-+$/g, '');

    const parts = path.replace(/___/g, '/').toLowerCase().split('/');
    const lastIndex = parts.length - 1;
    const cleaned: string[] = [];

    parts.forEach((part, index) => {
      let handle: string;
      if (index !== lastIndex) {
        // Directory segment: plain cleanup.
        handle = slug(part);
      } else {
        // Filename segment: clean the stem only and re-attach the extension.
        const ext = part.match(/(\.[a-z0-9]+)$/i)?.[1] ?? '';
        const stem = ext ? part.slice(0, -ext.length) : part;
        handle = slug(stem) + ext;
      }
      if (handle) {
        cleaned.push(handle);
      }
    });

    const result = cleaned.join('/');
    if (!result) {
      throw new Error(`handelize: path "${path}" resulted in empty string after processing`);
    }
    return result;
  }
  /**
   * A DocumentResult augmented with ranking information from a search.
   */
  export type SearchResult = DocumentResult & {
    /** Relevance score (0-1) */
    score: number;
    /** Search source (full-text or vector) */
    source: "fts" | "vec";
    /** Character position of matching chunk (for vector search) */
    chunkPos?: number;
  };
  /**
   * Simplified ranked result used internally for RRF fusion.
   */
  export type RankedResult = {
    /** File identifier of the ranked document */
    file: string;
    /** Short path shown to the user */
    displayPath: string;
    /** Document title */
    title: string;
    /** Document body text */
    body: string;
    /** Ranking score */
    score: number;
  };
  /**
   * Error-shaped result returned when a document cannot be found.
   */
  export type DocumentNotFound = {
    /** Discriminator marking this value as a "not found" result */
    error: "not_found";
    /** The lookup string that produced no match */
    query: string;
    /** Paths of similarly named files offered as suggestions */
    similarFiles: string[];
  };
  /**
   * One entry of a multi-get operation: either a loaded document
   * (`skipped: false`) or a skipped one that carries only its paths plus the
   * reason it was skipped (`skipped: true`).
   */
  export type MultiGetResult =
    | {
        doc: DocumentResult;
        skipped: false;
      }
    | {
        doc: Pick<DocumentResult, "filepath" | "displayPath">;
        skipped: true;
        skipReason: string;
      };
  /**
   * Summary of one collection as reported in the index status.
   */
  export type CollectionInfo = {
    /** Collection name */
    name: string;
    /** Collection path */
    path: string;
    /** File pattern of the collection */
    pattern: string;
    /** Number of documents */
    documents: number;
    /** Timestamp of the last update */
    lastUpdated: string;
  };
  /**
   * Overall status of the index.
   */
  export type IndexStatus = {
    /** Total number of documents */
    totalDocuments: number;
    /** Number of items still needing an embedding */
    needsEmbedding: number;
    /** Whether a vector index is present */
    hasVectorIndex: boolean;
    /** Per-collection summaries */
    collections: CollectionInfo[];
  };
  610. // =============================================================================
  611. // Index health
  612. // =============================================================================
  /**
   * Count the distinct content hashes of active documents that have no
   * `content_vectors` row with `seq = 0`.
   *
   * @param db Database handle
   * @returns Number of distinct hashes without such a vector row
   */
  export function getHashesNeedingEmbedding(db: Database): number {
    const sql = `
      SELECT COUNT(DISTINCT d.hash) as count
      FROM documents d
      LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
      WHERE d.active = 1 AND v.hash IS NULL
    `;
    const { count } = db.prepare(sql).get() as { count: number };
    return count;
  }
  /**
   * Health snapshot of the index as produced by getIndexHealth().
   */
  export type IndexHealthInfo = {
    /** Number of content hashes still lacking an embedding */
    needsEmbedding: number;
    /** Number of active documents */
    totalDocs: number;
    /** Whole days since the most recent modification, or null if unknown */
    daysStale: number | null;
  };
  /**
   * Gather index health figures: hashes still needing an embedding, the number
   * of active documents, and how many whole days have passed since the latest
   * `modified_at` among active documents.
   *
   * @param db Database handle
   * @returns Health info; `daysStale` is null when there is no latest timestamp
   */
  export function getIndexHealth(db: Database): IndexHealthInfo {
    const MS_PER_DAY = 24 * 60 * 60 * 1000;

    const needsEmbedding = getHashesNeedingEmbedding(db);

    const countRow = db
      .prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`)
      .get() as { count: number };

    const latestRow = db
      .prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`)
      .get() as { latest: string | null };

    const latest = latestRow?.latest;
    const daysStale: number | null = latest
      ? Math.floor((Date.now() - new Date(latest).getTime()) / MS_PER_DAY)
      : null;

    return { needsEmbedding, totalDocs: countRow.count, daysStale };
  }
  638. // =============================================================================
  639. // Caching
  640. // =============================================================================
  /**
   * Build a cache key for a request: the hex SHA-256 digest of the URL followed
   * by the JSON serialization of the body.
   *
   * @param url Request URL
   * @param body Request payload (serialized with JSON.stringify)
   * @returns Hex digest string
   */
  export function getCacheKey(url: string, body: object): string {
    const hasher = new Bun.CryptoHasher("sha256");
    for (const part of [url, JSON.stringify(body)]) {
      hasher.update(part);
    }
    return hasher.digest("hex");
  }
  /**
   * Look up a cached result in `llm_cache` by its cache key.
   *
   * @param db Database handle
   * @param cacheKey Key stored in the `hash` column
   * @returns The cached result string, or null when no row exists
   */
  export function getCachedResult(db: Database, cacheKey: string): string | null {
    const row = db
      .prepare(`SELECT result FROM llm_cache WHERE hash = ?`)
      .get(cacheKey) as { result: string } | null;
    // Use ?? rather than || so that a cached empty string counts as a hit
    // instead of being reported as a miss.
    return row?.result ?? null;
  }
  /**
   * Store (or overwrite) a result in `llm_cache` under the given key, stamped
   * with the current time. On roughly 1% of calls the cache is also pruned to
   * the 1000 rows with the newest `created_at`.
   *
   * @param db Database handle
   * @param cacheKey Key stored in the `hash` column
   * @param result Result text to cache
   */
  export function setCachedResult(db: Database, cacheKey: string, result: string): void {
    const createdAt = new Date().toISOString();
    const upsert = db.prepare(`INSERT OR REPLACE INTO llm_cache (hash, result, created_at) VALUES (?, ?, ?)`);
    upsert.run(cacheKey, result, createdAt);

    // Probabilistic pruning keeps the table bounded without paying for it on every write.
    const shouldPrune = Math.random() < 0.01;
    if (!shouldPrune) {
      return;
    }
    db.exec(`DELETE FROM llm_cache WHERE hash NOT IN (SELECT hash FROM llm_cache ORDER BY created_at DESC LIMIT 1000)`);
  }
  /**
   * Remove every row from `llm_cache`.
   *
   * @param db Database handle
   */
  export function clearCache(db: Database): void {
    const wipeCache = `DELETE FROM llm_cache`;
    db.exec(wipeCache);
  }
  661. // =============================================================================
  662. // Cleanup and maintenance operations
  663. // =============================================================================
  /**
   * Delete all cached LLM API responses.
   *
   * @param db Database handle
   * @returns Number of cache rows that were deleted
   */
  export function deleteLLMCache(db: Database): number {
    const { changes } = db.prepare(`DELETE FROM llm_cache`).run();
    return changes;
  }
  /**
   * Permanently delete document rows that are marked inactive (`active = 0`).
   *
   * @param db Database handle
   * @returns Number of inactive document rows that were deleted
   */
  export function deleteInactiveDocuments(db: Database): number {
    const { changes } = db.prepare(`DELETE FROM documents WHERE active = 0`).run();
    return changes;
  }
  /**
   * Delete rows from `content` whose hash is not used by any active document.
   *
   * @param db Database handle
   * @returns Number of content rows that were deleted
   */
  export function cleanupOrphanedContent(db: Database): number {
    const purge = db.prepare(`
      DELETE FROM content
      WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
    `);
    const { changes } = purge.run();
    return changes;
  }
  /**
   * Delete embedding rows whose hash is not used by any active document.
   *
   * Does nothing (returns 0) when the `vectors_vec` table is absent or when no
   * orphaned `content_vectors` rows are found. Otherwise the matching
   * `vectors_vec` entries (keyed `hash_seq`) are removed first, then the
   * `content_vectors` rows.
   *
   * @param db Database handle
   * @returns Number of orphaned `content_vectors` rows counted before deletion
   */
  export function cleanupOrphanedVectors(db: Database): number {
    const hasVectorTable = Boolean(
      db.prepare(`
        SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'
      `).get()
    );
    if (!hasVectorTable) {
      return 0;
    }

    // The count is taken up front because it is what gets reported to the caller.
    const { c: orphanCount } = db.prepare(`
      SELECT COUNT(*) as c FROM content_vectors cv
      WHERE NOT EXISTS (
        SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
      )
    `).get() as { c: number };
    if (orphanCount === 0) {
      return 0;
    }

    // vectors_vec must go first: its keys are derived from the content_vectors
    // rows that the second statement removes.
    db.exec(`
      DELETE FROM vectors_vec WHERE hash_seq IN (
        SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
        WHERE NOT EXISTS (
          SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
        )
      )
    `);
    db.exec(`
      DELETE FROM content_vectors WHERE hash NOT IN (
        SELECT hash FROM documents WHERE active = 1
      )
    `);

    return orphanCount;
  }
  /**
   * Issue a `VACUUM` statement, which rebuilds the database file to reclaim
   * unused space and eliminate fragmentation.
   *
   * @param db Database handle
   */
  export function vacuumDatabase(db: Database): void {
    const statement = `VACUUM`;
    db.exec(statement);
  }
  737. // =============================================================================
  738. // Document helpers
  739. // =============================================================================
  /**
   * Compute the hex-encoded SHA-256 digest of a document's content.
   *
   * The function is declared async, so callers receive a Promise even though
   * no step inside it awaits.
   *
   * @param content Text to hash
   * @returns Promise resolving to the hex digest
   */
  export async function hashContent(content: string): Promise<string> {
    const hasher = new Bun.CryptoHasher("sha256");
    hasher.update(content);
    const digest = hasher.digest("hex");
    return digest;
  }
  /**
   * Derive a document title from its content.
   *
   * The first `#` or `##` heading is used. When that heading is the generic
   * "Notes" / "📝 Notes", the next `##` heading that follows it is used instead
   * (if there is one). Without any heading, the filename is used: a trailing
   * `.md` is removed and only the last path segment is kept.
   *
   * @param content Document text
   * @param filename Path or filename used as a fallback
   * @returns The extracted title
   */
  export function extractTitle(content: string, filename: string): string {
    const heading = /^##?\s+(.+)$/m.exec(content);
    if (!heading) {
      return filename.replace(/\.md$/, "").split("/").pop() || filename;
    }

    const title = (heading[1] ?? "").trim();
    if (title === "📝 Notes" || title === "Notes") {
      // Search only the text after the generic heading. Scanning the whole
      // document again would match the generic heading itself when it is an H2,
      // so the real title below it would never be reached.
      const rest = content.slice(heading.index + heading[0].length);
      const next = /^##\s+(.+)$/m.exec(rest);
      if (next?.[1]) {
        return next[1].trim();
      }
    }
    return title;
  }
  757. // =============================================================================
  758. // Document indexing operations
  759. // =============================================================================
  /**
   * Store a document body in the content-addressed `content` table.
   * The statement is INSERT OR IGNORE, so a hash that already exists is skipped.
   *
   * @param db Database handle
   * @param hash Content hash (stored in `hash`)
   * @param content Document text (stored in `doc`)
   * @param createdAt Creation timestamp (stored in `created_at`)
   */
  export function insertContent(db: Database, hash: string, content: string, createdAt: string): void {
    const insert = db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`);
    insert.run(hash, content, createdAt);
  }
  /**
   * Add a new row to `documents`, marked active.
   *
   * @param db Database handle
   * @param collectionName Collection the document belongs to
   * @param path Document path within the collection
   * @param title Document title
   * @param hash Content hash of the document
   * @param createdAt Creation timestamp
   * @param modifiedAt Last-modified timestamp
   */
  export function insertDocument(
    db: Database,
    collectionName: string,
    path: string,
    title: string,
    hash: string,
    createdAt: string,
    modifiedAt: string
  ): void {
    const insert = db.prepare(`
      INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active)
      VALUES (?, ?, ?, ?, ?, ?, 1)
    `);
    insert.run(collectionName, path, title, hash, createdAt, modifiedAt);
  }
  /**
   * Look up the active document stored under a collection name and path.
   *
   * @param db Database handle
   * @param collectionName Collection to search in
   * @param path Document path within the collection
   * @returns The row's id, hash and title, or whatever empty value the
   *          statement yields when nothing matches (typed as null)
   */
  export function findActiveDocument(
    db: Database,
    collectionName: string,
    path: string
  ): { id: number; hash: string; title: string } | null {
    const lookup = db.prepare(`
      SELECT id, hash, title FROM documents
      WHERE collection = ? AND path = ? AND active = 1
    `);
    const row = lookup.get(collectionName, path);
    return row as { id: number; hash: string; title: string } | null;
  }
  /**
   * Set a document's title and modified_at timestamp, addressed by row id.
   *
   * @param db Database handle
   * @param documentId Row id of the document
   * @param title New title
   * @param modifiedAt New modified_at timestamp
   */
  export function updateDocumentTitle(
    db: Database,
    documentId: number,
    title: string,
    modifiedAt: string
  ): void {
    const update = db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`);
    update.run(title, modifiedAt, documentId);
  }
  /**
   * Replace a document's title, content hash and modified_at timestamp,
   * addressed by row id. Intended for content changes where the path is unchanged.
   *
   * @param db Database handle
   * @param documentId Row id of the document
   * @param title New title
   * @param hash New content hash
   * @param modifiedAt New modified_at timestamp
   */
  export function updateDocument(
    db: Database,
    documentId: number,
    title: string,
    hash: string,
    modifiedAt: string
  ): void {
    const update = db.prepare(`UPDATE documents SET title = ?, hash = ?, modified_at = ? WHERE id = ?`);
    update.run(title, hash, modifiedAt, documentId);
  }
  /**
   * Mark the active document at the given collection and path as inactive
   * (`active = 0`). The row itself is kept.
   *
   * @param db Database handle
   * @param collectionName Collection the document belongs to
   * @param path Document path within the collection
   */
  export function deactivateDocument(db: Database, collectionName: string, path: string): void {
    const deactivate = db.prepare(`UPDATE documents SET active = 0 WHERE collection = ? AND path = ? AND active = 1`);
    deactivate.run(collectionName, path);
  }
  /**
   * List the paths of all active documents in a collection.
   *
   * @param db Database handle
   * @param collectionName Collection to list
   * @returns Paths in the order the query returns them
   */
  export function getActiveDocumentPaths(db: Database, collectionName: string): string[] {
    const query = db.prepare(`
      SELECT path FROM documents WHERE collection = ? AND active = 1
    `);
    const paths: string[] = [];
    for (const row of query.all(collectionName) as { path: string }[]) {
      paths.push(row.path);
    }
    return paths;
  }
  840. export { formatQueryForEmbedding, formatDocForEmbedding };
  /**
   * Find the preferred place to end a chunk inside `window`.
   * Only the last 30% of the window is searched, in priority order:
   * paragraph break, sentence end, line break, space.
   * Returns the offset just past the chosen break, or -1 if none is found.
   */
  function findChunkBreakOffset(window: string): number {
    const tailStart = Math.floor(window.length * 0.7);
    const tail = window.slice(tailStart);

    const paragraphBreak = tail.lastIndexOf('\n\n');
    if (paragraphBreak >= 0) {
      return tailStart + paragraphBreak + 2;
    }

    const sentenceEnd = Math.max(
      ...['. ', '.\n', '? ', '?\n', '! ', '!\n'].map(terminator => tail.lastIndexOf(terminator))
    );
    if (sentenceEnd >= 0) {
      return tailStart + sentenceEnd + 2;
    }

    const lineBreak = tail.lastIndexOf('\n');
    if (lineBreak >= 0) {
      return tailStart + lineBreak + 1;
    }

    const spaceBreak = tail.lastIndexOf(' ');
    return spaceBreak >= 0 ? tailStart + spaceBreak + 1 : -1;
  }

  /**
   * Split a document into overlapping character-based chunks.
   *
   * Content no longer than `maxChars` is returned as a single chunk. Otherwise
   * each chunk spans at most `maxChars` characters and, unless it reaches the
   * end of the document, is cut at the best break point found in its last 30%.
   * The next chunk starts `overlapChars` before the end of the previous one,
   * or directly at its end when that would not move forward.
   *
   * @param content Text to split
   * @param maxChars Maximum characters per chunk; must be positive when the content has to be split
   * @param overlapChars Characters shared between consecutive chunks; negative values are treated as 0
   * @returns Chunks with their text and starting character position
   * @throws RangeError if the content must be split and `maxChars` is not a positive number
   */
  export function chunkDocument(content: string, maxChars: number = CHUNK_SIZE_CHARS, overlapChars: number = CHUNK_OVERLAP_CHARS): { text: string; pos: number }[] {
    if (content.length <= maxChars) {
      return [{ text: content, pos: 0 }];
    }
    // A non-positive (or NaN) chunk size can never make progress through the content.
    if (!(maxChars > 0)) {
      throw new RangeError(`chunkDocument: maxChars must be a positive number (got ${maxChars})`);
    }
    // A negative overlap would jump past text that then appears in no chunk.
    const overlap = Math.max(0, overlapChars);

    const chunks: { text: string; pos: number }[] = [];
    let charPos = 0;

    while (charPos < content.length) {
      let endPos = Math.min(charPos + maxChars, content.length);

      // Not the final chunk: prefer to end on a natural boundary.
      if (endPos < content.length) {
        const breakOffset = findChunkBreakOffset(content.slice(charPos, endPos));
        if (breakOffset > 0) {
          endPos = charPos + breakOffset;
        }
      }

      // Ensure we make progress
      if (endPos <= charPos) {
        endPos = Math.min(charPos + maxChars, content.length);
      }

      chunks.push({ text: content.slice(charPos, endPos), pos: charPos });

      if (endPos >= content.length) {
        break;
      }

      // Step back by the overlap, but never to or before the chunk just emitted.
      const nextPos = endPos - overlap;
      charPos = nextPos <= charPos ? endPos : nextPos;
    }

    return chunks;
  }
  /**
   * Chunk a document by actual token count using the LLM tokenizer.
   * More accurate than character-based chunking but requires async.
   *
   * The content is tokenized once. If it fits in `maxTokens` it is returned as
   * a single chunk. Otherwise windows of up to `maxTokens` tokens are
   * detokenized, and every window except the last is cut at a paragraph,
   * sentence or line break found in the last 30% of its text. Consecutive
   * windows start `maxTokens - overlapTokens` tokens apart (at least 1).
   *
   * @param content Text to split
   * @param maxTokens Maximum tokens per window; must be positive when the content has to be split
   * @param overlapTokens Tokens shared between consecutive windows
   * @returns Chunks with text, an approximate character position (token
   *          position scaled by the average characters per token), and the
   *          number of tokens in the window
   * @throws RangeError (as a rejection) if the content must be split and `maxTokens` is not positive
   */
  export async function chunkDocumentByTokens(
    content: string,
    maxTokens: number = CHUNK_SIZE_TOKENS,
    overlapTokens: number = CHUNK_OVERLAP_TOKENS
  ): Promise<{ text: string; pos: number; tokens: number }[]> {
    const llm = getDefaultLlamaCpp();

    // Tokenize once upfront
    const allTokens = await llm.tokenize(content);
    const totalTokens = allTokens.length;

    if (totalTokens <= maxTokens) {
      return [{ text: content, pos: 0, tokens: totalTokens }];
    }
    if (!(maxTokens > 0)) {
      throw new RangeError(`chunkDocumentByTokens: maxTokens must be a positive number (got ${maxTokens})`);
    }

    const chunks: { text: string; pos: number; tokens: number }[] = [];
    // An overlap that is >= maxTokens would give a step <= 0 and the loop would
    // never advance, so always move forward by at least one token.
    const step = Math.max(1, maxTokens - overlapTokens);
    const avgCharsPerToken = content.length / totalTokens;
    let tokenPos = 0;

    while (tokenPos < totalTokens) {
      const chunkEnd = Math.min(tokenPos + maxTokens, totalTokens);
      const chunkTokens = allTokens.slice(tokenPos, chunkEnd);
      let chunkText = await llm.detokenize(chunkTokens);

      // Find a good break point if not at end of document
      if (chunkEnd < totalTokens) {
        const searchStart = Math.floor(chunkText.length * 0.7);
        const searchSlice = chunkText.slice(searchStart);

        // Priority: paragraph > sentence > line
        let breakOffset = -1;
        const paragraphBreak = searchSlice.lastIndexOf('\n\n');
        if (paragraphBreak >= 0) {
          breakOffset = paragraphBreak + 2;
        } else {
          const sentenceEnd = Math.max(
            searchSlice.lastIndexOf('. '),
            searchSlice.lastIndexOf('.\n'),
            searchSlice.lastIndexOf('? '),
            searchSlice.lastIndexOf('?\n'),
            searchSlice.lastIndexOf('! '),
            searchSlice.lastIndexOf('!\n')
          );
          if (sentenceEnd >= 0) {
            breakOffset = sentenceEnd + 2;
          } else {
            const lineBreak = searchSlice.lastIndexOf('\n');
            if (lineBreak >= 0) {
              breakOffset = lineBreak + 1;
            }
          }
        }

        if (breakOffset >= 0) {
          // NOTE(review): the text is shortened here, but `tokens` below still
          // reports the full window and the next window still starts `step`
          // tokens later. If the cut removes more than the overlap, the text in
          // between may not appear in any chunk - confirm whether that is intended.
          chunkText = chunkText.slice(0, searchStart + breakOffset);
        }
      }

      // Approximate character position based on token position
      const charPos = Math.floor(tokenPos * avgCharsPerToken);
      chunks.push({ text: chunkText, pos: charPos, tokens: chunkTokens.length });

      if (chunkEnd >= totalTokens) break;

      tokenPos += step;
    }

    return chunks;
  }
  971. // =============================================================================
  972. // Fuzzy matching
  973. // =============================================================================
  /**
   * Levenshtein edit distance between two strings, compared by UTF-16 code
   * unit: the minimum number of single-unit insertions, deletions and
   * substitutions that turn `a` into `b`.
   *
   * Only two rows of the dynamic-programming table are kept at a time.
   */
  function levenshtein(a: string, b: string): number {
    if (a.length === 0) return b.length;
    if (b.length === 0) return a.length;

    // Distances from the empty prefix of `a` to every prefix of `b`.
    let previous: number[] = Array.from({ length: b.length + 1 }, (_, j) => j);

    for (let i = 1; i <= a.length; i++) {
      const current: number[] = [i];
      for (let j = 1; j <= b.length; j++) {
        const substitution = previous[j - 1]! + (a[i - 1] === b[j - 1] ? 0 : 1);
        const deletion = previous[j]! + 1;
        const insertion = current[j - 1]! + 1;
        current.push(Math.min(deletion, insertion, substitution));
      }
      previous = current;
    }

    return previous[b.length]!;
  }
  /**
   * Find an active document by its short docid, i.e. a prefix of its content hash.
   * A leading `#` on the docid is ignored.
   *
   * The prefix is matched with SQL LIKE. `%`, `_` and `\` in the docid are
   * escaped so they are compared literally and cannot act as wildcards.
   * If several documents share the prefix, one of them is returned (LIMIT 1
   * without an ORDER BY).
   *
   * @param db Database handle
   * @param docid Short docid, with or without a leading `#`
   * @returns The document's `qmd://collection/path` virtual path and full hash,
   *          or null when the docid is empty or nothing matches
   */
  export function findDocumentByDocid(db: Database, docid: string): { filepath: string; hash: string } | null {
    // Normalize: remove leading # if present
    const shortHash = docid.startsWith('#') ? docid.slice(1) : docid;
    if (shortHash.length < 1) return null;

    // Neutralize LIKE metacharacters; the query declares '\' as the escape character.
    const escapedPrefix = shortHash.replace(/[\\%_]/g, '\\$&');

    const doc = db.prepare(`
      SELECT 'qmd://' || d.collection || '/' || d.path as filepath, d.hash
      FROM documents d
      WHERE d.hash LIKE ? ESCAPE '\\' AND d.active = 1
      LIMIT 1
    `).get(`${escapedPrefix}%`) as { filepath: string; hash: string } | null;

    return doc ?? null;
  }
  /**
   * Suggest active document paths that look like `query`.
   *
   * Every active path is compared to the query case-insensitively by
   * Levenshtein distance. Paths within `maxDistance` are kept, ordered from
   * closest to farthest, and at most `limit` of them are returned.
   *
   * @param db Database handle
   * @param query Path the caller was looking for
   * @param maxDistance Largest edit distance still considered similar (default 3)
   * @param limit Maximum number of suggestions (default 5)
   * @returns Matching paths, closest first
   */
  export function findSimilarFiles(db: Database, query: string, maxDistance: number = 3, limit: number = 5): string[] {
    const rows = db.prepare(`
      SELECT d.path
      FROM documents d
      WHERE d.active = 1
    `).all() as { path: string }[];

    const needle = query.toLowerCase();
    const candidates: { path: string; dist: number }[] = [];
    for (const { path } of rows) {
      const dist = levenshtein(path.toLowerCase(), needle);
      if (dist <= maxDistance) {
        candidates.push({ path, dist });
      }
    }

    candidates.sort((left, right) => left.dist - right.dist);
    return candidates.slice(0, limit).map(candidate => candidate.path);
  }
  /**
   * Return the active documents whose virtual path (`qmd://collection/path`)
   * or plain path matches a glob pattern.
   *
   * @param db Database handle
   * @param pattern Glob pattern
   * @returns For each match: the virtual path as `filepath`, the plain path as
   *          `displayPath`, and the length of the stored body as `bodyLength`
   */
  export function matchFilesByGlob(db: Database, pattern: string): { filepath: string; displayPath: string; bodyLength: number }[] {
    const rows = db.prepare(`
      SELECT
        'qmd://' || d.collection || '/' || d.path as virtual_path,
        LENGTH(content.doc) as body_length,
        d.path,
        d.collection
      FROM documents d
      JOIN content ON content.hash = d.hash
      WHERE d.active = 1
    `).all() as { virtual_path: string; body_length: number; path: string; collection: string }[];

    const matcher = new Glob(pattern);
    const hits: { filepath: string; displayPath: string; bodyLength: number }[] = [];
    for (const row of rows) {
      const isMatch = matcher.match(row.virtual_path) || matcher.match(row.path);
      if (!isMatch) continue;
      hits.push({
        filepath: row.virtual_path, // Virtual path for precise lookup
        displayPath: row.path, // Relative path for display
        bodyLength: row.body_length,
      });
    }
    return hits;
  }
  1045. // =============================================================================
  1046. // Context
  1047. // =============================================================================
  /**
   * Get context for a file path using hierarchical inheritance.
   * Contexts are collection-scoped and inherit from parent directories.
   * For example, context at "/talks" applies to "/talks/2024/keynote.md".
   *
   * A context prefix applies only on a path-segment boundary: "/talks" covers
   * "/talks" and "/talks/2024/keynote.md" but not "/talks-archive/x.md".
   * The global context (if set) comes first, followed by every matching path
   * context from the shortest prefix to the longest, joined by blank lines.
   *
   * @param db Database instance (unused - kept for compatibility)
   * @param collectionName Collection name
   * @param path Relative path within the collection
   * @returns Context string or null if the collection is unknown or no context applies
   */
  export function getContextForPath(db: Database, collectionName: string, path: string): string | null {
    const config = collectionsLoadConfig();
    const coll = getCollection(collectionName);
    if (!coll) return null;

    const contexts: string[] = [];

    if (config.global_context) {
      contexts.push(config.global_context);
    }

    if (coll.context) {
      const normalizedPath = path.startsWith("/") ? path : `/${path}`;

      const matchingContexts: { prefix: string; context: string }[] = [];
      for (const [prefix, context] of Object.entries(coll.context)) {
        const normalizedPrefix = prefix.startsWith("/") ? prefix : `/${prefix}`;
        // Compare against the prefix followed by "/" so that a prefix only
        // matches whole path segments; the exact path itself also matches.
        const directoryPrefix = normalizedPrefix.endsWith("/") ? normalizedPrefix : `${normalizedPrefix}/`;
        if (normalizedPath === normalizedPrefix || normalizedPath.startsWith(directoryPrefix)) {
          matchingContexts.push({ prefix: normalizedPrefix, context });
        }
      }

      // Shortest prefix first: most general context before more specific ones.
      matchingContexts.sort((a, b) => a.prefix.length - b.prefix.length);
      for (const match of matchingContexts) {
        contexts.push(match.context);
      }
    }

    return contexts.length > 0 ? contexts.join('\n\n') : null;
  }
  /**
   * Get context for a file path (virtual or filesystem).
   *
   * The collection and relative path are resolved either from a
   * `qmd://collection/path` virtual path or, for a filesystem path, from the
   * first configured collection whose directory contains it. Context is only
   * returned when the collection is configured and an active document exists
   * at that collection/path; the context itself is assembled by
   * getContextForPath() so both lookups apply the same rules.
   *
   * @param db Database handle
   * @param filepath Virtual (`qmd://...`) or filesystem path
   * @returns Context string, or null when the path cannot be resolved, the
   *          document is not indexed as active, or no context applies
   */
  export function getContextForFile(db: Database, filepath: string): string | null {
    // Handle undefined or null filepath
    if (!filepath) return null;

    let collectionName: string | null = null;
    let relativePath: string | null = null;

    const parsedVirtual = filepath.startsWith('qmd://') ? parseVirtualPath(filepath) : null;
    if (parsedVirtual) {
      collectionName = parsedVirtual.collectionName;
      relativePath = parsedVirtual.path;
    } else {
      // Filesystem path: find which collection this absolute path belongs to
      for (const coll of collectionsListCollections()) {
        // Skip collections with missing paths
        if (!coll || !coll.path) continue;

        const root = `${coll.path}/`;
        if (filepath.startsWith(root)) {
          collectionName = coll.name;
          relativePath = filepath.slice(root.length);
          break;
        }
        if (filepath === coll.path) {
          collectionName = coll.name;
          relativePath = '';
          break;
        }
      }
      if (!collectionName || relativePath === null) return null;
    }

    // The collection must exist in the config
    if (!getCollection(collectionName)) return null;

    // Verify this document exists in the database
    const doc = db.prepare(`
      SELECT d.path
      FROM documents d
      WHERE d.collection = ? AND d.path = ? AND d.active = 1
      LIMIT 1
    `).get(collectionName, relativePath) as { path: string } | null;
    if (!doc) return null;

    // Delegate so the context rules live in one place.
    return getContextForPath(db, collectionName, relativePath);
  }
  /**
   * Look up a collection by name via getCollection() and return it in the
   * legacy `{ name, pwd, glob_pattern }` shape.
   *
   * @param db Database handle (not used by this function)
   * @param name Collection name
   * @returns The collection's name, path (as `pwd`) and pattern (as
   *          `glob_pattern`), or null if no such collection exists
   */
  export function getCollectionByName(db: Database, name: string): { name: string; pwd: string; glob_pattern: string } | null {
    const found = getCollection(name);
    return found
      ? { name: found.name, pwd: found.path, glob_pattern: found.pattern }
      : null;
  }
  /**
   * List every configured collection together with document statistics
   * read from the database.
   *
   * @param db Database handle
   * @returns One entry per configured collection: name, path (`pwd`), pattern
   *          (`glob_pattern`), total and active document counts (0 when
   *          there are none) and the latest `modified_at` (or null)
   */
  export function listCollections(db: Database): { name: string; pwd: string; glob_pattern: string; doc_count: number; active_count: number; last_modified: string | null }[] {
    type Stats = { doc_count: number; active_count: number; last_modified: string | null };

    const summaries: { name: string; pwd: string; glob_pattern: string; doc_count: number; active_count: number; last_modified: string | null }[] = [];

    for (const coll of collectionsListCollections()) {
      const stats = db.prepare(`
        SELECT
          COUNT(d.id) as doc_count,
          SUM(CASE WHEN d.active = 1 THEN 1 ELSE 0 END) as active_count,
          MAX(d.modified_at) as last_modified
        FROM documents d
        WHERE d.collection = ?
      `).get(coll.name) as Stats | null;

      // The aggregates may come back empty/NULL for a collection without documents.
      summaries.push({
        name: coll.name,
        pwd: coll.path,
        glob_pattern: coll.pattern,
        doc_count: stats?.doc_count || 0,
        active_count: stats?.active_count || 0,
        last_modified: stats?.last_modified || null,
      });
    }

    return summaries;
  }
  /**
   * Remove a collection: delete its document rows, delete content rows no
   * longer used by any active document, then remove the collection from the
   * YAML config via collections.ts.
   *
   * @param db Database handle
   * @param collectionName Name of the collection to remove
   * @returns Number of document rows and content rows that were deleted
   */
  export function removeCollection(db: Database, collectionName: string): { deletedDocs: number; cleanedHashes: number } {
    const { changes: deletedDocs } = db
      .prepare(`DELETE FROM documents WHERE collection = ?`)
      .run(collectionName);

    // Content is shared by hash, so only rows with no remaining active document go.
    const { changes: cleanedHashes } = db.prepare(`
      DELETE FROM content
      WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
    `).run();

    // The return value of the config removal is not used here.
    collectionsRemoveCollection(collectionName);

    return { deletedDocs, cleanedHashes };
  }
  /**
   * Rename a collection: first repoint its document rows to the new name in
   * the database, then rename it in the YAML config.
   *
   * @param db Database handle
   * @param oldName Current collection name
   * @param newName New collection name
   */
  export function renameCollection(db: Database, oldName: string, newName: string): void {
    const repoint = db.prepare(`UPDATE documents SET collection = ? WHERE collection = ?`);
    repoint.run(newName, oldName);

    collectionsRenameCollection(oldName, newName);
  }
  1230. // =============================================================================
  1231. // Context Management Operations
  1232. // =============================================================================
  /**
   * Insert or update a context for a collection (identified by its numeric id)
   * and path prefix. The id is resolved to a collection name through the
   * `collections` table, and the context is then stored via collections.ts.
   *
   * @param db Database handle
   * @param collectionId Id of the collection in the `collections` table
   * @param pathPrefix Path prefix the context applies to
   * @param context Context text
   * @throws Error if no collection with that id exists
   */
  export function insertContext(db: Database, collectionId: number, pathPrefix: string, context: string): void {
    const row = db
      .prepare(`SELECT name FROM collections WHERE id = ?`)
      .get(collectionId) as { name: string } | null;

    if (row) {
      collectionsAddContext(row.name, pathPrefix, context);
      return;
    }
    throw new Error(`Collection with id ${collectionId} not found`);
  }
  /**
   * Delete the context stored for a collection and path prefix via collections.ts.
   *
   * @param db Database handle (not used by this function)
   * @param collectionName Collection name
   * @param pathPrefix Path prefix whose context should be removed
   * @returns 1 if the removal reported success, otherwise 0
   */
  export function deleteContext(db: Database, collectionName: string, pathPrefix: string): number {
    const removed = collectionsRemoveContext(collectionName, pathPrefix);
    if (removed) {
      return 1;
    }
    return 0;
  }
  /**
   * Remove the global context and every collection's root context (empty path prefix).
   *
   * @param db - Not used by this function; kept for signature compatibility.
   * @returns The number of removals counted. Clearing the global context always
   *          counts as one, whether or not a global context was set beforehand;
   *          each collection whose root context was actually removed adds one more.
   */
  export function deleteGlobalContexts(db: Database): number {
    // The global context is cleared unconditionally and counted once.
    setGlobalContext(undefined);

    // Start the tally at 1 for the global context, then add one per removed root context.
    return collectionsListCollections().reduce(
      (tally, entry) => (collectionsRemoveContext(entry.name, '') ? tally + 1 : tally),
      1,
    );
  }
  /**
   * List every stored context as a flat array of rows.
   *
   * Ordering: collection name (locale compare), then longer path prefixes before
   * shorter ones, then path prefix alphabetically.
   *
   * @param db - Not used by this function; kept for signature compatibility.
   * @returns One `{ collection_name, path_prefix, context }` row per context.
   */
  export function listPathContexts(db: Database): { collection_name: string; path_prefix: string; context: string }[] {
    const rows = collectionsListAllContexts().map(entry => ({
      collection_name: entry.collection,
      path_prefix: entry.path,
      context: entry.context,
    }));

    rows.sort((left, right) => {
      // Different collections: the collection name alone decides.
      if (left.collection_name !== right.collection_name) {
        return left.collection_name.localeCompare(right.collection_name);
      }
      // Same collection: longest prefix first, ties broken alphabetically.
      const lengthGap = right.path_prefix.length - left.path_prefix.length;
      return lengthGap !== 0 ? lengthGap : left.path_prefix.localeCompare(right.path_prefix);
    });

    return rows;
  }
  /**
   * List the names of all collections known to the YAML config.
   *
   * @param db - Not used by this function; kept for signature compatibility.
   * @returns One `{ name }` object per configured collection, in config order.
   */
  export function getAllCollections(db: Database): { name: string }[] {
    const names: { name: string }[] = [];
    for (const { name } of collectionsListCollections()) {
      names.push({ name });
    }
    return names;
  }
  /**
   * Report the configured collections that have no context entries at all
   * (no `context` map, or an empty one).
   *
   * @param db - Database used to count each collection's active documents.
   * @returns `{ name, pwd, doc_count }` per context-less collection, sorted by name.
   *          `pwd` is the collection's configured path.
   */
  export function getCollectionsWithoutContext(db: Database): { name: string; pwd: string; doc_count: number }[] {
    return collectionsListCollections()
      .filter(coll => !coll.context || Object.keys(coll.context).length === 0)
      .map(coll => {
        // Active-document count comes from the database, not from the config.
        const counted = db.prepare(`
          SELECT COUNT(d.id) as doc_count
          FROM documents d
          WHERE d.collection = ? AND d.active = 1
        `).get(coll.name) as { doc_count: number } | null;

        return {
          name: coll.name,
          pwd: coll.path,
          doc_count: counted?.doc_count || 0,
        };
      })
      .sort((left, right) => left.name.localeCompare(right.name));
  }
  /**
   * List the top-level directories of a collection that no context entry covers.
   *
   * A directory counts as covered when the collection has a root context (empty
   * prefix) or a context whose prefix equals the directory name.
   *
   * @param db - Database used to read the collection's active document paths.
   * @param collectionName - Collection to inspect.
   * @returns Sorted directory names; empty when the collection is not configured.
   */
  export function getTopLevelPathsWithoutContext(db: Database, collectionName: string): string[] {
    // The paths are read before the config lookup, as in the original flow.
    const docRows = db.prepare(`
      SELECT DISTINCT path FROM documents
      WHERE collection = ? AND active = 1
    `).all(collectionName) as { path: string }[];

    const configured = getCollection(collectionName);
    if (!configured) return [];

    const knownPrefixes = new Set<string>(configured.context ? Object.keys(configured.context) : []);

    // First path component of every document that lives inside a directory.
    const firstSegments = new Set<string>();
    for (const { path } of docRows) {
      const segments = path.split('/').filter(Boolean);
      if (segments.length <= 1) continue; // file directly at the collection root
      const head = segments[0];
      if (head) firstSegments.add(head);
    }

    // `dir` never contains '/', so the startsWith clause cannot match here;
    // it is kept so the predicate stays identical to the original.
    const isCovered = (dir: string): boolean =>
      Array.from(knownPrefixes).some(
        prefix => prefix === '' || prefix === dir || dir.startsWith(prefix + '/'),
      );

    return Array.from(firstSegments)
      .filter(dir => !isCovered(dir))
      .sort();
  }
  1376. // =============================================================================
  1377. // FTS Search
  1378. // =============================================================================
  /**
   * Reduce a raw search token to lowercase Unicode letters, digits and apostrophes.
   * Every other character is dropped.
   */
  function sanitizeFTS5Term(term: string): string {
    const stripped = term.replace(/[^\p{L}\p{N}']/gu, '');
    return stripped.toLowerCase();
  }

  /**
   * Turn free-form user input into an FTS5 MATCH expression.
   *
   * Each whitespace-separated token is sanitized, quoted and given a trailing `*`
   * (prefix match); the resulting clauses are joined with AND.
   *
   * @returns The MATCH expression, or null when no usable token remains.
   */
  function buildFTS5Query(query: string): string | null {
    const prefixClauses: string[] = [];
    for (const rawToken of query.split(/\s+/)) {
      const cleaned = sanitizeFTS5Term(rawToken);
      if (cleaned.length > 0) {
        prefixClauses.push(`"${cleaned}"*`);
      }
    }
    // Joining a single clause yields that clause unchanged, so no special case is needed.
    return prefixClauses.length === 0 ? null : prefixClauses.join(' AND ');
  }
  /**
   * Full-text search over `documents_fts`, best matches first.
   *
   * @param db - Open database handle.
   * @param query - Free-form user query; converted to an FTS5 MATCH expression.
   * @param limit - Maximum number of rows to return (default 20).
   * @param collectionId - Legacy filter; when truthy its string form is compared
   *                       against `documents.collection`.
   * @returns Search results with `source: "fts"` and a score in [0, 1) where
   *          higher is better. Empty when the query has no usable terms.
   */
  export function searchFTS(db: Database, query: string, limit: number = 20, collectionId?: number): SearchResult[] {
    const ftsQuery = buildFTS5Query(query);
    if (!ftsQuery) return [];

    let sql = `
      SELECT
        'qmd://' || d.collection || '/' || d.path as filepath,
        d.collection || '/' || d.path as display_path,
        d.title,
        content.doc as body,
        d.hash,
        bm25(documents_fts, 10.0, 1.0) as bm25_score
      FROM documents_fts f
      JOIN documents d ON d.id = f.rowid
      JOIN content ON content.hash = d.hash
      WHERE documents_fts MATCH ? AND d.active = 1
    `;
    const params: (string | number)[] = [ftsQuery];

    if (collectionId) {
      // Note: collectionId is a legacy parameter that should be phased out.
      // Collections are now managed in YAML. For now, we interpret it as a collection name filter.
      // This code path is likely unused as collection filtering should be done at CLI level.
      sql += ` AND d.collection = ?`;
      params.push(String(collectionId));
    }

    // bm25 lower is better; sort ascending.
    sql += ` ORDER BY bm25_score ASC LIMIT ?`;
    params.push(limit);

    const rows = db.prepare(sql).all(...params) as { filepath: string; display_path: string; title: string; body: string; hash: string; bm25_score: number }[];

    return rows.map(row => {
      const collectionName = row.filepath.split('//')[1]?.split('/')[0] || "";

      // FTS5's bm25() returns non-positive numbers: the better the match, the more
      // negative the value. Clamping with Math.max(0, x) therefore collapsed every
      // hit to the same score of 1. Use the magnitude instead and squash it with
      // |x| / (1 + |x|): monotonic, query-independent, in [0, 1), higher is better.
      const relevance = Math.abs(row.bm25_score);
      const score = relevance / (1 + relevance);

      return {
        filepath: row.filepath,
        displayPath: row.display_path,
        title: row.title,
        hash: row.hash,
        docid: getDocid(row.hash),
        collectionName,
        modifiedAt: "", // Not available in FTS query
        bodyLength: row.body.length,
        body: row.body,
        context: getContextForFile(db, row.filepath),
        score,
        source: "fts" as const,
      };
    });
  }
  1439. // =============================================================================
  1440. // Vector Search
  1441. // =============================================================================
  /**
   * Vector (KNN) search over `vectors_vec`, returning at most one result per file.
   *
   * @param db - Open database handle.
   * @param query - Text to embed and search for.
   * @param model - Embedding model name passed to the embedder.
   * @param limit - Maximum number of files to return (default 20).
   * @param collectionId - When truthy, its string form is compared against `documents.collection`.
   * @returns Results with `source: "vec"`, ordered by ascending distance; `score` is
   *          `1 - distance`. Empty when the vector table is missing or no embedding
   *          could be produced.
   */
  export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionId?: number): Promise<SearchResult[]> {
    type VecRow = { hash_seq: string; distance: number; filepath: string; display_path: string; title: string; body: string; hash: string; pos: number };

    const vecTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (!vecTable) return [];

    const embedding = await getEmbedding(query, model, true);
    if (!embedding) return [];

    // sqlite-vec requires "k = ?" for KNN queries. Three times the limit is requested
    // because several chunks of the same file can appear among the neighbours.
    const params: (Float32Array | number | string)[] = [new Float32Array(embedding), limit * 3];
    let collectionClause = ``;
    if (collectionId) {
      // Filter by collection name
      collectionClause = ` AND d.collection = ?`;
      params.push(String(collectionId));
    }

    const sql = `
      SELECT
        v.hash_seq,
        v.distance,
        'qmd://' || d.collection || '/' || d.path as filepath,
        d.collection || '/' || d.path as display_path,
        d.title,
        content.doc as body,
        cv.hash,
        cv.pos
      FROM vectors_vec v
      JOIN content_vectors cv ON cv.hash || '_' || cv.seq = v.hash_seq
      JOIN documents d ON d.hash = cv.hash AND d.active = 1
      JOIN content ON content.hash = d.hash
      WHERE v.embedding MATCH ? AND k = ?
    ` + collectionClause + ` ORDER BY v.distance`;

    const neighbours = db.prepare(sql).all(...params) as VecRow[];

    // Keep only the closest chunk for each file.
    const closestPerFile = new Map<string, VecRow>();
    for (const candidate of neighbours) {
      const current = closestPerFile.get(candidate.filepath);
      if (current === undefined || candidate.distance < current.distance) {
        closestPerFile.set(candidate.filepath, candidate);
      }
    }

    const ranked = Array.from(closestPerFile.values());
    ranked.sort((left, right) => left.distance - right.distance);

    return ranked.slice(0, limit).map(row => ({
      filepath: row.filepath,
      displayPath: row.display_path,
      title: row.title,
      hash: row.hash,
      docid: getDocid(row.hash),
      collectionName: row.filepath.split('//')[1]?.split('/')[0] || "",
      modifiedAt: "", // Not available in vec query
      bodyLength: row.body.length,
      body: row.body,
      context: getContextForFile(db, row.filepath),
      score: 1 - row.distance, // Cosine similarity = 1 - cosine distance
      source: "vec" as const,
      chunkPos: row.pos,
    }));
  }
  1501. // =============================================================================
  1502. // Embeddings
  1503. // =============================================================================
  /**
   * Embed a piece of text with the default LlamaCpp instance.
   *
   * @param text - Raw text to embed.
   * @param model - Model name forwarded to the embedder.
   * @param isQuery - Selects the query prompt template instead of the document one.
   * @returns The embedding vector, or null when the embedder returns nothing usable.
   */
  async function getEmbedding(text: string, model: string, isQuery: boolean): Promise<number[] | null> {
    const llm = getDefaultLlamaCpp();

    // Queries and documents are wrapped in different prompt templates.
    let prompt: string;
    if (isQuery) {
      prompt = formatQueryForEmbedding(text);
    } else {
      prompt = formatDocForEmbedding(text);
    }

    const response = await llm.embed(prompt, { model, isQuery });
    const vector = response?.embedding;
    return vector ? vector : null;
  }
  /**
   * Collect the content hashes of active documents that have no embedding yet.
   *
   * A hash counts as un-embedded when `content_vectors` has no row for it with `seq = 0`.
   *
   * @param db - Open database handle.
   * @returns One row per hash: the hash, the document body, and the smallest
   *          path among the documents sharing that hash (for display).
   */
  export function getHashesForEmbedding(db: Database): { hash: string; body: string; path: string }[] {
    const pendingQuery = db.prepare(`
      SELECT d.hash, c.doc as body, MIN(d.path) as path
      FROM documents d
      JOIN content c ON d.hash = c.hash
      LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
      WHERE d.active = 1 AND v.hash IS NULL
      GROUP BY d.hash
    `);
    const pending = pendingQuery.all() as { hash: string; body: string; path: string }[];
    return pending;
  }
  /**
   * Wipe all stored embeddings so they can be rebuilt from scratch.
   *
   * Empties `content_vectors`, then drops the `vectors_vec` table if it exists.
   *
   * @param db - Open database handle.
   */
  export function clearAllEmbeddings(db: Database): void {
    const resetStatements = [
      `DELETE FROM content_vectors`,
      `DROP TABLE IF EXISTS vectors_vec`,
    ];
    for (const statement of resetStatements) {
      db.exec(statement);
    }
  }
  /**
   * Store one chunk embedding in both `vectors_vec` and `content_vectors`.
   *
   * The `vectors_vec` row is keyed by `"<hash>_<seq>"`. Both statements are
   * prepared before either is run, and both use INSERT OR REPLACE.
   *
   * @param db - Open database handle.
   * @param hash - Content hash the chunk belongs to.
   * @param seq - Sequence number of the chunk within the content.
   * @param pos - Position value stored alongside the chunk.
   * @param embedding - Embedding vector for the chunk.
   * @param model - Name of the model that produced the embedding.
   * @param embeddedAt - Timestamp string recorded with the row.
   */
  export function insertEmbedding(
    db: Database,
    hash: string,
    seq: number,
    pos: number,
    embedding: Float32Array,
    model: string,
    embeddedAt: string
  ): void {
    const vectorInsert = db.prepare(`INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
    const metadataInsert = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`);

    // Vector row first, then its metadata row.
    vectorInsert.run(`${hash}_${seq}`, embedding);
    metadataInsert.run(hash, seq, pos, model, embeddedAt);
  }
  1552. // =============================================================================
  1553. // Query expansion
  1554. // =============================================================================
  /**
   * Produce alternative phrasings of a search query, with caching.
   *
   * @param query - Original user query; always the first element of the result.
   * @param model - Used only as part of the cache key; the LlamaCpp call does not receive it.
   * @param db - Database that backs the result cache.
   * @returns On a cache hit: the query followed by at most two cached expansions.
   *          On a miss: the query followed by all generated texts, de-duplicated.
   */
  export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database): Promise<string[]> {
    const cacheKey = getCacheKey("expandQuery", { query, model });

    // Cache hit: one expansion per non-blank line.
    const cachedText = getCachedResult(db, cacheKey);
    if (cachedText) {
      const variants: string[] = [];
      for (const rawLine of cachedText.split('\n')) {
        const line = rawLine.trim();
        if (line.length > 0) variants.push(line);
      }
      return [query, ...variants.slice(0, 2)];
    }

    // Note: LlamaCpp uses hardcoded model, model parameter is ignored
    const generated = await getDefaultLlamaCpp().expandQuery(query);
    const texts = generated.map(item => item.text);

    // Only genuinely new phrasings are cached; the original query is left out.
    const novel = texts.filter(text => text !== query);
    if (novel.length > 0) {
      setCachedResult(db, cacheKey, novel.join('\n'));
    }

    return Array.from(new Set([query, ...texts]));
  }
  1574. // =============================================================================
  1575. // Reranking
  1576. // =============================================================================
  /**
   * Score documents against a query with the reranker, using cached scores where present.
   *
   * Cache keys are built from the query, the document's `file` and the model name.
   *
   * @param query - Query the documents are scored against.
   * @param documents - Candidates as `{ file, text }`.
   * @param model - Reranker model name (also part of the cache key).
   * @param db - Database that backs the score cache.
   * @returns One `{ file, score }` per input document, highest score first.
   *          Documents without a usable score get 0.
   */
  export async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db: Database): Promise<{ file: string; score: number }[]> {
    const scoreByFile: Map<string, number> = new Map();
    const pending: RerankDocument[] = [];

    // Split the input into cache hits and documents that still need scoring.
    for (const { file, text } of documents) {
      const hit = getCachedResult(db, getCacheKey("rerank", { query, file, model }));
      if (hit === null) {
        pending.push({ file, text });
      } else {
        scoreByFile.set(file, parseFloat(hit));
      }
    }

    // Rerank uncached documents using LlamaCpp and remember their scores.
    if (pending.length > 0) {
      const outcome = await getDefaultLlamaCpp().rerank(query, pending, { model });
      for (const scored of outcome.results) {
        setCachedResult(db, getCacheKey("rerank", { query, file: scored.file, model }), scored.score.toString());
        scoreByFile.set(scored.file, scored.score);
      }
    }

    const ranked = documents.map(({ file }) => ({ file, score: scoreByFile.get(file) || 0 }));
    ranked.sort((left, right) => right.score - left.score);
    return ranked;
  }
  1606. // =============================================================================
  1607. // Reciprocal Rank Fusion
  1608. // =============================================================================
  /**
   * Merge several ranked result lists into one using weighted Reciprocal Rank Fusion.
   *
   * Each appearance of a file at zero-based `rank` in list `i` contributes
   * `weights[i] / (k + rank + 1)`; a missing weight counts as 1. After summing,
   * a file whose best rank in any list is 0 gains 0.05, and one whose best rank
   * is 1 or 2 gains 0.02.
   *
   * @param resultLists - Ranked lists, best result first; entries are keyed by `file`.
   * @param weights - Per-list weights, aligned by index with `resultLists`.
   * @param k - RRF smoothing constant (default 60).
   * @returns One entry per distinct file (taken from its first occurrence) with
   *          `score` replaced by the fused score, highest first.
   */
  export function reciprocalRankFusion(
    resultLists: RankedResult[][],
    weights: number[] = [],
    k: number = 60
  ): RankedResult[] {
    type Tally = { result: RankedResult; rrfScore: number; topRank: number };
    const tallies = new Map<string, Tally>();

    resultLists.forEach((list, listIdx) => {
      if (!list) return;
      const weight = weights[listIdx] ?? 1.0;

      list.forEach((result, rank) => {
        if (!result) return;
        const contribution = weight / (k + rank + 1);
        const tally = tallies.get(result.file);
        if (!tally) {
          tallies.set(result.file, { result, rrfScore: contribution, topRank: rank });
          return;
        }
        tally.rrfScore += contribution;
        tally.topRank = Math.min(tally.topRank, rank);
      });
    });

    // Top-rank bonus: only files that reached one of the first three positions somewhere.
    for (const tally of tallies.values()) {
      if (tally.topRank > 2) continue;
      tally.rrfScore += tally.topRank === 0 ? 0.05 : 0.02;
    }

    const fused = Array.from(tallies.values());
    fused.sort((left, right) => right.rrfScore - left.rrfScore);
    return fused.map(({ result, rrfScore }) => ({ ...result, score: rrfScore }));
  }
  1648. // =============================================================================
  1649. // Document retrieval
  1650. // =============================================================================
  /**
   * Shape of a row produced by the document lookup queries in this file.
   * Column names follow the SQL aliases, hence the snake_case.
   */
  type DbDocRow = {
    /** `'qmd://' || collection || '/' || path` as computed in SQL. */
    virtual_path: string;
    /** `collection || '/' || path` as computed in SQL. */
    display_path: string;
    /** Document title column. */
    title: string;
    /** Content hash; also the join key into the `content` table. */
    hash: string;
    /** Name of the collection the document belongs to. */
    collection: string;
    /** NOTE(review): not selected by the lookup queries visible here; confirm where it is populated. */
    path: string;
    /** `documents.modified_at` column value. */
    modified_at: string;
    /** `LENGTH(content.doc)` as computed in SQL. */
    body_length: number;
    /** Document text; present only when the query selected `content.doc as body`. */
    body?: string;
  };
  /**
   * Find a document by filename/path, docid (#hash), or with fuzzy matching.
   * Returns document metadata without body by default.
   *
   * Supports:
   * - Virtual paths: qmd://collection/path/to/file.md
   * - Absolute paths: /path/to/file.md
   * - Relative paths: path/to/file.md
   * - Short docid: #abc123 (first 6 chars of hash)
   *
   * Lookup order: exact virtual path, then suffix (LIKE) match on the virtual
   * path, then a per-collection match on the path relative to each configured
   * collection root.
   *
   * @param db - Open database handle.
   * @param filename - User-supplied identifier; a trailing `:<digits>` is ignored.
   * @param options - `includeBody` also loads the document text.
   * @returns The document, or a `not_found` result carrying similar file suggestions.
   */
  export function findDocument(db: Database, filename: string, options: { includeBody?: boolean } = {}): DocumentResult | DocumentNotFound {
    let filepath = filename;

    // Strip a trailing ":<digits>" suffix (presumably a line number) before matching.
    const colonMatch = filepath.match(/:(\d+)$/);
    if (colonMatch) {
      filepath = filepath.slice(0, -colonMatch[0].length);
    }

    // Check if this is a docid lookup (#hash or just 6-char hex)
    if (filepath.startsWith('#') || /^[a-f0-9]{6}$/i.test(filepath)) {
      const docidMatch = findDocumentByDocid(db, filepath);
      if (!docidMatch) {
        return { error: "not_found", query: filename, similarFiles: [] };
      }
      filepath = docidMatch.filepath;
    }

    if (filepath.startsWith('~/')) {
      filepath = homedir() + filepath.slice(1);
    }

    const bodyCol = options.includeBody ? `, content.doc as body` : ``;

    // Build computed columns
    // Note: absoluteFilepath is computed from YAML collections after query
    const selectCols = `
      'qmd://' || d.collection || '/' || d.path as virtual_path,
      d.collection || '/' || d.path as display_path,
      d.title,
      d.hash,
      d.collection,
      d.modified_at,
      LENGTH(content.doc) as body_length
      ${bodyCol}
    `;

    // Try to match by virtual path first
    let doc = db.prepare(`
      SELECT ${selectCols}
      FROM documents d
      JOIN content ON content.hash = d.hash
      WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
    `).get(filepath) as DbDocRow | null;

    // Try fuzzy match by virtual path (suffix match).
    // NOTE(review): '%' and '_' in the input act as LIKE wildcards here.
    if (!doc) {
      doc = db.prepare(`
        SELECT ${selectCols}
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? AND d.active = 1
        LIMIT 1
      `).get(`%${filepath}`) as DbDocRow | null;
    }

    // Try to match by absolute path (requires looking up collection paths from YAML)
    if (!doc && !filepath.startsWith('qmd://')) {
      const collections = collectionsListCollections();
      for (const coll of collections) {
        let relativePath: string | null = null;

        if (filepath.startsWith(coll.path + '/')) {
          // Absolute path inside this collection's root: keep the part below the root.
          relativePath = filepath.slice(coll.path.length + 1);
        } else if (!filepath.startsWith('/')) {
          // Otherwise treat filepath as relative to the collection.
          relativePath = filepath;
        }

        if (relativePath) {
          doc = db.prepare(`
            SELECT ${selectCols}
            FROM documents d
            JOIN content ON content.hash = d.hash
            WHERE d.collection = ? AND d.path = ? AND d.active = 1
          `).get(coll.name, relativePath) as DbDocRow | null;
          if (doc) break;
        }
      }
    }

    if (!doc) {
      const similar = findSimilarFiles(db, filepath, 5, 5);
      return { error: "not_found", query: filename, similarFiles: similar };
    }

    // Get context using virtual path. display_path already starts with the
    // collection name, so the fallback must not prepend the collection again.
    const virtualPath = doc.virtual_path || `qmd://${doc.display_path}`;
    const context = getContextForFile(db, virtualPath);

    return {
      filepath: virtualPath,
      displayPath: doc.display_path,
      title: doc.title,
      context,
      hash: doc.hash,
      docid: getDocid(doc.hash),
      collectionName: doc.collection,
      modifiedAt: doc.modified_at,
      bodyLength: doc.body_length,
      ...(options.includeBody && doc.body !== undefined && { body: doc.body }),
    };
  }
  /**
   * Get the body content for a document, optionally restricted to a line range.
   *
   * The document is resolved by its `filepath`: first as a virtual path
   * (`qmd://collection/path`), then as an absolute path below one of the
   * configured collection roots.
   *
   * @param db - Open database handle.
   * @param doc - Anything carrying a `filepath`.
   * @param fromLine - 1-indexed first line to return; values below 1 mean "from the start".
   * @param maxLines - Maximum number of lines to return; negative values are treated as 0.
   * @returns The (possibly sliced) body, or null when no active document matches.
   */
  export function getDocumentBody(db: Database, doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number): string | null {
    const filepath = doc.filepath;

    // Try to resolve document by filepath (absolute or virtual)
    let row: { body: string } | null = null;

    // Try virtual path first
    if (filepath.startsWith('qmd://')) {
      row = db.prepare(`
        SELECT content.doc as body
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
      `).get(filepath) as { body: string } | null;
    }

    // Try absolute path by looking up in YAML collections
    if (!row) {
      const collections = collectionsListCollections();
      for (const coll of collections) {
        if (filepath.startsWith(coll.path + '/')) {
          const relativePath = filepath.slice(coll.path.length + 1);
          row = db.prepare(`
            SELECT content.doc as body
            FROM documents d
            JOIN content ON content.hash = d.hash
            WHERE d.collection = ? AND d.path = ? AND d.active = 1
          `).get(coll.name, relativePath) as { body: string } | null;
          if (row) break;
        }
      }
    }

    if (!row) return null;

    let body = row.body;
    if (fromLine !== undefined || maxLines !== undefined) {
      const lines = body.split('\n');
      // Clamp both bounds: a negative index handed to slice() would count from the
      // end of the document and silently return the wrong lines.
      const start = Math.max(0, (fromLine || 1) - 1);
      const end = maxLines !== undefined ? start + Math.max(0, maxLines) : lines.length;
      body = lines.slice(start, end).join('\n');
    }
    return body;
  }
  /**
   * Find multiple documents by glob pattern or comma-separated list.
   * Returns documents without body by default (use getDocumentBody to load).
   *
   * A pattern containing a comma and no `*` or `?` is treated as a list of names;
   * each name is matched exactly against the virtual path, then as a suffix.
   * Anything else is handed to the glob matcher.
   *
   * @param db - Open database handle.
   * @param pattern - Glob pattern or comma-separated names.
   * @param options - `includeBody` loads document text; `maxBytes` is the size limit
   *                  above which a document is reported as skipped
   *                  (defaults to DEFAULT_MULTI_GET_MAX_BYTES).
   * @returns Matched documents (oversized ones flagged `skipped`) plus one error
   *          message per name or pattern that matched nothing.
   */
  export function findDocuments(
    db: Database,
    pattern: string,
    options: { includeBody?: boolean; maxBytes?: number } = {}
  ): { docs: MultiGetResult[]; errors: string[] } {
    const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?');
    const errors: string[] = [];
    const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES;

    const bodyCol = options.includeBody ? `, content.doc as body` : ``;
    const selectCols = `
      'qmd://' || d.collection || '/' || d.path as virtual_path,
      d.collection || '/' || d.path as display_path,
      d.title,
      d.hash,
      d.collection,
      d.modified_at,
      LENGTH(content.doc) as body_length
      ${bodyCol}
    `;

    let fileRows: DbDocRow[];

    if (isCommaSeparated) {
      const names = pattern.split(',').map(s => s.trim()).filter(Boolean);

      // Prepare both lookups once instead of once per name.
      const exactLookup = db.prepare(`
        SELECT ${selectCols}
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
      `);
      const suffixLookup = db.prepare(`
        SELECT ${selectCols}
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? AND d.active = 1
        LIMIT 1
      `);

      fileRows = [];
      for (const name of names) {
        let doc = exactLookup.get(name) as DbDocRow | null;
        if (!doc) {
          doc = suffixLookup.get(`%${name}`) as DbDocRow | null;
        }

        if (doc) {
          fileRows.push(doc);
          continue;
        }

        const similar = findSimilarFiles(db, name, 5, 3);
        let msg = `File not found: ${name}`;
        if (similar.length > 0) {
          msg += ` (did you mean: ${similar.join(', ')}?)`;
        }
        errors.push(msg);
      }
    } else {
      // Glob pattern match
      const matched = matchFilesByGlob(db, pattern);
      if (matched.length === 0) {
        errors.push(`No files matched pattern: ${pattern}`);
        return { docs: [], errors };
      }
      const virtualPaths = matched.map(m => m.filepath);
      const placeholders = virtualPaths.map(() => '?').join(',');
      fileRows = db.prepare(`
        SELECT ${selectCols}
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path IN (${placeholders}) AND d.active = 1
      `).all(...virtualPaths) as DbDocRow[];
    }

    const results: MultiGetResult[] = [];

    for (const row of fileRows) {
      // display_path already starts with the collection name, so the fallback
      // must not prepend the collection a second time.
      const virtualPath = row.virtual_path || `qmd://${row.display_path}`;

      if (row.body_length > maxBytes) {
        results.push({
          doc: { filepath: virtualPath, displayPath: row.display_path },
          skipped: true,
          skipReason: `File too large (${Math.round(row.body_length / 1024)}KB > ${Math.round(maxBytes / 1024)}KB)`,
        });
        continue;
      }

      // Context is only part of non-skipped results, so look it up after the size check.
      const context = getContextForFile(db, virtualPath);

      results.push({
        doc: {
          filepath: virtualPath,
          displayPath: row.display_path,
          title: row.title || row.display_path.split('/').pop() || row.display_path,
          context,
          hash: row.hash,
          docid: getDocid(row.hash),
          collectionName: row.collection,
          modifiedAt: row.modified_at,
          bodyLength: row.body_length,
          ...(options.includeBody && row.body !== undefined && { body: row.body }),
        },
        skipped: false,
      });
    }

    return { docs: results, errors };
  }
  1908. // =============================================================================
  1909. // Status
  1910. // =============================================================================
  /**
   * Summarise the index: totals plus per-collection document counts.
   *
   * Collections come from the YAML config; counts and timestamps come from the
   * database. A collection with no `MAX(modified_at)` value is reported with the
   * current time as `lastUpdated`. Collections are ordered most recently updated first.
   *
   * @param db - Open database handle.
   * @returns Total active documents, the embedding backlog as reported by
   *          `getHashesNeedingEmbedding`, whether the `vectors_vec` table exists,
   *          and the per-collection breakdown.
   */
  export function getStatus(db: Database): IndexStatus {
    const collections = collectionsListCollections().map(({ name, path, pattern }) => {
      const counts = db.prepare(`
        SELECT
          COUNT(*) as active_count,
          MAX(modified_at) as last_doc_update
        FROM documents
        WHERE collection = ? AND active = 1
      `).get(name) as { active_count: number; last_doc_update: string | null };

      return {
        name,
        path,
        pattern,
        documents: counts.active_count,
        lastUpdated: counts.last_doc_update || new Date().toISOString(),
      };
    });

    // Most recently updated collection first; entries without a timestamp sink to the end.
    collections.sort((left, right) => {
      if (!left.lastUpdated) return 1;
      if (!right.lastUpdated) return -1;
      return Date.parse(right.lastUpdated) - Date.parse(left.lastUpdated);
    });

    const totalRow = db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get() as { c: number };
    const needsEmbedding = getHashesNeedingEmbedding(db);
    const vecTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();

    return {
      totalDocuments: totalRow.c,
      needsEmbedding,
      hasVectorIndex: !!vecTable,
      collections,
    };
  }
  1947. // =============================================================================
  1948. // Snippet extraction
  1949. // =============================================================================
  /** Result of `extractSnippet`: the excerpt plus where it sits in the document. */
  export type SnippetResult = {
    /** 1-indexed line number of the best-matching line. */
    line: number;
    /** Snippet text, preceded by a diff-style `@@ -start,count @@` header line. */
    snippet: string;
    /** Number of document lines before the snippet. */
    linesBefore: number;
    /** Number of document lines after the snippet. */
    linesAfter: number;
    /** Number of lines the snippet covers. */
    snippetLines: number;
  };
  /**
   * Pick a short excerpt of `body` around the line that matches the query best.
   *
   * Each line is scored by how many whitespace-separated query terms it contains
   * (case-insensitive substring match); the first line with the highest score wins.
   * The excerpt runs from one line before the winner to two lines after it.
   *
   * @param body - Full document text.
   * @param query - User query.
   * @param maxLen - Maximum excerpt length in characters; longer text is cut and
   *                 ends in "..." (default 500).
   * @param chunkPos - Optional character offset; when positive, only a window from
   *                   100 characters before it to `maxLen + 100` characters after
   *                   it is searched.
   * @returns The excerpt with a diff-style header, plus its position in the document.
   */
  export function extractSnippet(body: string, query: string, maxLen = 500, chunkPos?: number): SnippetResult {
    const totalLines = body.split('\n').length;
    const anchor = chunkPos ?? 0;
    const focusOnChunk = anchor > 0;

    // Narrow the search to a window around the chunk when a position is given.
    let haystack = body;
    let lineOffset = 0;
    if (focusOnChunk) {
      const windowStart = Math.max(0, anchor - 100);
      const windowEnd = Math.min(body.length, anchor + maxLen + 100);
      haystack = body.slice(windowStart, windowEnd);
      if (windowStart > 0) {
        // Number of newlines before the window = lines that precede it.
        lineOffset = body.slice(0, windowStart).split('\n').length - 1;
      }
    }

    const lines = haystack.split('\n');
    const terms = query.toLowerCase().split(/\s+/).filter(term => term.length > 0);

    // Find the first line containing the most query terms.
    let bestLine = 0;
    let bestHits = -1;
    lines.forEach((line, idx) => {
      const lowered = line.toLowerCase();
      const hits = terms.filter(term => lowered.includes(term)).length;
      if (hits > bestHits) {
        bestHits = hits;
        bestLine = idx;
      }
    });

    const start = Math.max(0, bestLine - 1);
    const picked = lines.slice(start, Math.min(lines.length, bestLine + 3));
    const joined = picked.join('\n');

    // If we focused on a chunk window and it produced an empty/whitespace-only snippet,
    // fall back to a full-document snippet so we always show something useful.
    if (focusOnChunk && joined.trim().length === 0) {
      return extractSnippet(body, query, maxLen, undefined);
    }

    const snippetText = joined.length > maxLen ? joined.substring(0, maxLen - 3) + "..." : joined;

    const absoluteStart = lineOffset + start + 1; // 1-indexed
    const linesBefore = absoluteStart - 1;
    const linesAfter = totalLines - (absoluteStart + picked.length - 1);

    // Format with diff-style header: @@ -start,count @@ (linesBefore before, linesAfter after)
    const header = `@@ -${absoluteStart},${picked.length} @@ (${linesBefore} before, ${linesAfter} after)`;

    return {
      line: lineOffset + bestLine + 1,
      snippet: `${header}\n${snippetText}`,
      linesBefore,
      linesAfter,
      snippetLines: picked.length,
    };
  }