store.ts 84 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477
  1. /**
  2. * QMD Store - Core data access and retrieval functions
  3. *
  4. * This module provides all database operations, search functions, and document
  5. * retrieval for QMD. It returns raw data structures that can be formatted by
  6. * CLI or MCP consumers.
  7. *
  8. * Usage:
  9. * const store = createStore("/path/to/db.sqlite");
  10. * // or use default path:
  11. * const store = createStore();
  12. */
  13. import { Database } from "bun:sqlite";
  14. import { Glob } from "bun";
  15. import * as sqliteVec from "sqlite-vec";
  16. import {
  17. LlamaCpp,
  18. getDefaultLlamaCpp,
  19. formatQueryForEmbedding,
  20. formatDocForEmbedding,
  21. type RerankDocument,
  22. } from "./llm";
  23. import {
  24. findContextForPath as collectionsFindContextForPath,
  25. addContext as collectionsAddContext,
  26. removeContext as collectionsRemoveContext,
  27. listAllContexts as collectionsListAllContexts,
  28. getCollection,
  29. listCollections as collectionsListCollections,
  30. addCollection as collectionsAddCollection,
  31. removeCollection as collectionsRemoveCollection,
  32. renameCollection as collectionsRenameCollection,
  33. setGlobalContext,
  34. loadConfig as collectionsLoadConfig,
  35. type NamedCollection,
  36. } from "./collections";
  37. // =============================================================================
  38. // Configuration
  39. // =============================================================================
  40. const HOME = Bun.env.HOME || "/tmp";
  41. export const DEFAULT_EMBED_MODEL = "embeddinggemma";
  42. export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
  43. export const DEFAULT_QUERY_MODEL = "qwen3:0.6b";
  44. export const DEFAULT_GLOB = "**/*.md";
  45. export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB
  46. // Chunking: 800 tokens per chunk with 15% overlap
  47. export const CHUNK_SIZE_TOKENS = 800;
  48. export const CHUNK_OVERLAP_TOKENS = Math.floor(CHUNK_SIZE_TOKENS * 0.15); // 120 tokens (15% overlap)
  49. // Fallback char-based approximation for sync chunking (~4 chars per token)
  50. export const CHUNK_SIZE_CHARS = CHUNK_SIZE_TOKENS * 4; // 3200 chars
  51. export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4; // 480 chars
  52. // =============================================================================
  53. // Path utilities
  54. // =============================================================================
  55. export function homedir(): string {
  56. return HOME;
  57. }
  58. export function resolve(...paths: string[]): string {
  59. let result = paths[0].startsWith('/') ? '' : Bun.env.PWD || process.cwd();
  60. for (const p of paths) {
  61. if (p.startsWith('/')) {
  62. result = p;
  63. } else {
  64. result = result + '/' + p;
  65. }
  66. }
  67. const parts = result.split('/').filter(Boolean);
  68. const normalized: string[] = [];
  69. for (const part of parts) {
  70. if (part === '..') normalized.pop();
  71. else if (part !== '.') normalized.push(part);
  72. }
  73. return '/' + normalized.join('/');
  74. }
  75. // Flag to indicate production mode (set by qmd.ts at startup)
  76. let _productionMode = false;
  77. export function enableProductionMode(): void {
  78. _productionMode = true;
  79. }
  80. export function getDefaultDbPath(indexName: string = "index"): string {
  81. // Always allow override via INDEX_PATH (for testing)
  82. if (Bun.env.INDEX_PATH) {
  83. return Bun.env.INDEX_PATH;
  84. }
  85. // In non-production mode (tests), require explicit path
  86. if (!_productionMode) {
  87. throw new Error(
  88. "Database path not set. Tests must set INDEX_PATH env var or use createStore() with explicit path. " +
  89. "This prevents tests from accidentally writing to the global index."
  90. );
  91. }
  92. const cacheDir = Bun.env.XDG_CACHE_HOME || resolve(homedir(), ".cache");
  93. const qmdCacheDir = resolve(cacheDir, "qmd");
  94. try { Bun.spawnSync(["mkdir", "-p", qmdCacheDir]); } catch {}
  95. return resolve(qmdCacheDir, `${indexName}.sqlite`);
  96. }
  97. export function getPwd(): string {
  98. return process.env.PWD || process.cwd();
  99. }
  100. export function getRealPath(path: string): string {
  101. try {
  102. const result = Bun.spawnSync(["realpath", path]);
  103. if (result.success) {
  104. return result.stdout.toString().trim();
  105. }
  106. } catch {}
  107. return resolve(path);
  108. }
  109. // =============================================================================
  110. // Virtual Path Utilities (qmd://)
  111. // =============================================================================
  112. export type VirtualPath = {
  113. collectionName: string;
  114. path: string; // relative path within collection
  115. };
  116. /**
  117. * Normalize explicit virtual path formats to standard qmd:// format.
  118. * Only handles paths that are already explicitly virtual:
  119. * - qmd://collection/path.md (already normalized)
  120. * - qmd:////collection/path.md (extra slashes - normalize)
  121. * - //collection/path.md (missing qmd: prefix - add it)
  122. *
  123. * Does NOT handle:
  124. * - collection/path.md (bare paths - could be filesystem relative)
  125. * - :linenum suffix (should be parsed separately before calling this)
  126. */
  127. export function normalizeVirtualPath(input: string): string {
  128. let path = input.trim();
  129. // Handle qmd:// with extra slashes: qmd:////collection/path -> qmd://collection/path
  130. if (path.startsWith('qmd:')) {
  131. // Remove qmd: prefix and normalize slashes
  132. path = path.slice(4);
  133. // Remove leading slashes and re-add exactly two
  134. path = path.replace(/^\/+/, '');
  135. return `qmd://${path}`;
  136. }
  137. // Handle //collection/path (missing qmd: prefix)
  138. if (path.startsWith('//')) {
  139. path = path.replace(/^\/+/, '');
  140. return `qmd://${path}`;
  141. }
  142. // Return as-is for other cases (filesystem paths, docids, bare collection/path, etc.)
  143. return path;
  144. }
  145. /**
  146. * Parse a virtual path like "qmd://collection-name/path/to/file.md"
  147. * into its components.
  148. * Also supports collection root: "qmd://collection-name/" or "qmd://collection-name"
  149. */
  150. export function parseVirtualPath(virtualPath: string): VirtualPath | null {
  151. // Normalize the path first
  152. const normalized = normalizeVirtualPath(virtualPath);
  153. // Match: qmd://collection-name[/optional-path]
  154. // Allows: qmd://name, qmd://name/, qmd://name/path
  155. const match = normalized.match(/^qmd:\/\/([^\/]+)\/?(.*)$/);
  156. if (!match) return null;
  157. return {
  158. collectionName: match[1],
  159. path: match[2] || '', // Empty string for collection root
  160. };
  161. }
  162. /**
  163. * Build a virtual path from collection name and relative path.
  164. */
  165. export function buildVirtualPath(collectionName: string, path: string): string {
  166. return `qmd://${collectionName}/${path}`;
  167. }
  168. /**
  169. * Check if a path is explicitly a virtual path.
  170. * Only recognizes explicit virtual path formats:
  171. * - qmd://collection/path.md
  172. * - //collection/path.md
  173. *
  174. * Does NOT consider bare collection/path.md as virtual - that should be
  175. * handled separately by checking if the first component is a collection name.
  176. */
  177. export function isVirtualPath(path: string): boolean {
  178. const trimmed = path.trim();
  179. // Explicit qmd:// prefix (with any number of slashes)
  180. if (trimmed.startsWith('qmd:')) return true;
  181. // //collection/path format (missing qmd: prefix)
  182. if (trimmed.startsWith('//')) return true;
  183. return false;
  184. }
  185. /**
  186. * Resolve a virtual path to absolute filesystem path.
  187. */
  188. export function resolveVirtualPath(db: Database, virtualPath: string): string | null {
  189. const parsed = parseVirtualPath(virtualPath);
  190. if (!parsed) return null;
  191. const coll = getCollectionByName(db, parsed.collectionName);
  192. if (!coll) return null;
  193. return resolve(coll.pwd, parsed.path);
  194. }
  195. /**
  196. * Convert an absolute filesystem path to a virtual path.
  197. * Returns null if the file is not in any indexed collection.
  198. */
  199. export function toVirtualPath(db: Database, absolutePath: string): string | null {
  200. // Get all collections from YAML config
  201. const collections = collectionsListCollections();
  202. // Find which collection this absolute path belongs to
  203. for (const coll of collections) {
  204. if (absolutePath.startsWith(coll.path + '/') || absolutePath === coll.path) {
  205. // Extract relative path
  206. const relativePath = absolutePath.startsWith(coll.path + '/')
  207. ? absolutePath.slice(coll.path.length + 1)
  208. : '';
  209. // Verify this document exists in the database
  210. const doc = db.prepare(`
  211. SELECT d.path
  212. FROM documents d
  213. WHERE d.collection = ? AND d.path = ? AND d.active = 1
  214. LIMIT 1
  215. `).get(coll.name, relativePath) as { path: string } | null;
  216. if (doc) {
  217. return buildVirtualPath(coll.name, relativePath);
  218. }
  219. }
  220. }
  221. return null;
  222. }
  223. // =============================================================================
  224. // Database initialization
  225. // =============================================================================
  226. // On macOS, use Homebrew's SQLite which supports extensions
  227. if (process.platform === "darwin") {
  228. const homebrewSqlitePath = "/opt/homebrew/opt/sqlite/lib/libsqlite3.dylib";
  229. try {
  230. if (Bun.file(homebrewSqlitePath).size > 0) {
  231. Database.setCustomSQLite(homebrewSqlitePath);
  232. }
  233. } catch {}
  234. }
  235. function initializeDatabase(db: Database): void {
  236. sqliteVec.load(db);
  237. db.exec("PRAGMA journal_mode = WAL");
  238. db.exec("PRAGMA foreign_keys = ON");
  239. // Drop legacy tables that are now managed in YAML
  240. db.exec(`DROP TABLE IF EXISTS path_contexts`);
  241. db.exec(`DROP TABLE IF EXISTS collections`);
  242. // Content-addressable storage - the source of truth for document content
  243. db.exec(`
  244. CREATE TABLE IF NOT EXISTS content (
  245. hash TEXT PRIMARY KEY,
  246. doc TEXT NOT NULL,
  247. created_at TEXT NOT NULL
  248. )
  249. `);
  250. // Documents table - file system layer mapping virtual paths to content hashes
  251. // Collections are now managed in ~/.config/qmd/index.yml
  252. db.exec(`
  253. CREATE TABLE IF NOT EXISTS documents (
  254. id INTEGER PRIMARY KEY AUTOINCREMENT,
  255. collection TEXT NOT NULL,
  256. path TEXT NOT NULL,
  257. title TEXT NOT NULL,
  258. hash TEXT NOT NULL,
  259. created_at TEXT NOT NULL,
  260. modified_at TEXT NOT NULL,
  261. active INTEGER NOT NULL DEFAULT 1,
  262. FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
  263. UNIQUE(collection, path)
  264. )
  265. `);
  266. db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents(collection, active)`);
  267. db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(hash)`);
  268. db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_path ON documents(path, active)`);
  269. // Cache table for LLM API calls (table name kept for backwards compatibility)
  270. db.exec(`
  271. CREATE TABLE IF NOT EXISTS llm_cache (
  272. hash TEXT PRIMARY KEY,
  273. result TEXT NOT NULL,
  274. created_at TEXT NOT NULL
  275. )
  276. `);
  277. // Content vectors
  278. const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
  279. const hasSeqColumn = cvInfo.some(col => col.name === 'seq');
  280. if (cvInfo.length > 0 && !hasSeqColumn) {
  281. db.exec(`DROP TABLE IF EXISTS content_vectors`);
  282. db.exec(`DROP TABLE IF EXISTS vectors_vec`);
  283. }
  284. db.exec(`
  285. CREATE TABLE IF NOT EXISTS content_vectors (
  286. hash TEXT NOT NULL,
  287. seq INTEGER NOT NULL DEFAULT 0,
  288. pos INTEGER NOT NULL DEFAULT 0,
  289. model TEXT NOT NULL,
  290. embedded_at TEXT NOT NULL,
  291. PRIMARY KEY (hash, seq)
  292. )
  293. `);
  294. // FTS - index filepath (collection/path), title, and content
  295. db.exec(`
  296. CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
  297. filepath, title, body,
  298. tokenize='porter unicode61'
  299. )
  300. `);
  301. // Triggers to keep FTS in sync
  302. db.exec(`
  303. CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents
  304. WHEN new.active = 1
  305. BEGIN
  306. INSERT INTO documents_fts(rowid, filepath, title, body)
  307. SELECT
  308. new.id,
  309. new.collection || '/' || new.path,
  310. new.title,
  311. (SELECT doc FROM content WHERE hash = new.hash)
  312. WHERE new.active = 1;
  313. END
  314. `);
  315. db.exec(`
  316. CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
  317. DELETE FROM documents_fts WHERE rowid = old.id;
  318. END
  319. `);
  320. db.exec(`
  321. CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents
  322. BEGIN
  323. -- Delete from FTS if no longer active
  324. DELETE FROM documents_fts WHERE rowid = old.id AND new.active = 0;
  325. -- Update FTS if still/newly active
  326. INSERT OR REPLACE INTO documents_fts(rowid, filepath, title, body)
  327. SELECT
  328. new.id,
  329. new.collection || '/' || new.path,
  330. new.title,
  331. (SELECT doc FROM content WHERE hash = new.hash)
  332. WHERE new.active = 1;
  333. END
  334. `);
  335. }
  336. function ensureVecTableInternal(db: Database, dimensions: number): void {
  337. const tableInfo = db.prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get() as { sql: string } | null;
  338. if (tableInfo) {
  339. const match = tableInfo.sql.match(/float\[(\d+)\]/);
  340. const hasHashSeq = tableInfo.sql.includes('hash_seq');
  341. const hasCosine = tableInfo.sql.includes('distance_metric=cosine');
  342. if (match && parseInt(match[1]) === dimensions && hasHashSeq && hasCosine) return;
  343. // Table exists but wrong schema - need to rebuild
  344. db.exec("DROP TABLE IF EXISTS vectors_vec");
  345. }
  346. db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}] distance_metric=cosine)`);
  347. }
  348. // =============================================================================
  349. // Store Factory
  350. // =============================================================================
  351. export type Store = {
  352. db: Database;
  353. dbPath: string;
  354. close: () => void;
  355. ensureVecTable: (dimensions: number) => void;
  356. // Index health
  357. getHashesNeedingEmbedding: () => number;
  358. getIndexHealth: () => IndexHealthInfo;
  359. getStatus: () => IndexStatus;
  360. // Caching
  361. getCacheKey: typeof getCacheKey;
  362. getCachedResult: (cacheKey: string) => string | null;
  363. setCachedResult: (cacheKey: string, result: string) => void;
  364. clearCache: () => void;
  365. // Cleanup and maintenance
  366. deleteLLMCache: () => number;
  367. deleteInactiveDocuments: () => number;
  368. cleanupOrphanedContent: () => number;
  369. cleanupOrphanedVectors: () => number;
  370. cleanupDuplicateCollections: () => number;
  371. vacuumDatabase: () => void;
  372. // Context
  373. getContextForFile: (filepath: string) => string | null;
  374. getContextForPath: (collectionName: string, path: string) => string | null;
  375. getCollectionByName: (name: string) => { name: string; pwd: string; glob_pattern: string } | null;
  376. getCollectionsWithoutContext: () => { name: string; pwd: string; doc_count: number }[];
  377. getTopLevelPathsWithoutContext: (collectionName: string) => string[];
  378. // Virtual paths
  379. parseVirtualPath: typeof parseVirtualPath;
  380. buildVirtualPath: typeof buildVirtualPath;
  381. isVirtualPath: typeof isVirtualPath;
  382. resolveVirtualPath: (virtualPath: string) => string | null;
  383. toVirtualPath: (absolutePath: string) => string | null;
  384. // Search
  385. searchFTS: (query: string, limit?: number, collectionId?: number) => SearchResult[];
  386. searchVec: (query: string, model: string, limit?: number, collectionId?: number) => Promise<SearchResult[]>;
  387. // Query expansion & reranking
  388. expandQuery: (query: string, model?: string) => Promise<string[]>;
  389. rerank: (query: string, documents: { file: string; text: string }[], model?: string) => Promise<{ file: string; score: number }[]>;
  390. // Document retrieval
  391. findDocument: (filename: string, options?: { includeBody?: boolean }) => DocumentResult | DocumentNotFound;
  392. getDocumentBody: (doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number) => string | null;
  393. findDocuments: (pattern: string, options?: { includeBody?: boolean; maxBytes?: number }) => { docs: MultiGetResult[]; errors: string[] };
  394. // Legacy compatibility
  395. getDocument: (filename: string, fromLine?: number, maxLines?: number) => (DocumentResult & { body: string }) | DocumentNotFound;
  396. getMultipleDocuments: (pattern: string, maxLines?: number, maxBytes?: number) => { files: MultiGetFile[]; errors: string[] };
  397. // Fuzzy matching and docid lookup
  398. findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => string[];
  399. matchFilesByGlob: (pattern: string) => { filepath: string; displayPath: string; bodyLength: number }[];
  400. findDocumentByDocid: (docid: string) => { filepath: string; hash: string } | null;
  401. // Document indexing operations
  402. insertContent: (hash: string, content: string, createdAt: string) => void;
  403. insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => void;
  404. findActiveDocument: (collectionName: string, path: string) => { id: number; hash: string; title: string } | null;
  405. updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => void;
  406. updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => void;
  407. deactivateDocument: (collectionName: string, path: string) => void;
  408. getActiveDocumentPaths: (collectionName: string) => string[];
  409. // Vector/embedding operations
  410. getHashesForEmbedding: () => { hash: string; body: string; path: string }[];
  411. clearAllEmbeddings: () => void;
  412. insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => void;
  413. };
  414. /**
  415. * Create a new store instance with the given database path.
  416. * If no path is provided, uses the default path (~/.cache/qmd/index.sqlite).
  417. *
  418. * @param dbPath - Path to the SQLite database file
  419. * @returns Store instance with all methods bound to the database
  420. */
  421. export function createStore(dbPath?: string): Store {
  422. const resolvedPath = dbPath || getDefaultDbPath();
  423. const db = new Database(resolvedPath);
  424. initializeDatabase(db);
  425. return {
  426. db,
  427. dbPath: resolvedPath,
  428. close: () => db.close(),
  429. ensureVecTable: (dimensions: number) => ensureVecTableInternal(db, dimensions),
  430. // Index health
  431. getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db),
  432. getIndexHealth: () => getIndexHealth(db),
  433. getStatus: () => getStatus(db),
  434. // Caching
  435. getCacheKey,
  436. getCachedResult: (cacheKey: string) => getCachedResult(db, cacheKey),
  437. setCachedResult: (cacheKey: string, result: string) => setCachedResult(db, cacheKey, result),
  438. clearCache: () => clearCache(db),
  439. // Cleanup and maintenance
  440. deleteLLMCache: () => deleteLLMCache(db),
  441. deleteInactiveDocuments: () => deleteInactiveDocuments(db),
  442. cleanupOrphanedContent: () => cleanupOrphanedContent(db),
  443. cleanupOrphanedVectors: () => cleanupOrphanedVectors(db),
  444. cleanupDuplicateCollections: () => cleanupDuplicateCollections(db),
  445. vacuumDatabase: () => vacuumDatabase(db),
  446. // Context
  447. getContextForFile: (filepath: string) => getContextForFile(db, filepath),
  448. getContextForPath: (collectionName: string, path: string) => getContextForPath(db, collectionName, path),
  449. getCollectionByName: (name: string) => getCollectionByName(db, name),
  450. getCollectionsWithoutContext: () => getCollectionsWithoutContext(db),
  451. getTopLevelPathsWithoutContext: (collectionName: string) => getTopLevelPathsWithoutContext(db, collectionName),
  452. // Virtual paths
  453. parseVirtualPath,
  454. buildVirtualPath,
  455. isVirtualPath,
  456. resolveVirtualPath: (virtualPath: string) => resolveVirtualPath(db, virtualPath),
  457. toVirtualPath: (absolutePath: string) => toVirtualPath(db, absolutePath),
  458. // Search
  459. searchFTS: (query: string, limit?: number, collectionId?: number) => searchFTS(db, query, limit, collectionId),
  460. searchVec: (query: string, model: string, limit?: number, collectionId?: number) => searchVec(db, query, model, limit, collectionId),
  461. // Query expansion & reranking
  462. expandQuery: (query: string, model?: string) => expandQuery(query, model, db),
  463. rerank: (query: string, documents: { file: string; text: string }[], model?: string) => rerank(query, documents, model, db),
  464. // Document retrieval
  465. findDocument: (filename: string, options?: { includeBody?: boolean }) => findDocument(db, filename, options),
  466. getDocumentBody: (doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number) => getDocumentBody(db, doc, fromLine, maxLines),
  467. findDocuments: (pattern: string, options?: { includeBody?: boolean; maxBytes?: number }) => findDocuments(db, pattern, options),
  468. // Legacy compatibility
  469. getDocument: (filename: string, fromLine?: number, maxLines?: number) => getDocument(db, filename, fromLine, maxLines),
  470. getMultipleDocuments: (pattern: string, maxLines?: number, maxBytes?: number) => getMultipleDocuments(db, pattern, maxLines, maxBytes),
  471. // Fuzzy matching and docid lookup
  472. findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => findSimilarFiles(db, query, maxDistance, limit),
  473. matchFilesByGlob: (pattern: string) => matchFilesByGlob(db, pattern),
  474. findDocumentByDocid: (docid: string) => findDocumentByDocid(db, docid),
  475. // Document indexing operations
  476. insertContent: (hash: string, content: string, createdAt: string) => insertContent(db, hash, content, createdAt),
  477. insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt),
  478. findActiveDocument: (collectionName: string, path: string) => findActiveDocument(db, collectionName, path),
  479. updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => updateDocumentTitle(db, documentId, title, modifiedAt),
  480. updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => updateDocument(db, documentId, title, hash, modifiedAt),
  481. deactivateDocument: (collectionName: string, path: string) => deactivateDocument(db, collectionName, path),
  482. getActiveDocumentPaths: (collectionName: string) => getActiveDocumentPaths(db, collectionName),
  483. // Vector/embedding operations
  484. getHashesForEmbedding: () => getHashesForEmbedding(db),
  485. clearAllEmbeddings: () => clearAllEmbeddings(db),
  486. insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt),
  487. };
  488. }
  489. // =============================================================================
  490. // Legacy compatibility - will be removed
  491. // =============================================================================
  // Connection cached for the deprecated getDb()/closeDb() helpers; stays null until getDb() opens one.
  let _legacyDb: Database | null = null;
  // Path override installed by setCustomIndexName(); while null, getDbPath() falls back to getDefaultDbPath().
  let _legacyDbPath: string | null = null;
  /**
   * Select which index the legacy singleton helpers operate on.
   *
   * A truthy name is resolved through getDefaultDbPath(name); a falsy name
   * (null or "") clears the override.
   *
   * @deprecated Use createStore() instead
   */
  export function setCustomIndexName(name: string | null): void {
    if (name) {
      _legacyDbPath = getDefaultDbPath(name);
    } else {
      _legacyDbPath = null;
    }
    // Forget the cached handle (it is not closed here) so the next getDb() opens a fresh connection.
    _legacyDb = null;
  }
  /**
   * Path used by the legacy singleton: the override set via
   * setCustomIndexName() when present, otherwise getDefaultDbPath().
   *
   * @deprecated Use createStore() instead
   */
  export function getDbPath(): string {
    if (_legacyDbPath) {
      return _legacyDbPath;
    }
    return getDefaultDbPath();
  }
  /**
   * Return the legacy singleton connection, opening and initializing it on
   * first use.
   *
   * The connection is only cached once initializeDatabase() has succeeded.
   * If initialization throws, the half-opened connection is closed and the
   * error is rethrown, so a later call retries from scratch instead of
   * handing out an uninitialized database.
   *
   * @deprecated Use createStore() instead
   * @returns The shared Database instance.
   */
  export function getDb(): Database {
    if (_legacyDb) {
      return _legacyDb;
    }
    const db = new Database(getDbPath());
    try {
      initializeDatabase(db);
    } catch (err) {
      db.close();
      throw err;
    }
    _legacyDb = db;
    return db;
  }
  /**
   * Close the legacy singleton connection, if one is open, and clear the
   * cached reference. Does nothing when no connection is cached.
   *
   * @deprecated Use store.db.close() instead.
   */
  export function closeDb(): void {
    if (!_legacyDb) {
      return;
    }
    _legacyDb.close();
    _legacyDb = null;
  }
  /**
   * Thin public wrapper that forwards to ensureVecTableInternal().
   *
   * @deprecated Use store.ensureVecTable() instead
   * @param db Database to operate on
   * @param dimensions Passed through unchanged to ensureVecTableInternal()
   */
  export function ensureVecTable(db: Database, dimensions: number): void {
    ensureVecTableInternal(db, dimensions);
  }
  522. // =============================================================================
  523. // Core Document Type
  524. // =============================================================================
  /**
   * Unified document result type with all metadata.
   * The body is optional; load it separately with getDocumentBody() when needed.
   */
  export type DocumentResult = {
    /** Full filesystem path */
    filepath: string;
    /** Short display path (e.g., "docs/readme.md") */
    displayPath: string;
    /** Document title (from first heading or filename) */
    title: string;
    /** Folder context description if configured, otherwise null */
    context: string | null;
    /** Content hash for caching/change detection */
    hash: string;
    /** Short docid (first 6 chars of hash) for quick reference */
    docid: string;
    /** Parent collection name */
    collectionName: string;
    /** Last modification timestamp */
    modifiedAt: string;
    /** Body length in bytes (useful before loading the body) */
    bodyLength: number;
    /** Document body (optional, load with getDocumentBody) */
    body?: string;
  };
  /**
   * Derive the short docid for a content hash.
   *
   * @param hash Full content hash
   * @returns The first 6 characters of `hash` (the whole string if it is shorter)
   */
  export function getDocid(hash: string): string {
    const DOCID_LENGTH = 6;
    return hash.substring(0, DOCID_LENGTH);
  }
  /**
   * Turn a file path into a token-friendly "handle".
   *
   * Steps, in order:
   * - `___` (triple underscore) becomes `/`, i.e. a folder separator
   * - everything is lowercased
   * - within each path segment, runs of non-word characters and underscores
   *   collapse to a single dash, and leading/trailing dashes are dropped
   * - the extension of the final segment (a dot followed by letters/digits)
   *   is kept untouched
   * - segments that end up empty are discarded
   *
   * @param path Path to convert
   * @returns The handelized path
   * @throws Error if the path is empty/blank, if the filename (minus its
   *   extension) has no ASCII letter or digit, or if nothing is left after
   *   cleaning
   */
  export function handelize(path: string): string {
    if (!path || path.trim() === '') {
      throw new Error('handelize: path cannot be empty');
    }

    // Validate against the raw input: the filename, stripped of its extension,
    // must contain at least one ASCII alphanumeric character.
    const rawParts = path.split('/').filter(Boolean);
    const rawFilename = rawParts.length > 0 ? rawParts[rawParts.length - 1] : '';
    const rawStem = rawFilename.replace(/\.[^.]+$/, '');
    if (!/[a-zA-Z0-9]/.test(rawStem)) {
      throw new Error(`handelize: path "${path}" has no valid filename content`);
    }

    // Collapse separator-ish runs to one dash and trim dashes at both ends.
    const scrub = (text: string): string =>
      text.replace(/[\W_]+/g, '-').replace(/^-+|-+$/g, '');

    const parts = path.replace(/___/g, '/').toLowerCase().split('/');
    const lastIndex = parts.length - 1;
    const cleaned: string[] = [];

    parts.forEach((part, index) => {
      let handle: string;
      if (index !== lastIndex) {
        // Directory segment: clean the whole thing.
        handle = scrub(part);
      } else {
        // Filename segment: clean the stem only and re-attach the extension.
        const extMatch = part.match(/(\.[a-z0-9]+)$/i);
        const ext = extMatch ? extMatch[1] : '';
        const stem = ext ? part.slice(0, part.length - ext.length) : part;
        handle = scrub(stem) + ext;
      }
      if (handle) {
        cleaned.push(handle);
      }
    });

    const result = cleaned.join('/');
    if (!result) {
      throw new Error(`handelize: path "${path}" resulted in empty string after processing`);
    }
    return result;
  }
  /**
   * A search hit: every DocumentResult field plus score and source information.
   */
  export type SearchResult = DocumentResult & {
    /** Relevance score (0-1) */
    score: number;
    /** Which search produced the hit: full-text ("fts") or vector ("vec") */
    source: "fts" | "vec";
    /** Character position of the matching chunk (for vector search) */
    chunkPos?: number;
  };
  /**
   * Ranked result for RRF fusion (simplified shape, used internally).
   */
  export type RankedResult = {
    file: string;
    displayPath: string;
    title: string;
    body: string;
    score: number;
  };
  /**
   * Error result returned when a document is not found.
   */
  export type DocumentNotFound = {
    /** Discriminant; always the literal "not_found" */
    error: "not_found";
    query: string;
    similarFiles: string[];
  };
  /**
   * Result from multi-get operations.
   * A union discriminated by `skipped`: loaded entries carry the full
   * DocumentResult, skipped entries carry only the two path fields plus a reason.
   */
  export type MultiGetResult = {
    doc: DocumentResult;
    skipped: false;
  } | {
    doc: Pick<DocumentResult, "filepath" | "displayPath">;
    skipped: true;
    /** Why the document was skipped */
    skipReason: string;
  };
  /**
   * Per-collection summary; used as the element type of IndexStatus.collections.
   */
  export type CollectionInfo = {
    name: string;
    path: string;
    pattern: string;
    documents: number;
    lastUpdated: string;
  };
  /**
   * Overall index status: document/embedding counters plus one CollectionInfo
   * entry per collection.
   */
  export type IndexStatus = {
    totalDocuments: number;
    needsEmbedding: number;
    hasVectorIndex: boolean;
    collections: CollectionInfo[];
  };
  648. // =============================================================================
  649. // Index health
  650. // =============================================================================
  /**
   * Count the distinct content hashes of active documents that have no
   * `content_vectors` row with `seq = 0`, i.e. content not embedded yet.
   *
   * @param db Database to query
   * @returns Number of distinct hashes still needing an embedding
   */
  export function getHashesNeedingEmbedding(db: Database): number {
    const sql = `
      SELECT COUNT(DISTINCT d.hash) as count
      FROM documents d
      LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
      WHERE d.active = 1 AND v.hash IS NULL
    `;
    const row = db.prepare(sql).get() as { count: number };
    return row.count;
  }
  /**
   * Health summary produced by getIndexHealth().
   */
  export type IndexHealthInfo = {
    /** Distinct content hashes of active documents without an embedding */
    needsEmbedding: number;
    /** Number of active documents */
    totalDocs: number;
    /** Whole days since the newest modified_at of an active document; null when there is none */
    daysStale: number | null;
  };
  /**
   * Gather index health figures: pending embeddings, active document count,
   * and how many whole days have passed since the most recently modified
   * active document.
   *
   * @param db Database to query
   * @returns The health summary; `daysStale` is null when no active document
   *   has a modified_at value
   */
  export function getIndexHealth(db: Database): IndexHealthInfo {
    const MS_PER_DAY = 24 * 60 * 60 * 1000;

    const needsEmbedding = getHashesNeedingEmbedding(db);

    const countRow = db
      .prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`)
      .get() as { count: number };
    const totalDocs = countRow.count;

    const latestRow = db
      .prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`)
      .get() as { latest: string | null };
    const latest = latestRow?.latest;

    const daysStale: number | null = latest
      ? Math.floor((Date.now() - new Date(latest).getTime()) / MS_PER_DAY)
      : null;

    return { needsEmbedding, totalDocs, daysStale };
  }
  676. // =============================================================================
  677. // Caching
  678. // =============================================================================
  /**
   * Build the cache key for a request: the hex SHA-256 digest of the URL
   * followed by the JSON-serialized body.
   *
   * @param url Request URL
   * @param body Request payload; serialized with JSON.stringify
   * @returns Hex digest string
   */
  export function getCacheKey(url: string, body: object): string {
    const serializedBody = JSON.stringify(body);
    const hasher = new Bun.CryptoHasher("sha256");
    hasher.update(url);
    hasher.update(serializedBody);
    return hasher.digest("hex");
  }
  /**
   * Look up a cached result in `llm_cache` by its key.
   *
   * @param db Database to query
   * @param cacheKey Key as produced by getCacheKey()
   * @returns The stored result, or null when there is no row or the stored
   *   result is an empty string
   */
  export function getCachedResult(db: Database, cacheKey: string): string | null {
    const lookup = db.prepare(`SELECT result FROM llm_cache WHERE hash = ?`);
    const row = lookup.get(cacheKey) as { result: string } | null;
    if (!row || !row.result) {
      return null;
    }
    return row.result;
  }
  /**
   * Store (or overwrite) a cached result under `cacheKey`, stamped with the
   * current time.
   *
   * On roughly 1% of calls (decided by Math.random()) the table is also
   * pruned down to the 1000 most recently created entries.
   *
   * @param db Database to write to
   * @param cacheKey Key as produced by getCacheKey()
   * @param result Value to cache
   */
  export function setCachedResult(db: Database, cacheKey: string, result: string): void {
    const createdAt = new Date().toISOString();
    const upsert = db.prepare(`INSERT OR REPLACE INTO llm_cache (hash, result, created_at) VALUES (?, ?, ?)`);
    upsert.run(cacheKey, result, createdAt);

    const shouldPrune = Math.random() < 0.01;
    if (!shouldPrune) {
      return;
    }
    db.exec(`DELETE FROM llm_cache WHERE hash NOT IN (SELECT hash FROM llm_cache ORDER BY created_at DESC LIMIT 1000)`);
  }
  /**
   * Delete every row from `llm_cache`.
   *
   * @param db Database to modify
   */
  export function clearCache(db: Database): void {
    const wipeSql = `DELETE FROM llm_cache`;
    db.exec(wipeSql);
  }
  699. // =============================================================================
  700. // Cleanup and maintenance operations
  701. // =============================================================================
  /**
   * Delete all cached LLM API responses.
   *
   * @param db Database to modify
   * @returns How many cache rows were deleted
   */
  export function deleteLLMCache(db: Database): number {
    const { changes } = db.prepare(`DELETE FROM llm_cache`).run();
    return changes;
  }
  /**
   * Permanently remove document rows that are marked inactive (active = 0).
   *
   * @param db Database to modify
   * @returns How many inactive document rows were deleted
   */
  export function deleteInactiveDocuments(db: Database): number {
    const { changes } = db.prepare(`DELETE FROM documents WHERE active = 0`).run();
    return changes;
  }
  /**
   * Delete rows from `content` whose hash is not used by any active document.
   *
   * @param db Database to modify
   * @returns How many orphaned content rows were deleted
   */
  export function cleanupOrphanedContent(db: Database): number {
    const sql = `
      DELETE FROM content
      WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
    `;
    const { changes } = db.prepare(sql).run();
    return changes;
  }
  /**
   * Delete embeddings whose content hash is not used by any active document.
   *
   * Returns 0 without touching anything when the `vectors_vec` table does not
   * exist or when no orphaned `content_vectors` rows are found. Otherwise the
   * matching `vectors_vec` rows (keyed by `hash || '_' || seq`) are deleted
   * first, then the `content_vectors` rows.
   *
   * @param db Database to modify
   * @returns Number of orphaned `content_vectors` rows counted before deletion
   */
  export function cleanupOrphanedVectors(db: Database): number {
    const vecTable = db.prepare(`
      SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'
    `).get();
    if (!vecTable) {
      return 0;
    }

    // Count up front: this is the value reported to the caller.
    const orphanRow = db.prepare(`
      SELECT COUNT(*) as c FROM content_vectors cv
      WHERE NOT EXISTS (
        SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
      )
    `).get() as { c: number };
    const orphanCount = orphanRow.c;
    if (orphanCount === 0) {
      return 0;
    }

    // vectors_vec must go first: its keys are derived from the content_vectors
    // rows that the second statement removes.
    db.exec(`
      DELETE FROM vectors_vec WHERE hash_seq IN (
        SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
        WHERE NOT EXISTS (
          SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
        )
      )
    `);
    db.exec(`
      DELETE FROM content_vectors WHERE hash NOT IN (
        SELECT hash FROM documents WHERE active = 1
      )
    `);

    return orphanCount;
  }
  /**
   * Formerly removed duplicate collections, keeping the oldest one per
   * (pwd, glob_pattern).
   *
   * NOTE: Deprecated. Collections are now managed in YAML, so there is nothing
   * to clean up; the function is kept for backwards compatibility only.
   *
   * @param db Unused
   * @returns Always 0
   */
  export function cleanupDuplicateCollections(db: Database): number {
    const removedCount = 0;
    return removedCount;
  }
  /**
   * Run SQLite's VACUUM on the database to reclaim unused space.
   * The operation rebuilds the database file, eliminating fragmentation.
   *
   * @param db Database to vacuum
   */
  export function vacuumDatabase(db: Database): void {
    const vacuumSql = `VACUUM`;
    db.exec(vacuumSql);
  }
  784. // =============================================================================
  785. // Document helpers
  786. // =============================================================================
  /**
   * Compute the hex SHA-256 digest of a document's content.
   * The function is async for its callers' sake; the hashing itself is synchronous.
   *
   * @param content Text to hash
   * @returns Promise resolving to the hex digest
   */
  export async function hashContent(content: string): Promise<string> {
    const hasher = new Bun.CryptoHasher("sha256");
    hasher.update(content);
    const digest = hasher.digest("hex");
    return digest;
  }
  /**
   * Derive a display title for a document.
   *
   * The first `#` or `##` heading wins. If that heading is just the
   * placeholder "Notes" / "📝 Notes", the next `##` heading *after it* is
   * used instead when one exists. Without any heading, the title falls back
   * to the filename's last path segment with a trailing ".md" removed.
   *
   * @param content Document text
   * @param filename Path or name of the file, used for the fallback
   * @returns The extracted title
   */
  export function extractTitle(content: string, filename: string): string {
    const firstHeading = /^##?\s+(.+)$/m.exec(content);
    if (!firstHeading) {
      return filename.replace(/\.md$/, "").split("/").pop() || filename;
    }

    const title = firstHeading[1].trim();
    if (title === "📝 Notes" || title === "Notes") {
      // Search only the text following the placeholder heading; scanning from
      // the top again would re-match the placeholder itself when it is a `##`.
      const remainder = content.slice(firstHeading.index + firstHeading[0].length);
      const nextHeading = /^##\s+(.+)$/m.exec(remainder);
      if (nextHeading) {
        return nextHeading[1].trim();
      }
    }
    return title;
  }
  804. // =============================================================================
  805. // Document indexing operations
  806. // =============================================================================
  /**
   * Store document content in the `content` table, keyed by its hash
   * (content-addressable storage). INSERT OR IGNORE means a hash that is
   * already present is left untouched.
   *
   * @param db Database to write to
   * @param hash Content hash used as the key
   * @param content Document text
   * @param createdAt Creation timestamp to record
   */
  export function insertContent(db: Database, hash: string, content: string, createdAt: string): void {
    const insert = db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`);
    insert.run(hash, content, createdAt);
  }
  /**
   * Add a new row to the `documents` table; the row is created active (active = 1).
   *
   * @param db Database to write to
   * @param collectionName Collection the document belongs to
   * @param path Document path within the collection
   * @param title Document title
   * @param hash Content hash of the document
   * @param createdAt Creation timestamp
   * @param modifiedAt Last-modified timestamp
   */
  export function insertDocument(
    db: Database,
    collectionName: string,
    path: string,
    title: string,
    hash: string,
    createdAt: string,
    modifiedAt: string
  ): void {
    const insert = db.prepare(`
      INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active)
      VALUES (?, ?, ?, ?, ?, ?, 1)
    `);
    insert.run(collectionName, path, title, hash, createdAt, modifiedAt);
  }
  /**
   * Look up the active document stored under a collection name and path.
   *
   * @param db Database to query
   * @param collectionName Collection to search in
   * @param path Document path within the collection
   * @returns The row's id, hash and title, or whatever the statement's get()
   *   yields when nothing matches (typed here as null)
   */
  export function findActiveDocument(
    db: Database,
    collectionName: string,
    path: string
  ): { id: number; hash: string; title: string } | null {
    const lookup = db.prepare(`
      SELECT id, hash, title FROM documents
      WHERE collection = ? AND path = ? AND active = 1
    `);
    const row = lookup.get(collectionName, path) as { id: number; hash: string; title: string } | null;
    return row;
  }
  /**
   * Set a new title and modified_at timestamp on the document with the given id.
   *
   * @param db Database to write to
   * @param documentId Row id of the document
   * @param title New title
   * @param modifiedAt New last-modified timestamp
   */
  export function updateDocumentTitle(
    db: Database,
    documentId: number,
    title: string,
    modifiedAt: string
  ): void {
    const update = db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`);
    update.run(title, modifiedAt, documentId);
  }
  /**
   * Overwrite the title, content hash and modified_at timestamp of an existing
   * document. Used when the content changed but the file path did not.
   *
   * @param db Database to write to
   * @param documentId Row id of the document
   * @param title New title
   * @param hash New content hash
   * @param modifiedAt New last-modified timestamp
   */
  export function updateDocument(
    db: Database,
    documentId: number,
    title: string,
    hash: string,
    modifiedAt: string
  ): void {
    const update = db.prepare(`UPDATE documents SET title = ?, hash = ?, modified_at = ? WHERE id = ?`);
    update.run(title, hash, modifiedAt, documentId);
  }
  /**
   * Mark the active document at the given collection/path as inactive
   * (active = 0). The row is kept, not deleted.
   *
   * @param db Database to write to
   * @param collectionName Collection the document belongs to
   * @param path Document path within the collection
   */
  export function deactivateDocument(db: Database, collectionName: string, path: string): void {
    const deactivate = db.prepare(`UPDATE documents SET active = 0 WHERE collection = ? AND path = ? AND active = 1`);
    deactivate.run(collectionName, path);
  }
  /**
   * List the paths of all active documents in a collection.
   *
   * @param db Database to query
   * @param collectionName Collection to list
   * @returns Paths in the order the query returns them
   */
  export function getActiveDocumentPaths(db: Database, collectionName: string): string[] {
    const query = db.prepare(`
      SELECT path FROM documents WHERE collection = ? AND active = 1
    `);
    const rows = query.all(collectionName) as { path: string }[];
    const paths: string[] = [];
    for (const row of rows) {
      paths.push(row.path);
    }
    return paths;
  }
  // Re-export the embedding formatters (from llm.ts) so code that still imports
  // them from this module keeps working.
  export { formatQueryForEmbedding, formatDocForEmbedding };
  /**
   * Locate the most natural place to end a chunk inside `text`.
   * Priority: paragraph break > sentence end > line break > space.
   *
   * @returns Offset just past the chosen break, or -1 when `text` has none
   */
  function findChunkBreakOffset(text: string): number {
    const paragraphBreak = text.lastIndexOf('\n\n');
    if (paragraphBreak >= 0) {
      return paragraphBreak + 2;
    }
    const sentenceEnd = Math.max(
      text.lastIndexOf('. '),
      text.lastIndexOf('.\n'),
      text.lastIndexOf('? '),
      text.lastIndexOf('?\n'),
      text.lastIndexOf('! '),
      text.lastIndexOf('!\n')
    );
    if (sentenceEnd >= 0) {
      return sentenceEnd + 2;
    }
    const lineBreak = text.lastIndexOf('\n');
    if (lineBreak >= 0) {
      return lineBreak + 1;
    }
    const spaceBreak = text.lastIndexOf(' ');
    if (spaceBreak >= 0) {
      return spaceBreak + 1;
    }
    return -1;
  }

  /**
   * Split a document into overlapping chunks of at most `maxChars` characters.
   *
   * Each chunk preferably ends at a natural boundary found in its last 30%
   * (paragraph, then sentence, then line, then word); otherwise it is cut at
   * `maxChars`. Every chunk after the first starts `overlapChars` before the
   * end of the previous one, unless that would not move past the previous
   * chunk's start, in which case it starts exactly where the previous ended.
   *
   * @param content Text to split
   * @param maxChars Maximum chunk length; fractional values are floored
   * @param overlapChars Characters shared between neighbouring chunks;
   *   negative or non-finite values are treated as 0
   * @returns Chunks in document order, each with its start offset `pos`.
   *   Content that already fits is returned as a single chunk.
   * @throws RangeError if the content needs splitting and `maxChars` is below 1
   *   (previously this looped forever)
   */
  export function chunkDocument(content: string, maxChars: number = CHUNK_SIZE_CHARS, overlapChars: number = CHUNK_OVERLAP_CHARS): { text: string; pos: number }[] {
    if (content.length <= maxChars) {
      return [{ text: content, pos: 0 }];
    }
    if (!(maxChars >= 1)) {
      throw new RangeError(`chunkDocument: maxChars must be at least 1 (got ${maxChars})`);
    }

    const chunkSize = Math.floor(maxChars);
    // A negative overlap would jump ahead and silently drop text; NaN would end the loop early.
    const overlap = Number.isFinite(overlapChars) ? Math.max(0, Math.floor(overlapChars)) : 0;

    const chunks: { text: string; pos: number }[] = [];
    let charPos = 0;

    while (charPos < content.length) {
      let endPos = Math.min(charPos + chunkSize, content.length);

      // Not the final chunk: try to end on a natural boundary in the last 30%.
      if (endPos < content.length) {
        const candidate = content.slice(charPos, endPos);
        const searchStart = Math.floor(candidate.length * 0.7);
        const breakOffset = findChunkBreakOffset(candidate.slice(searchStart));
        if (breakOffset >= 0) {
          endPos = charPos + searchStart + breakOffset;
        }
      }

      chunks.push({ text: content.slice(charPos, endPos), pos: charPos });

      // The last chunk runs to the end; no overlap step needed.
      if (endPos >= content.length) {
        break;
      }

      // Step back by the overlap, but always start beyond the previous chunk's start.
      charPos = endPos - overlap;
      if (charPos <= chunks[chunks.length - 1].pos) {
        charPos = endPos;
      }
    }

    return chunks;
  }
  /**
   * Chunk a document by actual token count using the LLM tokenizer.
   * More accurate than character-based chunking but requires async.
   *
   * Each chunk's end is first estimated from the document's average
   * characters-per-token, then shrunk in ~10% steps while it exceeds
   * `maxTokens` and grown in ~10% steps while it is below 90% of `maxTokens`.
   * Finally it is snapped back to a natural boundary (paragraph > sentence >
   * line > word) found in its last 30%, when one exists.
   *
   * @param content Text to split
   * @param maxTokens Target maximum tokens per chunk
   * @param overlapTokens Approximate number of tokens shared by neighbouring chunks
   * @returns Chunks in document order with start offset `pos` and measured
   *   `tokens`. Content that already fits is returned as a single chunk.
   */
  export async function chunkDocumentByTokens(
    content: string,
    maxTokens: number = CHUNK_SIZE_TOKENS,
    overlapTokens: number = CHUNK_OVERLAP_TOKENS
  ): Promise<{ text: string; pos: number; tokens: number }[]> {
    const llm = getDefaultLlamaCpp();

    // For small documents, check if we need chunking at all
    const totalTokens = await llm.countTokens(content);
    if (totalTokens <= maxTokens) {
      return [{ text: content, pos: 0, tokens: totalTokens }];
    }

    // Document-wide ratio; it does not change per chunk, so compute it once.
    const avgCharsPerToken = content.length / totalTokens;

    const chunks: { text: string; pos: number; tokens: number }[] = [];
    let charPos = 0;

    while (charPos < content.length) {
      // Initial guess (10% above the average), refined by the two loops below.
      let estimatedEnd = Math.min(charPos + Math.floor(maxTokens * avgCharsPerToken * 1.1), content.length);
      // A zero-length (or NaN) estimate would never advance; take at least one character.
      if (!(estimatedEnd > charPos)) {
        estimatedEnd = charPos + 1;
      }

      // Get token count for this slice
      let slice = content.slice(charPos, estimatedEnd);
      let sliceTokens = await llm.countTokens(slice);

      // Too many tokens: shrink by ~10% per step, but not below ~100 characters.
      while (sliceTokens > maxTokens && estimatedEnd > charPos + 100) {
        estimatedEnd = charPos + Math.floor((estimatedEnd - charPos) * 0.9);
        slice = content.slice(charPos, estimatedEnd);
        sliceTokens = await llm.countTokens(slice);
      }

      // Clearly under budget: grow by ~10% per step (but not past content end),
      // rejecting a step that would exceed maxTokens.
      while (sliceTokens < maxTokens * 0.9 && estimatedEnd < content.length) {
        const newEnd = Math.min(estimatedEnd + Math.floor((estimatedEnd - charPos) * 0.1), content.length);
        if (newEnd === estimatedEnd) break;
        const newSlice = content.slice(charPos, newEnd);
        const newTokens = await llm.countTokens(newSlice);
        if (newTokens > maxTokens) break;
        estimatedEnd = newEnd;
        slice = newSlice;
        sliceTokens = newTokens;
      }

      // Find a good break point in the last 30% of the chunk
      if (estimatedEnd < content.length) {
        const searchStart = charPos + Math.floor((estimatedEnd - charPos) * 0.7);
        const searchSlice = content.slice(searchStart, estimatedEnd);

        let breakOffset = -1;
        const paragraphBreak = searchSlice.lastIndexOf('\n\n');
        if (paragraphBreak >= 0) {
          breakOffset = paragraphBreak + 2;
        } else {
          const sentenceEnd = Math.max(
            searchSlice.lastIndexOf('. '),
            searchSlice.lastIndexOf('.\n'),
            searchSlice.lastIndexOf('? '),
            searchSlice.lastIndexOf('?\n'),
            searchSlice.lastIndexOf('! '),
            searchSlice.lastIndexOf('!\n')
          );
          if (sentenceEnd >= 0) {
            breakOffset = sentenceEnd + 2;
          } else {
            const lineBreak = searchSlice.lastIndexOf('\n');
            if (lineBreak >= 0) {
              breakOffset = lineBreak + 1;
            } else {
              const spaceBreak = searchSlice.lastIndexOf(' ');
              if (spaceBreak >= 0) {
                breakOffset = spaceBreak + 1;
              }
            }
          }
        }

        if (breakOffset >= 0) {
          // The chunk got shorter, so its token count has to be measured again.
          estimatedEnd = searchStart + breakOffset;
          slice = content.slice(charPos, estimatedEnd);
          sliceTokens = await llm.countTokens(slice);
        }
      }

      chunks.push({ text: slice, pos: charPos, tokens: sliceTokens });

      // Move forward with overlap
      if (estimatedEnd >= content.length) break;

      // Convert the token overlap to characters using this chunk's own ratio.
      // A zero-token slice would otherwise produce Infinity/NaN here and could
      // end the loop early, silently dropping the rest of the document.
      const charsPerToken = sliceTokens > 0 ? slice.length / sliceTokens : 0;
      const overlapChars = Math.floor(overlapTokens * charsPerToken);
      charPos = estimatedEnd - overlapChars;
      if (!(charPos > chunks[chunks.length - 1].pos)) {
        charPos = estimatedEnd; // Prevent infinite loop
      }
    }

    return chunks;
  }
  1046. // =============================================================================
  1047. // Fuzzy matching
  1048. // =============================================================================
  /**
   * Levenshtein edit distance between two strings: the minimum number of
   * single-unit insertions, deletions and substitutions turning `a` into `b`.
   * Strings are compared per UTF-16 code unit.
   *
   * Only the previous and the current row of the distance table are kept.
   */
  function levenshtein(a: string, b: string): number {
    const rows = a.length;
    const cols = b.length;
    if (rows === 0) return cols;
    if (cols === 0) return rows;

    // Distances from the empty prefix of `a` to every prefix of `b`.
    let previous: number[] = Array.from({ length: cols + 1 }, (_, j) => j);

    for (let i = 1; i <= rows; i++) {
      const current: number[] = [i];
      for (let j = 1; j <= cols; j++) {
        const substitution = previous[j - 1] + (a[i - 1] === b[j - 1] ? 0 : 1);
        const deletion = previous[j] + 1;
        const insertion = current[j - 1] + 1;
        current[j] = Math.min(deletion, insertion, substitution);
      }
      previous = current;
    }

    return previous[cols];
  }
  /**
   * Find an active document by its docid, i.e. a prefix of its content hash
   * (normally the first 6 characters). A leading `#` is accepted and ignored.
   *
   * The prefix is matched literally: `%`, `_` and `\` in the input are escaped
   * so they cannot act as LIKE wildcards and match unrelated documents.
   * If several documents share the prefix (collision), one of them is
   * returned (LIMIT 1, no explicit ordering).
   *
   * @param db Database to query
   * @param docid Short hash, optionally prefixed with `#`
   * @returns The document's virtual path (`qmd://collection/path`) and full
   *   hash, or null when the docid is empty or nothing matches
   */
  export function findDocumentByDocid(db: Database, docid: string): { filepath: string; hash: string } | null {
    // Normalize: remove leading # if present
    const shortHash = docid.startsWith('#') ? docid.slice(1) : docid;
    if (shortHash.length < 1) return null;

    // Neutralize LIKE metacharacters; '\' is declared as the escape character below.
    const literalPrefix = shortHash.replace(/[\\%_]/g, '\\$&');

    const doc = db.prepare(`
      SELECT 'qmd://' || d.collection || '/' || d.path as filepath, d.hash
      FROM documents d
      WHERE d.hash LIKE ? ESCAPE '\\' AND d.active = 1
      LIMIT 1
    `).get(`${literalPrefix}%`) as { filepath: string; hash: string } | null;

    return doc;
  }
  /**
   * Suggest active document paths that look like `query`.
   *
   * Every active path is compared to the query case-insensitively by
   * Levenshtein distance; paths within `maxDistance` are returned closest
   * first, capped at `limit` entries.
   *
   * @param db Database to query
   * @param query Path the caller was looking for
   * @param maxDistance Largest edit distance still considered similar
   * @param limit Maximum number of suggestions
   * @returns Matching paths ordered by ascending distance
   */
  export function findSimilarFiles(db: Database, query: string, maxDistance: number = 3, limit: number = 5): string[] {
    const rows = db.prepare(`
      SELECT d.path
      FROM documents d
      WHERE d.active = 1
    `).all() as { path: string }[];

    const needle = query.toLowerCase();
    const candidates: { path: string; dist: number }[] = [];
    for (const { path } of rows) {
      const dist = levenshtein(path.toLowerCase(), needle);
      if (dist <= maxDistance) {
        candidates.push({ path, dist });
      }
    }

    candidates.sort((left, right) => left.dist - right.dist);
    return candidates.slice(0, limit).map(candidate => candidate.path);
  }
  /**
   * List active documents whose virtual path (`qmd://collection/path`) or
   * collection-relative path matches a glob pattern.
   *
   * @param db Database to query
   * @param pattern Glob pattern, handed to `Glob`
   * @returns One entry per match: `filepath` is the virtual path,
   *   `displayPath` the relative path, `bodyLength` the SQL LENGTH() of the
   *   stored content
   */
  export function matchFilesByGlob(db: Database, pattern: string): { filepath: string; displayPath: string; bodyLength: number }[] {
    const rows = db.prepare(`
      SELECT
        'qmd://' || d.collection || '/' || d.path as virtual_path,
        LENGTH(content.doc) as body_length,
        d.path,
        d.collection
      FROM documents d
      JOIN content ON content.hash = d.hash
      WHERE d.active = 1
    `).all() as { virtual_path: string; body_length: number; path: string; collection: string }[];

    const matcher = new Glob(pattern);
    const matches: { filepath: string; displayPath: string; bodyLength: number }[] = [];

    for (const row of rows) {
      const isMatch = matcher.match(row.virtual_path) || matcher.match(row.path);
      if (!isMatch) continue;
      matches.push({
        filepath: row.virtual_path, // Virtual path for precise lookup
        displayPath: row.path, // Relative path for display
        bodyLength: row.body_length,
      });
    }

    return matches;
  }
  1115. // =============================================================================
  1116. // Context
  1117. // =============================================================================
  /**
   * Get context for a file path using hierarchical inheritance.
   * Contexts are collection-scoped and inherit from parent directories.
   * For example, context at "/talks" applies to "/talks/2024/keynote.md".
   *
   * A prefix only applies on path-segment boundaries: "/talks" matches
   * "/talks" and "/talks/...", but not "/talks-archive/...". A prefix of "/"
   * (or "") applies to every path in the collection.
   *
   * The result is the global context (if any) followed by every matching
   * prefix context, ordered from the shortest (most general) prefix to the
   * longest, joined with blank lines.
   *
   * @param db Database instance (unused - kept for compatibility)
   * @param collectionName Collection name
   * @param path Relative path within the collection
   * @returns Context string, or null if the collection is unknown or no context applies
   */
  export function getContextForPath(db: Database, collectionName: string, path: string): string | null {
    const config = collectionsLoadConfig();
    const coll = getCollection(collectionName);
    if (!coll) return null;

    // Collect ALL matching contexts (global + all path prefixes)
    const contexts: string[] = [];

    if (config.global_context) {
      contexts.push(config.global_context);
    }

    if (coll.context) {
      const normalizedPath = path.startsWith("/") ? path : `/${path}`;

      const matchingContexts: { prefix: string; context: string }[] = [];
      for (const [prefix, context] of Object.entries(coll.context)) {
        const normalizedPrefix = prefix.startsWith("/") ? prefix : `/${prefix}`;
        // Compare without trailing slashes so "/talks" and "/talks/" behave alike;
        // the root prefix becomes "" and therefore matches everything.
        const prefixDir = normalizedPrefix.replace(/\/+$/, "");
        const isSameOrInside =
          normalizedPath === prefixDir || normalizedPath.startsWith(`${prefixDir}/`);
        if (isSameOrInside) {
          matchingContexts.push({ prefix: normalizedPrefix, context });
        }
      }

      // Sort by prefix length (shortest/most general first)
      matchingContexts.sort((a, b) => a.prefix.length - b.prefix.length);

      for (const match of matchingContexts) {
        contexts.push(match.context);
      }
    }

    // Join all contexts with double newline
    return contexts.length > 0 ? contexts.join('\n\n') : null;
  }
  /**
   * Legacy function for backward compatibility: resolves a file path to a
   * collection name plus relative path, then returns the same context
   * getContextForPath() produces for that pair.
   *
   * Accepted inputs:
   * - virtual paths of the form `qmd://collection/path`
   * - filesystem paths located at or below a configured collection's path
   *
   * @param db Database used to verify that the document exists and is active
   * @param filepath Virtual or filesystem path of the document
   * @returns Context string, or null when the path is empty, belongs to no
   *   known collection, has no active document row, or no context applies
   */
  export function getContextForFile(db: Database, filepath: string): string | null {
    // Handle undefined or null filepath
    if (!filepath) return null;

    let collectionName: string | null = null;
    let relativePath = '';

    if (filepath.startsWith('qmd://')) {
      // Virtual path: qmd://collection/path
      const [name, ...rest] = filepath.slice(6).split('/');
      collectionName = name;
      relativePath = rest.join('/');
    } else {
      // Filesystem path: find which collection this absolute path belongs to
      for (const coll of collectionsListCollections()) {
        // Skip collections with missing paths
        if (!coll || !coll.path) continue;
        const root = coll.path + '/';
        const isInside = filepath.startsWith(root);
        if (isInside || filepath === coll.path) {
          collectionName = coll.name;
          relativePath = isInside ? filepath.slice(root.length) : '';
          break;
        }
      }
      if (collectionName === null) return null;
    }

    // The collection must exist in the config
    if (!getCollection(collectionName)) return null;

    // Verify this document exists in the database
    const doc = db.prepare(`
      SELECT d.path
      FROM documents d
      WHERE d.collection = ? AND d.path = ? AND d.active = 1
      LIMIT 1
    `).get(collectionName, relativePath) as { path: string } | null;
    if (!doc) return null;

    // Context assembly lives in one place
    return getContextForPath(db, collectionName, relativePath);
  }
  /**
   * Look up one collection in the YAML config (~/.config/qmd/index.yml).
   *
   * @param db   Not used by this function.
   * @param name Name of the collection to look up via `getCollection`.
   * @returns `{ name, pwd, glob_pattern }` taken from the collection's `name`,
   *          `path` and `pattern`, or `null` when `getCollection` returns nothing.
   */
  export function getCollectionByName(db: Database, name: string): { name: string; pwd: string; glob_pattern: string } | null {
    const found = getCollection(name);
    if (found) {
      const { name: collName, path: pwd, pattern: glob_pattern } = found;
      return { name: collName, pwd, glob_pattern };
    }
    return null;
  }
  /**
   * List every configured collection together with its document statistics.
   *
   * Names, paths and patterns come from `collectionsListCollections()`; the
   * counts and the latest modification time are read from the `documents` table,
   * one query per collection.
   *
   * @param db Database handle used for the statistics queries.
   * @returns One entry per configured collection. Missing or NULL statistics are
   *          reported as `0` for the counts and `null` for `last_modified`.
   */
  export function listCollections(db: Database): { name: string; pwd: string; glob_pattern: string; doc_count: number; active_count: number; last_modified: string | null }[] {
    type CollectionStats = { doc_count: number; active_count: number; last_modified: string | null };
    const summaries: { name: string; pwd: string; glob_pattern: string; doc_count: number; active_count: number; last_modified: string | null }[] = [];

    for (const entry of collectionsListCollections()) {
      const row = db.prepare(`
        SELECT
        COUNT(d.id) as doc_count,
        SUM(CASE WHEN d.active = 1 THEN 1 ELSE 0 END) as active_count,
        MAX(d.modified_at) as last_modified
        FROM documents d
        WHERE d.collection = ?
      `).get(entry.name) as CollectionStats | null;

      summaries.push({
        name: entry.name,
        pwd: entry.path,
        glob_pattern: entry.pattern,
        // SUM() yields NULL when no rows match, hence the falsy fallbacks.
        doc_count: row?.doc_count || 0,
        active_count: row?.active_count || 0,
        last_modified: row?.last_modified || null,
      });
    }

    return summaries;
  }
  /**
   * Remove a collection: delete its document rows, purge content that is no
   * longer referenced, then drop the collection from the YAML config.
   *
   * @param db             Database handle.
   * @param collectionName Name of the collection to remove.
   * @returns `deletedDocs` – number of `documents` rows deleted;
   *          `cleanedHashes` – number of `content` rows deleted by the cleanup.
   */
  export function removeCollection(db: Database, collectionName: string): { deletedDocs: number; cleanedHashes: number } {
    // Step 1: drop every document row that belongs to the collection.
    const dropDocuments = db.prepare(`DELETE FROM documents WHERE collection = ?`);
    const { changes: deletedDocs } = dropDocuments.run(collectionName);

    // Step 2: delete content whose hash is not referenced by any ACTIVE document
    // (content referenced only by inactive documents is removed as well).
    const purgeContent = db.prepare(`
      DELETE FROM content
      WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
    `);
    const { changes: cleanedHashes } = purgeContent.run();

    // Step 3: remove the entry from the YAML config; its return value is ignored.
    collectionsRemoveCollection(collectionName);

    return { deletedDocs, cleanedHashes };
  }
  /**
   * Rename a collection in both stores.
   *
   * The `documents` table is updated first, then the YAML config is renamed via
   * `collectionsRenameCollection`. The two steps are not wrapped in a
   * transaction here.
   *
   * @param db      Database handle.
   * @param oldName Current collection name.
   * @param newName Name the collection should have afterwards.
   */
  export function renameCollection(db: Database, oldName: string, newName: string): void {
    const relabelDocuments = db.prepare(`UPDATE documents SET collection = ? WHERE collection = ?`);
    relabelDocuments.run(newName, oldName);

    collectionsRenameCollection(oldName, newName);
  }
  1302. // =============================================================================
  1303. // Context Management Operations
  1304. // =============================================================================
  /**
   * Insert or update the context text for a path prefix of a collection.
   *
   * The numeric id is resolved to a collection name through the `collections`
   * table; the context itself is stored via `collectionsAddContext`.
   *
   * @param db           Database handle used for the id → name lookup.
   * @param collectionId Row id in the `collections` table.
   * @param pathPrefix   Path prefix the context applies to.
   * @param context      Context text to store.
   * @throws Error when no collection row has the given id.
   */
  export function insertContext(db: Database, collectionId: number, pathPrefix: string, context: string): void {
    const nameLookup = db.prepare(`SELECT name FROM collections WHERE id = ?`);
    const row = nameLookup.get(collectionId) as { name: string } | null;

    if (row) {
      collectionsAddContext(row.name, pathPrefix, context);
      return;
    }

    throw new Error(`Collection with id ${collectionId} not found`);
  }
  /**
   * Delete the context stored for one path prefix of a collection.
   *
   * @param db             Not used by this function.
   * @param collectionName Collection that owns the context.
   * @param pathPrefix     Path prefix whose context should be removed.
   * @returns `1` when `collectionsRemoveContext` reports success, otherwise `0`.
   */
  export function deleteContext(db: Database, collectionName: string, pathPrefix: string): number {
    if (collectionsRemoveContext(collectionName, pathPrefix)) {
      return 1;
    }
    return 0;
  }
  /**
   * Delete the global context and every collection's root context (the one
   * stored under the empty path prefix).
   *
   * @param db Not used by this function.
   * @returns Number of deletions counted. Clearing the global context always
   *          counts as one, whether or not a global context was set; each
   *          collection whose root context was removed adds one more.
   */
  export function deleteGlobalContexts(db: Database): number {
    setGlobalContext(undefined);
    let removed = 1; // the global-context reset above is always counted

    for (const { name } of collectionsListCollections()) {
      if (collectionsRemoveContext(name, '')) {
        removed += 1;
      }
    }

    return removed;
  }
  /**
   * List every stored context as a flat, sorted array.
   *
   * Sort order: collection name (locale compare), then path prefix length with
   * the longest prefix first, then path prefix (locale compare).
   *
   * @param db Not used by this function.
   * @returns Entries of the form `{ collection_name, path_prefix, context }`.
   */
  export function listPathContexts(db: Database): { collection_name: string; path_prefix: string; context: string }[] {
    type ContextRow = { collection_name: string; path_prefix: string; context: string };

    const byCollectionThenPrefix = (a: ContextRow, b: ContextRow): number =>
      a.collection_name !== b.collection_name
        ? a.collection_name.localeCompare(b.collection_name)
        : a.path_prefix.length !== b.path_prefix.length
          ? b.path_prefix.length - a.path_prefix.length
          : a.path_prefix.localeCompare(b.path_prefix);

    const rows: ContextRow[] = collectionsListAllContexts().map(({ collection, path, context }) => ({
      collection_name: collection,
      path_prefix: path,
      context,
    }));

    return rows.sort(byCollectionThenPrefix);
  }
  /**
   * Return the names of all collections known to the YAML config.
   *
   * @param db Not used by this function.
   * @returns One `{ name }` object per collection, in the order
   *          `collectionsListCollections()` returns them.
   */
  export function getAllCollections(db: Database): { name: string }[] {
    const names: { name: string }[] = [];
    for (const { name } of collectionsListCollections()) {
      names.push({ name });
    }
    return names;
  }
  /**
   * Find collections that have no context entries at all (not even a root one).
   *
   * @param db Database handle used to count each collection's active documents.
   * @returns `{ name, pwd, doc_count }` for every collection whose `context` is
   *          missing or has no keys, sorted by name (locale compare).
   */
  export function getCollectionsWithoutContext(db: Database): { name: string; pwd: string; doc_count: number }[] {
    const uncovered = collectionsListCollections()
      .filter(entry => !(entry.context && Object.keys(entry.context).length > 0))
      .map(entry => {
        // Only active documents are counted.
        const row = db.prepare(`
          SELECT COUNT(d.id) as doc_count
          FROM documents d
          WHERE d.collection = ? AND d.active = 1
        `).get(entry.name) as { doc_count: number } | null;

        return {
          name: entry.name,
          pwd: entry.path,
          doc_count: row?.doc_count || 0,
        };
      });

    return uncovered.sort((left, right) => left.name.localeCompare(right.name));
  }
  /**
   * Get the top-level directories of a collection that are not covered by any
   * context. Useful for suggesting where context might be needed.
   *
   * A directory counts as covered when a context exists for exactly that
   * directory, or when the collection has a root context. Context keys are
   * compared after stripping leading and trailing slashes, so "/docs", "docs"
   * and "docs/" all cover the directory "docs", and "/" or "" is the root.
   *
   * @param db             Database handle; active document paths are read from it.
   * @param collectionName Collection to inspect.
   * @returns Sorted first path components of documents that live inside a
   *          directory (files directly at the collection root are ignored) and
   *          have no covering context. Empty when the collection is not in the
   *          YAML config.
   */
  export function getTopLevelPathsWithoutContext(db: Database, collectionName: string): string[] {
    // Get all paths in the collection from database
    const paths = db.prepare(`
      SELECT DISTINCT path FROM documents
      WHERE collection = ? AND active = 1
    `).all(collectionName) as { path: string }[];

    // Get existing contexts for this collection from YAML
    const yamlColl = getCollection(collectionName);
    if (!yamlColl) return [];

    // Normalize keys: context prefixes may be written with or without
    // surrounding slashes, while the directory names below never contain one.
    const contextPrefixes = new Set<string>();
    for (const prefix of Object.keys(yamlColl.context ?? {})) {
      contextPrefixes.add(prefix.replace(/^\/+|\/+$/g, ''));
    }

    // A root context covers every directory.
    if (contextPrefixes.has('')) return [];

    // Extract top-level directories (first path component of nested paths)
    const topLevelDirs = new Set<string>();
    for (const { path } of paths) {
      const parts = path.split('/').filter(Boolean);
      if (parts.length > 1) {
        topLevelDirs.add(parts[0]);
      }
    }

    return [...topLevelDirs].filter(dir => !contextPrefixes.has(dir)).sort();
  }
  1447. // =============================================================================
  1448. // FTS Search
  1449. // =============================================================================
  /**
   * Reduce a raw search term to characters that are safe inside a quoted FTS5
   * string: Unicode letters, Unicode digits and apostrophes. Everything else is
   * dropped and the result is lowercased.
   */
  function sanitizeFTS5Term(term: string): string {
    const disallowed = /[^\p{L}\p{N}']/gu;
    const kept = term.replace(disallowed, '');
    return kept.toLowerCase();
  }
  /**
   * Turn free-text input into an FTS5 MATCH expression.
   *
   * The query is split on whitespace, each piece is passed through
   * `sanitizeFTS5Term`, and every non-empty term becomes a quoted prefix match
   * (`"term"*`). Multiple terms are combined with `AND`.
   *
   * @returns The MATCH expression, or `null` when no usable term remains.
   */
  function buildFTS5Query(query: string): string | null {
    const prefixClauses: string[] = [];
    for (const rawTerm of query.split(/\s+/)) {
      const term = sanitizeFTS5Term(rawTerm);
      if (term.length > 0) {
        prefixClauses.push(`"${term}"*`);
      }
    }
    // Joining a single clause yields just that clause, so no special case is needed.
    return prefixClauses.length === 0 ? null : prefixClauses.join(' AND ');
  }
  /**
   * Full-text search over active documents using the `documents_fts` index.
   *
   * @param db           Database handle.
   * @param query        Free-text query; converted with `buildFTS5Query`.
   * @param limit        Maximum number of rows to return (default 20).
   * @param collectionId Legacy filter. When given it is stringified and compared
   *                     against `documents.collection`, i.e. it is treated as a
   *                     collection name.
   * @returns Results ordered by ascending bm25 value, with `score` set to
   *          `|bm25| / max(|bm25|)` over the returned rows. Empty when the query
   *          contains no usable term.
   */
  export function searchFTS(db: Database, query: string, limit: number = 20, collectionId?: number): SearchResult[] {
    const matchExpr = buildFTS5Query(query);
    if (!matchExpr) return [];

    type FtsRow = { filepath: string; display_path: string; title: string; body: string; hash: string; score: number };

    const bindings: (string | number)[] = [matchExpr];
    const sqlParts: string[] = [`
      SELECT
      'qmd://' || d.collection || '/' || d.path as filepath,
      d.collection || '/' || d.path as display_path,
      d.title,
      content.doc as body,
      d.hash,
      bm25(documents_fts, 10.0, 1.0) as score
      FROM documents_fts f
      JOIN documents d ON d.id = f.rowid
      JOIN content ON content.hash = d.hash
      WHERE documents_fts MATCH ? AND d.active = 1
    `];

    if (collectionId !== undefined) {
      // Legacy parameter: collections now live in YAML, so the value is
      // interpreted as a collection-name filter.
      sqlParts.push(` AND d.collection = ?`);
      bindings.push(String(collectionId));
    }
    sqlParts.push(` ORDER BY score LIMIT ?`);
    bindings.push(limit);

    const hits = db.prepare(sqlParts.join('')).all(...bindings) as FtsRow[];

    // Normalize against the largest magnitude among the returned rows.
    let maxScore = 1;
    if (hits.length > 0) {
      maxScore = Math.max(...hits.map(hit => Math.abs(hit.score)));
    }

    return hits.map(hit => ({
      filepath: hit.filepath,
      displayPath: hit.display_path,
      title: hit.title,
      hash: hit.hash,
      docid: getDocid(hit.hash),
      // Collection name is the first segment after "qmd://".
      collectionName: hit.filepath.split('//')[1]?.split('/')[0] || "",
      modifiedAt: "", // Not available in FTS query
      bodyLength: hit.body.length,
      body: hit.body,
      context: getContextForFile(db, hit.filepath),
      score: Math.abs(hit.score) / maxScore,
      source: "fts" as const,
    }));
  }
  1507. // =============================================================================
  1508. // Vector Search
  1509. // =============================================================================
  /**
   * Vector (KNN) search over embedded document chunks.
   *
   * @param db           Database handle.
   * @param query        Query text; embedded with `getEmbedding(query, model, true)`.
   * @param model        Embedding model name passed to `getEmbedding`.
   * @param limit        Maximum number of documents to return (default 20).
   * @param collectionId Legacy filter. When given it is stringified and compared
   *                     against `documents.collection`, i.e. it is treated as a
   *                     collection name. It is bound as a query parameter.
   * @returns At most `limit` results, one per document (best-matching chunk),
   *          ordered by ascending distance. Empty when the `vectors_vec` table
   *          does not exist or no embedding could be produced.
   */
  export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionId?: number): Promise<SearchResult[]> {
    const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (!tableExists) return [];

    const embedding = await getEmbedding(query, model, true);
    if (!embedding) return [];

    // sqlite-vec requires "k = ?" for KNN queries
    let sql = `
      SELECT
      v.hash_seq,
      v.distance,
      'qmd://' || d.collection || '/' || d.path as filepath,
      d.collection || '/' || d.path as display_path,
      d.title,
      content.doc as body,
      cv.hash,
      cv.pos
      FROM vectors_vec v
      JOIN content_vectors cv ON cv.hash || '_' || cv.seq = v.hash_seq
      JOIN documents d ON d.hash = cv.hash AND d.active = 1
      JOIN content ON content.hash = d.hash
      WHERE v.embedding MATCH ? AND k = ?
    `;

    // Ask for 3x the limit: several chunks can belong to the same document and
    // are collapsed to one result per filepath below.
    const params: (Float32Array | string | number)[] = [new Float32Array(embedding), limit * 3];

    if (collectionId !== undefined) {
      // Legacy parameter: collections are managed in YAML, so the value is
      // interpreted as a collection-name filter. It must be appended as a bound
      // parameter; substituting it into the SQL text would overwrite the first
      // placeholder (the embedding) instead of the one added here.
      sql += ` AND d.collection = ?`;
      params.push(String(collectionId));
    }
    sql += ` ORDER BY v.distance`;

    const rows = db.prepare(sql).all(...params) as { hash_seq: string; distance: number; filepath: string; display_path: string; title: string; body: string; hash: string; pos: number }[];

    // Keep only the closest chunk per document.
    const seen = new Map<string, { row: typeof rows[0]; bestDist: number }>();
    for (const row of rows) {
      const existing = seen.get(row.filepath);
      if (!existing || row.distance < existing.bestDist) {
        seen.set(row.filepath, { row, bestDist: row.distance });
      }
    }

    return Array.from(seen.values())
      .sort((a, b) => a.bestDist - b.bestDist)
      .slice(0, limit)
      .map(({ row }) => {
        // Collection name is the first segment after "qmd://".
        const collectionName = row.filepath.split('//')[1]?.split('/')[0] || "";
        return {
          filepath: row.filepath,
          displayPath: row.display_path,
          title: row.title,
          hash: row.hash,
          docid: getDocid(row.hash),
          collectionName,
          modifiedAt: "", // Not available in vec query
          bodyLength: row.body.length,
          body: row.body,
          context: getContextForFile(db, row.filepath),
          // NOTE(review): treats distance as cosine distance (similarity = 1 - distance);
          // confirm against the vectors_vec table definition.
          score: 1 - row.distance,
          source: "vec" as const,
          chunkPos: row.pos,
        };
      });
  }
  1569. // =============================================================================
  1570. // Embeddings
  1571. // =============================================================================
  /**
   * Embed a piece of text with the default LlamaCpp instance.
   *
   * @param text    Raw text to embed.
   * @param model   Model name forwarded to `llm.embed`.
   * @param isQuery `true` formats the text with `formatQueryForEmbedding`,
   *                `false` with `formatDocForEmbedding`; also forwarded to `llm.embed`.
   * @returns The embedding from the result, or `null` when the result or its
   *          `embedding` is missing/falsy.
   */
  async function getEmbedding(text: string, model: string, isQuery: boolean): Promise<number[] | null> {
    const llm = getDefaultLlamaCpp();

    // Queries and documents use different prompt templates.
    const applyTemplate = isQuery ? formatQueryForEmbedding : formatDocForEmbedding;
    const prompt = applyTemplate(text);

    const response = await llm.embed(prompt, { model, isQuery });
    return response?.embedding || null;
  }
  /**
   * List content hashes that still need embeddings.
   *
   * A hash qualifies when at least one active document references it and
   * `content_vectors` has no row for it with `seq = 0`.
   *
   * @param db Database handle.
   * @returns One row per hash: the hash, the document body, and the smallest
   *          document path sharing that hash (a sample path for display).
   */
  export function getHashesForEmbedding(db: Database): { hash: string; body: string; path: string }[] {
    type PendingHash = { hash: string; body: string; path: string };

    const pendingQuery = db.prepare(`
      SELECT d.hash, c.doc as body, MIN(d.path) as path
      FROM documents d
      JOIN content c ON d.hash = c.hash
      LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
      WHERE d.active = 1 AND v.hash IS NULL
      GROUP BY d.hash
    `);

    const pending = pendingQuery.all() as PendingHash[];
    return pending;
  }
  /**
   * Remove every stored embedding (forces a full re-index).
   *
   * Deletes all rows from `content_vectors`, then drops the `vectors_vec` table
   * if it exists.
   *
   * @param db Database handle.
   */
  export function clearAllEmbeddings(db: Database): void {
    const resetStatements = [
      `DELETE FROM content_vectors`,
      `DROP TABLE IF EXISTS vectors_vec`,
    ];
    for (const statement of resetStatements) {
      db.exec(statement);
    }
  }
  /**
   * Store one chunk embedding in both `vectors_vec` and `content_vectors`.
   *
   * @param db         Database handle.
   * @param hash       Content hash the chunk belongs to.
   * @param seq        Chunk sequence number within that content.
   * @param pos        Chunk position, stored in `content_vectors.pos`.
   * @param embedding  Vector to store in `vectors_vec.embedding`.
   * @param model      Model name recorded in `content_vectors.model`.
   * @param embeddedAt Timestamp string recorded in `content_vectors.embedded_at`.
   */
  export function insertEmbedding(
    db: Database,
    hash: string,
    seq: number,
    pos: number,
    embedding: Float32Array,
    model: string,
    embeddedAt: string
  ): void {
    // Key used by vectors_vec: "<hash>_<seq>".
    const vectorKey = `${hash}_${seq}`;

    // Both statements are prepared before either runs, so a failing prepare
    // leaves both tables untouched.
    const upsertVector = db.prepare(`INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
    const upsertChunkMeta = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`);

    upsertVector.run(vectorKey, embedding);
    upsertChunkMeta.run(hash, seq, pos, model, embeddedAt);
  }
  1620. // =============================================================================
  1621. // Query expansion
  1622. // =============================================================================
  /**
   * Expand a query into alternative phrasings, with caching.
   *
   * @param query Original query text.
   * @param model Model name. It is part of the cache key only; the call to
   *              `llm.expandQuery` below does not receive it.
   * @param db    Database handle used by the result cache.
   * @returns On a cache hit: the original query followed by up to two cached
   *          lines. Otherwise whatever `llm.expandQuery(query, 2)` returns; its
   *          entries after the first are cached when there is more than one.
   */
  export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database): Promise<string[]> {
    const cacheKey = getCacheKey("expandQuery", { query, model });
    const cachedText = getCachedResult(db, cacheKey);

    if (!cachedText) {
      const expansions = await getDefaultLlamaCpp().expandQuery(query, 2);
      // Only the expansions are cached; the first entry is skipped as the original query.
      if (expansions.length > 1) {
        setCachedResult(db, cacheKey, expansions.slice(1).join('\n'));
      }
      return expansions;
    }

    // Cache hit: one expansion per non-blank line.
    const variants: string[] = [];
    for (const line of cachedText.split('\n')) {
      const trimmed = line.trim();
      if (trimmed.length > 0) {
        variants.push(trimmed);
      }
    }
    return [query, ...variants.slice(0, 2)];
  }
  1640. // =============================================================================
  1641. // Reranking
  1642. // =============================================================================
  /**
   * Score documents against a query with the LlamaCpp reranker, using the
   * result cache to avoid re-scoring.
   *
   * @param query     Query text.
   * @param documents Candidates as `{ file, text }`.
   * @param model     Reranker model name; part of the cache key and passed to `llm.rerank`.
   * @param db        Database handle used by the result cache.
   * @returns One `{ file, score }` per input document, sorted by descending
   *          score. Documents with no score (or a falsy one) get `0`.
   */
  export async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db: Database): Promise<{ file: string; score: number }[]> {
    // NOTE(review): the cache key covers query, file and model only. The
    // document text is not part of it, so a changed text reuses the old score.
    const cacheKeyFor = (file: string) => getCacheKey("rerank", { query, file, model });

    const scoreByFile: Map<string, number> = new Map();
    const pending: RerankDocument[] = [];

    // Split the input into cache hits and documents that still need scoring.
    for (const { file, text } of documents) {
      const hit = getCachedResult(db, cacheKeyFor(file));
      if (hit === null) {
        pending.push({ file, text });
      } else {
        scoreByFile.set(file, parseFloat(hit));
      }
    }

    if (pending.length > 0) {
      const { results } = await getDefaultLlamaCpp().rerank(query, pending, { model });
      for (const { file, score } of results) {
        setCachedResult(db, cacheKeyFor(file), score.toString());
        scoreByFile.set(file, score);
      }
    }

    const ranked = documents.map(({ file }) => ({ file, score: scoreByFile.get(file) || 0 }));
    ranked.sort((a, b) => b.score - a.score);
    return ranked;
  }
  1672. // =============================================================================
  1673. // Reciprocal Rank Fusion
  1674. // =============================================================================
  /**
   * Merge several ranked result lists with weighted Reciprocal Rank Fusion.
   *
   * Each occurrence of a file at zero-based `rank` in list `i` contributes
   * `weights[i] / (k + rank + 1)` to that file's fused score (a missing weight
   * defaults to 1.0). After fusion a bonus is added based on the best rank the
   * file reached in any list: +0.05 for rank 0, +0.02 for rank 1 or 2.
   *
   * @param resultLists Ranked lists, best result first.
   * @param weights     Per-list weights, indexed like `resultLists`.
   * @param k           RRF smoothing constant (default 60).
   * @returns One entry per distinct `file`, sorted by descending fused score.
   *          Each entry is a copy of the first occurrence of that file with
   *          `score` replaced by the fused score.
   */
  export function reciprocalRankFusion(
    resultLists: RankedResult[][],
    weights: number[] = [],
    k: number = 60
  ): RankedResult[] {
    type Fused = { result: RankedResult; rrfScore: number; topRank: number };
    const fusedByFile = new Map<string, Fused>();

    for (const [listIdx, list] of resultLists.entries()) {
      const weight = weights[listIdx] ?? 1.0;
      for (const [rank, result] of list.entries()) {
        const contribution = weight / (k + rank + 1);
        const seen = fusedByFile.get(result.file);
        if (!seen) {
          fusedByFile.set(result.file, { result, rrfScore: contribution, topRank: rank });
          continue;
        }
        seen.rrfScore += contribution;
        seen.topRank = Math.min(seen.topRank, rank);
      }
    }

    // Top-rank bonus: only files that reached one of the first three positions.
    for (const entry of fusedByFile.values()) {
      if (entry.topRank > 2) continue;
      entry.rrfScore += entry.topRank === 0 ? 0.05 : 0.02;
    }

    const ordered = Array.from(fusedByFile.values());
    ordered.sort((a, b) => b.rrfScore - a.rrfScore);
    return ordered.map(({ result, rrfScore }) => ({ ...result, score: rrfScore }));
  }
  1712. // =============================================================================
  1713. // Document retrieval
  1714. // =============================================================================
  /**
   * Row shape returned by the document lookup queries in this section.
   *
   * - `virtual_path`  `'qmd://' || collection || '/' || path`; selected by the
   *                   lookup queries and read by their callers. Optional so
   *                   existing code that builds rows without it keeps compiling.
   * - `display_path`  `collection || '/' || path`.
   * - `body_length`   `LENGTH(content.doc)`.
   * - `body`          Present only when the query was built with `includeBody`.
   * - `path`          NOTE(review): declared here, but the SELECT lists visible
   *                   in this part of the file do not return a bare `path`
   *                   column; verify before relying on it.
   */
  type DbDocRow = {
    virtual_path?: string;
    display_path: string;
    title: string;
    hash: string;
    collection: string;
    path: string;
    modified_at: string;
    body_length: number;
    body?: string;
  };
  /**
   * Find a document by filename/path, docid (#hash), or with suffix matching.
   * Returns document metadata; the body is included only when requested.
   *
   * Supports:
   * - Virtual paths: qmd://collection/path/to/file.md
   * - Absolute paths: /path/to/file.md (resolved against collection roots)
   * - Relative paths: path/to/file.md
   * - Short docid: #abc123, or a bare 6-character hex string
   * - A trailing ":<line>" suffix, which is ignored for the lookup
   *
   * Lookup order: exact virtual path, then virtual path ending with the given
   * text (matched literally; `%` and `_` are not treated as wildcards), then
   * per-collection lookup by relative path.
   *
   * @param db       Database handle.
   * @param filename Path, virtual path or docid to resolve.
   * @param options  `includeBody: true` adds the document body to the result.
   * @returns The document, or `{ error: "not_found", query, similarFiles }`.
   *          For a failed docid lookup `similarFiles` is empty.
   */
  export function findDocument(db: Database, filename: string, options: { includeBody?: boolean } = {}): DocumentResult | DocumentNotFound {
    // A trailing ":<digits>" is a line number, not part of the path.
    let filepath = filename;
    const colonMatch = filepath.match(/:(\d+)$/);
    if (colonMatch) {
      filepath = filepath.slice(0, -colonMatch[0].length);
    }

    // Check if this is a docid lookup (#hash or just 6-char hex)
    if (filepath.startsWith('#') || /^[a-f0-9]{6}$/i.test(filepath)) {
      const docidMatch = findDocumentByDocid(db, filepath);
      if (!docidMatch) {
        return { error: "not_found", query: filename, similarFiles: [] };
      }
      filepath = docidMatch.filepath;
    }

    if (filepath.startsWith('~/')) {
      filepath = homedir() + filepath.slice(1);
    }

    const bodyCol = options.includeBody ? `, content.doc as body` : ``;

    // Computed columns shared by all lookups below.
    const selectCols = `
      'qmd://' || d.collection || '/' || d.path as virtual_path,
      d.collection || '/' || d.path as display_path,
      d.title,
      d.hash,
      d.collection,
      d.modified_at,
      LENGTH(content.doc) as body_length
      ${bodyCol}
    `;

    // 1. Exact match on the virtual path
    let doc = db.prepare(`
      SELECT ${selectCols}
      FROM documents d
      JOIN content ON content.hash = d.hash
      WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
    `).get(filepath) as DbDocRow | null;

    // 2. Suffix match on the virtual path. Skipped for an empty path (which
    //    would match every document); LIKE wildcards in the input are escaped
    //    so that e.g. "my_file.md" only matches a literal underscore.
    if (!doc && filepath.length > 0) {
      const likeSuffix = `%${filepath.replace(/[\\%_]/g, '\\$&')}`;
      doc = db.prepare(`
        SELECT ${selectCols}
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? ESCAPE '\\' AND d.active = 1
        LIMIT 1
      `).get(likeSuffix) as DbDocRow | null;
    }

    // 3. Resolve against collection roots from YAML (absolute or relative path)
    if (!doc && !filepath.startsWith('qmd://')) {
      const collections = collectionsListCollections();
      for (const coll of collections) {
        let relativePath: string | null = null;
        if (filepath.startsWith(coll.path + '/')) {
          // Absolute path inside this collection: keep the part below the root.
          relativePath = filepath.slice(coll.path.length + 1);
        } else if (!filepath.startsWith('/')) {
          // Otherwise treat a non-absolute path as relative to the collection.
          relativePath = filepath;
        }
        if (relativePath) {
          doc = db.prepare(`
            SELECT ${selectCols}
            FROM documents d
            JOIN content ON content.hash = d.hash
            WHERE d.collection = ? AND d.path = ? AND d.active = 1
          `).get(coll.name, relativePath) as DbDocRow | null;
          if (doc) break;
        }
      }
    }

    if (!doc) {
      const similar = findSimilarFiles(db, filepath, 5, 5);
      return { error: "not_found", query: filename, similarFiles: similar };
    }

    // display_path already starts with the collection name, so the fallback
    // must not prepend the collection a second time.
    const virtualPath = doc.virtual_path || `qmd://${doc.display_path}`;
    const context = getContextForFile(db, virtualPath);

    return {
      filepath: virtualPath,
      displayPath: doc.display_path,
      title: doc.title,
      context,
      hash: doc.hash,
      docid: getDocid(doc.hash),
      collectionName: doc.collection,
      modifiedAt: doc.modified_at,
      bodyLength: doc.body_length,
      ...(options.includeBody && doc.body !== undefined && { body: doc.body }),
    };
  }
  /**
   * Load the body text of a document, optionally restricted to a line range.
   *
   * The document is resolved by its `filepath`: first as a virtual path
   * (`qmd://collection/path`), then as an absolute path below one of the
   * collection roots from the YAML config.
   *
   * @param db       Database handle.
   * @param doc      Document result, or any object carrying a `filepath`.
   * @param fromLine 1-based first line to return (a missing or falsy value means line 1).
   * @param maxLines Maximum number of lines to return (defaults to the rest of the body).
   * @returns The (possibly sliced) body, or `null` when no active document matches.
   */
  export function getDocumentBody(db: Database, doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number): string | null {
    type BodyRow = { body: string };
    const { filepath } = doc;
    let match: BodyRow | null = null;

    // Virtual path lookup
    if (filepath.startsWith('qmd://')) {
      match = db.prepare(`
        SELECT content.doc as body
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
      `).get(filepath) as BodyRow | null;
    }

    // Absolute path lookup: try every collection whose root contains the path
    if (!match) {
      for (const { name, path: root } of collectionsListCollections()) {
        if (!filepath.startsWith(root + '/')) continue;
        const pathBelowRoot = filepath.slice(root.length + 1);
        match = db.prepare(`
          SELECT content.doc as body
          FROM documents d
          JOIN content ON content.hash = d.hash
          WHERE d.collection = ? AND d.path = ? AND d.active = 1
        `).get(name, pathBelowRoot) as BodyRow | null;
        if (match) break;
      }
    }

    if (!match) return null;

    const wantsSlice = fromLine !== undefined || maxLines !== undefined;
    if (!wantsSlice) return match.body;

    const lines = match.body.split('\n');
    const first = (fromLine || 1) - 1;
    const last = maxLines === undefined ? lines.length : first + maxLines;
    return lines.slice(first, last).join('\n');
  }
  /**
   * Legacy helper kept for backwards compatibility: resolves a document with
   * `findDocument` (body included) and applies line slicing to the body.
   *
   * @param db       Database handle.
   * @param filename Path, virtual path or docid; may end in ":<line>".
   * @param fromLine 1-based first line to return. When it is missing or falsy,
   *                 a ":<line>" suffix on `filename` supplies the value.
   * @param maxLines Maximum number of lines to return.
   * @returns The document with its (possibly sliced) body, or the not-found
   *          result from `findDocument`.
   */
  export function getDocument(db: Database, filename: string, fromLine?: number, maxLines?: number): (DocumentResult & { body: string }) | DocumentNotFound {
    let startLine = fromLine;
    let lookupPath = filename;

    // Parse the ":<line>" suffix, unless the caller already gave a start line.
    const lineSuffix = /:(\d+)$/.exec(lookupPath);
    if (lineSuffix && !startLine) {
      startLine = parseInt(lineSuffix[1], 10);
      lookupPath = lookupPath.slice(0, -lineSuffix[0].length);
    }

    const found = findDocument(db, lookupPath, { includeBody: true });
    if ("error" in found) return found;

    const fullBody = found.body || "";
    if (startLine === undefined && maxLines === undefined) {
      return { ...found, body: fullBody };
    }

    const lines = fullBody.split('\n');
    const first = (startLine || 1) - 1;
    const last = maxLines === undefined ? lines.length : first + maxLines;
    return { ...found, body: lines.slice(first, last).join('\n') };
  }
  /**
   * Find multiple documents by glob pattern or comma-separated list.
   * Returns documents without body by default (use getDocumentBody to load).
   *
   * A pattern is treated as a comma-separated list when it contains a comma and
   * neither `*` nor `?`; otherwise it is passed to `matchFilesByGlob`. List
   * entries are resolved by exact virtual path first, then by virtual path
   * ending with the entry (matched literally; `%` and `_` are not wildcards).
   *
   * @param db      Database handle.
   * @param pattern Glob pattern or comma-separated names.
   * @param options `includeBody` adds bodies to the results; `maxBytes` is the
   *                size limit (default `DEFAULT_MULTI_GET_MAX_BYTES`), compared
   *                against `LENGTH(content.doc)`.
   * @returns `docs` – one entry per matched document; documents over the size
   *          limit are returned as `skipped: true` with a `skipReason`.
   *          `errors` – one message per list entry that matched nothing, or a
   *          single message when a glob matched no files.
   */
  export function findDocuments(
    db: Database,
    pattern: string,
    options: { includeBody?: boolean; maxBytes?: number } = {}
  ): { docs: MultiGetResult[]; errors: string[] } {
    const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?');
    const errors: string[] = [];
    const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES;

    const bodyCol = options.includeBody ? `, content.doc as body` : ``;
    const selectCols = `
      'qmd://' || d.collection || '/' || d.path as virtual_path,
      d.collection || '/' || d.path as display_path,
      d.title,
      d.hash,
      d.collection,
      d.modified_at,
      LENGTH(content.doc) as body_length
      ${bodyCol}
    `;

    let fileRows: DbDocRow[];

    if (isCommaSeparated) {
      const names = pattern.split(',').map(s => s.trim()).filter(Boolean);
      fileRows = [];
      for (const name of names) {
        // Exact virtual path first
        let doc = db.prepare(`
          SELECT ${selectCols}
          FROM documents d
          JOIN content ON content.hash = d.hash
          WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
        `).get(name) as DbDocRow | null;

        // Then suffix match; LIKE wildcards in the name are escaped so that
        // e.g. "my_file.md" only matches a literal underscore.
        if (!doc) {
          const likeSuffix = `%${name.replace(/[\\%_]/g, '\\$&')}`;
          doc = db.prepare(`
            SELECT ${selectCols}
            FROM documents d
            JOIN content ON content.hash = d.hash
            WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? ESCAPE '\\' AND d.active = 1
            LIMIT 1
          `).get(likeSuffix) as DbDocRow | null;
        }

        if (doc) {
          fileRows.push(doc);
        } else {
          const similar = findSimilarFiles(db, name, 5, 3);
          let msg = `File not found: ${name}`;
          if (similar.length > 0) {
            msg += ` (did you mean: ${similar.join(', ')}?)`;
          }
          errors.push(msg);
        }
      }
    } else {
      // Glob pattern match
      const matched = matchFilesByGlob(db, pattern);
      if (matched.length === 0) {
        errors.push(`No files matched pattern: ${pattern}`);
        return { docs: [], errors };
      }
      const virtualPaths = matched.map(m => m.filepath);
      const placeholders = virtualPaths.map(() => '?').join(',');
      fileRows = db.prepare(`
        SELECT ${selectCols}
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path IN (${placeholders}) AND d.active = 1
      `).all(...virtualPaths) as DbDocRow[];
    }

    const results: MultiGetResult[] = [];
    for (const row of fileRows) {
      // display_path already starts with the collection name, so the fallback
      // must not prepend the collection a second time.
      const virtualPath = row.virtual_path || `qmd://${row.display_path}`;

      if (row.body_length > maxBytes) {
        results.push({
          doc: { filepath: virtualPath, displayPath: row.display_path },
          skipped: true,
          skipReason: `File too large (${Math.round(row.body_length / 1024)}KB > ${Math.round(maxBytes / 1024)}KB)`,
        });
        continue;
      }

      // Context is only needed for documents that are actually returned.
      const context = getContextForFile(db, virtualPath);

      results.push({
        doc: {
          filepath: virtualPath,
          displayPath: row.display_path,
          // Fall back to the file name, then the display path, when no title is stored.
          title: row.title || row.display_path.split('/').pop() || row.display_path,
          context,
          hash: row.hash,
          docid: getDocid(row.hash),
          collectionName: row.collection,
          modifiedAt: row.modified_at,
          bodyLength: row.body_length,
          ...(options.includeBody && row.body !== undefined && { body: row.body }),
        },
        skipped: false,
      });
    }

    return { docs: results, errors };
  }
  /**
   * Legacy entry point kept for backwards compatibility.
   *
   * Delegates to `findDocuments` (always requesting bodies) and reshapes each
   * result into the older `MultiGetFile` form.
   *
   * @param db       Database handle, forwarded to `findDocuments`.
   * @param pattern  Lookup pattern, forwarded unchanged to `findDocuments`.
   * @param maxLines Optional line cap. When given, each body is cut to its first
   *                 `maxLines` lines and a "[... truncated N more lines]" marker
   *                 is appended if anything was dropped.
   * @param maxBytes Size limit forwarded to `findDocuments`.
   * @returns The converted files plus the error messages reported by `findDocuments`.
   */
  export function getMultipleDocuments(db: Database, pattern: string, maxLines?: number, maxBytes: number = DEFAULT_MULTI_GET_MAX_BYTES): { files: MultiGetFile[]; errors: string[] } {
    const found = findDocuments(db, pattern, { includeBody: true, maxBytes });
    const files: MultiGetFile[] = [];

    for (const entry of found.docs) {
      // Skipped entries only carry path information; the rest is blanked out.
      if (entry.skipped) {
        files.push({
          filepath: entry.doc.filepath,
          displayPath: entry.doc.displayPath,
          title: "",
          body: "",
          context: null,
          skipped: true as const,
          skipReason: entry.skipReason,
        });
        continue;
      }

      const fullBody = entry.doc.body || "";
      let body = fullBody;
      if (maxLines !== undefined) {
        const allLines = fullBody.split('\n');
        const kept = allLines.slice(0, maxLines).join('\n');
        const omitted = allLines.length - maxLines;
        // Only mention truncation when lines were actually left out.
        body = omitted > 0 ? `${kept}\n\n[... truncated ${omitted} more lines]` : kept;
      }

      files.push({
        filepath: entry.doc.filepath,
        displayPath: entry.doc.displayPath,
        title: entry.doc.title,
        body,
        context: entry.doc.context,
        skipped: false as const,
      });
    }

    return { files, errors: found.errors };
  }
  /**
   * Old-style multi-get result, kept for backwards compatibility.
   *
   * A union discriminated by `skipped`: the `skipped: true` member additionally
   * carries a `skipReason`.
   */
  export type MultiGetFile =
    | {
        /** Path identifying the document. */
        filepath: string;
        /** Path intended for display. */
        displayPath: string;
        /** Document title. */
        title: string;
        /** Document text. */
        body: string;
        /** Context string for the document, or null when there is none. */
        context: string | null;
        /** Marks this member as a document that was returned. */
        skipped: false;
      }
    | {
        /** Path identifying the document. */
        filepath: string;
        /** Path intended for display. */
        displayPath: string;
        /** Document title. */
        title: string;
        /** Document text. */
        body: string;
        /** Context string for the document, or null when there is none. */
        context: string | null;
        /** Marks this member as a document that was skipped. */
        skipped: true;
        /** Explanation of why the document was skipped. */
        skipReason: string;
      };
  2048. // =============================================================================
  2049. // Status
  2050. // =============================================================================
  /**
   * Summarise the state of the index.
   *
   * For every collection returned by `collectionsListCollections()` the active
   * document count and newest `modified_at` are queried. The collections are then
   * ordered most-recently-updated first and combined with the overall totals.
   *
   * @param db Database handle used for all queries.
   * @returns Total active documents, the value of `getHashesNeedingEmbedding(db)`,
   *          whether a `vectors_vec` table exists, and the per-collection summary.
   */
  export function getStatus(db: Database): IndexStatus {
    // Collection definitions (the original comment notes these come from YAML)
    const configured = collectionsListCollections();

    // Per-collection active document count and newest modification time
    const collections = configured.map(entry => {
      const row = db.prepare(`
        SELECT
        COUNT(*) as active_count,
        MAX(modified_at) as last_doc_update
        FROM documents
        WHERE collection = ? AND active = 1
      `).get(entry.name) as { active_count: number; last_doc_update: string | null };

      // With no modification time available, the current time is reported instead.
      const lastUpdated = row.last_doc_update || new Date().toISOString();

      return {
        name: entry.name,
        path: entry.path,
        pattern: entry.pattern,
        documents: row.active_count,
        lastUpdated,
      };
    });

    // Newest first; entries lacking a timestamp go to the end
    collections.sort((left, right) => {
      if (!left.lastUpdated) {
        return 1;
      }
      if (!right.lastUpdated) {
        return -1;
      }
      const leftTime = new Date(left.lastUpdated).getTime();
      const rightTime = new Date(right.lastUpdated).getTime();
      return rightTime - leftTime;
    });

    const totalRow = db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get() as { c: number };
    const needsEmbedding = getHashesNeedingEmbedding(db);
    const vectorTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();

    return {
      totalDocuments: totalRow.c,
      needsEmbedding,
      hasVectorIndex: Boolean(vectorTable),
      collections,
    };
  }
  2087. // =============================================================================
  2088. // Snippet extraction
  2089. // =============================================================================
  /** Result of extracting a snippet around the best-matching line of a document. */
  export type SnippetResult = {
    /** 1-indexed line number of the best match. */
    line: number;
    /** The snippet text, preceded by a diff-style header line. */
    snippet: string;
    /** Number of document lines that come before the snippet. */
    linesBefore: number;
    /** Number of document lines that come after the snippet. */
    linesAfter: number;
    /** Number of lines contained in the snippet. */
    snippetLines: number;
  };
  /**
   * Pick the line of `body` that best matches `query` and return a short snippet
   * around it, prefixed with a diff-style header.
   *
   * Matching is a case-insensitive substring test of each whitespace-separated
   * query term against each line; the line containing the most terms wins, and
   * the earliest such line is kept on ties. The snippet spans from one line
   * before the winner through two lines after it.
   *
   * @param body     Full document text.
   * @param query    Search query; split on whitespace into terms.
   * @param maxLen   Maximum length of the snippet text (header excluded). Longer
   *                 text is cut and terminated with "...".
   * @param chunkPos Optional character offset. When positive, only the window from
   *                 100 characters before it to `maxLen + 100` characters after it
   *                 is searched.
   * @returns Best-match line number, formatted snippet, and line counts.
   */
  export function extractSnippet(body: string, query: string, maxLen = 500, chunkPos?: number): SnippetResult {
    const documentLineCount = body.split('\n').length;

    // Optionally narrow the search to a window around the chunk position.
    let windowText = body;
    let skippedLines = 0;
    if (chunkPos && chunkPos > 0) {
      const windowStart = Math.max(0, chunkPos - 100);
      const windowEnd = Math.min(body.length, chunkPos + maxLen + 100);
      windowText = body.slice(windowStart, windowEnd);
      if (windowStart > 0) {
        // Count the newlines in front of the window so line numbers stay absolute.
        skippedLines = body.slice(0, windowStart).split('\n').length - 1;
      }
    }

    const windowLines = windowText.split('\n');
    const terms = query.toLowerCase().split(/\s+/).filter(term => term.length > 0);

    // Score each line by how many terms it contains; strict ">" keeps the first best.
    let bestIndex = 0;
    let bestHits = -1;
    windowLines.forEach((text, index) => {
      const haystack = text.toLowerCase();
      const hits = terms.filter(term => haystack.includes(term)).length;
      if (hits > bestHits) {
        bestHits = hits;
        bestIndex = index;
      }
    });

    // One line of leading context, two lines of trailing context.
    const firstIndex = Math.max(0, bestIndex - 1);
    const endIndex = Math.min(windowLines.length, bestIndex + 3);
    const picked = windowLines.slice(firstIndex, endIndex);
    const pickedCount = picked.length;

    const joined = picked.join('\n');
    const snippetText = joined.length > maxLen
      ? `${joined.substring(0, maxLen - 3)}...`
      : joined;

    const firstLineNumber = skippedLines + firstIndex + 1; // 1-indexed
    const linesBefore = firstLineNumber - 1;
    const linesAfter = documentLineCount - (firstLineNumber + pickedCount - 1);

    // Diff-style header: @@ -start,count @@ (linesBefore before, linesAfter after)
    const header = `@@ -${firstLineNumber},${pickedCount} @@ (${linesBefore} before, ${linesAfter} after)`;

    return {
      line: skippedLines + bestIndex + 1,
      snippet: `${header}\n${snippetText}`,
      linesBefore,
      linesAfter,
      snippetLines: pickedCount,
    };
  }