store.ts 86 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568
  1. /**
  2. * QMD Store - Core data access and retrieval functions
  3. *
  4. * This module provides all database operations, search functions, and document
  5. * retrieval for QMD. It returns raw data structures that can be formatted by
  6. * CLI or MCP consumers.
  7. *
  8. * Usage:
  9. * const store = createStore("/path/to/db.sqlite");
  10. * // or use default path:
  11. * const store = createStore();
  12. */
import { Database } from "bun:sqlite";
import { Glob } from "bun";
import { mkdirSync, realpathSync } from "node:fs";
import * as sqliteVec from "sqlite-vec";
import {
  LlamaCpp,
  getDefaultLlamaCpp,
  formatQueryForEmbedding,
  formatDocForEmbedding,
  type RerankDocument,
} from "./llm";
import {
  findContextForPath as collectionsFindContextForPath,
  addContext as collectionsAddContext,
  removeContext as collectionsRemoveContext,
  listAllContexts as collectionsListAllContexts,
  getCollection,
  listCollections as collectionsListCollections,
  addCollection as collectionsAddCollection,
  removeCollection as collectionsRemoveCollection,
  renameCollection as collectionsRenameCollection,
  setGlobalContext,
  loadConfig as collectionsLoadConfig,
  type NamedCollection,
} from "./collections";
  38. // =============================================================================
  39. // Configuration
  40. // =============================================================================
  41. const HOME = Bun.env.HOME || "/tmp";
  42. export const DEFAULT_EMBED_MODEL = "embeddinggemma";
  43. export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
  44. export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
  45. export const DEFAULT_GLOB = "**/*.md";
  46. export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB
  47. // Chunking: 800 tokens per chunk with 15% overlap
  48. export const CHUNK_SIZE_TOKENS = 800;
  49. export const CHUNK_OVERLAP_TOKENS = Math.floor(CHUNK_SIZE_TOKENS * 0.15); // 120 tokens (15% overlap)
  50. // Fallback char-based approximation for sync chunking (~4 chars per token)
  51. export const CHUNK_SIZE_CHARS = CHUNK_SIZE_TOKENS * 4; // 3200 chars
  52. export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4; // 480 chars
  53. // =============================================================================
  54. // Path utilities
  55. // =============================================================================
  56. export function homedir(): string {
  57. return HOME;
  58. }
  59. /**
  60. * Check if a path is absolute.
  61. * Supports:
  62. * - Unix paths: /path/to/file
  63. * - Windows native: C:\path or C:/path
  64. * - Git Bash: /c/path or /C/path (C-Z drives, excluding A/B floppy drives)
  65. *
  66. * Note: /c without trailing slash is treated as Unix path (directory named "c"),
  67. * while /c/ or /c/path are treated as Git Bash paths (C: drive).
  68. */
  69. export function isAbsolutePath(path: string): boolean {
  70. if (!path) return false;
  71. // Unix absolute path
  72. if (path.startsWith('/')) {
  73. // Check if it's a Git Bash style path like /c/ or /c/Users (C-Z only, not A or B)
  74. // Requires path[2] === '/' to distinguish from Unix paths like /c or /cache
  75. if (path.length >= 3 && path[2] === '/') {
  76. const driveLetter = path[1];
  77. if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
  78. return true;
  79. }
  80. }
  81. // Any other path starting with / is Unix absolute
  82. return true;
  83. }
  84. // Windows native path: C:\ or C:/ (any letter A-Z)
  85. if (path.length >= 2 && /[a-zA-Z]/.test(path[0]!) && path[1] === ':') {
  86. return true;
  87. }
  88. return false;
  89. }
  90. /**
  91. * Normalize path separators to forward slashes.
  92. * Converts Windows backslashes to forward slashes.
  93. */
  94. export function normalizePathSeparators(path: string): string {
  95. return path.replace(/\\/g, '/');
  96. }
  97. /**
  98. * Get the relative path from a prefix.
  99. * Returns null if path is not under prefix.
  100. * Returns empty string if path equals prefix.
  101. */
  102. export function getRelativePathFromPrefix(path: string, prefix: string): string | null {
  103. // Empty prefix is invalid
  104. if (!prefix) {
  105. return null;
  106. }
  107. const normalizedPath = normalizePathSeparators(path);
  108. const normalizedPrefix = normalizePathSeparators(prefix);
  109. // Ensure prefix ends with / for proper matching
  110. const prefixWithSlash = !normalizedPrefix.endsWith('/')
  111. ? normalizedPrefix + '/'
  112. : normalizedPrefix;
  113. // Exact match
  114. if (normalizedPath === normalizedPrefix) {
  115. return '';
  116. }
  117. // Check if path starts with prefix
  118. if (normalizedPath.startsWith(prefixWithSlash)) {
  119. return normalizedPath.slice(prefixWithSlash.length);
  120. }
  121. return null;
  122. }
  123. export function resolve(...paths: string[]): string {
  124. if (paths.length === 0) {
  125. throw new Error("resolve: at least one path segment is required");
  126. }
  127. // Normalize all paths to use forward slashes
  128. const normalizedPaths = paths.map(normalizePathSeparators);
  129. let result = '';
  130. let windowsDrive = '';
  131. // Check if first path is absolute
  132. const firstPath = normalizedPaths[0]!;
  133. if (isAbsolutePath(firstPath)) {
  134. result = firstPath;
  135. // Extract Windows drive letter if present
  136. if (firstPath.length >= 2 && /[a-zA-Z]/.test(firstPath[0]!) && firstPath[1] === ':') {
  137. windowsDrive = firstPath.slice(0, 2);
  138. result = firstPath.slice(2);
  139. } else if (firstPath.startsWith('/') && firstPath.length >= 3 && firstPath[2] === '/') {
  140. // Git Bash style: /c/ -> C: (C-Z drives only, not A or B)
  141. const driveLetter = firstPath[1];
  142. if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
  143. windowsDrive = driveLetter.toUpperCase() + ':';
  144. result = firstPath.slice(2);
  145. }
  146. }
  147. } else {
  148. // Start with PWD or cwd, then append the first relative path
  149. const pwd = normalizePathSeparators(Bun.env.PWD || process.cwd());
  150. // Extract Windows drive from PWD if present
  151. if (pwd.length >= 2 && /[a-zA-Z]/.test(pwd[0]!) && pwd[1] === ':') {
  152. windowsDrive = pwd.slice(0, 2);
  153. result = pwd.slice(2) + '/' + firstPath;
  154. } else {
  155. result = pwd + '/' + firstPath;
  156. }
  157. }
  158. // Process remaining paths
  159. for (let i = 1; i < normalizedPaths.length; i++) {
  160. const p = normalizedPaths[i]!;
  161. if (isAbsolutePath(p)) {
  162. // Absolute path replaces everything
  163. result = p;
  164. // Update Windows drive if present
  165. if (p.length >= 2 && /[a-zA-Z]/.test(p[0]!) && p[1] === ':') {
  166. windowsDrive = p.slice(0, 2);
  167. result = p.slice(2);
  168. } else if (p.startsWith('/') && p.length >= 3 && p[2] === '/') {
  169. // Git Bash style (C-Z drives only, not A or B)
  170. const driveLetter = p[1];
  171. if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
  172. windowsDrive = driveLetter.toUpperCase() + ':';
  173. result = p.slice(2);
  174. } else {
  175. windowsDrive = '';
  176. }
  177. } else {
  178. windowsDrive = '';
  179. }
  180. } else {
  181. // Relative path - append
  182. result = result + '/' + p;
  183. }
  184. }
  185. // Normalize . and .. components
  186. const parts = result.split('/').filter(Boolean);
  187. const normalized: string[] = [];
  188. for (const part of parts) {
  189. if (part === '..') {
  190. normalized.pop();
  191. } else if (part !== '.') {
  192. normalized.push(part);
  193. }
  194. }
  195. // Build final path
  196. const finalPath = '/' + normalized.join('/');
  197. // Prepend Windows drive if present
  198. if (windowsDrive) {
  199. return windowsDrive + finalPath;
  200. }
  201. return finalPath;
  202. }
  203. // Flag to indicate production mode (set by qmd.ts at startup)
  204. let _productionMode = false;
  205. export function enableProductionMode(): void {
  206. _productionMode = true;
  207. }
  208. export function getDefaultDbPath(indexName: string = "index"): string {
  209. // Always allow override via INDEX_PATH (for testing)
  210. if (Bun.env.INDEX_PATH) {
  211. return Bun.env.INDEX_PATH;
  212. }
  213. // In non-production mode (tests), require explicit path
  214. if (!_productionMode) {
  215. throw new Error(
  216. "Database path not set. Tests must set INDEX_PATH env var or use createStore() with explicit path. " +
  217. "This prevents tests from accidentally writing to the global index."
  218. );
  219. }
  220. const cacheDir = Bun.env.XDG_CACHE_HOME || resolve(homedir(), ".cache");
  221. const qmdCacheDir = resolve(cacheDir, "qmd");
  222. try { Bun.spawnSync(["mkdir", "-p", qmdCacheDir]); } catch { }
  223. return resolve(qmdCacheDir, `${indexName}.sqlite`);
  224. }
  225. export function getPwd(): string {
  226. return process.env.PWD || process.cwd();
  227. }
  228. export function getRealPath(path: string): string {
  229. try {
  230. return realpathSync(path);
  231. } catch {
  232. return resolve(path);
  233. }
  234. }
  235. // =============================================================================
  236. // Virtual Path Utilities (qmd://)
  237. // =============================================================================
  238. export type VirtualPath = {
  239. collectionName: string;
  240. path: string; // relative path within collection
  241. };
  242. /**
  243. * Normalize explicit virtual path formats to standard qmd:// format.
  244. * Only handles paths that are already explicitly virtual:
  245. * - qmd://collection/path.md (already normalized)
  246. * - qmd:////collection/path.md (extra slashes - normalize)
  247. * - //collection/path.md (missing qmd: prefix - add it)
  248. *
  249. * Does NOT handle:
  250. * - collection/path.md (bare paths - could be filesystem relative)
  251. * - :linenum suffix (should be parsed separately before calling this)
  252. */
  253. export function normalizeVirtualPath(input: string): string {
  254. let path = input.trim();
  255. // Handle qmd:// with extra slashes: qmd:////collection/path -> qmd://collection/path
  256. if (path.startsWith('qmd:')) {
  257. // Remove qmd: prefix and normalize slashes
  258. path = path.slice(4);
  259. // Remove leading slashes and re-add exactly two
  260. path = path.replace(/^\/+/, '');
  261. return `qmd://${path}`;
  262. }
  263. // Handle //collection/path (missing qmd: prefix)
  264. if (path.startsWith('//')) {
  265. path = path.replace(/^\/+/, '');
  266. return `qmd://${path}`;
  267. }
  268. // Return as-is for other cases (filesystem paths, docids, bare collection/path, etc.)
  269. return path;
  270. }
  271. /**
  272. * Parse a virtual path like "qmd://collection-name/path/to/file.md"
  273. * into its components.
  274. * Also supports collection root: "qmd://collection-name/" or "qmd://collection-name"
  275. */
  276. export function parseVirtualPath(virtualPath: string): VirtualPath | null {
  277. // Normalize the path first
  278. const normalized = normalizeVirtualPath(virtualPath);
  279. // Match: qmd://collection-name[/optional-path]
  280. // Allows: qmd://name, qmd://name/, qmd://name/path
  281. const match = normalized.match(/^qmd:\/\/([^\/]+)\/?(.*)$/);
  282. if (!match?.[1]) return null;
  283. return {
  284. collectionName: match[1],
  285. path: match[2] ?? '', // Empty string for collection root
  286. };
  287. }
  288. /**
  289. * Build a virtual path from collection name and relative path.
  290. */
  291. export function buildVirtualPath(collectionName: string, path: string): string {
  292. return `qmd://${collectionName}/${path}`;
  293. }
  294. /**
  295. * Check if a path is explicitly a virtual path.
  296. * Only recognizes explicit virtual path formats:
  297. * - qmd://collection/path.md
  298. * - //collection/path.md
  299. *
  300. * Does NOT consider bare collection/path.md as virtual - that should be
  301. * handled separately by checking if the first component is a collection name.
  302. */
  303. export function isVirtualPath(path: string): boolean {
  304. const trimmed = path.trim();
  305. // Explicit qmd:// prefix (with any number of slashes)
  306. if (trimmed.startsWith('qmd:')) return true;
  307. // //collection/path format (missing qmd: prefix)
  308. if (trimmed.startsWith('//')) return true;
  309. return false;
  310. }
  311. /**
  312. * Resolve a virtual path to absolute filesystem path.
  313. */
  314. export function resolveVirtualPath(db: Database, virtualPath: string): string | null {
  315. const parsed = parseVirtualPath(virtualPath);
  316. if (!parsed) return null;
  317. const coll = getCollectionByName(db, parsed.collectionName);
  318. if (!coll) return null;
  319. return resolve(coll.pwd, parsed.path);
  320. }
  321. /**
  322. * Convert an absolute filesystem path to a virtual path.
  323. * Returns null if the file is not in any indexed collection.
  324. */
  325. export function toVirtualPath(db: Database, absolutePath: string): string | null {
  326. // Get all collections from YAML config
  327. const collections = collectionsListCollections();
  328. // Find which collection this absolute path belongs to
  329. for (const coll of collections) {
  330. if (absolutePath.startsWith(coll.path + '/') || absolutePath === coll.path) {
  331. // Extract relative path
  332. const relativePath = absolutePath.startsWith(coll.path + '/')
  333. ? absolutePath.slice(coll.path.length + 1)
  334. : '';
  335. // Verify this document exists in the database
  336. const doc = db.prepare(`
  337. SELECT d.path
  338. FROM documents d
  339. WHERE d.collection = ? AND d.path = ? AND d.active = 1
  340. LIMIT 1
  341. `).get(coll.name, relativePath) as { path: string } | null;
  342. if (doc) {
  343. return buildVirtualPath(coll.name, relativePath);
  344. }
  345. }
  346. }
  347. return null;
  348. }
  349. // =============================================================================
  350. // Database initialization
  351. // =============================================================================
  352. function setSQLiteFromBrewPrefixEnv(): void {
  353. const candidates: string[] = [];
  354. if (process.platform === "darwin") {
  355. // Use BREW_PREFIX for non-standard Homebrew installs (common on corporate Macs).
  356. const brewPrefix = Bun.env.BREW_PREFIX || Bun.env.HOMEBREW_PREFIX;
  357. if (brewPrefix) {
  358. // Homebrew can place SQLite in opt/sqlite (keg-only) or directly under the prefix.
  359. candidates.push(`${brewPrefix}/opt/sqlite/lib/libsqlite3.dylib`);
  360. candidates.push(`${brewPrefix}/lib/libsqlite3.dylib`);
  361. } else {
  362. candidates.push("/opt/homebrew/opt/sqlite/lib/libsqlite3.dylib");
  363. candidates.push("/usr/local/opt/sqlite/lib/libsqlite3.dylib");
  364. }
  365. }
  366. for (const candidate of candidates) {
  367. try {
  368. if (Bun.file(candidate).size > 0) {
  369. Database.setCustomSQLite(candidate);
  370. return;
  371. }
  372. } catch { }
  373. }
  374. }
  375. setSQLiteFromBrewPrefixEnv();
  376. function initializeDatabase(db: Database): void {
  377. try {
  378. sqliteVec.load(db);
  379. } catch (err) {
  380. if (err instanceof Error && err.message.includes("does not support dynamic extension loading")) {
  381. throw new Error(
  382. "SQLite build does not support dynamic extension loading. " +
  383. "Install Homebrew SQLite so the sqlite-vec extension can be loaded, " +
  384. "and set BREW_PREFIX if Homebrew is installed in a non-standard location."
  385. );
  386. }
  387. throw err;
  388. }
  389. db.exec("PRAGMA journal_mode = WAL");
  390. db.exec("PRAGMA foreign_keys = ON");
  391. // Drop legacy tables that are now managed in YAML
  392. db.exec(`DROP TABLE IF EXISTS path_contexts`);
  393. db.exec(`DROP TABLE IF EXISTS collections`);
  394. // Content-addressable storage - the source of truth for document content
  395. db.exec(`
  396. CREATE TABLE IF NOT EXISTS content (
  397. hash TEXT PRIMARY KEY,
  398. doc TEXT NOT NULL,
  399. created_at TEXT NOT NULL
  400. )
  401. `);
  402. // Documents table - file system layer mapping virtual paths to content hashes
  403. // Collections are now managed in ~/.config/qmd/index.yml
  404. db.exec(`
  405. CREATE TABLE IF NOT EXISTS documents (
  406. id INTEGER PRIMARY KEY AUTOINCREMENT,
  407. collection TEXT NOT NULL,
  408. path TEXT NOT NULL,
  409. title TEXT NOT NULL,
  410. hash TEXT NOT NULL,
  411. created_at TEXT NOT NULL,
  412. modified_at TEXT NOT NULL,
  413. active INTEGER NOT NULL DEFAULT 1,
  414. FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
  415. UNIQUE(collection, path)
  416. )
  417. `);
  418. db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents(collection, active)`);
  419. db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(hash)`);
  420. db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_path ON documents(path, active)`);
  421. // Cache table for LLM API calls
  422. db.exec(`
  423. CREATE TABLE IF NOT EXISTS llm_cache (
  424. hash TEXT PRIMARY KEY,
  425. result TEXT NOT NULL,
  426. created_at TEXT NOT NULL
  427. )
  428. `);
  429. // Content vectors
  430. const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
  431. const hasSeqColumn = cvInfo.some(col => col.name === 'seq');
  432. if (cvInfo.length > 0 && !hasSeqColumn) {
  433. db.exec(`DROP TABLE IF EXISTS content_vectors`);
  434. db.exec(`DROP TABLE IF EXISTS vectors_vec`);
  435. }
  436. db.exec(`
  437. CREATE TABLE IF NOT EXISTS content_vectors (
  438. hash TEXT NOT NULL,
  439. seq INTEGER NOT NULL DEFAULT 0,
  440. pos INTEGER NOT NULL DEFAULT 0,
  441. model TEXT NOT NULL,
  442. embedded_at TEXT NOT NULL,
  443. PRIMARY KEY (hash, seq)
  444. )
  445. `);
  446. // FTS - index filepath (collection/path), title, and content
  447. db.exec(`
  448. CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
  449. filepath, title, body,
  450. tokenize='porter unicode61'
  451. )
  452. `);
  453. // Triggers to keep FTS in sync
  454. db.exec(`
  455. CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents
  456. WHEN new.active = 1
  457. BEGIN
  458. INSERT INTO documents_fts(rowid, filepath, title, body)
  459. SELECT
  460. new.id,
  461. new.collection || '/' || new.path,
  462. new.title,
  463. (SELECT doc FROM content WHERE hash = new.hash)
  464. WHERE new.active = 1;
  465. END
  466. `);
  467. db.exec(`
  468. CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
  469. DELETE FROM documents_fts WHERE rowid = old.id;
  470. END
  471. `);
  472. db.exec(`
  473. CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents
  474. BEGIN
  475. -- Delete from FTS if no longer active
  476. DELETE FROM documents_fts WHERE rowid = old.id AND new.active = 0;
  477. -- Update FTS if still/newly active
  478. INSERT OR REPLACE INTO documents_fts(rowid, filepath, title, body)
  479. SELECT
  480. new.id,
  481. new.collection || '/' || new.path,
  482. new.title,
  483. (SELECT doc FROM content WHERE hash = new.hash)
  484. WHERE new.active = 1;
  485. END
  486. `);
  487. }
  488. function ensureVecTableInternal(db: Database, dimensions: number): void {
  489. const tableInfo = db.prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get() as { sql: string } | null;
  490. if (tableInfo) {
  491. const match = tableInfo.sql.match(/float\[(\d+)\]/);
  492. const hasHashSeq = tableInfo.sql.includes('hash_seq');
  493. const hasCosine = tableInfo.sql.includes('distance_metric=cosine');
  494. const existingDims = match?.[1] ? parseInt(match[1], 10) : null;
  495. if (existingDims === dimensions && hasHashSeq && hasCosine) return;
  496. // Table exists but wrong schema - need to rebuild
  497. db.exec("DROP TABLE IF EXISTS vectors_vec");
  498. }
  499. db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}] distance_metric=cosine)`);
  500. }
  501. // =============================================================================
  502. // Store Factory
  503. // =============================================================================
  504. export type Store = {
  505. db: Database;
  506. dbPath: string;
  507. close: () => void;
  508. ensureVecTable: (dimensions: number) => void;
  509. // Index health
  510. getHashesNeedingEmbedding: () => number;
  511. getIndexHealth: () => IndexHealthInfo;
  512. getStatus: () => IndexStatus;
  513. // Caching
  514. getCacheKey: typeof getCacheKey;
  515. getCachedResult: (cacheKey: string) => string | null;
  516. setCachedResult: (cacheKey: string, result: string) => void;
  517. clearCache: () => void;
  518. // Cleanup and maintenance
  519. deleteLLMCache: () => number;
  520. deleteInactiveDocuments: () => number;
  521. cleanupOrphanedContent: () => number;
  522. cleanupOrphanedVectors: () => number;
  523. vacuumDatabase: () => void;
  524. // Context
  525. getContextForFile: (filepath: string) => string | null;
  526. getContextForPath: (collectionName: string, path: string) => string | null;
  527. getCollectionByName: (name: string) => { name: string; pwd: string; glob_pattern: string } | null;
  528. getCollectionsWithoutContext: () => { name: string; pwd: string; doc_count: number }[];
  529. getTopLevelPathsWithoutContext: (collectionName: string) => string[];
  530. // Virtual paths
  531. parseVirtualPath: typeof parseVirtualPath;
  532. buildVirtualPath: typeof buildVirtualPath;
  533. isVirtualPath: typeof isVirtualPath;
  534. resolveVirtualPath: (virtualPath: string) => string | null;
  535. toVirtualPath: (absolutePath: string) => string | null;
  536. // Search
  537. searchFTS: (query: string, limit?: number, collectionId?: number) => SearchResult[];
  538. searchVec: (query: string, model: string, limit?: number, collectionName?: string) => Promise<SearchResult[]>;
  539. // Query expansion & reranking
  540. expandQuery: (query: string, model?: string) => Promise<string[]>;
  541. rerank: (query: string, documents: { file: string; text: string }[], model?: string) => Promise<{ file: string; score: number }[]>;
  542. // Document retrieval
  543. findDocument: (filename: string, options?: { includeBody?: boolean }) => DocumentResult | DocumentNotFound;
  544. getDocumentBody: (doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number) => string | null;
  545. findDocuments: (pattern: string, options?: { includeBody?: boolean; maxBytes?: number }) => { docs: MultiGetResult[]; errors: string[] };
  546. // Fuzzy matching and docid lookup
  547. findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => string[];
  548. matchFilesByGlob: (pattern: string) => { filepath: string; displayPath: string; bodyLength: number }[];
  549. findDocumentByDocid: (docid: string) => { filepath: string; hash: string } | null;
  550. // Document indexing operations
  551. insertContent: (hash: string, content: string, createdAt: string) => void;
  552. insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => void;
  553. findActiveDocument: (collectionName: string, path: string) => { id: number; hash: string; title: string } | null;
  554. updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => void;
  555. updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => void;
  556. deactivateDocument: (collectionName: string, path: string) => void;
  557. getActiveDocumentPaths: (collectionName: string) => string[];
  558. // Vector/embedding operations
  559. getHashesForEmbedding: () => { hash: string; body: string; path: string }[];
  560. clearAllEmbeddings: () => void;
  561. insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => void;
  562. };
  /**
   * Create a new store instance backed by the SQLite database at `dbPath`.
   *
   * When `dbPath` is omitted (or empty) the path returned by getDefaultDbPath()
   * is used. The database is opened and passed to initializeDatabase(); if that
   * step throws, the freshly opened handle is closed before the error is
   * rethrown so a failed setup does not leak an open connection.
   *
   * @param dbPath - Path to the SQLite database file (optional)
   * @returns Store whose methods are all bound to the opened database
   * @throws Whatever `new Database(...)` or initializeDatabase() throws
   */
  export function createStore(dbPath?: string): Store {
    const resolvedPath = dbPath || getDefaultDbPath();
    const db = new Database(resolvedPath);
    try {
      initializeDatabase(db);
    } catch (err) {
      try {
        db.close();
      } catch {
        // Ignore close failures so the original initialization error surfaces.
      }
      throw err;
    }
    return {
      db,
      dbPath: resolvedPath,
      close: () => db.close(),
      ensureVecTable: (dimensions: number) => ensureVecTableInternal(db, dimensions),
      // Index health
      getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db),
      getIndexHealth: () => getIndexHealth(db),
      getStatus: () => getStatus(db),
      // Caching
      getCacheKey,
      getCachedResult: (cacheKey: string) => getCachedResult(db, cacheKey),
      setCachedResult: (cacheKey: string, result: string) => setCachedResult(db, cacheKey, result),
      clearCache: () => clearCache(db),
      // Cleanup and maintenance
      deleteLLMCache: () => deleteLLMCache(db),
      deleteInactiveDocuments: () => deleteInactiveDocuments(db),
      cleanupOrphanedContent: () => cleanupOrphanedContent(db),
      cleanupOrphanedVectors: () => cleanupOrphanedVectors(db),
      vacuumDatabase: () => vacuumDatabase(db),
      // Context
      getContextForFile: (filepath: string) => getContextForFile(db, filepath),
      getContextForPath: (collectionName: string, path: string) => getContextForPath(db, collectionName, path),
      getCollectionByName: (name: string) => getCollectionByName(db, name),
      getCollectionsWithoutContext: () => getCollectionsWithoutContext(db),
      getTopLevelPathsWithoutContext: (collectionName: string) => getTopLevelPathsWithoutContext(db, collectionName),
      // Virtual paths
      parseVirtualPath,
      buildVirtualPath,
      isVirtualPath,
      resolveVirtualPath: (virtualPath: string) => resolveVirtualPath(db, virtualPath),
      toVirtualPath: (absolutePath: string) => toVirtualPath(db, absolutePath),
      // Search
      searchFTS: (query: string, limit?: number, collectionId?: number) => searchFTS(db, query, limit, collectionId),
      searchVec: (query: string, model: string, limit?: number, collectionName?: string) => searchVec(db, query, model, limit, collectionName),
      // Query expansion & reranking
      expandQuery: (query: string, model?: string) => expandQuery(query, model, db),
      rerank: (query: string, documents: { file: string; text: string }[], model?: string) => rerank(query, documents, model, db),
      // Document retrieval
      findDocument: (filename: string, options?: { includeBody?: boolean }) => findDocument(db, filename, options),
      getDocumentBody: (doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number) => getDocumentBody(db, doc, fromLine, maxLines),
      findDocuments: (pattern: string, options?: { includeBody?: boolean; maxBytes?: number }) => findDocuments(db, pattern, options),
      // Fuzzy matching and docid lookup
      findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => findSimilarFiles(db, query, maxDistance, limit),
      matchFilesByGlob: (pattern: string) => matchFilesByGlob(db, pattern),
      findDocumentByDocid: (docid: string) => findDocumentByDocid(db, docid),
      // Document indexing operations
      insertContent: (hash: string, content: string, createdAt: string) => insertContent(db, hash, content, createdAt),
      insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt),
      findActiveDocument: (collectionName: string, path: string) => findActiveDocument(db, collectionName, path),
      updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => updateDocumentTitle(db, documentId, title, modifiedAt),
      updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => updateDocument(db, documentId, title, hash, modifiedAt),
      deactivateDocument: (collectionName: string, path: string) => deactivateDocument(db, collectionName, path),
      getActiveDocumentPaths: (collectionName: string) => getActiveDocumentPaths(db, collectionName),
      // Vector/embedding operations
      getHashesForEmbedding: () => getHashesForEmbedding(db),
      clearAllEmbeddings: () => clearAllEmbeddings(db),
      insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt),
    };
  }
  634. // =============================================================================
  635. // Core Document Type
  636. // =============================================================================
  /**
   * Unified document record carrying all metadata for one indexed file.
   * The body is optional; load it separately with getDocumentBody() when needed.
   */
  export type DocumentResult = {
    /** Full filesystem path */
    filepath: string;
    /** Short display path (e.g., "docs/readme.md") */
    displayPath: string;
    /** Document title (from first heading or filename) */
    title: string;
    /** Folder context description if configured */
    context: string | null;
    /** Content hash for caching/change detection */
    hash: string;
    /** Short docid (first 6 chars of hash) for quick reference */
    docid: string;
    /** Parent collection name */
    collectionName: string;
    /** Last modification timestamp */
    modifiedAt: string;
    /** Body length in bytes (useful before loading) */
    bodyLength: number;
    /** Document body (optional, load with getDocumentBody) */
    body?: string;
  };
  /**
   * Derive the short docid for a content hash: its first 6 characters
   * (or the whole hash when it is shorter than that).
   */
  export function getDocid(hash: string): string {
    const DOCID_LENGTH = 6;
    return hash.substring(0, DOCID_LENGTH);
  }
  /**
   * Turn a file path into a token-friendly "handle".
   *
   * - `___` (triple underscore) is treated as a folder separator (`/`)
   * - Everything is lowercased
   * - Runs of characters that are not Unicode letters/digits collapse to one `-`
   * - Leading/trailing dashes are stripped from every path segment
   * - Empty segments are dropped; folder structure is otherwise preserved
   * - The final segment keeps its `.ext` suffix (ASCII letters/digits only)
   *
   * @throws Error when the path is empty/blank, when the filename (without
   *         extension) contains no letter or digit, or when nothing is left
   *         after cleaning.
   */
  export function handelize(path: string): string {
    if (!path || path.trim() === '') {
      throw new Error('handelize: path cannot be empty');
    }

    // Reject inputs whose filename is only an extension, dots or punctuation:
    // the stem must contain at least one letter or digit (Unicode-aware).
    const rawSegments = path.split('/').filter(Boolean);
    const rawFilename = rawSegments[rawSegments.length - 1] || '';
    const rawStem = rawFilename.replace(/\.[^.]+$/, '');
    if (!/[\p{L}\p{N}]/u.test(rawStem)) {
      throw new Error(`handelize: path "${path}" has no valid filename content`);
    }

    // Collapse non-letter/digit runs to a dash, then trim dashes at both ends.
    const slugify = (text: string): string =>
      text.replace(/[^\p{L}\p{N}]+/gu, '-').replace(/^-+|-+$/g, '');

    const parts = path.replace(/___/g, '/').toLowerCase().split('/');
    const filenameIndex = parts.length - 1;
    const cleaned: string[] = [];

    parts.forEach((part, index) => {
      let piece: string;
      if (index === filenameIndex) {
        // Filename: clean only the stem so the extension survives intact.
        const ext = part.match(/(\.[a-z0-9]+)$/i)?.[1] ?? '';
        const stem = ext ? part.slice(0, -ext.length) : part;
        piece = slugify(stem) + ext;
      } else {
        piece = slugify(part);
      }
      if (piece) {
        cleaned.push(piece);
      }
    });

    const result = cleaned.join('/');
    if (!result) {
      throw new Error(`handelize: path "${path}" resulted in empty string after processing`);
    }
    return result;
  }
  /**
   * A DocumentResult enriched with ranking information from a search.
   */
  export type SearchResult = DocumentResult & {
    /** Relevance score (0-1) */
    score: number;
    /** Search source (full-text or vector) */
    source: "fts" | "vec";
    /** Character position of matching chunk (for vector search) */
    chunkPos?: number;
  };
  /**
   * Simplified result shape used internally as input/output of RRF fusion.
   */
  export type RankedResult = {
    file: string;
    displayPath: string;
    title: string;
    body: string;
    score: number;
  };
  /**
   * Error-shaped result returned when a document lookup finds nothing.
   */
  export type DocumentNotFound = {
    /** Discriminator: always the literal "not_found" */
    error: "not_found";
    /** The lookup string that failed to resolve */
    query: string;
    /** Candidate paths offered as alternatives */
    similarFiles: string[];
  };
  /**
   * One entry of a multi-get result, discriminated by `skipped`:
   * either the full document, or just its paths plus the reason it was skipped.
   */
  export type MultiGetResult =
    | {
        doc: DocumentResult;
        skipped: false;
      }
    | {
        doc: Pick<DocumentResult, "filepath" | "displayPath">;
        skipped: true;
        skipReason: string;
      };
  /**
   * Summary of a single collection as listed in IndexStatus.
   * NOTE(review): field meanings are inferred from their names; confirm
   * against getStatus(), which is not visible from this section.
   */
  export type CollectionInfo = {
    name: string;
    path: string;
    pattern: string;
    documents: number;
    lastUpdated: string;
  };
  /**
   * Overall index status: aggregate counters plus one CollectionInfo per collection.
   */
  export type IndexStatus = {
    totalDocuments: number;
    needsEmbedding: number;
    hasVectorIndex: boolean;
    collections: CollectionInfo[];
  };
  760. // =============================================================================
  761. // Index health
  762. // =============================================================================
  /**
   * Count distinct content hashes of active documents that have no row in
   * `content_vectors` with `seq = 0`, i.e. content that still needs embedding.
   */
  export function getHashesNeedingEmbedding(db: Database): number {
    const pendingQuery = db.prepare(`
      SELECT COUNT(DISTINCT d.hash) as count
      FROM documents d
      LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
      WHERE d.active = 1 AND v.hash IS NULL
    `);
    const row = pendingQuery.get() as { count: number };
    return row.count;
  }
  /**
   * Health snapshot produced by getIndexHealth().
   */
  export type IndexHealthInfo = {
    /** Number of distinct active hashes still lacking an embedding */
    needsEmbedding: number;
    /** Number of active documents */
    totalDocs: number;
    /** Whole days since the newest active document's modified_at, or null if none */
    daysStale: number | null;
  };
  /**
   * Gather index health figures: pending embeddings, active document count and
   * how many whole days have passed since the most recent `modified_at` among
   * active documents (null when there is no such timestamp).
   */
  export function getIndexHealth(db: Database): IndexHealthInfo {
    const MS_PER_DAY = 24 * 60 * 60 * 1000;

    const needsEmbedding = getHashesNeedingEmbedding(db);
    const countRow = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number };
    const latestRow = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };

    const latest = latestRow?.latest;
    const daysStale: number | null = latest
      ? Math.floor((Date.now() - new Date(latest).getTime()) / MS_PER_DAY)
      : null;

    return { needsEmbedding, totalDocs: countRow.count, daysStale };
  }
  788. // =============================================================================
  789. // Caching
  790. // =============================================================================
  /**
   * Build a cache key: the hex SHA-256 of `url` followed by the JSON
   * serialization of `body`. The key therefore depends on the property order
   * JSON.stringify emits for `body`.
   */
  export function getCacheKey(url: string, body: object): string {
    const serializedBody = JSON.stringify(body);
    const sha256 = new Bun.CryptoHasher("sha256");
    sha256.update(url);
    sha256.update(serializedBody);
    return sha256.digest("hex");
  }
  /**
   * Look up a cached result in `llm_cache` by its key.
   * Returns null when no row exists or the stored result is empty.
   */
  export function getCachedResult(db: Database, cacheKey: string): string | null {
    const lookup = db.prepare(`SELECT result FROM llm_cache WHERE hash = ?`);
    const hit = lookup.get(cacheKey) as { result: string } | null;
    if (!hit || !hit.result) {
      return null;
    }
    return hit.result;
  }
  /**
   * Store (or overwrite) a cached result under `cacheKey`, stamped with the
   * current time. Roughly one call in a hundred also prunes `llm_cache` down
   * to its 1000 most recently created rows.
   */
  export function setCachedResult(db: Database, cacheKey: string, result: string): void {
    const createdAt = new Date().toISOString();
    const upsert = db.prepare(`INSERT OR REPLACE INTO llm_cache (hash, result, created_at) VALUES (?, ?, ?)`);
    upsert.run(cacheKey, result, createdAt);

    // Probabilistic pruning keeps the table bounded without paying the cost on every write.
    const shouldPrune = Math.random() < 0.01;
    if (!shouldPrune) {
      return;
    }
    db.exec(`DELETE FROM llm_cache WHERE hash NOT IN (SELECT hash FROM llm_cache ORDER BY created_at DESC LIMIT 1000)`);
  }
  /**
   * Delete every row from `llm_cache`.
   */
  export function clearCache(db: Database): void {
    const wipeSql = `DELETE FROM llm_cache`;
    db.exec(wipeSql);
  }
  811. // =============================================================================
  812. // Cleanup and maintenance operations
  813. // =============================================================================
  /**
   * Delete all rows from `llm_cache`.
   *
   * @returns Number of rows removed, as reported by the statement's `changes`
   */
  export function deleteLLMCache(db: Database): number {
    const { changes } = db.prepare(`DELETE FROM llm_cache`).run();
    return changes;
  }
  /**
   * Permanently delete document rows flagged inactive (`active = 0`).
   *
   * @returns Number of document rows removed
   */
  export function deleteInactiveDocuments(db: Database): number {
    const { changes } = db.prepare(`DELETE FROM documents WHERE active = 0`).run();
    return changes;
  }
  /**
   * Delete rows from `content` whose hash is not used by any active document.
   *
   * @returns Number of content rows removed
   */
  export function cleanupOrphanedContent(db: Database): number {
    const purge = db.prepare(`
      DELETE FROM content
      WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
    `);
    const { changes } = purge.run();
    return changes;
  }
  /**
   * Delete embedding rows whose hash is not used by any active document.
   *
   * Does nothing (returns 0) when the `vectors_vec` table does not exist or
   * when no orphaned `content_vectors` rows are found. Otherwise removes the
   * matching `vectors_vec` entries (keyed by `hash || '_' || seq`) first and
   * then the `content_vectors` rows themselves.
   *
   * @returns Number of orphaned `content_vectors` rows counted before deletion
   */
  export function cleanupOrphanedVectors(db: Database): number {
    // The vector table is optional; without it there is nothing to clean here.
    const vecTable = db.prepare(`
      SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'
    `).get();
    if (!vecTable) return 0;

    // Count first so the caller gets a number even though exec() reports none.
    const { c: orphanCount } = db.prepare(`
      SELECT COUNT(*) as c FROM content_vectors cv
      WHERE NOT EXISTS (
      SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
      )
    `).get() as { c: number };
    if (orphanCount === 0) return 0;

    // vectors_vec must go first: its keys are derived from content_vectors rows.
    db.exec(`
      DELETE FROM vectors_vec WHERE hash_seq IN (
      SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
      WHERE NOT EXISTS (
      SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
      )
      )
    `);
    db.exec(`
      DELETE FROM content_vectors WHERE hash NOT IN (
      SELECT hash FROM documents WHERE active = 1
      )
    `);

    return orphanCount;
  }
  /**
   * Execute SQLite `VACUUM` on the database to reclaim unused space.
   */
  export function vacuumDatabase(db: Database): void {
    const vacuumSql = `VACUUM`;
    db.exec(vacuumSql);
  }
  887. // =============================================================================
  888. // Document helpers
  889. // =============================================================================
  /**
   * Compute the hex-encoded SHA-256 digest of `content`.
   * Declared async, so the digest is delivered through a Promise.
   */
  export async function hashContent(content: string): Promise<string> {
    const sha256 = new Bun.CryptoHasher("sha256");
    sha256.update(content);
    const hexDigest = sha256.digest("hex");
    return hexDigest;
  }
  /**
   * Title extractors keyed by lowercase file extension (including the dot).
   * Each returns the title found in `content`, or null when there is none.
   */
  const titleExtractors: Record<string, (content: string) => string | null> = {
    // Markdown: the first `#` or `##` heading.
    '.md': (content) => {
      const match = content.match(/^##?\s+(.+)$/m);
      if (!match) return null;

      const title = (match[1] ?? "").trim();
      if (title === "📝 Notes" || title === "Notes") {
        // Generic placeholder heading: prefer the next `##` heading that comes
        // AFTER it. Searching only the remainder keeps a `## Notes` heading
        // from matching itself.
        const rest = content.slice((match.index ?? 0) + match[0].length);
        const nextMatch = rest.match(/^##\s+(.+)$/m);
        if (nextMatch?.[1]) return nextMatch[1].trim();
      }
      return title;
    },
    // Org-mode: `#+TITLE:` keyword first, otherwise the first `*` heading.
    '.org': (content) => {
      const titleProp = content.match(/^#\+TITLE:\s*(.+)$/im);
      if (titleProp?.[1]) return titleProp[1].trim();
      const heading = content.match(/^\*+\s+(.+)$/m);
      if (heading?.[1]) return heading[1].trim();
      return null;
    },
  };

  /**
   * Determine a document title.
   *
   * Uses the extractor registered for the file's extension, if any; when that
   * yields nothing, falls back to the file's basename without its extension,
   * and finally to `filename` itself when the basename is empty.
   *
   * The extension and the fallback are both taken from the last path segment,
   * so a dot in a directory name (e.g. "notes.v2/readme") is not mistaken for
   * the file's extension.
   *
   * @param content  Document text to search for a title
   * @param filename File name or `/`-separated path of the document
   * @returns The extracted or derived title
   */
  export function extractTitle(content: string, filename: string): string {
    const basename = filename.split("/").pop() ?? "";
    const dotIndex = basename.lastIndexOf('.');
    const ext = dotIndex >= 0 ? basename.slice(dotIndex).toLowerCase() : "";

    const extractor = titleExtractors[ext];
    if (extractor) {
      const title = extractor(content);
      if (title) return title;
    }

    return basename.replace(/\.[^.]+$/, "") || filename;
  }
  925. // =============================================================================
  926. // Document indexing operations
  927. // =============================================================================
  /**
   * Store a content blob in the `content` table, keyed by its hash.
   * The statement is INSERT OR IGNORE, so an already-present hash is left as is.
   */
  export function insertContent(db: Database, hash: string, content: string, createdAt: string): void {
    const insert = db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`);
    insert.run(hash, content, createdAt);
  }
  /**
   * Add a row to `documents` for the given collection/path, marked active.
   */
  export function insertDocument(
    db: Database,
    collectionName: string,
    path: string,
    title: string,
    hash: string,
    createdAt: string,
    modifiedAt: string
  ): void {
    const insert = db.prepare(`
      INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active)
      VALUES (?, ?, ?, ?, ?, ?, 1)
    `);
    insert.run(collectionName, path, title, hash, createdAt, modifiedAt);
  }
  /**
   * Look up the active document at `path` within `collectionName`.
   *
   * @returns The row's id, hash and title, or the statement's empty result when nothing matches
   */
  export function findActiveDocument(
    db: Database,
    collectionName: string,
    path: string
  ): { id: number; hash: string; title: string } | null {
    const lookup = db.prepare(`
      SELECT id, hash, title FROM documents
      WHERE collection = ? AND path = ? AND active = 1
    `);
    const row = lookup.get(collectionName, path) as { id: number; hash: string; title: string } | null;
    return row;
  }
  /**
   * Set a document's title and modified_at, addressing the row by id.
   */
  export function updateDocumentTitle(
    db: Database,
    documentId: number,
    title: string,
    modifiedAt: string
  ): void {
    const update = db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`);
    update.run(title, modifiedAt, documentId);
  }
  /**
   * Set a document's title, content hash and modified_at, addressing the row
   * by id. Intended for content changes where the file path stays the same.
   */
  export function updateDocument(
    db: Database,
    documentId: number,
    title: string,
    hash: string,
    modifiedAt: string
  ): void {
    const update = db.prepare(`UPDATE documents SET title = ?, hash = ?, modified_at = ? WHERE id = ?`);
    update.run(title, hash, modifiedAt, documentId);
  }
  /**
   * Flag the active document at `path` in `collectionName` as inactive
   * (`active = 0`). The row itself is kept.
   */
  export function deactivateDocument(db: Database, collectionName: string, path: string): void {
    const deactivate = db.prepare(`UPDATE documents SET active = 0 WHERE collection = ? AND path = ? AND active = 1`);
    deactivate.run(collectionName, path);
  }
  /**
   * List the `path` of every active document in `collectionName`.
   */
  export function getActiveDocumentPaths(db: Database, collectionName: string): string[] {
    const query = db.prepare(`
      SELECT path FROM documents WHERE collection = ? AND active = 1
    `);
    const paths: string[] = [];
    for (const row of query.all(collectionName) as { path: string }[]) {
      paths.push(row.path);
    }
    return paths;
  }
  1008. export { formatQueryForEmbedding, formatDocForEmbedding };
  /**
   * Find a natural break point in the last 30% of `slice`.
   * Priority: paragraph break > sentence end > line break > space.
   * Returns the offset just past the break (relative to `slice`), or -1 if none.
   */
  function findChunkBreakOffset(slice: string): number {
    const searchStart = Math.floor(slice.length * 0.7);
    const tail = slice.slice(searchStart);

    const paragraphBreak = tail.lastIndexOf('\n\n');
    if (paragraphBreak >= 0) return searchStart + paragraphBreak + 2;

    const sentenceEnd = Math.max(
      ...['. ', '.\n', '? ', '?\n', '! ', '!\n'].map((mark) => tail.lastIndexOf(mark))
    );
    if (sentenceEnd >= 0) return searchStart + sentenceEnd + 2;

    const lineBreak = tail.lastIndexOf('\n');
    if (lineBreak >= 0) return searchStart + lineBreak + 1;

    const spaceBreak = tail.lastIndexOf(' ');
    if (spaceBreak >= 0) return searchStart + spaceBreak + 1;

    return -1;
  }

  /**
   * Split `content` into overlapping character-based chunks.
   *
   * Content no longer than `maxChars` is returned as a single chunk. Otherwise
   * each chunk spans at most `maxChars` characters and, unless it is the last
   * one, is cut at the best natural break found in its final 30%. The next
   * chunk starts `overlapChars` before the previous chunk's end; if that would
   * not move past the previous chunk's start, it starts at the previous end
   * instead (no overlap) so the loop always advances.
   *
   * @param content      Text to split
   * @param maxChars     Maximum characters per chunk (must be >= 1 when content needs splitting)
   * @param overlapChars Characters shared between consecutive chunks
   * @returns Chunks with their text and starting character position in `content`
   * @throws RangeError if `content` must be split but `maxChars` is not >= 1
   *         (previously this looped forever emitting empty chunks)
   */
  export function chunkDocument(content: string, maxChars: number = CHUNK_SIZE_CHARS, overlapChars: number = CHUNK_OVERLAP_CHARS): { text: string; pos: number }[] {
    if (content.length <= maxChars) {
      return [{ text: content, pos: 0 }];
    }
    if (!(maxChars >= 1)) {
      throw new RangeError(`chunkDocument: maxChars must be at least 1 (got ${maxChars})`);
    }

    const chunks: { text: string; pos: number }[] = [];
    let charPos = 0;

    while (charPos < content.length) {
      const chunkStart = charPos;
      // With maxChars >= 1 this is always beyond chunkStart, so progress is guaranteed.
      let endPos = Math.min(chunkStart + maxChars, content.length);

      // Not the final chunk: try to end on a natural boundary.
      if (endPos < content.length) {
        const breakOffset = findChunkBreakOffset(content.slice(chunkStart, endPos));
        if (breakOffset > 0) {
          endPos = chunkStart + breakOffset;
        }
      }

      chunks.push({ text: content.slice(chunkStart, endPos), pos: chunkStart });

      // The last chunk runs to the end of the content; nothing left to overlap.
      if (endPos >= content.length) {
        break;
      }

      // Step back by the overlap, but never to (or before) this chunk's start.
      charPos = endPos - overlapChars;
      if (charPos <= chunkStart) {
        charPos = endPos;
      }
    }

    return chunks;
  }
  /**
   * Chunk a document by token count using the tokenizer from getDefaultLlamaCpp().
   *
   * The content is tokenized once. If it fits in `maxTokens` it is returned as
   * a single chunk. Otherwise windows of up to `maxTokens` tokens are
   * detokenized, each starting `maxTokens - overlapTokens` tokens after the
   * previous one. Every window except the last is shortened to the best break
   * (paragraph > sentence > line) found in the final 30% of its text.
   *
   * `pos` is an approximation: token index times the document's average
   * characters per token. `tokens` is the size of the token window, measured
   * before the text is shortened at a break point.
   *
   * NOTE(review): when a window's text is shortened by more than the overlap
   * covers, the text between the break point and the next window's start is
   * not part of any chunk. Confirm whether this is acceptable for embedding.
   *
   * @param content       Text to split
   * @param maxTokens     Maximum tokens per window (must be >= 1 when splitting is needed)
   * @param overlapTokens Tokens shared between consecutive windows; when this
   *                      is not smaller than `maxTokens`, windows do not overlap
   * @throws RangeError if the content must be split but `maxTokens` is not >= 1
   */
  export async function chunkDocumentByTokens(
    content: string,
    maxTokens: number = CHUNK_SIZE_TOKENS,
    overlapTokens: number = CHUNK_OVERLAP_TOKENS
  ): Promise<{ text: string; pos: number; tokens: number }[]> {
    const llm = getDefaultLlamaCpp();

    // Tokenize once upfront
    const allTokens = await llm.tokenize(content);
    const totalTokens = allTokens.length;

    if (totalTokens <= maxTokens) {
      return [{ text: content, pos: 0, tokens: totalTokens }];
    }
    if (!(maxTokens >= 1)) {
      throw new RangeError(`chunkDocumentByTokens: maxTokens must be at least 1 (got ${maxTokens})`);
    }

    const chunks: { text: string; pos: number; tokens: number }[] = [];
    // A non-positive step would never advance (infinite loop). Mirror
    // chunkDocument's fallback and drop the overlap in that case.
    const overlappedStep = maxTokens - overlapTokens;
    const step = overlappedStep >= 1 ? overlappedStep : maxTokens;
    const avgCharsPerToken = content.length / totalTokens;

    let tokenPos = 0;
    while (tokenPos < totalTokens) {
      const chunkEnd = Math.min(tokenPos + maxTokens, totalTokens);
      const chunkTokens = allTokens.slice(tokenPos, chunkEnd);
      let chunkText = await llm.detokenize(chunkTokens);

      // Find a good break point if not at end of document
      if (chunkEnd < totalTokens) {
        const searchStart = Math.floor(chunkText.length * 0.7);
        const searchSlice = chunkText.slice(searchStart);

        // Offsets here are relative to searchSlice, not chunkText.
        let breakOffset = -1;
        const paragraphBreak = searchSlice.lastIndexOf('\n\n');
        if (paragraphBreak >= 0) {
          breakOffset = paragraphBreak + 2;
        } else {
          const sentenceEnd = Math.max(
            searchSlice.lastIndexOf('. '),
            searchSlice.lastIndexOf('.\n'),
            searchSlice.lastIndexOf('? '),
            searchSlice.lastIndexOf('?\n'),
            searchSlice.lastIndexOf('! '),
            searchSlice.lastIndexOf('!\n')
          );
          if (sentenceEnd >= 0) {
            breakOffset = sentenceEnd + 2;
          } else {
            const lineBreak = searchSlice.lastIndexOf('\n');
            if (lineBreak >= 0) {
              breakOffset = lineBreak + 1;
            }
          }
        }

        if (breakOffset >= 0) {
          chunkText = chunkText.slice(0, searchStart + breakOffset);
        }
      }

      // Approximate character position based on token position
      const charPos = Math.floor(tokenPos * avgCharsPerToken);
      chunks.push({ text: chunkText, pos: charPos, tokens: chunkTokens.length });

      // The window reached the end of the document: done.
      if (chunkEnd >= totalTokens) break;

      tokenPos += step;
    }

    return chunks;
  }
  1139. // =============================================================================
  1140. // Fuzzy matching
  1141. // =============================================================================
  /**
   * Levenshtein edit distance between `a` and `b` (insert, delete and
   * substitute each cost 1), compared by UTF-16 code unit.
   * Keeps only the previous and current DP rows.
   */
  function levenshtein(a: string, b: string): number {
    if (a.length === 0) return b.length;
    if (b.length === 0) return a.length;

    // Row 0: turning the empty prefix of `a` into b[0..j) takes j insertions.
    let previousRow: number[] = Array.from({ length: b.length + 1 }, (_, j) => j);

    for (let i = 1; i <= a.length; i++) {
      // Column 0: turning a[0..i) into the empty string takes i deletions.
      const currentRow: number[] = [i];
      for (let j = 1; j <= b.length; j++) {
        const substitution = previousRow[j - 1]! + (a[i - 1] === b[j - 1] ? 0 : 1);
        const deletion = previousRow[j]! + 1;
        const insertion = currentRow[j - 1]! + 1;
        currentRow.push(Math.min(deletion, insertion, substitution));
      }
      previousRow = currentRow;
    }

    return previousRow[b.length]!;
  }
  /**
   * Normalize user-supplied docid text: trim whitespace, remove one pair of
   * matching surrounding quotes (single or double), then remove one leading `#`.
   * Handles: "#abc123", 'abc123', "abc123", #abc123, abc123
   * No hex validation is performed here.
   */
  export function normalizeDocid(docid: string): string {
    const trimmed = docid.trim();

    const first = trimmed[0];
    const last = trimmed[trimmed.length - 1];
    const isQuoted = first === last && (first === '"' || first === "'");
    const unquoted = isQuoted ? trimmed.slice(1, -1) : trimmed;

    return unquoted.startsWith('#') ? unquoted.slice(1) : unquoted;
  }
  /**
   * Tell whether `input` looks like a docid reference.
   * Accepts: #abc123, abc123, "#abc123", "abc123", '#abc123', 'abc123'
   * True when the normalized form consists solely of 6 or more hex digits
   * (case-insensitive).
   */
  export function isDocid(input: string): boolean {
    const candidate = normalizeDocid(input);
    return /^[a-f0-9]{6,}$/i.test(candidate);
  }
  /**
   * Find an active document whose content hash starts with the given docid.
   *
   * The input is normalized with normalizeDocid(), so #abc123, abc123,
   * "#abc123" and "abc123" are all accepted. Input that is empty or contains
   * anything other than hex digits yields null without querying. This keeps
   * SQL LIKE wildcards (`%`, `_`) in user input from matching arbitrary
   * documents.
   *
   * If several documents share the prefix, whichever row the query returns
   * first is used (the query has no explicit ordering).
   *
   * @returns The document's virtual path (`qmd://collection/path`) and full hash, or null
   */
  export function findDocumentByDocid(db: Database, docid: string): { filepath: string; hash: string } | null {
    const shortHash = normalizeDocid(docid);
    // Hashes are hex, so any other character can only match via LIKE wildcards.
    if (!/^[a-f0-9]+$/i.test(shortHash)) return null;

    // Look up documents where hash starts with the short hash
    const doc = db.prepare(`
      SELECT 'qmd://' || d.collection || '/' || d.path as filepath, d.hash
      FROM documents d
      WHERE d.hash LIKE ? AND d.active = 1
      LIMIT 1
    `).get(`${shortHash}%`) as { filepath: string; hash: string } | null;

    return doc;
  }
  /**
   * Suggest active document paths that are close to `query`.
   *
   * Each active path is compared to the query case-insensitively using
   * levenshtein(); paths within `maxDistance` are returned closest first,
   * capped at `limit` entries.
   */
  export function findSimilarFiles(db: Database, query: string, maxDistance: number = 3, limit: number = 5): string[] {
    const rows = db.prepare(`
      SELECT d.path
      FROM documents d
      WHERE d.active = 1
    `).all() as { path: string }[];

    const needle = query.toLowerCase();
    const candidates: { path: string; dist: number }[] = [];
    for (const { path } of rows) {
      const dist = levenshtein(path.toLowerCase(), needle);
      if (dist <= maxDistance) {
        candidates.push({ path, dist });
      }
    }

    candidates.sort((x, y) => x.dist - y.dist);
    return candidates.slice(0, limit).map((candidate) => candidate.path);
  }
  /**
   * List active documents matching a glob pattern.
   *
   * The pattern is tested against each document's virtual path
   * (`qmd://collection/path`) and, failing that, its collection-relative path.
   *
   * @returns For each match: the virtual path as `filepath`, the relative path
   *          as `displayPath`, and SQLite's LENGTH() of the stored content as `bodyLength`
   */
  export function matchFilesByGlob(db: Database, pattern: string): { filepath: string; displayPath: string; bodyLength: number }[] {
    const rows = db.prepare(`
      SELECT
      'qmd://' || d.collection || '/' || d.path as virtual_path,
      LENGTH(content.doc) as body_length,
      d.path,
      d.collection
      FROM documents d
      JOIN content ON content.hash = d.hash
      WHERE d.active = 1
    `).all() as { virtual_path: string; body_length: number; path: string; collection: string }[];

    const glob = new Glob(pattern);
    const matches: { filepath: string; displayPath: string; bodyLength: number }[] = [];
    for (const row of rows) {
      if (!(glob.match(row.virtual_path) || glob.match(row.path))) {
        continue;
      }
      matches.push({
        filepath: row.virtual_path, // Virtual path for precise lookup
        displayPath: row.path, // Relative path for display
        bodyLength: row.body_length,
      });
    }
    return matches;
  }
  1242. // =============================================================================
  1243. // Context
  1244. // =============================================================================
  /**
   * Get context for a path using hierarchical inheritance.
   *
   * Contexts are collection-scoped and inherited from parent directories: a
   * context registered at "/talks" applies to "/talks/2024/keynote.md". A
   * prefix only matches on whole path segments, so "/talks" does NOT apply to
   * "/talks-old/a.md". The prefix "/" (or "") applies to every path.
   *
   * The result is the global context (if configured) followed by every
   * matching path context, ordered from the shortest (most general) prefix to
   * the longest, joined with blank lines.
   *
   * @param db Database instance (unused - kept for compatibility)
   * @param collectionName Collection name
   * @param path Relative path within the collection (leading "/" optional)
   * @returns Combined context string, or null if the collection is unknown or
   *          no context applies
   */
  export function getContextForPath(db: Database, collectionName: string, path: string): string | null {
    const config = collectionsLoadConfig();
    const coll = getCollection(collectionName);
    if (!coll) return null;

    // Collect ALL matching contexts (global + all path prefixes)
    const contexts: string[] = [];

    if (config.global_context) {
      contexts.push(config.global_context);
    }

    if (coll.context) {
      const normalizedPath = path.startsWith("/") ? path : `/${path}`;

      const matchingContexts: { prefix: string; context: string }[] = [];
      for (const [prefix, context] of Object.entries(coll.context)) {
        const normalizedPrefix = prefix.startsWith("/") ? prefix : `/${prefix}`;
        // Drop trailing slashes so "/talks/" and "/talks" behave alike; the
        // root prefix "/" becomes "" and matches everything.
        const base = normalizedPrefix.replace(/\/+$/, "");
        const applies =
          base === "" ||
          normalizedPath === base ||
          normalizedPath.startsWith(`${base}/`);
        if (applies) {
          matchingContexts.push({ prefix: normalizedPrefix, context });
        }
      }

      // Sort by prefix length (shortest/most general first)
      matchingContexts.sort((a, b) => a.prefix.length - b.prefix.length);

      for (const match of matchingContexts) {
        contexts.push(match.context);
      }
    }

    // Join all contexts with double newline
    return contexts.length > 0 ? contexts.join('\n\n') : null;
  }
  /**
   * Get context for a file path (virtual or filesystem).
   *
   * Resolves the owning collection and the collection-relative path, checks
   * that an active document exists for that pair, and then delegates the
   * context assembly to getContextForPath so both entry points share one
   * implementation.
   *
   * @param db Database handle used to verify the document exists
   * @param filepath Virtual path (`qmd://collection/path`) or a filesystem path inside a collection
   * @returns Joined context text, or null when the path is empty, cannot be
   *          mapped to a configured collection, has no active document, or
   *          has no applicable context
   */
  export function getContextForFile(db: Database, filepath: string): string | null {
    // Handle undefined or null filepath
    if (!filepath) return null;

    let collectionName: string | null = null;
    let relativePath: string | null = null;

    // Parse virtual path format: qmd://collection/path
    const parsedVirtual = filepath.startsWith('qmd://') ? parseVirtualPath(filepath) : null;
    if (parsedVirtual) {
      collectionName = parsedVirtual.collectionName;
      relativePath = parsedVirtual.path;
    } else {
      // Filesystem path: find which collection this path belongs to.
      // The collection list is only needed on this branch, so load it here.
      for (const coll of collectionsListCollections()) {
        // Skip collections with missing paths
        if (!coll || !coll.path) continue;

        const root = coll.path + '/';
        if (filepath.startsWith(root)) {
          collectionName = coll.name;
          relativePath = filepath.slice(root.length);
          break;
        }
        if (filepath === coll.path) {
          collectionName = coll.name;
          relativePath = '';
          break;
        }
      }
      if (!collectionName || relativePath === null) return null;
    }

    // Bail out before touching the database if the collection is not configured.
    if (!getCollection(collectionName)) return null;

    // Verify this document exists in the database
    const doc = db.prepare(`
      SELECT d.path
      FROM documents d
      WHERE d.collection = ? AND d.path = ? AND d.active = 1
      LIMIT 1
    `).get(collectionName, relativePath) as { path: string } | null;
    if (!doc) return null;

    // Global + hierarchical prefix contexts, most general first.
    return getContextForPath(db, collectionName, relativePath);
  }
  /**
   * Look up a single collection by name in the YAML config.
   *
   * @param db Database handle (not used by this function)
   * @param name Collection name
   * @returns The collection's name, root path (`pwd`) and glob pattern, or null if no such collection exists
   */
  export function getCollectionByName(db: Database, name: string): { name: string; pwd: string; glob_pattern: string } | null {
    const found = getCollection(name);
    return found
      ? { name: found.name, pwd: found.path, glob_pattern: found.pattern }
      : null;
  }
  /**
   * List all collections with document counts from database.
   * Merges YAML config with database statistics.
   *
   * @param db Database handle used for the per-collection statistics
   * @returns One entry per configured collection with total document count,
   *          active document count and the latest `modified_at` value
   *          (null when the collection has no documents)
   */
  export function listCollections(db: Database): { name: string; pwd: string; glob_pattern: string; doc_count: number; active_count: number; last_modified: string | null }[] {
    type Stats = { doc_count: number; active_count: number | null; last_modified: string | null };

    const collections = collectionsListCollections();

    // Prepare once and reuse: the statement is identical for every collection.
    const statsStmt = db.prepare(`
      SELECT
        COUNT(d.id) as doc_count,
        SUM(CASE WHEN d.active = 1 THEN 1 ELSE 0 END) as active_count,
        MAX(d.modified_at) as last_modified
      FROM documents d
      WHERE d.collection = ?
    `);

    return collections.map(coll => {
      const stats = statsStmt.get(coll.name) as Stats | null;
      return {
        name: coll.name,
        pwd: coll.path,
        glob_pattern: coll.pattern,
        doc_count: stats?.doc_count || 0,
        // SUM() yields NULL for a collection without rows; report that as 0.
        active_count: stats?.active_count || 0,
        last_modified: stats?.last_modified || null,
      };
    });
  }
  /**
   * Remove a collection: delete its documents, purge content rows that are no
   * longer referenced by any active document, then drop the collection from
   * the YAML config.
   *
   * @param db Database handle
   * @param collectionName Name of the collection to remove
   * @returns Number of document rows deleted and number of content rows cleaned up
   */
  export function removeCollection(db: Database, collectionName: string): { deletedDocs: number; cleanedHashes: number } {
    // Step 1: drop every document row belonging to the collection.
    const { changes: deletedDocs } = db
      .prepare(`DELETE FROM documents WHERE collection = ?`)
      .run(collectionName);

    // Step 2: drop content whose hash is not used by any active document.
    const { changes: cleanedHashes } = db.prepare(`
      DELETE FROM content
      WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
    `).run();

    // Step 3: remove from YAML config (returns true if found and removed).
    collectionsRemoveCollection(collectionName);

    return { deletedDocs, cleanedHashes };
  }
  /**
   * Rename a collection in both places that store its name: the `documents`
   * table and the YAML config. The database is updated first.
   *
   * @param db Database handle
   * @param oldName Current collection name
   * @param newName New collection name
   */
  export function renameCollection(db: Database, oldName: string, newName: string): void {
    // Re-label every document row that belongs to the old collection name.
    const relabelDocuments = db.prepare(`UPDATE documents SET collection = ? WHERE collection = ?`);
    relabelDocuments.run(newName, oldName);

    // Mirror the rename in the YAML config.
    collectionsRenameCollection(oldName, newName);
  }
  1427. // =============================================================================
  1428. // Context Management Operations
  1429. // =============================================================================
  /**
   * Insert or update a context for a collection (identified by numeric id)
   * and path prefix. The id is resolved to a name via the `collections`
   * table; the context itself is stored through collections.ts.
   *
   * @param db Database handle
   * @param collectionId Row id in the `collections` table
   * @param pathPrefix Path prefix the context applies to
   * @param context Context text
   * @throws Error when no collection row has the given id
   */
  export function insertContext(db: Database, collectionId: number, pathPrefix: string, context: string): void {
    const row = db
      .prepare(`SELECT name FROM collections WHERE id = ?`)
      .get(collectionId) as { name: string } | null;

    if (row) {
      collectionsAddContext(row.name, pathPrefix, context);
      return;
    }

    throw new Error(`Collection with id ${collectionId} not found`);
  }
  /**
   * Delete the context stored for one collection / path prefix pair.
   *
   * @param db Database handle (not used by this function)
   * @param collectionName Collection that owns the context
   * @param pathPrefix Path prefix whose context should be removed
   * @returns 1 if collections.ts reported a removal, otherwise 0
   */
  export function deleteContext(db: Database, collectionName: string, pathPrefix: string): number {
    return collectionsRemoveContext(collectionName, pathPrefix) ? 1 : 0;
  }
  /**
   * Delete all global contexts: the config-wide global context plus the root
   * context (empty path prefix) of every collection.
   *
   * The global context is only cleared, and only counted, when one is
   * actually set, so the return value reflects what was really removed.
   *
   * @param db Database handle (not used by this function)
   * @returns The number of contexts deleted
   */
  export function deleteGlobalContexts(db: Database): number {
    let deletedCount = 0;

    // Remove the global context, counting it only if it existed.
    if (collectionsLoadConfig().global_context != null) {
      setGlobalContext(undefined);
      deletedCount++;
    }

    // Remove root context (empty string) from all collections
    for (const coll of collectionsListCollections()) {
      if (collectionsRemoveContext(coll.name, '')) {
        deletedCount++;
      }
    }

    return deletedCount;
  }
  /**
   * List every context defined in the config as flat rows.
   *
   * Rows are ordered by collection name, then by path prefix length (longest
   * first), then alphabetically by path prefix.
   *
   * @param db Database handle (not used by this function)
   * @returns Rows of collection name, path prefix and context text
   */
  export function listPathContexts(db: Database): { collection_name: string; path_prefix: string; context: string }[] {
    type Row = { collection_name: string; path_prefix: string; context: string };

    const compareRows = (a: Row, b: Row): number => {
      // Primary key: collection name.
      if (a.collection_name !== b.collection_name) {
        return a.collection_name.localeCompare(b.collection_name);
      }
      // Secondary key: longer prefixes first; ties fall back to alphabetical order.
      const lengthDelta = b.path_prefix.length - a.path_prefix.length;
      return lengthDelta !== 0 ? lengthDelta : a.path_prefix.localeCompare(b.path_prefix);
    };

    const rows: Row[] = collectionsListAllContexts().map(ctx => ({
      collection_name: ctx.collection,
      path_prefix: ctx.path,
      context: ctx.context,
    }));

    return rows.sort(compareRows);
  }
  /**
   * List the names of all collections defined in the YAML config.
   *
   * @param db Database handle (not used by this function)
   * @returns One `{ name }` object per configured collection
   */
  export function getAllCollections(db: Database): { name: string }[] {
    const names: { name: string }[] = [];
    for (const { name } of collectionsListCollections()) {
      names.push({ name });
    }
    return names;
  }
  /**
   * Check which collections don't have any context defined.
   * Returns collections that have no context entries at all (not even root context).
   *
   * @param db Database handle used to count each collection's active documents
   * @returns Context-less collections with their root path and active
   *          document count, sorted by name
   */
  export function getCollectionsWithoutContext(db: Database): { name: string; pwd: string; doc_count: number }[] {
    // Get all collections from YAML config
    const yamlCollections = collectionsListCollections();

    // Prepare once and reuse: the statement is identical for every collection.
    const countStmt = db.prepare(`
      SELECT COUNT(d.id) as doc_count
      FROM documents d
      WHERE d.collection = ? AND d.active = 1
    `);

    return yamlCollections
      // Keep only collections with a missing or empty context map.
      .filter(coll => !coll.context || Object.keys(coll.context).length === 0)
      .map(coll => {
        const stats = countStmt.get(coll.name) as { doc_count: number } | null;
        return {
          name: coll.name,
          pwd: coll.path,
          doc_count: stats?.doc_count || 0,
        };
      })
      .sort((a, b) => a.name.localeCompare(b.name));
  }
  /**
   * Get top-level directories in a collection that don't have context.
   * Useful for suggesting where context might be needed.
   *
   * Context prefixes are compared with leading and trailing slashes removed,
   * so "talks", "/talks" and "/talks/" all cover the top-level directory
   * "talks", and both "" and "/" count as a root context covering everything.
   * This mirrors getContextForPath, which treats prefixes with and without a
   * leading slash as equivalent.
   *
   * @param db Database handle used to list the collection's active document paths
   * @param collectionName Collection to inspect
   * @returns Sorted names of top-level directories with no covering context;
   *          empty when the collection is unknown or has a root context
   */
  export function getTopLevelPathsWithoutContext(db: Database, collectionName: string): string[] {
    // Unknown collection: nothing to report, and no need to query the database.
    const yamlColl = getCollection(collectionName);
    if (!yamlColl) return [];

    const stripSlashes = (prefix: string): string => prefix.replace(/^\/+|\/+$/g, '');
    const contextPrefixes = new Set<string>(
      yamlColl.context ? Object.keys(yamlColl.context).map(stripSlashes) : []
    );

    // A root context applies to every directory in the collection.
    if (contextPrefixes.has('')) return [];

    // Get all paths in the collection from database
    const paths = db.prepare(`
      SELECT DISTINCT path FROM documents
      WHERE collection = ? AND active = 1
    `).all(collectionName) as { path: string }[];

    // Extract top-level directories (first path component). Files that sit
    // directly in the collection root have a single component and are skipped.
    const topLevelDirs = new Set<string>();
    for (const { path } of paths) {
      const parts = path.split('/').filter(Boolean);
      const [first] = parts;
      if (parts.length > 1 && first) topLevelDirs.add(first);
    }

    // A top-level directory is covered only by a context on that directory
    // itself (the root case was handled above).
    return Array.from(topLevelDirs)
      .filter(dir => !contextPrefixes.has(dir))
      .sort();
  }
  1573. // =============================================================================
  1574. // FTS Search
  1575. // =============================================================================
  /**
   * Reduce a search term to characters that are safe inside a quoted FTS5
   * token: Unicode letters, Unicode digits and apostrophes. Everything else
   * is dropped and the result is lower-cased.
   *
   * @param term Raw whitespace-delimited term
   * @returns Sanitized, lower-cased term (may be empty)
   */
  function sanitizeFTS5Term(term: string): string {
    const stripped = term.replace(/[^\p{L}\p{N}']/gu, '');
    return stripped.toLowerCase();
  }
  /**
   * Turn free-form user input into an FTS5 MATCH expression.
   *
   * The input is split on whitespace, each term is sanitized, and every
   * surviving term becomes a quoted prefix match (`"term"*`). Multiple terms
   * are combined with AND.
   *
   * @param query Raw user query
   * @returns The FTS5 expression, or null when no usable term remains
   */
  function buildFTS5Query(query: string): string | null {
    const clauses: string[] = [];
    for (const rawTerm of query.split(/\s+/)) {
      const term = sanitizeFTS5Term(rawTerm);
      if (term.length > 0) {
        clauses.push(`"${term}"*`);
      }
    }
    // Joining a single clause yields that clause unchanged.
    return clauses.length === 0 ? null : clauses.join(' AND ');
  }
  /**
   * Full-text search over active documents using the FTS5 index.
   *
   * @param db Database handle
   * @param query Free-form user query; sanitized into an FTS5 prefix/AND expression
   * @param limit Maximum number of results (default 20)
   * @param collectionId Legacy filter; when truthy it is stringified and
   *        compared against the document's collection name
   * @returns Results ordered best-first, each with a score in [0, 1) where
   *          higher is better; empty when the query has no usable terms
   */
  export function searchFTS(db: Database, query: string, limit: number = 20, collectionId?: number): SearchResult[] {
    const ftsQuery = buildFTS5Query(query);
    if (!ftsQuery) return [];

    let sql = `
      SELECT
        'qmd://' || d.collection || '/' || d.path as filepath,
        d.collection || '/' || d.path as display_path,
        d.title,
        content.doc as body,
        d.hash,
        bm25(documents_fts, 10.0, 1.0) as bm25_score
      FROM documents_fts f
      JOIN documents d ON d.id = f.rowid
      JOIN content ON content.hash = d.hash
      WHERE documents_fts MATCH ? AND d.active = 1
    `;
    const params: (string | number)[] = [ftsQuery];

    if (collectionId) {
      // Note: collectionId is a legacy parameter that should be phased out
      // Collections are now managed in YAML. For now, we interpret it as a collection name filter.
      // This code path is likely unused as collection filtering should be done at CLI level.
      sql += ` AND d.collection = ?`;
      params.push(String(collectionId));
    }

    // bm25 lower is better; sort ascending.
    sql += ` ORDER BY bm25_score ASC LIMIT ?`;
    params.push(limit);

    const rows = db.prepare(sql).all(...params) as { filepath: string; display_path: string; title: string; body: string; hash: string; bm25_score: number }[];

    return rows.map(row => {
      const collectionName = row.filepath.split('//')[1]?.split('/')[0] || "";

      // FTS5 multiplies BM25 by -1, so bm25() is <= 0 and a more negative
      // value is a stronger match. Clamping with Math.max(0, ...) therefore
      // collapsed every hit to the same score of 1. Map the magnitude into
      // [0, 1) instead: |x| / (1 + |x|) grows with match strength and needs
      // no per-query normalization, so "strong signal" heuristics can work.
      const strength = Math.abs(row.bm25_score);
      const score = strength / (1 + strength);

      return {
        filepath: row.filepath,
        displayPath: row.display_path,
        title: row.title,
        hash: row.hash,
        docid: getDocid(row.hash),
        collectionName,
        modifiedAt: "", // Not available in FTS query
        bodyLength: row.body.length,
        body: row.body,
        context: getContextForFile(db, row.filepath),
        score,
        source: "fts" as const,
      };
    });
  }
  1636. // =============================================================================
  1637. // Vector Search
  1638. // =============================================================================
  /**
   * Semantic search: embed the query, find the nearest stored chunk vectors,
   * and return the best-matching chunk per document.
   *
   * @param db Database handle
   * @param query Query text to embed
   * @param model Embedding model name passed to the embedder
   * @param limit Maximum number of documents to return (default 20)
   * @param collectionName Optional collection name to restrict results to
   * @returns Up to `limit` results ordered by ascending vector distance, with
   *          `score = 1 - distance`; empty when the vector table is missing,
   *          the query cannot be embedded, or nothing matches
   */
  export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string): Promise<SearchResult[]> {
    const vecTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (!vecTable) return [];

    const queryEmbedding = await getEmbedding(query, model, true);
    if (!queryEmbedding) return [];

    // IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables
    // hang indefinitely when combined with JOINs in the same query. Do NOT try to
    // "optimize" this by combining into a single query with JOINs - it will break.
    // See: https://github.com/tobi/qmd/pull/23

    // Step 1: nearest-neighbour lookup against sqlite-vec alone (no JOINs allowed).
    // Over-fetch (3x) because several chunks can belong to the same document.
    const neighbours = db.prepare(`
      SELECT hash_seq, distance
      FROM vectors_vec
      WHERE embedding MATCH ? AND k = ?
    `).all(new Float32Array(queryEmbedding), limit * 3) as { hash_seq: string; distance: number }[];
    if (neighbours.length === 0) return [];

    // Step 2: resolve the matched chunks to their documents.
    const chunkKeys: string[] = [];
    const distanceByChunk = new Map<string, number>();
    for (const { hash_seq, distance } of neighbours) {
      chunkKeys.push(hash_seq);
      distanceByChunk.set(hash_seq, distance);
    }

    const placeholders = chunkKeys.map(() => '?').join(',');
    let docSql = `
      SELECT
        cv.hash || '_' || cv.seq as hash_seq,
        cv.hash,
        cv.pos,
        'qmd://' || d.collection || '/' || d.path as filepath,
        d.collection || '/' || d.path as display_path,
        d.title,
        content.doc as body
      FROM content_vectors cv
      JOIN documents d ON d.hash = cv.hash AND d.active = 1
      JOIN content ON content.hash = d.hash
      WHERE cv.hash || '_' || cv.seq IN (${placeholders})
    `;
    const params: string[] = [...chunkKeys];
    if (collectionName) {
      docSql += ` AND d.collection = ?`;
      params.push(collectionName);
    }

    type DocRow = {
      hash_seq: string; hash: string; pos: number; filepath: string;
      display_path: string; title: string; body: string;
    };
    const docRows = db.prepare(docSql).all(...params) as DocRow[];

    // Keep only the closest chunk for each document (keyed by filepath).
    const bestByFile = new Map<string, { row: DocRow; bestDist: number }>();
    for (const row of docRows) {
      const dist = distanceByChunk.get(row.hash_seq) ?? 1;
      const current = bestByFile.get(row.filepath);
      if (current && dist >= current.bestDist) continue;
      bestByFile.set(row.filepath, { row, bestDist: dist });
    }

    const ranked = Array.from(bestByFile.values());
    ranked.sort((a, b) => a.bestDist - b.bestDist);

    return ranked.slice(0, limit).map(({ row, bestDist }) => {
      const owningCollection = row.filepath.split('//')[1]?.split('/')[0] || "";
      return {
        filepath: row.filepath,
        displayPath: row.display_path,
        title: row.title,
        hash: row.hash,
        docid: getDocid(row.hash),
        collectionName: owningCollection,
        modifiedAt: "", // Not available in vec query
        bodyLength: row.body.length,
        body: row.body,
        context: getContextForFile(db, row.filepath),
        score: 1 - bestDist, // Cosine similarity = 1 - cosine distance
        source: "vec" as const,
        chunkPos: row.pos,
      };
    });
  }
  1714. // =============================================================================
  1715. // Embeddings
  1716. // =============================================================================
  /**
   * Embed a piece of text with the default LlamaCpp instance.
   *
   * @param text Raw text to embed
   * @param model Embedding model name, forwarded to the embedder
   * @param isQuery True to format the text as a query, false to format it as a document
   * @returns The embedding vector, or null when the embedder returns no embedding
   */
  async function getEmbedding(text: string, model: string, isQuery: boolean): Promise<number[] | null> {
    const llm = getDefaultLlamaCpp();

    // Queries and documents use different prompt templates.
    const prompt = isQuery
      ? formatQueryForEmbedding(text)
      : formatDocForEmbedding(text);

    const response = await llm.embed(prompt, { model, isQuery });
    return response?.embedding || null;
  }
  /**
   * List the content hashes of active documents that have no embedding yet
   * (no `content_vectors` row with seq 0 for the hash).
   *
   * @param db Database handle
   * @returns One row per hash: the hash, the document body, and the
   *          alphabetically smallest path using that hash (for display)
   */
  export function getHashesForEmbedding(db: Database): { hash: string; body: string; path: string }[] {
    type PendingRow = { hash: string; body: string; path: string };

    const pendingStmt = db.prepare(`
      SELECT d.hash, c.doc as body, MIN(d.path) as path
      FROM documents d
      JOIN content c ON d.hash = c.hash
      LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
      WHERE d.active = 1 AND v.hash IS NULL
      GROUP BY d.hash
    `);

    return pendingStmt.all() as PendingRow[];
  }
  /**
   * Wipe every stored embedding so the index can be rebuilt from scratch:
   * empties `content_vectors` and drops the `vectors_vec` table if present.
   *
   * @param db Database handle
   */
  export function clearAllEmbeddings(db: Database): void {
    const statements = [
      `DELETE FROM content_vectors`,
      `DROP TABLE IF EXISTS vectors_vec`,
    ];
    for (const statement of statements) {
      db.exec(statement);
    }
  }
  /**
   * Store one chunk embedding. The vector goes into `vectors_vec` under the
   * key `<hash>_<seq>`, and the chunk metadata goes into `content_vectors`.
   * Both writes use INSERT OR REPLACE.
   *
   * @param db Database handle
   * @param hash Content hash the chunk belongs to
   * @param seq Sequence number of the chunk within the content
   * @param pos Position value stored alongside the chunk
   * @param embedding Embedding vector
   * @param model Name of the model that produced the embedding
   * @param embeddedAt Timestamp string recorded for the embedding
   */
  export function insertEmbedding(
    db: Database,
    hash: string,
    seq: number,
    pos: number,
    embedding: Float32Array,
    model: string,
    embeddedAt: string
  ): void {
    const vectorKey = `${hash}_${seq}`;

    const vectorInsert = db.prepare(`INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
    const metadataInsert = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`);

    // Vector first, then the metadata row that marks the chunk as embedded.
    vectorInsert.run(vectorKey, embedding);
    metadataInsert.run(hash, seq, pos, model, embeddedAt);
  }
  1765. // =============================================================================
  1766. // Query expansion
  1767. // =============================================================================
  /**
   * Expand a query into alternative phrasings, with caching.
   *
   * On a cache hit the original query is returned followed by at most two
   * cached expansions. On a miss the LLM is asked for expansions, the ones
   * that differ from the original query are cached, and the de-duplicated
   * list (original query first) is returned.
   *
   * @param query Original user query
   * @param model Model name; only used as part of the cache key
   * @param db Database handle backing the cache
   * @returns The original query followed by its expansions
   */
  export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database): Promise<string[]> {
    const cacheKey = getCacheKey("expandQuery", { query, model });

    // Cache hit: one expansion per line, blank lines ignored.
    const cachedText = getCachedResult(db, cacheKey);
    if (cachedText) {
      const cachedExpansions: string[] = [];
      for (const line of cachedText.split('\n')) {
        const trimmed = line.trim();
        if (trimmed.length > 0) cachedExpansions.push(trimmed);
      }
      return [query, ...cachedExpansions.slice(0, 2)];
    }

    // Note: LlamaCpp uses hardcoded model, model parameter is ignored
    const expansions = await getDefaultLlamaCpp().expandQuery(query);
    const expansionTexts = expansions.map(expansion => expansion.text);

    // Cache only the genuinely new phrasings (the original is always re-added).
    const novelTexts = expansionTexts.filter(text => text !== query);
    if (novelTexts.length > 0) {
      setCachedResult(db, cacheKey, novelTexts.join('\n'));
    }

    const unique = new Set<string>([query, ...expansionTexts]);
    return Array.from(unique);
  }
  1787. // =============================================================================
  1788. // Reranking
  1789. // =============================================================================
  /**
   * Score documents against a query with the reranker, using a per-document
   * cache keyed by query, file and model.
   *
   * @param query Query text
   * @param documents Candidate documents (`file` identifier plus `text` to score)
   * @param model Reranker model name (part of the cache key and passed to the reranker)
   * @param db Database handle backing the cache
   * @returns One `{ file, score }` per input document, highest score first;
   *          documents without a usable score get 0
   */
  export async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db: Database): Promise<{ file: string; score: number }[]> {
    const scoreByFile: Map<string, number> = new Map();
    const pending: RerankDocument[] = [];
    const cacheKeyFor = (file: string) => getCacheKey("rerank", { query, file, model });

    // Split the input into cache hits and documents that still need scoring.
    for (const { file, text } of documents) {
      const cachedScore = getCachedResult(db, cacheKeyFor(file));
      if (cachedScore === null) {
        pending.push({ file, text });
        continue;
      }
      scoreByFile.set(file, parseFloat(cachedScore));
    }

    // Score the cache misses with LlamaCpp and remember the results.
    if (pending.length > 0) {
      const { results } = await getDefaultLlamaCpp().rerank(query, pending, { model });
      for (const { file, score } of results) {
        setCachedResult(db, cacheKeyFor(file), score.toString());
        scoreByFile.set(file, score);
      }
    }

    const ranked = documents.map(({ file }) => ({ file, score: scoreByFile.get(file) || 0 }));
    ranked.sort((a, b) => b.score - a.score);
    return ranked;
  }
  1819. // =============================================================================
  1820. // Reciprocal Rank Fusion
  1821. // =============================================================================
  /**
   * Merge several ranked result lists with Reciprocal Rank Fusion.
   *
   * Each appearance of a file at zero-based position `rank` in list `i`
   * contributes `weights[i] / (k + rank + 1)` to that file's fused score
   * (weight defaults to 1). Afterwards a bonus is added based on the best
   * position the file reached in any list: +0.05 for first place, +0.02 for
   * second or third.
   *
   * @param resultLists Ranked lists, best result first
   * @param weights Optional per-list weights, matched by index
   * @param k RRF damping constant (default 60)
   * @returns One entry per distinct file (taken from its first occurrence)
   *          with `score` replaced by the fused score, highest first
   */
  export function reciprocalRankFusion(
    resultLists: RankedResult[][],
    weights: number[] = [],
    k: number = 60
  ): RankedResult[] {
    const fused = new Map<string, { result: RankedResult; rrfScore: number; topRank: number }>();

    resultLists.forEach((list, listIdx) => {
      if (!list) return;
      const weight = weights[listIdx] ?? 1.0;

      list.forEach((result, rank) => {
        if (!result) return;
        const contribution = weight / (k + rank + 1);
        const entry = fused.get(result.file);
        if (!entry) {
          fused.set(result.file, { result, rrfScore: contribution, topRank: rank });
          return;
        }
        entry.rrfScore += contribution;
        entry.topRank = Math.min(entry.topRank, rank);
      });
    });

    // Top-rank bonus
    for (const entry of fused.values()) {
      if (entry.topRank > 2) continue;
      entry.rrfScore += entry.topRank === 0 ? 0.05 : 0.02;
    }

    const ordered = Array.from(fused.values());
    ordered.sort((a, b) => b.rrfScore - a.rrfScore);
    return ordered.map(entry => ({ ...entry.result, score: entry.rrfScore }));
  }
  1861. // =============================================================================
  1862. // Document retrieval
  1863. // =============================================================================
  /**
   * Row shape used when casting document lookup query results.
   * Column names match the SQL aliases used by the lookup queries.
   */
  type DbDocRow = {
    /** `qmd://<collection>/<path>` */
    virtual_path: string;
    /** `<collection>/<path>` */
    display_path: string;
    /** Document title column */
    title: string;
    /** Content hash; joins `documents` to `content` */
    hash: string;
    /** Name of the collection the document belongs to */
    collection: string;
    /** Collection-relative path. NOTE(review): only present when the query selects `d.path` — confirm per call site */
    path: string;
    /** Value of the `modified_at` column */
    modified_at: string;
    /** `LENGTH(content.doc)` */
    body_length: number;
    /** Document body; only present when the query selects it */
    body?: string;
  };
  /**
   * Find a document by filename/path, docid (#hash), or with fuzzy matching.
   * Returns document metadata without body by default.
   *
   * Supports:
   * - Virtual paths: qmd://collection/path/to/file.md
   * - Absolute paths: /path/to/file.md
   * - Relative paths: path/to/file.md
   * - Short docid: #abc123 (first 6 chars of hash)
   *
   * A trailing `:<digits>` suffix (e.g. a line number) is ignored, and a
   * leading `~/` is expanded to the home directory.
   *
   * Lookup order: exact virtual path, then virtual-path suffix match, then
   * (for non-virtual input) each collection's root / relative path.
   *
   * @param db Database handle
   * @param filename User-supplied identifier
   * @param options.includeBody When true, the document body is included in the result
   * @returns The document, or a `not_found` object carrying similar file
   *          suggestions (none for a failed docid lookup)
   */
  export function findDocument(db: Database, filename: string, options: { includeBody?: boolean } = {}): DocumentResult | DocumentNotFound {
    let filepath = filename;

    // Drop a trailing ":<line>" suffix.
    const colonMatch = filepath.match(/:(\d+)$/);
    if (colonMatch) {
      filepath = filepath.slice(0, -colonMatch[0].length);
    }

    // Check if this is a docid lookup (#abc123, abc123, "#abc123", "abc123", etc.)
    if (isDocid(filepath)) {
      const docidMatch = findDocumentByDocid(db, filepath);
      if (!docidMatch) {
        return { error: "not_found", query: filename, similarFiles: [] };
      }
      filepath = docidMatch.filepath;
    }

    if (filepath.startsWith('~/')) {
      filepath = homedir() + filepath.slice(1);
    }

    const bodyCol = options.includeBody ? `, content.doc as body` : ``;

    // Build computed columns
    // Note: absoluteFilepath is computed from YAML collections after query
    const selectFrom = `
      SELECT
        'qmd://' || d.collection || '/' || d.path as virtual_path,
        d.collection || '/' || d.path as display_path,
        d.title,
        d.hash,
        d.collection,
        d.path,
        d.modified_at,
        LENGTH(content.doc) as body_length
        ${bodyCol}
      FROM documents d
      JOIN content ON content.hash = d.hash
    `;

    // Try to match by virtual path first
    let doc = db.prepare(`
      ${selectFrom}
      WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
    `).get(filepath) as DbDocRow | null;

    // Try fuzzy (suffix) match by virtual path. LIKE wildcards in the input
    // are escaped so that "_" and "%" in file names are matched literally.
    if (!doc) {
      const literalSuffix = filepath.replace(/[\\%_]/g, '\\$&');
      doc = db.prepare(`
        ${selectFrom}
        WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? ESCAPE '\\' AND d.active = 1
        LIMIT 1
      `).get(`%${literalSuffix}`) as DbDocRow | null;
    }

    // Try to match by absolute path (requires looking up collection paths from YAML)
    if (!doc && !filepath.startsWith('qmd://')) {
      const byCollectionAndPath = db.prepare(`
        ${selectFrom}
        WHERE d.collection = ? AND d.path = ? AND d.active = 1
      `);

      for (const coll of collectionsListCollections()) {
        let relativePath: string | null = null;
        if (filepath.startsWith(coll.path + '/')) {
          // Absolute path inside this collection: strip the collection root.
          relativePath = filepath.slice(coll.path.length + 1);
        } else if (!filepath.startsWith('/')) {
          // Otherwise treat filepath as relative to collection
          relativePath = filepath;
        }
        if (!relativePath) continue;

        doc = byCollectionAndPath.get(coll.name, relativePath) as DbDocRow | null;
        if (doc) break;
      }
    }

    if (!doc) {
      const similar = findSimilarFiles(db, filepath, 5, 5);
      return { error: "not_found", query: filename, similarFiles: similar };
    }

    // Get context using virtual path. The fallback rebuilds it from the
    // collection and the collection-relative path (display_path already
    // contains the collection name and must not be used here).
    const virtualPath = doc.virtual_path || `qmd://${doc.collection}/${doc.path}`;
    const context = getContextForFile(db, virtualPath);

    return {
      filepath: virtualPath,
      displayPath: doc.display_path,
      title: doc.title,
      context,
      hash: doc.hash,
      docid: getDocid(doc.hash),
      collectionName: doc.collection,
      modifiedAt: doc.modified_at,
      bodyLength: doc.body_length,
      ...(options.includeBody && doc.body !== undefined && { body: doc.body }),
    };
  }
  /**
   * Get the body content for a document, optionally sliced to a line range.
   *
   * Resolution order:
   *   1. If `doc.filepath` starts with `qmd://`, look it up as a virtual path.
   *   2. Otherwise (or if step 1 found nothing), treat it as an absolute path and
   *      match it against the root path of every collection returned by
   *      collectionsListCollections().
   * Only rows with `active = 1` are considered.
   *
   * @param db - Database handle used to run the lookup queries.
   * @param doc - A document result, or any object carrying the `filepath` to resolve.
   * @param fromLine - 1-indexed first line to return. Omitted, zero and negative
   *   values all start at the first line.
   * @param maxLines - Maximum number of lines to return. Negative values yield an
   *   empty result instead of counting back from the end of the document.
   * @returns The (possibly sliced) body text, or null when no document matches.
   */
  export function getDocumentBody(db: Database, doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number): string | null {
    const filepath = doc.filepath;
    let row: { body: string } | null = null;

    // Try virtual path first
    if (filepath.startsWith('qmd://')) {
      row = db.prepare(`
        SELECT content.doc as body
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
      `).get(filepath) as { body: string } | null;
    }

    // Try absolute path by looking up in YAML collections
    if (!row) {
      for (const coll of collectionsListCollections()) {
        if (!filepath.startsWith(coll.path + '/')) continue;
        const relativePath = filepath.slice(coll.path.length + 1);
        row = db.prepare(`
          SELECT content.doc as body
          FROM documents d
          JOIN content ON content.hash = d.hash
          WHERE d.collection = ? AND d.path = ? AND d.active = 1
        `).get(coll.name, relativePath) as { body: string } | null;
        if (row) break;
      }
    }

    if (!row) return null;
    if (fromLine === undefined && maxLines === undefined) return row.body;

    const lines = row.body.split('\n');
    // Clamp to 0: Array.prototype.slice interprets negative indexes as offsets
    // from the end, which would silently return the wrong part of the document.
    const start = Math.max(0, (fromLine || 1) - 1);
    const end = maxLines !== undefined ? start + Math.max(0, maxLines) : lines.length;
    return lines.slice(start, end).join('\n');
  }
  /**
   * Find multiple documents by glob pattern or comma-separated list.
   * Returns documents without body by default (use getDocumentBody to load).
   *
   * A pattern is treated as a comma-separated list when it contains a comma and
   * neither `*` nor `?`; each name is matched first as an exact virtual path and
   * then as a literal suffix of the virtual path. Any other pattern is handed to
   * matchFilesByGlob(). Only rows with `active = 1` are returned.
   *
   * @param db - Database handle used to run the lookup queries.
   * @param pattern - Glob pattern or comma-separated list of names.
   * @param options.includeBody - When true, select the body and include it in each result.
   * @param options.maxBytes - Size limit compared against SQL `LENGTH(content.doc)`;
   *   defaults to DEFAULT_MULTI_GET_MAX_BYTES. Larger documents are reported as
   *   skipped entries with a `skipReason`.
   * @returns `docs` (one entry per matched row, skipped or not) and `errors`
   *   (one message per name that was not found, or a single message when a glob
   *   matched nothing).
   */
  export function findDocuments(
    db: Database,
    pattern: string,
    options: { includeBody?: boolean; maxBytes?: number } = {}
  ): { docs: MultiGetResult[]; errors: string[] } {
    const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?');
    const errors: string[] = [];
    const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES;

    const bodyCol = options.includeBody ? `, content.doc as body` : ``;
    const selectCols = `
      'qmd://' || d.collection || '/' || d.path as virtual_path,
      d.collection || '/' || d.path as display_path,
      d.title,
      d.hash,
      d.collection,
      d.modified_at,
      LENGTH(content.doc) as body_length
      ${bodyCol}
    `;

    let fileRows: DbDocRow[];

    if (isCommaSeparated) {
      const names = pattern.split(',').map(s => s.trim()).filter(Boolean);

      // Prepare both lookups once instead of once per requested name.
      const exactStmt = db.prepare(`
        SELECT ${selectCols}
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
      `);
      // ESCAPE lets us match '%' and '_' in file names literally.
      const suffixStmt = db.prepare(`
        SELECT ${selectCols}
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? ESCAPE '\\' AND d.active = 1
        LIMIT 1
      `);

      fileRows = [];
      for (const name of names) {
        let doc = exactStmt.get(name) as DbDocRow | null;
        if (!doc) {
          // Without escaping, "my_file.md" would also match "myXfile.md".
          const literalName = name.replace(/[\\%_]/g, '\\$&');
          doc = suffixStmt.get(`%${literalName}`) as DbDocRow | null;
        }

        if (doc) {
          fileRows.push(doc);
          continue;
        }

        const similar = findSimilarFiles(db, name, 5, 3);
        let msg = `File not found: ${name}`;
        if (similar.length > 0) {
          msg += ` (did you mean: ${similar.join(', ')}?)`;
        }
        errors.push(msg);
      }
    } else {
      // Glob pattern match
      const matched = matchFilesByGlob(db, pattern);
      if (matched.length === 0) {
        errors.push(`No files matched pattern: ${pattern}`);
        return { docs: [], errors };
      }
      const virtualPaths = matched.map(m => m.filepath);
      const placeholders = virtualPaths.map(() => '?').join(',');
      fileRows = db.prepare(`
        SELECT ${selectCols}
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path IN (${placeholders}) AND d.active = 1
      `).all(...virtualPaths) as DbDocRow[];
    }

    const results: MultiGetResult[] = [];
    for (const row of fileRows) {
      const virtualPath = row.virtual_path || `qmd://${row.collection}/${row.display_path}`;

      if (row.body_length > maxBytes) {
        results.push({
          doc: { filepath: virtualPath, displayPath: row.display_path },
          skipped: true,
          skipReason: `File too large (${Math.round(row.body_length / 1024)}KB > ${Math.round(maxBytes / 1024)}KB)`,
        });
        continue;
      }

      // Context is only needed for documents returned in full, so look it up
      // after the size check rather than for every row.
      const context = getContextForFile(db, virtualPath);

      results.push({
        doc: {
          filepath: virtualPath,
          displayPath: row.display_path,
          title: row.title || row.display_path.split('/').pop() || row.display_path,
          context,
          hash: row.hash,
          docid: getDocid(row.hash),
          collectionName: row.collection,
          modifiedAt: row.modified_at,
          bodyLength: row.body_length,
          ...(options.includeBody && row.body !== undefined && { body: row.body }),
        },
        skipped: false,
      });
    }

    return { docs: results, errors };
  }
  2121. // =============================================================================
  2122. // Status
  2123. // =============================================================================
  /**
   * Summarise the state of the index.
   *
   * For every collection returned by collectionsListCollections(), counts its
   * active documents and reads the newest `modified_at`; a collection with no
   * such timestamp is stamped with the current time. Collections are ordered
   * most-recently-updated first. Also reports the total number of active
   * documents, the value of getHashesNeedingEmbedding(), and whether a
   * `vectors_vec` table exists.
   *
   * @param db - Database handle to query.
   * @returns The assembled IndexStatus.
   */
  export function getStatus(db: Database): IndexStatus {
    const toMillis = (iso: string): number => new Date(iso).getTime();

    // One stats query per configured collection
    const collections = collectionsListCollections().map(({ name, path, pattern }) => {
      const { active_count, last_doc_update } = db.prepare(`
        SELECT
          COUNT(*) as active_count,
          MAX(modified_at) as last_doc_update
        FROM documents
        WHERE collection = ? AND active = 1
      `).get(name) as { active_count: number; last_doc_update: string | null };

      return {
        name,
        path,
        pattern,
        documents: active_count,
        lastUpdated: last_doc_update || new Date().toISOString(),
      };
    });

    // Most recently updated collections come first
    collections.sort((left, right) => {
      if (!left.lastUpdated) return 1;
      if (!right.lastUpdated) return -1;
      return toMillis(right.lastUpdated) - toMillis(left.lastUpdated);
    });

    const totalRow = db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get() as { c: number };
    const totalDocuments = totalRow.c;
    const needsEmbedding = getHashesNeedingEmbedding(db);
    const vectorTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    const hasVectorIndex = Boolean(vectorTable);

    return { totalDocuments, needsEmbedding, hasVectorIndex, collections };
  }
  2160. // =============================================================================
  2161. // Snippet extraction
  2162. // =============================================================================
  /**
   * Result of extractSnippet(): the snippet text plus where it sits in the document.
   * All line counts refer to the full document body split on '\n'.
   */
  export type SnippetResult = {
    line: number; // 1-indexed line number of best match
    snippet: string; // The snippet text with diff-style header
    linesBefore: number; // Lines in document before snippet
    linesAfter: number; // Lines in document after snippet
    snippetLines: number; // Number of lines in snippet
  };
  /**
   * Pick the line of `body` that contains the most query terms and return it with
   * one line of leading and two lines of trailing context, prefixed by a
   * diff-style header.
   *
   * @param body - Full document text.
   * @param query - Search query; split on whitespace and matched case-insensitively
   *   as substrings of each line.
   * @param maxLen - Maximum snippet text length; longer text is cut and ends in "...".
   * @param chunkPos - Optional character offset. When positive, only the window
   *   from 100 characters before it to `maxLen + 100` characters after it is
   *   searched; if that yields a blank snippet the whole body is searched instead.
   * @returns Snippet text with header, the 1-indexed best line, and line counts
   *   relative to the whole document.
   */
  export function extractSnippet(body: string, query: string, maxLen = 500, chunkPos?: number): SnippetResult {
    const documentLineCount = body.split('\n').length;
    const windowed = Boolean(chunkPos && chunkPos > 0);

    // Narrow the search to a character window around the chunk, remembering how
    // many full lines precede the window so line numbers stay document-relative.
    let haystack = body;
    let skippedLines = 0;
    if (chunkPos && chunkPos > 0) {
      const windowStart = Math.max(0, chunkPos - 100);
      const windowEnd = Math.min(body.length, chunkPos + maxLen + 100);
      haystack = body.slice(windowStart, windowEnd);
      skippedLines = windowStart > 0 ? body.slice(0, windowStart).split('\n').length - 1 : 0;
    }

    const candidates = haystack.split('\n');
    const terms = query.toLowerCase().split(/\s+/).filter(Boolean);

    // First line with the highest number of matching terms wins.
    let bestIndex = 0;
    let bestHits = -1;
    candidates.forEach((text, index) => {
      const lowered = text.toLowerCase();
      const hits = terms.reduce((count, term) => count + (lowered.includes(term) ? 1 : 0), 0);
      if (hits > bestHits) {
        bestHits = hits;
        bestIndex = index;
      }
    });

    const firstIndex = Math.max(0, bestIndex - 1);
    const lastIndex = Math.min(candidates.length, bestIndex + 3);
    const picked = candidates.slice(firstIndex, lastIndex);
    const joined = picked.join('\n');

    // A blank result from the chunk window is useless; retry on the full document.
    if (windowed && joined.trim().length === 0) {
      return extractSnippet(body, query, maxLen, undefined);
    }

    const text = joined.length > maxLen ? `${joined.substring(0, maxLen - 3)}...` : joined;

    const firstLineNumber = skippedLines + firstIndex + 1; // 1-indexed
    const pickedCount = picked.length;
    const linesBefore = firstLineNumber - 1;
    const linesAfter = documentLineCount - (firstLineNumber + pickedCount - 1);

    // Format with diff-style header: @@ -start,count @@ (linesBefore before, linesAfter after)
    return {
      line: skippedLines + bestIndex + 1,
      snippet: `@@ -${firstLineNumber},${pickedCount} @@ (${linesBefore} before, ${linesAfter} after)\n${text}`,
      linesBefore,
      linesAfter,
      snippetLines: pickedCount,
    };
  }