qmd.ts 92 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626
  1. #!/usr/bin/env bun
  2. import { Database } from "bun:sqlite";
  3. import { Glob, $ } from "bun";
  4. import { parseArgs } from "util";
  5. import * as sqliteVec from "sqlite-vec";
  6. import {
  7. getDb,
  8. closeDb,
  9. getDbPath,
  10. getPwd,
  11. getRealPath,
  12. homedir,
  13. resolve,
  14. setCustomIndexName,
  15. enableProductionMode,
  16. searchFTS,
  17. searchVec,
  18. reciprocalRankFusion,
  19. extractSnippet,
  20. getContextForFile,
  21. getContextForPath,
  22. listCollections,
  23. removeCollection,
  24. renameCollection,
  25. findSimilarFiles,
  26. matchFilesByGlob,
  27. getHashesNeedingEmbedding,
  28. getHashesForEmbedding,
  29. clearAllEmbeddings,
  30. insertEmbedding,
  31. getDocument as storeGetDocument,
  32. getMultipleDocuments as storeMultiGetDocuments,
  33. getStatus,
  34. hashContent,
  35. extractTitle,
  36. formatDocForEmbedding,
  37. formatQueryForEmbedding,
  38. chunkDocument,
  39. chunkDocumentByTokens,
  40. ensureVecTable,
  41. clearCache,
  42. getCacheKey,
  43. getCachedResult,
  44. setCachedResult,
  45. getIndexHealth,
  46. parseVirtualPath,
  47. buildVirtualPath,
  48. isVirtualPath,
  49. resolveVirtualPath,
  50. toVirtualPath,
  51. insertContent,
  52. insertDocument,
  53. findActiveDocument,
  54. updateDocumentTitle,
  55. updateDocument,
  56. deactivateDocument,
  57. getActiveDocumentPaths,
  58. cleanupOrphanedContent,
  59. deleteLLMCache,
  60. deleteInactiveDocuments,
  61. cleanupOrphanedVectors,
  62. cleanupDuplicateCollections,
  63. vacuumDatabase,
  64. getCollectionsWithoutContext,
  65. getTopLevelPathsWithoutContext,
  66. handelize,
  67. DEFAULT_EMBED_MODEL,
  68. DEFAULT_QUERY_MODEL,
  69. DEFAULT_RERANK_MODEL,
  70. DEFAULT_GLOB,
  71. DEFAULT_MULTI_GET_MAX_BYTES,
  72. } from "./store.js";
  73. import { getDefaultLlamaCpp, disposeDefaultLlamaCpp, type RerankDocument, type ExpandedQuery } from "./llm.js";
  74. import type { SearchResult, RankedResult } from "./store.js";
  75. import {
  76. formatSearchResults,
  77. formatDocuments,
  78. escapeXml,
  79. escapeCSV,
  80. type OutputFormat,
  81. } from "./formatter.js";
  82. import {
  83. getCollection as getCollectionFromYaml,
  84. listCollections as yamlListCollections,
  85. addContext as yamlAddContext,
  86. removeContext as yamlRemoveContext,
  87. setGlobalContext,
  88. listAllContexts,
  89. } from "./collections.js";
  90. // Enable production mode - allows using default database path
  91. // Tests must set INDEX_PATH or use createStore() with explicit path
  92. enableProductionMode();
  93. // Terminal colors (respects NO_COLOR env)
  94. const useColor = !process.env.NO_COLOR && process.stdout.isTTY;
  95. const c = {
  96. reset: useColor ? "\x1b[0m" : "",
  97. dim: useColor ? "\x1b[2m" : "",
  98. bold: useColor ? "\x1b[1m" : "",
  99. cyan: useColor ? "\x1b[36m" : "",
  100. yellow: useColor ? "\x1b[33m" : "",
  101. green: useColor ? "\x1b[32m" : "",
  102. magenta: useColor ? "\x1b[35m" : "",
  103. blue: useColor ? "\x1b[34m" : "",
  104. };
  105. // Terminal cursor control
  106. const cursor = {
  107. hide() { process.stderr.write('\x1b[?25l'); },
  108. show() { process.stderr.write('\x1b[?25h'); },
  109. };
  110. // Ensure cursor is restored on exit
  111. process.on('SIGINT', () => { cursor.show(); process.exit(130); });
  112. process.on('SIGTERM', () => { cursor.show(); process.exit(143); });
  113. // Terminal progress bar using OSC 9;4 escape sequence
  114. const progress = {
  115. set(percent: number) {
  116. process.stderr.write(`\x1b]9;4;1;${Math.round(percent)}\x07`);
  117. },
  118. clear() {
  119. process.stderr.write(`\x1b]9;4;0\x07`);
  120. },
  121. indeterminate() {
  122. process.stderr.write(`\x1b]9;4;3\x07`);
  123. },
  124. error() {
  125. process.stderr.write(`\x1b]9;4;2\x07`);
  126. },
  127. };
  128. // Format seconds into human-readable ETA
  129. function formatETA(seconds: number): string {
  130. if (seconds < 60) return `${Math.round(seconds)}s`;
  131. if (seconds < 3600) return `${Math.floor(seconds / 60)}m ${Math.round(seconds % 60)}s`;
  132. return `${Math.floor(seconds / 3600)}h ${Math.floor((seconds % 3600) / 60)}m`;
  133. }
  134. // Check index health and print warnings/tips
  135. function checkIndexHealth(db: Database): void {
  136. const { needsEmbedding, totalDocs, daysStale } = getIndexHealth(db);
  137. // Warn if many docs need embedding
  138. if (needsEmbedding > 0) {
  139. const pct = Math.round((needsEmbedding / totalDocs) * 100);
  140. if (pct >= 10) {
  141. process.stderr.write(`${c.yellow}Warning: ${needsEmbedding} documents (${pct}%) need embeddings. Run 'qmd embed' for better results.${c.reset}\n`);
  142. } else {
  143. process.stderr.write(`${c.dim}Tip: ${needsEmbedding} documents need embeddings. Run 'qmd embed' to index them.${c.reset}\n`);
  144. }
  145. }
  146. // Check if most recent document update is older than 2 weeks
  147. if (daysStale !== null && daysStale >= 14) {
  148. process.stderr.write(`${c.dim}Tip: Index last updated ${daysStale} days ago. Run 'qmd update' to refresh.${c.reset}\n`);
  149. }
  150. }
  151. // Compute unique display path for a document
  152. // Always include at least parent folder + filename, add more parent dirs until unique
  153. function computeDisplayPath(
  154. filepath: string,
  155. collectionPath: string,
  156. existingPaths: Set<string>
  157. ): string {
  158. // Get path relative to collection (include collection dir name)
  159. const collectionDir = collectionPath.replace(/\/$/, '');
  160. const collectionName = collectionDir.split('/').pop() || '';
  161. let relativePath: string;
  162. if (filepath.startsWith(collectionDir + '/')) {
  163. // filepath is under collection: use collection name + relative path
  164. relativePath = collectionName + filepath.slice(collectionDir.length);
  165. } else {
  166. // Fallback: just use the filepath
  167. relativePath = filepath;
  168. }
  169. const parts = relativePath.split('/').filter(p => p.length > 0);
  170. // Always include at least parent folder + filename (minimum 2 parts if available)
  171. // Then add more parent dirs until unique
  172. const minParts = Math.min(2, parts.length);
  173. for (let i = parts.length - minParts; i >= 0; i--) {
  174. const candidate = parts.slice(i).join('/');
  175. if (!existingPaths.has(candidate)) {
  176. return candidate;
  177. }
  178. }
  179. // Absolute fallback: use full path (should be unique)
  180. return filepath;
  181. }
  182. // Rerank documents using node-llama-cpp cross-encoder model
  183. async function rerank(query: string, documents: { file: string; text: string }[], _model: string = DEFAULT_RERANK_MODEL, _db?: Database): Promise<{ file: string; score: number }[]> {
  184. if (documents.length === 0) return [];
  185. const total = documents.length;
  186. process.stderr.write(`Reranking ${total} documents...\n`);
  187. progress.indeterminate();
  188. const llm = getDefaultLlamaCpp();
  189. const rerankDocs: RerankDocument[] = documents.map((doc) => ({
  190. file: doc.file,
  191. text: doc.text.slice(0, 4000), // Truncate to context limit
  192. }));
  193. const result = await llm.rerank(query, rerankDocs);
  194. progress.clear();
  195. process.stderr.write("\n");
  196. return result.results.map((r) => ({ file: r.file, score: r.score }));
  197. }
  198. function formatTimeAgo(date: Date): string {
  199. const seconds = Math.floor((Date.now() - date.getTime()) / 1000);
  200. if (seconds < 60) return `${seconds}s ago`;
  201. const minutes = Math.floor(seconds / 60);
  202. if (minutes < 60) return `${minutes}m ago`;
  203. const hours = Math.floor(minutes / 60);
  204. if (hours < 24) return `${hours}h ago`;
  205. const days = Math.floor(hours / 24);
  206. return `${days}d ago`;
  207. }
  208. function formatBytes(bytes: number): string {
  209. if (bytes < 1024) return `${bytes} B`;
  210. if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
  211. if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
  212. return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
  213. }
  214. function showStatus(): void {
  215. const dbPath = getDbPath();
  216. const db = getDb();
  217. // Cleanup any duplicate collections
  218. cleanupDuplicateCollections(db);
  219. // Index size
  220. let indexSize = 0;
  221. try {
  222. const stat = Bun.file(dbPath).size;
  223. indexSize = stat;
  224. } catch {}
  225. // Collections info (from YAML + database stats)
  226. const collections = listCollections(db);
  227. // Overall stats
  228. const totalDocs = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number };
  229. const vectorCount = db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get() as { count: number };
  230. const needsEmbedding = getHashesNeedingEmbedding(db);
  231. // Most recent update across all collections
  232. const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };
  233. console.log(`${c.bold}QMD Status${c.reset}\n`);
  234. console.log(`Index: ${dbPath}`);
  235. console.log(`Size: ${formatBytes(indexSize)}\n`);
  236. console.log(`${c.bold}Documents${c.reset}`);
  237. console.log(` Total: ${totalDocs.count} files indexed`);
  238. console.log(` Vectors: ${vectorCount.count} embedded`);
  239. if (needsEmbedding > 0) {
  240. console.log(` ${c.yellow}Pending: ${needsEmbedding} need embedding${c.reset} (run 'qmd embed')`);
  241. }
  242. if (mostRecent.latest) {
  243. const lastUpdate = new Date(mostRecent.latest);
  244. console.log(` Updated: ${formatTimeAgo(lastUpdate)}`);
  245. }
  246. // Get all contexts grouped by collection (from YAML)
  247. const allContexts = listAllContexts();
  248. const contextsByCollection = new Map<string, { path_prefix: string; context: string }[]>();
  249. for (const ctx of allContexts) {
  250. // Group contexts by collection name
  251. if (!contextsByCollection.has(ctx.collection)) {
  252. contextsByCollection.set(ctx.collection, []);
  253. }
  254. contextsByCollection.get(ctx.collection)!.push({
  255. path_prefix: ctx.path,
  256. context: ctx.context
  257. });
  258. }
  259. if (collections.length > 0) {
  260. console.log(`\n${c.bold}Collections${c.reset}`);
  261. for (const col of collections) {
  262. const lastMod = col.last_modified ? formatTimeAgo(new Date(col.last_modified)) : "never";
  263. const contexts = contextsByCollection.get(col.name) || [];
  264. console.log(` ${c.cyan}${col.name}${c.reset} ${c.dim}(qmd://${col.name}/)${c.reset}`);
  265. console.log(` ${c.dim}Pattern:${c.reset} ${col.glob_pattern}`);
  266. console.log(` ${c.dim}Files:${c.reset} ${col.active_count} (updated ${lastMod})`);
  267. if (contexts.length > 0) {
  268. console.log(` ${c.dim}Contexts:${c.reset} ${contexts.length}`);
  269. for (const ctx of contexts) {
  270. // Handle both empty string and '/' as root context
  271. const pathDisplay = (ctx.path_prefix === '' || ctx.path_prefix === '/') ? '/' : `/${ctx.path_prefix}`;
  272. const contextPreview = ctx.context.length > 60
  273. ? ctx.context.substring(0, 57) + '...'
  274. : ctx.context;
  275. console.log(` ${c.dim}${pathDisplay}:${c.reset} ${contextPreview}`);
  276. }
  277. }
  278. }
  279. // Show examples of virtual paths
  280. console.log(`\n${c.bold}Examples${c.reset}`);
  281. console.log(` ${c.dim}# List files in a collection${c.reset}`);
  282. if (collections.length > 0) {
  283. console.log(` qmd ls ${collections[0].name}`);
  284. }
  285. console.log(` ${c.dim}# Get a document${c.reset}`);
  286. if (collections.length > 0) {
  287. console.log(` qmd get qmd://${collections[0].name}/path/to/file.md`);
  288. }
  289. console.log(` ${c.dim}# Search within a collection${c.reset}`);
  290. if (collections.length > 0) {
  291. console.log(` qmd search "query" -c ${collections[0].name}`);
  292. }
  293. } else {
  294. console.log(`\n${c.dim}No collections. Run 'qmd collection add .' to index markdown files.${c.reset}`);
  295. }
  296. closeDb();
  297. }
  298. async function updateCollections(): Promise<void> {
  299. const db = getDb();
  300. cleanupDuplicateCollections(db);
  301. // Clear Ollama cache on update
  302. clearCache(db);
  303. const collections = listCollections(db);
  304. if (collections.length === 0) {
  305. console.log(`${c.dim}No collections found. Run 'qmd collection add .' to index markdown files.${c.reset}`);
  306. closeDb();
  307. return;
  308. }
  309. // Don't close db here - indexFiles will reuse it and close at the end
  310. console.log(`${c.bold}Updating ${collections.length} collection(s)...${c.reset}\n`);
  311. for (let i = 0; i < collections.length; i++) {
  312. const col = collections[i];
  313. console.log(`${c.cyan}[${i + 1}/${collections.length}]${c.reset} ${c.bold}${col.name}${c.reset} ${c.dim}(${col.glob_pattern})${c.reset}`);
  314. // Execute custom update command if specified in YAML
  315. const yamlCol = getCollectionFromYaml(col.name);
  316. if (yamlCol?.update) {
  317. console.log(`${c.dim} Running update command: ${yamlCol.update}${c.reset}`);
  318. try {
  319. const proc = Bun.spawn(["/usr/bin/env", "bash", "-c", yamlCol.update], {
  320. cwd: col.pwd,
  321. stdout: "pipe",
  322. stderr: "pipe",
  323. });
  324. const output = await new Response(proc.stdout).text();
  325. const errorOutput = await new Response(proc.stderr).text();
  326. const exitCode = await proc.exited;
  327. if (output.trim()) {
  328. console.log(output.trim().split('\n').map(l => ` ${l}`).join('\n'));
  329. }
  330. if (errorOutput.trim()) {
  331. console.log(errorOutput.trim().split('\n').map(l => ` ${l}`).join('\n'));
  332. }
  333. if (exitCode !== 0) {
  334. console.log(`${c.yellow}✗ Update command failed with exit code ${exitCode}${c.reset}`);
  335. process.exit(exitCode);
  336. }
  337. } catch (err) {
  338. console.log(`${c.yellow}✗ Update command failed: ${err}${c.reset}`);
  339. process.exit(1);
  340. }
  341. }
  342. await indexFiles(col.pwd, col.glob_pattern, col.name);
  343. console.log("");
  344. }
  345. console.log(`${c.green}✓ All collections updated.${c.reset}`);
  346. }
  347. /**
  348. * Detect which collection (if any) contains the given filesystem path.
  349. * Returns { collectionId, collectionName, relativePath } or null if not in any collection.
  350. */
  351. function detectCollectionFromPath(db: Database, fsPath: string): { collectionName: string; relativePath: string } | null {
  352. const realPath = getRealPath(fsPath);
  353. // Find collections that this path is under from YAML
  354. const allCollections = yamlListCollections();
  355. // Find longest matching path
  356. let bestMatch: { name: string; path: string } | null = null;
  357. for (const coll of allCollections) {
  358. if (realPath.startsWith(coll.path + '/') || realPath === coll.path) {
  359. if (!bestMatch || coll.path.length > bestMatch.path.length) {
  360. bestMatch = { name: coll.name, path: coll.path };
  361. }
  362. }
  363. }
  364. if (!bestMatch) return null;
  365. // Calculate relative path
  366. let relativePath = realPath;
  367. if (relativePath.startsWith(bestMatch.path + '/')) {
  368. relativePath = relativePath.slice(bestMatch.path.length + 1);
  369. } else if (relativePath === bestMatch.path) {
  370. relativePath = '';
  371. }
  372. return {
  373. collectionName: bestMatch.name,
  374. relativePath
  375. };
  376. }
  377. async function contextAdd(pathArg: string | undefined, contextText: string): Promise<void> {
  378. const db = getDb();
  379. // Handle "/" as global context (applies to all collections)
  380. if (pathArg === '/') {
  381. setGlobalContext(contextText);
  382. console.log(`${c.green}✓${c.reset} Set global context`);
  383. console.log(`${c.dim}Context: ${contextText}${c.reset}`);
  384. closeDb();
  385. return;
  386. }
  387. // Resolve path - defaults to current directory if not provided
  388. let fsPath = pathArg || '.';
  389. if (fsPath === '.' || fsPath === './') {
  390. fsPath = getPwd();
  391. } else if (fsPath.startsWith('~/')) {
  392. fsPath = homedir() + fsPath.slice(1);
  393. } else if (!fsPath.startsWith('/') && !fsPath.startsWith('qmd://')) {
  394. fsPath = resolve(getPwd(), fsPath);
  395. }
  396. // Handle virtual paths (qmd://collection/path)
  397. if (isVirtualPath(fsPath)) {
  398. const parsed = parseVirtualPath(fsPath);
  399. if (!parsed) {
  400. console.error(`${c.yellow}Invalid virtual path: ${fsPath}${c.reset}`);
  401. process.exit(1);
  402. }
  403. const coll = getCollectionFromYaml(parsed.collectionName);
  404. if (!coll) {
  405. console.error(`${c.yellow}Collection not found: ${parsed.collectionName}${c.reset}`);
  406. process.exit(1);
  407. }
  408. yamlAddContext(parsed.collectionName, parsed.path, contextText);
  409. const displayPath = parsed.path
  410. ? `qmd://${parsed.collectionName}/${parsed.path}`
  411. : `qmd://${parsed.collectionName}/ (collection root)`;
  412. console.log(`${c.green}✓${c.reset} Added context for: ${displayPath}`);
  413. console.log(`${c.dim}Context: ${contextText}${c.reset}`);
  414. closeDb();
  415. return;
  416. }
  417. // Detect collection from filesystem path
  418. const detected = detectCollectionFromPath(db, fsPath);
  419. if (!detected) {
  420. console.error(`${c.yellow}Path is not in any indexed collection: ${fsPath}${c.reset}`);
  421. console.error(`${c.dim}Run 'qmd status' to see indexed collections${c.reset}`);
  422. process.exit(1);
  423. }
  424. yamlAddContext(detected.collectionName, detected.relativePath, contextText);
  425. const displayPath = detected.relativePath ? `qmd://${detected.collectionName}/${detected.relativePath}` : `qmd://${detected.collectionName}/`;
  426. console.log(`${c.green}✓${c.reset} Added context for: ${displayPath}`);
  427. console.log(`${c.dim}Context: ${contextText}${c.reset}`);
  428. closeDb();
  429. }
  430. function contextList(): void {
  431. const db = getDb();
  432. const allContexts = listAllContexts();
  433. if (allContexts.length === 0) {
  434. console.log(`${c.dim}No contexts configured. Use 'qmd context add' to add one.${c.reset}`);
  435. closeDb();
  436. return;
  437. }
  438. console.log(`\n${c.bold}Configured Contexts${c.reset}\n`);
  439. let lastCollection = '';
  440. for (const ctx of allContexts) {
  441. if (ctx.collection !== lastCollection) {
  442. console.log(`${c.cyan}${ctx.collection}${c.reset}`);
  443. lastCollection = ctx.collection;
  444. }
  445. const displayPath = ctx.path ? ` ${ctx.path}` : ' / (root)';
  446. console.log(`${displayPath}`);
  447. console.log(` ${c.dim}${ctx.context}${c.reset}`);
  448. }
  449. closeDb();
  450. }
  451. function contextRemove(pathArg: string): void {
  452. if (pathArg === '/') {
  453. // Remove global context
  454. setGlobalContext(undefined);
  455. console.log(`${c.green}✓${c.reset} Removed global context`);
  456. return;
  457. }
  458. // Handle virtual paths
  459. if (isVirtualPath(pathArg)) {
  460. const parsed = parseVirtualPath(pathArg);
  461. if (!parsed) {
  462. console.error(`${c.yellow}Invalid virtual path: ${pathArg}${c.reset}`);
  463. process.exit(1);
  464. }
  465. const coll = getCollectionFromYaml(parsed.collectionName);
  466. if (!coll) {
  467. console.error(`${c.yellow}Collection not found: ${parsed.collectionName}${c.reset}`);
  468. process.exit(1);
  469. }
  470. const success = yamlRemoveContext(coll.name, parsed.path);
  471. if (!success) {
  472. console.error(`${c.yellow}No context found for: ${pathArg}${c.reset}`);
  473. process.exit(1);
  474. }
  475. console.log(`${c.green}✓${c.reset} Removed context for: ${pathArg}`);
  476. return;
  477. }
  478. // Handle filesystem paths
  479. let fsPath = pathArg;
  480. if (fsPath === '.' || fsPath === './') {
  481. fsPath = getPwd();
  482. } else if (fsPath.startsWith('~/')) {
  483. fsPath = homedir() + fsPath.slice(1);
  484. } else if (!fsPath.startsWith('/')) {
  485. fsPath = resolve(getPwd(), fsPath);
  486. }
  487. const db = getDb();
  488. const detected = detectCollectionFromPath(db, fsPath);
  489. closeDb();
  490. if (!detected) {
  491. console.error(`${c.yellow}Path is not in any indexed collection: ${fsPath}${c.reset}`);
  492. process.exit(1);
  493. }
  494. const success = yamlRemoveContext(detected.collectionName, detected.relativePath);
  495. if (!success) {
  496. console.error(`${c.yellow}No context found for: qmd://${detected.collectionName}/${detected.relativePath}${c.reset}`);
  497. process.exit(1);
  498. }
  499. console.log(`${c.green}✓${c.reset} Removed context for: qmd://${detected.collectionName}/${detected.relativePath}`);
  500. }
  /**
   * Report gaps in context coverage.
   *
   * First lists collections that have no context at all, then, for the
   * remaining collections, the top-level paths returned by
   * `getTopLevelPathsWithoutContext`. Each gap is printed together with a
   * ready-to-run `qmd context add` suggestion.
   */
  function contextCheck(): void {
    const db = getDb();

    const uncovered = getCollectionsWithoutContext(db);
    const everyCollection = listCollections(db);
    const nothingUncovered = uncovered.length === 0;

    if (nothingUncovered) {
      // Only announce full coverage when at least one collection exists.
      if (everyCollection.length > 0) {
        console.log(`\n${c.green}✓${c.reset} ${c.bold}All collections have context configured${c.reset}\n`);
      }
    } else {
      console.log(`\n${c.yellow}Collections without any context:${c.reset}\n`);
      uncovered.forEach((entry) => {
        console.log(`${c.cyan}${entry.name}${c.reset} ${c.dim}(${entry.doc_count} documents)${c.reset}`);
        console.log(` ${c.dim}Suggestion: qmd context add qmd://${entry.name}/ "Description of ${entry.name}"${c.reset}\n`);
      });
    }

    // Collections that already carry some context: look one level deeper.
    const covered = everyCollection.filter(
      (candidate) => uncovered.every((entry) => entry.id !== candidate.id)
    );

    let pathHeaderPrinted = false;
    for (const entry of covered) {
      const gaps = getTopLevelPathsWithoutContext(db, entry.id);
      if (gaps.length === 0) continue;

      // The section header is emitted once, before the first collection with gaps.
      if (!pathHeaderPrinted) {
        console.log(`${c.yellow}Top-level directories without context:${c.reset}\n`);
        pathHeaderPrinted = true;
      }

      console.log(`${c.cyan}${entry.name}${c.reset}`);
      for (const gap of gaps) {
        console.log(` ${gap}`);
        console.log(` ${c.dim}Suggestion: qmd context add qmd://${entry.name}/${gap} "Description of ${gap}"${c.reset}`);
      }
      console.log('');
    }

    if (nothingUncovered && !pathHeaderPrinted) {
      console.log(`${c.dim}All collections and major paths have context configured.${c.reset}`);
      console.log(`${c.dim}Use 'qmd context list' to see all configured contexts.${c.reset}\n`);
    }

    closeDb();
  }
  /**
   * Print one indexed document to stdout, optionally limited to a line range.
   *
   * The document reference is resolved by trying, in order:
   *   1. a virtual path (per `isVirtualPath`): exact match, then path-suffix match;
   *   2. `collection/path`, when the first component names a collection that
   *      has active documents: exact match, then path-suffix match;
   *   3. a filesystem path mapped to a collection by `detectCollectionFromPath`;
   *   4. a suffix match on the last path component across all collections.
   * When nothing matches, an error is printed and the process exits with status 1.
   *
   * @param filename    Document reference. A trailing `:N` is parsed (and
   *                    stripped) as the start line only when `fromLine` is
   *                    absent or 0.
   * @param fromLine    1-based first line to print.
   * @param maxLines    Maximum number of lines to print.
   * @param lineNumbers When true, the output is passed through `addLineNumbers`.
   */
  function getDocument(filename: string, fromLine?: number, maxLines?: number, lineNumbers?: boolean): void {
    type DocRow = { collectionName: string; path: string; body: string };

    const db = getDb();

    // Projection shared by every lookup below.
    const selectDoc = `
      SELECT d.collection as collectionName, d.path, content.doc as body
      FROM documents d
      JOIN content ON content.hash = d.hash`;

    // Exact match on collection + path.
    const findExact = (collection: string, path: string): DocRow | null =>
      db.prepare(`${selectDoc}
      WHERE d.collection = ? AND d.path = ? AND d.active = 1
    `).get(collection, path) as DocRow | null;

    // Fuzzy match inside one collection: any active path ending in `suffix`.
    const findSuffixInCollection = (collection: string, suffix: string): DocRow | null =>
      db.prepare(`${selectDoc}
      WHERE d.collection = ? AND d.path LIKE ? AND d.active = 1
      LIMIT 1
    `).get(collection, `%${suffix}`) as DocRow | null;

    // Fuzzy match across all collections: any active path ending in `suffix`.
    const findSuffixAnywhere = (suffix: string): DocRow | null =>
      db.prepare(`${selectDoc}
      WHERE d.path LIKE ? AND d.active = 1
      LIMIT 1
    `).get(`%${suffix}`) as DocRow | null;

    // Exact first, suffix as fallback.
    const findInCollection = (collection: string, path: string): DocRow | null =>
      findExact(collection, path) ?? findSuffixInCollection(collection, path);

    // Parse :linenum suffix from filename (e.g., "file.md:100")
    let inputPath = filename;
    let requestedFrom = fromLine;
    const colonMatch = inputPath.match(/:(\d+)$/);
    if (colonMatch && !requestedFrom) {
      requestedFrom = parseInt(colonMatch[1], 10);
      inputPath = inputPath.slice(0, -colonMatch[0].length);
    }

    let doc: DocRow | null = null;

    if (isVirtualPath(inputPath)) {
      const parsed = parseVirtualPath(inputPath);
      if (!parsed) {
        console.error(`Invalid virtual path: ${inputPath}`);
        closeDb();
        process.exit(1);
      }
      doc = findInCollection(parsed.collectionName, parsed.path);
    } else {
      // Prefer the collection/path interpretation over a filesystem path when
      // the input is relative (no leading / or ~).
      if (!inputPath.startsWith('/') && !inputPath.startsWith('~')) {
        const parts = inputPath.split('/');
        if (parts.length >= 2) {
          const possibleCollection = parts[0];
          const possiblePath = parts.slice(1).join('/');
          const collExists = db.prepare(`
            SELECT 1 FROM documents WHERE collection = ? AND active = 1 LIMIT 1
          `).get(possibleCollection);
          if (collExists) {
            doc = findInCollection(possibleCollection, possiblePath);
          }
        }
      }

      // Not a collection/path reference: treat the input as a filesystem path.
      if (!doc) {
        let fsPath = inputPath;
        if (fsPath.startsWith('~/')) {
          // Expand ~ to home directory
          fsPath = homedir() + fsPath.slice(1);
        } else if (!fsPath.startsWith('/')) {
          // Relative path - resolve from current directory
          fsPath = resolve(getPwd(), fsPath);
        }
        fsPath = getRealPath(fsPath);

        const detected = detectCollectionFromPath(db, fsPath);
        if (detected) {
          doc = findExact(detected.collectionName, detected.relativePath);
        }

        // Last resort: suffix match on the final path component.
        if (!doc) {
          const baseName = inputPath.split('/').pop() || inputPath;
          doc = findSuffixAnywhere(baseName);
        }
      }
    }

    if (!doc) {
      console.error(`Document not found: ${filename}`);
      closeDb();
      process.exit(1);
    }

    const context = getContextForPath(db, doc.collectionName, doc.path);

    let output = doc.body;
    const startLine = requestedFrom || 1;

    // Apply line filtering if specified
    if (requestedFrom !== undefined || maxLines !== undefined) {
      const lines = output.split('\n');
      const start = startLine - 1; // Convert to 0-indexed
      const end = maxLines !== undefined ? start + maxLines : lines.length;
      output = lines.slice(start, end).join('\n');
    }

    if (lineNumbers) {
      output = addLineNumbers(output, startLine);
    }

    // The context header, when present, precedes the document body.
    if (context) {
      console.log(`Folder Context: ${context}\n---\n`);
    }
    console.log(output);
    closeDb();
  }
  685. // Multi-get: fetch multiple documents by glob pattern or comma-separated list
  /**
   * Fetch several documents at once and print them in the requested format.
   *
   * `pattern` is treated as a comma-separated list of names when it contains a
   * comma and neither `*` nor `?`; otherwise it is handed to `matchFilesByGlob`.
   * List entries that cannot be resolved are reported on stderr and skipped;
   * a glob that matches nothing prints an error and exits with status 1.
   *
   * @param pattern  Glob pattern or comma-separated list (virtual paths or
   *                 document paths).
   * @param maxLines When set, each body is cut to this many lines and a
   *                 truncation marker is appended.
   * @param maxBytes Documents whose stored length (SQL `LENGTH`) exceeds this
   *                 are listed as skipped rather than printed.
   * @param format   "json", "csv", "files", "md" or "xml"; any other value
   *                 falls back to the CLI layout.
   */
  function multiGet(pattern: string, maxLines?: number, maxBytes: number = DEFAULT_MULTI_GET_MAX_BYTES, format: OutputFormat = "cli"): void {
    type FileRef = { filepath: string; displayPath: string; bodyLength: number; collection?: string; path?: string };
    type LookupRow = { virtual_path: string; body_length: number; collection: string; path: string };
    type Result = { file: string; displayPath: string; title: string; body: string; context: string | null; skipped: boolean; skipReason?: string };

    const db = getDb();

    // Projection shared by the three list-mode lookups.
    const lookupSelect = `
      SELECT
        'qmd://' || d.collection || '/' || d.path as virtual_path,
        LENGTH(content.doc) as body_length,
        d.collection,
        d.path
      FROM documents d
      JOIN content ON content.hash = d.hash`;

    // `filter` is always one of the constant clauses below, never user input.
    const lookup = (filter: string, ...args: string[]): LookupRow | null =>
      db.prepare(`${lookupSelect}
      WHERE ${filter}
    `).get(...args) as LookupRow | null;

    // Check if it's a comma-separated list or a glob pattern
    const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?');

    let files: FileRef[];
    if (isCommaSeparated) {
      files = [];
      const names = pattern.split(',').map(s => s.trim()).filter(Boolean);
      for (const name of names) {
        let row: LookupRow | null = null;

        if (isVirtualPath(name)) {
          const parsed = parseVirtualPath(name);
          if (parsed) {
            row = lookup("d.collection = ? AND d.path = ? AND d.active = 1", parsed.collectionName, parsed.path);
          }
        } else {
          // Exact path first, then any path ending in the given name.
          row = lookup("d.path = ? AND d.active = 1 LIMIT 1", name)
            ?? lookup("d.path LIKE ? AND d.active = 1 LIMIT 1", `%${name}`);
        }

        if (!row) {
          console.error(`File not found: ${name}`);
          continue;
        }
        files.push({
          filepath: row.virtual_path,
          displayPath: row.virtual_path,
          bodyLength: row.body_length,
          collection: row.collection,
          path: row.path,
        });
      }
    } else {
      // Glob matches carry only the virtual path; collection and path are
      // recovered from it in the loop below.
      files = matchFilesByGlob(db, pattern).map(f => ({
        ...f,
        collection: undefined,
        path: undefined,
      }));
      if (files.length === 0) {
        console.error(`No files matched pattern: ${pattern}`);
        closeDb();
        process.exit(1);
      }
    }

    // Collect results for structured output
    const results: Result[] = [];
    for (const file of files) {
      let collection = file.collection;
      let path = file.path;
      if (!collection || !path) {
        const parsed = parseVirtualPath(file.filepath);
        if (parsed) {
          collection = parsed.collectionName;
          path = parsed.path;
        }
      }

      const context = collection && path ? getContextForPath(db, collection, path) : null;

      // Oversized documents are reported but their body is never loaded.
      if (file.bodyLength > maxBytes) {
        results.push({
          file: file.filepath,
          displayPath: file.displayPath,
          title: file.displayPath.split('/').pop() || file.displayPath,
          body: "",
          context,
          skipped: true,
          skipReason: `File too large (${Math.round(file.bodyLength / 1024)}KB > ${Math.round(maxBytes / 1024)}KB). Use 'qmd get ${file.displayPath}' to retrieve.`,
        });
        continue;
      }

      if (!collection || !path) continue;

      const doc = db.prepare(`
        SELECT content.doc as body, d.title
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE d.collection = ? AND d.path = ? AND d.active = 1
      `).get(collection, path) as { body: string; title: string } | null;
      if (!doc) continue;

      let body = doc.body;
      if (maxLines !== undefined) {
        const lines = body.split('\n');
        body = lines.slice(0, maxLines).join('\n');
        if (lines.length > maxLines) {
          body += `\n\n[... truncated ${lines.length - maxLines} more lines]`;
        }
      }

      results.push({
        file: file.filepath,
        displayPath: file.displayPath,
        title: doc.title || file.displayPath.split('/').pop() || file.displayPath,
        body,
        context,
        skipped: false,
      });
    }

    closeDb();

    switch (format) {
      case "json": {
        const output = results.map(r => ({
          file: r.displayPath,
          title: r.title,
          ...(r.context && { context: r.context }),
          ...(r.skipped ? { skipped: true, reason: r.skipReason } : { body: r.body }),
        }));
        console.log(JSON.stringify(output, null, 2));
        break;
      }

      case "csv": {
        // Accepts undefined as well: skipReason is optional on a result.
        const escapeField = (val: string | null | undefined): string => {
          if (val === null || val === undefined) return "";
          const str = String(val);
          if (str.includes(",") || str.includes('"') || str.includes("\n")) {
            return `"${str.replace(/"/g, '""')}"`;
          }
          return str;
        };
        console.log("file,title,context,skipped,body");
        for (const r of results) {
          console.log([r.displayPath, r.title, r.context || "", r.skipped ? "true" : "false", r.skipped ? r.skipReason : r.body].map(escapeField).join(","));
        }
        break;
      }

      case "files": {
        for (const r of results) {
          const ctx = r.context ? `,"${r.context.replace(/"/g, '""')}"` : "";
          const status = r.skipped ? "[SKIPPED]" : "";
          console.log(`${r.displayPath}${ctx}${status ? `,${status}` : ""}`);
        }
        break;
      }

      case "md": {
        for (const r of results) {
          console.log(`## ${r.displayPath}\n`);
          if (r.title && r.title !== r.displayPath) console.log(`**Title:** ${r.title}\n`);
          if (r.context) console.log(`**Context:** ${r.context}\n`);
          if (r.skipped) {
            console.log(`> ${r.skipReason}\n`);
          } else {
            console.log("```");
            console.log(r.body);
            console.log("```\n");
          }
        }
        break;
      }

      case "xml": {
        console.log('<?xml version="1.0" encoding="UTF-8"?>');
        console.log("<documents>");
        for (const r of results) {
          console.log(" <document>");
          console.log(` <file>${escapeXml(r.displayPath)}</file>`);
          console.log(` <title>${escapeXml(r.title)}</title>`);
          if (r.context) console.log(` <context>${escapeXml(r.context)}</context>`);
          if (r.skipped) {
            console.log(` <skipped>true</skipped>`);
            console.log(` <reason>${escapeXml(r.skipReason || "")}</reason>`);
          } else {
            console.log(` <body>${escapeXml(r.body)}</body>`);
          }
          console.log(" </document>");
        }
        console.log("</documents>");
        break;
      }

      default: {
        // CLI format (default)
        for (const r of results) {
          console.log(`\n${'='.repeat(60)}`);
          console.log(`File: ${r.displayPath}`);
          console.log(`${'='.repeat(60)}\n`);
          if (r.skipped) {
            console.log(`[SKIPPED: ${r.skipReason}]`);
            continue;
          }
          if (r.context) {
            console.log(`Folder Context: ${r.context}\n---\n`);
          }
          console.log(r.body);
        }
      }
    }
  }
  897. // List files in virtual file tree
  /**
   * List the virtual file tree.
   *
   * With no argument, prints every collection from `yamlListCollections`
   * together with its count of active documents. With an argument, prints one
   * line per active document (size, modification time, virtual path) in the
   * named collection, optionally limited to paths starting with a prefix.
   *
   * @param pathArg Either `qmd://collection[/prefix]` or `collection[/prefix]`.
   *                An unparsable virtual path or an unknown collection prints
   *                an error and exits with status 1.
   */
  function listFiles(pathArg?: string): void {
    const db = getDb();

    if (!pathArg) {
      const yamlCollections = yamlListCollections();
      if (yamlCollections.length === 0) {
        console.log("No collections found. Run 'qmd add .' to index files.");
        closeDb();
        return;
      }

      // Prepare the count statement once and reuse it for every collection.
      const countStmt = db.prepare(`
        SELECT COUNT(*) as file_count
        FROM documents d
        WHERE d.collection = ? AND d.active = 1
      `);
      const collections = yamlCollections.map(coll => {
        const stats = countStmt.get(coll.name) as { file_count: number } | null;
        return { name: coll.name, file_count: stats?.file_count || 0 };
      });

      console.log(`${c.bold}Collections:${c.reset}\n`);
      for (const coll of collections) {
        console.log(` ${c.dim}qmd://${c.reset}${c.cyan}${coll.name}/${c.reset} ${c.dim}(${coll.file_count} files)${c.reset}`);
      }
      closeDb();
      return;
    }

    // Split the argument into collection name and optional path prefix.
    let collectionName: string;
    let pathPrefix: string | null = null;
    if (pathArg.startsWith('qmd://')) {
      const parsed = parseVirtualPath(pathArg);
      if (!parsed) {
        console.error(`Invalid virtual path: ${pathArg}`);
        closeDb();
        process.exit(1);
      }
      collectionName = parsed.collectionName;
      pathPrefix = parsed.path;
    } else {
      const parts = pathArg.split('/');
      collectionName = parts[0];
      if (parts.length > 1) {
        pathPrefix = parts.slice(1).join('/');
      }
    }

    const coll = getCollectionFromYaml(collectionName);
    if (!coll) {
      console.error(`Collection not found: ${collectionName}`);
      console.error(`Run 'qmd ls' to see available collections.`);
      closeDb();
      process.exit(1);
    }

    // One query for both cases; the prefix filter is added only when a
    // non-empty prefix was given.
    const prefixFilter = pathPrefix ? "AND d.path LIKE ?" : "";
    const params = pathPrefix ? [coll.name, `${pathPrefix}%`] : [coll.name];
    const files = db.prepare(`
      SELECT d.path, d.title, d.modified_at, LENGTH(ct.doc) as size
      FROM documents d
      JOIN content ct ON d.hash = ct.hash
      WHERE d.collection = ? ${prefixFilter} AND d.active = 1
      ORDER BY d.path
    `).all(...params) as { path: string; title: string; modified_at: string; size: number }[];

    if (files.length === 0) {
      if (pathPrefix) {
        console.log(`No files found under qmd://${collectionName}/${pathPrefix}`);
      } else {
        console.log(`No files found in collection: ${collectionName}`);
      }
      closeDb();
      return;
    }

    // Format each size once. The column width is folded with reduce rather
    // than Math.max(...array): spreading passes one argument per file and can
    // throw a RangeError on very large collections.
    const sizeLabels = files.map(f => formatBytes(f.size));
    const maxSize = sizeLabels.reduce((widest, label) => Math.max(widest, label.length), 0);

    // Output in ls -l style
    files.forEach((file, index) => {
      const sizeStr = sizeLabels[index].padStart(maxSize);
      const timeStr = formatLsTime(new Date(file.modified_at));
      // Dim the qmd:// prefix, highlight the filename
      console.log(`${sizeStr} ${timeStr} ${c.dim}qmd://${collectionName}/${c.reset}${c.cyan}${file.path}${c.reset}`);
    });

    closeDb();
  }
  1002. // Format date/time like ls -l
  /**
   * Render a timestamp the way `ls -l` does, using local time.
   *
   * Dates older than six 30-day months are shown as "Mon DD YYYY"; anything
   * newer is shown as "Mon DD HH:MM". The day is space-padded to two columns.
   *
   * @param date Timestamp to format.
   * @returns The formatted label.
   */
  function formatLsTime(date: Date): string {
    const MONTH_NAMES = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];
    const SIX_MONTHS_MS = 6 * 30 * 24 * 60 * 60 * 1000;

    const zeroPad = (value: number): string => String(value).padStart(2, '0');

    const monthLabel = MONTH_NAMES[date.getMonth()];
    const dayLabel = String(date.getDate()).padStart(2, ' ');

    // Old entries show the year; recent ones show the time of day.
    const isOld = date.getTime() < Date.now() - SIX_MONTHS_MS;
    const tail = isOld
      ? String(date.getFullYear())
      : `${zeroPad(date.getHours())}:${zeroPad(date.getMinutes())}`;

    return `${monthLabel} ${dayLabel} ${tail}`;
  }
  1019. // Collection management commands
  /**
   * Print every collection returned by `listCollections`: its name and
   * virtual-path root, glob pattern, active file count, and how long ago it
   * was last updated.
   */
  function collectionList(): void {
    const db = getDb();
    const collections = listCollections(db);

    if (collections.length > 0) {
      console.log(`${c.bold}Collections (${collections.length}):${c.reset}\n`);

      collections.forEach((entry) => {
        const timeAgo = formatTimeAgo(new Date(entry.updated_at));
        const lines = [
          `${c.cyan}${entry.name}${c.reset} ${c.dim}(qmd://${entry.name}/)${c.reset}`,
          ` ${c.dim}Pattern:${c.reset} ${entry.glob_pattern}`,
          ` ${c.dim}Files:${c.reset} ${entry.active_count}`,
          ` ${c.dim}Updated:${c.reset} ${timeAgo}`,
        ];
        for (const line of lines) {
          console.log(line);
        }
        // Blank separator between collections.
        console.log();
      });
    } else {
      console.log("No collections found. Run 'qmd add .' to create one.");
    }

    closeDb();
  }
  /**
   * Register a new collection in the YAML config and index its files.
   *
   * Exits with status 1 when the chosen name is already taken, or when another
   * collection already covers the same directory and glob pattern.
   *
   * @param pwd         Root directory of the collection.
   * @param globPattern Glob selecting the files to index.
   * @param name        Collection name; when omitted or empty, the last
   *                    component of `pwd` is used ('root' if there is none).
   */
  async function collectionAdd(pwd: string, globPattern: string, name?: string): Promise<void> {
    const collectionName = name || pwd.split('/').filter(Boolean).pop() || 'root';

    // Reject a duplicate name.
    if (getCollectionFromYaml(collectionName)) {
      console.error(`${c.yellow}Collection '${collectionName}' already exists.${c.reset}`);
      console.error(`Use a different name with --name <name>`);
      process.exit(1);
    }

    // Reject a duplicate directory + pattern pair, whatever its name.
    const duplicate = yamlListCollections().find(
      (entry) => entry.path === pwd && entry.pattern === globPattern
    );
    if (duplicate) {
      console.error(`${c.yellow}A collection already exists for this path and pattern:${c.reset}`);
      console.error(` Name: ${duplicate.name} (qmd://${duplicate.name}/)`);
      console.error(` Pattern: ${globPattern}`);
      console.error(`\nUse 'qmd update' to re-index it, or remove it first with 'qmd collection remove ${duplicate.name}'`);
      process.exit(1);
    }

    // Persist to the YAML config first, then build the index.
    const { addCollection } = await import("./collections.js");
    addCollection(collectionName, pwd, globPattern);

    console.log(`Creating collection '${collectionName}'...`);
    await indexFiles(pwd, globPattern, collectionName);
    console.log(`${c.green}✓${c.reset} Collection '${collectionName}' created successfully`);
  }
  /**
   * Remove a collection and report how many documents and orphaned content
   * hashes were deleted. Exits with status 1 when the collection is not
   * present in the YAML config.
   *
   * @param name Name of the collection to remove.
   */
  function collectionRemove(name: string): void {
    const known = getCollectionFromYaml(name);
    if (!known) {
      console.error(`${c.yellow}Collection not found: ${name}${c.reset}`);
      console.error(`Run 'qmd collection list' to see available collections.`);
      process.exit(1);
    }

    const db = getDb();
    const { deletedDocs, cleanedHashes } = removeCollection(db, name);
    closeDb();

    console.log(`${c.green}✓${c.reset} Removed collection '${name}'`);
    console.log(` Deleted ${deletedDocs} documents`);
    // The cleanup line is only shown when something was actually cleaned.
    if (cleanedHashes > 0) {
      console.log(` Cleaned up ${cleanedHashes} orphaned content hashes`);
    }
  }
  /**
   * Rename a collection. Exits with status 1 when the source collection is
   * missing from the YAML config or the target name is already in use.
   *
   * @param oldName Current collection name.
   * @param newName Desired collection name.
   */
  function collectionRename(oldName: string, newName: string): void {
    // The source must exist...
    if (!getCollectionFromYaml(oldName)) {
      console.error(`${c.yellow}Collection not found: ${oldName}${c.reset}`);
      console.error(`Run 'qmd collection list' to see available collections.`);
      process.exit(1);
    }

    // ...and the target must be free.
    if (getCollectionFromYaml(newName)) {
      console.error(`${c.yellow}Collection name already exists: ${newName}${c.reset}`);
      console.error(`Choose a different name or remove the existing collection first.`);
      process.exit(1);
    }

    const db = getDb();
    renameCollection(db, oldName, newName);
    closeDb();

    const fromRoot = `${c.cyan}qmd://${oldName}/${c.reset}`;
    const toRoot = `${c.cyan}qmd://${newName}/${c.reset}`;
    console.log(`${c.green}✓${c.reset} Renamed collection '${oldName}' to '${newName}'`);
    console.log(` Virtual paths updated: ${fromRoot} → ${toRoot}`);
  }
  /**
   * Scan a directory and synchronise one collection's documents with it.
   *
   * Files matching `globPattern` are read and hashed. New paths are inserted,
   * changed content or titles are updated, and active documents whose path was
   * not seen in this scan are deactivated. Orphaned content is then cleaned up
   * and a summary is printed. Any path component that starts with "." or is a
   * known build/vendor directory name excludes the file.
   *
   * The database is closed on every exit path, including errors.
   *
   * @param pwd            Directory to scan; defaults to `getPwd()`.
   * @param globPattern    Glob selecting the files to index.
   * @param collectionName Collection to index into (required).
   * @throws Error when `collectionName` is missing.
   */
  async function indexFiles(pwd?: string, globPattern: string = DEFAULT_GLOB, collectionName?: string): Promise<void> {
    // Validate before opening the database or clearing the cache, so an
    // invalid call has no side effects and leaves no open handle behind.
    if (!collectionName) {
      throw new Error("Collection name is required. Collections must be defined in ~/.config/qmd/index.yml");
    }

    const db = getDb();
    try {
      const resolvedPwd = pwd || getPwd();
      const now = new Date().toISOString();
      const excludeDirs = new Set(["node_modules", ".git", ".cache", "vendor", "dist", "build"]);

      // The cache is cleared on every index run.
      clearCache(db);

      console.log(`Collection: ${resolvedPwd} (${globPattern})`);
      progress.indeterminate();

      const glob = new Glob(globPattern);
      const files: string[] = [];
      for await (const file of glob.scan({ cwd: resolvedPwd, onlyFiles: true, followSymlinks: true })) {
        // Skip hidden entries (.*) and the common build/vendor directories.
        const shouldSkip = file
          .split("/")
          .some(part => part.startsWith(".") || excludeDirs.has(part));
        if (!shouldSkip) {
          files.push(file);
        }
      }

      const total = files.length;
      if (total === 0) {
        progress.clear();
        console.log("No files found matching pattern.");
        return;
      }

      let indexed = 0;
      let updated = 0;
      let unchanged = 0;
      let processed = 0;
      const seenPaths = new Set<string>();
      const startTime = Date.now();

      for (const relativeFile of files) {
        const filepath = getRealPath(resolve(resolvedPwd, relativeFile));
        const path = handelize(relativeFile); // Normalize path for token-friendliness
        seenPaths.add(path);

        const content = await Bun.file(filepath).text();
        const hash = await hashContent(content);
        const title = extractTitle(content, relativeFile);

        const existing = findActiveDocument(db, collectionName, path);
        if (!existing) {
          // New document - insert content and document
          indexed++;
          insertContent(db, hash, content, now);
          const stat = await Bun.file(filepath).stat();
          insertDocument(db, collectionName, path, title, hash,
            stat ? new Date(stat.birthtime).toISOString() : now,
            stat ? new Date(stat.mtime).toISOString() : now);
        } else if (existing.hash !== hash) {
          // Content changed - insert new content hash and update document
          insertContent(db, hash, content, now);
          const stat = await Bun.file(filepath).stat();
          updateDocument(db, existing.id, title, hash,
            stat ? new Date(stat.mtime).toISOString() : now);
          updated++;
        } else if (existing.title !== title) {
          // Same content, but the extracted title differs from the stored one.
          updateDocumentTitle(db, existing.id, title, now);
          updated++;
        } else {
          unchanged++;
        }

        processed++;
        progress.set((processed / total) * 100);
        const elapsed = (Date.now() - startTime) / 1000;
        const rate = processed / elapsed;
        const remaining = (total - processed) / rate;
        // The estimate is only shown after a few files have been processed.
        const eta = processed > 2 ? ` ETA: ${formatETA(remaining)}` : "";
        process.stderr.write(`\rIndexing: ${processed}/${total}${eta} `);
      }

      // Deactivate documents in this collection that no longer exist
      let removed = 0;
      for (const path of getActiveDocumentPaths(db, collectionName)) {
        if (!seenPaths.has(path)) {
          deactivateDocument(db, collectionName, path);
          removed++;
        }
      }

      // Clean up orphaned content hashes (content not referenced by any document)
      const orphanedContent = cleanupOrphanedContent(db);

      // Check if vector index needs updating
      const needsEmbedding = getHashesNeedingEmbedding(db);

      progress.clear();
      console.log(`\nIndexed: ${indexed} new, ${updated} updated, ${unchanged} unchanged, ${removed} removed`);
      if (orphanedContent > 0) {
        console.log(`Cleaned up ${orphanedContent} orphaned content hash(es)`);
      }
      if (needsEmbedding > 0) {
        console.log(`\nRun 'qmd embed' to update embeddings (${needsEmbedding} unique hashes need vectors)`);
      }
    } finally {
      closeDb();
    }
  }
  /**
   * Render a textual progress bar made of filled (█) and empty (░) cells.
   *
   * `percent` is clamped to 0–100 (NaN counts as 0), so the bar is always
   * `width` cells long. Without the clamp an out-of-range value makes one of
   * the `repeat()` counts negative, which throws a RangeError.
   *
   * @param percent Completion percentage.
   * @param width   Total number of cells in the bar (default 30).
   * @returns The bar string.
   */
  function renderProgressBar(percent: number, width: number = 30): string {
    const clamped = Number.isNaN(percent) ? 0 : Math.min(100, Math.max(0, percent));
    const filled = Math.round((clamped / 100) * width);
    return "█".repeat(filled) + "░".repeat(width - filled);
  }
  1218. async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = false): Promise<void> {
  1219. const db = getDb();
  1220. const now = new Date().toISOString();
  1221. // If force, clear all vectors
  1222. if (force) {
  1223. console.log(`${c.yellow}Force re-indexing: clearing all vectors...${c.reset}`);
  1224. clearAllEmbeddings(db);
  1225. }
  1226. // Find unique hashes that need embedding (from active documents)
  1227. const hashesToEmbed = getHashesForEmbedding(db);
  1228. if (hashesToEmbed.length === 0) {
  1229. console.log(`${c.green}✓ All content hashes already have embeddings.${c.reset}`);
  1230. closeDb();
  1231. return;
  1232. }
  1233. // Prepare documents with chunks
  1234. type ChunkItem = { hash: string; title: string; text: string; seq: number; pos: number; tokens: number; bytes: number; displayName: string };
  1235. const allChunks: ChunkItem[] = [];
  1236. let multiChunkDocs = 0;
  1237. // Chunk all documents using actual token counts
  1238. process.stderr.write(`Chunking ${hashesToEmbed.length} documents by token count...\n`);
  1239. for (const item of hashesToEmbed) {
  1240. const encoder = new TextEncoder();
  1241. const bodyBytes = encoder.encode(item.body).length;
  1242. if (bodyBytes === 0) continue; // Skip empty
  1243. const title = extractTitle(item.body, item.path);
  1244. const displayName = item.path;
  1245. const chunks = await chunkDocumentByTokens(item.body); // Uses actual tokenizer
  1246. if (chunks.length > 1) multiChunkDocs++;
  1247. for (let seq = 0; seq < chunks.length; seq++) {
  1248. allChunks.push({
  1249. hash: item.hash,
  1250. title,
  1251. text: chunks[seq].text,
  1252. seq,
  1253. pos: chunks[seq].pos,
  1254. tokens: chunks[seq].tokens,
  1255. bytes: encoder.encode(chunks[seq].text).length,
  1256. displayName,
  1257. });
  1258. }
  1259. }
  1260. if (allChunks.length === 0) {
  1261. console.log(`${c.green}✓ No non-empty documents to embed.${c.reset}`);
  1262. closeDb();
  1263. return;
  1264. }
  1265. const totalBytes = allChunks.reduce((sum, c) => sum + c.bytes, 0);
  1266. const totalChunks = allChunks.length;
  1267. const totalDocs = hashesToEmbed.length;
  1268. console.log(`${c.bold}Embedding ${totalDocs} documents${c.reset} ${c.dim}(${totalChunks} chunks, ${formatBytes(totalBytes)})${c.reset}`);
  1269. if (multiChunkDocs > 0) {
  1270. console.log(`${c.dim}${multiChunkDocs} documents split into multiple chunks${c.reset}`);
  1271. }
  1272. console.log(`${c.dim}Model: ${model}${c.reset}\n`);
  1273. // Hide cursor during embedding
  1274. cursor.hide();
  1275. // Get embedding dimensions from first chunk
  1276. progress.indeterminate();
  1277. const llm = getDefaultLlamaCpp();
  1278. const firstText = formatDocForEmbedding(allChunks[0].text, allChunks[0].title);
  1279. const firstResult = await llm.embed(firstText);
  1280. if (!firstResult) {
  1281. throw new Error("Failed to get embedding dimensions from first chunk");
  1282. }
  1283. ensureVecTable(db, firstResult.embedding.length);
  1284. let chunksEmbedded = 0, errors = 0, bytesProcessed = 0;
  1285. const startTime = Date.now();
  1286. // Batch embedding for better throughput
  1287. // Process in batches of 32 to balance memory usage and efficiency
  1288. const BATCH_SIZE = 32;
  1289. for (let batchStart = 0; batchStart < allChunks.length; batchStart += BATCH_SIZE) {
  1290. const batchEnd = Math.min(batchStart + BATCH_SIZE, allChunks.length);
  1291. const batch = allChunks.slice(batchStart, batchEnd);
  1292. // Format texts for embedding
  1293. const texts = batch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title));
  1294. try {
  1295. // Batch embed all texts at once
  1296. const embeddings = await llm.embedBatch(texts);
  1297. // Insert each embedding
  1298. for (let i = 0; i < batch.length; i++) {
  1299. const chunk = batch[i];
  1300. const embedding = embeddings[i];
  1301. if (embedding) {
  1302. insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now);
  1303. chunksEmbedded++;
  1304. } else {
  1305. errors++;
  1306. console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}${c.reset}`);
  1307. }
  1308. bytesProcessed += chunk.bytes;
  1309. }
  1310. } catch (err) {
  1311. // If batch fails, try individual embeddings as fallback
  1312. for (const chunk of batch) {
  1313. try {
  1314. const text = formatDocForEmbedding(chunk.text, chunk.title);
  1315. const result = await llm.embed(text);
  1316. if (result) {
  1317. insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
  1318. chunksEmbedded++;
  1319. } else {
  1320. errors++;
  1321. }
  1322. } catch (innerErr) {
  1323. errors++;
  1324. console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}: ${innerErr}${c.reset}`);
  1325. }
  1326. bytesProcessed += chunk.bytes;
  1327. }
  1328. }
  1329. const percent = (bytesProcessed / totalBytes) * 100;
  1330. progress.set(percent);
  1331. const elapsed = (Date.now() - startTime) / 1000;
  1332. const bytesPerSec = bytesProcessed / elapsed;
  1333. const remainingBytes = totalBytes - bytesProcessed;
  1334. const etaSec = remainingBytes / bytesPerSec;
  1335. const bar = renderProgressBar(percent);
  1336. const percentStr = percent.toFixed(0).padStart(3);
  1337. const throughput = `${formatBytes(bytesPerSec)}/s`;
  1338. const eta = elapsed > 2 ? formatETA(etaSec) : "...";
  1339. const errStr = errors > 0 ? ` ${c.yellow}${errors} err${c.reset}` : "";
  1340. process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}%${c.reset} ${c.dim}${chunksEmbedded}/${totalChunks}${c.reset}${errStr} ${c.dim}${throughput} ETA ${eta}${c.reset} `);
  1341. }
  1342. progress.clear();
  1343. cursor.show();
  1344. const totalTimeSec = (Date.now() - startTime) / 1000;
  1345. const avgThroughput = formatBytes(totalBytes / totalTimeSec);
  1346. console.log(`\r${c.green}${renderProgressBar(100)}${c.reset} ${c.bold}100%${c.reset} `);
  1347. console.log(`\n${c.green}✓ Done!${c.reset} Embedded ${c.bold}${chunksEmbedded}${c.reset} chunks from ${c.bold}${totalDocs}${c.reset} documents in ${c.bold}${formatETA(totalTimeSec)}${c.reset} ${c.dim}(${avgThroughput}/s)${c.reset}`);
  1348. if (errors > 0) {
  1349. console.log(`${c.yellow}⚠ ${errors} chunks failed${c.reset}`);
  1350. }
  1351. closeDb();
  1352. }
  // Sanitize a single term for FTS5: keep letters, digits and combining marks
  // from any script, plus underscores and apostrophes (for contractions like
  // "don't"). Everything else, including whitespace, is removed.
  function sanitizeFTS5Term(term: string): string {
    // Unicode property classes instead of \w so that non-ASCII words such as
    // "café" or "日本語" are not truncated or erased.
    return term.replace(/[^\p{L}\p{N}\p{M}_']/gu, '');
  }

  // Build an FTS5 query: phrase-aware with fallback to individual terms.
  //
  // Returns "" when no usable term remains (terms shorter than 2 characters are
  // skipped), a single quoted term when exactly one term remains, and otherwise
  // `(phrase) OR (NEAR(...)) OR (term OR term ...)`.
  function buildFTS5Query(query: string): string {
    // Wrap a string in double quotes, doubling any embedded quote.
    const quote = (value: string): string => `"${value.replace(/"/g, '""')}"`;

    // Sanitize the full query for phrase matching (whitespace is kept here so
    // the phrase still contains word boundaries).
    const sanitizedQuery = query.replace(/[^\p{L}\p{N}\p{M}_\s']/gu, '').trim();

    const terms = query
      .split(/\s+/)
      .map(term => sanitizeFTS5Term(term))
      .filter(term => term.length >= 2); // Skip single chars and empty

    if (terms.length === 0) return "";
    if (terms.length === 1) return quote(terms[0]);

    // Strategy: exact phrase OR proximity match OR individual terms
    const phrase = quote(sanitizedQuery);
    const quotedTerms = terms.map(quote);
    // FTS5 NEAR syntax: NEAR(term1 term2, distance)
    const nearPhrase = `NEAR(${quotedTerms.join(' ')}, 10)`;
    const orTerms = quotedTerms.join(' OR ');

    return `(${phrase}) OR (${nearPhrase}) OR (${orTerms})`;
  }
  // Squash a raw BM25 score into the 0-1 range, where higher means a better match.
  //
  // SQLite reports BM25 scores as negative numbers (lower = better), typically
  // between about -15 (excellent) and -2 (weak), so only the magnitude is used.
  // A logistic curve centred on a magnitude of 5 with a scale of 3 does the mapping.
  function normalizeBM25(score: number): number {
    const magnitude = Math.abs(score);
    const exponent = -(magnitude - 5) / 3;
    const denominator = 1 + Math.exp(exponent);
    return 1 / denominator;
  }
  // Min-max normalize result scores into the 0-1 range.
  //
  // Returns the input array itself when it is empty; otherwise returns new
  // result objects (the inputs are not mutated). When every score is identical
  // the range falls back to 1, so all normalized scores become 0.
  function normalizeScores(results: SearchResult[]): SearchResult[] {
    if (results.length === 0) return results;

    // Fold instead of Math.max(...scores): spreading a very large array into
    // call arguments can exceed the engine's argument limit and throw.
    let maxScore = -Infinity;
    let minScore = Infinity;
    for (const { score } of results) {
      maxScore = Math.max(maxScore, score);
      minScore = Math.min(minScore, score);
    }

    const range = maxScore - minScore || 1;
    return results.map(r => ({ ...r, score: (r.score - minScore) / range }));
  }
  // Reciprocal Rank Fusion (RRF): merges several ranked lists into one ranking.
  // Each document accumulates weight / (k + rank) for every list it appears in,
  // with rank counted from 1. k=60 is the standard constant.
  export type RankedResult = {
    file: string;
    displayPath: string;
    title: string;
    body: string;
    score: number;
  };

  // Fuse `resultLists` into a single list sorted by descending fused score.
  //
  // - resultLists: ranked lists, best match first; documents are keyed by `file`.
  // - weights:     per-list multiplier, matched by index (missing entries count as 1.0).
  // - k:           RRF smoothing constant.
  //
  // The displayPath/title/body of a document come from its first occurrence.
  // A document that was ranked first in any list gets +0.05, and one that made
  // the top three gets +0.02, so exact matches are not diluted by expansion queries.
  function reciprocalRankFusion(
    resultLists: RankedResult[][],
    weights: number[] = [],
    k: number = 60
  ): RankedResult[] {
    type FusedEntry = { score: number; displayPath: string; title: string; body: string; bestRank: number };
    const fusedByFile = new Map<string, FusedEntry>();

    for (const [listIdx, list] of resultLists.entries()) {
      const listWeight = weights[listIdx] ?? 1.0;
      for (const [rank, doc] of list.entries()) {
        const contribution = listWeight / (k + rank + 1);
        const entry = fusedByFile.get(doc.file);
        if (!entry) {
          fusedByFile.set(doc.file, {
            score: contribution,
            displayPath: doc.displayPath,
            title: doc.title,
            body: doc.body,
            bestRank: rank,
          });
          continue;
        }
        entry.score += contribution;
        entry.bestRank = Math.min(entry.bestRank, rank);
      }
    }

    const fused: RankedResult[] = [];
    for (const [file, entry] of fusedByFile) {
      // bestRank is zero-based: 0 means "#1 somewhere", <= 2 means "top-3 somewhere".
      const bonus = entry.bestRank === 0 ? 0.05 : entry.bestRank <= 2 ? 0.02 : 0;
      fused.push({
        file,
        displayPath: entry.displayPath,
        title: entry.title,
        body: entry.body,
        score: entry.score + bonus,
      });
    }
    return fused.sort((left, right) => right.score - left.score);
  }
  /** Options that control how search results are fetched and rendered. */
  type OutputOptions = {
    /** Output format used to render the results. */
    format: OutputFormat;
    /** Emit the full body instead of a snippet. */
    full: boolean;
    /** Maximum number of results to output. */
    limit: number;
    /** Results scoring below this value are dropped. */
    minScore: number;
    /** Return all results (search commands raise their fetch limits). */
    all?: boolean;
    /** Filter by collection name (pwd suffix match). */
    collection?: string;
    /** Add line numbers to output. */
    lineNumbers?: boolean;
  };
  // Pick the line of `body` that contains the most query terms and return it
  // together with `contextLines` lines of context on each side (for CLI display).
  //
  // - chunkPos: optional character offset; when positive, the search starts
  //   200 characters before it instead of at the top of the body.
  //
  // Returns the 1-based line number of the best line (relative to the whole
  // body), the trimmed snippet, and whether any query term matched. Without a
  // match the snippet is the first `contextLines * 2` lines of the searched area.
  function extractSnippetWithContext(body: string, query: string, contextLines = 3, chunkPos?: number): { line: number; snippet: string; hasMatch: boolean } {
    // Narrow the search window to the chunk area when a position is given.
    let windowText = body;
    let skippedLines = 0;
    if (chunkPos && chunkPos > 0) {
      const windowStart = Math.max(0, chunkPos - 200);
      windowText = body.slice(windowStart);
      // Number of newlines before the window = lines skipped (0 when windowStart is 0).
      skippedLines = body.slice(0, windowStart).split('\n').length - 1;
    }

    const lines = windowText.split('\n');
    const needles = query.toLowerCase().split(/\s+/).filter(needle => needle.length > 0);

    // First line with the highest number of distinct matching terms wins.
    let bestIdx = 0;
    let bestHits = -1;
    lines.forEach((text, idx) => {
      const lowered = text.toLowerCase();
      const hits = needles.filter(needle => lowered.includes(needle)).length;
      if (hits > bestHits) {
        bestHits = hits;
        bestIdx = idx;
      }
    });

    if (bestHits <= 0) {
      // No query match found - return beginning of chunk area or file
      return {
        line: skippedLines + 1,
        snippet: lines.slice(0, contextLines * 2).join('\n').trim(),
        hasMatch: false,
      };
    }

    const from = Math.max(0, bestIdx - contextLines);
    const to = Math.min(lines.length, bestIdx + contextLines + 1);
    return {
      line: skippedLines + bestIdx + 1,
      snippet: lines.slice(from, to).join('\n').trim(),
      hasMatch: true,
    };
  }
  // Highlight query terms in text (skip short words < 3 chars).
  //
  // Returns the text unchanged when colour output is disabled or when the query
  // has no term of at least three characters. Matching is case-insensitive.
  function highlightTerms(text: string, query: string): string {
    if (!useColor) return text;

    // Deduplicate so a repeated query word is not wrapped twice.
    const terms = [...new Set(query.toLowerCase().split(/\s+/).filter(t => t.length >= 3))];
    if (terms.length === 0) return text;

    // Longest first, so "foobar" wins over "foo" inside the alternation.
    terms.sort((a, b) => b.length - a.length);

    // One combined pass over the ORIGINAL text: replacing term by term would let
    // later terms match inside the escape sequences inserted for earlier ones.
    const pattern = terms.map(t => t.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|');
    const regex = new RegExp(pattern, 'gi');
    return text.replace(regex, match => `${c.yellow}${c.bold}${match}${c.reset}`);
  }
  // Render a score as a right-aligned whole percentage ("  5%", " 70%", "100%").
  // With colour enabled the text is tinted green from 0.7 up, yellow from 0.4 up,
  // and dim below that.
  function formatScore(score: number): string {
    const label = `${(score * 100).toFixed(0).padStart(3)}%`;
    if (!useColor) return label;

    const tint = score >= 0.7 ? c.green : score >= 0.4 ? c.yellow : c.dim;
    return `${tint}${label}${c.reset}`;
  }
  // Shorten directory path for display - relative to $HOME (used for context paths, not documents).
  //
  // Returns "~" for the home directory itself, "~/..." for paths below it, and
  // the path unchanged otherwise.
  function shortPath(dirpath: string): string {
    // Ignore trailing separators so "/home/me/" and "/home/me" behave the same.
    const home = homedir().replace(/[\\/]+$/, '');
    // An empty home (unset, or the filesystem root) would match every path.
    if (!home) return dirpath;
    if (dirpath === home) return '~';

    if (dirpath.startsWith(home)) {
      // Require a separator right after the prefix, so a sibling such as
      // "/home/me2" is not rewritten to "~2".
      const next = dirpath[home.length];
      if (next === '/' || next === '\\') {
        return '~' + dirpath.slice(home.length);
      }
    }
    return dirpath;
  }
  // Prefix every line of `text` with "<number>: ", counting up from `startLine`.
  function addLineNumbers(text: string, startLine: number = 1): string {
    const numbered: string[] = [];
    for (const [offset, content] of text.split('\n').entries()) {
      numbered.push(`${startLine + offset}: ${content}`);
    }
    return numbered.join('\n');
  }
  // Print search results to stdout in the format selected by `opts.format`
  // ("json", "files", "cli", "md", "xml"; anything else is rendered as CSV).
  //
  // Results scoring below `opts.minScore` are dropped and at most `opts.limit`
  // are printed. `query` is used for snippet extraction and term highlighting.
  function outputResults(results: { file: string; displayPath: string; title: string; body: string; score: number; context?: string | null; chunkPos?: number; hash?: string; docid?: string }[], query: string, opts: OutputOptions): void {
    const filtered = results.filter(r => r.score >= opts.minScore).slice(0, opts.limit);
    if (filtered.length === 0) {
      console.log("No results found above minimum score threshold.");
      return;
    }

    // Helper to create qmd:// URI from displayPath
    const toQmdPath = (displayPath: string) => `qmd://${displayPath}`;

    // Explicit docid if present, otherwise the first six characters of the hash.
    const shortDocid = (row: { docid?: string; hash?: string }): string | undefined =>
      row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);

    // Escape a value for use inside a double-quoted XML attribute. "&" must be
    // escaped first so the entities produced below are not escaped again.
    const escapeXmlAttr = (value: string): string =>
      value
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');

    if (opts.format === "json") {
      // JSON output for LLM consumption
      const output = filtered.map(row => {
        const docid = shortDocid(row);
        let body = opts.full ? row.body : undefined;
        let snippet = !opts.full ? extractSnippet(row.body, query, 300, row.chunkPos).snippet : undefined;
        if (opts.lineNumbers) {
          if (body) body = addLineNumbers(body);
          if (snippet) snippet = addLineNumbers(snippet);
        }
        // Optional fields are omitted entirely when empty.
        return {
          ...(docid && { docid: `#${docid}` }),
          score: Math.round(row.score * 100) / 100,
          file: toQmdPath(row.displayPath),
          title: row.title,
          ...(row.context && { context: row.context }),
          ...(body && { body }),
          ...(snippet && { snippet }),
        };
      });
      console.log(JSON.stringify(output, null, 2));
    } else if (opts.format === "files") {
      // Simple docid,score,filepath,context output
      for (const row of filtered) {
        const docid = shortDocid(row) ?? "";
        const ctx = row.context ? `,"${row.context.replace(/"/g, '""')}"` : "";
        console.log(`#${docid},${row.score.toFixed(2)},${toQmdPath(row.displayPath)}${ctx}`);
      }
    } else if (opts.format === "cli") {
      for (let i = 0; i < filtered.length; i++) {
        const row = filtered[i];
        const { line, snippet, hasMatch } = extractSnippetWithContext(row.body, query, 2, row.chunkPos);
        const docid = shortDocid(row);

        // Line 1: filepath with docid (line number only when the query matched)
        const path = toQmdPath(row.displayPath);
        const lineInfo = hasMatch ? `:${line}` : "";
        const docidStr = docid ? ` ${c.dim}#${docid}${c.reset}` : "";
        console.log(`${c.cyan}${path}${c.dim}${lineInfo}${c.reset}${docidStr}`);

        // Line 2: Title (if available)
        if (row.title) {
          console.log(`${c.bold}Title: ${row.title}${c.reset}`);
        }

        // Line 3: Context (if available)
        if (row.context) {
          console.log(`${c.dim}Context: ${row.context}${c.reset}`);
        }

        // Line 4: Score
        const score = formatScore(row.score);
        console.log(`Score: ${c.bold}${score}${c.reset}`);
        console.log();

        // Snippet with highlighting (no leading | chars for better word wrap)
        const displaySnippet = opts.lineNumbers ? addLineNumbers(snippet, line) : snippet;
        console.log(highlightTerms(displaySnippet, query));

        // Double empty line between results
        if (i < filtered.length - 1) console.log('\n');
      }
    } else if (opts.format === "md") {
      for (const row of filtered) {
        const heading = row.title || row.displayPath;
        const docid = shortDocid(row);
        let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos).snippet;
        if (opts.lineNumbers) {
          content = addLineNumbers(content);
        }
        const docidLine = docid ? `**docid:** \`#${docid}\`\n` : "";
        const contextLine = row.context ? `**context:** ${row.context}\n` : "";
        console.log(`---\n# ${heading}\n${docidLine}${contextLine}\n${content}\n`);
      }
    } else if (opts.format === "xml") {
      for (const row of filtered) {
        // Attribute values are fully escaped (&, <, > and ") so that titles,
        // contexts and paths containing markup characters stay well-formed.
        const titleAttr = row.title ? ` title="${escapeXmlAttr(row.title)}"` : "";
        const contextAttr = row.context ? ` context="${escapeXmlAttr(row.context)}"` : "";
        const docid = shortDocid(row) ?? "";
        let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos).snippet;
        if (opts.lineNumbers) {
          content = addLineNumbers(content);
        }
        console.log(`<file docid="#${docid}" name="${escapeXmlAttr(toQmdPath(row.displayPath))}"${titleAttr}${contextAttr}>\n${content}\n</file>\n`);
      }
    } else {
      // CSV format
      console.log("docid,score,file,title,context,line,snippet");
      for (const row of filtered) {
        const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos);
        let content = opts.full ? row.body : snippet;
        if (opts.lineNumbers) {
          // A full body always starts at line 1; only a snippet starts at `line`.
          content = addLineNumbers(content, opts.full ? 1 : line);
        }
        const docid = shortDocid(row) ?? "";
        console.log(`#${docid},${row.score.toFixed(4)},${escapeCSV(toQmdPath(row.displayPath))},${escapeCSV(row.title)},${escapeCSV(row.context || "")},${line},${escapeCSV(content)}`);
      }
    }
  }
  // Full-text (BM25) search command: runs one FTS query, attaches the context
  // of each hit and prints the results via outputResults.
  //
  // Exits the process with status 1 when `opts.collection` names an unknown
  // collection. The database is closed before any results are printed.
  function search(query: string, opts: OutputOptions): void {
    const db = getDb();

    // Validate collection filter if specified
    if (opts.collection && !getCollectionFromYaml(opts.collection)) {
      console.error(`Collection not found: ${opts.collection}`);
      closeDb();
      process.exit(1);
    }
    const collectionName: string | undefined = opts.collection || undefined;

    // Use large limit for --all, otherwise fetch more than needed and let outputResults filter
    const fetchLimit = opts.all ? 100000 : Math.max(50, opts.limit * 2);

    // searchFTS accepts collection name as number parameter for legacy reasons (will be fixed in store.ts)
    const hits = searchFTS(db, query, fetchLimit, collectionName as any);
    const hitsWithContext = hits.map(hit => ({
      ...hit,
      context: getContextForFile(db, hit.filepath),
    }));
    closeDb();

    if (hitsWithContext.length > 0) {
      outputResults(hitsWithContext, query, opts);
      return;
    }
    console.log("No results found.");
  }
  // Vector (semantic) search command.
  //
  // Expands the query (without a lexical variant), runs a vector search for the
  // original query plus each expansion, keeps the best score per file, and
  // prints the top `opts.limit` results via outputResults.
  //
  // Exits the process with status 1 when `opts.collection` names an unknown
  // collection; prints an error and returns when the vector table is missing.
  async function vectorSearch(query: string, opts: OutputOptions, model: string = DEFAULT_EMBED_MODEL): Promise<void> {
    const db = getDb();

    // Validate collection filter if specified
    let collectionName: string | undefined;
    if (opts.collection) {
      const coll = getCollectionFromYaml(opts.collection);
      if (!coll) {
        console.error(`Collection not found: ${opts.collection}`);
        closeDb();
        process.exit(1);
      }
      collectionName = opts.collection;
    }

    const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (!tableExists) {
      console.error("Vector index not found. Run 'qmd embed' first to create embeddings.");
      closeDb();
      return;
    }

    // Check index health and warn about issues
    checkIndexHealth(db);

    // Expand query using structured output (no lexical for vector-only search)
    const expanded = await expandQueryStructured(query, false);

    // Build list of queries for vector search: original, vectorQuery, and hyde
    const vectorQueries: string[] = [query];
    if (expanded.vectorQuery && expanded.vectorQuery !== query) {
      vectorQueries.push(expanded.vectorQuery);
    }
    if (expanded.hyde && expanded.hyde.length > 20) {
      vectorQueries.push(expanded.hyde);
    }

    process.stderr.write(`${c.dim}Searching ${vectorQueries.length} vector queries...${c.reset}\n`);

    // Collect results from all query variations. Fetch at least as many rows per
    // query as the caller asked for; a fixed 20 would make a larger limit
    // impossible to satisfy.
    const perQueryLimit = opts.all ? 500 : Math.max(20, opts.limit);
    const allResults = new Map<string, { file: string; displayPath: string; title: string; body: string; score: number; hash: string }>();

    for (const q of vectorQueries) {
      const vecResults = await searchVec(db, q, model, perQueryLimit, collectionName as any);
      for (const r of vecResults) {
        // Keep the highest score seen for each file across all query variations.
        const existing = allResults.get(r.filepath);
        if (!existing || r.score > existing.score) {
          allResults.set(r.filepath, { file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score, hash: r.hash });
        }
      }
    }

    // Sort by max score and limit to requested count
    const results = Array.from(allResults.values())
      .sort((a, b) => b.score - a.score)
      .slice(0, opts.limit)
      .map(r => ({ ...r, context: getContextForFile(db, r.file) }));

    closeDb();

    if (results.length === 0) {
      console.log("No results found.");
      return;
    }
    outputResults(results, query, { ...opts, limit: results.length }); // Already limited
  }
  // Ask the default LLM for a structured expansion of `query` and log the
  // result to stderr as a small tree (original query first).
  //
  // - includeLexical: forwarded to the LLM; also selects the label shown next
  //   to the original query.
  //
  // Returns the expansion exactly as produced by the LLM.
  async function expandQueryStructured(query: string, includeLexical: boolean = true): Promise<ExpandedQuery> {
    process.stderr.write(`${c.dim}Expanding query...${c.reset}\n`);

    const expanded = await getDefaultLlamaCpp().expandQueryStructured(query, includeLexical);

    // One entry per tree branch: the text shown after the connector.
    const branches: string[] = [
      `${query}${includeLexical ? ' · (lexical+vector)' : ' · (vector)'}`,
    ];
    if (expanded.lexicalQuery && expanded.lexicalQuery !== query) {
      branches.push(`${expanded.lexicalQuery} · (lexical)`);
    }
    if (expanded.vectorQuery && expanded.vectorQuery !== query) {
      branches.push(`${expanded.vectorQuery} · (vector)`);
    }
    if (expanded.hyde && expanded.hyde.length > 20) {
      // Show at most the first 60 characters of hyde, on a single line.
      const isLong = expanded.hyde.length > 60;
      const head = isLong ? expanded.hyde.substring(0, 60) : expanded.hyde;
      const hydePreview = head.replace(/\n/g, ' ') + (isLong ? '...' : '');
      branches.push(`${hydePreview} · (vector)`);
    }

    // The last branch closes the tree with └─; every other branch uses ├─.
    const lastIdx = branches.length - 1;
    branches.forEach((branch, idx) => {
      const connector = idx === lastIdx ? '└─' : '├─';
      process.stderr.write(`${c.dim}${connector} ${branch}${c.reset}` + '\n');
    });

    return expanded;
  }
  // Legacy wrapper for backward compatibility: returns the original query
  // followed by the lexical and vector expansions that are non-empty and differ
  // from it. `_model` and `_db` are accepted but not used.
  async function expandQuery(query: string, _model: string = DEFAULT_QUERY_MODEL, _db?: Database): Promise<string[]> {
    const { lexicalQuery, vectorQuery } = await expandQueryStructured(query, true);
    const variants = [lexicalQuery, vectorQuery].filter(
      (candidate): candidate is string => !!candidate && candidate !== query
    );
    return [query, ...variants];
  }
  // Hybrid search command: BM25 + vector retrieval, fused with Reciprocal Rank
  // Fusion (RRF) and refined by chunk-level reranking.
  //
  // Steps:
  //   1. Run a BM25 search for the original query; when its top score is above
  //      0.7 the LLM query expansion is skipped.
  //   2. Otherwise expand the query into extra lexical / vector / hyde queries.
  //   3. Run every query and collect one ranked list per search.
  //   4. Fuse the lists with RRF, giving original-query lists twice the weight.
  //   5. Rerank up to MAX_CHUNKS_PER_DOC chunks per candidate and aggregate the
  //      chunk scores per document (average of the top two).
  //   6. Blend RRF rank and reranker score with position-aware weights and print.
  //
  // Exits the process with status 1 when `opts.collection` names an unknown
  // collection.
  async function querySearch(query: string, opts: OutputOptions, embedModel: string = DEFAULT_EMBED_MODEL, rerankModel: string = DEFAULT_RERANK_MODEL): Promise<void> {
    const db = getDb();

    // Validate collection filter if specified
    let collectionName: string | undefined;
    if (opts.collection) {
      const coll = getCollectionFromYaml(opts.collection);
      if (!coll) {
        console.error(`Collection not found: ${opts.collection}`);
        closeDb();
        process.exit(1);
      }
      collectionName = opts.collection;
    }

    // Check index health and warn about issues
    checkIndexHealth(db);

    // Run initial BM25 search (will be reused for retrieval)
    const initialFts = searchFTS(db, query, 20, collectionName as any);
    const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();

    // Check if initial results have strong signals (skip expansion if so)
    // Strong signal = top result has high normalized score (> 0.7)
    const hasStrongSignal = initialFts.length > 0 && initialFts[0].score > 0.7;

    const ftsQueries: string[] = [query];
    const vectorQueries: string[] = [query];

    if (hasStrongSignal) {
      // Strong BM25 signal - skip expensive LLM expansion
      process.stderr.write(`${c.dim}Strong BM25 signal (${initialFts[0].score.toFixed(2)}) - skipping expansion${c.reset}\n`);
    } else {
      // Weak signal - expand query for better recall
      const expanded = await expandQueryStructured(query, true);
      if (expanded.lexicalQuery && expanded.lexicalQuery !== query) {
        ftsQueries.push(expanded.lexicalQuery);
      }
      if (expanded.vectorQuery && expanded.vectorQuery !== query) {
        vectorQueries.push(expanded.vectorQuery);
      }
      if (expanded.hyde && expanded.hyde.length > 20) {
        vectorQueries.push(expanded.hyde);
      }
    }

    process.stderr.write(`${c.dim}Searching ${ftsQueries.length} lexical + ${vectorQueries.length} vector queries...${c.reset}\n`);

    // Ranked result lists for RRF fusion, with one weight per list. The weight
    // is recorded when the list is added: lists are appended in the order
    // "original FTS, expanded FTS, vector", and empty lists are skipped, so the
    // position of a list says nothing about which query produced it.
    const ORIGINAL_QUERY_WEIGHT = 2.0;
    const EXPANDED_QUERY_WEIGHT = 1.0;
    const rankedLists: RankedResult[][] = [];
    const listWeights: number[] = [];

    // Map to store hash by filepath for final results
    const hashMap = new Map<string, string>();

    // FTS searches with lexical queries (reuse initial search for original query)
    if (initialFts.length > 0) {
      for (const r of initialFts) hashMap.set(r.filepath, r.hash);
      rankedLists.push(initialFts.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score })));
      listWeights.push(ORIGINAL_QUERY_WEIGHT);
    }

    // Run expanded queries (skip first which is original)
    for (const q of ftsQueries.slice(1)) {
      const ftsResults = searchFTS(db, q, 20, collectionName as any);
      if (ftsResults.length > 0) {
        for (const r of ftsResults) hashMap.set(r.filepath, r.hash);
        rankedLists.push(ftsResults.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score })));
        listWeights.push(EXPANDED_QUERY_WEIGHT);
      }
    }

    // Vector searches with semantic queries + hyde
    if (hasVectors) {
      for (let queryIdx = 0; queryIdx < vectorQueries.length; queryIdx++) {
        const vecResults = await searchVec(db, vectorQueries[queryIdx], embedModel, 20, collectionName as any);
        if (vecResults.length > 0) {
          for (const r of vecResults) hashMap.set(r.filepath, r.hash);
          rankedLists.push(vecResults.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score })));
          // vectorQueries[0] is always the original query.
          listWeights.push(queryIdx === 0 ? ORIGINAL_QUERY_WEIGHT : EXPANDED_QUERY_WEIGHT);
        }
      }
    }

    // Apply Reciprocal Rank Fusion to combine all ranked lists
    const fused = reciprocalRankFusion(rankedLists, listWeights);
    const candidates = fused.slice(0, 30); // Over-retrieve for reranking

    if (candidates.length === 0) {
      console.log("No results found.");
      closeDb();
      return;
    }

    // Rerank multiple chunks per document, then aggregate scores
    // This improves ranking for long documents where keyword-matched chunk isn't always best
    const MAX_CHUNKS_PER_DOC = 3;
    const chunksToRerank: { file: string; text: string; chunkIdx: number }[] = [];
    const docChunkMap = new Map<string, { chunks: { text: string; pos: number }[]; selectedIndices: number[] }>();

    for (const candidate of candidates) {
      const chunks = chunkDocument(candidate.body);
      if (chunks.length <= MAX_CHUNKS_PER_DOC) {
        // Small document - rerank all chunks
        for (let i = 0; i < chunks.length; i++) {
          chunksToRerank.push({ file: candidate.file, text: chunks[i].text, chunkIdx: i });
        }
        docChunkMap.set(candidate.file, { chunks, selectedIndices: chunks.map((_, i) => i) });
      } else {
        // Score all chunks by keyword match, select top MAX_CHUNKS_PER_DOC
        const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
        const scored = chunks.map((chunk, idx) => {
          const chunkLower = chunk.text.toLowerCase();
          const score = queryTerms.reduce((acc, term) => acc + (chunkLower.includes(term) ? 1 : 0), 0);
          return { idx, score };
        });
        scored.sort((a, b) => b.score - a.score);
        const selectedIndices = scored.slice(0, MAX_CHUNKS_PER_DOC).map(s => s.idx);
        for (const idx of selectedIndices) {
          chunksToRerank.push({ file: candidate.file, text: chunks[idx].text, chunkIdx: idx });
        }
        docChunkMap.set(candidate.file, { chunks, selectedIndices });
      }
    }

    // Rerank all selected chunks (with caching)
    // Use file:chunkIdx as unique identifier for reranker
    const reranked = await rerank(
      query,
      chunksToRerank.map(chunk => ({ file: `${chunk.file}:${chunk.chunkIdx}`, text: chunk.text })),
      rerankModel,
      db
    );

    // Collect the chunk scores of each document and remember which chunk scored
    // best. The running best score is tracked explicitly so the result does not
    // depend on the order in which the reranker returns the chunks.
    const docScores = new Map<string, { scores: number[]; bestChunkIdx: number; bestScore: number }>();
    for (const r of reranked) {
      // Split "file:chunkIdx" on the trailing ":<digits>" only (paths may contain colons).
      const [file, chunkIdxStr] = r.file.split(/:(\d+)$/);
      const chunkIdx = parseInt(chunkIdxStr || "0", 10);
      const existing = docScores.get(file);
      if (existing) {
        existing.scores.push(r.score);
        if (r.score > existing.bestScore) {
          existing.bestScore = r.score;
          existing.bestChunkIdx = chunkIdx;
        }
      } else {
        docScores.set(file, { scores: [r.score], bestChunkIdx: chunkIdx, bestScore: r.score });
      }
    }

    // Compute aggregated score: top-2 average (rewards consistency across chunks);
    // with a single chunk this is just that chunk's score.
    const aggregatedScores = new Map<string, { score: number; bestChunkIdx: number }>();
    for (const [file, { scores, bestChunkIdx }] of docScores) {
      scores.sort((a, b) => b - a);
      const topScores = scores.slice(0, 2);
      const avgScore = topScores.reduce((a, b) => a + b, 0) / topScores.length;
      aggregatedScores.set(file, { score: avgScore, bestChunkIdx });
    }

    // Blend RRF position score with aggregated reranker score using position-aware weights
    // Top retrieval results get more protection from reranker disagreement
    const candidateMap = new Map(candidates.map(cand => [cand.file, { displayPath: cand.displayPath, title: cand.title, body: cand.body }]));
    const rrfRankMap = new Map(candidates.map((cand, i) => [cand.file, i + 1])); // 1-indexed rank

    const finalResults = Array.from(aggregatedScores.entries()).map(([file, { score: rerankScore, bestChunkIdx }]) => {
      const rrfRank = rrfRankMap.get(file) || 30;

      // Position-aware blending: top retrieval results preserved more
      // Rank 1-3: 75% RRF, 25% reranker (trust retrieval for exact matches)
      // Rank 4-10: 60% RRF, 40% reranker
      // Rank 11+: 40% RRF, 60% reranker (trust reranker for lower-ranked)
      let rrfWeight: number;
      if (rrfRank <= 3) {
        rrfWeight = 0.75;
      } else if (rrfRank <= 10) {
        rrfWeight = 0.60;
      } else {
        rrfWeight = 0.40;
      }

      const rrfScore = 1 / rrfRank; // Position-based: 1, 0.5, 0.33...
      const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * rerankScore;

      const candidate = candidateMap.get(file);
      // Use the best-scoring chunk's text for the body (better for snippets)
      const chunkInfo = docChunkMap.get(file);
      const chunkBody = chunkInfo ? chunkInfo.chunks[bestChunkIdx]?.text || chunkInfo.chunks[0].text : candidate?.body || "";
      const chunkPos = chunkInfo ? chunkInfo.chunks[bestChunkIdx]?.pos || 0 : 0;

      return {
        file,
        displayPath: candidate?.displayPath || "",
        title: candidate?.title || "",
        body: chunkBody,
        chunkPos,
        score: blendedScore,
        context: getContextForFile(db, file),
        hash: hashMap.get(file) || "",
      };
    }).sort((a, b) => b.score - a.score);

    // Deduplicate by file (safety net - shouldn't happen but prevents duplicate output)
    const seenFiles = new Set<string>();
    const dedupedResults = finalResults.filter(r => {
      if (seenFiles.has(r.file)) return false;
      seenFiles.add(r.file);
      return true;
    });

    closeDb();
    outputResults(dedupedResults, query, opts);
  }
  /**
   * Parse the process command line (via util.parseArgs) into a command, its
   * positional arguments, and normalized output options.
   *
   * Parsing is non-strict so unknown flags pass through. A consequence of
   * non-strict mode is that a string option given without a value (for example
   * a trailing `--index`) is reported as boolean `true`, so string options are
   * type-checked here before being used.
   *
   * Side effects: calls setCustomIndexName() when `--index <name>` is supplied;
   * prints an error and exits with status 1 when `--index` or `--collection`
   * is given without a value.
   *
   * @returns An object with:
   *   - `command`: first positional argument ("" when absent)
   *   - `args`: the remaining positional arguments
   *   - `query`: `args` joined with single spaces
   *   - `opts`: the OutputOptions derived from the flags
   *   - `values`: the raw parsed option values
   */
  function parseCLI() {
    const { values, positionals } = parseArgs({
      args: Bun.argv.slice(2), // Skip bun and script path
      options: {
        // Global options
        index: { type: "string" },
        help: { type: "boolean", short: "h" },
        // Search options
        n: { type: "string" },
        "min-score": { type: "string" },
        all: { type: "boolean" },
        full: { type: "boolean" },
        csv: { type: "boolean" },
        md: { type: "boolean" },
        xml: { type: "boolean" },
        files: { type: "boolean" },
        json: { type: "boolean" },
        collection: { type: "string", short: "c" }, // Filter by collection
        // Collection options
        name: { type: "string" }, // collection name
        mask: { type: "string" }, // glob pattern
        // Embed options
        force: { type: "boolean", short: "f" },
        // Update options
        pull: { type: "boolean" }, // git pull before update
        // Get options
        l: { type: "string" }, // max lines
        from: { type: "string" }, // start line
        "max-bytes": { type: "string" }, // max bytes for multi-get
        "line-numbers": { type: "boolean" }, // add line numbers to output
      },
      allowPositionals: true,
      strict: false, // Allow unknown options to pass through
    });

    // Returns the option when it is a string (or absent). A non-string value
    // means the flag was given without its argument, which is a usage error.
    const stringOption = (flag: string, value: unknown): string | undefined => {
      if (value === undefined || typeof value === "string") {
        return value;
      }
      console.error(`Option ${flag} requires a value`);
      process.exit(1);
    };

    // Set global index name in store
    const indexName = stringOption("--index", values.index);
    if (indexName) {
      setCustomIndexName(indexName);
    }

    // Determine output format (first matching flag wins, in this order)
    let format: OutputFormat = "cli";
    if (values.csv) format = "csv";
    else if (values.md) format = "md";
    else if (values.xml) format = "xml";
    else if (values.files) format = "files";
    else if (values.json) format = "json";

    // Default limit: 20 for --files/--json, 5 otherwise
    // --all means return all results (use very large limit)
    const defaultLimit = (format === "files" || format === "json") ? 20 : 5;
    const isAll = values.all || false;

    // Only a positive integer is a usable limit; zero, negative, missing, or
    // non-numeric input falls back to the format's default.
    const requestedLimit = typeof values.n === "string" ? Number.parseInt(values.n, 10) : Number.NaN;
    const limit = requestedLimit > 0 ? requestedLimit : defaultLimit;

    // Unparseable or missing --min-score means "no minimum" (0).
    const rawMinScore = values["min-score"];
    const minScore = typeof rawMinScore === "string" ? Number.parseFloat(rawMinScore) || 0 : 0;

    const opts: OutputOptions = {
      format,
      full: values.full || false,
      limit: isAll ? 100000 : limit,
      minScore,
      all: isAll,
      collection: stringOption("--collection", values.collection),
      lineNumbers: values["line-numbers"] || false,
    };

    const args = positionals.slice(1);
    return {
      command: positionals[0] || "",
      args,
      query: args.join(" "),
      opts,
      values,
    };
  }
  /**
   * Print the CLI usage summary to stdout: commands, global options, search
   * options, multi-get options, model names, and finally the index path
   * reported by getDbPath().
   *
   * The text is kept in sync with the dispatcher and option parser in this
   * file: it lists `qmd context check`, and states that the 20-result default
   * applies to both --files and --json.
   */
  function showHelp(): void {
    const helpLines = [
      "Usage:",
      " qmd collection add [path] --name <name> --mask <pattern> - Create/index collection",
      " qmd collection list - List all collections with details",
      " qmd collection remove <name> - Remove a collection by name",
      " qmd collection rename <old> <new> - Rename a collection",
      " qmd ls [collection[/path]] - List collections or files in a collection",
      " qmd context add [path] \"text\" - Add context for path (defaults to current dir)",
      " qmd context list - List all contexts",
      " qmd context check - Check for missing contexts",
      " qmd context rm <path> - Remove context",
      " qmd get <file>[:line] [-l N] [--from N] - Get document (optionally from line, max N lines)",
      " qmd multi-get <pattern> [-l N] [--max-bytes N] - Get multiple docs by glob or comma-separated list",
      " qmd status - Show index status and collections",
      " qmd update [--pull] - Re-index all collections (--pull: git pull first)",
      " qmd embed [-f] - Create vector embeddings (800 tokens/chunk, 15% overlap)",
      " qmd cleanup - Remove cache and orphaned data, vacuum DB",
      " qmd search <query> - Full-text search (BM25)",
      " qmd vsearch <query> - Vector similarity search",
      " qmd query <query> - Combined search with query expansion + reranking",
      " qmd mcp - Start MCP server (for AI agent integration)",
      "",
      "Global options:",
      " --index <name> - Use custom index name (default: index)",
      "",
      "Search options:",
      " -n <num> - Number of results (default: 5, or 20 for --files/--json)",
      " --all - Return all matches (use with --min-score to filter)",
      " --min-score <num> - Minimum similarity score",
      " --full - Output full document instead of snippet",
      " --line-numbers - Add line numbers to output",
      " --files - Output docid,score,filepath,context (default: 20 results)",
      " --json - JSON output with snippets (default: 20 results)",
      " --csv - CSV output with snippets",
      " --md - Markdown output",
      " --xml - XML output",
      " -c, --collection <name> - Filter results to a specific collection",
      "",
      "Multi-get options:",
      " -l <num> - Maximum lines per file",
      " --max-bytes <num> - Skip files larger than N bytes (default: 10240)",
      " --json/--csv/--md/--xml/--files - Output format (same as search)",
      "",
      "Models (auto-downloaded from HuggingFace):",
      " Embedding: embeddinggemma-300M-Q8_0",
      " Reranking: qwen3-reranker-0.6b-q8_0",
      " Generation: Qwen3-0.6B-Q8_0",
      "",
    ];

    for (const line of helpLines) {
      console.log(line);
    }

    // Resolved last, after the static text has been printed, so the usage
    // text is still shown even if looking up the index path fails.
    console.log(`Index: ${getDbPath()}`);
  }
  // Main CLI - only run if this is the main module.
  //
  // Flow: parse argv, show help when no command or --help is given, dispatch
  // on the command name, then dispose the default LlamaCpp instance. Usage
  // errors print to stderr and exit with status 1.
  if (import.meta.main) {
    const cli = parseCLI();

    // --help exits 0; a missing command is a usage error and exits 1.
    if (!cli.command || cli.values.help) {
      showHelp();
      process.exit(cli.values.help ? 0 : 1);
    }

    try {
      switch (cli.command) {
        case "context": {
          const subcommand = cli.args[0];
          if (!subcommand) {
            console.error("Usage: qmd context <add|list|check|rm>");
            console.error("");
            console.error("Commands:");
            console.error(" qmd context add [path] \"text\" - Add context (defaults to current dir)");
            console.error(" qmd context add / \"text\" - Add global context to all collections");
            console.error(" qmd context list - List all contexts");
            console.error(" qmd context check - Check for missing contexts");
            console.error(" qmd context rm <path> - Remove context");
            process.exit(1);
          }

          switch (subcommand) {
            case "add": {
              if (cli.args.length < 2) {
                console.error("Usage: qmd context add [path] \"text\"");
                console.error("");
                console.error("Examples:");
                console.error(" qmd context add \"Context for current directory\"");
                console.error(" qmd context add . \"Context for current directory\"");
                console.error(" qmd context add /subfolder \"Context for subfolder\"");
                console.error(" qmd context add / \"Global context for all collections\"");
                console.error("");
                console.error(" Using virtual paths:");
                console.error(" qmd context add qmd://journals/ \"Context for entire journals collection\"");
                console.error(" qmd context add qmd://journals/2024 \"Context for 2024 journals\"");
                process.exit(1);
              }

              let pathArg: string | undefined;
              let contextText: string;

              // Check if first arg looks like a path or if it's the context text
              const firstArg = cli.args[1];
              const secondArg = cli.args[2];

              if (secondArg) {
                // Two or more args: path + context (remaining words are joined)
                pathArg = firstArg;
                contextText = cli.args.slice(2).join(" ");
              } else {
                // One arg: context only (use current directory)
                pathArg = undefined;
                contextText = firstArg;
              }

              await contextAdd(pathArg, contextText);
              break;
            }

            case "list": {
              contextList();
              break;
            }

            case "check": {
              contextCheck();
              break;
            }

            case "rm":
            case "remove": {
              if (cli.args.length < 2) {
                console.error("Usage: qmd context rm <path>");
                console.error("Examples:");
                console.error(" qmd context rm /");
                console.error(" qmd context rm qmd://journals/2024");
                process.exit(1);
              }
              contextRemove(cli.args[1]);
              break;
            }

            default:
              console.error(`Unknown subcommand: ${subcommand}`);
              console.error("Available: add, list, check, rm");
              process.exit(1);
          }
          break;
        }

        // Legacy alias for backwards compatibility
        case "add-context": {
          console.error(`${c.yellow}Note: 'qmd add-context' is deprecated. Use 'qmd context add' instead.${c.reset}`);
          if (cli.args.length === 0) {
            console.error("Usage: qmd context add [path] \"text\"");
            process.exit(1);
          }

          let pathArg: string | undefined;
          let contextText: string;

          if (cli.args.length === 1) {
            // Single argument is the context text for the current directory
            pathArg = undefined;
            contextText = cli.args[0];
          } else {
            pathArg = cli.args[0];
            contextText = cli.args.slice(1).join(" ");
          }

          await contextAdd(pathArg, contextText);
          break;
        }

        case "get": {
          if (!cli.args[0]) {
            console.error("Usage: qmd get <filepath>[:line] [--from <line>] [-l <lines>] [--line-numbers]");
            process.exit(1);
          }
          const fromLine = cli.values.from ? parseInt(cli.values.from as string, 10) : undefined;
          const maxLines = cli.values.l ? parseInt(cli.values.l as string, 10) : undefined;
          getDocument(cli.args[0], fromLine, maxLines, cli.opts.lineNumbers);
          break;
        }

        case "multi-get": {
          if (!cli.args[0]) {
            console.error("Usage: qmd multi-get <pattern> [-l <lines>] [--max-bytes <bytes>] [--json|--csv|--md|--xml|--files]");
            console.error(" pattern: glob (e.g., 'journals/2025-05*.md') or comma-separated list");
            process.exit(1);
          }
          const maxLinesMulti = cli.values.l ? parseInt(cli.values.l as string, 10) : undefined;
          const maxBytes = cli.values["max-bytes"] ? parseInt(cli.values["max-bytes"] as string, 10) : DEFAULT_MULTI_GET_MAX_BYTES;
          multiGet(cli.args[0], maxLinesMulti, maxBytes, cli.opts.format);
          break;
        }

        case "ls": {
          listFiles(cli.args[0]);
          break;
        }

        case "collection": {
          const subcommand = cli.args[0];

          // Without this guard a bare `qmd collection` reports
          // "Unknown subcommand: undefined"; print usage like `qmd context` does.
          if (!subcommand) {
            console.error("Usage: qmd collection <add|list|remove|rename>");
            console.error("");
            console.error("Commands:");
            console.error(" qmd collection add [path] --name <name> --mask <pattern> - Create/index collection");
            console.error(" qmd collection list - List all collections with details");
            console.error(" qmd collection remove <name> - Remove a collection by name");
            console.error(" qmd collection rename <old> <new> - Rename a collection");
            process.exit(1);
          }

          switch (subcommand) {
            case "list": {
              collectionList();
              break;
            }

            case "add": {
              // Path defaults to the current directory; "." is mapped to
              // getPwd() directly, anything else is resolved via getRealPath().
              const pwd = cli.args[1] || getPwd();
              const resolvedPwd = pwd === '.' ? getPwd() : getRealPath(resolve(pwd));
              const globPattern = cli.values.mask as string || DEFAULT_GLOB;
              const name = cli.values.name as string | undefined;
              await collectionAdd(resolvedPwd, globPattern, name);
              break;
            }

            case "remove":
            case "rm": {
              if (!cli.args[1]) {
                console.error("Usage: qmd collection remove <name>");
                console.error(" Use 'qmd collection list' to see available collections");
                process.exit(1);
              }
              collectionRemove(cli.args[1]);
              break;
            }

            case "rename":
            case "mv": {
              if (!cli.args[1] || !cli.args[2]) {
                console.error("Usage: qmd collection rename <old-name> <new-name>");
                console.error(" Use 'qmd collection list' to see available collections");
                process.exit(1);
              }
              collectionRename(cli.args[1], cli.args[2]);
              break;
            }

            default:
              console.error(`Unknown subcommand: ${subcommand}`);
              console.error("Available: list, add, remove, rename");
              process.exit(1);
          }
          break;
        }

        case "status":
          showStatus();
          break;

        case "update":
          // NOTE(review): --pull is parsed by parseCLI but is not passed here;
          // confirm updateCollections() obtains it some other way.
          await updateCollections();
          break;

        case "embed":
          await vectorIndex(DEFAULT_EMBED_MODEL, cli.values.force || false);
          break;

        case "search":
          if (!cli.query) {
            console.error("Usage: qmd search [options] <query>");
            process.exit(1);
          }
          search(cli.query, cli.opts);
          break;

        case "vsearch":
          if (!cli.query) {
            console.error("Usage: qmd vsearch [options] <query>");
            process.exit(1);
          }
          // Default min-score for vector search is 0.3
          if (!cli.values["min-score"]) {
            cli.opts.minScore = 0.3;
          }
          await vectorSearch(cli.query, cli.opts);
          break;

        case "query":
          if (!cli.query) {
            console.error("Usage: qmd query [options] <query>");
            process.exit(1);
          }
          await querySearch(cli.query, cli.opts);
          break;

        case "mcp": {
          // Loaded on demand so other commands do not import the MCP module
          const { startMcpServer } = await import("./mcp.js");
          await startMcpServer();
          break;
        }

        case "cleanup": {
          const db = getDb();

          // 1. Clear llm_cache
          const cacheCount = deleteLLMCache(db);
          console.log(`${c.green}✓${c.reset} Cleared ${cacheCount} cached API responses`);

          // 2. Remove orphaned vectors
          const orphanedVecs = cleanupOrphanedVectors(db);
          if (orphanedVecs > 0) {
            console.log(`${c.green}✓${c.reset} Removed ${orphanedVecs} orphaned embedding chunks`);
          } else {
            console.log(`${c.dim}No orphaned embeddings to remove${c.reset}`);
          }

          // 3. Remove inactive documents
          const inactiveDocs = deleteInactiveDocuments(db);
          if (inactiveDocs > 0) {
            console.log(`${c.green}✓${c.reset} Removed ${inactiveDocs} inactive document records`);
          }

          // 4. Vacuum to reclaim space
          vacuumDatabase(db);
          console.log(`${c.green}✓${c.reset} Database vacuumed`);

          closeDb();
          break;
        }

        default:
          console.error(`Unknown command: ${cli.command}`);
          console.error("Run 'qmd --help' for usage.");
          process.exit(1);
      }
    } finally {
      // Cleanup LlamaCpp instance to prevent NAPI crash on exit.
      // In a finally block so it also runs when a command handler throws;
      // the process.exit() paths above terminate immediately and skip it.
      await disposeDefaultLlamaCpp();
    }
  } // end if (import.meta.main)