qmd.ts 93 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659
  1. #!/usr/bin/env bun
  2. import { Database } from "bun:sqlite";
  3. import { Glob, $ } from "bun";
  4. import { parseArgs } from "util";
  5. import * as sqliteVec from "sqlite-vec";
  6. import {
  7. getPwd,
  8. getRealPath,
  9. homedir,
  10. resolve,
  11. enableProductionMode,
  12. searchFTS,
  13. searchVec,
  14. extractSnippet,
  15. getContextForFile,
  16. getContextForPath,
  17. listCollections,
  18. removeCollection,
  19. renameCollection,
  20. findSimilarFiles,
  21. findDocumentByDocid,
  22. isDocid,
  23. matchFilesByGlob,
  24. getHashesNeedingEmbedding,
  25. getHashesForEmbedding,
  26. clearAllEmbeddings,
  27. insertEmbedding,
  28. getStatus,
  29. hashContent,
  30. extractTitle,
  31. formatDocForEmbedding,
  32. formatQueryForEmbedding,
  33. chunkDocument,
  34. chunkDocumentByTokens,
  35. clearCache,
  36. getCacheKey,
  37. getCachedResult,
  38. setCachedResult,
  39. getIndexHealth,
  40. parseVirtualPath,
  41. buildVirtualPath,
  42. isVirtualPath,
  43. resolveVirtualPath,
  44. toVirtualPath,
  45. insertContent,
  46. insertDocument,
  47. findActiveDocument,
  48. updateDocumentTitle,
  49. updateDocument,
  50. deactivateDocument,
  51. getActiveDocumentPaths,
  52. cleanupOrphanedContent,
  53. deleteLLMCache,
  54. deleteInactiveDocuments,
  55. cleanupOrphanedVectors,
  56. vacuumDatabase,
  57. getCollectionsWithoutContext,
  58. getTopLevelPathsWithoutContext,
  59. handelize,
  60. DEFAULT_EMBED_MODEL,
  61. DEFAULT_QUERY_MODEL,
  62. DEFAULT_RERANK_MODEL,
  63. DEFAULT_GLOB,
  64. DEFAULT_MULTI_GET_MAX_BYTES,
  65. createStore,
  66. getDefaultDbPath,
  67. } from "./store.js";
  68. import { getDefaultLlamaCpp, disposeDefaultLlamaCpp, type RerankDocument, type Queryable, type QueryType } from "./llm.js";
  69. import type { SearchResult, RankedResult } from "./store.js";
  70. import {
  71. formatSearchResults,
  72. formatDocuments,
  73. escapeXml,
  74. escapeCSV,
  75. type OutputFormat,
  76. } from "./formatter.js";
  77. import {
  78. getCollection as getCollectionFromYaml,
  79. listCollections as yamlListCollections,
  80. addContext as yamlAddContext,
  81. removeContext as yamlRemoveContext,
  82. setGlobalContext,
  83. listAllContexts,
  84. } from "./collections.js";
  // Production mode permits falling back to the default database path.
  // Tests must set INDEX_PATH or call createStore() with an explicit path.
  enableProductionMode();

  // =============================================================================
  // Store/DB lifecycle (no legacy singletons in store.ts)
  // =============================================================================

  /** Store handle created on demand by getStore(); `null` while no store is open. */
  let store: ReturnType<typeof createStore> | null = null;

  /** Path handed to createStore() the next time a store is opened; `undefined` passes no override. */
  let storeDbPathOverride: string | undefined;
  /**
   * Return the module-level store, creating it with
   * `createStore(storeDbPathOverride)` the first time it is requested.
   */
  function getStore(): ReturnType<typeof createStore> {
    store ??= createStore(storeDbPathOverride);
    return store;
  }
  /** Return the `db` handle of the active store (opening the store if needed). */
  function getDb(): Database {
    const { db } = getStore();
    return db;
  }
  /** Close the active store, if any, and forget it so the next getStore() reopens. */
  function closeDb(): void {
    if (store === null) return;
    store.close();
    store = null;
  }
  /**
   * Resolve the database path to report: the open store's path if there is one,
   * otherwise the configured override, otherwise the default path.
   */
  function getDbPath(): string {
    const openPath = store?.dbPath;
    if (openPath != null) return openPath;
    if (storeDbPathOverride != null) return storeDbPathOverride;
    return getDefaultDbPath();
  }
  /**
   * Select which index subsequent store opens will use.
   *
   * @param name Index name passed to getDefaultDbPath(); `null` or an empty
   *             string clears the override.
   */
  function setIndexName(name: string | null): void {
    if (name) {
      storeDbPathOverride = getDefaultDbPath(name);
    } else {
      storeDbPathOverride = undefined;
    }
    // Drop any open handle so the next use opens the newly selected index.
    closeDb();
  }
  /**
   * Ensure the vector table exists on the active store.
   *
   * @param _db        Ignored; the store owns the database handle.
   * @param dimensions Forwarded to the store's ensureVecTable().
   */
  function ensureVecTable(_db: Database, dimensions: number): void {
    const activeStore = getStore();
    activeStore.ensureVecTable(dimensions);
  }
  // Terminal colors: enabled only when NO_COLOR is unset/empty and stdout is a TTY.
  const useColor = !process.env.NO_COLOR && process.stdout.isTTY;

  // Escape sequences keyed by style name; every entry is "" when color is disabled.
  const c = useColor
    ? {
        reset: "\x1b[0m",
        dim: "\x1b[2m",
        bold: "\x1b[1m",
        cyan: "\x1b[36m",
        yellow: "\x1b[33m",
        green: "\x1b[32m",
        magenta: "\x1b[35m",
        blue: "\x1b[34m",
      }
    : {
        reset: "",
        dim: "",
        bold: "",
        cyan: "",
        yellow: "",
        green: "",
        magenta: "",
        blue: "",
      };
  // Terminal cursor control: writes the hide/show escape sequences to stderr.
  const cursor = {
    hide: () => {
      process.stderr.write("\x1b[?25l");
    },
    show: () => {
      process.stderr.write("\x1b[?25h");
    },
  };

  // Restore the cursor before exiting on SIGINT (exit code 130) or SIGTERM (exit code 143).
  process.on("SIGINT", () => {
    cursor.show();
    process.exit(130);
  });
  process.on("SIGTERM", () => {
    cursor.show();
    process.exit(143);
  });
  // Terminal progress bar driven by the OSC 9;4 escape sequence, written to stderr.
  const progress = {
    // Report a percentage; the value is rounded to the nearest integer.
    set: (percent: number): void => {
      const rounded = Math.round(percent);
      process.stderr.write(`\x1b]9;4;1;${rounded}\x07`);
    },
    // Remove the progress indicator.
    clear: (): void => {
      process.stderr.write("\x1b]9;4;0\x07");
    },
    // Show progress without a known percentage.
    indeterminate: (): void => {
      process.stderr.write("\x1b]9;4;3\x07");
    },
    // Put the indicator into its error state.
    error: (): void => {
      process.stderr.write("\x1b]9;4;2\x07");
    },
  };
  /**
   * Format a duration in seconds as a short human-readable ETA.
   *
   * The input is rounded to whole seconds once, before it is split into units,
   * so a component can never overflow its unit (e.g. 119.6 becomes "2m 0s",
   * not "1m 60s").
   *
   * @param seconds Duration in seconds; may be fractional.
   * @returns "Ns" below one minute, "Mm Ss" below one hour, otherwise "Hh Mm".
   */
  function formatETA(seconds: number): string {
    const total = Math.round(seconds);
    if (total < 60) return `${total}s`;
    if (total < 3600) return `${Math.floor(total / 60)}m ${total % 60}s`;
    return `${Math.floor(total / 3600)}h ${Math.floor((total % 3600) / 60)}m`;
  }
  /**
   * Check index health and print warnings/tips to stderr.
   *
   * Reads `needsEmbedding`, `totalDocs` and `daysStale` from getIndexHealth(db).
   * A yellow warning is printed when at least 10% of documents need embeddings,
   * a dim tip when fewer do, and a separate tip when `daysStale` is 14 or more.
   *
   * @param db Database handle passed through to getIndexHealth().
   */
  function checkIndexHealth(db: Database): void {
    const { needsEmbedding, totalDocs, daysStale } = getIndexHealth(db);

    // Warn if many docs need embedding
    if (needsEmbedding > 0) {
      // With a zero document count the ratio is undefined; report 100% instead
      // of letting the division produce "Infinity%".
      const pct = totalDocs > 0 ? Math.round((needsEmbedding / totalDocs) * 100) : 100;
      if (pct >= 10) {
        process.stderr.write(`${c.yellow}Warning: ${needsEmbedding} documents (${pct}%) need embeddings. Run 'qmd embed' for better results.${c.reset}\n`);
      } else {
        process.stderr.write(`${c.dim}Tip: ${needsEmbedding} documents need embeddings. Run 'qmd embed' to index them.${c.reset}\n`);
      }
    }

    // Check if most recent document update is older than 2 weeks
    if (daysStale !== null && daysStale >= 14) {
      process.stderr.write(`${c.dim}Tip: Index last updated ${daysStale} days ago. Run 'qmd update' to refresh.${c.reset}\n`);
    }
  }
  /**
   * Compute a display path for a document that is not already in `existingPaths`.
   *
   * The shortest candidate is the last two path segments (parent folder +
   * filename) when that many exist; further leading segments are added one at
   * a time until the candidate is unused.
   *
   * @param filepath       Path of the document.
   * @param collectionPath Collection root; one trailing slash is ignored.
   * @param existingPaths  Display paths already taken.
   * @returns The first unused candidate, or `filepath` itself if all are taken.
   */
  function computeDisplayPath(
    filepath: string,
    collectionPath: string,
    existingPaths: Set<string>
  ): string {
    const root = collectionPath.replace(/\/$/, '');
    const rootName = root.split('/').pop() || '';

    // Inside the collection: prefix the collection directory name to the
    // collection-relative part. Otherwise fall back to the path as given.
    const isInsideCollection = filepath.startsWith(`${root}/`);
    const scoped = isInsideCollection ? rootName + filepath.slice(root.length) : filepath;

    const segments = scoped.split('/').filter(Boolean);
    const guaranteed = Math.min(2, segments.length);

    let start = segments.length - guaranteed;
    while (start >= 0) {
      const candidate = segments.slice(start).join('/');
      if (!existingPaths.has(candidate)) {
        return candidate;
      }
      start -= 1;
    }

    // Every candidate is taken: use the full path (should be unique).
    return filepath;
  }
  /**
   * Rerank documents against a query using the default llama.cpp reranker.
   *
   * Progress is reported on stderr. The indeterminate progress indicator is
   * always cleared, even when the reranker throws, so a failure does not leave
   * the terminal showing a stuck progress state.
   *
   * @param query     Query text to rank against.
   * @param documents Candidates; each `text` is cut to its first 4000 characters
   *                  before being sent (original note: truncate to context limit).
   * @param _model    Not used by this function body.
   * @param _db       Not used by this function body.
   * @returns One `{ file, score }` entry per result returned by the reranker;
   *          an empty array when `documents` is empty.
   */
  async function rerank(query: string, documents: { file: string; text: string }[], _model: string = DEFAULT_RERANK_MODEL, _db?: Database): Promise<{ file: string; score: number }[]> {
    if (documents.length === 0) return [];

    process.stderr.write(`Reranking ${documents.length} documents...\n`);
    progress.indeterminate();

    try {
      const llm = getDefaultLlamaCpp();
      const rerankDocs: RerankDocument[] = documents.map(({ file, text }) => ({
        file,
        text: text.slice(0, 4000),
      }));

      const result = await llm.rerank(query, rerankDocs);
      return result.results.map((r) => ({ file: r.file, score: r.score }));
    } finally {
      // Runs on success and on failure so the indicator never stays active.
      progress.clear();
      process.stderr.write("\n");
    }
  }
  /**
   * Format how long ago `date` was, using the largest fitting unit.
   *
   * Dates in the future (for example from clock skew) are treated as "now", so
   * the result is never negative.
   *
   * @param date Moment to compare against the current time.
   * @returns "Ns ago", "Nm ago", "Nh ago" or "Nd ago".
   */
  function formatTimeAgo(date: Date): string {
    const elapsedMs = Date.now() - date.getTime();
    const seconds = Math.max(0, Math.floor(elapsedMs / 1000));
    if (seconds < 60) return `${seconds}s ago`;

    const minutes = Math.floor(seconds / 60);
    if (minutes < 60) return `${minutes}m ago`;

    const hours = Math.floor(minutes / 60);
    if (hours < 24) return `${hours}h ago`;

    const days = Math.floor(hours / 24);
    return `${days}d ago`;
  }
  /**
   * Format a byte count using 1024-based units.
   *
   * @param bytes Size in bytes.
   * @returns The raw count with " B" below 1024; otherwise one decimal place in
   *          KB, MB or GB (GB is the largest unit used).
   */
  function formatBytes(bytes: number): string {
    const KB = 1024;
    const MB = KB * 1024;
    const GB = MB * 1024;

    if (bytes < KB) return `${bytes} B`;

    // [exclusive upper bound, divisor, unit label]
    const tiers: Array<[number, number, string]> = [
      [MB, KB, "KB"],
      [GB, MB, "MB"],
    ];
    for (const [limit, divisor, unit] of tiers) {
      if (bytes < limit) return `${(bytes / divisor).toFixed(1)} ${unit}`;
    }
    return `${(bytes / GB).toFixed(1)} GB`;
  }
  /**
   * Print a status report for the index to stdout: database path and size,
   * document and vector counts, pending embeddings, time of the most recent
   * update, every collection with its configured contexts, and example commands.
   *
   * The database handle is closed on every exit path, including when a query
   * or a formatter throws.
   */
  function showStatus(): void {
    const dbPath = getDbPath();
    const db = getDb();

    try {
      // Collections are defined in YAML; no duplicate cleanup needed.

      // Index size is informational only, so a failed lookup reports 0 bytes.
      let indexSize = 0;
      try {
        indexSize = Bun.file(dbPath).size;
      } catch {
        // Deliberate best effort: keep the 0 default.
      }

      // Collections info (from YAML + database stats)
      const collections = listCollections(db);

      // Overall stats
      const totalDocs = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number };
      const vectorCount = db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get() as { count: number };
      const needsEmbedding = getHashesNeedingEmbedding(db);

      // Most recent update across all collections
      const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };

      console.log(`${c.bold}QMD Status${c.reset}\n`);
      console.log(`Index: ${dbPath}`);
      console.log(`Size: ${formatBytes(indexSize)}\n`);

      console.log(`${c.bold}Documents${c.reset}`);
      console.log(` Total: ${totalDocs.count} files indexed`);
      console.log(` Vectors: ${vectorCount.count} embedded`);
      if (needsEmbedding > 0) {
        console.log(` ${c.yellow}Pending: ${needsEmbedding} need embedding${c.reset} (run 'qmd embed')`);
      }
      if (mostRecent.latest) {
        const lastUpdate = new Date(mostRecent.latest);
        console.log(` Updated: ${formatTimeAgo(lastUpdate)}`);
      }

      // Group all contexts (from YAML) by collection name.
      const contextsByCollection = new Map<string, { path_prefix: string; context: string }[]>();
      for (const ctx of listAllContexts()) {
        const group = contextsByCollection.get(ctx.collection) ?? [];
        group.push({ path_prefix: ctx.path, context: ctx.context });
        contextsByCollection.set(ctx.collection, group);
      }

      if (collections.length > 0) {
        console.log(`\n${c.bold}Collections${c.reset}`);
        for (const col of collections) {
          const lastMod = col.last_modified ? formatTimeAgo(new Date(col.last_modified)) : "never";
          const contexts = contextsByCollection.get(col.name) ?? [];

          console.log(` ${c.cyan}${col.name}${c.reset} ${c.dim}(qmd://${col.name}/)${c.reset}`);
          console.log(` ${c.dim}Pattern:${c.reset} ${col.glob_pattern}`);
          console.log(` ${c.dim}Files:${c.reset} ${col.active_count} (updated ${lastMod})`);

          if (contexts.length > 0) {
            console.log(` ${c.dim}Contexts:${c.reset} ${contexts.length}`);
            for (const ctx of contexts) {
              // Both the empty string and '/' denote the root context.
              const pathDisplay = (ctx.path_prefix === '' || ctx.path_prefix === '/') ? '/' : `/${ctx.path_prefix}`;
              // Long context text is cut to 57 characters plus an ellipsis.
              const contextPreview = ctx.context.length > 60
                ? ctx.context.substring(0, 57) + '...'
                : ctx.context;
              console.log(` ${c.dim}${pathDisplay}:${c.reset} ${contextPreview}`);
            }
          }
        }

        // Show examples of virtual paths, using the first collection's name.
        const example = collections[0];
        console.log(`\n${c.bold}Examples${c.reset}`);
        console.log(` ${c.dim}# List files in a collection${c.reset}`);
        if (example) {
          console.log(` qmd ls ${example.name}`);
        }
        console.log(` ${c.dim}# Get a document${c.reset}`);
        if (example) {
          console.log(` qmd get qmd://${example.name}/path/to/file.md`);
        }
        console.log(` ${c.dim}# Search within a collection${c.reset}`);
        if (example) {
          console.log(` qmd search "query" -c ${example.name}`);
        }
      } else {
        console.log(`\n${c.dim}No collections. Run 'qmd collection add .' to index markdown files.${c.reset}`);
      }
    } finally {
      closeDb();
    }
  }
  /**
   * Re-index every collection returned by listCollections().
   *
   * For each collection, an optional `update` command from its YAML definition
   * is run through bash in the collection directory before indexFiles() is
   * called. The command's stdout and stderr are echoed, indented, to stdout.
   * A non-zero exit code or a spawn failure terminates the process (with that
   * exit code, or 1 respectively).
   *
   * The child's stdout, stderr and exit status are awaited concurrently, so a
   * command that writes heavily to stderr cannot stall while stdout is still
   * being drained.
   */
  async function updateCollections(): Promise<void> {
    const db = getDb();
    // Collections are defined in YAML; no duplicate cleanup needed.

    // Clear cached results before re-indexing.
    clearCache(db);

    const collections = listCollections(db);
    if (collections.length === 0) {
      console.log(`${c.dim}No collections found. Run 'qmd collection add .' to index markdown files.${c.reset}`);
      closeDb();
      return;
    }

    // Don't close db here - indexFiles will reuse it and close at the end
    console.log(`${c.bold}Updating ${collections.length} collection(s)...${c.reset}\n`);

    // Prefix every line of captured command output for display.
    const indentLines = (text: string): string =>
      text.trim().split('\n').map(l => ` ${l}`).join('\n');

    for (let i = 0; i < collections.length; i++) {
      const col = collections[i];
      if (!col) continue;
      console.log(`${c.cyan}[${i + 1}/${collections.length}]${c.reset} ${c.bold}${col.name}${c.reset} ${c.dim}(${col.glob_pattern})${c.reset}`);

      // Execute custom update command if specified in YAML
      const yamlCol = getCollectionFromYaml(col.name);
      if (yamlCol?.update) {
        console.log(`${c.dim} Running update command: ${yamlCol.update}${c.reset}`);
        try {
          const proc = Bun.spawn(["/usr/bin/env", "bash", "-c", yamlCol.update], {
            cwd: col.pwd,
            stdout: "pipe",
            stderr: "pipe",
          });

          // Drain both pipes while waiting for exit; reading them one after
          // the other can block once the unread pipe's buffer fills up.
          const [output, errorOutput, exitCode] = await Promise.all([
            new Response(proc.stdout).text(),
            new Response(proc.stderr).text(),
            proc.exited,
          ]);

          if (output.trim()) {
            console.log(indentLines(output));
          }
          if (errorOutput.trim()) {
            console.log(indentLines(errorOutput));
          }
          if (exitCode !== 0) {
            console.log(`${c.yellow}✗ Update command failed with exit code ${exitCode}${c.reset}`);
            process.exit(exitCode);
          }
        } catch (err) {
          console.log(`${c.yellow}✗ Update command failed: ${err}${c.reset}`);
          process.exit(1);
        }
      }

      await indexFiles(col.pwd, col.glob_pattern, col.name, true);
      console.log("");
    }

    // Check if any documents need embedding (show once at end)
    const finalDb = getDb();
    const needsEmbedding = getHashesNeedingEmbedding(finalDb);
    closeDb();

    console.log(`${c.green}✓ All collections updated.${c.reset}`);
    if (needsEmbedding > 0) {
      console.log(`\nRun 'qmd embed' to update embeddings (${needsEmbedding} unique hashes need vectors)`);
    }
  }
  /**
   * Detect which YAML-defined collection (if any) contains a filesystem path.
   *
   * The path is first passed through getRealPath(). When several collections
   * contain it, the one with the longest collection path wins.
   *
   * @param db     Not used by this function body.
   * @param fsPath Filesystem path to look up.
   * @returns `{ collectionName, relativePath }`, where `relativePath` is ''
   *          when the path is the collection root itself, or `null` when no
   *          collection contains the path.
   */
  function detectCollectionFromPath(db: Database, fsPath: string): { collectionName: string; relativePath: string } | null {
    const target = getRealPath(fsPath);

    let winner: { name: string; path: string } | null = null;
    for (const { name, path } of yamlListCollections()) {
      const contains = target === path || target.startsWith(path + '/');
      if (!contains) continue;
      if (winner === null || path.length > winner.path.length) {
        winner = { name, path };
      }
    }

    if (winner === null) return null;

    // A match is either the root itself or something below "<root>/".
    const relativePath = target === winner.path ? '' : target.slice(winner.path.length + 1);
    return { collectionName: winner.name, relativePath };
  }
  /**
   * Attach context text to a path.
   *
   * - "/" sets the global context.
   * - A `qmd://collection/path` virtual path targets that collection directly.
   * - Anything else is treated as a filesystem path ('.' when omitted) and is
   *   mapped to the collection that contains it.
   *
   * Exits the process with code 1 when the virtual path is invalid, the
   * collection is unknown, or the filesystem path is outside every collection.
   *
   * @param pathArg     Target path; defaults to the current directory.
   * @param contextText Context text to store.
   */
  async function contextAdd(pathArg: string | undefined, contextText: string): Promise<void> {
    const db = getDb();

    // Shared tail of every successful branch: headline, context echo, close DB.
    const finish = (headline: string): void => {
      console.log(headline);
      console.log(`${c.dim}Context: ${contextText}${c.reset}`);
      closeDb();
    };

    // "/" means the global context (applies to all collections).
    if (pathArg === '/') {
      setGlobalContext(contextText);
      finish(`${c.green}✓${c.reset} Set global context`);
      return;
    }

    // Normalise the argument: cwd shortcuts, "~/" expansion, relative paths.
    // Absolute paths and qmd:// virtual paths pass through untouched.
    const requested = pathArg || '.';
    let fsPath: string;
    if (requested === '.' || requested === './') {
      fsPath = getPwd();
    } else if (requested.startsWith('~/')) {
      fsPath = homedir() + requested.slice(1);
    } else if (requested.startsWith('/') || requested.startsWith('qmd://')) {
      fsPath = requested;
    } else {
      fsPath = resolve(getPwd(), requested);
    }

    // Virtual paths (qmd://collection/path) name their collection explicitly.
    if (isVirtualPath(fsPath)) {
      const parsed = parseVirtualPath(fsPath);
      if (!parsed) {
        console.error(`${c.yellow}Invalid virtual path: ${fsPath}${c.reset}`);
        process.exit(1);
      }

      const coll = getCollectionFromYaml(parsed.collectionName);
      if (!coll) {
        console.error(`${c.yellow}Collection not found: ${parsed.collectionName}${c.reset}`);
        process.exit(1);
      }

      yamlAddContext(parsed.collectionName, parsed.path, contextText);

      const virtualTarget = parsed.path
        ? `qmd://${parsed.collectionName}/${parsed.path}`
        : `qmd://${parsed.collectionName}/ (collection root)`;
      finish(`${c.green}✓${c.reset} Added context for: ${virtualTarget}`);
      return;
    }

    // Filesystem path: find the collection that contains it.
    const detected = detectCollectionFromPath(db, fsPath);
    if (!detected) {
      console.error(`${c.yellow}Path is not in any indexed collection: ${fsPath}${c.reset}`);
      console.error(`${c.dim}Run 'qmd status' to see indexed collections${c.reset}`);
      process.exit(1);
    }

    yamlAddContext(detected.collectionName, detected.relativePath, contextText);

    const fsTarget = detected.relativePath
      ? `qmd://${detected.collectionName}/${detected.relativePath}`
      : `qmd://${detected.collectionName}/`;
    finish(`${c.green}✓${c.reset} Added context for: ${fsTarget}`);
  }
  /**
   * Print every configured context, grouped under its collection name.
   * Prints a hint instead when no contexts are configured.
   */
  function contextList(): void {
    // The handle itself is not read here; it is opened and later closed.
    getDb();

    const contexts = listAllContexts();
    if (contexts.length === 0) {
      console.log(`${c.dim}No contexts configured. Use 'qmd context add' to add one.${c.reset}`);
      closeDb();
      return;
    }

    console.log(`\n${c.bold}Configured Contexts${c.reset}\n`);

    // Emit a collection heading each time the collection name changes.
    let currentCollection = '';
    for (const { collection, path, context } of contexts) {
      if (collection !== currentCollection) {
        console.log(`${c.cyan}${collection}${c.reset}`);
        currentCollection = collection;
      }
      console.log(path ? ` ${path}` : ' / (root)');
      console.log(` ${c.dim}${context}${c.reset}`);
    }

    closeDb();
  }
  /**
   * Remove the context attached to a path.
   *
   * - "/" clears the global context.
   * - A `qmd://collection/path` virtual path targets that collection directly.
   * - Anything else is treated as a filesystem path and mapped to the
   *   collection that contains it.
   *
   * Exits the process with code 1 when the virtual path is invalid, the
   * collection is unknown, the path is outside every collection, or no context
   * exists for the target.
   *
   * @param pathArg Target path.
   */
  function contextRemove(pathArg: string): void {
    const check = `${c.green}✓${c.reset}`;

    // "/" means the global context.
    if (pathArg === '/') {
      setGlobalContext(undefined);
      console.log(`${check} Removed global context`);
      return;
    }

    // Virtual paths name their collection explicitly.
    if (isVirtualPath(pathArg)) {
      const parsed = parseVirtualPath(pathArg);
      if (!parsed) {
        console.error(`${c.yellow}Invalid virtual path: ${pathArg}${c.reset}`);
        process.exit(1);
      }

      const coll = getCollectionFromYaml(parsed.collectionName);
      if (!coll) {
        console.error(`${c.yellow}Collection not found: ${parsed.collectionName}${c.reset}`);
        process.exit(1);
      }

      const removed = yamlRemoveContext(coll.name, parsed.path);
      if (!removed) {
        console.error(`${c.yellow}No context found for: ${pathArg}${c.reset}`);
        process.exit(1);
      }

      console.log(`${check} Removed context for: ${pathArg}`);
      return;
    }

    // Filesystem paths: cwd shortcuts, "~/" expansion, relative paths.
    let fsPath: string;
    if (pathArg === '.' || pathArg === './') {
      fsPath = getPwd();
    } else if (pathArg.startsWith('~/')) {
      fsPath = homedir() + pathArg.slice(1);
    } else if (pathArg.startsWith('/')) {
      fsPath = pathArg;
    } else {
      fsPath = resolve(getPwd(), pathArg);
    }

    const db = getDb();
    const detected = detectCollectionFromPath(db, fsPath);
    closeDb();

    if (!detected) {
      console.error(`${c.yellow}Path is not in any indexed collection: ${fsPath}${c.reset}`);
      process.exit(1);
    }

    const label = `qmd://${detected.collectionName}/${detected.relativePath}`;
    const removed = yamlRemoveContext(detected.collectionName, detected.relativePath);
    if (!removed) {
      console.error(`${c.yellow}No context found for: ${label}${c.reset}`);
      process.exit(1);
    }

    console.log(`${check} Removed context for: ${label}`);
  }
  /**
   * Report which collections, and which top-level paths inside collections
   * that already have some context, are still missing a context description.
   * Each finding is printed together with a ready-to-run `qmd context add`
   * suggestion.
   */
  function contextCheck(): void {
    const db = getDb();

    const uncovered = getCollectionsWithoutContext(db);
    const everyCollection = listCollections(db);
    const hasUncovered = uncovered.length > 0;

    if (!hasUncovered && everyCollection.length > 0) {
      console.log(`\n${c.green}✓${c.reset} ${c.bold}All collections have context configured${c.reset}\n`);
    }

    if (hasUncovered) {
      console.log(`\n${c.yellow}Collections without any context:${c.reset}\n`);
      uncovered.forEach(entry => {
        console.log(`${c.cyan}${entry.name}${c.reset} ${c.dim}(${entry.doc_count} documents)${c.reset}`);
        console.log(` ${c.dim}Suggestion: qmd context add qmd://${entry.name}/ "Description of ${entry.name}"${c.reset}\n`);
      });
    }

    // Only collections that DO have context are inspected for missing
    // top-level paths; the others were already reported above.
    const uncoveredNames = new Set(uncovered.map(entry => entry.name));
    let printedPathHeader = false;

    for (const coll of everyCollection) {
      if (!coll || uncoveredNames.has(coll.name)) continue;

      const missingDirs = getTopLevelPathsWithoutContext(db, coll.name);
      if (missingDirs.length === 0) continue;

      // The section header is printed once, before the first finding.
      if (!printedPathHeader) {
        console.log(`${c.yellow}Top-level directories without context:${c.reset}\n`);
        printedPathHeader = true;
      }

      console.log(`${c.cyan}${coll.name}${c.reset}`);
      for (const dir of missingDirs) {
        console.log(` ${dir}`);
        console.log(` ${c.dim}Suggestion: qmd context add qmd://${coll.name}/${dir} "Description of ${dir}"${c.reset}`);
      }
      console.log('');
    }

    if (!hasUncovered && !printedPathHeader) {
      console.log(`${c.dim}All collections and major paths have context configured.${c.reset}`);
      console.log(`${c.dim}Use 'qmd context list' to see all configured contexts.${c.reset}\n`);
    }

    closeDb();
  }
  /**
   * Print one document (optionally a line range of it) to stdout.
   *
   * `filename` may carry a trailing `:N` line suffix (used as the start line
   * when `fromLine` is not given) and is resolved in this order:
   *  1. docid (replaced by the matching document's file path)
   *  2. virtual path: exact collection+path match, then suffix match
   *  3. `collection/path` shorthand, when the first component names a
   *     collection that has active documents: exact, then suffix match
   *  4. filesystem path (`~/` and relative paths expanded): exact match
   *     inside the detected collection
   *  5. last resort: any active document whose path ends with the file name
   *
   * If nothing matches, an error is printed and the process exits with
   * status 1.
   *
   * @param filename    Document reference (see above).
   * @param fromLine    1-based first line to print.
   * @param maxLines    Maximum number of lines to print.
   * @param lineNumbers Prefix output lines with line numbers.
   */
  function getDocument(filename: string, fromLine?: number, maxLines?: number, lineNumbers?: boolean): void {
    type DocRow = { collectionName: string; path: string; body: string };

    const db = getDb();

    // Shared SELECT head for every document lookup below.
    const selectDoc = `
      SELECT d.collection as collectionName, d.path, content.doc as body
      FROM documents d
      JOIN content ON content.hash = d.hash`;

    // Exact match on collection + path.
    const findExact = (collection: string, path: string): DocRow | null =>
      db.prepare(`${selectDoc}
        WHERE d.collection = ? AND d.path = ? AND d.active = 1
      `).get(collection, path) as DocRow | null;

    // Fuzzy match inside one collection: any path ending with `path`.
    const findBySuffix = (collection: string, path: string): DocRow | null =>
      db.prepare(`${selectDoc}
        WHERE d.collection = ? AND d.path LIKE ? AND d.active = 1
        LIMIT 1
      `).get(collection, `%${path}`) as DocRow | null;

    // Parse :linenum suffix from filename (e.g., "file.md:100"). The suffix
    // is only consumed when no explicit start line was passed.
    let inputPath = filename;
    let firstLine = fromLine;
    const lineSuffix = /:(\d+)$/.exec(inputPath);
    if (lineSuffix && !firstLine) {
      const digits = lineSuffix[1];
      if (digits) {
        firstLine = parseInt(digits, 10);
        inputPath = inputPath.slice(0, -lineSuffix[0].length);
      }
    }

    // Handle docid lookup (#abc123, abc123, "#abc123", "abc123", etc.)
    if (isDocid(inputPath)) {
      const byDocid = findDocumentByDocid(db, inputPath);
      if (!byDocid) {
        console.error(`Document not found: ${filename}`);
        closeDb();
        process.exit(1);
      }
      inputPath = byDocid.filepath;
    }

    let doc: DocRow | null = null;

    if (isVirtualPath(inputPath)) {
      // Virtual path (qmd://collection/path)
      const parsed = parseVirtualPath(inputPath);
      if (!parsed) {
        console.error(`Invalid virtual path: ${inputPath}`);
        closeDb();
        process.exit(1);
      }
      doc = findExact(parsed.collectionName, parsed.path)
        || findBySuffix(parsed.collectionName, parsed.path);
    } else {
      // Try the collection/path shorthand first. It only applies to inputs
      // without a leading "/" or "~" that have at least two components.
      if (!inputPath.startsWith('/') && !inputPath.startsWith('~')) {
        const [possibleCollection, ...restParts] = inputPath.split('/');
        if (possibleCollection && restParts.length >= 1) {
          const collectionHasDocs = db.prepare(`
            SELECT 1 FROM documents WHERE collection = ? AND active = 1 LIMIT 1
          `).get(possibleCollection);
          if (collectionHasDocs) {
            const possiblePath = restParts.join('/');
            doc = findExact(possibleCollection, possiblePath)
              || findBySuffix(possibleCollection, possiblePath);
          }
        }
      }

      // Not found as collection/path: treat the input as a filesystem path.
      if (!doc) {
        let fsPath = inputPath;
        if (fsPath.startsWith('~/')) {
          // Expand ~ to home directory
          fsPath = homedir() + fsPath.slice(1);
        } else if (!fsPath.startsWith('/')) {
          // Relative path - resolve from current directory
          fsPath = resolve(getPwd(), fsPath);
        }
        fsPath = getRealPath(fsPath);

        const detected = detectCollectionFromPath(db, fsPath);
        if (detected) {
          doc = findExact(detected.collectionName, detected.relativePath);
        }

        // Last resort: match on the final path component across all collections.
        if (!doc) {
          const baseName = inputPath.split('/').pop() || inputPath;
          doc = db.prepare(`${selectDoc}
            WHERE d.path LIKE ? AND d.active = 1
            LIMIT 1
          `).get(`%${baseName}`) as DocRow | null;
        }
      }
    }

    if (!doc) {
      console.error(`Document not found: ${filename}`);
      closeDb();
      process.exit(1);
    }

    // Get context for this file
    const context = getContextForPath(db, doc.collectionName, doc.path);

    let output = doc.body;
    const startLine = firstLine || 1;

    // Apply line filtering if specified
    if (firstLine !== undefined || maxLines !== undefined) {
      const lines = output.split('\n');
      const start = startLine - 1; // Convert to 0-indexed
      const end = maxLines !== undefined ? start + maxLines : lines.length;
      output = lines.slice(start, end).join('\n');
    }

    // Add line numbers if requested
    if (lineNumbers) {
      output = addLineNumbers(output, startLine);
    }

    // Output context header if exists
    if (context) {
      console.log(`Folder Context: ${context}\n---\n`);
    }
    console.log(output);
    closeDb();
  }
  /**
   * Multi-get: fetch multiple documents by glob pattern or comma-separated
   * list and print them in the requested output format.
   *
   * A pattern containing "," but neither "*" nor "?" is treated as a
   * comma-separated list of names (virtual paths, exact paths or path
   * suffixes); names that cannot be resolved are reported on stderr and
   * skipped. Anything else is handed to `matchFilesByGlob`; if that matches
   * nothing the process exits with status 1.
   *
   * @param pattern  Glob pattern or comma-separated list.
   * @param maxLines Per-document line limit; longer bodies get a truncation note.
   * @param maxBytes Documents whose stored length exceeds this are listed as
   *                 skipped instead of being printed.
   * @param format   "json", "csv", "files", "md", "xml"; anything else prints
   *                 the default CLI layout.
   */
  function multiGet(pattern: string, maxLines?: number, maxBytes: number = DEFAULT_MULTI_GET_MAX_BYTES, format: OutputFormat = "cli"): void {
    type FileRef = { filepath: string; displayPath: string; bodyLength: number; collection?: string; path?: string };
    type MetaRow = { virtual_path: string; body_length: number; collection: string; path: string };
    type ResultRow = { file: string; displayPath: string; title: string; body: string; context: string | null; skipped: boolean; skipReason?: string };

    const db = getDb();

    // Metadata lookup shared by all comma-list branches. `whereClause` is
    // always one of the constant strings below, never user input; user
    // values are passed as bound parameters.
    const lookupMeta = (whereClause: string, ...params: string[]): MetaRow | null =>
      db.prepare(`
        SELECT
          'qmd://' || d.collection || '/' || d.path as virtual_path,
          LENGTH(content.doc) as body_length,
          d.collection,
          d.path
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE ${whereClause}
      `).get(...params) as MetaRow | null;

    // Check if it's a comma-separated list or a glob pattern
    const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?');

    let files: FileRef[];
    if (isCommaSeparated) {
      files = [];
      const names = pattern.split(',').map(s => s.trim()).filter(Boolean);
      for (const name of names) {
        let meta: MetaRow | null = null;
        if (isVirtualPath(name)) {
          const parsed = parseVirtualPath(name);
          if (parsed) {
            meta = lookupMeta("d.collection = ? AND d.path = ? AND d.active = 1", parsed.collectionName, parsed.path);
          }
        } else {
          // Exact path first, then any path ending with the given name.
          meta = lookupMeta("d.path = ? AND d.active = 1 LIMIT 1", name)
            || lookupMeta("d.path LIKE ? AND d.active = 1 LIMIT 1", `%${name}`);
        }

        if (!meta) {
          console.error(`File not found: ${name}`);
          continue;
        }
        files.push({
          filepath: meta.virtual_path,
          displayPath: meta.virtual_path,
          bodyLength: meta.body_length,
          collection: meta.collection,
          path: meta.path,
        });
      }
    } else {
      // Glob pattern - matchFilesByGlob returns virtual paths; collection and
      // path are derived from them in the loop below.
      files = matchFilesByGlob(db, pattern).map(f => ({
        ...f,
        collection: undefined,
        path: undefined,
      }));
      if (files.length === 0) {
        console.error(`No files matched pattern: ${pattern}`);
        closeDb();
        process.exit(1);
      }
    }

    // Collect results for structured output
    const results: ResultRow[] = [];
    for (const file of files) {
      // Parse virtual path to get collection info if not already available
      let collection = file.collection;
      let path = file.path;
      if (!collection || !path) {
        const parsed = parseVirtualPath(file.filepath);
        if (parsed) {
          collection = parsed.collectionName;
          path = parsed.path;
        }
      }

      const context = collection && path ? getContextForPath(db, collection, path) : null;
      const fallbackTitle = file.displayPath.split('/').pop() || file.displayPath;

      // Oversized documents are reported but their body is never loaded.
      if (file.bodyLength > maxBytes) {
        results.push({
          file: file.filepath,
          displayPath: file.displayPath,
          title: fallbackTitle,
          body: "",
          context,
          skipped: true,
          skipReason: `File too large (${Math.round(file.bodyLength / 1024)}KB > ${Math.round(maxBytes / 1024)}KB). Use 'qmd get ${file.displayPath}' to retrieve.`,
        });
        continue;
      }

      if (!collection || !path) continue;

      const doc = db.prepare(`
        SELECT content.doc as body, d.title
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE d.collection = ? AND d.path = ? AND d.active = 1
      `).get(collection, path) as { body: string; title: string } | null;
      if (!doc) continue;

      let body = doc.body;
      if (maxLines !== undefined) {
        const lines = body.split('\n');
        body = lines.slice(0, maxLines).join('\n');
        if (lines.length > maxLines) {
          body += `\n\n[... truncated ${lines.length - maxLines} more lines]`;
        }
      }

      results.push({
        file: file.filepath,
        displayPath: file.displayPath,
        title: doc.title || fallbackTitle,
        body,
        context,
        skipped: false,
      });
    }

    closeDb();

    // Output based on format
    switch (format) {
      case "json": {
        const output = results.map(r => ({
          file: r.displayPath,
          title: r.title,
          ...(r.context && { context: r.context }),
          ...(r.skipped ? { skipped: true, reason: r.skipReason } : { body: r.body }),
        }));
        console.log(JSON.stringify(output, null, 2));
        break;
      }

      case "csv": {
        // Quote any field containing a comma, a quote or a line break
        // (including a bare carriage return), doubling embedded quotes.
        const escapeField = (val: string | null | undefined): string => {
          if (val === null || val === undefined) return "";
          const str = String(val);
          return /[",\r\n]/.test(str) ? `"${str.replace(/"/g, '""')}"` : str;
        };
        console.log("file,title,context,skipped,body");
        for (const r of results) {
          console.log([r.displayPath, r.title, r.context, r.skipped ? "true" : "false", r.skipped ? r.skipReason : r.body].map(escapeField).join(","));
        }
        break;
      }

      case "files": {
        for (const r of results) {
          const ctx = r.context ? `,"${r.context.replace(/"/g, '""')}"` : "";
          const status = r.skipped ? "[SKIPPED]" : "";
          console.log(`${r.displayPath}${ctx}${status ? `,${status}` : ""}`);
        }
        break;
      }

      case "md": {
        for (const r of results) {
          console.log(`## ${r.displayPath}\n`);
          if (r.title && r.title !== r.displayPath) console.log(`**Title:** ${r.title}\n`);
          if (r.context) console.log(`**Context:** ${r.context}\n`);
          if (r.skipped) {
            console.log(`> ${r.skipReason}\n`);
          } else {
            console.log("```");
            console.log(r.body);
            console.log("```\n");
          }
        }
        break;
      }

      case "xml": {
        console.log('<?xml version="1.0" encoding="UTF-8"?>');
        console.log("<documents>");
        for (const r of results) {
          console.log(" <document>");
          console.log(` <file>${escapeXml(r.displayPath)}</file>`);
          console.log(` <title>${escapeXml(r.title)}</title>`);
          if (r.context) console.log(` <context>${escapeXml(r.context)}</context>`);
          if (r.skipped) {
            console.log(` <skipped>true</skipped>`);
            console.log(` <reason>${escapeXml(r.skipReason || "")}</reason>`);
          } else {
            console.log(` <body>${escapeXml(r.body)}</body>`);
          }
          console.log(" </document>");
        }
        console.log("</documents>");
        break;
      }

      default: {
        // CLI format (default)
        for (const r of results) {
          console.log(`\n${'='.repeat(60)}`);
          console.log(`File: ${r.displayPath}`);
          console.log(`${'='.repeat(60)}\n`);
          if (r.skipped) {
            console.log(`[SKIPPED: ${r.skipReason}]`);
            continue;
          }
          if (r.context) {
            console.log(`Folder Context: ${r.context}\n---\n`);
          }
          console.log(r.body);
        }
      }
    }
  }
  /**
   * List files in the virtual file tree.
   *
   * Without an argument, prints every collection from the YAML config with
   * its number of active documents. With an argument (`qmd://collection/path`,
   * `collection` or `collection/path`), prints the active documents of that
   * collection whose path starts with the given prefix, one per line with
   * size and modification time.
   *
   * Exits with status 1 on an invalid virtual path or an unknown collection.
   */
  function listFiles(pathArg?: string): void {
    const db = getDb();

    if (!pathArg) {
      // No argument - list all collections
      const yamlCollections = yamlListCollections();
      if (yamlCollections.length === 0) {
        console.log("No collections found. Run 'qmd add .' to index files.");
        closeDb();
        return;
      }

      // One prepared statement is reused for every collection.
      const countActive = db.prepare(`
        SELECT COUNT(*) as file_count
        FROM documents d
        WHERE d.collection = ? AND d.active = 1
      `);
      const collections = yamlCollections.map(coll => {
        const stats = countActive.get(coll.name) as { file_count: number } | null;
        return { name: coll.name, file_count: stats?.file_count || 0 };
      });

      console.log(`${c.bold}Collections:${c.reset}\n`);
      for (const coll of collections) {
        console.log(` ${c.dim}qmd://${c.reset}${c.cyan}${coll.name}/${c.reset} ${c.dim}(${coll.file_count} files)${c.reset}`);
      }
      closeDb();
      return;
    }

    // Parse the path argument
    let collectionName: string;
    let pathPrefix: string | null = null;
    if (pathArg.startsWith('qmd://')) {
      // Virtual path format: qmd://collection/path
      const parsed = parseVirtualPath(pathArg);
      if (!parsed) {
        console.error(`Invalid virtual path: ${pathArg}`);
        closeDb();
        process.exit(1);
      }
      collectionName = parsed.collectionName;
      pathPrefix = parsed.path;
    } else {
      // Just collection name or collection/path
      const parts = pathArg.split('/');
      collectionName = parts[0] || '';
      if (parts.length > 1) {
        pathPrefix = parts.slice(1).join('/');
      }
    }

    const coll = getCollectionFromYaml(collectionName);
    if (!coll) {
      console.error(`Collection not found: ${collectionName}`);
      console.error(`Run 'qmd ls' to see available collections.`);
      closeDb();
      process.exit(1);
    }

    // An empty or absent prefix lists the whole collection.
    const params: string[] = pathPrefix ? [coll.name, `${pathPrefix}%`] : [coll.name];
    const query = `
      SELECT d.path, d.title, d.modified_at, LENGTH(ct.doc) as size
      FROM documents d
      JOIN content ct ON d.hash = ct.hash
      WHERE d.collection = ? ${pathPrefix ? "AND d.path LIKE ? " : ""}AND d.active = 1
      ORDER BY d.path
    `;
    const files = db.prepare(query).all(...params) as { path: string; title: string; modified_at: string; size: number }[];

    if (files.length === 0) {
      if (pathPrefix) {
        console.log(`No files found under qmd://${collectionName}/${pathPrefix}`);
      } else {
        console.log(`No files found in collection: ${collectionName}`);
      }
      closeDb();
      return;
    }

    // Format every size once, then find the widest for right-alignment.
    // A reduce is used instead of Math.max(...array): spreading one argument
    // per file can exceed the engine's argument limit on very large
    // collections and throw a RangeError.
    const rows = files.map(file => ({ file, sizeText: formatBytes(file.size) }));
    const sizeWidth = rows.reduce((widest, row) => Math.max(widest, row.sizeText.length), 0);

    // Output in ls -l style
    for (const { file, sizeText } of rows) {
      const sizeStr = sizeText.padStart(sizeWidth);
      const timeStr = formatLsTime(new Date(file.modified_at));
      // Dim the qmd:// prefix, highlight the filename
      console.log(`${sizeStr} ${timeStr} ${c.dim}qmd://${collectionName}/${c.reset}${c.cyan}${file.path}${c.reset}`);
    }

    closeDb();
  }
  /**
   * Format date/time like `ls -l`.
   *
   * Dates older than roughly six months (6 x 30 days before now) are shown
   * as "Mon DD YYYY"; everything else as "Mon DD HH:MM". The day is
   * space-padded to two characters, hours and minutes are zero-padded.
   * All fields use the local time zone.
   */
  function formatLsTime(date: Date): string {
    const MONTH_NAMES = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];
    const SIX_MONTHS_MS = 6 * 30 * 24 * 60 * 60 * 1000;

    const pad2 = (value: number, fill: string): string => String(value).padStart(2, fill);

    const datePart = `${MONTH_NAMES[date.getMonth()]} ${pad2(date.getDate(), ' ')}`;
    const cutoff = Date.now() - SIX_MONTHS_MS;

    // Old entries trade the time of day for the year.
    if (date.getTime() < cutoff) {
      return `${datePart} ${date.getFullYear()}`;
    }
    return `${datePart} ${pad2(date.getHours(), '0')}:${pad2(date.getMinutes(), '0')}`;
  }
  // Collection management commands

  /**
   * Print every collection known to the index database with its glob
   * pattern, active file count and the time since its last modification.
   * A collection without a `last_modified` value is shown relative to now.
   */
  function collectionList(): void {
    const db = getDb();
    const collections = listCollections(db);

    if (collections.length === 0) {
      console.log("No collections found. Run 'qmd add .' to create one.");
      closeDb();
      return;
    }

    console.log(`${c.bold}Collections (${collections.length}):${c.reset}\n`);

    collections.forEach(entry => {
      const lastUpdate = entry.last_modified ? new Date(entry.last_modified) : new Date();
      const age = formatTimeAgo(lastUpdate);

      console.log(`${c.cyan}${entry.name}${c.reset} ${c.dim}(qmd://${entry.name}/)${c.reset}`);
      console.log(` ${c.dim}Pattern:${c.reset} ${entry.glob_pattern}`);
      console.log(` ${c.dim}Files:${c.reset} ${entry.active_count}`);
      console.log(` ${c.dim}Updated:${c.reset} ${age}`);
      console.log();
    });

    closeDb();
  }
  /**
   * Register a new collection in the YAML config and index its files.
   *
   * When `name` is omitted the collection is named after the last component
   * of `pwd` ("root" if there is none). The command refuses to proceed, and
   * exits with status 1, if the name is already taken or if another
   * collection already covers the same path and pattern.
   *
   * @param pwd         Directory the collection is rooted at.
   * @param globPattern Glob selecting the files to index.
   * @param name        Optional explicit collection name.
   */
  async function collectionAdd(pwd: string, globPattern: string, name?: string): Promise<void> {
    // Fall back to the directory's base name when no name was supplied.
    const collName = name || pwd.split('/').filter(Boolean).pop() || 'root';

    // Reject a name that is already in use.
    if (getCollectionFromYaml(collName)) {
      console.error(`${c.yellow}Collection '${collName}' already exists.${c.reset}`);
      console.error(`Use a different name with --name <name>`);
      process.exit(1);
    }

    // Reject a second collection for the same path + pattern pair.
    const duplicate = yamlListCollections().find(entry => entry.path === pwd && entry.pattern === globPattern);
    if (duplicate) {
      console.error(`${c.yellow}A collection already exists for this path and pattern:${c.reset}`);
      console.error(` Name: ${duplicate.name} (qmd://${duplicate.name}/)`);
      console.error(` Pattern: ${globPattern}`);
      console.error(`\nUse 'qmd update' to re-index it, or remove it first with 'qmd collection remove ${duplicate.name}'`);
      process.exit(1);
    }

    // Persist the definition first, then build the index for it.
    const collectionsModule = await import("./collections.js");
    collectionsModule.addCollection(collName, pwd, globPattern);

    console.log(`Creating collection '${collName}'...`);
    await indexFiles(pwd, globPattern, collName);
    console.log(`${c.green}✓${c.reset} Collection '${collName}' created successfully`);
  }
  /**
   * Remove a collection and report how many documents (and orphaned content
   * hashes, if any) were deleted. Exits with status 1 when the collection is
   * not present in the YAML config.
   */
  function collectionRemove(name: string): void {
    if (!getCollectionFromYaml(name)) {
      console.error(`${c.yellow}Collection not found: ${name}${c.reset}`);
      console.error(`Run 'qmd collection list' to see available collections.`);
      process.exit(1);
    }

    const { deletedDocs, cleanedHashes } = removeCollection(getDb(), name);
    closeDb();

    console.log(`${c.green}✓${c.reset} Removed collection '${name}'`);
    console.log(` Deleted ${deletedDocs} documents`);
    if (cleanedHashes > 0) {
      console.log(` Cleaned up ${cleanedHashes} orphaned content hashes`);
    }
  }
  /**
   * Rename a collection. Exits with status 1 when `oldName` is not present
   * in the YAML config or when `newName` is already taken.
   */
  function collectionRename(oldName: string, newName: string): void {
    // The source collection must exist...
    if (!getCollectionFromYaml(oldName)) {
      console.error(`${c.yellow}Collection not found: ${oldName}${c.reset}`);
      console.error(`Run 'qmd collection list' to see available collections.`);
      process.exit(1);
    }

    // ...and the target name must still be free.
    if (getCollectionFromYaml(newName)) {
      console.error(`${c.yellow}Collection name already exists: ${newName}${c.reset}`);
      console.error(`Choose a different name or remove the existing collection first.`);
      process.exit(1);
    }

    renameCollection(getDb(), oldName, newName);
    closeDb();

    const fromPath = `${c.cyan}qmd://${oldName}/${c.reset}`;
    const toPath = `${c.cyan}qmd://${newName}/${c.reset}`;
    console.log(`${c.green}✓${c.reset} Renamed collection '${oldName}' to '${newName}'`);
    console.log(` Virtual paths updated: ${fromPath} → ${toPath}`);
  }
  /**
   * Scan a directory and bring the collection's document index in sync with
   * the files on disk.
   *
   * New files are inserted, changed files (different content hash) are
   * updated, files with an unchanged hash only get a title refresh when the
   * extracted title differs, and active documents whose path was not seen
   * during the scan are deactivated. Orphaned content is cleaned up
   * afterwards and a summary is printed.
   *
   * @param pwd                 Directory to scan; defaults to the working directory.
   * @param globPattern         Glob selecting the files to index.
   * @param collectionName      Collection the documents belong to (required).
   * @param suppressEmbedNotice Do not print the "run 'qmd embed'" hint.
   * @throws Error when `collectionName` is missing.
   */
  async function indexFiles(pwd?: string, globPattern: string = DEFAULT_GLOB, collectionName?: string, suppressEmbedNotice: boolean = false): Promise<void> {
    // Validate before touching the database, so a missing name neither
    // leaves the handle open nor clears the cache for nothing.
    if (!collectionName) {
      throw new Error("Collection name is required. Collections must be defined in ~/.config/qmd/index.yml");
    }

    const db = getDb();
    const resolvedPwd = pwd || getPwd();
    const now = new Date().toISOString();
    const excludeDirs = ["node_modules", ".git", ".cache", "vendor", "dist", "build"];

    // Clear cached results on every index run (the original comment calls
    // this the Ollama cache).
    clearCache(db);

    console.log(`Collection: ${resolvedPwd} (${globPattern})`);
    progress.indeterminate();

    // A file is skipped when any path component is hidden (starts with ".")
    // or is one of the excluded directory names.
    const isExcludedPart = (part: string): boolean => part.startsWith(".") || excludeDirs.includes(part);

    const glob = new Glob(globPattern);
    const files: string[] = [];
    for await (const file of glob.scan({ cwd: resolvedPwd, onlyFiles: true, followSymlinks: true })) {
      if (!file.split("/").some(isExcludedPart)) {
        files.push(file);
      }
    }

    const total = files.length;
    if (total === 0) {
      progress.clear();
      console.log("No files found matching pattern.");
      closeDb();
      return;
    }

    let indexed = 0;
    let updated = 0;
    let unchanged = 0;
    let processed = 0;
    const seenPaths = new Set<string>();
    const startTime = Date.now();

    for (const relativeFile of files) {
      const filepath = getRealPath(resolve(resolvedPwd, relativeFile));
      const path = handelize(relativeFile); // Normalize path for token-friendliness
      // Recorded before the empty-file check, so an existing document whose
      // file is now empty is not deactivated below.
      seenPaths.add(path);

      const content = await Bun.file(filepath).text();

      // Skip empty files - nothing useful to index
      if (!content.trim()) {
        processed++;
        continue;
      }

      const hash = await hashContent(content);
      const title = extractTitle(content, relativeFile);

      // Check if document exists in this collection with this path
      const existing = findActiveDocument(db, collectionName, path);

      if (existing && existing.hash === hash) {
        // Same content: only the title may need refreshing.
        if (existing.title !== title) {
          updateDocumentTitle(db, existing.id, title, now);
          updated++;
        } else {
          unchanged++;
        }
      } else {
        // New or changed content: store the content, then the document row.
        insertContent(db, hash, content, now);
        const stat = await Bun.file(filepath).stat();
        const modifiedAt = stat ? new Date(stat.mtime).toISOString() : now;

        if (existing) {
          updateDocument(db, existing.id, title, hash, modifiedAt);
          updated++;
        } else {
          const createdAt = stat ? new Date(stat.birthtime).toISOString() : now;
          insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt);
          indexed++;
        }
      }

      processed++;
      progress.set((processed / total) * 100);

      // The ETA is only shown once a few files give a usable rate.
      const elapsed = (Date.now() - startTime) / 1000;
      const rate = processed / elapsed;
      const remaining = (total - processed) / rate;
      const eta = processed > 2 ? ` ETA: ${formatETA(remaining)}` : "";
      process.stderr.write(`\rIndexing: ${processed}/${total}${eta} `);
    }

    // Deactivate documents in this collection that no longer exist
    let removed = 0;
    for (const activePath of getActiveDocumentPaths(db, collectionName)) {
      if (!seenPaths.has(activePath)) {
        deactivateDocument(db, collectionName, activePath);
        removed++;
      }
    }

    // Clean up orphaned content hashes (content not referenced by any document)
    const orphanedContent = cleanupOrphanedContent(db);

    // Check if vector index needs updating
    const needsEmbedding = getHashesNeedingEmbedding(db);

    progress.clear();
    console.log(`\nIndexed: ${indexed} new, ${updated} updated, ${unchanged} unchanged, ${removed} removed`);
    if (orphanedContent > 0) {
      console.log(`Cleaned up ${orphanedContent} orphaned content hash(es)`);
    }
    if (needsEmbedding > 0 && !suppressEmbedNotice) {
      console.log(`\nRun 'qmd embed' to update embeddings (${needsEmbedding} unique hashes need vectors)`);
    }

    closeDb();
  }
  /**
   * Render a text progress bar of `width` cells.
   *
   * @param percent Completion in percent. Values outside 0..100 are clamped
   *                and NaN is treated as 0, so an overshooting caller gets a
   *                full bar instead of a RangeError from a negative repeat
   *                count.
   * @param width   Total number of cells (default 30).
   * @returns `width` characters: "█" for the completed part, "░" for the rest.
   */
  function renderProgressBar(percent: number, width: number = 30): string {
    const bounded = Number.isNaN(percent) ? 0 : Math.min(100, Math.max(0, percent));
    const filled = Math.round((bounded / 100) * width);
    return "█".repeat(filled) + "░".repeat(width - filled);
  }
  // Generate and store embeddings for every content hash returned by
  // getHashesForEmbedding().
  //
  // model: label stored with each embedding row and shown in the banner
  //        (default DEFAULT_EMBED_MODEL).
  // force: when true, all existing embeddings are cleared first.
  //
  // Flow: chunk each document body by token count, embed one chunk to discover
  // the vector dimension, then embed all chunks in batches of 32 with a
  // per-chunk fallback when a batch fails. Progress is drawn on stderr.
  //
  // Throws if the first chunk cannot be embedded. The terminal cursor is always
  // restored and the database is always closed, including on failure.
  async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = false): Promise<void> {
    const db = getDb();
    const now = new Date().toISOString();

    try {
      // If force, clear all vectors
      if (force) {
        console.log(`${c.yellow}Force re-indexing: clearing all vectors...${c.reset}`);
        clearAllEmbeddings(db);
      }

      // Find unique hashes that need embedding (from active documents)
      const hashesToEmbed = getHashesForEmbedding(db);
      if (hashesToEmbed.length === 0) {
        console.log(`${c.green}✓ All content hashes already have embeddings.${c.reset}`);
        return;
      }

      // Prepare documents with chunks
      type ChunkItem = { hash: string; title: string; text: string; seq: number; pos: number; tokens: number; bytes: number; displayName: string };
      const allChunks: ChunkItem[] = [];
      let multiChunkDocs = 0;
      // One encoder is enough; it is stateless between encode() calls.
      const encoder = new TextEncoder();

      // Chunk all documents using actual token counts
      process.stderr.write(`Chunking ${hashesToEmbed.length} documents by token count...\n`);
      for (const item of hashesToEmbed) {
        const bodyBytes = encoder.encode(item.body).length;
        if (bodyBytes === 0) continue; // Skip empty

        const title = extractTitle(item.body, item.path);
        const displayName = item.path;
        const chunks = await chunkDocumentByTokens(item.body); // Uses actual tokenizer
        if (chunks.length > 1) multiChunkDocs++;

        chunks.forEach((chunk, seq) => {
          allChunks.push({
            hash: item.hash,
            title,
            text: chunk.text,
            seq,
            pos: chunk.pos,
            tokens: chunk.tokens,
            bytes: encoder.encode(chunk.text).length,
            displayName,
          });
        });
      }

      if (allChunks.length === 0) {
        console.log(`${c.green}✓ No non-empty documents to embed.${c.reset}`);
        return;
      }

      const totalBytes = allChunks.reduce((sum, chunk) => sum + chunk.bytes, 0);
      const totalChunks = allChunks.length;
      const totalDocs = hashesToEmbed.length;

      console.log(`${c.bold}Embedding ${totalDocs} documents${c.reset} ${c.dim}(${totalChunks} chunks, ${formatBytes(totalBytes)})${c.reset}`);
      if (multiChunkDocs > 0) {
        console.log(`${c.dim}${multiChunkDocs} documents split into multiple chunks${c.reset}`);
      }
      console.log(`${c.dim}Model: ${model}${c.reset}\n`);

      let chunksEmbedded = 0;
      let errors = 0;
      let bytesProcessed = 0;
      let startTime = Date.now();

      // Hide cursor during embedding; the finally block below restores it even
      // when embedding throws.
      cursor.hide();
      try {
        // Get embedding dimensions from first chunk
        progress.indeterminate();
        const llm = getDefaultLlamaCpp();
        const firstChunk = allChunks[0];
        if (!firstChunk) {
          throw new Error("No chunks available to embed");
        }
        const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title);
        const firstResult = await llm.embed(firstText);
        if (!firstResult) {
          throw new Error("Failed to get embedding dimensions from first chunk");
        }
        ensureVecTable(db, firstResult.embedding.length);

        // Timing starts after the dimension probe so it is not counted as throughput.
        startTime = Date.now();

        // Batch embedding for better throughput
        // Process in batches of 32 to balance memory usage and efficiency
        const BATCH_SIZE = 32;
        for (let batchStart = 0; batchStart < allChunks.length; batchStart += BATCH_SIZE) {
          const batch = allChunks.slice(batchStart, batchStart + BATCH_SIZE);

          // Format texts for embedding
          const texts = batch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title));

          // Number of chunks of this batch already accounted for in the counters.
          // If the batch path throws midway, the fallback resumes from here so no
          // chunk is counted (or inserted) twice.
          let handled = 0;
          try {
            // Batch embed all texts at once
            const embeddings = await llm.embedBatch(texts);
            for (let i = 0; i < batch.length; i++) {
              const chunk = batch[i]!;
              const embedding = embeddings[i];
              if (embedding) {
                insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now);
                chunksEmbedded++;
              } else {
                errors++;
                console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}${c.reset}`);
              }
              bytesProcessed += chunk.bytes;
              handled = i + 1;
            }
          } catch {
            // Best-effort fallback: embed the remaining chunks one at a time.
            for (const chunk of batch.slice(handled)) {
              try {
                const text = formatDocForEmbedding(chunk.text, chunk.title);
                const result = await llm.embed(text);
                if (result) {
                  insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
                  chunksEmbedded++;
                } else {
                  errors++;
                  console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}${c.reset}`);
                }
              } catch (innerErr) {
                errors++;
                console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}: ${innerErr}${c.reset}`);
              }
              bytesProcessed += chunk.bytes;
            }
          }

          const percent = totalBytes > 0 ? (bytesProcessed / totalBytes) * 100 : 100;
          progress.set(percent);

          // Guard the divisions: a very fast first batch can report zero elapsed time.
          const elapsed = (Date.now() - startTime) / 1000;
          const bytesPerSec = elapsed > 0 ? bytesProcessed / elapsed : 0;
          const remainingBytes = totalBytes - bytesProcessed;
          const etaSec = bytesPerSec > 0 ? remainingBytes / bytesPerSec : 0;

          const bar = renderProgressBar(percent);
          const percentStr = percent.toFixed(0).padStart(3);
          const throughput = `${formatBytes(bytesPerSec)}/s`;
          const eta = elapsed > 2 ? formatETA(etaSec) : "...";
          const errStr = errors > 0 ? ` ${c.yellow}${errors} err${c.reset}` : "";

          process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}%${c.reset} ${c.dim}${chunksEmbedded}/${totalChunks}${c.reset}${errStr} ${c.dim}${throughput} ETA ${eta}${c.reset} `);
        }
      } finally {
        progress.clear();
        cursor.show();
      }

      const totalTimeSec = (Date.now() - startTime) / 1000;
      const avgThroughput = formatBytes(totalTimeSec > 0 ? totalBytes / totalTimeSec : 0);

      console.log(`\r${c.green}${renderProgressBar(100)}${c.reset} ${c.bold}100%${c.reset} `);
      console.log(`\n${c.green}✓ Done!${c.reset} Embedded ${c.bold}${chunksEmbedded}${c.reset} chunks from ${c.bold}${totalDocs}${c.reset} documents in ${c.bold}${formatETA(totalTimeSec)}${c.reset} ${c.dim}(${avgThroughput}/s)${c.reset}`);
      if (errors > 0) {
        console.log(`${c.yellow}⚠ ${errors} chunks failed${c.reset}`);
      }
    } finally {
      closeDb();
    }
  }
  // Sanitize a term for FTS5: remove punctuation and whitespace, keeping
  // apostrophes (for contractions like "don't") and underscores.
  // Letters, combining marks and digits from any script are kept. A plain \w
  // class is ASCII-only and would strip words such as "café" or "日本語".
  function sanitizeFTS5Term(term: string): string {
    return term.replace(/[^\p{L}\p{M}\p{N}_']/gu, '');
  }
  // Build FTS5 query: phrase-aware with fallback to individual terms.
  //
  // Returns "" when no term of at least two characters survives sanitization,
  // a single quoted term when exactly one survives, and otherwise
  //   ("exact phrase") OR (NEAR("t1" "t2" ..., 10)) OR ("t1" OR "t2" ...)
  // so exact phrase matches rank highest, then close proximity, then any term.
  function buildFTS5Query(query: string): string {
    // Wrap a value as an FTS5 string literal (embedded quotes are doubled).
    const quote = (value: string): string => `"${value.replace(/"/g, '""')}"`;

    // Sanitize the full query for phrase matching. Unicode-aware so that
    // non-ASCII letters and digits are kept instead of being stripped by \w.
    const sanitizedQuery = query.replace(/[^\p{L}\p{M}\p{N}_\s']/gu, '').trim();

    const terms = query
      .split(/\s+/)
      .map(sanitizeFTS5Term)
      .filter(term => term.length >= 2); // Skip single chars and empty

    if (terms.length === 0) return "";
    if (terms.length === 1) return quote(terms[0]!);

    const phrase = quote(sanitizedQuery);
    const quotedTerms = terms.map(quote);
    // FTS5 NEAR syntax: NEAR(term1 term2, distance)
    const nearPhrase = `NEAR(${quotedTerms.join(' ')}, 10)`;
    const orTerms = quotedTerms.join(' OR ');

    // Exact phrase > proximity > any term
    return `(${phrase}) OR (${nearPhrase}) OR (${orTerms})`;
  }
  // Squash a BM25 score into the 0-1 range with a logistic curve.
  // Only the magnitude of the score is used (SQLite reports BM25 as negative
  // numbers where lower means better), and larger magnitudes map closer to 1.
  // The curve is centred at magnitude 5 (-> 0.5) with a scale of 3, giving
  // roughly 0.27 at magnitude 2 and 0.97 at magnitude 15.
  function normalizeBM25(score: number): number {
    const magnitude = Math.abs(score);
    const exponent = -(magnitude - 5) / 3;
    return 1 / (1 + Math.exp(exponent));
  }
  // Min-max normalize result scores into the 0-1 range.
  //
  // Returns new result objects (the inputs are not mutated); an empty input is
  // returned as-is. When every score is identical the range falls back to 1, so
  // all normalized scores become 0.
  function normalizeScores(results: SearchResult[]): SearchResult[] {
    if (results.length === 0) return results;

    // Single pass instead of Math.max(...scores): spreading a very large array
    // into call arguments can exceed the engine's argument/stack limit.
    let minScore = Infinity;
    let maxScore = -Infinity;
    for (const { score } of results) {
      if (score < minScore) minScore = score;
      if (score > maxScore) maxScore = score;
    }

    const range = maxScore - minScore || 1;
    return results.map(r => ({ ...r, score: (r.score - minScore) / range }));
  }
  // Reciprocal Rank Fusion: merge several ranked lists into one ranking.
  //
  // Every list a document appears in contributes weight / (k + rank + 1) to that
  // document's score (rank is 0-based; the weight defaults to 1.0 for lists with
  // no entry in `weights`). Documents are keyed by `file`; display fields are
  // taken from the first occurrence. After summing, a document whose best rank
  // in any list was #1 gets +0.05 and one whose best rank was #2-#3 gets +0.02,
  // so exact matches are not diluted by expansion queries.
  // Returns the fused documents sorted by descending score.
  function reciprocalRankFusion(
    resultLists: RankedResult[][],
    weights: number[] = [],
    k: number = 60
  ): RankedResult[] {
    type FusedEntry = { score: number; displayPath: string; title: string; body: string; bestRank: number };
    const fusedByFile = new Map<string, FusedEntry>();

    resultLists.forEach((list, listIdx) => {
      if (!list) return;
      const listWeight = weights[listIdx] ?? 1.0;

      list.forEach((doc, position) => {
        if (!doc) return;
        const contribution = listWeight / (k + position + 1);
        const entry = fusedByFile.get(doc.file);
        if (!entry) {
          fusedByFile.set(doc.file, {
            score: contribution,
            displayPath: doc.displayPath,
            title: doc.title,
            body: doc.body,
            bestRank: position,
          });
          return;
        }
        entry.score += contribution;
        entry.bestRank = Math.min(entry.bestRank, position);
      });
    });

    const fused: RankedResult[] = [];
    for (const [file, entry] of fusedByFile) {
      // Top-rank bonus: #1 anywhere beats top-3 anywhere beats everything else.
      const bonus = entry.bestRank === 0 ? 0.05 : entry.bestRank <= 2 ? 0.02 : 0;
      fused.push({
        file,
        displayPath: entry.displayPath,
        title: entry.title,
        body: entry.body,
        score: entry.score + bonus,
      });
    }
    fused.sort((left, right) => right.score - left.score);
    return fused;
  }
  // Options shared by the search commands and the result printer.
  type OutputOptions = {
    // Output format selector for the result printer.
    format: OutputFormat;
    // Emit the full document body instead of a snippet.
    full: boolean;
    // Maximum number of results to print.
    limit: number;
    // Results scoring below this threshold are dropped.
    minScore: number;
    // Return as many results as possible (raises the internal fetch limits).
    all?: boolean;
    // Restrict the search to this collection name.
    collection?: string;
    // Add line numbers to output.
    lineNumbers?: boolean;
    // Optional context passed to query expansion.
    context?: string;
  };
  // Highlight query terms in text (skip short words < 3 chars).
  //
  // Returns the text unchanged when colour output is disabled or no term
  // qualifies. Matching is case-insensitive.
  //
  // All terms are matched in a single pass with one alternation regex, so:
  //  - text is never re-scanned after escape codes were inserted, which means a
  //    repeated query term cannot be wrapped twice;
  //  - longer terms are tried first, so "searching" is highlighted whole even
  //    when "search" is also a term.
  function highlightTerms(text: string, query: string): string {
    if (!useColor) return text;

    const uniqueTerms = [...new Set(query.toLowerCase().split(/\s+/).filter(t => t.length >= 3))];
    if (uniqueTerms.length === 0) return text;

    const alternation = uniqueTerms
      .sort((a, b) => b.length - a.length)
      .map(term => term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')) // escape regex metacharacters
      .join('|');

    return text.replace(new RegExp(`(${alternation})`, 'gi'), `${c.yellow}${c.bold}$1${c.reset}`);
  }
  // Render a 0-1 score as a right-aligned whole percentage ("  7%", " 85%").
  // With colour enabled the label is tinted by strength: green at 0.7 and above,
  // yellow at 0.4 and above, dim otherwise.
  function formatScore(score: number): string {
    const label = `${(score * 100).toFixed(0).padStart(3)}%`;
    if (!useColor) return label;

    let tint = c.dim;
    if (score >= 0.7) {
      tint = c.green;
    } else if (score >= 0.4) {
      tint = c.yellow;
    }
    return `${tint}${label}${c.reset}`;
  }
  // Shorten directory path for display - relative to $HOME (used for context
  // paths, not documents).
  //
  // The home prefix is only replaced with "~" when it ends on a path boundary,
  // so a sibling such as "/home/bobby" is not mangled into "~by" when the home
  // directory is "/home/bob". Paths outside the home directory are returned
  // unchanged.
  function shortPath(dirpath: string): string {
    // Ignore trailing separators so "/home/u/" and "/home/u" behave alike.
    const home = homedir().replace(/[\\/]+$/, '');
    if (home === '' || !dirpath.startsWith(home)) return dirpath;

    const rest = dirpath.slice(home.length);
    const atBoundary = rest === '' || rest.startsWith('/') || rest.startsWith('\\');
    return atBoundary ? `~${rest}` : dirpath;
  }
  // Prefix every line of `text` with "<n>: ", numbering from `startLine`
  // (default 1). Lines are split and re-joined on "\n".
  function addLineNumbers(text: string, startLine: number = 1): string {
    const numbered: string[] = [];
    for (const [offset, content] of text.split('\n').entries()) {
      numbered.push(`${startLine + offset}: ${content}`);
    }
    return numbered.join('\n');
  }
  // Print search results in the format selected by opts.format.
  //
  // results: candidate rows; rows scoring below opts.minScore are dropped and at
  //          most opts.limit rows are printed.
  // query:   the user's query, used for snippet extraction and highlighting.
  // opts:    "json", "files", "cli", "md" and "xml" are handled explicitly; any
  //          other format value falls through to CSV.
  //
  // A row's docid is row.docid when set, otherwise the first six characters of
  // row.hash.
  function outputResults(results: { file: string; displayPath: string; title: string; body: string; score: number; context?: string | null; chunkPos?: number; hash?: string; docid?: string }[], query: string, opts: OutputOptions): void {
    const filtered = results.filter(r => r.score >= opts.minScore).slice(0, opts.limit);

    if (filtered.length === 0) {
      console.log("No results found above minimum score threshold.");
      return;
    }

    // Helper to create qmd:// URI from displayPath
    const toQmdPath = (displayPath: string) => `qmd://${displayPath}`;

    // Short document id for a row, or undefined when neither docid nor hash is set.
    const shortDocid = (row: { docid?: string; hash?: string }): string | undefined =>
      row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);

    // Escape a value for use inside a double-quoted XML attribute.
    // "&" must be replaced first so the other entities are not double-escaped.
    const escapeXmlAttr = (value: string): string =>
      value
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');

    if (opts.format === "json") {
      // JSON output for LLM consumption
      const output = filtered.map(row => {
        const docid = shortDocid(row);
        let body = opts.full ? row.body : undefined;
        let snippet = !opts.full ? extractSnippet(row.body, query, 300, row.chunkPos).snippet : undefined;
        if (opts.lineNumbers) {
          if (body) body = addLineNumbers(body);
          if (snippet) snippet = addLineNumbers(snippet);
        }
        return {
          ...(docid && { docid: `#${docid}` }),
          score: Math.round(row.score * 100) / 100,
          file: toQmdPath(row.displayPath),
          title: row.title,
          ...(row.context && { context: row.context }),
          ...(body && { body }),
          ...(snippet && { snippet }),
        };
      });
      console.log(JSON.stringify(output, null, 2));
    } else if (opts.format === "files") {
      // Simple docid,score,filepath,context output
      for (const row of filtered) {
        const docid = shortDocid(row) ?? "";
        const ctx = row.context ? `,"${row.context.replace(/"/g, '""')}"` : "";
        console.log(`#${docid},${row.score.toFixed(2)},${toQmdPath(row.displayPath)}${ctx}`);
      }
    } else if (opts.format === "cli") {
      for (const [i, row] of filtered.entries()) {
        if (!row) continue;
        const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos);
        const docid = shortDocid(row);

        // Line 1: filepath with docid
        const path = toQmdPath(row.displayPath);
        // Only show :line if we actually found a term match in the snippet body (exclude header line).
        const snippetBody = snippet.split("\n").slice(1).join("\n").toLowerCase();
        const hasMatch = query.toLowerCase().split(/\s+/).some(t => t.length > 0 && snippetBody.includes(t));
        const lineInfo = hasMatch ? `:${line}` : "";
        const docidStr = docid ? ` ${c.dim}#${docid}${c.reset}` : "";
        console.log(`${c.cyan}${path}${c.dim}${lineInfo}${c.reset}${docidStr}`);

        // Line 2: Title (if available)
        if (row.title) {
          console.log(`${c.bold}Title: ${row.title}${c.reset}`);
        }

        // Line 3: Context (if available)
        if (row.context) {
          console.log(`${c.dim}Context: ${row.context}${c.reset}`);
        }

        // Line 4: Score
        const score = formatScore(row.score);
        console.log(`Score: ${c.bold}${score}${c.reset}`);
        console.log();

        // Snippet with highlighting (diff-style header included)
        const displaySnippet = opts.lineNumbers ? addLineNumbers(snippet, line) : snippet;
        const highlighted = highlightTerms(displaySnippet, query);
        console.log(highlighted);

        // Double empty line between results
        if (i < filtered.length - 1) console.log('\n');
      }
    } else if (opts.format === "md") {
      for (const row of filtered) {
        if (!row) continue;
        const heading = row.title || row.displayPath;
        const docid = shortDocid(row);
        let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos).snippet;
        if (opts.lineNumbers) {
          content = addLineNumbers(content);
        }
        const docidLine = docid ? `**docid:** \`#${docid}\`\n` : "";
        const contextLine = row.context ? `**context:** ${row.context}\n` : "";
        console.log(`---\n# ${heading}\n${docidLine}${contextLine}\n${content}\n`);
      }
    } else if (opts.format === "xml") {
      for (const row of filtered) {
        // Attribute values are fully escaped; escaping only '"' left "&" and "<"
        // in titles/contexts to produce malformed markup.
        const titleAttr = row.title ? ` title="${escapeXmlAttr(row.title)}"` : "";
        const contextAttr = row.context ? ` context="${escapeXmlAttr(row.context)}"` : "";
        const docid = shortDocid(row) ?? "";
        let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos).snippet;
        if (opts.lineNumbers) {
          content = addLineNumbers(content);
        }
        console.log(`<file docid="#${escapeXmlAttr(docid)}" name="${escapeXmlAttr(toQmdPath(row.displayPath))}"${titleAttr}${contextAttr}>\n${content}\n</file>\n`);
      }
    } else {
      // CSV format
      console.log("docid,score,file,title,context,line,snippet");
      for (const row of filtered) {
        const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos);
        let content = opts.full ? row.body : snippet;
        if (opts.lineNumbers) {
          // A full body is numbered from 1, as in the other formats; only a
          // snippet is numbered from the line extractSnippet reported.
          content = addLineNumbers(content, opts.full ? 1 : line);
        }
        const docid = shortDocid(row) ?? "";
        const snippetText = content || "";
        console.log(`#${docid},${row.score.toFixed(4)},${escapeCSV(toQmdPath(row.displayPath))},${escapeCSV(row.title || "")},${escapeCSV(row.context || "")},${line},${escapeCSV(snippetText)}`);
      }
    }
  }
  // Full-text (BM25) search command: run searchFTS for the query, attach the
  // context of each hit and print the results via outputResults.
  // Exits the process with status 1 when opts.collection names an unknown
  // collection.
  function search(query: string, opts: OutputOptions): void {
    const db = getDb();

    // Reject an unknown collection filter before doing any work.
    let collectionName: string | undefined;
    if (opts.collection) {
      if (!getCollectionFromYaml(opts.collection)) {
        console.error(`Collection not found: ${opts.collection}`);
        closeDb();
        process.exit(1);
      }
      collectionName = opts.collection;
    }

    // --all fetches (almost) everything; otherwise over-fetch so the printer
    // still has enough rows after score filtering.
    const fetchLimit = opts.all ? 100000 : Math.max(50, opts.limit * 2);

    // searchFTS accepts collection name as number parameter for legacy reasons (will be fixed in store.ts)
    const hits = searchFTS(db, query, fetchLimit, collectionName as any);

    const rows = [];
    for (const hit of hits) {
      rows.push({
        file: hit.filepath,
        displayPath: hit.displayPath,
        title: hit.title,
        body: hit.body || "",
        score: hit.score,
        context: getContextForFile(db, hit.filepath),
        hash: hit.hash,
        docid: hit.docid,
      });
    }
    closeDb();

    if (rows.length === 0) {
      console.log("No results found.");
      return;
    }
    outputResults(rows, query, opts);
  }
  // Vector search command: expand the query into vector/hyde variants, run a
  // vector search for each variant, keep the best score per file and print the
  // top opts.limit results.
  // Exits with status 1 for an unknown collection; prints a hint and returns
  // when the vectors_vec table does not exist yet.
  async function vectorSearch(query: string, opts: OutputOptions, model: string = DEFAULT_EMBED_MODEL): Promise<void> {
    const db = getDb();

    // Reject an unknown collection filter before doing any work.
    let collectionName: string | undefined;
    if (opts.collection) {
      if (!getCollectionFromYaml(opts.collection)) {
        console.error(`Collection not found: ${opts.collection}`);
        closeDb();
        process.exit(1);
      }
      collectionName = opts.collection;
    }

    const vecTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (!vecTable) {
      console.error("Vector index not found. Run 'qmd embed' first to create embeddings.");
      closeDb();
      return;
    }

    // Check index health and warn about issues
    checkIndexHealth(db);

    // Expand query using structured output (no lexical for vector-only search)
    const queryables = await expandQueryStructured(query, false, opts.context);

    // Queries to run: the original first, then every distinct vec/hyde expansion.
    const vectorQueries: string[] = [
      query,
      ...queryables
        .filter(q => (q.type === 'vec' || q.type === 'hyde') && q.text && q.text !== query)
        .map(q => q.text),
    ];

    process.stderr.write(`${c.dim}Searching ${vectorQueries.length} vector queries...${c.reset}\n`);

    const perQueryLimit = opts.all ? 500 : 20;
    // Best-scoring hit per file across all query variations.
    const bestByFile = new Map<string, { file: string; displayPath: string; title: string; body: string; score: number; hash: string }>();

    // IMPORTANT: Run vector searches sequentially, not with Promise.all.
    // node-llama-cpp's embedding context hangs when multiple concurrent embed() calls
    // are made. This is a known limitation of the LlamaEmbeddingContext.
    // See: https://github.com/tobi/qmd/pull/23
    for (const variant of vectorQueries) {
      const hits = await searchVec(db, variant, model, perQueryLimit, collectionName as any);
      for (const hit of hits) {
        const current = bestByFile.get(hit.filepath);
        if (current && hit.score <= current.score) continue;
        bestByFile.set(hit.filepath, {
          file: hit.filepath,
          displayPath: hit.displayPath,
          title: hit.title,
          body: hit.body || "",
          score: hit.score,
          hash: hit.hash,
        });
      }
    }

    // Sort by max score, limit to the requested count, then attach context.
    const ranked = [...bestByFile.values()].sort((a, b) => b.score - a.score);
    const results = ranked
      .slice(0, opts.limit)
      .map(r => ({ ...r, context: getContextForFile(db, r.file) }));
    closeDb();

    if (results.length === 0) {
      console.log("No results found.");
      return;
    }
    outputResults(results, query, { ...opts, limit: results.length }); // Already limited
  }
  // Expand a query through the default LLM and log the expansion to stderr as a
  // small tree: the original query first, then one line per expansion whose text
  // differs from the query (newlines flattened, previews capped at 80 chars).
  // Returns the expansions exactly as produced by llm.expandQuery().
  async function expandQueryStructured(query: string, includeLexical: boolean = true, context?: string): Promise<Queryable[]> {
    process.stderr.write(`${c.dim}Expanding query...${c.reset}\n`);

    const llm = getDefaultLlamaCpp();
    const queryables = await llm.expandQuery(query, { includeLexical, context });

    const rootLabel = includeLexical ? ' · (lexical+vector)' : ' · (vector)';
    const tree: string[] = [`${c.dim}├─ ${query}${rootLabel}${c.reset}`];

    for (const q of queryables) {
      if (!q || q.text === query) continue;

      const flat = q.text.replace(/\n/g, ' ');
      const preview = flat.length > 80 ? flat.substring(0, 77) + '...' : flat;

      let kind = 'vector';
      if (q.type === 'lex') {
        kind = 'lexical';
      } else if (q.type === 'hyde') {
        kind = 'hyde';
      }
      tree.push(`${c.dim}├─ ${preview} · (${kind})${c.reset}`);
    }

    // The final branch of the tree is drawn with └─ instead of ├─.
    const lastIdx = tree.length - 1;
    tree[lastIdx] = tree[lastIdx]!.replace('├─', '└─');

    tree.forEach(line => {
      process.stderr.write(line + '\n');
    });

    return queryables;
  }
  // Return the original query followed by the text of every expansion (lexical
  // expansions included), without duplicates and in first-seen order.
  // `_model` and `_db` are accepted but not used by this function.
  async function expandQuery(query: string, _model: string = DEFAULT_QUERY_MODEL, _db?: Database): Promise<string[]> {
    const expansions = await expandQueryStructured(query, true);
    const texts = expansions.map(q => q.text);
    return [...new Set<string>([query, ...texts])];
  }
  // Hybrid search command: BM25 + vector retrieval, fused with Reciprocal Rank
  // Fusion and refined with a reranker.
  //
  // Steps:
  //  1. Probe BM25 with the raw query; when the top hit is strong and clearly
  //     ahead of the runner-up, skip LLM query expansion.
  //  2. Run every lexical query through FTS and, when the vectors_vec table
  //     exists, every vector query through vector search.
  //  3. Fuse the ranked lists with RRF; lists produced by the original query
  //     count double.
  //  4. Rerank one chunk per document (the chunk with the most query-term hits)
  //     for at most RERANK_DOC_LIMIT documents.
  //  5. Blend the fused rank with the reranker score and print the results.
  //
  // Exits with status 1 when opts.collection names an unknown collection.
  async function querySearch(query: string, opts: OutputOptions, embedModel: string = DEFAULT_EMBED_MODEL, rerankModel: string = DEFAULT_RERANK_MODEL): Promise<void> {
    const db = getDb();

    // Validate collection filter if specified
    let collectionName: string | undefined;
    if (opts.collection) {
      const coll = getCollectionFromYaml(opts.collection);
      if (!coll) {
        console.error(`Collection not found: ${opts.collection}`);
        closeDb();
        process.exit(1);
      }
      collectionName = opts.collection;
    }

    // Check index health and warn about issues
    checkIndexHealth(db);

    // Initial BM25 probe, used to decide whether query expansion is worth it.
    const initialFts = searchFTS(db, query, 20, collectionName as any);
    const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();

    // Strong signal = top result is strong AND clearly separated from runner-up.
    // This avoids skipping expansion when BM25 has lots of mediocre matches.
    const topScore = initialFts[0]?.score ?? 0;
    const secondScore = initialFts[1]?.score ?? 0;
    const hasStrongSignal = initialFts.length > 0 && topScore >= 0.85 && (topScore - secondScore) >= 0.15;

    // Index 0 of both lists is always the original query.
    const ftsQueries: string[] = [query];
    const vectorQueries: string[] = [query];

    if (hasStrongSignal) {
      // Strong BM25 signal - skip expensive LLM expansion
      process.stderr.write(`${c.dim}Strong BM25 signal (${topScore.toFixed(2)}) - skipping expansion${c.reset}\n`);
      // Still log a one-line "expansion tree" in the same style as vsearch for consistency.
      process.stderr.write(`${c.dim}└─ ${query} · (lexical+vector)${c.reset}` + '\n');
    } else {
      // Weak signal - expand query for better recall
      const queryables = await expandQueryStructured(query, true, opts.context);
      for (const q of queryables) {
        if (!q.text || q.text === query) continue;
        if (q.type === 'lex') {
          ftsQueries.push(q.text);
        } else if (q.type === 'vec' || q.type === 'hyde') {
          vectorQueries.push(q.text);
        }
      }
    }

    process.stderr.write(`${c.dim}Searching ${ftsQueries.length} lexical + ${vectorQueries.length} vector queries...${c.reset}\n`);

    // Ranked result lists for RRF fusion, with the weight of each list recorded
    // at the moment it is pushed. Deriving weights from list position afterwards
    // is unreliable because empty result lists are skipped.
    const rankedLists: RankedResult[][] = [];
    const listWeights: number[] = [];
    // Map to store hash by filepath for final results
    const hashMap = new Map<string, string>();

    // FTS searches
    for (const [idx, q] of ftsQueries.entries()) {
      if (!q) continue;
      const ftsResults = searchFTS(db, q, 20, (collectionName || "") as any);
      if (ftsResults.length === 0) continue;
      for (const r of ftsResults) hashMap.set(r.filepath, r.hash);
      rankedLists.push(ftsResults.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score })));
      listWeights.push(idx === 0 ? 2.0 : 1.0); // original query counts double
    }

    // Vector searches.
    // IMPORTANT: run these sequentially, not with Promise.all. node-llama-cpp's
    // embedding context hangs when multiple concurrent embed() calls are made.
    // See: https://github.com/tobi/qmd/pull/23
    if (hasVectors) {
      for (const [idx, q] of vectorQueries.entries()) {
        if (!q) continue;
        const vecResults = await searchVec(db, q, embedModel, 20, (collectionName || "") as any);
        if (vecResults.length === 0) continue;
        for (const r of vecResults) hashMap.set(r.filepath, r.hash);
        rankedLists.push(vecResults.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score })));
        listWeights.push(idx === 0 ? 2.0 : 1.0); // original query counts double
      }
    }

    // Apply Reciprocal Rank Fusion to combine all ranked lists
    const fused = reciprocalRankFusion(rankedLists, listWeights);

    // Hard cap reranking for latency/cost. We rerank per-document (best chunk only).
    const RERANK_DOC_LIMIT = 40;
    const candidates = fused.slice(0, RERANK_DOC_LIMIT);

    if (candidates.length === 0) {
      console.log("No results found.");
      closeDb();
      return;
    }

    // We only rerank ONE chunk per document (best chunk by a simple keyword
    // heuristic), so we never rerank more than RERANK_DOC_LIMIT items.
    const chunksToRerank: { file: string; text: string; chunkIdx: number }[] = [];
    const docChunkMap = new Map<string, { chunks: { text: string; pos: number }[]; bestIdx: number }>();
    const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);

    for (const candidate of candidates) {
      const chunks = chunkDocument(candidate.body);
      if (chunks.length === 0) continue;

      // Choose best chunk by keyword matches; fall back to first chunk.
      let bestIdx = 0;
      let bestScore = -1;
      for (let i = 0; i < chunks.length; i++) {
        const chunkLower = chunks[i]!.text.toLowerCase();
        const score = queryTerms.reduce((acc, term) => acc + (chunkLower.includes(term) ? 1 : 0), 0);
        if (score > bestScore) {
          bestScore = score;
          bestIdx = i;
        }
      }

      chunksToRerank.push({ file: candidate.file, text: chunks[bestIdx]!.text, chunkIdx: bestIdx });
      docChunkMap.set(candidate.file, { chunks, bestIdx });
    }

    // Rerank selected chunks (with caching). One chunk per doc -> one rerank item per doc.
    const reranked = await rerank(
      query,
      chunksToRerank.map(item => ({ file: item.file, text: item.text })),
      rerankModel,
      db
    );

    const aggregatedScores = new Map<string, { score: number; bestChunkIdx: number }>();
    for (const r of reranked) {
      const chunkInfo = docChunkMap.get(r.file);
      aggregatedScores.set(r.file, { score: r.score, bestChunkIdx: chunkInfo?.bestIdx ?? 0 });
    }

    // Blend RRF position score with reranker score using position-aware weights.
    // Top retrieval results get more protection from reranker disagreement.
    const candidateMap = new Map(candidates.map(cand => [cand.file, { displayPath: cand.displayPath, title: cand.title, body: cand.body }]));
    const rrfRankMap = new Map(candidates.map((cand, i) => [cand.file, i + 1])); // 1-indexed rank

    const finalResults = Array.from(aggregatedScores.entries()).map(([file, { score: rerankScore, bestChunkIdx }]) => {
      const rrfRank = rrfRankMap.get(file) || 30;

      // Position-aware blending: top retrieval results preserved more
      // Rank 1-3: 75% RRF, 25% reranker (trust retrieval for exact matches)
      // Rank 4-10: 60% RRF, 40% reranker
      // Rank 11+: 40% RRF, 60% reranker (trust reranker for lower-ranked)
      let rrfWeight: number;
      if (rrfRank <= 3) {
        rrfWeight = 0.75;
      } else if (rrfRank <= 10) {
        rrfWeight = 0.60;
      } else {
        rrfWeight = 0.40;
      }

      const rrfScore = 1 / rrfRank; // Position-based: 1, 0.5, 0.33...
      const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * rerankScore;

      const candidate = candidateMap.get(file);
      // Use the selected chunk's text for the body (better for snippets)
      const chunkInfo = docChunkMap.get(file);
      const chunkBody = chunkInfo ? (chunkInfo.chunks[bestChunkIdx]?.text || chunkInfo.chunks[0]!.text) : candidate?.body || "";
      const chunkPos = chunkInfo ? (chunkInfo.chunks[bestChunkIdx]?.pos || 0) : 0;

      return {
        file,
        displayPath: candidate?.displayPath || "",
        title: candidate?.title || "",
        body: chunkBody,
        chunkPos,
        score: blendedScore,
        context: getContextForFile(db, file),
        hash: hashMap.get(file) || "",
      };
    }).sort((a, b) => b.score - a.score);

    // Deduplicate by file (safety net - shouldn't happen but prevents duplicate output)
    const seenFiles = new Set<string>();
    const dedupedResults = finalResults.filter(r => {
      if (seenFiles.has(r.file)) return false;
      seenFiles.add(r.file);
      return true;
    });

    closeDb();
    outputResults(dedupedResults, query, opts);
  }
  /**
   * Parse CLI arguments using util.parseArgs.
   *
   * Reads `Bun.argv` (minus the runtime and script path) in non-strict mode, so
   * unknown flags pass through instead of throwing. When `--index` carries a
   * name, the active index is switched via `setIndexName()` as a side effect.
   *
   * @returns An object with:
   *   - `command`: the first positional ("" when none was given)
   *   - `args`: the remaining positionals
   *   - `query`: `args` joined with single spaces
   *   - `opts`: output options derived from the flags
   *   - `values`: the raw parsed flag values
   */
  function parseCLI() {
    const { values, positionals } = parseArgs({
      args: Bun.argv.slice(2), // Skip bun and script path
      options: {
        // Global options
        // `index` must be declared as a string option: in non-strict mode an
        // undeclared `--index <name>` parses as boolean `true` and leaves <name>
        // behind as the first positional, where it would be taken for the command.
        index: { type: "string" },
        context: {
          type: "string",
        },
        "no-lex": {
          type: "boolean",
        },
        help: { type: "boolean", short: "h" },
        // Search options
        n: { type: "string" },
        "min-score": { type: "string" },
        all: { type: "boolean" },
        full: { type: "boolean" },
        csv: { type: "boolean" },
        md: { type: "boolean" },
        xml: { type: "boolean" },
        files: { type: "boolean" },
        json: { type: "boolean" },
        collection: { type: "string", short: "c" }, // Filter by collection
        // Collection options
        name: { type: "string" }, // collection name
        mask: { type: "string" }, // glob pattern
        // Embed options
        force: { type: "boolean", short: "f" },
        // Update options
        pull: { type: "boolean" }, // git pull before update
        // Get options
        l: { type: "string" }, // max lines
        from: { type: "string" }, // start line
        "max-bytes": { type: "string" }, // max bytes for multi-get
        "line-numbers": { type: "boolean" }, // add line numbers to output
      },
      allowPositionals: true,
      strict: false, // Allow unknown options to pass through
    });

    // Select index name (default: "index").
    // Non-strict parsing can still yield `true` for a value-less `--index`,
    // so only a non-empty string is forwarded.
    const indexName = typeof values.index === "string" ? values.index : undefined;
    if (indexName) {
      setIndexName(indexName);
    }

    // Determine output format. When several format flags are given the first
    // match in this chain wins: csv, md, xml, files, json.
    let format: OutputFormat = "cli";
    if (values.csv) format = "csv";
    else if (values.md) format = "md";
    else if (values.xml) format = "xml";
    else if (values.files) format = "files";
    else if (values.json) format = "json";

    // Default limit: 20 for --files/--json, 5 otherwise
    // --all means return all results (use very large limit)
    const defaultLimit = (format === "files" || format === "json") ? 20 : 5;
    const isAll = !!values.all;

    const opts: OutputOptions = {
      format,
      full: !!values.full,
      // An unparsable (or zero) -n falls back to the default limit.
      limit: isAll ? 100000 : (values.n ? parseInt(String(values.n), 10) || defaultLimit : defaultLimit),
      // An unparsable --min-score falls back to 0.
      minScore: values["min-score"] ? parseFloat(String(values["min-score"])) || 0 : 0,
      all: isAll,
      collection: values.collection as string | undefined,
      lineNumbers: !!values["line-numbers"],
    };

    return {
      command: positionals[0] || "",
      args: positionals.slice(1),
      query: positionals.slice(1).join(" "),
      opts,
      values,
    };
  }
  /**
   * Print the CLI usage summary to stdout: commands, global/search/multi-get
   * options, the model names, and finally the path returned by `getDbPath()`.
   */
  function showHelp(): void {
    // Static help text, one entry per printed line ("" prints a blank line).
    const helpLines = [
      "Usage:",
      " qmd collection add [path] --name <name> --mask <pattern> - Create/index collection",
      " qmd collection list - List all collections with details",
      " qmd collection remove <name> - Remove a collection by name",
      " qmd collection rename <old> <new> - Rename a collection",
      " qmd ls [collection[/path]] - List collections or files in a collection",
      " qmd context add [path] \"text\" - Add context for path (defaults to current dir)",
      " qmd context list - List all contexts",
      " qmd context rm <path> - Remove context",
      " qmd get <file>[:line] [-l N] [--from N] - Get document (optionally from line, max N lines)",
      " qmd multi-get <pattern> [-l N] [--max-bytes N] - Get multiple docs by glob or comma-separated list",
      " qmd status - Show index status and collections",
      " qmd update [--pull] - Re-index all collections (--pull: git pull first)",
      " qmd embed [-f] - Create vector embeddings (800 tokens/chunk, 15% overlap)",
      " qmd cleanup - Remove cache and orphaned data, vacuum DB",
      " qmd search <query> - Full-text search (BM25)",
      " qmd vsearch <query> - Vector similarity search",
      " qmd query <query> - Combined search with query expansion + reranking",
      " qmd mcp - Start MCP server (for AI agent integration)",
      "",
      "Global options:",
      " --index <name> - Use custom index name (default: index)",
      "",
      "Search options:",
      " -n <num> - Number of results (default: 5, or 20 for --files)",
      " --all - Return all matches (use with --min-score to filter)",
      " --min-score <num> - Minimum similarity score",
      " --full - Output full document instead of snippet",
      " --line-numbers - Add line numbers to output",
      " --files - Output docid,score,filepath,context (default: 20 results)",
      " --json - JSON output with snippets (default: 20 results)",
      " --csv - CSV output with snippets",
      " --md - Markdown output",
      " --xml - XML output",
      " -c, --collection <name> - Filter results to a specific collection",
      "",
      "Multi-get options:",
      " -l <num> - Maximum lines per file",
      " --max-bytes <num> - Skip files larger than N bytes (default: 10240)",
      " --json/--csv/--md/--xml/--files - Output format (same as search)",
      "",
      "Models (auto-downloaded from HuggingFace):",
      " Embedding: embeddinggemma-300M-Q8_0",
      " Reranking: qwen3-reranker-0.6b-q8_0",
      " Generation: Qwen3-0.6B-Q8_0",
      "",
    ];

    for (const helpLine of helpLines) {
      console.log(helpLine);
    }

    // Resolved last so the static text is already printed before getDbPath() runs.
    const dbPath = getDbPath();
    console.log(`Index: ${dbPath}`);
  }
  // Main CLI - only run if this is the main module
  if (import.meta.main) {
    const cli = parseCLI();

    // No command, or an explicit --help: print usage. Asking for help exits 0;
    // omitting the command is treated as an error and exits 1.
    if (!cli.command || cli.values.help) {
      showHelp();
      process.exit(cli.values.help ? 0 : 1);
    }

    // Dispatch on the first positional. `cli.args` holds the positionals after it.
    switch (cli.command) {
      case "context": {
        const subcommand = cli.args[0];
        if (!subcommand) {
          console.error("Usage: qmd context <add|list|check|rm>");
          console.error("");
          console.error("Commands:");
          console.error(" qmd context add [path] \"text\" - Add context (defaults to current dir)");
          console.error(" qmd context add / \"text\" - Add global context to all collections");
          console.error(" qmd context list - List all contexts");
          console.error(" qmd context check - Check for missing contexts");
          console.error(" qmd context rm <path> - Remove context");
          process.exit(1);
        }

        switch (subcommand) {
          case "add": {
            if (cli.args.length < 2) {
              console.error("Usage: qmd context add [path] \"text\"");
              console.error("");
              console.error("Examples:");
              console.error(" qmd context add \"Context for current directory\"");
              console.error(" qmd context add . \"Context for current directory\"");
              console.error(" qmd context add /subfolder \"Context for subfolder\"");
              console.error(" qmd context add / \"Global context for all collections\"");
              console.error("");
              console.error(" Using virtual paths:");
              console.error(" qmd context add qmd://journals/ \"Context for entire journals collection\"");
              console.error(" qmd context add qmd://journals/2024 \"Context for 2024 journals\"");
              process.exit(1);
            }

            let pathArg: string | undefined;
            let contextText: string;

            // Check if first arg looks like a path or if it's the context text
            const firstArg = cli.args[1] || '';
            const secondArg = cli.args[2];

            if (secondArg) {
              // Two (or more) args: path + context; everything after the path
              // is joined back into one context string.
              pathArg = firstArg;
              contextText = cli.args.slice(2).join(" ");
            } else {
              // One arg: context only (use current directory)
              pathArg = undefined;
              contextText = firstArg;
            }

            await contextAdd(pathArg, contextText);
            break;
          }

          case "list": {
            contextList();
            break;
          }

          case "check": {
            contextCheck();
            break;
          }

          case "rm":
          case "remove": {
            if (cli.args.length < 2 || !cli.args[1]) {
              console.error("Usage: qmd context rm <path>");
              console.error("Examples:");
              console.error(" qmd context rm /");
              console.error(" qmd context rm qmd://journals/2024");
              process.exit(1);
            }
            contextRemove(cli.args[1]);
            break;
          }

          default:
            console.error(`Unknown subcommand: ${subcommand}`);
            console.error("Available: add, list, check, rm");
            process.exit(1);
        }
        break;
      }

      case "get": {
        if (!cli.args[0]) {
          console.error("Usage: qmd get <filepath>[:line] [--from <line>] [-l <lines>] [--line-numbers]");
          process.exit(1);
        }
        // Both limits are optional; an absent flag is passed as undefined.
        const fromLine = cli.values.from ? parseInt(cli.values.from as string, 10) : undefined;
        const maxLines = cli.values.l ? parseInt(cli.values.l as string, 10) : undefined;
        getDocument(cli.args[0], fromLine, maxLines, cli.opts.lineNumbers);
        break;
      }

      case "multi-get": {
        if (!cli.args[0]) {
          console.error("Usage: qmd multi-get <pattern> [-l <lines>] [--max-bytes <bytes>] [--json|--csv|--md|--xml|--files]");
          console.error(" pattern: glob (e.g., 'journals/2025-05*.md') or comma-separated list");
          process.exit(1);
        }
        const maxLinesMulti = cli.values.l ? parseInt(cli.values.l as string, 10) : undefined;
        const maxBytes = cli.values["max-bytes"] ? parseInt(cli.values["max-bytes"] as string, 10) : DEFAULT_MULTI_GET_MAX_BYTES;
        multiGet(cli.args[0], maxLinesMulti, maxBytes, cli.opts.format);
        break;
      }

      case "ls": {
        listFiles(cli.args[0]);
        break;
      }

      case "collection": {
        const subcommand = cli.args[0];
        // Mirror `qmd context`: a bare `qmd collection` prints usage instead of
        // falling through to "Unknown subcommand: undefined".
        if (!subcommand) {
          console.error("Usage: qmd collection <add|list|remove|rename>");
          console.error("");
          console.error("Commands:");
          console.error(" qmd collection add [path] --name <name> --mask <pattern> - Create/index collection");
          console.error(" qmd collection list - List all collections with details");
          console.error(" qmd collection remove <name> - Remove a collection by name");
          console.error(" qmd collection rename <old> <new> - Rename a collection");
          process.exit(1);
        }

        switch (subcommand) {
          case "list": {
            collectionList();
            break;
          }

          case "add": {
            // Path defaults to the working directory; "." is mapped to getPwd()
            // directly, anything else is resolved and passed through getRealPath().
            const pwd = cli.args[1] || getPwd();
            const resolvedPwd = pwd === '.' ? getPwd() : getRealPath(resolve(pwd));
            const globPattern = cli.values.mask as string || DEFAULT_GLOB;
            const name = cli.values.name as string | undefined;
            await collectionAdd(resolvedPwd, globPattern, name);
            break;
          }

          case "remove":
          case "rm": {
            if (!cli.args[1]) {
              console.error("Usage: qmd collection remove <name>");
              console.error(" Use 'qmd collection list' to see available collections");
              process.exit(1);
            }
            collectionRemove(cli.args[1]);
            break;
          }

          case "rename":
          case "mv": {
            if (!cli.args[1] || !cli.args[2]) {
              console.error("Usage: qmd collection rename <old-name> <new-name>");
              console.error(" Use 'qmd collection list' to see available collections");
              process.exit(1);
            }
            collectionRename(cli.args[1], cli.args[2]);
            break;
          }

          default:
            console.error(`Unknown subcommand: ${subcommand}`);
            console.error("Available: list, add, remove, rename");
            process.exit(1);
        }
        break;
      }

      case "status":
        showStatus();
        break;

      case "update":
        await updateCollections();
        break;

      case "embed":
        await vectorIndex(DEFAULT_EMBED_MODEL, !!cli.values.force);
        break;

      case "search":
        if (!cli.query) {
          console.error("Usage: qmd search [options] <query>");
          process.exit(1);
        }
        search(cli.query, cli.opts);
        break;

      case "vsearch":
        if (!cli.query) {
          console.error("Usage: qmd vsearch [options] <query>");
          process.exit(1);
        }
        // Default min-score for vector search is 0.3
        if (!cli.values["min-score"]) {
          cli.opts.minScore = 0.3;
        }
        await vectorSearch(cli.query, cli.opts);
        break;

      case "query":
        if (!cli.query) {
          console.error("Usage: qmd query [options] <query>");
          process.exit(1);
        }
        await querySearch(cli.query, cli.opts);
        break;

      case "mcp": {
        // Loaded on demand so other commands never import the MCP module.
        const { startMcpServer } = await import("./mcp.js");
        await startMcpServer();
        break;
      }

      case "cleanup": {
        const db = getDb();

        // 1. Clear llm_cache
        const cacheCount = deleteLLMCache(db);
        console.log(`${c.green}✓${c.reset} Cleared ${cacheCount} cached API responses`);

        // 2. Remove orphaned vectors
        const orphanedVecs = cleanupOrphanedVectors(db);
        if (orphanedVecs > 0) {
          console.log(`${c.green}✓${c.reset} Removed ${orphanedVecs} orphaned embedding chunks`);
        } else {
          console.log(`${c.dim}No orphaned embeddings to remove${c.reset}`);
        }

        // 3. Remove inactive documents (nothing is printed when none were removed)
        const inactiveDocs = deleteInactiveDocuments(db);
        if (inactiveDocs > 0) {
          console.log(`${c.green}✓${c.reset} Removed ${inactiveDocs} inactive document records`);
        }

        // 4. Vacuum to reclaim space
        vacuumDatabase(db);
        console.log(`${c.green}✓${c.reset} Database vacuumed`);

        closeDb();
        break;
      }

      default:
        console.error(`Unknown command: ${cli.command}`);
        console.error("Run 'qmd --help' for usage.");
        process.exit(1);
    }

    // Every command except `mcp` disposes the default llama.cpp instance and
    // exits explicitly. NOTE(review): `mcp` is presumably skipped because the
    // server must keep running after startMcpServer() resolves - confirm.
    if (cli.command !== "mcp") {
      await disposeDefaultLlamaCpp();
      process.exit(0);
    }
  } // end if (import.meta.main)