// qmd.ts
  1. import { openDatabase } from "./db.js";
  2. import type { Database } from "./db.js";
  3. import fastGlob from "fast-glob";
  4. import { execSync, spawn as nodeSpawn } from "child_process";
  5. import { fileURLToPath } from "url";
  6. import { dirname, join as pathJoin } from "path";
  7. import { parseArgs } from "util";
  8. import { readFileSync, statSync, existsSync, unlinkSync, writeFileSync, openSync, closeSync, mkdirSync } from "fs";
  9. import {
  10. getPwd,
  11. getRealPath,
  12. homedir,
  13. resolve,
  14. enableProductionMode,
  15. searchFTS,
  16. extractSnippet,
  17. getContextForFile,
  18. getContextForPath,
  19. listCollections,
  20. removeCollection,
  21. renameCollection,
  22. findSimilarFiles,
  23. findDocumentByDocid,
  24. isDocid,
  25. matchFilesByGlob,
  26. getHashesNeedingEmbedding,
  27. getHashesForEmbedding,
  28. clearAllEmbeddings,
  29. insertEmbedding,
  30. getStatus,
  31. hashContent,
  32. extractTitle,
  33. formatDocForEmbedding,
  34. chunkDocumentByTokens,
  35. clearCache,
  36. getCacheKey,
  37. getCachedResult,
  38. setCachedResult,
  39. getIndexHealth,
  40. parseVirtualPath,
  41. buildVirtualPath,
  42. isVirtualPath,
  43. resolveVirtualPath,
  44. toVirtualPath,
  45. insertContent,
  46. insertDocument,
  47. findActiveDocument,
  48. updateDocumentTitle,
  49. updateDocument,
  50. deactivateDocument,
  51. getActiveDocumentPaths,
  52. cleanupOrphanedContent,
  53. deleteLLMCache,
  54. deleteInactiveDocuments,
  55. cleanupOrphanedVectors,
  56. vacuumDatabase,
  57. getCollectionsWithoutContext,
  58. getTopLevelPathsWithoutContext,
  59. handelize,
  60. hybridQuery,
  61. vectorSearchQuery,
  62. structuredSearch,
  63. addLineNumbers,
  64. type ExpandedQuery,
  65. type StructuredSubSearch,
  66. DEFAULT_EMBED_MODEL,
  67. DEFAULT_RERANK_MODEL,
  68. DEFAULT_GLOB,
  69. DEFAULT_MULTI_GET_MAX_BYTES,
  70. createStore,
  71. getDefaultDbPath,
  72. } from "./store.js";
  73. import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js";
  74. import {
  75. formatSearchResults,
  76. formatDocuments,
  77. escapeXml,
  78. escapeCSV,
  79. type OutputFormat,
  80. } from "./formatter.js";
  81. import {
  82. getCollection as getCollectionFromYaml,
  83. listCollections as yamlListCollections,
  84. addContext as yamlAddContext,
  85. removeContext as yamlRemoveContext,
  86. setGlobalContext,
  87. listAllContexts,
  88. setConfigIndexName,
  89. } from "./collections.js";
  // Production mode permits falling back to the default database path.
  // Tests are expected to set INDEX_PATH or call createStore() with an explicit path.
  enableProductionMode();

  // =============================================================================
  // Store/DB lifecycle — this module keeps the single store handle (no legacy
  // singletons in store.ts)
  // =============================================================================

  /** Currently open store, or null while no handle is open. */
  let store: null | ReturnType<typeof createStore> = null;

  /** Database path passed to createStore(); undefined when no override is set. */
  let storeDbPathOverride: string | undefined = undefined;
  /**
   * Return the module-level store, creating it on first use.
   *
   * Creation passes `storeDbPathOverride` to `createStore`; the resulting handle
   * is kept in `store` and reused by later calls.
   */
  function getStore(): ReturnType<typeof createStore> {
    const active = store ?? createStore(storeDbPathOverride);
    store = active;
    return active;
  }
  /** Return the database handle of the active store (opening the store if needed). */
  function getDb(): Database {
    const { db } = getStore();
    return db;
  }
  /** Close the active store, if any, and forget it so the next getStore() opens a new one. */
  function closeDb(): void {
    if (!store) {
      return;
    }
    store.close();
    store = null;
  }
  /**
   * Resolve the database path to report.
   *
   * Preference order: the open store's `dbPath`, then the configured override,
   * then the default path from `getDefaultDbPath()`.
   */
  function getDbPath(): string {
    const openPath = store?.dbPath;
    if (openPath != null) {
      return openPath;
    }
    if (storeDbPathOverride != null) {
      return storeDbPathOverride;
    }
    return getDefaultDbPath();
  }
  /**
   * Select the index used by subsequent store operations.
   *
   * A name containing `/` is treated as a path: it is made absolute against the
   * current working directory and flattened into a single file-name-safe token
   * (slashes become underscores, the leading one is dropped). A null or empty
   * name removes the override so the default database path is used again.
   *
   * The open store, if any, is closed so the next access opens the new index.
   *
   * @param name - Index name or path-like name; null/empty clears the override.
   */
  function setIndexName(name: string | null): void {
    let normalizedName = name;

    // Normalize relative paths to prevent malformed database paths.
    if (name && name.includes('/')) {
      // This file is an ES module, where CommonJS `require` is not defined, so use
      // the module-level `resolve` import and the global `process` instead.
      // NOTE(review): `resolve` comes from ./store.js and is assumed to behave like
      // path.resolve for an absolute first segment — confirm against store.ts.
      const absolutePath = resolve(process.cwd(), name);
      // Replace path separators with underscores to create a valid filename.
      normalizedName = absolutePath.replace(/\//g, '_').replace(/^_/, '');
    }

    storeDbPathOverride = normalizedName ? getDefaultDbPath(normalizedName) : undefined;

    // Reset the open handle so the next use opens the newly selected index.
    closeDb();
  }
  /**
   * Ensure the vector table exists with the given dimensionality.
   *
   * The store owns the database, so `_db` is ignored and the request is forwarded
   * to the active store.
   */
  function ensureVecTable(_db: Database, dimensions: number): void {
    const activeStore = getStore();
    activeStore.ensureVecTable(dimensions);
  }
  // Terminal colors: enabled only when NO_COLOR is unset/empty and stdout is a TTY.
  const useColor = !process.env.NO_COLOR && process.stdout.isTTY;

  // ANSI escape sequences; every entry is the empty string when color is disabled.
  const c = (() => {
    const ansi = (sequence: string): string => (useColor ? sequence : "");
    return {
      reset: ansi("\x1b[0m"),
      dim: ansi("\x1b[2m"),
      bold: ansi("\x1b[1m"),
      cyan: ansi("\x1b[36m"),
      yellow: ansi("\x1b[33m"),
      green: ansi("\x1b[32m"),
      magenta: ansi("\x1b[35m"),
      blue: ansi("\x1b[34m"),
    };
  })();
  // Terminal cursor control (escape sequences are written to stderr)
  const cursor = {
    hide(): void {
      process.stderr.write('\x1b[?25l');
    },
    show(): void {
      process.stderr.write('\x1b[?25h');
    },
  };

  // Restore the cursor before exiting on SIGINT (exit code 130) or SIGTERM (exit code 143)
  for (const [signal, exitCode] of [['SIGINT', 130], ['SIGTERM', 143]] as const) {
    process.on(signal, () => {
      cursor.show();
      process.exit(exitCode);
    });
  }
  // Terminal progress bar using the OSC 9;4 escape sequence (written to stderr)
  const progress = {
    /**
     * Report a determinate progress value.
     *
     * @param percent - Progress in percent; rounded to an integer and clamped to
     *   0..100 so that out-of-range or non-finite inputs (for example a 0/0
     *   ratio) never produce a malformed sequence. Non-finite values count as 0.
     */
    set(percent: number) {
      const rounded = Number.isFinite(percent) ? Math.round(percent) : 0;
      const clamped = Math.min(100, Math.max(0, rounded));
      process.stderr.write(`\x1b]9;4;1;${clamped}\x07`);
    },
    /** Remove the progress indicator (state 0). */
    clear() {
      process.stderr.write(`\x1b]9;4;0\x07`);
    },
    /** Show an indeterminate progress indicator (state 3). */
    indeterminate() {
      process.stderr.write(`\x1b]9;4;3\x07`);
    },
    /** Switch the indicator to the error state (state 2). */
    error() {
      process.stderr.write(`\x1b]9;4;2\x07`);
    },
  };
  /**
   * Format a duration in seconds as a short human-readable ETA.
   *
   * The value is rounded to whole seconds before a unit is chosen, so boundary
   * values never render as "60s" or "1m 60s". Negative durations are shown as
   * "0s"; non-finite durations (NaN, Infinity) are shown as "--".
   *
   * @param seconds - Remaining time in seconds.
   * @returns "42s", "3m 5s" or "2h 14m" style text.
   */
  function formatETA(seconds: number): string {
    if (!Number.isFinite(seconds)) return "--";

    const total = Math.max(0, Math.round(seconds));
    if (total < 60) return `${total}s`;
    if (total < 3600) return `${Math.floor(total / 60)}m ${total % 60}s`;
    return `${Math.floor(total / 3600)}h ${Math.floor((total % 3600) / 60)}m`;
  }
  /**
   * Inspect index health and print warnings/tips to stderr.
   *
   * - When documents still need embeddings: a warning if they make up at least
   *   10% of all documents, otherwise a dim tip.
   * - When the index was last updated 14 or more days ago: a tip to run
   *   'qmd update'.
   *
   * @param db - Database handle passed to getIndexHealth().
   */
  function checkIndexHealth(db: Database): void {
    const { needsEmbedding, totalDocs, daysStale } = getIndexHealth(db);

    // Warn if many docs need embedding
    if (needsEmbedding > 0) {
      // Guard the ratio: a zero document count would otherwise print "Infinity%".
      const pct = totalDocs > 0 ? Math.round((needsEmbedding / totalDocs) * 100) : 100;
      if (pct >= 10) {
        process.stderr.write(`${c.yellow}Warning: ${needsEmbedding} documents (${pct}%) need embeddings. Run 'qmd embed' for better results.${c.reset}\n`);
      } else {
        process.stderr.write(`${c.dim}Tip: ${needsEmbedding} documents need embeddings. Run 'qmd embed' to index them.${c.reset}\n`);
      }
    }

    // Check if the most recent document update is at least 2 weeks old
    if (daysStale !== null && daysStale >= 14) {
      process.stderr.write(`${c.dim}Tip: Index last updated ${daysStale} days ago. Run 'qmd update' to refresh.${c.reset}\n`);
    }
  }
  /**
   * Derive a short display path for a document that is not yet in `existingPaths`.
   *
   * Candidates are path suffixes, starting with "parent folder/filename" (or just
   * the filename when that is all there is) and growing by one parent directory at
   * a time. For files under the collection, the collection directory name is the
   * outermost segment that can be used.
   *
   * @param filepath - Path of the document.
   * @param collectionPath - Root directory of the collection (a single trailing slash is ignored).
   * @param existingPaths - Display paths already taken.
   * @returns The first unused candidate, or `filepath` itself when every candidate is taken.
   */
  function computeDisplayPath(
    filepath: string,
    collectionPath: string,
    existingPaths: Set<string>
  ): string {
    const root = collectionPath.replace(/\/$/, '');
    const rootName = root.split('/').pop() || '';

    // Inside the collection: "<collection dir name>/<path below it>"; otherwise the raw path.
    const scoped = filepath.startsWith(root + '/')
      ? rootName + filepath.slice(root.length)
      : filepath;

    const segments = scoped.split('/').filter(Boolean);

    // Start from the shortest allowed suffix (two segments when available) and widen.
    let start = segments.length - Math.min(2, segments.length);
    while (start >= 0) {
      const candidate = segments.slice(start).join('/');
      if (!existingPaths.has(candidate)) {
        return candidate;
      }
      start -= 1;
    }

    // Every suffix is taken: fall back to the full path.
    return filepath;
  }
  /**
   * Describe how long ago `date` was, relative to the current time, using the
   * largest fitting unit: seconds, minutes, hours or days ("5s ago", "3d ago").
   * Each unit is truncated, not rounded.
   */
  function formatTimeAgo(date: Date): string {
    // [suffix, how many of this unit make up one of the next larger unit]
    const steps: Array<[string, number]> = [
      ["s", 60],
      ["m", 60],
      ["h", 24],
    ];

    let amount = Math.floor((Date.now() - date.getTime()) / 1000);
    for (const [suffix, perNextUnit] of steps) {
      if (amount < perNextUnit) {
        return `${amount}${suffix} ago`;
      }
      amount = Math.floor(amount / perNextUnit);
    }
    return `${amount}d ago`;
  }
  /**
   * Format a byte count using binary multiples (1 KB = 1024 B).
   * Values below 1024 are printed as-is in B; larger ones with one decimal in KB, MB or GB.
   */
  function formatBytes(bytes: number): string {
    const KB = 1024;
    const MB = 1024 * 1024;
    const GB = 1024 * 1024 * 1024;

    if (bytes < KB) {
      return `${bytes} B`;
    }

    const scale: [number, string] =
      bytes < MB ? [KB, "KB"] : bytes < GB ? [MB, "MB"] : [GB, "GB"];
    return `${(bytes / scale[0]).toFixed(1)} ${scale[1]}`;
  }
  /**
   * Print the status report to stdout and close the database afterwards.
   *
   * Sections, in order: index path and on-disk size, MCP daemon liveness (from
   * its PID file), document and vector counts, collections with their contexts,
   * example commands, default model links, and device (GPU/CPU) information.
   *
   * Failures while stat'ing the index file, removing a stale PID file, or
   * initialising the LLM backend are tolerated so the rest of the report is
   * still shown. The database is closed even when the report itself throws.
   */
  async function showStatus(): Promise<void> {
    const dbPath = getDbPath();
    const db = getDb();

    try {
      // Collections are defined in YAML; no duplicate cleanup needed.

      // Index size (stays 0 when the file cannot be stat'ed)
      let indexSize = 0;
      try {
        indexSize = statSync(dbPath).size;
      } catch {
        // Best effort: report a size of 0 rather than failing the whole command.
      }

      // Collections info (from YAML + database stats)
      const collections = listCollections(db);

      // Overall stats
      const totalDocs = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number };
      const vectorCount = db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get() as { count: number };
      const needsEmbedding = getHashesNeedingEmbedding(db);

      // Most recent update across all collections
      const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };

      console.log(`${c.bold}QMD Status${c.reset}\n`);
      console.log(`Index: ${dbPath}`);
      console.log(`Size: ${formatBytes(indexSize)}`);

      // MCP daemon status (check PID file liveness)
      const mcpCacheDir = process.env.XDG_CACHE_HOME
        ? resolve(process.env.XDG_CACHE_HOME, "qmd")
        : resolve(homedir(), ".cache", "qmd");
      const mcpPidPath = resolve(mcpCacheDir, "mcp.pid");
      if (existsSync(mcpPidPath)) {
        const mcpPid = parseInt(readFileSync(mcpPidPath, "utf-8").trim(), 10);

        // Only a positive integer names a single process; 0 and negative values
        // address process groups and must not be probed.
        let running = false;
        if (Number.isInteger(mcpPid) && mcpPid > 0) {
          try {
            // Signal 0 delivers nothing; it only checks that the process exists.
            process.kill(mcpPid, 0);
            running = true;
          } catch (err) {
            // EPERM: the process exists but belongs to another user — still running.
            running = (err as { code?: unknown } | null)?.code === "EPERM";
          }
        }

        if (running) {
          console.log(`MCP: ${c.green}running${c.reset} (PID ${mcpPid})`);
        } else {
          try {
            // Stale PID file cleaned up silently
            unlinkSync(mcpPidPath);
          } catch {
            // Could not remove it (e.g. read-only cache dir); not worth failing status.
          }
        }
      }
      console.log("");

      console.log(`${c.bold}Documents${c.reset}`);
      console.log(` Total: ${totalDocs.count} files indexed`);
      console.log(` Vectors: ${vectorCount.count} embedded`);
      if (needsEmbedding > 0) {
        console.log(` ${c.yellow}Pending: ${needsEmbedding} need embedding${c.reset} (run 'qmd embed')`);
      }
      if (mostRecent.latest) {
        const lastUpdate = new Date(mostRecent.latest);
        console.log(` Updated: ${formatTimeAgo(lastUpdate)}`);
      }

      // Get all contexts grouped by collection name (from YAML)
      const contextsByCollection = new Map<string, { path_prefix: string; context: string }[]>();
      for (const ctx of listAllContexts()) {
        const entry = { path_prefix: ctx.path, context: ctx.context };
        const bucket = contextsByCollection.get(ctx.collection);
        if (bucket) {
          bucket.push(entry);
        } else {
          contextsByCollection.set(ctx.collection, [entry]);
        }
      }

      if (collections.length > 0) {
        console.log(`\n${c.bold}Collections${c.reset}`);
        for (const col of collections) {
          const lastMod = col.last_modified ? formatTimeAgo(new Date(col.last_modified)) : "never";
          const contexts = contextsByCollection.get(col.name) ?? [];
          console.log(` ${c.cyan}${col.name}${c.reset} ${c.dim}(qmd://${col.name}/)${c.reset}`);
          console.log(` ${c.dim}Pattern:${c.reset} ${col.glob_pattern}`);
          console.log(` ${c.dim}Files:${c.reset} ${col.active_count} (updated ${lastMod})`);
          if (contexts.length > 0) {
            console.log(` ${c.dim}Contexts:${c.reset} ${contexts.length}`);
            for (const ctx of contexts) {
              // Handle both empty string and '/' as root context
              const pathDisplay = (ctx.path_prefix === '' || ctx.path_prefix === '/') ? '/' : `/${ctx.path_prefix}`;
              // Keep long context texts to 60 characters including the ellipsis
              const contextPreview = ctx.context.length > 60
                ? ctx.context.substring(0, 57) + '...'
                : ctx.context;
              console.log(` ${c.dim}${pathDisplay}:${c.reset} ${contextPreview}`);
            }
          }
        }

        // Show examples of virtual paths, using the first collection's name
        const first = collections[0];
        console.log(`\n${c.bold}Examples${c.reset}`);
        console.log(` ${c.dim}# List files in a collection${c.reset}`);
        if (first) {
          console.log(` qmd ls ${first.name}`);
        }
        console.log(` ${c.dim}# Get a document${c.reset}`);
        if (first) {
          console.log(` qmd get qmd://${first.name}/path/to/file.md`);
        }
        console.log(` ${c.dim}# Search within a collection${c.reset}`);
        if (first) {
          console.log(` qmd search "query" -c ${first.name}`);
        }
      } else {
        console.log(`\n${c.dim}No collections. Run 'qmd collection add .' to index markdown files.${c.reset}`);
      }

      // Models
      {
        // hf:org/repo/file.gguf → https://huggingface.co/org/repo
        const hfLink = (uri: string) => {
          const match = uri.match(/^hf:([^/]+\/[^/]+)\//);
          return match ? `https://huggingface.co/${match[1]}` : uri;
        };
        console.log(`\n${c.bold}Models${c.reset}`);
        console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`);
        console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)}`);
        console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`);
      }

      // Device / GPU info
      try {
        const llm = getDefaultLlamaCpp();
        const device = await llm.getDeviceInfo();
        console.log(`\n${c.bold}Device${c.reset}`);
        if (device.gpu) {
          console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
          if (device.gpuDevices.length > 0) {
            // Deduplicate and count GPUs
            const counts = new Map<string, number>();
            for (const name of device.gpuDevices) {
              counts.set(name, (counts.get(name) || 0) + 1);
            }
            const deviceStr = Array.from(counts.entries())
              .map(([name, count]) => count > 1 ? `${count}× ${name}` : name)
              .join(', ');
            console.log(` Devices: ${deviceStr}`);
          }
          if (device.vram) {
            console.log(` VRAM: ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`);
          }
        } else {
          console.log(` GPU: ${c.yellow}none${c.reset} (running on CPU — models will be slow)`);
          console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
        }
        console.log(` CPU: ${device.cpuCores} math cores`);
      } catch {
        // Don't fail status if LLM init fails
      }
    } finally {
      // Release the database handle on every exit path.
      closeDb();
    }
  }
  /**
   * Re-index every configured collection.
   *
   * For each collection, an optional `update` command from the YAML config is run
   * first through `bash -c` in the collection's directory; its stdout and stderr
   * are echoed, and a non-zero exit code or spawn failure terminates the process
   * with that code (or 1). The collection is then re-indexed with indexFiles().
   * Finally a hint is printed when hashes still need embeddings.
   */
  async function updateCollections(): Promise<void> {
    const db = getDb();
    // Collections are defined in YAML; no duplicate cleanup needed.

    // Clear the cache on every update
    clearCache(db);

    const collections = listCollections(db);
    const total = collections.length;
    if (total === 0) {
      console.log(`${c.dim}No collections found. Run 'qmd collection add .' to index markdown files.${c.reset}`);
      closeDb();
      return;
    }

    // Prefix each line of captured command output before echoing it
    const indentLines = (text: string): string =>
      text.trim().split('\n').map(line => ` ${line}`).join('\n');

    // The db stays open here - indexFiles reuses it and closes it when done
    console.log(`${c.bold}Updating ${total} collection(s)...${c.reset}\n`);

    for (const [index, col] of collections.entries()) {
      if (!col) continue;
      console.log(`${c.cyan}[${index + 1}/${total}]${c.reset} ${c.bold}${col.name}${c.reset} ${c.dim}(${col.glob_pattern})${c.reset}`);

      // Execute the custom update command if one is specified in YAML
      const updateCommand = getCollectionFromYaml(col.name)?.update;
      if (updateCommand) {
        console.log(`${c.dim} Running update command: ${updateCommand}${c.reset}`);
        try {
          const child = nodeSpawn("bash", ["-c", updateCommand], {
            cwd: col.pwd,
            stdio: ["ignore", "pipe", "pipe"],
          });

          // Collect both streams and settle once the child has closed
          const [output, errorOutput, exitCode] = await new Promise<[string, string, number]>((settle, fail) => {
            const stdoutChunks: string[] = [];
            const stderrChunks: string[] = [];
            child.stdout?.on("data", (chunk: Buffer) => { stdoutChunks.push(chunk.toString()); });
            child.stderr?.on("data", (chunk: Buffer) => { stderrChunks.push(chunk.toString()); });
            child.on("error", fail);
            // A null exit code is reported as 1
            child.on("close", (code) => settle([stdoutChunks.join(""), stderrChunks.join(""), code ?? 1]));
          });

          if (output.trim()) {
            console.log(indentLines(output));
          }
          if (errorOutput.trim()) {
            console.log(indentLines(errorOutput));
          }
          if (exitCode !== 0) {
            console.log(`${c.yellow}✗ Update command failed with exit code ${exitCode}${c.reset}`);
            process.exit(exitCode);
          }
        } catch (failure) {
          console.log(`${c.yellow}✗ Update command failed: ${failure}${c.reset}`);
          process.exit(1);
        }
      }

      await indexFiles(col.pwd, col.glob_pattern, col.name, true);
      console.log("");
    }

    // Check once, at the end, whether any documents still need embedding
    const pendingEmbeddings = getHashesNeedingEmbedding(getDb());
    closeDb();

    console.log(`${c.green}✓ All collections updated.${c.reset}`);
    if (pendingEmbeddings > 0) {
      console.log(`\nRun 'qmd embed' to update embeddings (${pendingEmbeddings} unique hashes need vectors)`);
    }
  }
  /**
   * Detect which collection (if any) contains the given filesystem path.
   *
   * The path is passed through getRealPath() and compared against the roots of
   * the collections listed in the YAML config. When several roots contain the
   * path, the longest root wins; among equally long roots the first listed is kept.
   *
   * @param db - Not used by this function.
   * @param fsPath - Filesystem path to look up.
   * @returns `{ collectionName, relativePath }`, where `relativePath` is the path
   *   below the collection root ('' for the root itself), or null when no
   *   collection contains the path.
   */
  function detectCollectionFromPath(db: Database, fsPath: string): { collectionName: string; relativePath: string } | null {
    const realPath = getRealPath(fsPath);

    // Pick the most specific (longest) collection root containing the path
    let best: { name: string; path: string } | null = null;
    for (const { name, path } of yamlListCollections()) {
      const containsPath = realPath === path || realPath.startsWith(path + '/');
      if (!containsPath) continue;
      if (best !== null && path.length <= best.path.length) continue;
      best = { name, path };
    }

    if (best === null) {
      return null;
    }

    // Strip "<root>/" to obtain the path relative to the collection
    const rootPrefix = best.path + '/';
    let relativePath = realPath;
    if (realPath.startsWith(rootPrefix)) {
      relativePath = realPath.slice(rootPrefix.length);
    } else if (realPath === best.path) {
      relativePath = '';
    }

    return { collectionName: best.name, relativePath };
  }
  /**
   * Attach a context description to a path and print a confirmation.
   *
   * Accepted forms of `pathArg`:
   * - "/" sets the global context (applies to all collections);
   * - a virtual path (qmd://collection/path) targets that collection directly;
   * - anything else is treated as a filesystem path ("." / "./" / undefined mean
   *   the current directory, "~/" is expanded, relative paths are resolved against
   *   the current directory) and mapped to the collection that contains it.
   *
   * Exits the process with code 1 when the virtual path is invalid, the collection
   * is unknown, or the filesystem path is not inside any collection.
   *
   * @param pathArg - Target path, or undefined for the current directory.
   * @param contextText - Context description to store.
   */
  async function contextAdd(pathArg: string | undefined, contextText: string): Promise<void> {
    const db = getDb();

    // Shared tail of every successful branch: headline, context echo, close the db
    const finish = (headline: string): void => {
      console.log(headline);
      console.log(`${c.dim}Context: ${contextText}${c.reset}`);
      closeDb();
    };

    // "/" means the global context (applies to all collections)
    if (pathArg === '/') {
      setGlobalContext(contextText);
      finish(`${c.green}✓${c.reset} Set global context`);
      return;
    }

    // Resolve the path - defaults to the current directory if not provided
    const requested = pathArg || '.';
    const fsPath = (() => {
      if (requested === '.' || requested === './') return getPwd();
      if (requested.startsWith('~/')) return homedir() + requested.slice(1);
      const needsNoResolution = requested.startsWith('/') || requested.startsWith('qmd://');
      return needsNoResolution ? requested : resolve(getPwd(), requested);
    })();

    // Virtual paths (qmd://collection/path) name their collection explicitly
    if (isVirtualPath(fsPath)) {
      const parsed = parseVirtualPath(fsPath);
      if (!parsed) {
        console.error(`${c.yellow}Invalid virtual path: ${fsPath}${c.reset}`);
        process.exit(1);
      }
      const coll = getCollectionFromYaml(parsed.collectionName);
      if (!coll) {
        console.error(`${c.yellow}Collection not found: ${parsed.collectionName}${c.reset}`);
        process.exit(1);
      }
      yamlAddContext(parsed.collectionName, parsed.path, contextText);
      const virtualDisplay = parsed.path
        ? `qmd://${parsed.collectionName}/${parsed.path}`
        : `qmd://${parsed.collectionName}/ (collection root)`;
      finish(`${c.green}✓${c.reset} Added context for: ${virtualDisplay}`);
      return;
    }

    // Otherwise find the collection that contains the filesystem path
    const detected = detectCollectionFromPath(db, fsPath);
    if (!detected) {
      console.error(`${c.yellow}Path is not in any indexed collection: ${fsPath}${c.reset}`);
      console.error(`${c.dim}Run 'qmd status' to see indexed collections${c.reset}`);
      process.exit(1);
    }
    yamlAddContext(detected.collectionName, detected.relativePath, contextText);
    const detectedDisplay = detected.relativePath
      ? `qmd://${detected.collectionName}/${detected.relativePath}`
      : `qmd://${detected.collectionName}/`;
    finish(`${c.green}✓${c.reset} Added context for: ${detectedDisplay}`);
  }
  // Print every configured context to stdout, with a heading per collection.
  // When nothing is configured a hint about 'qmd context add' is printed instead.
  function contextList(): void {
    // The handle itself is not used here; the call is kept so that it pairs
    // with the closeDb() at the end of the function.
    getDb();
    const contexts = listAllContexts();

    if (contexts.length > 0) {
      console.log(`\n${c.bold}Configured Contexts${c.reset}\n`);

      // A collection heading is emitted each time the collection differs from
      // the previous entry's (presumably entries arrive grouped by collection).
      let currentCollection = '';
      contexts.forEach((entry) => {
        if (entry.collection !== currentCollection) {
          console.log(`${c.cyan}${entry.collection}${c.reset}`);
          currentCollection = entry.collection;
        }
        // An empty path denotes the context attached to the collection root.
        const label = entry.path ? ` ${entry.path}` : ' / (root)';
        console.log(`${label}`);
        console.log(` ${c.dim}${entry.context}${c.reset}`);
      });
    } else {
      console.log(`${c.dim}No contexts configured. Use 'qmd context add' to add one.${c.reset}`);
    }

    closeDb();
  }
  // Remove a previously configured context.
  //
  // pathArg - '/' removes the global context; a qmd:// virtual path addresses a
  //           collection path directly; anything else is treated as a
  //           filesystem path ('.', './', '~/...' and relative paths are
  //           expanded) and mapped onto the collection that contains it.
  //
  // Exits the process with status 1 when the path cannot be parsed, belongs to
  // no collection, or has no context attached.
  function contextRemove(pathArg: string): void {
    // Print a warning-coloured message on stderr and terminate with a failure code.
    const abort = (message: string): never => {
      console.error(`${c.yellow}${message}${c.reset}`);
      process.exit(1);
    };
    const confirmRemoval = (target: string): void => {
      console.log(`${c.green}✓${c.reset} Removed context for: ${target}`);
    };
    // Turn the user-supplied filesystem path into an absolute one.
    const toAbsolute = (raw: string): string => {
      if (raw === '.' || raw === './') return getPwd();
      if (raw.startsWith('~/')) return homedir() + raw.slice(1);
      return raw.startsWith('/') ? raw : resolve(getPwd(), raw);
    };

    // Global context
    if (pathArg === '/') {
      setGlobalContext(undefined);
      console.log(`${c.green}✓${c.reset} Removed global context`);
      return;
    }

    // Virtual path: qmd://collection/path
    if (isVirtualPath(pathArg)) {
      const virtual = parseVirtualPath(pathArg);
      if (!virtual) {
        return abort(`Invalid virtual path: ${pathArg}`);
      }
      const collection = getCollectionFromYaml(virtual.collectionName);
      if (!collection) {
        return abort(`Collection not found: ${virtual.collectionName}`);
      }
      if (!yamlRemoveContext(collection.name, virtual.path)) {
        return abort(`No context found for: ${pathArg}`);
      }
      confirmRemoval(pathArg);
      return;
    }

    // Filesystem path: find the owning collection first. The database is only
    // needed for that lookup, so it is closed again before anything is reported.
    const fsPath = toAbsolute(pathArg);
    const owner = detectCollectionFromPath(getDb(), fsPath);
    closeDb();
    if (!owner) {
      return abort(`Path is not in any indexed collection: ${fsPath}`);
    }

    const target = `qmd://${owner.collectionName}/${owner.relativePath}`;
    if (!yamlRemoveContext(owner.collectionName, owner.relativePath)) {
      return abort(`No context found for: ${target}`);
    }
    confirmRemoval(target);
  }
  // Report where context is missing: first collections that have no context at
  // all, then - for the remaining collections - the paths returned by
  // getTopLevelPathsWithoutContext(). Each finding comes with a ready-to-run
  // 'qmd context add' suggestion.
  function contextCheck(): void {
    const db = getDb();

    const uncovered = getCollectionsWithoutContext(db);
    const everyCollection = listCollections(db);
    const nothingUncovered = uncovered.length === 0;

    if (nothingUncovered) {
      // Only claim success when there is at least one collection to speak of.
      if (everyCollection.length > 0) {
        console.log(`\n${c.green}✓${c.reset} ${c.bold}All collections have context configured${c.reset}\n`);
      }
    } else {
      console.log(`\n${c.yellow}Collections without any context:${c.reset}\n`);
      uncovered.forEach((entry) => {
        console.log(`${c.cyan}${entry.name}${c.reset} ${c.dim}(${entry.doc_count} documents)${c.reset}`);
        console.log(` ${c.dim}Suggestion: qmd context add qmd://${entry.name}/ "Description of ${entry.name}"${c.reset}\n`);
      });
    }

    // Second pass: collections that DO have some context may still have
    // top-level paths without one.
    const uncoveredNames = new Set(uncovered.map((entry) => entry.name));
    let headingPrinted = false;
    for (const coll of everyCollection) {
      if (!coll || uncoveredNames.has(coll.name)) continue;

      const missing = getTopLevelPathsWithoutContext(db, coll.name);
      if (missing.length === 0) continue;

      // The section heading is printed once, before the first finding.
      if (!headingPrinted) {
        console.log(`${c.yellow}Top-level directories without context:${c.reset}\n`);
        headingPrinted = true;
      }
      console.log(`${c.cyan}${coll.name}${c.reset}`);
      missing.forEach((dir) => {
        console.log(` ${dir}`);
        console.log(` ${c.dim}Suggestion: qmd context add qmd://${coll.name}/${dir} "Description of ${dir}"${c.reset}`);
      });
      console.log('');
    }

    if (nothingUncovered && !headingPrinted) {
      console.log(`${c.dim}All collections and major paths have context configured.${c.reset}`);
      console.log(`${c.dim}Use 'qmd context list' to see all configured contexts.${c.reset}\n`);
    }

    closeDb();
  }
  // Print one indexed document to stdout, optionally restricted to a line range.
  //
  // `filename` is resolved in this order:
  //   1. a docid (as recognised by isDocid),
  //   2. a virtual path (qmd://collection/path),
  //   3. a relative "collection/path" pair whose first component is a known collection,
  //   4. a filesystem path located inside an indexed collection,
  //   5. a suffix match of the last path component against any stored path.
  // A trailing ":<number>" on `filename` is shorthand for `fromLine`; it is always
  // removed from the path, and an explicitly passed `fromLine` takes precedence.
  //
  // filename    - document reference, see above
  // fromLine    - 1-based first line to print (values below 1 are treated as 1)
  // maxLines    - maximum number of lines to print
  // lineNumbers - when true the output is passed through addLineNumbers()
  //
  // Exits the process with status 1 when the reference cannot be resolved.
  function getDocument(filename: string, fromLine?: number, maxLines?: number, lineNumbers?: boolean): void {
    type DocRow = { collectionName: string; path: string; body: string };

    const db = getDb();

    // Report a failed lookup, release the database and terminate.
    const fail = (message: string): never => {
      console.error(message);
      closeDb();
      process.exit(1);
    };

    // Shared SELECT/JOIN prefix of every document lookup below.
    const selectDoc = `
      SELECT d.collection as collectionName, d.path, content.doc as body
      FROM documents d
      JOIN content ON content.hash = d.hash`;

    // Exact lookup of an active document by collection and stored path.
    const findExact = (collection: string, path: string): DocRow | null =>
      (db.prepare(`${selectDoc}
      WHERE d.collection = ? AND d.path = ? AND d.active = 1
      `).get(collection, path) as DocRow | null | undefined) ?? null;

    // Exact lookup first, then the first active document of the collection
    // whose stored path ends with `path`.
    const findInCollection = (collection: string, path: string): DocRow | null =>
      findExact(collection, path) ??
      (db.prepare(`${selectDoc}
      WHERE d.collection = ? AND d.path LIKE ? AND d.active = 1
      LIMIT 1
      `).get(collection, `%${path}`) as DocRow | null | undefined) ??
      null;

    // First active document in any collection whose stored path ends with `suffix`.
    const findAnywhereBySuffix = (suffix: string): DocRow | null =>
      (db.prepare(`${selectDoc}
      WHERE d.path LIKE ? AND d.active = 1
      LIMIT 1
      `).get(`%${suffix}`) as DocRow | null | undefined) ?? null;

    // Parse the ":linenum" suffix (e.g. "file.md:100"). The suffix never belongs
    // to the stored path, so it is stripped even when fromLine was given explicitly.
    let inputPath = filename;
    let startFrom = fromLine;
    const lineSuffix = /:(\d+)$/.exec(inputPath);
    const suffixDigits = lineSuffix?.[1];
    if (lineSuffix && suffixDigits) {
      inputPath = inputPath.slice(0, -lineSuffix[0].length);
      if (!startFrom) {
        startFrom = parseInt(suffixDigits, 10);
      }
    }

    // Docid lookup (#abc123, abc123, ...): swap the docid for the document's path.
    if (isDocid(inputPath)) {
      const docidMatch = findDocumentByDocid(db, inputPath);
      if (!docidMatch) {
        return fail(`Document not found: ${filename}`);
      }
      inputPath = docidMatch.filepath;
    }

    let doc: DocRow | null = null;

    if (isVirtualPath(inputPath)) {
      // Virtual path (qmd://collection/path)
      const parsed = parseVirtualPath(inputPath);
      if (!parsed) {
        return fail(`Invalid virtual path: ${inputPath}`);
      }
      doc = findInCollection(parsed.collectionName, parsed.path);
    } else {
      // Relative input: try "collection/path" before treating it as a filesystem path.
      if (!inputPath.startsWith('/') && !inputPath.startsWith('~')) {
        const [possibleCollection, ...rest] = inputPath.split('/');
        if (possibleCollection && rest.length > 0) {
          const collExists = db.prepare(`
            SELECT 1 FROM documents WHERE collection = ? AND active = 1 LIMIT 1
          `).get(possibleCollection);
          if (collExists) {
            doc = findInCollection(possibleCollection, rest.join('/'));
          }
        }
      }

      // Not found as collection/path: handle as a filesystem path.
      if (!doc) {
        let fsPath = inputPath;
        if (fsPath.startsWith('~/')) {
          fsPath = homedir() + fsPath.slice(1);
        } else if (!fsPath.startsWith('/')) {
          fsPath = resolve(getPwd(), fsPath);
        }
        fsPath = getRealPath(fsPath);

        const detected = detectCollectionFromPath(db, fsPath);
        if (detected) {
          doc = findExact(detected.collectionName, detected.relativePath);
        }

        // Last resort: match on the final path component only.
        if (!doc) {
          const baseName = inputPath.split('/').pop() || inputPath;
          doc = findAnywhereBySuffix(baseName);
        }
      }
    }

    if (!doc) {
      return fail(`Document not found: ${filename}`);
    }

    // Folder context attached to this document's location, if any.
    const context = getContextForPath(db, doc.collectionName, doc.path);

    let output = doc.body;
    // Clamp to 1 so that a zero or negative start never slices from the end.
    const startLine = Math.max(1, startFrom || 1);

    if (startFrom !== undefined || maxLines !== undefined) {
      const lines = output.split('\n');
      const start = startLine - 1; // 0-indexed
      const end = maxLines !== undefined ? start + maxLines : lines.length;
      output = lines.slice(start, end).join('\n');
    }

    if (lineNumbers) {
      output = addLineNumbers(output, startLine);
    }

    if (context) {
      console.log(`Folder Context: ${context}\n---\n`);
    }
    console.log(output);
    closeDb();
  }
  // Multi-get: fetch multiple documents by glob pattern or comma-separated list
  // and print them in the requested output format.
  //
  // pattern  - a comma-separated list of virtual paths / stored paths / path
  //            suffixes, or (when it contains no comma, or contains '*' or '?')
  //            a glob handed to matchFilesByGlob()
  // maxLines - when set, each body is cut to this many lines and a truncation
  //            note is appended
  // maxBytes - documents whose stored length exceeds this are listed as skipped
  //            instead of being printed
  // format   - "json", "csv", "files", "md", "xml"; anything else prints the CLI layout
  //
  // Exits the process with status 1 when a glob pattern matches nothing. Names
  // from a comma-separated list that cannot be found are reported on stderr and
  // skipped.
  function multiGet(pattern: string, maxLines?: number, maxBytes: number = DEFAULT_MULTI_GET_MAX_BYTES, format: OutputFormat = "cli"): void {
    type LocatedDoc = { virtual_path: string; body_length: number; collection: string; path: string };

    const db = getDb();

    // Locate one active document matching `where` and return its address and
    // length. All three lookup flavours share this query shape.
    const locate = (where: string, ...args: string[]): LocatedDoc | null =>
      (db.prepare(`
        SELECT
          'qmd://' || d.collection || '/' || d.path as virtual_path,
          LENGTH(content.doc) as body_length,
          d.collection,
          d.path
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE ${where} AND d.active = 1
        LIMIT 1
      `).get(...args) as LocatedDoc | null | undefined) ?? null;

    // Check if it's a comma-separated list or a glob pattern
    const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?');

    let files: { filepath: string; displayPath: string; bodyLength: number; collection?: string; path?: string }[];

    if (isCommaSeparated) {
      const names = pattern.split(',').map(s => s.trim()).filter(Boolean);
      files = [];
      for (const name of names) {
        let doc: LocatedDoc | null = null;
        if (isVirtualPath(name)) {
          // Virtual paths are matched exactly on collection + path.
          const parsed = parseVirtualPath(name);
          if (parsed) {
            doc = locate('d.collection = ? AND d.path = ?', parsed.collectionName, parsed.path);
          }
        } else {
          // Plain names: exact stored path first, then suffix match.
          doc = locate('d.path = ?', name) ?? locate('d.path LIKE ?', `%${name}`);
        }

        if (doc) {
          files.push({
            filepath: doc.virtual_path,
            displayPath: doc.virtual_path,
            bodyLength: doc.body_length,
            collection: doc.collection,
            path: doc.path
          });
        } else {
          console.error(`File not found: ${name}`);
        }
      }
    } else {
      // Glob pattern - collection/path are derived from the virtual path later.
      files = matchFilesByGlob(db, pattern).map(f => ({
        ...f,
        collection: undefined,
        path: undefined
      }));
      if (files.length === 0) {
        console.error(`No files matched pattern: ${pattern}`);
        closeDb();
        process.exit(1);
      }
    }

    // Collect results for structured output
    const results: { file: string; displayPath: string; title: string; body: string; context: string | null; skipped: boolean; skipReason?: string }[] = [];

    for (const file of files) {
      // Parse virtual path to get collection info if not already available
      let collection = file.collection;
      let path = file.path;
      if (!collection || !path) {
        const parsed = parseVirtualPath(file.filepath);
        if (parsed) {
          collection = parsed.collectionName;
          path = parsed.path;
        }
      }

      const context = collection && path ? getContextForPath(db, collection, path) : null;

      // Oversized documents are reported but their body is not loaded.
      if (file.bodyLength > maxBytes) {
        results.push({
          file: file.filepath,
          displayPath: file.displayPath,
          title: file.displayPath.split('/').pop() || file.displayPath,
          body: "",
          context,
          skipped: true,
          skipReason: `File too large (${Math.round(file.bodyLength / 1024)}KB > ${Math.round(maxBytes / 1024)}KB). Use 'qmd get ${file.displayPath}' to retrieve.`,
        });
        continue;
      }

      // Without a collection and path there is nothing to fetch.
      if (!collection || !path) continue;

      const doc = db.prepare(`
        SELECT content.doc as body, d.title
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE d.collection = ? AND d.path = ? AND d.active = 1
      `).get(collection, path) as { body: string; title: string } | null;
      if (!doc) continue;

      let body = doc.body;
      if (maxLines !== undefined) {
        const lines = body.split('\n');
        body = lines.slice(0, maxLines).join('\n');
        if (lines.length > maxLines) {
          body += `\n\n[... truncated ${lines.length - maxLines} more lines]`;
        }
      }

      results.push({
        file: file.filepath,
        displayPath: file.displayPath,
        title: doc.title || file.displayPath.split('/').pop() || file.displayPath,
        body,
        context,
        skipped: false,
      });
    }

    closeDb();

    // Output based on format
    if (format === "json") {
      const output = results.map(r => ({
        file: r.displayPath,
        title: r.title,
        ...(r.context && { context: r.context }),
        ...(r.skipped ? { skipped: true, reason: r.skipReason } : { body: r.body }),
      }));
      console.log(JSON.stringify(output, null, 2));
    } else if (format === "csv") {
      // Quote any field containing a separator, a quote or a line break. Carriage
      // returns count as line breaks too, otherwise CRLF bodies would split rows.
      const escapeField = (val: string | null | undefined): string => {
        if (val === null || val === undefined) return "";
        const str = String(val);
        if (/[",\r\n]/.test(str)) {
          return `"${str.replace(/"/g, '""')}"`;
        }
        return str;
      };
      console.log("file,title,context,skipped,body");
      for (const r of results) {
        console.log([r.displayPath, r.title, r.context, r.skipped ? "true" : "false", r.skipped ? r.skipReason : r.body].map(escapeField).join(","));
      }
    } else if (format === "files") {
      for (const r of results) {
        const ctx = r.context ? `,"${r.context.replace(/"/g, '""')}"` : "";
        const status = r.skipped ? "[SKIPPED]" : "";
        console.log(`${r.displayPath}${ctx}${status ? `,${status}` : ""}`);
      }
    } else if (format === "md") {
      for (const r of results) {
        console.log(`## ${r.displayPath}\n`);
        if (r.title && r.title !== r.displayPath) console.log(`**Title:** ${r.title}\n`);
        if (r.context) console.log(`**Context:** ${r.context}\n`);
        if (r.skipped) {
          console.log(`> ${r.skipReason}\n`);
        } else {
          console.log("```");
          console.log(r.body);
          console.log("```\n");
        }
      }
    } else if (format === "xml") {
      console.log('<?xml version="1.0" encoding="UTF-8"?>');
      console.log("<documents>");
      for (const r of results) {
        console.log(" <document>");
        console.log(` <file>${escapeXml(r.displayPath)}</file>`);
        console.log(` <title>${escapeXml(r.title)}</title>`);
        if (r.context) console.log(` <context>${escapeXml(r.context)}</context>`);
        if (r.skipped) {
          console.log(` <skipped>true</skipped>`);
          console.log(` <reason>${escapeXml(r.skipReason || "")}</reason>`);
        } else {
          console.log(` <body>${escapeXml(r.body)}</body>`);
        }
        console.log(" </document>");
      }
      console.log("</documents>");
    } else {
      // CLI format (default)
      for (const r of results) {
        console.log(`\n${'='.repeat(60)}`);
        console.log(`File: ${r.displayPath}`);
        console.log(`${'='.repeat(60)}\n`);
        if (r.skipped) {
          console.log(`[SKIPPED: ${r.skipReason}]`);
          continue;
        }
        if (r.context) {
          console.log(`Folder Context: ${r.context}\n---\n`);
        }
        console.log(r.body);
      }
    }
  }
  // List files in virtual file tree.
  //
  // Without an argument, prints every collection from the YAML configuration
  // together with its number of active documents. With an argument, prints the
  // active documents of one collection in an `ls -l`-like layout (size,
  // modification time, virtual path).
  //
  // pathArg - "qmd://collection[/prefix]", "collection" or "collection/prefix";
  //           a prefix restricts the listing to stored paths starting with it
  //
  // Exits the process with status 1 for an unparsable virtual path or an
  // unknown collection.
  function listFiles(pathArg?: string): void {
    const db = getDb();

    if (!pathArg) {
      // No argument - list all collections
      const yamlCollections = yamlListCollections();
      if (yamlCollections.length === 0) {
        console.log("No collections found. Run 'qmd collection add .' to index files.");
        closeDb();
        return;
      }

      // File counts come from the database, the collection list from YAML.
      const countActive = db.prepare(`
        SELECT COUNT(*) as file_count
        FROM documents d
        WHERE d.collection = ? AND d.active = 1
      `);
      const collections = yamlCollections.map(coll => {
        const stats = countActive.get(coll.name) as { file_count: number } | null;
        return {
          name: coll.name,
          file_count: stats?.file_count || 0
        };
      });

      console.log(`${c.bold}Collections:${c.reset}\n`);
      for (const coll of collections) {
        console.log(` ${c.dim}qmd://${c.reset}${c.cyan}${coll.name}/${c.reset} ${c.dim}(${coll.file_count} files)${c.reset}`);
      }
      closeDb();
      return;
    }

    // Parse the path argument
    let collectionName: string;
    let pathPrefix: string | null = null;
    if (pathArg.startsWith('qmd://')) {
      // Virtual path format: qmd://collection/path
      const parsed = parseVirtualPath(pathArg);
      if (!parsed) {
        console.error(`Invalid virtual path: ${pathArg}`);
        closeDb();
        process.exit(1);
      }
      collectionName = parsed.collectionName;
      pathPrefix = parsed.path;
    } else {
      // Just collection name or collection/path
      const parts = pathArg.split('/');
      collectionName = parts[0] || '';
      if (parts.length > 1) {
        pathPrefix = parts.slice(1).join('/');
      }
    }

    // Get the collection
    const coll = getCollectionFromYaml(collectionName);
    if (!coll) {
      console.error(`Collection not found: ${collectionName}`);
      console.error(`Run 'qmd ls' to see available collections.`);
      closeDb();
      process.exit(1);
    }

    // The prefix filter is the only difference between the two listings, so the
    // query is assembled once with an optional extra condition.
    const prefixCondition = pathPrefix ? 'AND d.path LIKE ?' : '';
    const params: string[] = pathPrefix ? [coll.name, `${pathPrefix}%`] : [coll.name];
    const query = `
      SELECT d.path, d.title, d.modified_at, LENGTH(ct.doc) as size
      FROM documents d
      JOIN content ct ON d.hash = ct.hash
      WHERE d.collection = ? ${prefixCondition} AND d.active = 1
      ORDER BY d.path
    `;
    const files = db.prepare(query).all(...params) as { path: string; title: string; modified_at: string; size: number }[];

    if (files.length === 0) {
      if (pathPrefix) {
        console.log(`No files found under qmd://${collectionName}/${pathPrefix}`);
      } else {
        console.log(`No files found in collection: ${collectionName}`);
      }
      closeDb();
      return;
    }

    // Format each size once, then derive the column width from the formatted
    // strings. A fold is used instead of Math.max(...array) because spreading a
    // very large listing into call arguments can exceed the engine's limit.
    const rows = files.map(file => ({ file, sizeLabel: formatBytes(file.size) }));
    const sizeWidth = rows.reduce((width, row) => Math.max(width, row.sizeLabel.length), 0);

    // Output in ls -l style
    for (const { file, sizeLabel } of rows) {
      const sizeStr = sizeLabel.padStart(sizeWidth);
      const timeStr = formatLsTime(new Date(file.modified_at));
      // Dim the qmd:// prefix, highlight the filename
      console.log(`${sizeStr} ${timeStr} ${c.dim}qmd://${collectionName}/${c.reset}${c.cyan}${file.path}${c.reset}`);
    }
    closeDb();
  }
  // Format a timestamp the way `ls -l` does: "Mon DD HH:MM" for recent dates and
  // "Mon DD YYYY" for dates older than roughly six months (6 x 30 days). All
  // components are taken in local time; the day is space-padded to two columns.
  function formatLsTime(date: Date): string {
    const MONTH_NAMES = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];
    const SIX_MONTHS_MS = 6 * 30 * 24 * 60 * 60 * 1000;
    const twoDigits = (value: number): string => String(value).padStart(2, '0');

    const cutoff = Date.now() - SIX_MONTHS_MS;
    const monthAndDay = `${MONTH_NAMES[date.getMonth()]} ${String(date.getDate()).padStart(2, ' ')}`;

    // Old entries show the year, recent ones the time of day.
    const tail = date.getTime() < cutoff
      ? String(date.getFullYear())
      : `${twoDigits(date.getHours())}:${twoDigits(date.getMinutes())}`;

    return `${monthAndDay} ${tail}`;
  }
  1129. // Collection management commands
  // Print a summary (name, virtual root, glob pattern, active file count and
  // time since last modification) for every collection returned by
  // listCollections(), or a hint when there is none.
  function collectionList(): void {
    const collections = listCollections(getDb());

    if (collections.length > 0) {
      console.log(`${c.bold}Collections (${collections.length}):${c.reset}\n`);

      collections.forEach((entry) => {
        // Collections without a recorded modification time are shown as updated "now".
        const lastChange = entry.last_modified ? new Date(entry.last_modified) : new Date();
        const age = formatTimeAgo(lastChange);

        console.log(`${c.cyan}${entry.name}${c.reset} ${c.dim}(qmd://${entry.name}/)${c.reset}`);
        console.log(` ${c.dim}Pattern:${c.reset} ${entry.glob_pattern}`);
        console.log(` ${c.dim}Files:${c.reset} ${entry.active_count}`);
        console.log(` ${c.dim}Updated:${c.reset} ${age}`);
        console.log();
      });
    } else {
      console.log("No collections found. Run 'qmd collection add .' to create one.");
    }

    closeDb();
  }
  // Register a new collection in the YAML configuration and index its files.
  //
  // pwd         - directory the collection is rooted at
  // globPattern - pattern selecting the files to index
  // name        - collection name; defaults to the last component of `pwd`
  //               ('root' when `pwd` has no components)
  //
  // Exits the process with status 1 when the name is already taken or when a
  // collection with the same path and pattern already exists.
  async function collectionAdd(pwd: string, globPattern: string, name?: string): Promise<void> {
    const collName = name || pwd.split('/').filter(Boolean).pop() || 'root';

    // The name must be unique within the YAML configuration.
    if (getCollectionFromYaml(collName)) {
      console.error(`${c.yellow}Collection '${collName}' already exists.${c.reset}`);
      console.error(`Use a different name with --name <name>`);
      process.exit(1);
    }

    // So must the combination of directory and pattern.
    const duplicate = yamlListCollections().find(
      (entry) => entry.path === pwd && entry.pattern === globPattern
    );
    if (duplicate) {
      console.error(`${c.yellow}A collection already exists for this path and pattern:${c.reset}`);
      console.error(` Name: ${duplicate.name} (qmd://${duplicate.name}/)`);
      console.error(` Pattern: ${globPattern}`);
      console.error(`\nUse 'qmd update' to re-index it, or remove it first with 'qmd collection remove ${duplicate.name}'`);
      process.exit(1);
    }

    // Persist the definition first, then build the index for it.
    const collectionsModule = await import("./collections.js");
    collectionsModule.addCollection(collName, pwd, globPattern);

    console.log(`Creating collection '${collName}'...`);
    await indexFiles(pwd, globPattern, collName);
    console.log(`${c.green}✓${c.reset} Collection '${collName}' created successfully`);
  }
  // Remove the collection called `name` and report how many documents (and
  // orphaned content hashes, if any) were deleted along with it.
  // Exits the process with status 1 when the collection is not in the YAML config.
  function collectionRemove(name: string): void {
    if (!getCollectionFromYaml(name)) {
      console.error(`${c.yellow}Collection not found: ${name}${c.reset}`);
      console.error(`Run 'qmd collection list' to see available collections.`);
      process.exit(1);
    }

    // The database is only needed for the removal itself.
    const outcome = removeCollection(getDb(), name);
    closeDb();

    const summary = [
      `${c.green}✓${c.reset} Removed collection '${name}'`,
      ` Deleted ${outcome.deletedDocs} documents`,
    ];
    if (outcome.cleanedHashes > 0) {
      summary.push(` Cleaned up ${outcome.cleanedHashes} orphaned content hashes`);
    }
    for (const line of summary) {
      console.log(line);
    }
  }
  // Rename collection `oldName` to `newName`.
  // Exits the process with status 1 when `oldName` does not exist in the YAML
  // configuration or `newName` is already in use.
  function collectionRename(oldName: string, newName: string): void {
    // Print a warning-coloured headline plus a plain hint, then terminate.
    const abort = (headline: string, hint: string): never => {
      console.error(`${c.yellow}${headline}${c.reset}`);
      console.error(hint);
      process.exit(1);
    };

    if (!getCollectionFromYaml(oldName)) {
      return abort(
        `Collection not found: ${oldName}`,
        `Run 'qmd collection list' to see available collections.`
      );
    }
    if (getCollectionFromYaml(newName)) {
      return abort(
        `Collection name already exists: ${newName}`,
        `Choose a different name or remove the existing collection first.`
      );
    }

    renameCollection(getDb(), oldName, newName);
    closeDb();

    console.log(`${c.green}✓${c.reset} Renamed collection '${oldName}' to '${newName}'`);
    console.log(` Virtual paths updated: ${c.cyan}qmd://${oldName}/${c.reset} → ${c.cyan}qmd://${newName}/${c.reset}`);
  }
  // Scan a directory and synchronise the matching files into a collection.
  //
  // New files are inserted, files whose content hash or extracted title changed
  // are updated, and documents of the collection that were not seen during the
  // scan are deactivated. Afterwards orphaned content is cleaned up and a
  // summary is printed.
  //
  // pwd                 - directory to scan; defaults to getPwd()
  // globPattern         - files to include, relative to `pwd`
  // collectionName      - target collection (required)
  // suppressEmbedNotice - when true, the "Run 'qmd embed'" hint is not printed
  //
  // Throws when `collectionName` is missing. The database handle opened here is
  // released on every exit path, including when a step throws.
  async function indexFiles(pwd?: string, globPattern: string = DEFAULT_GLOB, collectionName?: string, suppressEmbedNotice: boolean = false): Promise<void> {
    // Validate before touching the database so a bad call has no side effects.
    if (!collectionName) {
      throw new Error("Collection name is required. Collections must be defined in ~/.config/qmd/index.yml");
    }
    const collection: string = collectionName;

    const db = getDb();
    try {
      const resolvedPwd = pwd || getPwd();
      const now = new Date().toISOString();
      const excludeDirs = ["node_modules", ".git", ".cache", "vendor", "dist", "build"];

      // Clear Ollama cache on index
      clearCache(db);

      console.log(`Collection: ${resolvedPwd} (${globPattern})`);
      progress.indeterminate();

      const allFiles: string[] = await fastGlob(globPattern, {
        cwd: resolvedPwd,
        onlyFiles: true,
        followSymbolicLinks: false,
        dot: false,
        ignore: excludeDirs.map(d => `**/${d}/**`),
      });

      // Filter hidden files/folders (dot: false handles top-level but not nested)
      const files = allFiles.filter(file => !file.split("/").some(part => part.startsWith(".")));

      const total = files.length;
      if (total === 0) {
        progress.clear();
        console.log("No files found matching pattern.");
        return;
      }

      let indexed = 0, updated = 0, unchanged = 0, processed = 0;
      const seenPaths = new Set<string>();
      const startTime = Date.now();

      for (const relativeFile of files) {
        const filepath = getRealPath(resolve(resolvedPwd, relativeFile));
        const path = handelize(relativeFile); // Normalize path for token-friendliness
        // Recorded before the empty-file check, so an emptied file is not deactivated below.
        seenPaths.add(path);

        const content = readFileSync(filepath, "utf-8");

        // Skip empty files - nothing useful to index
        if (!content.trim()) {
          processed++;
          continue;
        }

        const hash = await hashContent(content);
        const title = extractTitle(content, relativeFile);

        // Check if document exists in this collection with this path
        const existing = findActiveDocument(db, collection, path);
        if (existing) {
          if (existing.hash === hash) {
            // Hash unchanged, but check if title needs updating
            if (existing.title !== title) {
              updateDocumentTitle(db, existing.id, title, now);
              updated++;
            } else {
              unchanged++;
            }
          } else {
            // Content changed - insert new content hash and update document
            insertContent(db, hash, content, now);
            const stat = statSync(filepath);
            updateDocument(db, existing.id, title, hash,
              stat ? new Date(stat.mtime).toISOString() : now);
            updated++;
          }
        } else {
          // New document - insert content and document
          indexed++;
          insertContent(db, hash, content, now);
          const stat = statSync(filepath);
          insertDocument(db, collection, path, title, hash,
            stat ? new Date(stat.birthtime).toISOString() : now,
            stat ? new Date(stat.mtime).toISOString() : now);
        }

        processed++;
        progress.set((processed / total) * 100);

        // The ETA is only shown after a few files, once the rate means something.
        const elapsed = (Date.now() - startTime) / 1000;
        const rate = processed / elapsed;
        const remaining = (total - processed) / rate;
        const eta = processed > 2 ? ` ETA: ${formatETA(remaining)}` : "";
        process.stderr.write(`\rIndexing: ${processed}/${total}${eta} `);
      }

      // Deactivate documents in this collection that no longer exist
      let removed = 0;
      for (const path of getActiveDocumentPaths(db, collection)) {
        if (!seenPaths.has(path)) {
          deactivateDocument(db, collection, path);
          removed++;
        }
      }

      // Clean up orphaned content hashes (content not referenced by any document)
      const orphanedContent = cleanupOrphanedContent(db);

      // Check if vector index needs updating
      const needsEmbedding = getHashesNeedingEmbedding(db);

      progress.clear();
      console.log(`\nIndexed: ${indexed} new, ${updated} updated, ${unchanged} unchanged, ${removed} removed`);
      if (orphanedContent > 0) {
        console.log(`Cleaned up ${orphanedContent} orphaned content hash(es)`);
      }
      if (needsEmbedding > 0 && !suppressEmbedNotice) {
        console.log(`\nRun 'qmd embed' to update embeddings (${needsEmbedding} unique hashes need vectors)`);
      }
    } finally {
      // Single release point: covers the normal path, the early return and any throw.
      closeDb();
    }
  }
  /**
   * Render a fixed-width textual progress bar.
   *
   * @param percent - Completion percentage. Values below 0 or above 100 are
   *   clamped and NaN is treated as 0, so bad input never throws.
   * @param width - Number of cells in the bar (default 30). Negative widths are treated as 0.
   * @returns For an integer width, exactly `width` characters: "█" for the
   *   completed share followed by "░" for the remainder.
   */
  function renderProgressBar(percent: number, width: number = 30): string {
    const cells = Math.max(0, width);
    // Clamp before computing counts: String.prototype.repeat throws a RangeError
    // for negative counts, which an out-of-range percentage would produce.
    const clamped = Number.isNaN(percent) ? 0 : Math.min(100, Math.max(0, percent));
    const filled = Math.round((clamped / 100) * cells);
    const empty = cells - filled;
    return "█".repeat(filled) + "░".repeat(empty);
  }
  /**
   * Create vector embeddings for content hashes that do not have them yet.
   *
   * Steps: optionally clear existing vectors (`force`), fetch the hashes that
   * need embedding, split each body into token-based chunks, then embed the
   * chunks in batches inside an LLM session. Each vector is inserted as soon as
   * it is available and progress is reported on stderr.
   *
   * The terminal state (hidden cursor, progress indicator) and the database
   * handle are released in a `finally`, so a failure inside the session does
   * not leave the cursor hidden or the DB open.
   *
   * @param model - Embedding model name; printed in the summary and passed to `insertEmbedding`.
   * @param force - When true, all existing embeddings are cleared before indexing.
   */
  async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = false): Promise<void> {
    const db = getDb();
    const now = new Date().toISOString();

    // If force, clear all vectors
    if (force) {
      console.log(`${c.yellow}Force re-indexing: clearing all vectors...${c.reset}`);
      clearAllEmbeddings(db);
    }

    // Find unique hashes that need embedding (from active documents)
    const hashesToEmbed = getHashesForEmbedding(db);
    if (hashesToEmbed.length === 0) {
      console.log(`${c.green}✓ All content hashes already have embeddings.${c.reset}`);
      closeDb();
      return;
    }

    // Prepare documents with chunks
    type ChunkItem = { hash: string; title: string; text: string; seq: number; pos: number; tokens: number; bytes: number; displayName: string };
    const allChunks: ChunkItem[] = [];
    let multiChunkDocs = 0;

    // One encoder is enough for every byte-length measurement below.
    const encoder = new TextEncoder();

    // Chunk all documents using actual token counts
    process.stderr.write(`Chunking ${hashesToEmbed.length} documents by token count...\n`);
    for (const item of hashesToEmbed) {
      const bodyBytes = encoder.encode(item.body).length;
      if (bodyBytes === 0) continue; // Skip empty

      const title = extractTitle(item.body, item.path);
      const displayName = item.path;
      const chunks = await chunkDocumentByTokens(item.body); // Uses actual tokenizer
      if (chunks.length > 1) multiChunkDocs++;

      for (let seq = 0; seq < chunks.length; seq++) {
        const piece = chunks[seq]!; // Guaranteed to exist by the loop bound
        allChunks.push({
          hash: item.hash,
          title,
          text: piece.text,
          seq,
          pos: piece.pos,
          tokens: piece.tokens,
          bytes: encoder.encode(piece.text).length,
          displayName,
        });
      }
    }

    if (allChunks.length === 0) {
      console.log(`${c.green}✓ No non-empty documents to embed.${c.reset}`);
      closeDb();
      return;
    }

    const totalBytes = allChunks.reduce((sum, chk) => sum + chk.bytes, 0);
    const totalChunks = allChunks.length;
    const totalDocs = hashesToEmbed.length;

    console.log(`${c.bold}Embedding ${totalDocs} documents${c.reset} ${c.dim}(${totalChunks} chunks, ${formatBytes(totalBytes)})${c.reset}`);
    if (multiChunkDocs > 0) {
      console.log(`${c.dim}${multiChunkDocs} documents split into multiple chunks${c.reset}`);
    }
    console.log(`${c.dim}Model: ${model}${c.reset}\n`);

    // Hide cursor during embedding
    cursor.hide();

    // Restores the terminal exactly once, whether embedding finishes or throws.
    let terminalRestored = false;
    const restoreTerminal = (): void => {
      if (terminalRestored) return;
      terminalRestored = true;
      progress.clear();
      cursor.show();
    };

    try {
      // Wrap all LLM embedding operations in a session for lifecycle management
      // Use 30 minute timeout for large collections
      await withLLMSession(async (session) => {
        // Get embedding dimensions from first chunk
        progress.indeterminate();
        const firstChunk = allChunks[0];
        if (!firstChunk) {
          throw new Error("No chunks available to embed");
        }
        const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title);
        const firstResult = await session.embed(firstText);
        if (!firstResult) {
          throw new Error("Failed to get embedding dimensions from first chunk");
        }
        ensureVecTable(db, firstResult.embedding.length);

        let chunksEmbedded = 0, errors = 0, bytesProcessed = 0;
        const startTime = Date.now();

        // Batch embedding for better throughput
        // Process in batches of 32 to balance memory usage and efficiency
        const BATCH_SIZE = 32;
        for (let batchStart = 0; batchStart < allChunks.length; batchStart += BATCH_SIZE) {
          const batchEnd = Math.min(batchStart + BATCH_SIZE, allChunks.length);
          const batch = allChunks.slice(batchStart, batchEnd);

          // Format texts for embedding
          const texts = batch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title));

          try {
            // Batch embed all texts at once
            const embeddings = await session.embedBatch(texts);

            // Insert each embedding
            for (let i = 0; i < batch.length; i++) {
              const chunk = batch[i]!;
              const embedding = embeddings[i];
              if (embedding) {
                insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now);
                chunksEmbedded++;
              } else {
                errors++;
                console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}${c.reset}`);
              }
              bytesProcessed += chunk.bytes;
            }
          } catch {
            // If batch fails, try individual embeddings as fallback
            for (const chunk of batch) {
              try {
                const text = formatDocForEmbedding(chunk.text, chunk.title);
                const result = await session.embed(text);
                if (result) {
                  insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
                  chunksEmbedded++;
                } else {
                  errors++;
                  // Report the failure like the batch path does instead of only counting it.
                  console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}${c.reset}`);
                }
              } catch (innerErr) {
                errors++;
                console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}: ${innerErr}${c.reset}`);
              }
              bytesProcessed += chunk.bytes;
            }
          }

          // Progress is measured in bytes, not chunks, since chunk sizes vary.
          const percent = (bytesProcessed / totalBytes) * 100;
          progress.set(percent);

          const elapsed = (Date.now() - startTime) / 1000;
          const bytesPerSec = bytesProcessed / elapsed;
          const remainingBytes = totalBytes - bytesProcessed;
          const etaSec = remainingBytes / bytesPerSec;

          const bar = renderProgressBar(percent);
          const percentStr = percent.toFixed(0).padStart(3);
          const throughput = `${formatBytes(bytesPerSec)}/s`;
          // Only show an ETA once more than two seconds have elapsed.
          const eta = elapsed > 2 ? formatETA(etaSec) : "...";
          const errStr = errors > 0 ? ` ${c.yellow}${errors} err${c.reset}` : "";

          process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}%${c.reset} ${c.dim}${chunksEmbedded}/${totalChunks}${c.reset}${errStr} ${c.dim}${throughput} ETA ${eta}${c.reset} `);
        }

        restoreTerminal();

        const totalTimeSec = (Date.now() - startTime) / 1000;
        const avgThroughput = formatBytes(totalBytes / totalTimeSec);

        console.log(`\r${c.green}${renderProgressBar(100)}${c.reset} ${c.bold}100%${c.reset} `);
        console.log(`\n${c.green}✓ Done!${c.reset} Embedded ${c.bold}${chunksEmbedded}${c.reset} chunks from ${c.bold}${totalDocs}${c.reset} documents in ${c.bold}${formatETA(totalTimeSec)}${c.reset} ${c.dim}(${avgThroughput}/s)${c.reset}`);
        if (errors > 0) {
          console.log(`${c.yellow}⚠ ${errors} chunks failed${c.reset}`);
        }
      }, { maxDuration: 30 * 60 * 1000, name: 'embed-command' });
    } finally {
      // Runs on success and on failure; restoreTerminal is a no-op if already done.
      restoreTerminal();
      closeDb();
    }
  }
  // Sanitize a term for FTS5: keep letters, combining marks, digits, underscores
  // and apostrophes (for contractions like "don't"); remove everything else.
  // Unicode property escapes are used instead of \w because \w is ASCII-only:
  // it would turn "café" into "caf" and erase non-Latin terms completely.
  function sanitizeFTS5Term(term: string): string {
    return term.replace(/[^\p{L}\p{M}\p{N}_']/gu, '');
  }

  // Build FTS5 query: phrase-aware with fallback to individual terms.
  // Returns "" when no usable term (2+ characters after sanitizing) remains.
  function buildFTS5Query(query: string): string {
    // Wrap text as an FTS5 string literal, doubling any embedded double quote.
    const quote = (text: string): string => `"${text.replace(/"/g, '""')}"`;

    // Sanitize the full query for phrase matching, using the same character
    // classes as sanitizeFTS5Term plus whitespace so word boundaries survive.
    const sanitizedQuery = query.replace(/[^\p{L}\p{M}\p{N}_\s']/gu, '').trim();

    const terms = query
      .split(/\s+/)
      .map(sanitizeFTS5Term)
      .filter(term => term.length >= 2); // Skip single chars and empty

    if (terms.length === 0) return "";
    if (terms.length === 1) return quote(terms[0]!);

    // Strategy: exact phrase OR proximity match OR individual terms
    // Exact phrase matches rank highest, then close proximity, then any term
    const phrase = quote(sanitizedQuery);
    const quotedTerms = terms.map(t => quote(t));

    // FTS5 NEAR syntax: NEAR(term1 term2, distance)
    const nearPhrase = `NEAR(${quotedTerms.join(' ')}, 10)`;
    const orTerms = quotedTerms.join(' OR ');

    // Exact phrase > proximity > any term
    return `(${phrase}) OR (${nearPhrase}) OR (${orTerms})`;
  }
  /**
   * Map a raw BM25 score onto the open interval (0, 1), where higher is better.
   *
   * Only the magnitude of the score is used (SQLite reports BM25 as negative
   * numbers, lower meaning a better match). The magnitude is passed through a
   * logistic curve centred on 5 with a scale of 3, which gives roughly 0.27
   * for a magnitude of 2, exactly 0.5 for 5, and roughly 0.97 for 15.
   */
  function normalizeBM25(score: number): number {
    const magnitude = Math.abs(score);
    const exponent = -(magnitude - 5) / 3;
    return 1 / (1 + Math.exp(exponent));
  }
  /** Options that control how search results are filtered and rendered. */
  type OutputOptions = {
    /** Output format used to render the results. */
    format: OutputFormat;
    /** Emit the whole body instead of an extracted snippet. */
    full: boolean;
    /** Maximum number of results to print. */
    limit: number;
    /** Results scoring below this value are dropped. */
    minScore: number;
    /** Set when all results were requested (larger fetch limits are used). */
    all?: boolean;
    /** Filter by collection name(s). */
    collection?: string | string[];
    /** Add line numbers to output. */
    lineNumbers?: boolean;
    /** Optional context for query expansion. */
    context?: string;
  };
  /**
   * Highlight query terms in text (terms shorter than 3 characters are skipped).
   * Returns the text unchanged when colour output is disabled or no term qualifies.
   *
   * All terms are matched in a single pass. Replacing term by term would
   * re-scan text that already contains highlight codes, so overlapping or
   * repeated terms (e.g. "test" and "testing") would nest highlights and a
   * term could even match inside an inserted escape sequence.
   *
   * @param text - Text to decorate.
   * @param query - Whitespace-separated search terms; matching is case-insensitive.
   */
  function highlightTerms(text: string, query: string): string {
    if (!useColor) return text;

    const terms = [...new Set(query.toLowerCase().split(/\s+/).filter(t => t.length >= 3))];
    if (terms.length === 0) return text;

    // Longest first, so the alternation prefers "testing" over its prefix "test".
    terms.sort((a, b) => b.length - a.length);

    const alternatives = terms.map(term => term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
    const regex = new RegExp(`(${alternatives.join('|')})`, 'gi');

    // A callback keeps the matched text literal (no "$" substitution patterns).
    return text.replace(regex, match => `${c.yellow}${c.bold}${match}${c.reset}`);
  }
  /**
   * Format a 0-1 score as a right-aligned whole percentage (e.g. " 50%").
   * With colour enabled the label is tinted: green from 0.7, yellow from 0.4,
   * dim below that.
   */
  function formatScore(score: number): string {
    const label = `${(score * 100).toFixed(0).padStart(3)}%`;
    if (!useColor) return label;

    const tint = score >= 0.7 ? c.green : score >= 0.4 ? c.yellow : c.dim;
    return `${tint}${label}${c.reset}`;
  }
  /**
   * Shorten a directory path for display by replacing the home directory with
   * "~" (used for context paths, not documents).
   *
   * The home directory is only substituted when it is a whole leading path
   * component: the path must equal it or continue with a path separator. A
   * plain prefix test would also rewrite sibling directories, e.g. with home
   * "/home/al" the path "/home/alice" would become "~ice".
   *
   * @param dirpath - Path to shorten.
   * @returns The path with the home directory replaced by "~", or `dirpath` unchanged.
   */
  function shortPath(dirpath: string): string {
    // Drop trailing separators so "/home/al/" and "/home/al" behave the same.
    const home = homedir().replace(/[\\/]+$/, "");
    if (home === "" || !dirpath.startsWith(home)) return dirpath;

    const rest = dirpath.slice(home.length);
    const atBoundary = rest === "" || rest.startsWith("/") || rest.startsWith("\\");
    return atBoundary ? "~" + rest : dirpath;
  }
  /**
   * Print search results in the format selected by `opts.format`.
   *
   * Results scoring below `opts.minScore` are dropped and at most `opts.limit`
   * are printed. When nothing remains, JSON output prints an empty array
   * ("[]") so the output stays parseable; every other format prints a short
   * message. Formats handled: "json", "files", "cli", "md", "xml"; any other
   * value is rendered as CSV.
   *
   * @param results - Candidate results; `chunkPos` is forwarded to `extractSnippet`.
   * @param query - Query text used for snippet extraction and highlighting.
   * @param opts - Output options (format, limit, minScore, full, lineNumbers).
   */
  function outputResults(results: { file: string; displayPath: string; title: string; body: string; score: number; context?: string | null; chunkPos?: number; hash?: string; docid?: string }[], query: string, opts: OutputOptions): void {
    const filtered = results.filter(r => r.score >= opts.minScore).slice(0, opts.limit);

    if (filtered.length === 0) {
      if (opts.format === "json") {
        // Keep machine-readable output valid JSON, matching the callers' own empty-result handling.
        console.log("[]");
      } else {
        console.log("No results found above minimum score threshold.");
      }
      return;
    }

    // Helper to create qmd:// URI from displayPath
    const toQmdPath = (displayPath: string) => `qmd://${displayPath}`;

    if (opts.format === "json") {
      // JSON output for LLM consumption
      const output = filtered.map(row => {
        // Prefer the explicit docid; otherwise use the first 6 characters of the hash.
        const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);
        let body = opts.full ? row.body : undefined;
        let snippet = !opts.full ? extractSnippet(row.body, query, 300, row.chunkPos).snippet : undefined;
        if (opts.lineNumbers) {
          if (body) body = addLineNumbers(body);
          if (snippet) snippet = addLineNumbers(snippet);
        }
        // Optional fields are only emitted when they have a value.
        return {
          ...(docid && { docid: `#${docid}` }),
          score: Math.round(row.score * 100) / 100,
          file: toQmdPath(row.displayPath),
          title: row.title,
          ...(row.context && { context: row.context }),
          ...(body && { body }),
          ...(snippet && { snippet }),
        };
      });
      console.log(JSON.stringify(output, null, 2));
    } else if (opts.format === "files") {
      // Simple docid,score,filepath,context output
      for (const row of filtered) {
        const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : "");
        const ctx = row.context ? `,"${row.context.replace(/"/g, '""')}"` : "";
        console.log(`#${docid},${row.score.toFixed(2)},${toQmdPath(row.displayPath)}${ctx}`);
      }
    } else if (opts.format === "cli") {
      for (let i = 0; i < filtered.length; i++) {
        const row = filtered[i];
        if (!row) continue;
        const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos);
        const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);

        // Line 1: filepath with docid
        const path = toQmdPath(row.displayPath);
        // Only show :line if we actually found a term match in the snippet body (exclude header line).
        const snippetBody = snippet.split("\n").slice(1).join("\n").toLowerCase();
        const hasMatch = query.toLowerCase().split(/\s+/).some(t => t.length > 0 && snippetBody.includes(t));
        const lineInfo = hasMatch ? `:${line}` : "";
        const docidStr = docid ? ` ${c.dim}#${docid}${c.reset}` : "";
        console.log(`${c.cyan}${path}${c.dim}${lineInfo}${c.reset}${docidStr}`);

        // Line 2: Title (if available)
        if (row.title) {
          console.log(`${c.bold}Title: ${row.title}${c.reset}`);
        }

        // Line 3: Context (if available)
        if (row.context) {
          console.log(`${c.dim}Context: ${row.context}${c.reset}`);
        }

        // Line 4: Score
        const score = formatScore(row.score);
        console.log(`Score: ${c.bold}${score}${c.reset}`);
        console.log();

        // Snippet with highlighting (diff-style header included)
        const displaySnippet = opts.lineNumbers ? addLineNumbers(snippet, line) : snippet;
        const highlighted = highlightTerms(displaySnippet, query);
        console.log(highlighted);

        // Double empty line between results
        if (i < filtered.length - 1) console.log('\n');
      }
    } else if (opts.format === "md") {
      for (let i = 0; i < filtered.length; i++) {
        const row = filtered[i];
        if (!row) continue;
        const heading = row.title || row.displayPath;
        const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);
        let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos).snippet;
        if (opts.lineNumbers) {
          content = addLineNumbers(content);
        }
        const docidLine = docid ? `**docid:** \`#${docid}\`\n` : "";
        const contextLine = row.context ? `**context:** ${row.context}\n` : "";
        console.log(`---\n# ${heading}\n${docidLine}${contextLine}\n${content}\n`);
      }
    } else if (opts.format === "xml") {
      for (const row of filtered) {
        // Only double quotes are escaped in attribute values; content is emitted as-is.
        const titleAttr = row.title ? ` title="${row.title.replace(/"/g, '&quot;')}"` : "";
        const contextAttr = row.context ? ` context="${row.context.replace(/"/g, '&quot;')}"` : "";
        const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : "");
        let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos).snippet;
        if (opts.lineNumbers) {
          content = addLineNumbers(content);
        }
        console.log(`<file docid="#${docid}" name="${toQmdPath(row.displayPath)}"${titleAttr}${contextAttr}>\n${content}\n</file>\n`);
      }
    } else {
      // CSV format
      console.log("docid,score,file,title,context,line,snippet");
      for (const row of filtered) {
        const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos);
        let content = opts.full ? row.body : snippet;
        if (opts.lineNumbers) {
          content = addLineNumbers(content, line);
        }
        const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : "");
        const snippetText = content || "";
        console.log(`#${docid},${row.score.toFixed(4)},${escapeCSV(toQmdPath(row.displayPath))},${escapeCSV(row.title || "")},${escapeCSV(row.context || "")},${line},${escapeCSV(snippetText)}`);
      }
    }
  }
  /**
   * Resolve the -c collection filter, which may be a single name, a list of
   * names, or absent.
   *
   * Each name is looked up with `getCollectionFromYaml`. On the first unknown
   * name an error is printed, the database is closed and the process exits
   * with status 1.
   *
   * @returns A new array with the requested names, or [] when no filter was given.
   */
  function resolveCollectionFilter(raw: string | string[] | undefined): string[] {
    if (!raw) return [];

    const requested = Array.isArray(raw) ? raw : [raw];
    for (const candidate of requested) {
      if (getCollectionFromYaml(candidate)) continue;
      console.error(`Collection not found: ${candidate}`);
      closeDb();
      process.exit(1);
    }
    return [...requested];
  }
  /**
   * Keep only results whose path lies inside one of the named collections.
   *
   * With zero or one collection name the input array is returned untouched.
   * Otherwise a result is kept when its `filepath` (or, failing that, `file`)
   * starts with `qmd://<name>/` for any of the names.
   */
  function filterByCollections<T extends { filepath?: string; file?: string }>(results: T[], collectionNames: string[]): T[] {
    if (collectionNames.length <= 1) return results;

    const allowedPrefixes = collectionNames.map(name => `qmd://${name}/`);
    const kept: T[] = [];
    for (const result of results) {
      const location = result.filepath || result.file || '';
      if (allowedPrefixes.some(prefix => location.startsWith(prefix))) {
        kept.push(result);
      }
    }
    return kept;
  }
  /**
   * Parse the structured search query syntax.
   *
   * Blank lines are ignored. A line beginning with `lex:`, `vec:` or `hyde:`
   * (case-insensitive) becomes a sub-search of that type; a prefixed line with
   * no text after the prefix is dropped. At most one unprefixed line is allowed.
   *
   * Outcomes:
   *   - no usable lines, or one plain line and nothing else -> null
   *     (plain query, the caller uses normal expansion)
   *   - prefixed lines                                      -> one entry per line
   *   - prefixed lines plus one plain line                  -> the plain line is
   *     placed first as a `lex` search
   *   - more than one plain line                            -> throws (ambiguous)
   *
   * Examples:
   *   "CAP theorem"                   -> null
   *   "lex: CAP theorem"              -> [{ type: 'lex', query: 'CAP theorem' }]
   *   "lex: CAP\nvec: consistency"    -> [{ type: 'lex', ... }, { type: 'vec', ... }]
   *   "CAP\nconsistency"              -> throws
   */
  function parseStructuredQuery(query: string): StructuredSubSearch[] | null {
    const usableLines = query
      .split('\n')
      .map(raw => raw.trim())
      .filter(trimmed => trimmed !== '');
    if (usableLines.length === 0) return null;

    const prefixPattern = /^(lex|vec|hyde):\s*/i;
    const routed: StructuredSubSearch[] = [];
    const unprefixed: string[] = [];

    for (const entry of usableLines) {
      const hit = prefixPattern.exec(entry);
      if (hit === null) {
        unprefixed.push(entry);
        continue;
      }
      const body = entry.slice(hit[0].length).trim();
      if (body.length === 0) continue; // prefix with nothing after it
      routed.push({ type: hit[1]!.toLowerCase() as 'lex' | 'vec' | 'hyde', query: body });
    }

    // Several plain lines cannot be routed unambiguously.
    if (unprefixed.length > 1) {
      throw new Error(
        `Ambiguous query: multiple lines without lex:/vec:/hyde: prefix.\n` +
        `Either use a single line (for query expansion) or prefix each line.\n` +
        `Example:\n lex: keyword terms\n vec: natural language question\n hyde: hypothetical answer passage`
      );
    }

    if (unprefixed.length === 1) {
      // A lone plain line is an ordinary query; alongside prefixed lines it is treated as lex.
      if (routed.length === 0) return null;
      routed.unshift({ type: 'lex', query: unprefixed[0]! });
    }

    return routed.length === 0 ? null : routed;
  }
  /**
   * Run a full-text (FTS) search and print the results.
   *
   * A single collection name is passed straight to `searchFTS`; with several
   * names the results are post-filtered by `filterByCollections`. Each hit is
   * paired with its context before the database is closed and output begins.
   */
  function search(query: string, opts: OutputOptions): void {
    const db = getDb();

    // Validate collection filter (supports multiple -c flags)
    const collectionNames = resolveCollectionFilter(opts.collection);
    const singleCollection = collectionNames.length === 1 ? collectionNames[0] : undefined;

    // --all uses a very large limit; otherwise over-fetch and let outputResults trim.
    const fetchLimit = opts.all ? 100000 : Math.max(50, opts.limit * 2);

    const hits = searchFTS(db, query, fetchLimit, singleCollection);
    const scoped = filterByCollections(hits, collectionNames);

    // Attach context while the database is still open.
    const rows = scoped.map(hit => ({
      file: hit.filepath,
      displayPath: hit.displayPath,
      title: hit.title,
      body: hit.body || "",
      score: hit.score,
      context: getContextForFile(db, hit.filepath),
      hash: hit.hash,
      docid: hit.docid,
    }));

    closeDb();

    if (rows.length === 0) {
      console.log(opts.format === "json" ? "[]" : "No results found.");
      return;
    }

    outputResults(rows, query, opts);
  }
  /**
   * Write the query expansion to stderr as a small tree (CLI progress feedback).
   *
   * The original query is the first branch, followed by one branch per
   * expanded query showing its type and a single-line preview (capped at 72
   * characters, ending in "..." when cut). The final branch gets the closing
   * "└─" connector.
   */
  function logExpansionTree(originalQuery: string, expanded: ExpandedQuery[]): void {
    const branches: string[] = [
      `${c.dim}├─ ${originalQuery}${c.reset}`,
      ...expanded.map(item => {
        const flattened = item.text.replace(/\n/g, ' ');
        const preview = flattened.length > 72 ? flattened.substring(0, 69) + '...' : flattened;
        return `${c.dim}├─ ${item.type}: ${preview}${c.reset}`;
      }),
    ];

    // There is always at least the original-query branch, so a last entry exists.
    const lastIndex = branches.length - 1;
    branches[lastIndex] = branches[lastIndex]!.replace('├─', '└─');

    for (const branch of branches) {
      process.stderr.write(branch + '\n');
    }
  }
  /**
   * Run a vector search inside an LLM session and print the results.
   *
   * A single collection name is passed to `vectorSearchQuery`; with several
   * names the results are post-filtered by their `qmd://<name>/` prefix. The
   * `_model` parameter is not used in this function.
   */
  async function vectorSearch(query: string, opts: OutputOptions, _model: string = DEFAULT_EMBED_MODEL): Promise<void> {
    const store = getStore();

    // Validate collection filter (supports multiple -c flags)
    const collectionNames = resolveCollectionFilter(opts.collection);
    const singleCollection = collectionNames.length === 1 ? collectionNames[0] : undefined;

    checkIndexHealth(store.db);

    await withLLMSession(async () => {
      // `||` is intentional: a limit or minScore of 0 falls back to the default.
      const limit = opts.all ? 500 : (opts.limit || 10);
      const minScore = opts.minScore || 0.3;

      let results = await vectorSearchQuery(store, query, {
        collection: singleCollection,
        limit,
        minScore,
        hooks: {
          onExpand: (original, expanded) => {
            logExpansionTree(original, expanded);
            process.stderr.write(`${c.dim}Searching ${expanded.length + 1} vector queries...${c.reset}\n`);
          },
        },
      });

      // Post-filter for multi-collection (prefix list built once, not per result)
      if (collectionNames.length > 1) {
        const prefixes = collectionNames.map(n => `qmd://${n}/`);
        results = results.filter(r => prefixes.some(p => r.file.startsWith(p)));
      }

      closeDb();

      if (results.length === 0) {
        console.log(opts.format === "json" ? "[]" : "No results found.");
        return;
      }

      const rows = results.map(r => ({
        file: r.file,
        displayPath: r.displayPath,
        title: r.title,
        body: r.body,
        score: r.score,
        context: r.context,
        docid: r.docid,
      }));

      // The search already applied the limit, so print everything that came back.
      outputResults(rows, query, { ...opts, limit: results.length });
    }, { maxDuration: 10 * 60 * 1000, name: 'vectorSearch' });
  }
  /**
   * Run the combined query command inside an LLM session and print the results.
   *
   * If the query uses the structured syntax (lex:/vec:/hyde: prefixes) the
   * sub-queries are sent to `structuredSearch`; otherwise `hybridQuery` is used.
   * Progress messages go to stderr. A single collection name is passed to the
   * search; several names are post-filtered by their `qmd://<name>/` prefix.
   * The `_embedModel` and `_rerankModel` parameters are not used in this function.
   */
  async function querySearch(query: string, opts: OutputOptions, _embedModel: string = DEFAULT_EMBED_MODEL, _rerankModel: string = DEFAULT_RERANK_MODEL): Promise<void> {
    const store = getStore();

    // Validate collection filter (supports multiple -c flags)
    const collectionNames = resolveCollectionFilter(opts.collection);
    const singleCollection = collectionNames.length === 1 ? collectionNames[0] : undefined;

    checkIndexHealth(store.db);

    // Check for structured query syntax (lex:/vec:/hyde: prefixes)
    const structuredQueries = parseStructuredQuery(query);

    await withLLMSession(async () => {
      // Shared by both search paths; `||` means 0 falls back to the default.
      const limit = opts.all ? 500 : (opts.limit || 10);
      const minScore = opts.minScore || 0;

      let results;

      if (structuredQueries) {
        // Structured search — user provided their own query expansions
        const typeLabels = structuredQueries.map(s => s.type).join('+');
        process.stderr.write(`${c.dim}Structured search: ${structuredQueries.length} queries (${typeLabels})${c.reset}\n`);

        // Log each sub-query as a single-line preview
        for (const sub of structuredQueries) {
          const flattened = sub.query.replace(/\n/g, ' ');
          const preview = flattened.length > 72 ? flattened.substring(0, 69) + '...' : flattened;
          process.stderr.write(`${c.dim}├─ ${sub.type}: ${preview}${c.reset}\n`);
        }
        process.stderr.write(`${c.dim}└─ Searching...${c.reset}\n`);

        results = await structuredSearch(store, structuredQueries, {
          collections: singleCollection ? [singleCollection] : undefined,
          limit,
          minScore,
          hooks: {
            onRerankStart: (chunkCount) => {
              process.stderr.write(`${c.dim}Reranking ${chunkCount} chunks...${c.reset}\n`);
              progress.indeterminate();
            },
            onRerankDone: () => {
              progress.clear();
            },
          },
        });
      } else {
        // Standard hybrid query with automatic expansion
        results = await hybridQuery(store, query, {
          collection: singleCollection,
          limit,
          minScore,
          hooks: {
            onStrongSignal: (score) => {
              process.stderr.write(`${c.dim}Strong BM25 signal (${score.toFixed(2)}) — skipping expansion${c.reset}\n`);
            },
            onExpand: (original, expanded) => {
              logExpansionTree(original, expanded);
              process.stderr.write(`${c.dim}Searching ${expanded.length + 1} queries...${c.reset}\n`);
            },
            onRerankStart: (chunkCount) => {
              process.stderr.write(`${c.dim}Reranking ${chunkCount} chunks...${c.reset}\n`);
              progress.indeterminate();
            },
            onRerankDone: () => {
              progress.clear();
            },
          },
        });
      }

      // Post-filter for multi-collection (prefix list built once, not per result)
      if (collectionNames.length > 1) {
        const prefixes = collectionNames.map(n => `qmd://${n}/`);
        results = results.filter(r => prefixes.some(p => r.file.startsWith(p)));
      }

      closeDb();

      if (results.length === 0) {
        console.log(opts.format === "json" ? "[]" : "No results found.");
        return;
      }

      // Use first lex/vec query for output context, or original query
      let displayQuery = query;
      if (structuredQueries) {
        const firstLex = structuredQueries.find(s => s.type === 'lex')?.query;
        const firstVec = structuredQueries.find(s => s.type === 'vec')?.query;
        displayQuery = firstLex || firstVec || query;
      }

      // Map to CLI output format — use bestChunk for snippet display
      const rows = results.map(r => ({
        file: r.file,
        displayPath: r.displayPath,
        title: r.title,
        body: r.bestChunk,
        chunkPos: r.bestChunkPos,
        score: r.score,
        context: r.context,
        docid: r.docid,
      }));

      outputResults(rows, displayQuery, { ...opts, limit: results.length });
    }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
  }
  /**
   * Parse CLI arguments using util.parseArgs.
   *
   * Parsing is non-strict, so unknown options pass through. If `--index` is
   * given, the index name is applied via `setIndexName` and `setConfigIndexName`.
   *
   * @returns
   *   - `command`: first positional argument ("" when absent)
   *   - `args`: remaining positionals
   *   - `query`: remaining positionals joined with spaces
   *   - `opts`: output options derived from the flags
   *   - `values`: the raw parsed option values
   */
  function parseCLI() {
    const { values, positionals } = parseArgs({
      args: process.argv.slice(2), // Skip node and script path
      options: {
        // Global options
        index: {
          type: "string",
        },
        context: {
          type: "string",
        },
        help: { type: "boolean", short: "h" },
        version: { type: "boolean", short: "v" },
        // Search options
        n: { type: "string" },
        "min-score": { type: "string" },
        all: { type: "boolean" },
        full: { type: "boolean" },
        csv: { type: "boolean" },
        md: { type: "boolean" },
        xml: { type: "boolean" },
        files: { type: "boolean" },
        json: { type: "boolean" },
        collection: { type: "string", short: "c", multiple: true }, // Filter by collection(s)
        // Collection options
        name: { type: "string" }, // collection name
        mask: { type: "string" }, // glob pattern
        // Embed options
        force: { type: "boolean", short: "f" },
        // Update options
        pull: { type: "boolean" }, // git pull before update
        refresh: { type: "boolean" },
        // Get options
        l: { type: "string" }, // max lines
        from: { type: "string" }, // start line
        "max-bytes": { type: "string" }, // max bytes for multi-get
        "line-numbers": { type: "boolean" }, // add line numbers to output
        // MCP HTTP transport options
        http: { type: "boolean" },
        daemon: { type: "boolean" },
        port: { type: "string" },
      },
      allowPositionals: true,
      strict: false, // Allow unknown options to pass through
    });

    // Select index name (default: "index")
    const indexName = values.index as string | undefined;
    if (indexName) {
      setIndexName(indexName);
      setConfigIndexName(indexName);
    }

    // Determine output format; when several format flags are given, the first
    // in this order wins: csv, md, xml, files, json.
    let format: OutputFormat = "cli";
    if (values.csv) format = "csv";
    else if (values.md) format = "md";
    else if (values.xml) format = "xml";
    else if (values.files) format = "files";
    else if (values.json) format = "json";

    // Default limit: 20 for --files/--json, 5 otherwise
    // --all means return all results (use very large limit)
    const defaultLimit = (format === "files" || format === "json") ? 20 : 5;
    const isAll = !!values.all;

    // Only a positive integer is accepted for the limit. A negative value would
    // reach Array.prototype.slice(0, limit) and drop results from the end
    // instead of capping them, so it falls back to the default like 0 or NaN.
    const requestedLimit = values.n ? parseInt(String(values.n), 10) : NaN;
    const limit = Number.isFinite(requestedLimit) && requestedLimit > 0 ? requestedLimit : defaultLimit;

    const opts: OutputOptions = {
      format,
      full: !!values.full,
      limit: isAll ? 100000 : limit,
      // An unparsable --min-score falls back to 0.
      minScore: values["min-score"] ? parseFloat(String(values["min-score"])) || 0 : 0,
      all: isAll,
      collection: values.collection as string[] | undefined,
      lineNumbers: !!values["line-numbers"],
    };

    return {
      command: positionals[0] || "",
      args: positionals.slice(1),
      query: positionals.slice(1).join(" "),
      opts,
      values,
    };
  }
  /**
   * Print the top-level usage text to stdout, one `console.log` call per line,
   * followed by the path of the index database currently in use.
   *
   * The text is kept in sync with the command dispatcher: it lists
   * `context check` and `pull`, and states that the 20-result default applies
   * to both `--files` and `--json` (matching the default-limit logic in
   * `parseCLI`).
   */
  function showHelp(): void {
    const helpLines: readonly string[] = [
      "Usage:",
      " qmd collection add [path] --name <name> --mask <pattern> - Create/index collection",
      " qmd collection list - List all collections with details",
      " qmd collection remove <name> - Remove a collection by name",
      " qmd collection rename <old> <new> - Rename a collection",
      " qmd ls [collection[/path]] - List collections or files in a collection",
      " qmd context add [path] \"text\" - Add context for path (defaults to current dir)",
      " qmd context list - List all contexts",
      " qmd context check - Check for missing contexts",
      " qmd context rm <path> - Remove context",
      " qmd get <file>[:line] [-l N] [--from N] - Get document (optionally from line, max N lines)",
      " qmd multi-get <pattern> [-l N] [--max-bytes N] - Get multiple docs by glob or comma-separated list",
      " qmd status - Show index status and collections",
      " qmd update [--pull] - Re-index all collections (--pull: git pull first)",
      " qmd embed [-f] - Create vector embeddings (900 tokens/chunk, 15% overlap)",
      " qmd pull [--refresh] - Pull default embed/generate/rerank models (--refresh: refresh cached copies)",
      " qmd cleanup - Remove cache and orphaned data, vacuum DB",
      " qmd query <query> - Search with query expansion + reranking (recommended)",
      " qmd query 'lex:..\\nvec:...' - Structured search (you provide lex/vec/hyde queries)",
      " qmd search <query> - Full-text keyword search (BM25, no LLM)",
      " qmd vsearch <query> - Vector similarity search (no reranking)",
      " qmd mcp - Start MCP server (stdio transport)",
      " qmd mcp --http [--port N] - Start MCP server (HTTP transport, default port 8181)",
      " qmd mcp --http --daemon - Start MCP server as background daemon",
      " qmd mcp stop - Stop background MCP daemon",
      "",
      "Global options:",
      " --index <name> - Use custom index name (default: index)",
      "",
      "Search options:",
      " -n <num> - Number of results (default: 5, or 20 for --files/--json)",
      " --all - Return all matches (use with --min-score to filter)",
      " --min-score <num> - Minimum similarity score",
      " --full - Output full document instead of snippet",
      " --line-numbers - Add line numbers to output",
      " --files - Output docid,score,filepath,context (default: 20 results)",
      " --json - JSON output with snippets (default: 20 results)",
      " --csv - CSV output with snippets",
      " --md - Markdown output",
      " --xml - XML output",
      " -c, --collection <name> - Filter results to a specific collection",
      "",
      "Structured queries (qmd query):",
      " Prefix lines with lex:, vec:, or hyde: to skip automatic expansion.",
      " lex: BM25 keyword search (exact terms)",
      " vec: Vector similarity (natural language question)",
      " hyde: Vector similarity (hypothetical answer passage)",
      " Example: qmd query $'lex: CAP theorem\\nvec: consistency vs availability tradeoff'",
      "",
      "Multi-get options:",
      " -l <num> - Maximum lines per file",
      " --max-bytes <num> - Skip files larger than N bytes (default: 10240)",
      " --json/--csv/--md/--xml/--files - Output format (same as search)",
      "",
    ];

    for (const line of helpLines) {
      console.log(line);
    }

    // Resolved last so the static help text is already printed if the lookup fails.
    console.log(`Index: ${getDbPath()}`);
  }
  /**
   * Print `qmd <version>` to stdout, where the version is read from the
   * `package.json` one directory above this script. When the script directory
   * is inside a git checkout and `git` is available, the short commit hash is
   * appended in parentheses.
   *
   * @returns A promise that resolves once the version line has been printed.
   * @throws If `package.json` cannot be read or is not valid JSON.
   */
  async function showVersion(): Promise<void> {
    const scriptDir = dirname(fileURLToPath(import.meta.url));
    const pkgPath = resolve(scriptDir, "..", "package.json");
    const pkg: { version: string } = JSON.parse(readFileSync(pkgPath, "utf-8"));

    let commit = "";
    try {
      // Run git *in* the script directory via `cwd` instead of splicing the path
      // into the shell command: an install path containing spaces or shell
      // metacharacters would otherwise break (or be interpreted by) the shell.
      commit = execSync("git rev-parse --short HEAD", {
        cwd: scriptDir,
        encoding: "utf-8",
        stdio: ["pipe", "pipe", "pipe"],
      }).trim();
    } catch {
      // Not a git repo or git not available
    }

    const versionStr = commit ? `${pkg.version} (${commit})` : pkg.version;
    console.log(`qmd ${versionStr}`);
  }
  // Main CLI - only run if this is the main module
  //
  // Parses the command line, handles --version / --help, then dispatches on the
  // first positional argument. Every command except "mcp" ends by disposing the
  // default llama.cpp instance and exiting with status 0; usage errors exit 1.
  if (fileURLToPath(import.meta.url) === process.argv[1] || process.argv[1]?.endsWith("/qmd.ts") || process.argv[1]?.endsWith("/qmd.js")) {
    const cli = parseCLI();

    if (cli.values.version) {
      await showVersion();
      process.exit(0);
    }

    // No command at all is a usage error (exit 1); an explicit --help is not (exit 0).
    if (!cli.command || cli.values.help) {
      showHelp();
      process.exit(cli.values.help ? 0 : 1);
    }

    switch (cli.command) {
      case "context": {
        const subcommand = cli.args[0];
        if (!subcommand) {
          console.error("Usage: qmd context <add|list|check|rm>");
          console.error("");
          console.error("Commands:");
          console.error(" qmd context add [path] \"text\" - Add context (defaults to current dir)");
          console.error(" qmd context add / \"text\" - Add global context to all collections");
          console.error(" qmd context list - List all contexts");
          console.error(" qmd context check - Check for missing contexts");
          console.error(" qmd context rm <path> - Remove context");
          process.exit(1);
        }

        switch (subcommand) {
          case "add": {
            if (cli.args.length < 2) {
              console.error("Usage: qmd context add [path] \"text\"");
              console.error("");
              console.error("Examples:");
              console.error(" qmd context add \"Context for current directory\"");
              console.error(" qmd context add . \"Context for current directory\"");
              console.error(" qmd context add /subfolder \"Context for subfolder\"");
              console.error(" qmd context add / \"Global context for all collections\"");
              console.error("");
              console.error(" Using virtual paths:");
              console.error(" qmd context add qmd://journals/ \"Context for entire journals collection\"");
              console.error(" qmd context add qmd://journals/2024 \"Context for 2024 journals\"");
              process.exit(1);
            }

            let pathArg: string | undefined;
            let contextText: string;

            // Check if first arg looks like a path or if it's the context text
            const firstArg = cli.args[1] || '';
            const secondArg = cli.args[2];

            if (secondArg) {
              // Two args: path + context
              pathArg = firstArg;
              contextText = cli.args.slice(2).join(" ");
            } else {
              // One arg: context only (use current directory)
              pathArg = undefined;
              contextText = firstArg;
            }

            await contextAdd(pathArg, contextText);
            break;
          }

          case "list": {
            contextList();
            break;
          }

          case "check": {
            contextCheck();
            break;
          }

          case "rm":
          case "remove": {
            if (cli.args.length < 2 || !cli.args[1]) {
              console.error("Usage: qmd context rm <path>");
              console.error("Examples:");
              console.error(" qmd context rm /");
              console.error(" qmd context rm qmd://journals/2024");
              process.exit(1);
            }
            contextRemove(cli.args[1]);
            break;
          }

          default:
            console.error(`Unknown subcommand: ${subcommand}`);
            console.error("Available: add, list, check, rm");
            process.exit(1);
        }
        break;
      }

      case "get": {
        if (!cli.args[0]) {
          console.error("Usage: qmd get <filepath>[:line] [--from <line>] [-l <lines>] [--line-numbers]");
          process.exit(1);
        }
        const fromLine = cli.values.from ? parseInt(cli.values.from as string, 10) : undefined;
        const maxLines = cli.values.l ? parseInt(cli.values.l as string, 10) : undefined;
        getDocument(cli.args[0], fromLine, maxLines, cli.opts.lineNumbers);
        break;
      }

      case "multi-get": {
        if (!cli.args[0]) {
          console.error("Usage: qmd multi-get <pattern> [-l <lines>] [--max-bytes <bytes>] [--json|--csv|--md|--xml|--files]");
          console.error(" pattern: glob (e.g., 'journals/2025-05*.md') or comma-separated list");
          process.exit(1);
        }
        const maxLinesMulti = cli.values.l ? parseInt(cli.values.l as string, 10) : undefined;
        const maxBytes = cli.values["max-bytes"] ? parseInt(cli.values["max-bytes"] as string, 10) : DEFAULT_MULTI_GET_MAX_BYTES;
        multiGet(cli.args[0], maxLinesMulti, maxBytes, cli.opts.format);
        break;
      }

      case "ls": {
        listFiles(cli.args[0]);
        break;
      }

      case "collection": {
        const subcommand = cli.args[0];

        // A bare `qmd collection` used to fall into the default branch and print
        // "Unknown subcommand: undefined"; show usage instead, as `qmd context` does.
        if (!subcommand) {
          console.error("Usage: qmd collection <add|list|remove|rename>");
          console.error(" Use 'qmd collection list' to see available collections");
          process.exit(1);
        }

        switch (subcommand) {
          case "list": {
            collectionList();
            break;
          }

          case "add": {
            const pwd = cli.args[1] || getPwd();
            const resolvedPwd = pwd === '.' ? getPwd() : getRealPath(resolve(pwd));
            const globPattern = cli.values.mask as string || DEFAULT_GLOB;
            const name = cli.values.name as string | undefined;
            await collectionAdd(resolvedPwd, globPattern, name);
            break;
          }

          case "remove":
          case "rm": {
            if (!cli.args[1]) {
              console.error("Usage: qmd collection remove <name>");
              console.error(" Use 'qmd collection list' to see available collections");
              process.exit(1);
            }
            collectionRemove(cli.args[1]);
            break;
          }

          case "rename":
          case "mv": {
            if (!cli.args[1] || !cli.args[2]) {
              console.error("Usage: qmd collection rename <old-name> <new-name>");
              console.error(" Use 'qmd collection list' to see available collections");
              process.exit(1);
            }
            collectionRename(cli.args[1], cli.args[2]);
            break;
          }

          default:
            console.error(`Unknown subcommand: ${subcommand}`);
            console.error("Available: list, add, remove, rename");
            process.exit(1);
        }
        break;
      }

      case "status":
        await showStatus();
        break;

      case "update":
        await updateCollections();
        break;

      case "embed":
        await vectorIndex(DEFAULT_EMBED_MODEL, !!cli.values.force);
        break;

      case "pull": {
        const refresh = cli.values.refresh === undefined ? false : Boolean(cli.values.refresh);
        const models = [
          DEFAULT_EMBED_MODEL_URI,
          DEFAULT_GENERATE_MODEL_URI,
          DEFAULT_RERANK_MODEL_URI,
        ];
        console.log(`${c.bold}Pulling models${c.reset}`);
        const results = await pullModels(models, {
          refresh,
          cacheDir: DEFAULT_MODEL_CACHE_DIR,
        });
        for (const result of results) {
          const size = formatBytes(result.sizeBytes);
          const note = result.refreshed ? "refreshed" : "cached/checked";
          console.log(`- ${result.model} -> ${result.path} (${size}, ${note})`);
        }
        break;
      }

      case "search":
        if (!cli.query) {
          console.error("Usage: qmd search [options] <query>");
          process.exit(1);
        }
        search(cli.query, cli.opts);
        break;

      case "vsearch":
      case "vector-search": // undocumented alias
        if (!cli.query) {
          console.error("Usage: qmd vsearch [options] <query>");
          process.exit(1);
        }
        // Default min-score for vector search is 0.3
        if (!cli.values["min-score"]) {
          cli.opts.minScore = 0.3;
        }
        await vectorSearch(cli.query, cli.opts);
        break;

      case "query":
      case "deep-search": // undocumented alias
        if (!cli.query) {
          console.error("Usage: qmd query [options] <query>");
          process.exit(1);
        }
        await querySearch(cli.query, cli.opts);
        break;

      case "mcp": {
        const sub = cli.args[0]; // stop | status | undefined

        // Cache dir for PID/log files — same dir as the index
        const cacheDir = process.env.XDG_CACHE_HOME
          ? resolve(process.env.XDG_CACHE_HOME, "qmd")
          : resolve(homedir(), ".cache", "qmd");
        const pidPath = resolve(cacheDir, "mcp.pid");

        // Read the daemon PID from the PID file. Yields undefined unless the file
        // holds a positive decimal integer: a PID of 0 or a negative value would
        // make process.kill() address a process group rather than one process,
        // so such values are treated like a stale file and never signalled.
        const readPid = (): number | undefined => {
          const parsed = parseInt(readFileSync(pidPath, "utf-8").trim(), 10);
          return Number.isInteger(parsed) && parsed > 0 ? parsed : undefined;
        };

        // Subcommands take priority over flags
        if (sub === "stop") {
          if (!existsSync(pidPath)) {
            console.log("Not running (no PID file).");
            process.exit(0);
          }

          const pid = readPid();
          let stopped = false;
          if (pid !== undefined) {
            try {
              process.kill(pid, 0); // alive?
              process.kill(pid, "SIGTERM");
              stopped = true;
            } catch {
              // Process could not be signalled — treat the PID file as stale
            }
          }

          // The PID file is removed in both outcomes.
          unlinkSync(pidPath);
          console.log(stopped
            ? `Stopped QMD MCP server (PID ${pid}).`
            : "Cleaned up stale PID file (server was not running).");
          process.exit(0);
        }

        if (cli.values.http) {
          // Only a string value is parsed: a non-string value (e.g. a bare
          // `--port` with no argument) must not be coerced into a port number.
          // Unparseable or zero values fall back to the default port 8181.
          const requestedPort = typeof cli.values.port === "string" ? Number(cli.values.port) : NaN;
          const port = requestedPort || 8181;

          if (cli.values.daemon) {
            // Guard: check if already running
            if (existsSync(pidPath)) {
              const existingPid = readPid();
              if (existingPid !== undefined) {
                try {
                  process.kill(existingPid, 0); // alive?
                  console.error(`Already running (PID ${existingPid}). Run 'qmd mcp stop' first.`);
                  process.exit(1);
                } catch {
                  // Stale PID file — continue
                }
              }
            }

            mkdirSync(cacheDir, { recursive: true });
            const logPath = resolve(cacheDir, "mcp.log");
            const logFd = openSync(logPath, "w"); // truncate — fresh log per daemon run

            // Re-launch this same script in foreground HTTP mode as a detached
            // child; a .ts entry point is run through the tsx ESM loader.
            const selfPath = fileURLToPath(import.meta.url);
            const spawnArgs = selfPath.endsWith(".ts")
              ? ["--import", pathJoin(dirname(selfPath), "..", "node_modules", "tsx", "dist", "esm", "index.mjs"), selfPath, "mcp", "--http", "--port", String(port)]
              : [selfPath, "mcp", "--http", "--port", String(port)];
            const child = nodeSpawn(process.execPath, spawnArgs, {
              stdio: ["ignore", logFd, logFd],
              detached: true,
            });
            child.unref();
            closeSync(logFd); // parent's copy; child inherited the fd

            // Without a pid the spawn did not produce a process; do not record
            // "undefined" in the PID file or report a server that never started.
            if (child.pid === undefined) {
              console.error("Failed to start MCP server daemon (process could not be spawned).");
              process.exit(1);
            }

            writeFileSync(pidPath, String(child.pid));
            console.log(`Started on http://localhost:${port}/mcp (PID ${child.pid})`);
            console.log(`Logs: ${logPath}`);
            process.exit(0);
          }

          // Foreground HTTP mode — remove top-level cursor handlers so the
          // async cleanup handlers in startMcpHttpServer actually run.
          process.removeAllListeners("SIGTERM");
          process.removeAllListeners("SIGINT");
          const { startMcpHttpServer } = await import("./mcp.js");
          try {
            await startMcpHttpServer(port);
          } catch (e: any) {
            if (e?.code === "EADDRINUSE") {
              console.error(`Port ${port} already in use. Try a different port with --port.`);
              process.exit(1);
            }
            throw e;
          }
        } else {
          // Default: stdio transport
          const { startMcpServer } = await import("./mcp.js");
          await startMcpServer();
        }
        break;
      }

      case "cleanup": {
        const db = getDb();

        // 1. Clear llm_cache
        const cacheCount = deleteLLMCache(db);
        console.log(`${c.green}✓${c.reset} Cleared ${cacheCount} cached API responses`);

        // 2. Remove orphaned vectors
        const orphanedVecs = cleanupOrphanedVectors(db);
        if (orphanedVecs > 0) {
          console.log(`${c.green}✓${c.reset} Removed ${orphanedVecs} orphaned embedding chunks`);
        } else {
          console.log(`${c.dim}No orphaned embeddings to remove${c.reset}`);
        }

        // 3. Remove inactive documents
        const inactiveDocs = deleteInactiveDocuments(db);
        if (inactiveDocs > 0) {
          console.log(`${c.green}✓${c.reset} Removed ${inactiveDocs} inactive document records`);
        }

        // 4. Vacuum to reclaim space
        vacuumDatabase(db);
        console.log(`${c.green}✓${c.reset} Database vacuumed`);

        closeDb();
        break;
      }

      default:
        console.error(`Unknown command: ${cli.command}`);
        console.error("Run 'qmd --help' for usage.");
        process.exit(1);
    }

    // The mcp command is excluded from the dispose-and-exit step below.
    if (cli.command !== "mcp") {
      await disposeDefaultLlamaCpp();
      process.exit(0);
    }
  } // end if (main module)