qmd.ts 92 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634
  1. #!/usr/bin/env bun
  2. import { Database } from "bun:sqlite";
  3. import { Glob, $ } from "bun";
  4. import { parseArgs } from "util";
  5. import * as sqliteVec from "sqlite-vec";
  6. import {
  7. getPwd,
  8. getRealPath,
  9. homedir,
  10. resolve,
  11. enableProductionMode,
  12. searchFTS,
  13. searchVec,
  14. extractSnippet,
  15. getContextForFile,
  16. getContextForPath,
  17. listCollections,
  18. removeCollection,
  19. renameCollection,
  20. findSimilarFiles,
  21. matchFilesByGlob,
  22. getHashesNeedingEmbedding,
  23. getHashesForEmbedding,
  24. clearAllEmbeddings,
  25. insertEmbedding,
  26. getStatus,
  27. hashContent,
  28. extractTitle,
  29. formatDocForEmbedding,
  30. formatQueryForEmbedding,
  31. chunkDocument,
  32. chunkDocumentByTokens,
  33. clearCache,
  34. getCacheKey,
  35. getCachedResult,
  36. setCachedResult,
  37. getIndexHealth,
  38. parseVirtualPath,
  39. buildVirtualPath,
  40. isVirtualPath,
  41. resolveVirtualPath,
  42. toVirtualPath,
  43. insertContent,
  44. insertDocument,
  45. findActiveDocument,
  46. updateDocumentTitle,
  47. updateDocument,
  48. deactivateDocument,
  49. getActiveDocumentPaths,
  50. cleanupOrphanedContent,
  51. deleteLLMCache,
  52. deleteInactiveDocuments,
  53. cleanupOrphanedVectors,
  54. vacuumDatabase,
  55. getCollectionsWithoutContext,
  56. getTopLevelPathsWithoutContext,
  57. handelize,
  58. DEFAULT_EMBED_MODEL,
  59. DEFAULT_QUERY_MODEL,
  60. DEFAULT_RERANK_MODEL,
  61. DEFAULT_GLOB,
  62. DEFAULT_MULTI_GET_MAX_BYTES,
  63. createStore,
  64. getDefaultDbPath,
  65. } from "./store.js";
  66. import { getDefaultLlamaCpp, disposeDefaultLlamaCpp, type RerankDocument, type Queryable, type QueryType } from "./llm.js";
  67. import type { SearchResult, RankedResult } from "./store.js";
  68. import {
  69. formatSearchResults,
  70. formatDocuments,
  71. escapeXml,
  72. escapeCSV,
  73. type OutputFormat,
  74. } from "./formatter.js";
  75. import {
  76. getCollection as getCollectionFromYaml,
  77. listCollections as yamlListCollections,
  78. addContext as yamlAddContext,
  79. removeContext as yamlRemoveContext,
  80. setGlobalContext,
  81. listAllContexts,
  82. } from "./collections.js";
  // Put the store layer into production mode so the default database path may
  // be used. Tests are expected to set INDEX_PATH, or to call createStore()
  // with an explicit path, instead of relying on this.
  enableProductionMode();

  // ---------------------------------------------------------------------------
  // Store/DB lifecycle. The handle lives in this module (store.ts keeps no
  // legacy singletons).
  // ---------------------------------------------------------------------------

  /** Lazily created store; `null` while no database is open. */
  let store: ReturnType<typeof createStore> | null = null;

  /** Database path chosen through setIndexName(); `undefined` means "use the default". */
  let storeDbPathOverride: string | undefined = undefined;
  /**
   * Return the module-wide store, creating it on first use.
   *
   * A new store is built with `storeDbPathOverride`, so whichever index was
   * selected before the first access is the one that gets opened.
   */
  function getStore(): ReturnType<typeof createStore> {
    const existing = store;
    if (existing) {
      return existing;
    }
    const created = createStore(storeDbPathOverride);
    store = created;
    return created;
  }
  /** Return the `db` handle of the active store (the store is opened if necessary). */
  function getDb(): Database {
    const { db } = getStore();
    return db;
  }
  /** Close the active store, if any, and forget it so the next access reopens one. */
  function closeDb(): void {
    const open = store;
    if (!open) {
      return; // nothing to close
    }
    open.close();
    store = null;
  }
  /**
   * Resolve the database path currently in effect.
   *
   * Precedence: the path of the open store, then the override set by
   * setIndexName(), then the default path.
   */
  function getDbPath(): string {
    const openPath = store?.dbPath;
    if (openPath != null) {
      return openPath;
    }
    if (storeDbPathOverride != null) {
      return storeDbPathOverride;
    }
    return getDefaultDbPath();
  }
  /**
   * Select the index to use for subsequent store accesses.
   *
   * @param name Index name; a falsy value (null or empty string) clears the override.
   */
  function setIndexName(name: string | null): void {
    if (name) {
      storeDbPathOverride = getDefaultDbPath(name);
    } else {
      storeDbPathOverride = undefined;
    }
    // Drop any open handle so the next access opens the newly selected index.
    closeDb();
  }
  /**
   * Ensure the vector table exists on the active store.
   *
   * @param _db        Ignored; the store owns the database handle.
   * @param dimensions Passed through to the store's ensureVecTable().
   */
  function ensureVecTable(_db: Database, dimensions: number): void {
    const activeStore = getStore();
    activeStore.ensureVecTable(dimensions);
  }
  // Terminal colors. Disabled when NO_COLOR is set to a non-empty value or when
  // stdout is not a TTY; in that case every entry is the empty string.
  const useColor = !process.env.NO_COLOR && process.stdout.isTTY;
  const c = (() => {
    // Build an SGR escape for `code`, or nothing when color is off.
    const sgr = (code: number): string => (useColor ? `\x1b[${code}m` : "");
    return {
      reset: sgr(0),
      dim: sgr(2),
      bold: sgr(1),
      cyan: sgr(36),
      yellow: sgr(33),
      green: sgr(32),
      magenta: sgr(35),
      blue: sgr(34),
    };
  })();
  // Terminal cursor visibility, written to stderr as escape sequences.
  const cursor = {
    /** Hide the cursor. */
    hide: () => {
      process.stderr.write("\x1b[?25l");
    },
    /** Show the cursor again. */
    show: () => {
      process.stderr.write("\x1b[?25h");
    },
  };
  // Make sure the cursor is visible again when the process is interrupted or
  // terminated, then exit with 130 (SIGINT) or 143 (SIGTERM).
  for (const [signal, exitCode] of [["SIGINT", 130], ["SIGTERM", 143]] as const) {
    process.on(signal, () => {
      cursor.show();
      process.exit(exitCode);
    });
  }
  // Terminal progress indicator driven by the OSC 9;4 escape sequence, written
  // to stderr. Each method emits `ESC ] 9;4;<state> BEL`.
  const progress = (() => {
    const emit = (state: string): void => {
      process.stderr.write(`\x1b]9;4;${state}\x07`);
    };
    return {
      /** Report a percentage (rounded to the nearest integer). */
      set(percent: number): void {
        emit(`1;${Math.round(percent)}`);
      },
      /** Remove the progress indicator. */
      clear(): void {
        emit("0");
      },
      /** Show progress without a known percentage. */
      indeterminate(): void {
        emit("3");
      },
      /** Switch the indicator to its error state. */
      error(): void {
        emit("2");
      },
    };
  })();
  /**
   * Format a duration in seconds as a short human-readable ETA.
   *
   * The value is rounded to whole seconds *before* it is split into units, so
   * the seconds field can never read "60" (e.g. 59.6 -> "1m 0s", not "60s").
   * Negative durations are clamped to zero; NaN and infinite values (for
   * example from a division by a zero rate) are rendered as "--".
   *
   * @param seconds Remaining time in seconds.
   * @returns "Ns", "Nm Ns" or "Nh Nm" depending on magnitude.
   */
  function formatETA(seconds: number): string {
    if (!Number.isFinite(seconds)) return "--";
    const total = Math.max(0, Math.round(seconds));
    if (total < 60) return `${total}s`;
    if (total < 3600) return `${Math.floor(total / 60)}m ${total % 60}s`;
    return `${Math.floor(total / 3600)}h ${Math.floor((total % 3600) / 60)}m`;
  }
  /**
   * Inspect index health and print warnings/tips to stderr.
   *
   * - If documents are waiting for embeddings, prints a warning when they make
   *   up at least 10% of all documents, otherwise a dimmed tip.
   * - If the index was last updated 14 or more days ago, prints a refresh tip.
   *
   * @param db Database handle passed to getIndexHealth().
   */
  function checkIndexHealth(db: Database): void {
    const { needsEmbedding, totalDocs, daysStale } = getIndexHealth(db);

    if (needsEmbedding > 0) {
      // Guard the division: with a zero document count the ratio would be
      // Infinity/NaN and leak into the message. Treat that case as 100%.
      const pct = totalDocs > 0 ? Math.round((needsEmbedding / totalDocs) * 100) : 100;
      if (pct >= 10) {
        process.stderr.write(`${c.yellow}Warning: ${needsEmbedding} documents (${pct}%) need embeddings. Run 'qmd embed' for better results.${c.reset}\n`);
      } else {
        process.stderr.write(`${c.dim}Tip: ${needsEmbedding} documents need embeddings. Run 'qmd embed' to index them.${c.reset}\n`);
      }
    }

    // Most recent document update is two weeks old or more.
    if (daysStale !== null && daysStale >= 14) {
      process.stderr.write(`${c.dim}Tip: Index last updated ${daysStale} days ago. Run 'qmd update' to refresh.${c.reset}\n`);
    }
  }
  /**
   * Compute a short display path for a document that is not already taken.
   *
   * The path is made relative to the collection (keeping the collection's own
   * directory name as the first segment). Starting from "parent/filename" (or
   * just the filename when that is all there is), leading segments are added
   * one at a time until the candidate is absent from `existingPaths`.
   *
   * @param filepath       Path of the document.
   * @param collectionPath Root of the collection; one trailing slash is ignored.
   * @param existingPaths  Display paths already in use.
   * @returns The first free candidate, or `filepath` itself if all are taken.
   */
  function computeDisplayPath(
    filepath: string,
    collectionPath: string,
    existingPaths: Set<string>
  ): string {
    const root = collectionPath.endsWith('/') ? collectionPath.slice(0, -1) : collectionPath;
    const rootName = root.slice(root.lastIndexOf('/') + 1);

    // Inside the collection: "<collection name>/<path below the root>"; otherwise the path as given.
    const scoped = filepath.startsWith(`${root}/`)
      ? rootName + filepath.slice(root.length)
      : filepath;
    const segments = scoped.split('/').filter(Boolean);

    // Grow the candidate from the tail: at least two segments when available.
    const shortest = Math.min(2, segments.length);
    for (let take = shortest; take <= segments.length; take++) {
      const candidate = segments.slice(segments.length - take).join('/');
      if (!existingPaths.has(candidate)) {
        return candidate;
      }
    }

    // Every candidate is taken: fall back to the full path.
    return filepath;
  }
  /**
   * Rerank documents against a query using the default LlamaCpp instance.
   *
   * Progress is reported on stderr. The indicator is cleared in a `finally`
   * block so a failing rerank call cannot leave the terminal stuck in the
   * indeterminate-progress state.
   *
   * @param query     Query to score the documents against.
   * @param documents Candidates; each text is cut to its first 4000 characters.
   * @param _model    Unused here; kept for interface compatibility.
   * @param _db       Unused here; kept for interface compatibility.
   * @returns One `{ file, score }` entry per result returned by the reranker.
   */
  async function rerank(query: string, documents: { file: string; text: string }[], _model: string = DEFAULT_RERANK_MODEL, _db?: Database): Promise<{ file: string; score: number }[]> {
    if (documents.length === 0) return [];

    process.stderr.write(`Reranking ${documents.length} documents...\n`);
    progress.indeterminate();
    try {
      const llm = getDefaultLlamaCpp();
      const rerankDocs: RerankDocument[] = documents.map(({ file, text }) => ({
        file,
        text: text.slice(0, 4000), // Truncate to context limit
      }));
      const result = await llm.rerank(query, rerankDocs);
      return result.results.map((r) => ({ file: r.file, score: r.score }));
    } finally {
      progress.clear();
      process.stderr.write("\n");
    }
  }
  /**
   * Describe how long ago `date` was, using the largest fitting unit
   * (seconds, minutes, hours, days).
   *
   * Timestamps in the future (e.g. from clock skew) are clamped to "0s ago"
   * instead of producing a negative age, and an invalid Date yields "unknown"
   * rather than "NaNd ago".
   *
   * @param date Point in time to compare with now.
   * @returns A string such as "12s ago", "5m ago", "3h ago" or "2d ago".
   */
  function formatTimeAgo(date: Date): string {
    const elapsedMs = Date.now() - date.getTime();
    if (Number.isNaN(elapsedMs)) return "unknown";

    const seconds = Math.max(0, Math.floor(elapsedMs / 1000));
    if (seconds < 60) return `${seconds}s ago`;
    const minutes = Math.floor(seconds / 60);
    if (minutes < 60) return `${minutes}m ago`;
    const hours = Math.floor(minutes / 60);
    if (hours < 24) return `${hours}h ago`;
    const days = Math.floor(hours / 24);
    return `${days}d ago`;
  }
  /**
   * Format a byte count using binary multiples (1 KB = 1024 B).
   *
   * Values below 1024 are printed as-is with a "B" suffix; larger values are
   * scaled to KB, MB or GB with one decimal place. GB is the largest unit.
   */
  function formatBytes(bytes: number): string {
    const KIB = 1024;
    const MIB = KIB * KIB;
    const GIB = MIB * KIB;

    if (bytes < KIB) return `${bytes} B`;

    // [exclusive upper bound, divisor, suffix]
    const scaled: [number, number, string][] = [
      [MIB, KIB, "KB"],
      [GIB, MIB, "MB"],
    ];
    for (const [limit, divisor, suffix] of scaled) {
      if (bytes < limit) return `${(bytes / divisor).toFixed(1)} ${suffix}`;
    }
    return `${(bytes / GIB).toFixed(1)} GB`;
  }
  /**
   * Print an overview of the index to stdout: database path and size, document
   * and vector counts, pending embeddings, last update, and - per collection -
   * its glob pattern, file count and configured contexts, followed by a few
   * example commands. Closes the database when done.
   */
  function showStatus(): void {
    const dbPath = getDbPath();
    const db = getDb();

    // Index size on disk; best effort, stays 0 if it cannot be read.
    let indexSize = 0;
    try {
      indexSize = Bun.file(dbPath).size;
    } catch { }

    // Collections are defined in YAML; the database supplies their stats.
    const collections = listCollections(db);

    // Overall numbers.
    const countRows = (sql: string) => db.prepare(sql).get() as { count: number };
    const totalDocs = countRows(`SELECT COUNT(*) as count FROM documents WHERE active = 1`);
    const vectorCount = countRows(`SELECT COUNT(*) as count FROM content_vectors`);
    const needsEmbedding = getHashesNeedingEmbedding(db);
    const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };

    console.log(`${c.bold}QMD Status${c.reset}\n`);
    console.log(`Index: ${dbPath}`);
    console.log(`Size: ${formatBytes(indexSize)}\n`);

    console.log(`${c.bold}Documents${c.reset}`);
    console.log(` Total: ${totalDocs.count} files indexed`);
    console.log(` Vectors: ${vectorCount.count} embedded`);
    if (needsEmbedding > 0) {
      console.log(` ${c.yellow}Pending: ${needsEmbedding} need embedding${c.reset} (run 'qmd embed')`);
    }
    if (mostRecent.latest) {
      console.log(` Updated: ${formatTimeAgo(new Date(mostRecent.latest))}`);
    }

    // Bucket the YAML contexts by collection name.
    const contextsByCollection = new Map<string, { path_prefix: string; context: string }[]>();
    for (const { collection, path, context } of listAllContexts()) {
      let bucket = contextsByCollection.get(collection);
      if (!bucket) {
        bucket = [];
        contextsByCollection.set(collection, bucket);
      }
      bucket.push({ path_prefix: path, context });
    }

    if (collections.length === 0) {
      console.log(`\n${c.dim}No collections. Run 'qmd collection add .' to index markdown files.${c.reset}`);
      closeDb();
      return;
    }

    console.log(`\n${c.bold}Collections${c.reset}`);
    for (const col of collections) {
      const lastMod = col.last_modified ? formatTimeAgo(new Date(col.last_modified)) : "never";
      const contexts = contextsByCollection.get(col.name) || [];

      console.log(` ${c.cyan}${col.name}${c.reset} ${c.dim}(qmd://${col.name}/)${c.reset}`);
      console.log(` ${c.dim}Pattern:${c.reset} ${col.glob_pattern}`);
      console.log(` ${c.dim}Files:${c.reset} ${col.active_count} (updated ${lastMod})`);

      if (contexts.length === 0) continue;
      console.log(` ${c.dim}Contexts:${c.reset} ${contexts.length}`);
      for (const { path_prefix, context } of contexts) {
        // Both '' and '/' denote the collection root.
        const isRoot = path_prefix === '' || path_prefix === '/';
        const pathDisplay = isRoot ? '/' : `/${path_prefix}`;
        // Keep previews to 60 characters at most.
        const contextPreview = context.length > 60 ? `${context.substring(0, 57)}...` : context;
        console.log(` ${c.dim}${pathDisplay}:${c.reset} ${contextPreview}`);
      }
    }

    // Example commands built around the first collection (virtual paths).
    const first = collections[0];
    console.log(`\n${c.bold}Examples${c.reset}`);
    console.log(` ${c.dim}# List files in a collection${c.reset}`);
    if (first) {
      console.log(` qmd ls ${first.name}`);
    }
    console.log(` ${c.dim}# Get a document${c.reset}`);
    if (first) {
      console.log(` qmd get qmd://${first.name}/path/to/file.md`);
    }
    console.log(` ${c.dim}# Search within a collection${c.reset}`);
    if (first) {
      console.log(` qmd search "query" -c ${first.name}`);
    }

    closeDb();
  }
  /**
   * Re-index every configured collection.
   *
   * For each collection, an optional `update` command from the YAML config is
   * run through bash in the collection's directory before indexFiles() is
   * called. The process exits with the command's exit code if it fails, or
   * with 1 if it cannot be run at all.
   *
   * The command's stdout and stderr are read concurrently. Draining stdout to
   * EOF before touching stderr can deadlock when the child fills the stderr
   * pipe while still holding stdout open.
   */
  async function updateCollections(): Promise<void> {
    const db = getDb();
    // Collections are defined in YAML; no duplicate cleanup needed.

    // Drop cached results before re-indexing.
    clearCache(db);

    const collections = listCollections(db);
    if (collections.length === 0) {
      console.log(`${c.dim}No collections found. Run 'qmd collection add .' to index markdown files.${c.reset}`);
      closeDb();
      return;
    }

    // The db is intentionally left open here: per the original note,
    // indexFiles reuses it and closes it at the end.
    console.log(`${c.bold}Updating ${collections.length} collection(s)...${c.reset}\n`);

    // Indent every line of captured command output by one space.
    const indent = (text: string): string => text.trim().split('\n').map(l => ` ${l}`).join('\n');

    for (let i = 0; i < collections.length; i++) {
      const col = collections[i];
      if (!col) continue;
      console.log(`${c.cyan}[${i + 1}/${collections.length}]${c.reset} ${c.bold}${col.name}${c.reset} ${c.dim}(${col.glob_pattern})${c.reset}`);

      // Execute the custom update command if one is specified in YAML.
      // NOTE: the command string comes from the user's own configuration and is
      // executed verbatim by bash.
      const yamlCol = getCollectionFromYaml(col.name);
      if (yamlCol?.update) {
        console.log(`${c.dim} Running update command: ${yamlCol.update}${c.reset}`);
        try {
          const proc = Bun.spawn(["/usr/bin/env", "bash", "-c", yamlCol.update], {
            cwd: col.pwd,
            stdout: "pipe",
            stderr: "pipe",
          });
          const [output, errorOutput, exitCode] = await Promise.all([
            new Response(proc.stdout).text(),
            new Response(proc.stderr).text(),
            proc.exited,
          ]);
          if (output.trim()) {
            console.log(indent(output));
          }
          if (errorOutput.trim()) {
            console.log(indent(errorOutput));
          }
          if (exitCode !== 0) {
            console.log(`${c.yellow}✗ Update command failed with exit code ${exitCode}${c.reset}`);
            process.exit(exitCode);
          }
        } catch (err) {
          console.log(`${c.yellow}✗ Update command failed: ${err}${c.reset}`);
          process.exit(1);
        }
      }

      await indexFiles(col.pwd, col.glob_pattern, col.name);
      console.log("");
    }

    console.log(`${c.green}✓ All collections updated.${c.reset}`);
  }
  /**
   * Detect which collection (if any) contains the given filesystem path.
   *
   * Collections are read from the YAML configuration. When several collection
   * roots contain the path, the longest (most specific) root wins. Trailing
   * slashes on a configured root are ignored, so "/notes/" and "/notes" are
   * treated alike and "/" matches every absolute path.
   *
   * @param db     Unused here; kept for interface compatibility.
   * @param fsPath Filesystem path; resolved with getRealPath() before matching.
   * @returns `{ collectionName, relativePath }`, where `relativePath` is ''
   *          for the collection root itself, or null if no collection matches.
   */
  function detectCollectionFromPath(db: Database, fsPath: string): { collectionName: string; relativePath: string } | null {
    const realPath = getRealPath(fsPath);

    // Find the longest collection root that contains the path.
    let bestMatch: { name: string; path: string } | null = null;
    for (const coll of yamlListCollections()) {
      const root = coll.path.replace(/\/+$/, '');
      const contains = realPath === root || realPath.startsWith(`${root}/`);
      if (contains && (!bestMatch || root.length > bestMatch.path.length)) {
        bestMatch = { name: coll.name, path: root };
      }
    }
    if (!bestMatch) return null;

    // A match means the path either is the root or lies below "<root>/".
    const relativePath = realPath === bestMatch.path
      ? ''
      : realPath.slice(bestMatch.path.length + 1);

    return {
      collectionName: bestMatch.name,
      relativePath
    };
  }
  /**
   * Attach a context description to a path.
   *
   * - "/" sets the global context.
   * - A virtual path (qmd://collection/path) is stored on that collection.
   * - Anything else is treated as a filesystem path (default: the current
   *   directory; "~/" and relative paths are expanded) and must lie inside a
   *   configured collection.
   *
   * Exits the process with code 1 on an invalid virtual path, an unknown
   * collection, or a path outside every collection.
   *
   * @param pathArg     Target path, or undefined for the current directory.
   * @param contextText Context description to store.
   */
  async function contextAdd(pathArg: string | undefined, contextText: string): Promise<void> {
    const db = getDb();

    // Common tail of every successful branch: headline, echo the text, close the DB.
    const finish = (headline: string): void => {
      console.log(`${c.green}✓${c.reset} ${headline}`);
      console.log(`${c.dim}Context: ${contextText}${c.reset}`);
      closeDb();
    };

    // "/" means the global context.
    if (pathArg === '/') {
      setGlobalContext(contextText);
      finish("Set global context");
      return;
    }

    // Resolve the target; absolute and qmd:// paths are taken as given.
    const requested = pathArg || '.';
    let fsPath: string;
    if (requested === '.' || requested === './') {
      fsPath = getPwd();
    } else if (requested.startsWith('~/')) {
      fsPath = homedir() + requested.slice(1);
    } else if (requested.startsWith('/') || requested.startsWith('qmd://')) {
      fsPath = requested;
    } else {
      fsPath = resolve(getPwd(), requested);
    }

    // Virtual path: qmd://collection/path
    if (isVirtualPath(fsPath)) {
      const parsed = parseVirtualPath(fsPath);
      if (!parsed) {
        console.error(`${c.yellow}Invalid virtual path: ${fsPath}${c.reset}`);
        process.exit(1);
      }
      if (!getCollectionFromYaml(parsed.collectionName)) {
        console.error(`${c.yellow}Collection not found: ${parsed.collectionName}${c.reset}`);
        process.exit(1);
      }
      yamlAddContext(parsed.collectionName, parsed.path, contextText);

      const virtualTarget = parsed.path
        ? `qmd://${parsed.collectionName}/${parsed.path}`
        : `qmd://${parsed.collectionName}/ (collection root)`;
      finish(`Added context for: ${virtualTarget}`);
      return;
    }

    // Filesystem path: find the collection that contains it.
    const detected = detectCollectionFromPath(db, fsPath);
    if (!detected) {
      console.error(`${c.yellow}Path is not in any indexed collection: ${fsPath}${c.reset}`);
      console.error(`${c.dim}Run 'qmd status' to see indexed collections${c.reset}`);
      process.exit(1);
    }
    yamlAddContext(detected.collectionName, detected.relativePath, contextText);

    const collectionRoot = `qmd://${detected.collectionName}/`;
    const fsTarget = detected.relativePath ? `${collectionRoot}${detected.relativePath}` : collectionRoot;
    finish(`Added context for: ${fsTarget}`);
  }
  /**
   * Print every configured context, with one heading per collection.
   *
   * A heading is printed whenever the collection differs from the previous
   * entry's, so the grouping follows the order returned by listAllContexts().
   */
  function contextList(): void {
    // The handle is not read here; the store is opened and later closed.
    getDb();

    const entries = listAllContexts();
    if (entries.length === 0) {
      console.log(`${c.dim}No contexts configured. Use 'qmd context add' to add one.${c.reset}`);
      closeDb();
      return;
    }

    console.log(`\n${c.bold}Configured Contexts${c.reset}\n`);

    let currentCollection = '';
    for (const { collection, path, context } of entries) {
      if (collection !== currentCollection) {
        console.log(`${c.cyan}${collection}${c.reset}`);
        currentCollection = collection;
      }
      // An empty path denotes the collection root.
      console.log(path ? ` ${path}` : ' / (root)');
      console.log(` ${c.dim}${context}${c.reset}`);
    }

    closeDb();
  }
  /**
   * Remove a context description.
   *
   * - "/" clears the global context.
   * - A virtual path (qmd://collection/path) removes the context stored on
   *   that collection path.
   * - Anything else is treated as a filesystem path ("." / "./", "~/" and
   *   relative paths are expanded) and mapped to its containing collection.
   *
   * Prints an error and exits with code 1 when the path is invalid, the
   * collection is unknown, the path is outside every collection, or there is
   * no context to remove.
   */
  function contextRemove(pathArg: string): void {
    // Print a highlighted error and terminate.
    function fail(message: string): never {
      console.error(`${c.yellow}${message}${c.reset}`);
      process.exit(1);
    }
    const confirm = (target: string): void => {
      console.log(`${c.green}✓${c.reset} Removed context for: ${target}`);
    };

    // Global context
    if (pathArg === '/') {
      setGlobalContext(undefined);
      console.log(`${c.green}✓${c.reset} Removed global context`);
      return;
    }

    // Virtual path
    if (isVirtualPath(pathArg)) {
      const parsed = parseVirtualPath(pathArg);
      if (!parsed) return fail(`Invalid virtual path: ${pathArg}`);

      const coll = getCollectionFromYaml(parsed.collectionName);
      if (!coll) return fail(`Collection not found: ${parsed.collectionName}`);

      if (!yamlRemoveContext(coll.name, parsed.path)) {
        return fail(`No context found for: ${pathArg}`);
      }
      confirm(pathArg);
      return;
    }

    // Filesystem path; absolute paths are taken as given.
    let fsPath: string;
    if (pathArg === '.' || pathArg === './') {
      fsPath = getPwd();
    } else if (pathArg.startsWith('~/')) {
      fsPath = homedir() + pathArg.slice(1);
    } else if (pathArg.startsWith('/')) {
      fsPath = pathArg;
    } else {
      fsPath = resolve(getPwd(), pathArg);
    }

    const detected = detectCollectionFromPath(getDb(), fsPath);
    closeDb();
    if (!detected) return fail(`Path is not in any indexed collection: ${fsPath}`);

    const virtualTarget = `qmd://${detected.collectionName}/${detected.relativePath}`;
    if (!yamlRemoveContext(detected.collectionName, detected.relativePath)) {
      return fail(`No context found for: ${virtualTarget}`);
    }
    confirm(virtualTarget);
  }
  /**
   * Report where context descriptions are missing.
   *
   * Lists collections without any context, then - for collections that do have
   * one - the top-level paths still lacking context, each with a suggested
   * `qmd context add` command. Prints a confirmation when nothing is missing.
   * Closes the database when done.
   */
  function contextCheck(): void {
    const db = getDb();

    const uncovered = getCollectionsWithoutContext(db);
    const everyCollection = listCollections(db);
    const noneUncovered = uncovered.length === 0;

    if (noneUncovered && everyCollection.length > 0) {
      console.log(`\n${c.green}✓${c.reset} ${c.bold}All collections have context configured${c.reset}\n`);
    }

    if (!noneUncovered) {
      console.log(`\n${c.yellow}Collections without any context:${c.reset}\n`);
      for (const { name, doc_count } of uncovered) {
        console.log(`${c.cyan}${name}${c.reset} ${c.dim}(${doc_count} documents)${c.reset}`);
        console.log(` ${c.dim}Suggestion: qmd context add qmd://${name}/ "Description of ${name}"${c.reset}\n`);
      }
    }

    // Look at top-level paths only inside collections that already have context.
    const uncoveredNames = new Set(uncovered.map((entry) => entry.name));
    const covered = everyCollection.filter((entry) => entry && !uncoveredNames.has(entry.name));

    let hasPathSuggestions = false;
    for (const coll of covered) {
      if (!coll) continue;
      const missingPaths = getTopLevelPathsWithoutContext(db, coll.name);
      if (missingPaths.length === 0) continue;

      // Print the section heading once, before the first affected collection.
      if (!hasPathSuggestions) {
        console.log(`${c.yellow}Top-level directories without context:${c.reset}\n`);
        hasPathSuggestions = true;
      }
      console.log(`${c.cyan}${coll.name}${c.reset}`);
      for (const path of missingPaths) {
        console.log(` ${path}`);
        console.log(` ${c.dim}Suggestion: qmd context add qmd://${coll.name}/${path} "Description of ${path}"${c.reset}`);
      }
      console.log('');
    }

    if (noneUncovered && !hasPathSuggestions) {
      console.log(`${c.dim}All collections and major paths have context configured.${c.reset}`);
      console.log(`${c.dim}Use 'qmd context list' to see all configured contexts.${c.reset}\n`);
    }

    closeDb();
  }
  571. function getDocument(filename: string, fromLine?: number, maxLines?: number, lineNumbers?: boolean): void {
  572. const db = getDb();
  573. // Parse :linenum suffix from filename (e.g., "file.md:100")
  574. let inputPath = filename;
  575. const colonMatch = inputPath.match(/:(\d+)$/);
  576. if (colonMatch && !fromLine) {
  577. const matched = colonMatch[1];
  578. if (matched) {
  579. fromLine = parseInt(matched, 10);
  580. inputPath = inputPath.slice(0, -colonMatch[0].length);
  581. }
  582. }
  583. let doc: { collectionName: string; path: string; body: string } | null = null;
  584. let virtualPath: string;
  585. // Handle virtual paths (qmd://collection/path)
  586. if (isVirtualPath(inputPath)) {
  587. const parsed = parseVirtualPath(inputPath);
  588. if (!parsed) {
  589. console.error(`Invalid virtual path: ${inputPath}`);
  590. closeDb();
  591. process.exit(1);
  592. }
  593. // Try exact match on collection + path
  594. doc = db.prepare(`
  595. SELECT d.collection as collectionName, d.path, content.doc as body
  596. FROM documents d
  597. JOIN content ON content.hash = d.hash
  598. WHERE d.collection = ? AND d.path = ? AND d.active = 1
  599. `).get(parsed.collectionName, parsed.path) as typeof doc;
  600. if (!doc) {
  601. // Try fuzzy match by path ending
  602. doc = db.prepare(`
  603. SELECT d.collection as collectionName, d.path, content.doc as body
  604. FROM documents d
  605. JOIN content ON content.hash = d.hash
  606. WHERE d.collection = ? AND d.path LIKE ? AND d.active = 1
  607. LIMIT 1
  608. `).get(parsed.collectionName, `%${parsed.path}`) as typeof doc;
  609. }
  610. virtualPath = inputPath;
  611. } else {
  612. // Try to interpret as collection/path format first (before filesystem path)
  613. // If path is relative (no / or ~ prefix), check if first component is a collection name
  614. if (!inputPath.startsWith('/') && !inputPath.startsWith('~')) {
  615. const parts = inputPath.split('/');
  616. if (parts.length >= 2) {
  617. const possibleCollection = parts[0];
  618. const possiblePath = parts.slice(1).join('/');
  619. // Check if this collection exists
  620. const collExists = possibleCollection ? db.prepare(`
  621. SELECT 1 FROM documents WHERE collection = ? AND active = 1 LIMIT 1
  622. `).get(possibleCollection) : null;
  623. if (collExists) {
  624. // Try exact match on collection + path
  625. doc = db.prepare(`
  626. SELECT d.collection as collectionName, d.path, content.doc as body
  627. FROM documents d
  628. JOIN content ON content.hash = d.hash
  629. WHERE d.collection = ? AND d.path = ? AND d.active = 1
  630. `).get(possibleCollection || "", possiblePath || "") as { collectionName: string; path: string; body: string } | null;
  631. if (!doc) {
  632. // Try fuzzy match by path ending
  633. doc = db.prepare(`
  634. SELECT d.collection as collectionName, d.path, content.doc as body
  635. FROM documents d
  636. JOIN content ON content.hash = d.hash
  637. WHERE d.collection = ? AND d.path LIKE ? AND d.active = 1
  638. LIMIT 1
  639. `).get(possibleCollection || "", `%${possiblePath}`) as { collectionName: string; path: string; body: string } | null;
  640. }
  641. if (doc) {
  642. virtualPath = buildVirtualPath(doc.collectionName, doc.path);
  643. // Skip the filesystem path handling below
  644. }
  645. }
  646. }
  647. }
  648. // If not found as collection/path, handle as filesystem paths
  649. if (!doc) {
  650. let fsPath = inputPath;
  651. // Expand ~ to home directory
  652. if (fsPath.startsWith('~/')) {
  653. fsPath = homedir() + fsPath.slice(1);
  654. } else if (!fsPath.startsWith('/')) {
  655. // Relative path - resolve from current directory
  656. fsPath = resolve(getPwd(), fsPath);
  657. }
  658. fsPath = getRealPath(fsPath);
  659. // Try to detect which collection contains this path
  660. const detected = detectCollectionFromPath(db, fsPath);
  661. if (detected) {
  662. // Found collection - query by collection name + relative path
  663. doc = db.prepare(`
  664. SELECT d.collection as collectionName, d.path, content.doc as body
  665. FROM documents d
  666. JOIN content ON content.hash = d.hash
  667. WHERE d.collection = ? AND d.path = ? AND d.active = 1
  668. `).get(detected.collectionName, detected.relativePath) as { collectionName: string; path: string; body: string } | null;
  669. }
  670. // Fuzzy match by filename (last component of path)
  671. if (!doc) {
  672. const filename = inputPath.split('/').pop() || inputPath;
  673. doc = db.prepare(`
  674. SELECT d.collection as collectionName, d.path, content.doc as body
  675. FROM documents d
  676. JOIN content ON content.hash = d.hash
  677. WHERE d.path LIKE ? AND d.active = 1
  678. LIMIT 1
  679. `).get(`%${filename}`) as { collectionName: string; path: string; body: string } | null;
  680. }
  681. if (doc) {
  682. virtualPath = buildVirtualPath(doc.collectionName, doc.path);
  683. } else {
  684. virtualPath = inputPath;
  685. }
  686. }
  687. }
  688. // Ensure doc is not null before proceeding
  689. if (!doc) {
  690. console.error(`Document not found: ${filename}`);
  691. closeDb();
  692. process.exit(1);
  693. }
  694. // Get context for this file
  695. const context = getContextForPath(db, doc.collectionName, doc.path);
  696. let output = doc.body;
  697. const startLine = fromLine || 1;
  698. // Apply line filtering if specified
  699. if (fromLine !== undefined || maxLines !== undefined) {
  700. const lines = output.split('\n');
  701. const start = startLine - 1; // Convert to 0-indexed
  702. const end = maxLines !== undefined ? start + maxLines : lines.length;
  703. output = lines.slice(start, end).join('\n');
  704. }
  705. // Add line numbers if requested
  706. if (lineNumbers) {
  707. output = addLineNumbers(output, startLine);
  708. }
  709. // Output context header if exists
  710. if (context) {
  711. console.log(`Folder Context: ${context}\n---\n`);
  712. }
  713. console.log(output);
  714. closeDb();
  715. }
  716. // Multi-get: fetch multiple documents by glob pattern or comma-separated list
  717. function multiGet(pattern: string, maxLines?: number, maxBytes: number = DEFAULT_MULTI_GET_MAX_BYTES, format: OutputFormat = "cli"): void {
  718. const db = getDb();
  719. // Check if it's a comma-separated list or a glob pattern
  720. const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?');
  721. let files: { filepath: string; displayPath: string; bodyLength: number; collection?: string; path?: string }[];
  722. if (isCommaSeparated) {
  723. // Comma-separated list of files (can be virtual paths or relative paths)
  724. const names = pattern.split(',').map(s => s.trim()).filter(Boolean);
  725. files = [];
  726. for (const name of names) {
  727. let doc: { virtual_path: string; body_length: number; collection: string; path: string } | null = null;
  728. // Handle virtual paths
  729. if (isVirtualPath(name)) {
  730. const parsed = parseVirtualPath(name);
  731. if (parsed) {
  732. // Try exact match on collection + path
  733. doc = db.prepare(`
  734. SELECT
  735. 'qmd://' || d.collection || '/' || d.path as virtual_path,
  736. LENGTH(content.doc) as body_length,
  737. d.collection,
  738. d.path
  739. FROM documents d
  740. JOIN content ON content.hash = d.hash
  741. WHERE d.collection = ? AND d.path = ? AND d.active = 1
  742. `).get(parsed.collectionName, parsed.path) as typeof doc;
  743. }
  744. } else {
  745. // Try exact match on path
  746. doc = db.prepare(`
  747. SELECT
  748. 'qmd://' || d.collection || '/' || d.path as virtual_path,
  749. LENGTH(content.doc) as body_length,
  750. d.collection,
  751. d.path
  752. FROM documents d
  753. JOIN content ON content.hash = d.hash
  754. WHERE d.path = ? AND d.active = 1
  755. LIMIT 1
  756. `).get(name) as { virtual_path: string; body_length: number; collection: string; path: string } | null;
  757. // Try suffix match
  758. if (!doc) {
  759. doc = db.prepare(`
  760. SELECT
  761. 'qmd://' || d.collection || '/' || d.path as virtual_path,
  762. LENGTH(content.doc) as body_length,
  763. d.collection,
  764. d.path
  765. FROM documents d
  766. JOIN content ON content.hash = d.hash
  767. WHERE d.path LIKE ? AND d.active = 1
  768. LIMIT 1
  769. `).get(`%${name}`) as { virtual_path: string; body_length: number; collection: string; path: string } | null;
  770. }
  771. }
  772. if (doc) {
  773. files.push({
  774. filepath: doc.virtual_path,
  775. displayPath: doc.virtual_path,
  776. bodyLength: doc.body_length,
  777. collection: doc.collection,
  778. path: doc.path
  779. });
  780. } else {
  781. console.error(`File not found: ${name}`);
  782. }
  783. }
  784. } else {
  785. // Glob pattern - matchFilesByGlob now returns virtual paths
  786. files = matchFilesByGlob(db, pattern).map(f => ({
  787. ...f,
  788. collection: undefined, // Will be fetched later if needed
  789. path: undefined
  790. }));
  791. if (files.length === 0) {
  792. console.error(`No files matched pattern: ${pattern}`);
  793. closeDb();
  794. process.exit(1);
  795. }
  796. }
  797. // Collect results for structured output
  798. const results: { file: string; displayPath: string; title: string; body: string; context: string | null; skipped: boolean; skipReason?: string }[] = [];
  799. for (const file of files) {
  800. // Parse virtual path to get collection info if not already available
  801. let collection = file.collection;
  802. let path = file.path;
  803. if (!collection || !path) {
  804. const parsed = parseVirtualPath(file.filepath);
  805. if (parsed) {
  806. collection = parsed.collectionName;
  807. path = parsed.path;
  808. }
  809. }
  810. // Get context using collection-scoped function
  811. const context = collection && path ? getContextForPath(db, collection, path) : null;
  812. // Check size limit
  813. if (file.bodyLength > maxBytes) {
  814. results.push({
  815. file: file.filepath,
  816. displayPath: file.displayPath,
  817. title: file.displayPath.split('/').pop() || file.displayPath,
  818. body: "",
  819. context,
  820. skipped: true,
  821. skipReason: `File too large (${Math.round(file.bodyLength / 1024)}KB > ${Math.round(maxBytes / 1024)}KB). Use 'qmd get ${file.displayPath}' to retrieve.`,
  822. });
  823. continue;
  824. }
  825. // Fetch document content using collection and path
  826. if (!collection || !path) continue;
  827. const doc = db.prepare(`
  828. SELECT content.doc as body, d.title
  829. FROM documents d
  830. JOIN content ON content.hash = d.hash
  831. WHERE d.collection = ? AND d.path = ? AND d.active = 1
  832. `).get(collection, path) as { body: string; title: string } | null;
  833. if (!doc) continue;
  834. let body = doc.body;
  835. // Apply line limit if specified
  836. if (maxLines !== undefined) {
  837. const lines = body.split('\n');
  838. body = lines.slice(0, maxLines).join('\n');
  839. if (lines.length > maxLines) {
  840. body += `\n\n[... truncated ${lines.length - maxLines} more lines]`;
  841. }
  842. }
  843. results.push({
  844. file: file.filepath,
  845. displayPath: file.displayPath,
  846. title: doc.title || file.displayPath.split('/').pop() || file.displayPath,
  847. body,
  848. context,
  849. skipped: false,
  850. });
  851. }
  852. closeDb();
  853. // Output based on format
  854. if (format === "json") {
  855. const output = results.map(r => ({
  856. file: r.displayPath,
  857. title: r.title,
  858. ...(r.context && { context: r.context }),
  859. ...(r.skipped ? { skipped: true, reason: r.skipReason } : { body: r.body }),
  860. }));
  861. console.log(JSON.stringify(output, null, 2));
  862. } else if (format === "csv") {
  863. const escapeField = (val: string | null | undefined): string => {
  864. if (val === null || val === undefined) return "";
  865. const str = String(val);
  866. if (str.includes(",") || str.includes('"') || str.includes("\n")) {
  867. return `"${str.replace(/"/g, '""')}"`;
  868. }
  869. return str;
  870. };
  871. console.log("file,title,context,skipped,body");
  872. for (const r of results) {
  873. console.log([r.displayPath, r.title, r.context, r.skipped ? "true" : "false", r.skipped ? r.skipReason : r.body].map(escapeField).join(","));
  874. }
  875. } else if (format === "files") {
  876. for (const r of results) {
  877. const ctx = r.context ? `,"${r.context.replace(/"/g, '""')}"` : "";
  878. const status = r.skipped ? "[SKIPPED]" : "";
  879. console.log(`${r.displayPath}${ctx}${status ? `,${status}` : ""}`);
  880. }
  881. } else if (format === "md") {
  882. for (const r of results) {
  883. console.log(`## ${r.displayPath}\n`);
  884. if (r.title && r.title !== r.displayPath) console.log(`**Title:** ${r.title}\n`);
  885. if (r.context) console.log(`**Context:** ${r.context}\n`);
  886. if (r.skipped) {
  887. console.log(`> ${r.skipReason}\n`);
  888. } else {
  889. console.log("```");
  890. console.log(r.body);
  891. console.log("```\n");
  892. }
  893. }
  894. } else if (format === "xml") {
  895. console.log('<?xml version="1.0" encoding="UTF-8"?>');
  896. console.log("<documents>");
  897. for (const r of results) {
  898. console.log(" <document>");
  899. console.log(` <file>${escapeXml(r.displayPath)}</file>`);
  900. console.log(` <title>${escapeXml(r.title)}</title>`);
  901. if (r.context) console.log(` <context>${escapeXml(r.context)}</context>`);
  902. if (r.skipped) {
  903. console.log(` <skipped>true</skipped>`);
  904. console.log(` <reason>${escapeXml(r.skipReason || "")}</reason>`);
  905. } else {
  906. console.log(` <body>${escapeXml(r.body)}</body>`);
  907. }
  908. console.log(" </document>");
  909. }
  910. console.log("</documents>");
  911. } else {
  912. // CLI format (default)
  913. for (const r of results) {
  914. console.log(`\n${'='.repeat(60)}`);
  915. console.log(`File: ${r.displayPath}`);
  916. console.log(`${'='.repeat(60)}\n`);
  917. if (r.skipped) {
  918. console.log(`[SKIPPED: ${r.skipReason}]`);
  919. continue;
  920. }
  921. if (r.context) {
  922. console.log(`Folder Context: ${r.context}\n---\n`);
  923. }
  924. console.log(r.body);
  925. }
  926. }
  927. }
  928. // List files in virtual file tree
  929. function listFiles(pathArg?: string): void {
  930. const db = getDb();
  931. if (!pathArg) {
  932. // No argument - list all collections
  933. const yamlCollections = yamlListCollections();
  934. if (yamlCollections.length === 0) {
  935. console.log("No collections found. Run 'qmd add .' to index files.");
  936. closeDb();
  937. return;
  938. }
  939. // Get file counts from database for each collection
  940. const collections = yamlCollections.map(coll => {
  941. const stats = db.prepare(`
  942. SELECT COUNT(*) as file_count
  943. FROM documents d
  944. WHERE d.collection = ? AND d.active = 1
  945. `).get(coll.name) as { file_count: number } | null;
  946. return {
  947. name: coll.name,
  948. file_count: stats?.file_count || 0
  949. };
  950. });
  951. console.log(`${c.bold}Collections:${c.reset}\n`);
  952. for (const coll of collections) {
  953. console.log(` ${c.dim}qmd://${c.reset}${c.cyan}${coll.name}/${c.reset} ${c.dim}(${coll.file_count} files)${c.reset}`);
  954. }
  955. closeDb();
  956. return;
  957. }
  958. // Parse the path argument
  959. let collectionName: string;
  960. let pathPrefix: string | null = null;
  961. if (pathArg.startsWith('qmd://')) {
  962. // Virtual path format: qmd://collection/path
  963. const parsed = parseVirtualPath(pathArg);
  964. if (!parsed) {
  965. console.error(`Invalid virtual path: ${pathArg}`);
  966. closeDb();
  967. process.exit(1);
  968. }
  969. collectionName = parsed.collectionName;
  970. pathPrefix = parsed.path;
  971. } else {
  972. // Just collection name or collection/path
  973. const parts = pathArg.split('/');
  974. collectionName = parts[0] || '';
  975. if (parts.length > 1) {
  976. pathPrefix = parts.slice(1).join('/');
  977. }
  978. }
  979. // Get the collection
  980. const coll = getCollectionFromYaml(collectionName);
  981. if (!coll) {
  982. console.error(`Collection not found: ${collectionName}`);
  983. console.error(`Run 'qmd ls' to see available collections.`);
  984. closeDb();
  985. process.exit(1);
  986. }
  987. // List files in the collection with size and modification time
  988. let query: string;
  989. let params: any[];
  990. if (pathPrefix) {
  991. // List files under a specific path
  992. query = `
  993. SELECT d.path, d.title, d.modified_at, LENGTH(ct.doc) as size
  994. FROM documents d
  995. JOIN content ct ON d.hash = ct.hash
  996. WHERE d.collection = ? AND d.path LIKE ? AND d.active = 1
  997. ORDER BY d.path
  998. `;
  999. params = [coll.name, `${pathPrefix}%`];
  1000. } else {
  1001. // List all files in the collection
  1002. query = `
  1003. SELECT d.path, d.title, d.modified_at, LENGTH(ct.doc) as size
  1004. FROM documents d
  1005. JOIN content ct ON d.hash = ct.hash
  1006. WHERE d.collection = ? AND d.active = 1
  1007. ORDER BY d.path
  1008. `;
  1009. params = [coll.name];
  1010. }
  1011. const files = db.prepare(query).all(...params) as { path: string; title: string; modified_at: string; size: number }[];
  1012. if (files.length === 0) {
  1013. if (pathPrefix) {
  1014. console.log(`No files found under qmd://${collectionName}/${pathPrefix}`);
  1015. } else {
  1016. console.log(`No files found in collection: ${collectionName}`);
  1017. }
  1018. closeDb();
  1019. return;
  1020. }
  1021. // Calculate max widths for alignment
  1022. const maxSize = Math.max(...files.map(f => formatBytes(f.size).length));
  1023. // Output in ls -l style
  1024. for (const file of files) {
  1025. const sizeStr = formatBytes(file.size).padStart(maxSize);
  1026. const date = new Date(file.modified_at);
  1027. const timeStr = formatLsTime(date);
  1028. // Dim the qmd:// prefix, highlight the filename
  1029. console.log(`${sizeStr} ${timeStr} ${c.dim}qmd://${collectionName}/${c.reset}${c.cyan}${file.path}${c.reset}`);
  1030. }
  1031. closeDb();
  1032. }
  1033. // Format date/time like ls -l
  1034. function formatLsTime(date: Date): string {
  1035. const now = new Date();
  1036. const sixMonthsAgo = new Date(now.getTime() - 6 * 30 * 24 * 60 * 60 * 1000);
  1037. const months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];
  1038. const month = months[date.getMonth()];
  1039. const day = date.getDate().toString().padStart(2, ' ');
  1040. // If file is older than 6 months, show year instead of time
  1041. if (date < sixMonthsAgo) {
  1042. const year = date.getFullYear();
  1043. return `${month} ${day} ${year}`;
  1044. } else {
  1045. const hours = date.getHours().toString().padStart(2, '0');
  1046. const minutes = date.getMinutes().toString().padStart(2, '0');
  1047. return `${month} ${day} ${hours}:${minutes}`;
  1048. }
  1049. }
  1050. // Collection management commands
  1051. function collectionList(): void {
  1052. const db = getDb();
  1053. const collections = listCollections(db);
  1054. if (collections.length === 0) {
  1055. console.log("No collections found. Run 'qmd add .' to create one.");
  1056. closeDb();
  1057. return;
  1058. }
  1059. console.log(`${c.bold}Collections (${collections.length}):${c.reset}\n`);
  1060. for (const coll of collections) {
  1061. const updatedAt = coll.last_modified ? new Date(coll.last_modified) : new Date();
  1062. const timeAgo = formatTimeAgo(updatedAt);
  1063. console.log(`${c.cyan}${coll.name}${c.reset} ${c.dim}(qmd://${coll.name}/)${c.reset}`);
  1064. console.log(` ${c.dim}Pattern:${c.reset} ${coll.glob_pattern}`);
  1065. console.log(` ${c.dim}Files:${c.reset} ${coll.active_count}`);
  1066. console.log(` ${c.dim}Updated:${c.reset} ${timeAgo}`);
  1067. console.log();
  1068. }
  1069. closeDb();
  1070. }
  1071. async function collectionAdd(pwd: string, globPattern: string, name?: string): Promise<void> {
  1072. // If name not provided, generate from pwd basename
  1073. let collName = name;
  1074. if (!collName) {
  1075. const parts = pwd.split('/').filter(Boolean);
  1076. collName = parts[parts.length - 1] || 'root';
  1077. }
  1078. // Check if collection with this name already exists in YAML
  1079. const existing = getCollectionFromYaml(collName);
  1080. if (existing) {
  1081. console.error(`${c.yellow}Collection '${collName}' already exists.${c.reset}`);
  1082. console.error(`Use a different name with --name <name>`);
  1083. process.exit(1);
  1084. }
  1085. // Check if a collection with this pwd+glob already exists in YAML
  1086. const allCollections = yamlListCollections();
  1087. const existingPwdGlob = allCollections.find(c => c.path === pwd && c.pattern === globPattern);
  1088. if (existingPwdGlob) {
  1089. console.error(`${c.yellow}A collection already exists for this path and pattern:${c.reset}`);
  1090. console.error(` Name: ${existingPwdGlob.name} (qmd://${existingPwdGlob.name}/)`);
  1091. console.error(` Pattern: ${globPattern}`);
  1092. console.error(`\nUse 'qmd update' to re-index it, or remove it first with 'qmd collection remove ${existingPwdGlob.name}'`);
  1093. process.exit(1);
  1094. }
  1095. // Add to YAML config
  1096. const { addCollection } = await import("./collections.js");
  1097. addCollection(collName, pwd, globPattern);
  1098. // Create the collection and index files
  1099. console.log(`Creating collection '${collName}'...`);
  1100. await indexFiles(pwd, globPattern, collName);
  1101. console.log(`${c.green}✓${c.reset} Collection '${collName}' created successfully`);
  1102. }
  1103. function collectionRemove(name: string): void {
  1104. // Check if collection exists in YAML
  1105. const coll = getCollectionFromYaml(name);
  1106. if (!coll) {
  1107. console.error(`${c.yellow}Collection not found: ${name}${c.reset}`);
  1108. console.error(`Run 'qmd collection list' to see available collections.`);
  1109. process.exit(1);
  1110. }
  1111. const db = getDb();
  1112. const result = removeCollection(db, name);
  1113. closeDb();
  1114. console.log(`${c.green}✓${c.reset} Removed collection '${name}'`);
  1115. console.log(` Deleted ${result.deletedDocs} documents`);
  1116. if (result.cleanedHashes > 0) {
  1117. console.log(` Cleaned up ${result.cleanedHashes} orphaned content hashes`);
  1118. }
  1119. }
  1120. function collectionRename(oldName: string, newName: string): void {
  1121. // Check if old collection exists in YAML
  1122. const coll = getCollectionFromYaml(oldName);
  1123. if (!coll) {
  1124. console.error(`${c.yellow}Collection not found: ${oldName}${c.reset}`);
  1125. console.error(`Run 'qmd collection list' to see available collections.`);
  1126. process.exit(1);
  1127. }
  1128. // Check if new name already exists in YAML
  1129. const existing = getCollectionFromYaml(newName);
  1130. if (existing) {
  1131. console.error(`${c.yellow}Collection name already exists: ${newName}${c.reset}`);
  1132. console.error(`Choose a different name or remove the existing collection first.`);
  1133. process.exit(1);
  1134. }
  1135. const db = getDb();
  1136. renameCollection(db, oldName, newName);
  1137. closeDb();
  1138. console.log(`${c.green}✓${c.reset} Renamed collection '${oldName}' to '${newName}'`);
  1139. console.log(` Virtual paths updated: ${c.cyan}qmd://${oldName}/${c.reset} → ${c.cyan}qmd://${newName}/${c.reset}`);
  1140. }
  1141. async function indexFiles(pwd?: string, globPattern: string = DEFAULT_GLOB, collectionName?: string): Promise<void> {
  1142. const db = getDb();
  1143. const resolvedPwd = pwd || getPwd();
  1144. const now = new Date().toISOString();
  1145. const excludeDirs = ["node_modules", ".git", ".cache", "vendor", "dist", "build"];
  1146. // Clear Ollama cache on index
  1147. clearCache(db);
  1148. // Collection name must be provided (from YAML)
  1149. if (!collectionName) {
  1150. throw new Error("Collection name is required. Collections must be defined in ~/.config/qmd/index.yml");
  1151. }
  1152. console.log(`Collection: ${resolvedPwd} (${globPattern})`);
  1153. progress.indeterminate();
  1154. const glob = new Glob(globPattern);
  1155. const files: string[] = [];
  1156. for await (const file of glob.scan({ cwd: resolvedPwd, onlyFiles: true, followSymlinks: true })) {
  1157. // Skip node_modules, hidden folders (.*), and other common excludes
  1158. const parts = file.split("/");
  1159. const shouldSkip = parts.some(part =>
  1160. part === "node_modules" ||
  1161. part.startsWith(".") ||
  1162. excludeDirs.includes(part)
  1163. );
  1164. if (!shouldSkip) {
  1165. files.push(file);
  1166. }
  1167. }
  1168. const total = files.length;
  1169. if (total === 0) {
  1170. progress.clear();
  1171. console.log("No files found matching pattern.");
  1172. closeDb();
  1173. return;
  1174. }
  1175. let indexed = 0, updated = 0, unchanged = 0, processed = 0;
  1176. const seenPaths = new Set<string>();
  1177. const startTime = Date.now();
  1178. for (const relativeFile of files) {
  1179. const filepath = getRealPath(resolve(resolvedPwd, relativeFile));
  1180. const path = handelize(relativeFile); // Normalize path for token-friendliness
  1181. seenPaths.add(path);
  1182. const content = await Bun.file(filepath).text();
  1183. // Skip empty files - nothing useful to index
  1184. if (!content.trim()) {
  1185. processed++;
  1186. continue;
  1187. }
  1188. const hash = await hashContent(content);
  1189. const title = extractTitle(content, relativeFile);
  1190. // Check if document exists in this collection with this path
  1191. const existing = findActiveDocument(db, collectionName, path);
  1192. if (existing) {
  1193. if (existing.hash === hash) {
  1194. // Hash unchanged, but check if title needs updating
  1195. if (existing.title !== title) {
  1196. updateDocumentTitle(db, existing.id, title, now);
  1197. updated++;
  1198. } else {
  1199. unchanged++;
  1200. }
  1201. } else {
  1202. // Content changed - insert new content hash and update document
  1203. insertContent(db, hash, content, now);
  1204. const stat = await Bun.file(filepath).stat();
  1205. updateDocument(db, existing.id, title, hash,
  1206. stat ? new Date(stat.mtime).toISOString() : now);
  1207. updated++;
  1208. }
  1209. } else {
  1210. // New document - insert content and document
  1211. indexed++;
  1212. insertContent(db, hash, content, now);
  1213. const stat = await Bun.file(filepath).stat();
  1214. insertDocument(db, collectionName, path, title, hash,
  1215. stat ? new Date(stat.birthtime).toISOString() : now,
  1216. stat ? new Date(stat.mtime).toISOString() : now);
  1217. }
  1218. processed++;
  1219. progress.set((processed / total) * 100);
  1220. const elapsed = (Date.now() - startTime) / 1000;
  1221. const rate = processed / elapsed;
  1222. const remaining = (total - processed) / rate;
  1223. const eta = processed > 2 ? ` ETA: ${formatETA(remaining)}` : "";
  1224. process.stderr.write(`\rIndexing: ${processed}/${total}${eta} `);
  1225. }
  1226. // Deactivate documents in this collection that no longer exist
  1227. const allActive = getActiveDocumentPaths(db, collectionName);
  1228. let removed = 0;
  1229. for (const path of allActive) {
  1230. if (!seenPaths.has(path)) {
  1231. deactivateDocument(db, collectionName, path);
  1232. removed++;
  1233. }
  1234. }
  1235. // Clean up orphaned content hashes (content not referenced by any document)
  1236. const orphanedContent = cleanupOrphanedContent(db);
  1237. // Check if vector index needs updating
  1238. const needsEmbedding = getHashesNeedingEmbedding(db);
  1239. progress.clear();
  1240. console.log(`\nIndexed: ${indexed} new, ${updated} updated, ${unchanged} unchanged, ${removed} removed`);
  1241. if (orphanedContent > 0) {
  1242. console.log(`Cleaned up ${orphanedContent} orphaned content hash(es)`);
  1243. }
  1244. if (needsEmbedding > 0) {
  1245. console.log(`\nRun 'qmd embed' to update embeddings (${needsEmbedding} unique hashes need vectors)`);
  1246. }
  1247. closeDb();
  1248. }
  1249. function renderProgressBar(percent: number, width: number = 30): string {
  1250. const filled = Math.round((percent / 100) * width);
  1251. const empty = width - filled;
  1252. const bar = "█".repeat(filled) + "░".repeat(empty);
  1253. return bar;
  1254. }
  /**
   * Generate and store embeddings for every content hash that still needs one.
   *
   * Optionally clears existing vectors, splits each pending document into
   * chunks, embeds the chunks in batches (falling back to one request per chunk
   * when a batch call fails) and inserts the vectors, drawing a progress bar on
   * stderr while it works.
   *
   * The database is closed and the terminal cursor/progress indicator restored
   * even when embedding throws.
   *
   * @param model - Model name passed to insertEmbedding with each vector and shown in the log.
   * @param force - When true, all existing embeddings are cleared first.
   * @throws If the first chunk cannot be embedded (its length sizes the vector table).
   */
  async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = false): Promise<void> {
    const db = getDb();
    try {
      const now = new Date().toISOString();

      // If force, clear all vectors
      if (force) {
        console.log(`${c.yellow}Force re-indexing: clearing all vectors...${c.reset}`);
        clearAllEmbeddings(db);
      }

      // Find unique hashes that need embedding (from active documents)
      const hashesToEmbed = getHashesForEmbedding(db);
      if (hashesToEmbed.length === 0) {
        console.log(`${c.green}✓ All content hashes already have embeddings.${c.reset}`);
        return;
      }

      // Prepare documents with chunks
      type ChunkItem = { hash: string; title: string; text: string; seq: number; pos: number; tokens: number; bytes: number; displayName: string };
      const allChunks: ChunkItem[] = [];
      let multiChunkDocs = 0;
      const encoder = new TextEncoder(); // one encoder is enough for every document

      // Chunk all documents using actual token counts
      process.stderr.write(`Chunking ${hashesToEmbed.length} documents by token count...\n`);
      for (const item of hashesToEmbed) {
        const bodyBytes = encoder.encode(item.body).length;
        if (bodyBytes === 0) continue; // Skip empty

        const title = extractTitle(item.body, item.path);
        const displayName = item.path;
        const chunks = await chunkDocumentByTokens(item.body); // Uses actual tokenizer
        if (chunks.length > 1) multiChunkDocs++;

        chunks.forEach((chunk, seq) => {
          allChunks.push({
            hash: item.hash,
            title,
            text: chunk.text,
            seq,
            pos: chunk.pos,
            tokens: chunk.tokens,
            bytes: encoder.encode(chunk.text).length,
            displayName,
          });
        });
      }

      if (allChunks.length === 0) {
        console.log(`${c.green}✓ No non-empty documents to embed.${c.reset}`);
        return;
      }

      const totalBytes = allChunks.reduce((sum, chunk) => sum + chunk.bytes, 0);
      const totalChunks = allChunks.length;
      const totalDocs = hashesToEmbed.length;

      console.log(`${c.bold}Embedding ${totalDocs} documents${c.reset} ${c.dim}(${totalChunks} chunks, ${formatBytes(totalBytes)})${c.reset}`);
      if (multiChunkDocs > 0) {
        console.log(`${c.dim}${multiChunkDocs} documents split into multiple chunks${c.reset}`);
      }
      console.log(`${c.dim}Model: ${model}${c.reset}\n`);

      let chunksEmbedded = 0;
      let errors = 0;
      let bytesProcessed = 0;
      let startTime = Date.now();

      // Hide cursor during embedding; the finally below always restores it
      cursor.hide();
      try {
        // Get embedding dimensions from first chunk
        progress.indeterminate();
        const llm = getDefaultLlamaCpp();
        const firstChunk = allChunks[0];
        if (!firstChunk) {
          throw new Error("No chunks available to embed");
        }
        const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title);
        const firstResult = await llm.embed(firstText);
        if (!firstResult) {
          throw new Error("Failed to get embedding dimensions from first chunk");
        }
        ensureVecTable(db, firstResult.embedding.length);

        startTime = Date.now();

        // A failed batch call is not fatal: null tells the loop below to embed
        // the chunks of that batch one by one instead.
        const embedBatchOrNull = async (texts: string[]) => {
          try {
            return await llm.embedBatch(texts);
          } catch {
            return null;
          }
        };

        // Batch embedding for better throughput
        // Process in batches of 32 to balance memory usage and efficiency
        const BATCH_SIZE = 32;
        for (let batchStart = 0; batchStart < allChunks.length; batchStart += BATCH_SIZE) {
          const batch = allChunks.slice(batchStart, batchStart + BATCH_SIZE);
          const texts = batch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title));
          const batchEmbeddings = await embedBatchOrNull(texts);

          // Each chunk is inserted and counted exactly once, so a failure on one
          // chunk can neither re-insert its neighbours nor double-count bytes.
          for (let i = 0; i < batch.length; i++) {
            const chunk = batch[i]!;
            try {
              const result = batchEmbeddings ? batchEmbeddings[i] : await llm.embed(texts[i]!);
              if (result) {
                insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
                chunksEmbedded++;
              } else {
                errors++;
                console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}${c.reset}`);
              }
            } catch (chunkErr) {
              errors++;
              console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}: ${chunkErr}${c.reset}`);
            }
            bytesProcessed += chunk.bytes;
          }

          const percent = totalBytes > 0 ? (bytesProcessed / totalBytes) * 100 : 100;
          progress.set(percent);

          // Guard the divisions: the first batch can finish within the same millisecond
          const elapsed = (Date.now() - startTime) / 1000;
          const bytesPerSec = elapsed > 0 ? bytesProcessed / elapsed : 0;
          const remainingBytes = totalBytes - bytesProcessed;
          const etaSec = bytesPerSec > 0 ? remainingBytes / bytesPerSec : 0;

          const bar = renderProgressBar(percent);
          const percentStr = percent.toFixed(0).padStart(3);
          const throughput = `${formatBytes(bytesPerSec)}/s`;
          const eta = elapsed > 2 ? formatETA(etaSec) : "...";
          const errStr = errors > 0 ? ` ${c.yellow}${errors} err${c.reset}` : "";

          process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}%${c.reset} ${c.dim}${chunksEmbedded}/${totalChunks}${c.reset}${errStr} ${c.dim}${throughput} ETA ${eta}${c.reset} `);
        }
      } finally {
        progress.clear();
        cursor.show();
      }

      const totalTimeSec = (Date.now() - startTime) / 1000;
      const avgThroughput = formatBytes(totalTimeSec > 0 ? totalBytes / totalTimeSec : 0);

      console.log(`\r${c.green}${renderProgressBar(100)}${c.reset} ${c.bold}100%${c.reset} `);
      console.log(`\n${c.green}✓ Done!${c.reset} Embedded ${c.bold}${chunksEmbedded}${c.reset} chunks from ${c.bold}${totalDocs}${c.reset} documents in ${c.bold}${formatETA(totalTimeSec)}${c.reset} ${c.dim}(${avgThroughput}/s)${c.reset}`);
      if (errors > 0) {
        console.log(`${c.yellow}⚠ ${errors} chunks failed${c.reset}`);
      }
    } finally {
      closeDb();
    }
  }
  /**
   * Sanitize a single term for FTS5: strip everything except letters, digits,
   * underscores and apostrophes (kept for contractions like "don't").
   *
   * Uses Unicode property classes so non-ASCII words ("café", "日本語") survive;
   * an ASCII-only `\w` would silently delete their letters.
   *
   * @param term - One whitespace-delimited piece of the user's query.
   * @returns The cleaned term, possibly empty.
   */
  function sanitizeFTS5Term(term: string): string {
    return term.replace(/[^\p{L}\p{N}_']/gu, '');
  }

  /**
   * Build an FTS5 MATCH expression: phrase-aware with fallback to individual terms.
   *
   * @param query - Raw user query.
   * @returns "" when no usable term remains, a single quoted term when there is
   *   exactly one, otherwise `(phrase) OR (NEAR(...)) OR (term OR term ...)`.
   */
  function buildFTS5Query(query: string): string {
    // Double any embedded quote so the value stays one FTS5 string
    const quote = (value: string) => `"${value.replace(/"/g, '""')}"`;

    // Sanitize the full query for phrase matching (whitespace is preserved here)
    const sanitizedQuery = query.replace(/[^\p{L}\p{N}_\s']/gu, '').trim();

    const terms = query
      .split(/\s+/)
      .map(term => sanitizeFTS5Term(term))
      .filter(term => term.length >= 2); // Skip single chars and empty

    if (terms.length === 0) return "";
    if (terms.length === 1) return quote(terms[0]!);

    // Strategy: exact phrase OR proximity match OR individual terms
    const phrase = quote(sanitizedQuery);
    const quotedTerms = terms.map(quote);
    // FTS5 NEAR syntax: NEAR(term1 term2, distance)
    const nearPhrase = `NEAR(${quotedTerms.join(' ')}, 10)`;
    const orTerms = quotedTerms.join(' OR ');

    // Exact phrase > proximity > any term
    return `(${phrase}) OR (${nearPhrase}) OR (${orTerms})`;
  }
  /**
   * Squash a raw BM25 score into (0, 1) with a logistic curve, higher = better.
   *
   * Only the magnitude of the score is used (per the original note, SQLite
   * reports BM25 as negative numbers where lower is better). The curve is
   * centred on a magnitude of 5 and scaled by 3.
   *
   * @param score - Raw BM25 score.
   * @returns Normalized relevance between 0 and 1.
   */
  function normalizeBM25(score: number): number {
    const MIDPOINT = 5;
    const SCALE = 3;
    const magnitude = Math.abs(score);
    const exponent = (MIDPOINT - magnitude) / SCALE;
    return 1 / (1 + Math.exp(exponent));
  }
  /**
   * Min-max normalize result scores into the 0-1 range.
   *
   * @param results - Results to rescale; the array and its items are not mutated.
   * @returns New result objects with `score` mapped so the lowest becomes 0 and
   *   the highest 1. When all scores are equal every score becomes 0. An empty
   *   input is returned as-is.
   */
  function normalizeScores(results: SearchResult[]): SearchResult[] {
    if (results.length === 0) return results;

    // Fold instead of Math.max(...scores): spreading a very large array as call
    // arguments overflows the stack.
    let maxScore = -Infinity;
    let minScore = Infinity;
    for (const { score } of results) {
      maxScore = Math.max(maxScore, score);
      minScore = Math.min(minScore, score);
    }

    const range = maxScore - minScore || 1; // avoid dividing by zero when all scores match
    return results.map(r => ({ ...r, score: (r.score - minScore) / range }));
  }
  /**
   * Reciprocal Rank Fusion: merge several ranked lists into one.
   *
   * Every appearance of a document contributes `weight / (k + rank + 1)`
   * (rank is 0-based) to its fused score. Display fields are taken from the
   * first list in which the document appears. A document whose best rank in
   * any list is #1 gets +0.05, and one that reached the top three gets +0.02,
   * so exact matches are not diluted by expansion queries.
   *
   * @param resultLists - Ranked lists, best result first.
   * @param weights - Weight per list, matched by index (missing entries default to 1.0).
   * @param k - RRF smoothing constant (default 60).
   * @returns Fused results sorted by descending score.
   */
  function reciprocalRankFusion(
    resultLists: RankedResult[][],
    weights: number[] = [],
    k: number = 60
  ): RankedResult[] {
    type FusedEntry = { score: number; displayPath: string; title: string; body: string; bestRank: number };
    const fusedByFile = new Map<string, FusedEntry>();

    resultLists.forEach((list, listIdx) => {
      if (!list) return;
      const listWeight = weights[listIdx] ?? 1.0;

      list.forEach((doc, rank) => {
        if (!doc) return;
        const contribution = listWeight / (k + rank + 1);
        const entry = fusedByFile.get(doc.file);
        if (!entry) {
          fusedByFile.set(doc.file, {
            score: contribution,
            displayPath: doc.displayPath,
            title: doc.title,
            body: doc.body,
            bestRank: rank,
          });
          return;
        }
        entry.score += contribution;
        entry.bestRank = Math.min(entry.bestRank, rank);
      });
    });

    const fused: RankedResult[] = [];
    for (const [file, entry] of fusedByFile) {
      // Top-rank bonus: #1 somewhere, else top-3 somewhere, else nothing
      const bonus = entry.bestRank === 0 ? 0.05 : entry.bestRank <= 2 ? 0.02 : 0;
      fused.push({
        file,
        displayPath: entry.displayPath,
        title: entry.title,
        body: entry.body,
        score: entry.score + bonus,
      });
    }
    fused.sort((left, right) => right.score - left.score);
    return fused;
  }
  /** Options shared by the search commands and the result printer. */
  type OutputOptions = {
    /** Which renderer to use for the results. */
    format: OutputFormat;
    /** Emit the whole body instead of a snippet. */
    full: boolean;
    /** Maximum number of results to print. */
    limit: number;
    /** Results scoring below this value are dropped. */
    minScore: number;
    /** Return all results (search commands raise their fetch limits). */
    all?: boolean;
    /** Filter by collection name (pwd suffix match). */
    collection?: string;
    /** Add line numbers to output. */
    lineNumbers?: boolean;
    /** Optional context for query expansion. */
    context?: string;
  };
  /**
   * Highlight query terms in text (words shorter than 3 characters are skipped).
   *
   * All terms are matched in one pass with a single alternation. Replacing term
   * by term would re-scan text that already contains inserted colour codes, so
   * a repeated term got wrapped twice, a later term could match inside an
   * earlier marker, and a short term ("test") could pre-empt a longer one
   * ("testing").
   *
   * @param text - Text to decorate.
   * @param query - Whitespace-separated search terms (matched case-insensitively).
   * @returns `text` unchanged when colour is disabled or no term qualifies,
   *   otherwise `text` with each match wrapped in yellow+bold colour codes.
   */
  function highlightTerms(text: string, query: string): string {
    if (!useColor) return text;

    const terms = Array.from(new Set(query.toLowerCase().split(/\s+/).filter(t => t.length >= 3)));
    // An empty alternation would match the empty string at every position
    if (terms.length === 0) return text;

    const alternation = terms
      .sort((a, b) => b.length - a.length) // longest first so the longest term wins at a position
      .map(term => term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')) // escape regex metacharacters
      .join('|');

    return text.replace(new RegExp(`(${alternation})`, 'gi'), `${c.yellow}${c.bold}$1${c.reset}`);
  }
  /**
   * Render a 0-1 score as a right-aligned percentage, tinted by quality.
   *
   * @param score - Relevance score; 0.7 and above is green, 0.4 and above is
   *   yellow, anything lower is dimmed.
   * @returns e.g. " 85%", wrapped in colour codes when colour output is enabled.
   */
  function formatScore(score: number): string {
    const label = `${(score * 100).toFixed(0).padStart(3)}%`;
    if (!useColor) return label;

    const tint = score >= 0.7 ? c.green : score >= 0.4 ? c.yellow : c.dim;
    return `${tint}${label}${c.reset}`;
  }
  /**
   * Shorten a directory path for display by replacing the user's home
   * directory with `~` (used for context paths, not documents).
   *
   * The home prefix must end on a path boundary: with home `/home/bob`,
   * `/home/bob/notes` becomes `~/notes` but `/home/bobby` is left untouched.
   *
   * @param dirpath - Path to shorten.
   * @returns The path with a leading `~` when it is the home directory or lies inside it.
   */
  function shortPath(dirpath: string): string {
    const home = homedir();
    if (!dirpath.startsWith(home)) return dirpath;

    const rest = dirpath.slice(home.length);
    const homeEndsWithSeparator = home.endsWith('/') || home.endsWith('\\');
    const onBoundary = rest === '' || rest.startsWith('/') || rest.startsWith('\\') || homeEndsWithSeparator;
    return onBoundary ? '~' + rest : dirpath;
  }
  /**
   * Prefix every line of `text` with its line number.
   *
   * @param text - Content to number; lines are separated by "\n".
   * @param startLine - Number given to the first line (default 1).
   * @returns The text with each line rewritten as "<number>: <line>".
   */
  function addLineNumbers(text: string, startLine: number = 1): string {
    const numbered: string[] = [];
    for (const [offset, line] of text.split('\n').entries()) {
      numbered.push(`${startLine + offset}: ${line}`);
    }
    return numbered.join('\n');
  }
  /**
   * Print search results to stdout in the format selected by `opts.format`
   * ("json", "files", "cli", "md", "xml"; anything else is rendered as CSV).
   *
   * Results scoring below `opts.minScore` are dropped and at most `opts.limit`
   * are printed. With `opts.full` the whole body is emitted instead of a
   * snippet (the "files" and "cli" formats ignore this flag).
   *
   * @param results - Candidate results, already ordered best-first by the caller.
   * @param query - Original query; used for snippet extraction and highlighting.
   * @param opts - Output options.
   */
  function outputResults(results: { file: string; displayPath: string; title: string; body: string; score: number; context?: string | null; chunkPos?: number; hash?: string; docid?: string }[], query: string, opts: OutputOptions): void {
    const filtered = results.filter(r => r.score >= opts.minScore).slice(0, opts.limit);

    if (filtered.length === 0) {
      console.log("No results found above minimum score threshold.");
      return;
    }

    // Helper to create qmd:// URI from displayPath
    const toQmdPath = (displayPath: string) => `qmd://${displayPath}`;

    // Escape a value for use inside a double-quoted XML attribute.
    // "&" must be replaced first so the other entities are not double-escaped.
    const escapeXmlAttr = (value: string) =>
      value
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');

    if (opts.format === "json") {
      // JSON output for LLM consumption
      const output = filtered.map(row => {
        const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);
        let body = opts.full ? row.body : undefined;
        let snippet = !opts.full ? extractSnippet(row.body, query, 300, row.chunkPos).snippet : undefined;
        if (opts.lineNumbers) {
          if (body) body = addLineNumbers(body);
          if (snippet) snippet = addLineNumbers(snippet);
        }
        // Optional fields are spread in only when present, so they are omitted rather than null
        return {
          ...(docid && { docid: `#${docid}` }),
          score: Math.round(row.score * 100) / 100,
          file: toQmdPath(row.displayPath),
          title: row.title,
          ...(row.context && { context: row.context }),
          ...(body && { body }),
          ...(snippet && { snippet }),
        };
      });
      console.log(JSON.stringify(output, null, 2));
    } else if (opts.format === "files") {
      // Simple docid,score,filepath,context output
      for (const row of filtered) {
        const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : "");
        const ctx = row.context ? `,"${row.context.replace(/"/g, '""')}"` : "";
        console.log(`#${docid},${row.score.toFixed(2)},${toQmdPath(row.displayPath)}${ctx}`);
      }
    } else if (opts.format === "cli") {
      for (let i = 0; i < filtered.length; i++) {
        const row = filtered[i];
        if (!row) continue;
        const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos);
        const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);

        // Line 1: filepath with docid
        const path = toQmdPath(row.displayPath);
        // Only show :line if we actually found a term match in the snippet body (exclude header line).
        const snippetBody = snippet.split("\n").slice(1).join("\n").toLowerCase();
        const hasMatch = query.toLowerCase().split(/\s+/).some(t => t.length > 0 && snippetBody.includes(t));
        const lineInfo = hasMatch ? `:${line}` : "";
        const docidStr = docid ? ` ${c.dim}#${docid}${c.reset}` : "";
        console.log(`${c.cyan}${path}${c.dim}${lineInfo}${c.reset}${docidStr}`);

        // Line 2: Title (if available)
        if (row.title) {
          console.log(`${c.bold}Title: ${row.title}${c.reset}`);
        }

        // Line 3: Context (if available)
        if (row.context) {
          console.log(`${c.dim}Context: ${row.context}${c.reset}`);
        }

        // Line 4: Score
        const score = formatScore(row.score);
        console.log(`Score: ${c.bold}${score}${c.reset}`);
        console.log();

        // Snippet with highlighting (diff-style header included)
        const displaySnippet = opts.lineNumbers ? addLineNumbers(snippet, line) : snippet;
        const highlighted = highlightTerms(displaySnippet, query);
        console.log(highlighted);

        // Double empty line between results
        if (i < filtered.length - 1) console.log('\n');
      }
    } else if (opts.format === "md") {
      for (let i = 0; i < filtered.length; i++) {
        const row = filtered[i];
        if (!row) continue;
        const heading = row.title || row.displayPath;
        const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);
        let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos).snippet;
        if (opts.lineNumbers) {
          content = addLineNumbers(content);
        }
        const docidLine = docid ? `**docid:** \`#${docid}\`\n` : "";
        const contextLine = row.context ? `**context:** ${row.context}\n` : "";
        console.log(`---\n# ${heading}\n${docidLine}${contextLine}\n${content}\n`);
      }
    } else if (opts.format === "xml") {
      for (const row of filtered) {
        // Attribute values are fully escaped; the element content is emitted verbatim
        const titleAttr = row.title ? ` title="${escapeXmlAttr(row.title)}"` : "";
        const contextAttr = row.context ? ` context="${escapeXmlAttr(row.context)}"` : "";
        const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : "");
        let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos).snippet;
        if (opts.lineNumbers) {
          content = addLineNumbers(content);
        }
        console.log(`<file docid="#${docid}" name="${escapeXmlAttr(toQmdPath(row.displayPath))}"${titleAttr}${contextAttr}>\n${content}\n</file>\n`);
      }
    } else {
      // CSV format
      console.log("docid,score,file,title,context,line,snippet");
      for (const row of filtered) {
        const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos);
        let content = opts.full ? row.body : snippet;
        if (opts.lineNumbers) {
          // A full body starts at line 1; only a snippet starts at its own line
          content = addLineNumbers(content, opts.full ? 1 : line);
        }
        const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : "");
        const snippetText = content || "";
        console.log(`#${docid},${row.score.toFixed(4)},${escapeCSV(toQmdPath(row.displayPath))},${escapeCSV(row.title || "")},${escapeCSV(row.context || "")},${line},${escapeCSV(snippetText)}`);
      }
    }
  }
  /**
   * Full-text search command: query the FTS index, attach each hit's context
   * and print the results.
   *
   * Exits the process with status 1 when `opts.collection` names a collection
   * that is not configured.
   *
   * @param query - Search text handed to searchFTS.
   * @param opts - Output options; `collection`, `all` and `limit` also shape the lookup.
   */
  function search(query: string, opts: OutputOptions): void {
    const db = getDb();

    // Refuse to run against a collection filter that does not exist
    if (opts.collection && !getCollectionFromYaml(opts.collection)) {
      console.error(`Collection not found: ${opts.collection}`);
      closeDb();
      process.exit(1);
    }
    const collectionName: string | undefined = opts.collection || undefined;

    // --all lifts the cap; otherwise over-fetch so outputResults has room to filter
    const fetchLimit = opts.all ? 100000 : Math.max(50, opts.limit * 2);

    // searchFTS accepts collection name as number parameter for legacy reasons (will be fixed in store.ts)
    const hits = searchFTS(db, query, fetchLimit, collectionName as any);

    // Attach context while the database is still open
    const enriched: Parameters<typeof outputResults>[0] = [];
    for (const hit of hits) {
      enriched.push({
        file: hit.filepath,
        displayPath: hit.displayPath,
        title: hit.title,
        body: hit.body || "",
        score: hit.score,
        context: getContextForFile(db, hit.filepath),
        hash: hit.hash,
        docid: hit.docid,
      });
    }

    closeDb();

    if (enriched.length === 0) {
      console.log("No results found.");
      return;
    }
    outputResults(enriched, query, opts);
  }
  /**
   * Vector search command: expand the query, run every variation through the
   * vector index concurrently, keep each file's best score and print the top hits.
   *
   * Exits the process with status 1 for an unknown collection; prints a hint
   * and returns when no vector table exists yet.
   *
   * @param query - User query.
   * @param opts - Output options (`collection`, `all`, `limit` and `context` are used here).
   * @param model - Embedding model passed to searchVec.
   */
  async function vectorSearch(query: string, opts: OutputOptions, model: string = DEFAULT_EMBED_MODEL): Promise<void> {
    const db = getDb();

    // Refuse to run against a collection filter that does not exist
    if (opts.collection && !getCollectionFromYaml(opts.collection)) {
      console.error(`Collection not found: ${opts.collection}`);
      closeDb();
      process.exit(1);
    }
    const collectionName: string | undefined = opts.collection || undefined;

    const vecTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (!vecTable) {
      console.error("Vector index not found. Run 'qmd embed' first to create embeddings.");
      closeDb();
      return;
    }

    // Check index health and warn about issues
    checkIndexHealth(db);

    // Expansion without lexical variants; only vec/hyde rewrites that differ
    // from the original query are searched in addition to it
    const expansions = await expandQueryStructured(query, false, opts.context);
    const rewrites = expansions
      .filter(e => (e.type === 'vec' || e.type === 'hyde') && e.text && e.text !== query)
      .map(e => e.text);
    const vectorQueries: string[] = [query, ...rewrites];

    process.stderr.write(`${c.dim}Searching ${vectorQueries.length} vector queries...${c.reset}\n`);

    const perQueryLimit = opts.all ? 500 : 20;
    type BestHit = { file: string; displayPath: string; title: string; body: string; score: number; hash: string };
    const bestByFile = new Map<string, BestHit>();

    // Run one variation and fold its hits into the per-file maximum
    const runVariation = async (variation: string): Promise<void> => {
      const hits = await searchVec(db, variation, model, perQueryLimit, collectionName as any);
      for (const hit of hits) {
        const prior = bestByFile.get(hit.filepath);
        if (prior && !(hit.score > prior.score)) continue;
        bestByFile.set(hit.filepath, {
          file: hit.filepath,
          displayPath: hit.displayPath,
          title: hit.title,
          body: hit.body || "",
          score: hit.score,
          hash: hit.hash,
        });
      }
    };
    await Promise.all(vectorQueries.map(variation => runVariation(variation)));

    // Best score first, cut to the requested count, then attach context
    const ranked = Array.from(bestByFile.values());
    ranked.sort((left, right) => right.score - left.score);
    const results = ranked
      .slice(0, opts.limit)
      .map(hit => ({ ...hit, context: getContextForFile(db, hit.file) }));

    closeDb();

    if (results.length === 0) {
      console.log("No results found.");
      return;
    }
    outputResults(results, query, { ...opts, limit: results.length }); // Already limited
  }
  /**
   * Ask the default LLM for structured query expansions and log them to stderr
   * as a small tree (original query first, last entry closed with "└─").
   *
   * @param query - Original user query.
   * @param includeLexical - Forwarded to the LLM; also selects the label on the root line.
   * @param context - Optional context forwarded to the LLM.
   * @returns The expansions exactly as returned by the LLM.
   */
  async function expandQueryStructured(query: string, includeLexical: boolean = true, context?: string): Promise<Queryable[]> {
    process.stderr.write(`${c.dim}Expanding query...${c.reset}\n`);

    const queryables = await getDefaultLlamaCpp().expandQuery(query, { includeLexical, context });

    // Root of the tree: the original query
    const rootLabel = includeLexical ? ' · (lexical+vector)' : ' · (vector)';
    const tree: string[] = [`${c.dim}├─ ${query}${rootLabel}${c.reset}`];

    for (const q of queryables) {
      // Skip holes and expansions that merely repeat the original
      if (!q || q.text === query) continue;

      const flat = q.text.replace(/\n/g, ' ');
      const preview = flat.length > 80 ? flat.substring(0, 77) + '...' : flat;

      let kind: string;
      switch (q.type) {
        case 'lex':
          kind = 'lexical';
          break;
        case 'hyde':
          kind = 'hyde';
          break;
        default:
          kind = 'vector';
      }
      tree.push(`${c.dim}├─ ${preview} · (${kind})${c.reset}`);
    }

    // The tree always holds at least the root line; close it on the last entry
    const lastIdx = tree.length - 1;
    tree[lastIdx] = tree[lastIdx]!.replace('├─', '└─');

    tree.forEach(entry => {
      process.stderr.write(entry + '\n');
    });

    return queryables;
  }
  /**
   * Expand a query into a de-duplicated list of query strings, the original first.
   *
   * @param query - Original user query.
   * @param _model - Unused by this function.
   * @param _db - Unused by this function.
   * @returns The original query followed by each distinct expansion text, in
   *   the order the expansions were returned.
   */
  async function expandQuery(query: string, _model: string = DEFAULT_QUERY_MODEL, _db?: Database): Promise<string[]> {
    const expansions = await expandQueryStructured(query, true);
    const distinct = new Set<string>([query, ...expansions.map(expansion => expansion.text)]);
    return Array.from(distinct);
  }
  /**
   * Hybrid search command: BM25 + vector retrieval over the original query and
   * its LLM expansions, fused with Reciprocal Rank Fusion, then reranked and
   * blended with the fused rank before printing.
   *
   * Pipeline:
   *  1. Probe BM25 with the original query; a strong, clearly separated top hit
   *     skips the LLM expansion.
   *  2. Run every lexical query through FTS and every vector query through the
   *     vector index (when a vector table exists).
   *  3. Fuse the ranked lists; lists from the ORIGINAL query weigh 2x.
   *  4. For the top candidates, pick one chunk per document by keyword overlap
   *     and rerank those chunks.
   *  5. Blend the position-based RRF score with the reranker score and print.
   *
   * Exits the process with status 1 for an unknown collection.
   *
   * @param query - User query.
   * @param opts - Output options (`collection` and `context` are used here).
   * @param embedModel - Embedding model passed to searchVec.
   * @param rerankModel - Model passed to rerank.
   */
  async function querySearch(query: string, opts: OutputOptions, embedModel: string = DEFAULT_EMBED_MODEL, rerankModel: string = DEFAULT_RERANK_MODEL): Promise<void> {
    const db = getDb();

    // Validate collection filter if specified
    let collectionName: string | undefined;
    if (opts.collection) {
      const coll = getCollectionFromYaml(opts.collection);
      if (!coll) {
        console.error(`Collection not found: ${opts.collection}`);
        closeDb();
        process.exit(1);
      }
      collectionName = opts.collection;
    }

    // Check index health and warn about issues
    checkIndexHealth(db);

    // Initial BM25 probe; only its scores are used, to decide whether to expand
    const initialFts = searchFTS(db, query, 20, collectionName as any);
    const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();

    // Strong signal = top result is strong AND clearly separated from runner-up.
    // This avoids skipping expansion when BM25 has lots of mediocre matches.
    const topScore = initialFts[0]?.score ?? 0;
    const secondScore = initialFts[1]?.score ?? 0;
    const hasStrongSignal = initialFts.length > 0 && topScore >= 0.85 && (topScore - secondScore) >= 0.15;

    // Index 0 of both lists is always the original query
    const ftsQueries: string[] = [query];
    const vectorQueries: string[] = [query];

    if (hasStrongSignal) {
      // Strong BM25 signal - skip expensive LLM expansion
      process.stderr.write(`${c.dim}Strong BM25 signal (${topScore.toFixed(2)}) - skipping expansion${c.reset}\n`);
      // Still log the "expansion tree" in the same style as vsearch for consistency.
      {
        const lines: string[] = [];
        lines.push(`${c.dim}├─ ${query} · (lexical+vector)${c.reset}`);
        lines[lines.length - 1] = lines[lines.length - 1]!.replace('├─', '└─');
        for (const line of lines) process.stderr.write(line + '\n');
      }
    } else {
      // Weak signal - expand query for better recall
      const queryables = await expandQueryStructured(query, true, opts.context);
      for (const q of queryables) {
        if (q.type === 'lex') {
          if (q.text && q.text !== query) ftsQueries.push(q.text);
        } else if (q.type === 'vec' || q.type === 'hyde') {
          if (q.text && q.text !== query) vectorQueries.push(q.text);
        }
      }
    }

    process.stderr.write(`${c.dim}Searching ${ftsQueries.length} lexical + ${vectorQueries.length} vector queries...${c.reset}\n`);

    // Map to store hash by filepath for final results
    const hashMap = new Map<string, string>();

    // Each search carries its own fusion weight. Deriving the weight from a
    // list's position after the fact is unreliable: vector searches finish in
    // arbitrary order and empty lists are dropped, so the "original query"
    // lists are not guaranteed to be the first two.
    type WeightedList = { weight: number; list: RankedResult[] };
    const ORIGINAL_QUERY_WEIGHT = 2.0;
    const EXPANSION_WEIGHT = 1.0;
    const searchTasks: Promise<WeightedList>[] = [];

    // FTS searches
    ftsQueries.forEach((q, idx) => {
      if (!q) return;
      searchTasks.push((async () => {
        const ftsResults = searchFTS(db, q, 20, (collectionName || "") as any);
        for (const r of ftsResults) hashMap.set(r.filepath, r.hash);
        return {
          weight: idx === 0 ? ORIGINAL_QUERY_WEIGHT : EXPANSION_WEIGHT,
          list: ftsResults.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score })),
        };
      })());
    });

    // Vector searches
    if (hasVectors) {
      vectorQueries.forEach((q, idx) => {
        if (!q) return;
        searchTasks.push((async () => {
          const vecResults = await searchVec(db, q, embedModel, 20, (collectionName || "") as any);
          for (const r of vecResults) hashMap.set(r.filepath, r.hash);
          return {
            weight: idx === 0 ? ORIGINAL_QUERY_WEIGHT : EXPANSION_WEIGHT,
            list: vecResults.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score })),
          };
        })());
      });
    }

    // Promise.all keeps the task order, so the fused input is deterministic
    const settled = await Promise.all(searchTasks);
    const rankedLists: RankedResult[][] = [];
    const weights: number[] = [];
    for (const { weight, list } of settled) {
      if (list.length === 0) continue; // empty lists add nothing to the fusion
      rankedLists.push(list);
      weights.push(weight);
    }

    // Apply Reciprocal Rank Fusion to combine all ranked lists
    const fused = reciprocalRankFusion(rankedLists, weights);

    // Hard cap reranking for latency/cost. We rerank per-document (best chunk only).
    const RERANK_DOC_LIMIT = 40;
    const candidates = fused.slice(0, RERANK_DOC_LIMIT);

    if (candidates.length === 0) {
      console.log("No results found.");
      closeDb();
      return;
    }

    // We only rerank ONE chunk per document (best chunk by a simple keyword heuristic),
    // so we never rerank more than RERANK_DOC_LIMIT items.
    const chunksToRerank: { file: string; text: string; chunkIdx: number }[] = [];
    const docChunkMap = new Map<string, { chunks: { text: string; pos: number }[]; bestIdx: number }>();
    const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);

    for (const candidate of candidates) {
      const chunks = chunkDocument(candidate.body);
      if (chunks.length === 0) continue;

      // Choose best chunk by keyword matches; fall back to first chunk.
      let bestIdx = 0;
      let bestScore = -1;
      for (let i = 0; i < chunks.length; i++) {
        const chunkLower = chunks[i]!.text.toLowerCase();
        const score = queryTerms.reduce((acc, term) => acc + (chunkLower.includes(term) ? 1 : 0), 0);
        if (score > bestScore) {
          bestScore = score;
          bestIdx = i;
        }
      }

      chunksToRerank.push({ file: candidate.file, text: chunks[bestIdx]!.text, chunkIdx: bestIdx });
      docChunkMap.set(candidate.file, { chunks, bestIdx });
    }

    // Rerank selected chunks. One chunk per doc -> one rerank item per doc.
    const reranked = await rerank(
      query,
      chunksToRerank.map(item => ({ file: item.file, text: item.text })),
      rerankModel,
      db
    );

    const aggregatedScores = new Map<string, { score: number; bestChunkIdx: number }>();
    for (const r of reranked) {
      const chunkInfo = docChunkMap.get(r.file);
      aggregatedScores.set(r.file, { score: r.score, bestChunkIdx: chunkInfo?.bestIdx ?? 0 });
    }

    // Blend RRF position score with the reranker score using position-aware weights
    // Top retrieval results get more protection from reranker disagreement
    const candidateMap = new Map(candidates.map(cand => [cand.file, { displayPath: cand.displayPath, title: cand.title, body: cand.body }]));
    const rrfRankMap = new Map(candidates.map((cand, i) => [cand.file, i + 1])); // 1-indexed rank

    const finalResults = Array.from(aggregatedScores.entries()).map(([file, { score: rerankScore, bestChunkIdx }]) => {
      const rrfRank = rrfRankMap.get(file) || 30;

      // Position-aware blending: top retrieval results preserved more
      // Rank 1-3: 75% RRF, 25% reranker (trust retrieval for exact matches)
      // Rank 4-10: 60% RRF, 40% reranker
      // Rank 11+: 40% RRF, 60% reranker (trust reranker for lower-ranked)
      let rrfWeight: number;
      if (rrfRank <= 3) {
        rrfWeight = 0.75;
      } else if (rrfRank <= 10) {
        rrfWeight = 0.60;
      } else {
        rrfWeight = 0.40;
      }

      const rrfScore = 1 / rrfRank; // Position-based: 1, 0.5, 0.33...
      const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * rerankScore;

      const candidate = candidateMap.get(file);
      // Use the selected chunk's text for the body (better for snippets)
      const chunkInfo = docChunkMap.get(file);
      const chunkBody = chunkInfo ? (chunkInfo.chunks[bestChunkIdx]?.text || chunkInfo.chunks[0]!.text) : candidate?.body || "";
      const chunkPos = chunkInfo ? (chunkInfo.chunks[bestChunkIdx]?.pos || 0) : 0;

      return {
        file,
        displayPath: candidate?.displayPath || "",
        title: candidate?.title || "",
        body: chunkBody,
        chunkPos,
        score: blendedScore,
        context: getContextForFile(db, file),
        hash: hashMap.get(file) || "",
      };
    }).sort((a, b) => b.score - a.score);

    // Deduplicate by file (safety net - shouldn't happen but prevents duplicate output)
    const seenFiles = new Set<string>();
    const dedupedResults = finalResults.filter(r => {
      if (seenFiles.has(r.file)) return false;
      seenFiles.add(r.file);
      return true;
    });

    closeDb();
    outputResults(dedupedResults, query, opts);
  }
  1928. // Parse CLI arguments using util.parseArgs
  1929. function parseCLI() {
  1930. const { values, positionals } = parseArgs({
  1931. args: Bun.argv.slice(2), // Skip bun and script path
  1932. options: {
  1933. // Global options
  1934. context: {
  1935. type: "string",
  1936. },
  1937. "no-lex": {
  1938. type: "boolean",
  1939. },
  1940. help: { type: "boolean", short: "h" },
  1941. // Search options
  1942. n: { type: "string" },
  1943. "min-score": { type: "string" },
  1944. all: { type: "boolean" },
  1945. full: { type: "boolean" },
  1946. csv: { type: "boolean" },
  1947. md: { type: "boolean" },
  1948. xml: { type: "boolean" },
  1949. files: { type: "boolean" },
  1950. json: { type: "boolean" },
  1951. collection: { type: "string", short: "c" }, // Filter by collection
  1952. // Collection options
  1953. name: { type: "string" }, // collection name
  1954. mask: { type: "string" }, // glob pattern
  1955. // Embed options
  1956. force: { type: "boolean", short: "f" },
  1957. // Update options
  1958. pull: { type: "boolean" }, // git pull before update
  1959. // Get options
  1960. l: { type: "string" }, // max lines
  1961. from: { type: "string" }, // start line
  1962. "max-bytes": { type: "string" }, // max bytes for multi-get
  1963. "line-numbers": { type: "boolean" }, // add line numbers to output
  1964. },
  1965. allowPositionals: true,
  1966. strict: false, // Allow unknown options to pass through
  1967. });
  1968. // Select index name (default: "index")
  1969. const indexName = values.index as string | undefined;
  1970. if (indexName) {
  1971. setIndexName(indexName);
  1972. }
  1973. // Determine output format
  1974. let format: OutputFormat = "cli";
  1975. if (values.csv) format = "csv";
  1976. else if (values.md) format = "md";
  1977. else if (values.xml) format = "xml";
  1978. else if (values.files) format = "files";
  1979. else if (values.json) format = "json";
  1980. // Default limit: 20 for --files/--json, 5 otherwise
  1981. // --all means return all results (use very large limit)
  1982. const defaultLimit = (format === "files" || format === "json") ? 20 : 5;
  1983. const isAll = !!values.all;
  1984. const opts: OutputOptions = {
  1985. format,
  1986. full: !!values.full,
  1987. limit: isAll ? 100000 : (values.n ? parseInt(String(values.n), 10) || defaultLimit : defaultLimit),
  1988. minScore: values["min-score"] ? parseFloat(String(values["min-score"])) || 0 : 0,
  1989. all: isAll,
  1990. collection: values.collection as string | undefined,
  1991. lineNumbers: !!values["line-numbers"],
  1992. };
  1993. return {
  1994. command: positionals[0] || "",
  1995. args: positionals.slice(1),
  1996. query: positionals.slice(1).join(" "),
  1997. opts,
  1998. values,
  1999. };
  2000. }
  2001. function showHelp(): void {
  2002. console.log("Usage:");
  2003. console.log(" qmd collection add [path] --name <name> --mask <pattern> - Create/index collection");
  2004. console.log(" qmd collection list - List all collections with details");
  2005. console.log(" qmd collection remove <name> - Remove a collection by name");
  2006. console.log(" qmd collection rename <old> <new> - Rename a collection");
  2007. console.log(" qmd ls [collection[/path]] - List collections or files in a collection");
  2008. console.log(" qmd context add [path] \"text\" - Add context for path (defaults to current dir)");
  2009. console.log(" qmd context list - List all contexts");
  2010. console.log(" qmd context rm <path> - Remove context");
  2011. console.log(" qmd get <file>[:line] [-l N] [--from N] - Get document (optionally from line, max N lines)");
  2012. console.log(" qmd multi-get <pattern> [-l N] [--max-bytes N] - Get multiple docs by glob or comma-separated list");
  2013. console.log(" qmd status - Show index status and collections");
  2014. console.log(" qmd update [--pull] - Re-index all collections (--pull: git pull first)");
  2015. console.log(" qmd embed [-f] - Create vector embeddings (800 tokens/chunk, 15% overlap)");
  2016. console.log(" qmd cleanup - Remove cache and orphaned data, vacuum DB");
  2017. console.log(" qmd search <query> - Full-text search (BM25)");
  2018. console.log(" qmd vsearch <query> - Vector similarity search");
  2019. console.log(" qmd query <query> - Combined search with query expansion + reranking");
  2020. console.log(" qmd mcp - Start MCP server (for AI agent integration)");
  2021. console.log("");
  2022. console.log("Global options:");
  2023. console.log(" --index <name> - Use custom index name (default: index)");
  2024. console.log("");
  2025. console.log("Search options:");
  2026. console.log(" -n <num> - Number of results (default: 5, or 20 for --files)");
  2027. console.log(" --all - Return all matches (use with --min-score to filter)");
  2028. console.log(" --min-score <num> - Minimum similarity score");
  2029. console.log(" --full - Output full document instead of snippet");
  2030. console.log(" --line-numbers - Add line numbers to output");
  2031. console.log(" --files - Output docid,score,filepath,context (default: 20 results)");
  2032. console.log(" --json - JSON output with snippets (default: 20 results)");
  2033. console.log(" --csv - CSV output with snippets");
  2034. console.log(" --md - Markdown output");
  2035. console.log(" --xml - XML output");
  2036. console.log(" -c, --collection <name> - Filter results to a specific collection");
  2037. console.log("");
  2038. console.log("Multi-get options:");
  2039. console.log(" -l <num> - Maximum lines per file");
  2040. console.log(" --max-bytes <num> - Skip files larger than N bytes (default: 10240)");
  2041. console.log(" --json/--csv/--md/--xml/--files - Output format (same as search)");
  2042. console.log("");
  2043. console.log("Models (auto-downloaded from HuggingFace):");
  2044. console.log(" Embedding: embeddinggemma-300M-Q8_0");
  2045. console.log(" Reranking: qwen3-reranker-0.6b-q8_0");
  2046. console.log(" Generation: Qwen3-0.6B-Q8_0");
  2047. console.log("");
  2048. console.log(`Index: ${getDbPath()}`);
  2049. }
  2050. // Main CLI - only run if this is the main module
  2051. if (import.meta.main) {
  2052. const cli = parseCLI();
  2053. if (!cli.command || cli.values.help) {
  2054. showHelp();
  2055. process.exit(cli.values.help ? 0 : 1);
  2056. }
  2057. switch (cli.command) {
  2058. case "context": {
  2059. const subcommand = cli.args[0];
  2060. if (!subcommand) {
  2061. console.error("Usage: qmd context <add|list|check|rm>");
  2062. console.error("");
  2063. console.error("Commands:");
  2064. console.error(" qmd context add [path] \"text\" - Add context (defaults to current dir)");
  2065. console.error(" qmd context add / \"text\" - Add global context to all collections");
  2066. console.error(" qmd context list - List all contexts");
  2067. console.error(" qmd context check - Check for missing contexts");
  2068. console.error(" qmd context rm <path> - Remove context");
  2069. process.exit(1);
  2070. }
  2071. switch (subcommand) {
  2072. case "add": {
  2073. if (cli.args.length < 2) {
  2074. console.error("Usage: qmd context add [path] \"text\"");
  2075. console.error("");
  2076. console.error("Examples:");
  2077. console.error(" qmd context add \"Context for current directory\"");
  2078. console.error(" qmd context add . \"Context for current directory\"");
  2079. console.error(" qmd context add /subfolder \"Context for subfolder\"");
  2080. console.error(" qmd context add / \"Global context for all collections\"");
  2081. console.error("");
  2082. console.error(" Using virtual paths:");
  2083. console.error(" qmd context add qmd://journals/ \"Context for entire journals collection\"");
  2084. console.error(" qmd context add qmd://journals/2024 \"Context for 2024 journals\"");
  2085. process.exit(1);
  2086. }
  2087. let pathArg: string | undefined;
  2088. let contextText: string;
  2089. // Check if first arg looks like a path or if it's the context text
  2090. const firstArg = cli.args[1] || '';
  2091. const secondArg = cli.args[2];
  2092. if (secondArg) {
  2093. // Two args: path + context
  2094. pathArg = firstArg;
  2095. contextText = cli.args.slice(2).join(" ");
  2096. } else {
  2097. // One arg: context only (use current directory)
  2098. pathArg = undefined;
  2099. contextText = firstArg;
  2100. }
  2101. await contextAdd(pathArg, contextText);
  2102. break;
  2103. }
  2104. case "list": {
  2105. contextList();
  2106. break;
  2107. }
  2108. case "check": {
  2109. contextCheck();
  2110. break;
  2111. }
  2112. case "rm":
  2113. case "remove": {
  2114. if (cli.args.length < 2 || !cli.args[1]) {
  2115. console.error("Usage: qmd context rm <path>");
  2116. console.error("Examples:");
  2117. console.error(" qmd context rm /");
  2118. console.error(" qmd context rm qmd://journals/2024");
  2119. process.exit(1);
  2120. }
  2121. contextRemove(cli.args[1]);
  2122. break;
  2123. }
  2124. default:
  2125. console.error(`Unknown subcommand: ${subcommand}`);
  2126. console.error("Available: add, list, check, rm");
  2127. process.exit(1);
  2128. }
  2129. break;
  2130. }
  2131. case "get": {
  2132. if (!cli.args[0]) {
  2133. console.error("Usage: qmd get <filepath>[:line] [--from <line>] [-l <lines>] [--line-numbers]");
  2134. process.exit(1);
  2135. }
  2136. const fromLine = cli.values.from ? parseInt(cli.values.from as string, 10) : undefined;
  2137. const maxLines = cli.values.l ? parseInt(cli.values.l as string, 10) : undefined;
  2138. getDocument(cli.args[0], fromLine, maxLines, cli.opts.lineNumbers);
  2139. break;
  2140. }
  2141. case "multi-get": {
  2142. if (!cli.args[0]) {
  2143. console.error("Usage: qmd multi-get <pattern> [-l <lines>] [--max-bytes <bytes>] [--json|--csv|--md|--xml|--files]");
  2144. console.error(" pattern: glob (e.g., 'journals/2025-05*.md') or comma-separated list");
  2145. process.exit(1);
  2146. }
  2147. const maxLinesMulti = cli.values.l ? parseInt(cli.values.l as string, 10) : undefined;
  2148. const maxBytes = cli.values["max-bytes"] ? parseInt(cli.values["max-bytes"] as string, 10) : DEFAULT_MULTI_GET_MAX_BYTES;
  2149. multiGet(cli.args[0], maxLinesMulti, maxBytes, cli.opts.format);
  2150. break;
  2151. }
  2152. case "ls": {
  2153. listFiles(cli.args[0]);
  2154. break;
  2155. }
  2156. case "collection": {
  2157. const subcommand = cli.args[0];
  2158. switch (subcommand) {
  2159. case "list": {
  2160. collectionList();
  2161. break;
  2162. }
  2163. case "add": {
  2164. const pwd = cli.args[1] || getPwd();
  2165. const resolvedPwd = pwd === '.' ? getPwd() : getRealPath(resolve(pwd));
  2166. const globPattern = cli.values.mask as string || DEFAULT_GLOB;
  2167. const name = cli.values.name as string | undefined;
  2168. await collectionAdd(resolvedPwd, globPattern, name);
  2169. break;
  2170. }
  2171. case "remove":
  2172. case "rm": {
  2173. if (!cli.args[1]) {
  2174. console.error("Usage: qmd collection remove <name>");
  2175. console.error(" Use 'qmd collection list' to see available collections");
  2176. process.exit(1);
  2177. }
  2178. collectionRemove(cli.args[1]);
  2179. break;
  2180. }
  2181. case "rename":
  2182. case "mv": {
  2183. if (!cli.args[1] || !cli.args[2]) {
  2184. console.error("Usage: qmd collection rename <old-name> <new-name>");
  2185. console.error(" Use 'qmd collection list' to see available collections");
  2186. process.exit(1);
  2187. }
  2188. collectionRename(cli.args[1], cli.args[2]);
  2189. break;
  2190. }
  2191. default:
  2192. console.error(`Unknown subcommand: ${subcommand}`);
  2193. console.error("Available: list, add, remove, rename");
  2194. process.exit(1);
  2195. }
  2196. break;
  2197. }
  2198. case "status":
  2199. showStatus();
  2200. break;
  2201. case "update":
  2202. await updateCollections();
  2203. break;
  2204. case "embed":
  2205. await vectorIndex(DEFAULT_EMBED_MODEL, !!cli.values.force);
  2206. break;
  2207. case "search":
  2208. if (!cli.query) {
  2209. console.error("Usage: qmd search [options] <query>");
  2210. process.exit(1);
  2211. }
  2212. search(cli.query, cli.opts);
  2213. break;
  2214. case "vsearch":
  2215. if (!cli.query) {
  2216. console.error("Usage: qmd vsearch [options] <query>");
  2217. process.exit(1);
  2218. }
  2219. // Default min-score for vector search is 0.3
  2220. if (!cli.values["min-score"]) {
  2221. cli.opts.minScore = 0.3;
  2222. }
  2223. await vectorSearch(cli.query, cli.opts);
  2224. break;
  2225. case "query":
  2226. if (!cli.query) {
  2227. console.error("Usage: qmd query [options] <query>");
  2228. process.exit(1);
  2229. }
  2230. await querySearch(cli.query, cli.opts);
  2231. break;
  2232. case "mcp": {
  2233. const { startMcpServer } = await import("./mcp.js");
  2234. await startMcpServer();
  2235. break;
  2236. }
  2237. case "cleanup": {
  2238. const db = getDb();
  2239. // 1. Clear llm_cache
  2240. const cacheCount = deleteLLMCache(db);
  2241. console.log(`${c.green}✓${c.reset} Cleared ${cacheCount} cached API responses`);
  2242. // 2. Remove orphaned vectors
  2243. const orphanedVecs = cleanupOrphanedVectors(db);
  2244. if (orphanedVecs > 0) {
  2245. console.log(`${c.green}✓${c.reset} Removed ${orphanedVecs} orphaned embedding chunks`);
  2246. } else {
  2247. console.log(`${c.dim}No orphaned embeddings to remove${c.reset}`);
  2248. }
  2249. // 3. Remove inactive documents
  2250. const inactiveDocs = deleteInactiveDocuments(db);
  2251. if (inactiveDocs > 0) {
  2252. console.log(`${c.green}✓${c.reset} Removed ${inactiveDocs} inactive document records`);
  2253. }
  2254. // 4. Vacuum to reclaim space
  2255. vacuumDatabase(db);
  2256. console.log(`${c.green}✓${c.reset} Database vacuumed`);
  2257. closeDb();
  2258. break;
  2259. }
  2260. default:
  2261. console.error(`Unknown command: ${cli.command}`);
  2262. console.error("Run 'qmd --help' for usage.");
  2263. process.exit(1);
  2264. }
  2265. if (cli.command !== "mcp") {
  2266. await disposeDefaultLlamaCpp();
  2267. process.exit(0);
  2268. }
  2269. } // end if (import.meta.main)