qmd.ts 83 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368
  1. #!/usr/bin/env bun
  2. import { Database } from "bun:sqlite";
  3. import { Glob, $ } from "bun";
  4. import { parseArgs } from "util";
  5. import * as sqliteVec from "sqlite-vec";
  6. import {
  7. getDb,
  8. closeDb,
  9. getDbPath,
  10. getPwd,
  11. getRealPath,
  12. homedir,
  13. resolve,
  14. setCustomIndexName,
  15. searchFTS,
  16. searchVec,
  17. reciprocalRankFusion,
  18. extractSnippet,
  19. getContextForFile,
  20. getContextForPath,
  21. getCollectionIdByName,
  22. getCollectionByName,
  23. findSimilarFiles,
  24. matchFilesByGlob,
  25. getHashesNeedingEmbedding,
  26. getDocument as storeGetDocument,
  27. getMultipleDocuments as storeMultiGetDocuments,
  28. getStatus,
  29. hashContent,
  30. extractTitle,
  31. formatDocForEmbedding,
  32. formatQueryForEmbedding,
  33. chunkDocument,
  34. ensureVecTable,
  35. clearCache,
  36. getCacheKey,
  37. getCachedResult,
  38. setCachedResult,
  39. getIndexHealth,
  40. parseVirtualPath,
  41. buildVirtualPath,
  42. isVirtualPath,
  43. resolveVirtualPath,
  44. toVirtualPath,
  45. OLLAMA_URL,
  46. DEFAULT_EMBED_MODEL,
  47. DEFAULT_QUERY_MODEL,
  48. DEFAULT_RERANK_MODEL,
  49. DEFAULT_GLOB,
  50. DEFAULT_MULTI_GET_MAX_BYTES,
  51. } from "./store.js";
  52. import type { SearchResult, RankedResult } from "./store.js";
  53. import {
  54. formatSearchResults,
  55. formatDocuments,
  56. escapeXml,
  57. escapeCSV,
  58. type OutputFormat,
  59. } from "./formatter.js";
  // Chunking budget: roughly 2000 tokens per chunk at ~3 bytes per token, i.e. 6 KB.
  const CHUNK_BYTE_SIZE = 6 * 1024;

  // ANSI styling is used only when NO_COLOR is unset/empty and stdout is a TTY.
  const useColor = !process.env.NO_COLOR && process.stdout.isTTY;

  // Palette of SGR escape sequences; every entry is the empty string when
  // colour output is disabled, so callers can interpolate unconditionally.
  const c = (() => {
    // Build "ESC [ <code> m", or "" when colour is off.
    const sgr = (code: number): string => (useColor ? `\x1b[${code}m` : "");
    return {
      reset: sgr(0),
      dim: sgr(2),
      bold: sgr(1),
      cyan: sgr(36),
      yellow: sgr(33),
      green: sgr(32),
      magenta: sgr(35),
      blue: sgr(34),
    };
  })();
  // Terminal cursor visibility control; escape sequences are written to stderr.
  const cursor = {
    hide: (): void => { process.stderr.write('\x1b[?25l'); },
    show: (): void => { process.stderr.write('\x1b[?25h'); },
  };

  // Make the cursor visible again before exiting on SIGINT (exit code 130)
  // or SIGTERM (exit code 143).
  for (const [signal, exitCode] of [['SIGINT', 130], ['SIGTERM', 143]] as const) {
    process.on(signal, () => {
      cursor.show();
      process.exit(exitCode);
    });
  }
  // Terminal progress reporting via the OSC 9;4 escape sequence, written to stderr.
  const progress = (() => {
    // Wrap a "<state>[;<value>]" payload as ESC ] 9;4;<payload> BEL and emit it.
    const emit = (payload: string): void => {
      process.stderr.write(`\x1b]9;4;${payload}\x07`);
    };
    return {
      // Report a percentage; the value is rounded to the nearest integer.
      set: (percent: number): void => emit(`1;${Math.round(percent)}`),
      // Remove the progress indicator.
      clear: (): void => emit("0"),
      // Show progress without a known percentage.
      indeterminate: (): void => emit("3"),
      // Switch the indicator to its error state.
      error: (): void => emit("2"),
    };
  })();
  /**
   * Format a duration in seconds as a short human-readable ETA.
   *
   * Output shapes: "42s" below one minute, "3m 5s" below one hour,
   * and "2h 14m" from one hour upwards.
   *
   * @param seconds Duration in seconds; fractional values are rounded.
   * @returns The formatted duration string.
   */
  function formatETA(seconds: number): string {
    // Round once up front so every unit derives from the same whole number.
    // Rounding only the seconds remainder could produce "60s" or "1m 60s"
    // (e.g. for 59.6 or 119.6) instead of carrying into the next unit.
    const total = Math.round(seconds);
    if (total < 60) return `${total}s`;
    if (total < 3600) return `${Math.floor(total / 60)}m ${total % 60}s`;
    return `${Math.floor(total / 3600)}h ${Math.floor((total % 3600) / 60)}m`;
  }
  /**
   * Print index-health hints to stderr.
   *
   * Reads `needsEmbedding`, `totalDocs` and `daysStale` from `getIndexHealth(db)`:
   * - when documents still need embeddings, prints a warning if they make up
   *   at least 10% of all documents, otherwise a dimmed tip;
   * - when the index was last updated 14 or more days ago, prints a dimmed tip.
   *
   * @param db Open index database.
   */
  function checkIndexHealth(db: Database): void {
    const health = getIndexHealth(db);

    const pending = health.needsEmbedding;
    if (pending > 0) {
      const percent = Math.round((pending / health.totalDocs) * 100);
      const message = percent >= 10
        ? `${c.yellow}Warning: ${pending} documents (${percent}%) need embeddings. Run 'qmd embed' for better results.${c.reset}\n`
        : `${c.dim}Tip: ${pending} documents need embeddings. Run 'qmd embed' to index them.${c.reset}\n`;
      process.stderr.write(message);
    }

    // Staleness hint: only when a value is known and it is two weeks or more.
    const staleDays = health.daysStale;
    if (staleDays === null || !(staleDays >= 14)) return;
    process.stderr.write(`${c.dim}Tip: Index last updated ${staleDays} days ago. Run 'qmd update' to refresh.${c.reset}\n`);
  }
  /**
   * Pick a short display path for a document that is not already in use.
   *
   * Starts from the last two path segments (parent folder + filename, or fewer
   * if the path is shorter) and prepends one more parent segment at a time
   * until the candidate is absent from `existingPaths`.
   *
   * @param filepath       Path of the document.
   * @param collectionPath Root directory of the collection (a trailing "/" is ignored).
   * @param existingPaths  Display paths that are already taken.
   * @returns The first unused candidate, or `filepath` itself if every candidate is taken.
   */
  function computeDisplayPath(
    filepath: string,
    collectionPath: string,
    existingPaths: Set<string>
  ): string {
    const root = collectionPath.replace(/\/$/, '');
    const rootName = root.split('/').pop() || '';

    // Inside the collection: "<collection dir name>/<path below the root>".
    // Otherwise fall back to the path exactly as given.
    const scoped = filepath.startsWith(root + '/')
      ? rootName + filepath.slice(root.length)
      : filepath;

    const segments = scoped.split('/').filter(Boolean);

    // Index of the first segment included in the candidate; walks towards 0.
    let start = segments.length - Math.min(2, segments.length);
    while (start >= 0) {
      const candidate = segments.slice(start).join('/');
      if (!existingPaths.has(candidate)) return candidate;
      start -= 1;
    }

    // Every candidate is taken: use the full path.
    return filepath;
  }
  /**
   * Make sure an Ollama model is available, pulling it if necessary.
   *
   * First probes `/api/show`; if that succeeds the function returns at once.
   * Otherwise (non-OK response or a failed request) it requests a
   * non-streaming `/api/pull` and reports progress on the terminal.
   *
   * @param model Name of the model to check / pull.
   * @throws Error when the pull request returns a non-OK status; network
   *         errors from the pull request are rethrown unchanged.
   */
  async function ensureModelAvailable(model: string): Promise<void> {
    try {
      const response = await fetch(`${OLLAMA_URL}/api/show`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ name: model }),
      });
      if (response.ok) return;
    } catch {
      // Probe failed; continue to the pull attempt, which surfaces real errors.
    }

    console.log(`Model ${model} not found. Pulling...`);
    progress.indeterminate();

    let pullResponse: Response;
    try {
      pullResponse = await fetch(`${OLLAMA_URL}/api/pull`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ name: model, stream: false }),
      });
    } catch (err) {
      // Do not leave the terminal indicator stuck in the indeterminate state
      // when the request itself fails (e.g. the server is unreachable).
      progress.error();
      throw err;
    }

    if (!pullResponse.ok) {
      progress.error();
      throw new Error(`Failed to pull model ${model}: ${pullResponse.status} - ${await pullResponse.text()}`);
    }

    progress.clear();
    console.log(`Model ${model} pulled successfully.`);
  }
  /**
   * Request an embedding vector for a piece of text from Ollama's `/api/embed`.
   *
   * The text is formatted with `formatQueryForEmbedding` when `isQuery` is true,
   * otherwise with `formatDocForEmbedding(text, title)`. If the server reports
   * that the model is missing, the model is pulled once and the request retried.
   *
   * @param text    Text to embed.
   * @param model   Embedding model name.
   * @param isQuery Whether the text is a search query rather than a document.
   * @param title   Optional document title passed to the document formatter.
   * @param retried Internal flag: true on the retry after pulling the model.
   * @returns The first embedding vector of the response.
   * @throws Error on a non-OK response, or when the response carries no embedding.
   */
  async function getEmbedding(text: string, model: string, isQuery: boolean = false, title?: string, retried: boolean = false): Promise<number[]> {
    const input = isQuery ? formatQueryForEmbedding(text) : formatDocForEmbedding(text, title);
    const response = await fetch(`${OLLAMA_URL}/api/embed`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ model, input }),
    });

    if (!response.ok) {
      const errorText = await response.text();
      if (!retried && (errorText.includes("not found") || errorText.includes("does not exist"))) {
        await ensureModelAvailable(model);
        return getEmbedding(text, model, isQuery, title, true);
      }
      throw new Error(`Ollama API error: ${response.status} - ${errorText}`);
    }

    const data = await response.json() as { embeddings?: number[][] };
    const embedding = data.embeddings?.[0];
    // Fail loudly instead of handing `undefined` (or an empty vector) to
    // callers that expect a usable number[].
    if (!Array.isArray(embedding) || embedding.length === 0) {
      throw new Error(`Ollama API error: no embedding returned for model ${model}`);
    }
    return embedding;
  }
  // System prompt in the Qwen3-Reranker format (trained for yes/no relevance classification).
  const RERANK_SYSTEM = `Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".`;

  /**
   * Build the user part of the reranker prompt.
   *
   * Produces four newline-separated sections: the fixed instruction, the
   * query, the document title and the document text.
   *
   * @param query Search query.
   * @param title Document title.
   * @param doc   Document text.
   * @returns The assembled prompt.
   */
  function formatRerankPrompt(query: string, title: string, doc: string): string {
    const instruct = '<Instruct>: Determine if this document from a Shopify knowledge base is relevant to the search query. The query may reference specific Shopify programs, competitions, features, or named concepts (e.g., "Build a Business" competition, "Shop Pay", "Polaris"). Match documents that discuss the queried topic, even if phrasing differs.';
    const sections = [
      instruct,
      `<Query>: ${query}`,
      `<Document Title>: ${title}`,
      `<Document>: ${doc}`,
    ];
    return sections.join("\n");
  }
  // One generated token together with its log-probability.
  type LogProb = { token: string; logprob: number };

  // Subset of the generate response that the reranker reads.
  type RerankResponse = {
    response: string;
    logprobs?: LogProb[];
  };

  /**
   * Convert a reranker response into a relevance score.
   *
   * Only the first log-probability entry is considered. Its token is
   * lower-cased and trimmed, and its probability is `exp(logprob)`:
   * - "yes" -> the probability itself;
   * - "no"  -> `(1 - probability) * 0.3`.
   *
   * @param data Reranker response.
   * @returns The relevance score.
   * @throws Error when `logprobs` is missing or empty, or when the first token
   *         is neither "yes" nor "no".
   */
  function parseRerankResponse(data: RerankResponse): number {
    const entries = data.logprobs;
    if (!entries || entries.length === 0) {
      throw new Error("Reranker response missing logprobs");
    }

    const { token: rawToken, logprob } = entries[0];
    const verdict = rawToken.toLowerCase().trim();
    const probability = Math.exp(logprob);

    switch (verdict) {
      case "yes":
        return probability;
      case "no":
        return (1 - probability) * 0.3;
      default:
        throw new Error(`Unexpected reranker token: "${verdict}"`);
    }
  }
  /**
   * Score a single reranker prompt through Ollama's `/api/generate`.
   *
   * Sends a raw prompt limited to one predicted token with logprobs enabled and
   * converts the answer with `parseRerankResponse`. When `db` is given, the raw
   * response is looked up in / stored to the result cache, keyed on the
   * endpoint URL and request body. If the server reports that the model is
   * missing, the model is pulled once and the request retried.
   *
   * @param prompt  User prompt (see `formatRerankPrompt`).
   * @param model   Reranker model name.
   * @param db      Optional database used as the response cache.
   * @param retried Internal flag: true on the retry after pulling the model.
   * @returns The relevance score.
   * @throws Error on a non-OK response, or when the response cannot be parsed.
   */
  async function rerankSingle(prompt: string, model: string, db?: Database, retried: boolean = false): Promise<number> {
    const endpoint = `${OLLAMA_URL}/api/generate`;

    // Raw chat template for the qwen3-reranker format, including the empty
    // <think> block used by the HuggingFace reference implementation.
    const fullPrompt = [
      "<|im_start|>system",
      `${RERANK_SYSTEM}<|im_end|>`,
      "<|im_start|>user",
      `${prompt}<|im_end|>`,
      "<|im_start|>assistant",
      "<think>",
      "</think>",
      "",
    ].join("\n");

    const requestBody = {
      model,
      prompt: fullPrompt,
      raw: true,
      stream: false,
      logprobs: true,
      options: { num_predict: 1 },
    };

    // Serve from the cache when possible.
    const cacheKey = db ? getCacheKey(endpoint, requestBody) : "";
    if (db) {
      const hit = getCachedResult(db, cacheKey);
      if (hit) {
        return parseRerankResponse(JSON.parse(hit) as RerankResponse);
      }
    }

    const response = await fetch(endpoint, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(requestBody),
    });

    if (response.ok) {
      const payload = await response.json() as RerankResponse;
      // Store the raw response before parsing it.
      if (db) {
        setCachedResult(db, cacheKey, JSON.stringify(payload));
      }
      return parseRerankResponse(payload);
    }

    const errorText = await response.text();
    const modelMissing = errorText.includes("not found") || errorText.includes("does not exist");
    if (!retried && modelMissing) {
      await ensureModelAvailable(model);
      return rerankSingle(prompt, model, db, true);
    }
    throw new Error(`Ollama API error: ${response.status} - ${errorText}`);
  }
  /**
   * Rerank documents against a query, highest score first.
   *
   * Documents are scored in batches of five concurrent requests. The title
   * given to the reranker is the file's basename without a trailing ".md", and
   * only the first 4000 characters of each document are sent. A document whose
   * scoring throws is given a score of 0. Progress is written to stderr.
   *
   * @param query     Search query.
   * @param documents Documents to score (`file` identifier and `text`).
   * @param model     Reranker model; defaults to `DEFAULT_RERANK_MODEL`.
   * @param db        Optional database passed on to `rerankSingle`.
   * @returns `{ file, score }` pairs sorted by descending score.
   */
  async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db?: Database): Promise<{ file: string; score: number }[]> {
    const PARALLEL = 5;
    const total = documents.length;
    const scored: { file: string; score: number }[] = [];

    process.stderr.write(`Reranking ${total} documents with ${model} (parallel: ${PARALLEL})...\n`);
    progress.indeterminate();

    // Score one document; any failure is mapped to a score of 0.
    const scoreOne = async (doc: { file: string; text: string }): Promise<{ file: string; score: number }> => {
      try {
        const basename = doc.file.split('/').pop();
        const title = basename?.replace(/\.md$/, '') || doc.file;
        const prompt = formatRerankPrompt(query, title, doc.text.slice(0, 4000));
        return { file: doc.file, score: await rerankSingle(prompt, model, db) };
      } catch {
        return { file: doc.file, score: 0 };
      }
    };

    for (let offset = 0; offset < documents.length; offset += PARALLEL) {
      const batch = documents.slice(offset, offset + PARALLEL);
      const batchScores = await Promise.all(batch.map((doc) => scoreOne(doc)));
      scored.push(...batchScores);

      const done = Math.min(offset + PARALLEL, total);
      progress.set((done / total) * 100);
      process.stderr.write(`\rReranking: ${done}/${total}`);
    }

    progress.clear();
    process.stderr.write("\n");
    return scored.sort((left, right) => right.score - left.score);
  }
  /**
   * Return the id of the collection for `pwd` + `globPattern`, creating it if needed.
   *
   * When no name is supplied, the last segment of `pwd` is used ("root" if
   * there is none). If inserting with that name fails, the insert is retried
   * with the first unused "<name>-<n>" suffix, starting at n = 2.
   *
   * @param db          Open index database.
   * @param pwd         Collection root directory.
   * @param globPattern Glob pattern of the collection.
   * @param name        Optional collection name.
   * @returns The collection's row id.
   */
  function getOrCreateCollection(db: Database, pwd: string, globPattern: string, name?: string): number {
    const timestamp = new Date().toISOString();
    const baseName = name || pwd.split('/').filter(Boolean).pop() || 'root';

    // Reuse an existing collection with the same pwd + glob.
    const found = db.prepare(`SELECT id FROM collections WHERE pwd = ? AND glob_pattern = ?`).get(pwd, globPattern) as { id: number } | null;
    if (found) return found.id;

    const insertAs = (candidate: string): number => {
      const outcome = db.prepare(`INSERT INTO collections (name, pwd, glob_pattern, created_at, updated_at) VALUES (?, ?, ?, ?, ?)`).run(candidate, pwd, globPattern, timestamp, timestamp);
      return outcome.lastInsertRowid as number;
    };

    try {
      return insertAs(baseName);
    } catch {
      // Treated as a name collision: look for the first free numeric suffix.
      const rows = db.prepare(`SELECT name FROM collections WHERE name LIKE ?`).all(`${baseName}%`) as { name: string }[];
      const taken = new Set(rows.map((row) => row.name));
      let suffix = 2;
      while (taken.has(`${baseName}-${suffix}`)) {
        suffix += 1;
      }
      return insertAs(`${baseName}-${suffix}`);
    }
  }
  /**
   * Remove redundant rows from the `collections` table.
   *
   * Runs two deletes in order: first every collection that is not the
   * lowest-id row of its (pwd, glob_pattern) group, then every collection
   * whose glob pattern is the bogus "." left behind by an earlier bug.
   *
   * @param db Open index database.
   */
  function cleanupDuplicateCollections(db: Database): void {
    const statements = [
      // Keep only the oldest (lowest id) collection per pwd + glob pattern.
      `
      DELETE FROM collections WHERE id NOT IN (
        SELECT MIN(id) FROM collections GROUP BY pwd, glob_pattern
      )
    `,
      // Drop entries with the bogus "." glob pattern.
      `DELETE FROM collections WHERE glob_pattern = '.'`,
    ];
    for (const sql of statements) {
      db.exec(sql);
    }
  }
  /**
   * Describe how long ago a date was, using the largest fitting unit.
   *
   * Produces "<n>s ago" below one minute, "<n>m ago" below one hour,
   * "<n>h ago" below one day and "<n>d ago" otherwise; values are floored.
   *
   * @param date The past point in time, compared against `Date.now()`.
   * @returns The relative-time string.
   */
  function formatTimeAgo(date: Date): string {
    let amount = Math.floor((Date.now() - date.getTime()) / 1000);
    // Each step: [how many of this unit make the next one, unit suffix].
    for (const [perNext, suffix] of [[60, "s"], [60, "m"], [24, "h"]] as const) {
      if (amount < perNext) return `${amount}${suffix} ago`;
      amount = Math.floor(amount / perNext);
    }
    return `${amount}d ago`;
  }
  /**
   * Format a byte count using 1024-based units.
   *
   * Values below 1024 are printed as "<n> B"; larger values are printed with
   * one decimal place in KB, MB or GB (GB is the largest unit used).
   *
   * @param bytes Number of bytes.
   * @returns The formatted size string.
   */
  function formatBytes(bytes: number): string {
    const STEP = 1024;
    if (bytes < STEP) return `${bytes} B`;

    const units = ["KB", "MB", "GB"];
    let divisor = STEP;
    let unitIndex = 0;
    // Move to the next unit while the value does not fit below 1024 of the
    // current one, stopping at the last unit.
    while (unitIndex < units.length - 1 && !(bytes < divisor * STEP)) {
      divisor *= STEP;
      unitIndex += 1;
    }
    return `${(bytes / divisor).toFixed(1)} ${units[unitIndex]}`;
  }
  /**
   * Print an overview of the index: database path and size, document and
   * vector counts, pending embeddings, time of the most recent update, and
   * every collection with its path contexts.
   *
   * Duplicate collections are cleaned up first. The database is closed when
   * the function finishes, including when it throws.
   */
  function showStatus(): void {
    const dbPath = getDbPath();
    const db = getDb();

    try {
      // Cleanup any duplicate collections
      cleanupDuplicateCollections(db);

      // Index size (left at 0 if it cannot be read)
      let indexSize = 0;
      try {
        indexSize = Bun.file(dbPath).size;
      } catch {}

      // Collections info
      const collections = db.prepare(`
        SELECT c.id, c.pwd, c.glob_pattern, c.created_at,
          COUNT(d.id) as doc_count,
          SUM(CASE WHEN d.active = 1 THEN 1 ELSE 0 END) as active_count,
          MAX(d.modified_at) as last_modified
        FROM collections c
        LEFT JOIN documents d ON d.collection_id = c.id
        GROUP BY c.id
        ORDER BY c.created_at DESC
      `).all() as { id: number; pwd: string; glob_pattern: string; created_at: string; doc_count: number; active_count: number; last_modified: string | null }[];

      // Overall stats
      const totalDocs = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number };
      const vectorCount = db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get() as { count: number };
      const needsEmbedding = getHashesNeedingEmbedding(db);

      // Most recent update across all collections
      const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };

      console.log(`${c.bold}QMD Status${c.reset}\n`);
      console.log(`Index: ${dbPath}`);
      console.log(`Size: ${formatBytes(indexSize)}\n`);
      console.log(`${c.bold}Documents${c.reset}`);
      console.log(` Total: ${totalDocs.count} files indexed`);
      console.log(` Vectors: ${vectorCount.count} embedded`);
      if (needsEmbedding > 0) {
        console.log(` ${c.yellow}Pending: ${needsEmbedding} need embedding${c.reset} (run 'qmd embed')`);
      }
      if (mostRecent.latest) {
        const lastUpdate = new Date(mostRecent.latest);
        console.log(` Updated: ${formatTimeAgo(lastUpdate)}`);
      }

      // Get all path contexts together with the collection they belong to
      const pathContexts = db.prepare(`SELECT collection_id, path_prefix, context FROM path_contexts ORDER BY path_prefix`).all() as { collection_id: number | null; path_prefix: string; context: string }[];

      if (collections.length > 0) {
        console.log(`\n${c.bold}Collections${c.reset}`);
        for (const col of collections) {
          const lastMod = col.last_modified ? formatTimeAgo(new Date(col.last_modified)) : "never";
          console.log(` ${c.cyan}${col.pwd}${c.reset}`);
          console.log(` ${col.glob_pattern} → ${col.active_count} docs (updated ${lastMod})`);

          // Contexts are stored per collection with a collection-relative
          // prefix, so match on collection_id. Comparing the prefix with the
          // absolute pwd would attach an empty (global) prefix to every
          // collection and never match a relative one. Rows without a
          // collection_id keep the older absolute-prefix comparison.
          const matchingContexts = pathContexts.filter(ctx =>
            ctx.collection_id == null
              ? (ctx.path_prefix.startsWith(col.pwd) || col.pwd.startsWith(ctx.path_prefix))
              : ctx.collection_id === col.id
          );
          for (const ctx of matchingContexts) {
            const displayPath = shortPath(ctx.path_prefix);
            console.log(` ${c.dim}context: ${displayPath} → "${ctx.context}"${c.reset}`);
          }
        }
      } else {
        console.log(`\n${c.dim}No collections. Run 'qmd add .' to index markdown files.${c.reset}`);
      }
    } finally {
      closeDb();
    }
  }
  /**
   * Fill in `display_path` for every active document that has none.
   *
   * Each new path is computed with `computeDisplayPath` against the set of
   * display paths already in use, and is added to that set so later documents
   * in the same run do not receive the same value.
   *
   * @param db Open index database.
   * @returns Number of documents that were updated.
   */
  function updateDisplayPaths(db: Database): number {
    // Active documents whose display_path is NULL or empty, with their collection root.
    const pending = db.prepare(`
      SELECT d.id, d.filepath, c.pwd
      FROM documents d
      JOIN collections c ON d.collection_id = c.id
      WHERE d.active = 1 AND (d.display_path IS NULL OR d.display_path = '')
    `).all() as { id: number; filepath: string; pwd: string }[];
    if (pending.length === 0) return 0;

    // Display paths already in use by active documents.
    const taken = new Set<string>();
    const usedRows = db.prepare(`SELECT display_path FROM documents WHERE active = 1 AND display_path != ''`).all() as { display_path: string }[];
    for (const row of usedRows) {
      taken.add(row.display_path);
    }

    const assign = db.prepare(`UPDATE documents SET display_path = ? WHERE id = ?`);
    for (const { id, filepath, pwd } of pending) {
      const chosen = computeDisplayPath(filepath, pwd, taken);
      assign.run(chosen, id);
      taken.add(chosen);
    }
    return pending.length;
  }
  /**
   * Re-index every registered collection.
   *
   * Cleans up duplicate collections, clears the Ollama response cache, fills
   * in missing display paths, then runs `indexFiles` once per collection with
   * `process.env.PWD` temporarily pointing at the collection's directory.
   */
  async function updateCollections(): Promise<void> {
    const db = getDb();
    cleanupDuplicateCollections(db);

    // Clear Ollama cache on update
    clearCache(db);

    const collections = db.prepare(`SELECT id, pwd, glob_pattern FROM collections`).all() as { id: number; pwd: string; glob_pattern: string }[];
    if (collections.length === 0) {
      console.log(`${c.dim}No collections found. Run 'qmd add .' to index markdown files.${c.reset}`);
      closeDb();
      return;
    }

    // Update display_paths for any documents missing them (migration)
    const pathsUpdated = updateDisplayPaths(db);
    if (pathsUpdated > 0) {
      console.log(`${c.green}✓${c.reset} Updated ${pathsUpdated} display paths`);
    }

    // Don't close db here - indexFiles will reuse it and close at the end
    console.log(`${c.bold}Updating ${collections.length} collection(s)...${c.reset}\n`);

    for (let i = 0; i < collections.length; i++) {
      const col = collections[i];
      console.log(`${c.cyan}[${i + 1}/${collections.length}]${c.reset} ${c.bold}${col.pwd}${c.reset}`);
      console.log(`${c.dim} Pattern: ${col.glob_pattern}${c.reset}`);

      // Temporarily set PWD for indexing
      const originalPwd = process.env.PWD;
      process.env.PWD = col.pwd;
      try {
        await indexFiles(col.glob_pattern);
      } finally {
        // Restore even when indexing throws. An unset variable must be
        // deleted: assigning undefined would store the string "undefined".
        if (originalPwd === undefined) {
          delete process.env.PWD;
        } else {
          process.env.PWD = originalPwd;
        }
      }
      console.log("");
    }

    console.log(`${c.green}✓ All collections updated.${c.reset}`);
  }
  /**
   * Detect which collection (if any) contains the given filesystem path.
   *
   * The path is first resolved with `getRealPath`. When several collections
   * contain it, the one with the longest `pwd` wins.
   *
   * @param db     Open index database.
   * @param fsPath Filesystem path to look up.
   * @returns `{ collectionId, collectionName, relativePath }`, where
   *          `relativePath` is '' for the collection root itself, or null if
   *          the path is not in any collection.
   */
  function detectCollectionFromPath(db: Database, fsPath: string): { collectionId: number; collectionName: string; relativePath: string } | null {
    const realPath = getRealPath(fsPath);

    // SQL only pre-filters: LIKE ignores ASCII case by default and treats
    // '%' / '_' inside pwd as wildcards, so it can return false positives.
    // Fetch all candidates (longest pwd first) and confirm each one exactly.
    const candidates = db.prepare(`
      SELECT id, name, pwd
      FROM collections
      WHERE ? LIKE pwd || '/%' OR ? = pwd
      ORDER BY LENGTH(pwd) DESC
    `).all(realPath, realPath) as { id: number; name: string; pwd: string }[];

    for (const coll of candidates) {
      if (realPath === coll.pwd) {
        return { collectionId: coll.id, collectionName: coll.name, relativePath: '' };
      }
      if (realPath.startsWith(coll.pwd + '/')) {
        return {
          collectionId: coll.id,
          collectionName: coll.name,
          relativePath: realPath.slice(coll.pwd.length + 1),
        };
      }
    }
    return null;
  }
  /**
   * Attach a context description to a path, a virtual path or all collections.
   *
   * - "/" adds the context to every collection under an empty path prefix.
   * - A `qmd://collection/path` virtual path is applied to the named collection.
   * - Anything else is treated as a filesystem path ("." / "./" / undefined
   *   mean the current directory, "~" and "~/..." are expanded to the home
   *   directory, other relative paths are resolved against the current
   *   directory) and must lie inside an indexed collection.
   *
   * An existing context for the same collection and prefix is overwritten.
   * Exits the process with status 1 when the virtual path is invalid, the
   * collection is unknown, or the path is not inside any collection.
   *
   * @param pathArg     Target path, or undefined for the current directory.
   * @param contextText Context description to store.
   */
  async function contextAdd(pathArg: string | undefined, contextText: string): Promise<void> {
    const db = getDb();
    const now = new Date().toISOString();

    // Single upsert shared by all three branches below.
    const upsert = db.prepare(`
      INSERT INTO path_contexts (collection_id, path_prefix, context, created_at)
      VALUES (?, ?, ?, ?)
      ON CONFLICT(collection_id, path_prefix) DO UPDATE SET context = excluded.context
    `);
    const saveContext = (collectionId: number, pathPrefix: string): void => {
      upsert.run(collectionId, pathPrefix, contextText, now);
    };

    // Handle "/" as global/root context (applies to all collections)
    if (pathArg === '/') {
      const collections = db.prepare(`SELECT id, name FROM collections`).all() as { id: number; name: string }[];
      for (const coll of collections) {
        saveContext(coll.id, '');
      }
      console.log(`${c.green}✓${c.reset} Added global context to ${collections.length} collection(s)`);
      console.log(`${c.dim}Context: ${contextText}${c.reset}`);
      closeDb();
      return;
    }

    // Resolve path - defaults to current directory if not provided
    let fsPath = pathArg || '.';
    if (fsPath === '.' || fsPath === './') {
      fsPath = getPwd();
    } else if (fsPath === '~') {
      // Bare "~" (e.g. when quoted on the command line) means the home directory.
      fsPath = homedir();
    } else if (fsPath.startsWith('~/')) {
      fsPath = homedir() + fsPath.slice(1);
    } else if (!fsPath.startsWith('/') && !fsPath.startsWith('qmd://')) {
      fsPath = resolve(getPwd(), fsPath);
    }

    // Handle virtual paths (qmd://collection/path)
    if (isVirtualPath(fsPath)) {
      const parsed = parseVirtualPath(fsPath);
      if (!parsed) {
        console.error(`${c.yellow}Invalid virtual path: ${fsPath}${c.reset}`);
        process.exit(1);
      }
      const coll = getCollectionByName(db, parsed.collectionName);
      if (!coll) {
        console.error(`${c.yellow}Collection not found: ${parsed.collectionName}${c.reset}`);
        process.exit(1);
      }
      saveContext(coll.id, parsed.path);
      console.log(`${c.green}✓${c.reset} Added context for: qmd://${parsed.collectionName}/${parsed.path || ''}`);
      console.log(`${c.dim}Context: ${contextText}${c.reset}`);
      closeDb();
      return;
    }

    // Detect collection from filesystem path
    const detected = detectCollectionFromPath(db, fsPath);
    if (!detected) {
      console.error(`${c.yellow}Path is not in any indexed collection: ${fsPath}${c.reset}`);
      console.error(`${c.dim}Run 'qmd status' to see indexed collections${c.reset}`);
      process.exit(1);
    }
    saveContext(detected.collectionId, detected.relativePath);

    const displayPath = detected.relativePath ? `qmd://${detected.collectionName}/${detected.relativePath}` : `qmd://${detected.collectionName}/`;
    console.log(`${c.green}✓${c.reset} Added context for: ${displayPath}`);
    console.log(`${c.dim}Context: ${contextText}${c.reset}`);
    closeDb();
  }
  /**
   * Print every stored path context, grouped under its collection name.
   * Within a collection, longer prefixes are listed first; the empty prefix
   * is shown as the collection root.
   */
  function contextList(): void {
    const db = getDb();
    type ContextRow = { collection_name: string; path_prefix: string; context: string };

    const rows = db.prepare(`
      SELECT c.name as collection_name, pc.path_prefix, pc.context
      FROM path_contexts pc
      JOIN collections c ON c.id = pc.collection_id
      ORDER BY c.name, LENGTH(pc.path_prefix) DESC, pc.path_prefix
    `).all() as ContextRow[];

    if (rows.length > 0) {
      console.log(`\n${c.bold}Configured Contexts${c.reset}\n`);

      // Rows arrive sorted by collection, so a header is needed only when the name changes.
      let currentGroup = '';
      rows.forEach(row => {
        if (row.collection_name !== currentGroup) {
          console.log(`${c.cyan}${row.collection_name}${c.reset}`);
          currentGroup = row.collection_name;
        }
        let label: string;
        if (row.path_prefix) {
          label = ` ${row.path_prefix}`;
        } else {
          label = ' / (root)';
        }
        console.log(label);
        console.log(` ${c.dim}${row.context}${c.reset}`);
      });
    } else {
      console.log(`${c.dim}No contexts configured. Use 'qmd context add' to add one.${c.reset}`);
    }

    closeDb();
  }
  /**
   * Remove a previously stored path context.
   *
   * `pathArg` may be:
   *   - "/"              -> delete the root (empty-prefix) context of every collection
   *   - "qmd://name/sub" -> a virtual path, looked up by collection name
   *   - a filesystem path (absolute, "~/..." or relative to the working directory)
   *
   * Except for "/", removing a context that does not exist is an error: a
   * message is printed, the database is closed and the process exits with
   * status 1. The same happens for an invalid virtual path, an unknown
   * collection, or a path outside every collection.
   *
   * @param pathArg Path whose context should be removed.
   */
  function contextRemove(pathArg: string): void {
    const db = getDb();

    if (pathArg === '/') {
      // Remove all root contexts
      const result = db.prepare(`DELETE FROM path_contexts WHERE path_prefix = ''`).run();
      console.log(`${c.green}✓${c.reset} Removed ${result.changes} global context(s)`);
      closeDb();
      return;
    }

    // Both remaining branches delete exactly one (collection, prefix) row.
    const deleteContext = db.prepare(`
      DELETE FROM path_contexts
      WHERE collection_id = ? AND path_prefix = ?
    `);

    // Handle virtual paths
    if (isVirtualPath(pathArg)) {
      const parsed = parseVirtualPath(pathArg);
      if (!parsed) {
        console.error(`${c.yellow}Invalid virtual path: ${pathArg}${c.reset}`);
        closeDb();
        process.exit(1);
      }
      const coll = getCollectionByName(db, parsed.collectionName);
      if (!coll) {
        console.error(`${c.yellow}Collection not found: ${parsed.collectionName}${c.reset}`);
        closeDb();
        process.exit(1);
      }
      const result = deleteContext.run(coll.id, parsed.path);
      if (result.changes === 0) {
        console.error(`${c.yellow}No context found for: ${pathArg}${c.reset}`);
        closeDb();
        process.exit(1);
      }
      console.log(`${c.green}✓${c.reset} Removed context for: ${pathArg}`);
      closeDb();
      return;
    }

    // Handle filesystem paths
    let fsPath = pathArg;
    if (fsPath === '.' || fsPath === './') {
      fsPath = getPwd();
    } else if (fsPath.startsWith('~/')) {
      fsPath = homedir() + fsPath.slice(1);
    } else if (!fsPath.startsWith('/')) {
      fsPath = resolve(getPwd(), fsPath);
    }

    const detected = detectCollectionFromPath(db, fsPath);
    if (!detected) {
      console.error(`${c.yellow}Path is not in any indexed collection: ${fsPath}${c.reset}`);
      closeDb();
      process.exit(1);
    }

    const result = deleteContext.run(detected.collectionId, detected.relativePath);
    if (result.changes === 0) {
      console.error(`${c.yellow}No context found for: qmd://${detected.collectionName}/${detected.relativePath}${c.reset}`);
      closeDb();
      process.exit(1);
    }
    console.log(`${c.green}✓${c.reset} Removed context for: qmd://${detected.collectionName}/${detected.relativePath}`);
    closeDb();
  }
  /**
   * Print one indexed document (optionally a line range of it) to stdout,
   * preceded by its folder context when one is configured.
   *
   * Lookup order:
   *   - virtual path (qmd://collection/path): exact match in that collection,
   *     then a path-suffix match within the same collection
   *   - filesystem path: exact match inside the detected collection, then a
   *     match on the last path component across all collections
   *
   * Prints an error, closes the database and exits with status 1 when the
   * virtual path is invalid or no document is found.
   *
   * @param filename Path to fetch; a trailing ":N" is read as the starting
   *                 line when `fromLine` is not given.
   * @param fromLine 1-based first line to print.
   * @param maxLines Maximum number of lines to print.
   */
  function getDocument(filename: string, fromLine?: number, maxLines?: number): void {
    const db = getDb();
    type DocRow = { collectionId: number; collectionName: string; path: string; body: string };

    // Parse :linenum suffix from filename (e.g., "file.md:100")
    let inputPath = filename;
    const colonMatch = inputPath.match(/:(\d+)$/);
    if (colonMatch && !fromLine) {
      fromLine = parseInt(colonMatch[1], 10);
      inputPath = inputPath.slice(0, -colonMatch[0].length);
    }

    let doc: DocRow | null = null;

    if (isVirtualPath(inputPath)) {
      // Handle virtual paths (qmd://collection/path)
      const parsed = parseVirtualPath(inputPath);
      if (!parsed) {
        console.error(`Invalid virtual path: ${inputPath}`);
        closeDb();
        process.exit(1);
      }

      // Try exact match on collection + path
      doc = db.prepare(`
        SELECT c.id as collectionId, c.name as collectionName, d.path, content.doc as body
        FROM documents d
        JOIN collections c ON c.id = d.collection_id
        JOIN content ON content.hash = d.hash
        WHERE c.name = ? AND d.path = ? AND d.active = 1
      `).get(parsed.collectionName, parsed.path) as DocRow | null;

      if (!doc) {
        // Try fuzzy match by path ending
        doc = db.prepare(`
          SELECT c.id as collectionId, c.name as collectionName, d.path, content.doc as body
          FROM documents d
          JOIN collections c ON c.id = d.collection_id
          JOIN content ON content.hash = d.hash
          WHERE c.name = ? AND d.path LIKE ? AND d.active = 1
          LIMIT 1
        `).get(parsed.collectionName, `%${parsed.path}`) as DocRow | null;
      }
    } else {
      // Handle filesystem paths
      let fsPath = inputPath;
      if (fsPath.startsWith('~/')) {
        // Expand ~ to home directory
        fsPath = homedir() + fsPath.slice(1);
      } else if (!fsPath.startsWith('/')) {
        // Relative path - resolve from current directory
        fsPath = resolve(getPwd(), fsPath);
      }
      fsPath = getRealPath(fsPath);

      // Try to detect which collection contains this path
      const detected = detectCollectionFromPath(db, fsPath);
      if (detected) {
        // Found collection - query by collection_id + relative path
        doc = db.prepare(`
          SELECT c.id as collectionId, c.name as collectionName, d.path, content.doc as body
          FROM documents d
          JOIN collections c ON c.id = d.collection_id
          JOIN content ON content.hash = d.hash
          WHERE c.id = ? AND d.path = ? AND d.active = 1
        `).get(detected.collectionId, detected.relativePath) as DocRow | null;
      }

      // Fuzzy match by the last path component (named so it does not shadow `filename`)
      if (!doc) {
        const baseName = inputPath.split('/').pop() || inputPath;
        doc = db.prepare(`
          SELECT c.id as collectionId, c.name as collectionName, d.path, content.doc as body
          FROM documents d
          JOIN collections c ON c.id = d.collection_id
          JOIN content ON content.hash = d.hash
          WHERE d.path LIKE ? AND d.active = 1
          LIMIT 1
        `).get(`%${baseName}`) as DocRow | null;
      }
    }

    if (!doc) {
      console.error(`Document not found: ${filename}`);
      closeDb();
      process.exit(1);
    }

    // Get context for this file
    const context = getContextForPath(db, doc.collectionId, doc.path);

    let output = doc.body;

    // Apply line filtering if specified
    if (fromLine !== undefined || maxLines !== undefined) {
      const lines = output.split('\n');
      // Convert to 0-indexed; clamp so a negative value cannot make slice()
      // count from the end of the document.
      const start = Math.max(0, (fromLine || 1) - 1);
      const end = maxLines !== undefined ? start + Math.max(0, maxLines) : lines.length;
      output = lines.slice(start, end).join('\n');
    }

    // Output context header if exists
    if (context) {
      console.log(`Folder Context: ${context}\n---\n`);
    }
    console.log(output);
    closeDb();
  }
  // Multi-get: fetch multiple documents by glob pattern or comma-separated list.
  //
  // pattern  - treated as a comma-separated list of names (virtual qmd:// paths,
  //            exact document paths, or path suffixes) when it contains a comma
  //            and no '*' or '?'; otherwise handed to matchFilesByGlob as a glob
  // maxLines - optional cap on the number of lines emitted per document
  // maxBytes - documents whose stored length exceeds this are reported as skipped
  // format   - "json", "csv", "files", "md" or "xml"; anything else prints the CLI layout
  //
  // Names that cannot be resolved are reported on stderr and omitted. A glob that
  // matches nothing prints an error, closes the database and exits with status 1.
  function multiGet(pattern: string, maxLines?: number, maxBytes: number = DEFAULT_MULTI_GET_MAX_BYTES, format: OutputFormat = "cli"): void {
    const db = getDb();

    type FileEntry = { filepath: string; displayPath: string; bodyLength: number; collectionId?: number; path?: string };
    type LookupRow = { virtual_path: string; body_length: number; collection_id: number; path: string };

    // Check if it's a comma-separated list or a glob pattern
    const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?');
    let files: FileEntry[];

    if (isCommaSeparated) {
      // The three lookups differ only in their WHERE clause; prepare each once
      // instead of once per requested name.
      const lookupSelect = `
        SELECT
          'qmd://' || c.name || '/' || d.path as virtual_path,
          LENGTH(content.doc) as body_length,
          d.collection_id,
          d.path
        FROM documents d
        JOIN collections c ON c.id = d.collection_id
        JOIN content ON content.hash = d.hash
      `;
      const byCollectionAndPath = db.prepare(`${lookupSelect} WHERE c.name = ? AND d.path = ? AND d.active = 1`);
      const byExactPath = db.prepare(`${lookupSelect} WHERE d.path = ? AND d.active = 1 LIMIT 1`);
      const byPathSuffix = db.prepare(`${lookupSelect} WHERE d.path LIKE ? AND d.active = 1 LIMIT 1`);

      // Comma-separated list of files (can be virtual paths or relative paths)
      const names = pattern.split(',').map(s => s.trim()).filter(Boolean);
      files = [];
      for (const name of names) {
        let doc: LookupRow | null = null;

        if (isVirtualPath(name)) {
          // Virtual path: exact match on collection + path only
          const parsed = parseVirtualPath(name);
          if (parsed) {
            doc = byCollectionAndPath.get(parsed.collectionName, parsed.path) as LookupRow | null;
          }
        } else {
          // Try exact match on path, then fall back to a suffix match
          doc = byExactPath.get(name) as LookupRow | null;
          if (!doc) {
            doc = byPathSuffix.get(`%${name}`) as LookupRow | null;
          }
        }

        if (doc) {
          files.push({
            filepath: doc.virtual_path,
            displayPath: doc.virtual_path,
            bodyLength: doc.body_length,
            collectionId: doc.collection_id,
            path: doc.path
          });
        } else {
          console.error(`File not found: ${name}`);
        }
      }
    } else {
      // Glob pattern - matchFilesByGlob now returns virtual paths
      files = matchFilesByGlob(db, pattern).map(f => ({
        ...f,
        collectionId: undefined, // Resolved from the virtual path below
        path: undefined
      }));
      if (files.length === 0) {
        console.error(`No files matched pattern: ${pattern}`);
        closeDb();
        process.exit(1);
      }
    }

    // Body lookup is identical for every file; prepare it once.
    const fetchBodyStmt = db.prepare(`
      SELECT content.doc as body, d.title
      FROM documents d
      JOIN collections c ON c.id = d.collection_id
      JOIN content ON content.hash = d.hash
      WHERE c.name = ? AND d.path = ? AND d.active = 1
    `);

    // Collect results for structured output
    const results: { file: string; displayPath: string; title: string; body: string; context: string | null; skipped: boolean; skipReason?: string }[] = [];

    for (const file of files) {
      // Parsed once; used both to fill in missing collection info and to fetch the body.
      const parsed = parseVirtualPath(file.displayPath);

      let collectionId = file.collectionId;
      let path = file.path;
      if ((!collectionId || !path) && parsed) {
        const coll = getCollectionByName(db, parsed.collectionName);
        if (coll) {
          collectionId = coll.id;
          path = parsed.path;
        }
      }

      // Get context using collection-scoped function
      const context = collectionId && path ? getContextForPath(db, collectionId, path) : null;

      // Check size limit
      if (file.bodyLength > maxBytes) {
        results.push({
          file: file.filepath,
          displayPath: file.displayPath,
          title: file.displayPath.split('/').pop() || file.displayPath,
          body: "",
          context,
          skipped: true,
          skipReason: `File too large (${Math.round(file.bodyLength / 1024)}KB > ${Math.round(maxBytes / 1024)}KB). Use 'qmd get ${file.displayPath}' to retrieve.`,
        });
        continue;
      }

      // Fetch document content - use virtual path to query
      if (!parsed) continue;
      const doc = fetchBodyStmt.get(parsed.collectionName, parsed.path) as { body: string; title: string } | null;
      if (!doc) continue;

      let body = doc.body;

      // Apply line limit if specified
      if (maxLines !== undefined) {
        const lines = body.split('\n');
        body = lines.slice(0, maxLines).join('\n');
        if (lines.length > maxLines) {
          body += `\n\n[... truncated ${lines.length - maxLines} more lines]`;
        }
      }

      results.push({
        file: file.filepath,
        displayPath: file.displayPath,
        title: doc.title || file.displayPath.split('/').pop() || file.displayPath,
        body,
        context,
        skipped: false,
      });
    }

    closeDb();

    // Output based on format
    if (format === "json") {
      const output = results.map(r => ({
        file: r.displayPath,
        title: r.title,
        ...(r.context && { context: r.context }),
        ...(r.skipped ? { skipped: true, reason: r.skipReason } : { body: r.body }),
      }));
      console.log(JSON.stringify(output, null, 2));
    } else if (format === "csv") {
      // skipReason is optional, so undefined must be accepted here. Fields with a
      // carriage return are quoted too, so a CRLF body cannot break a record.
      const escapeField = (val: string | null | undefined): string => {
        if (val === null || val === undefined) return "";
        const str = String(val);
        if (str.includes(",") || str.includes('"') || str.includes("\n") || str.includes("\r")) {
          return `"${str.replace(/"/g, '""')}"`;
        }
        return str;
      };
      console.log("file,title,context,skipped,body");
      for (const r of results) {
        console.log([r.displayPath, r.title, r.context || "", r.skipped ? "true" : "false", r.skipped ? r.skipReason : r.body].map(escapeField).join(","));
      }
    } else if (format === "files") {
      for (const r of results) {
        const ctx = r.context ? `,"${r.context.replace(/"/g, '""')}"` : "";
        const status = r.skipped ? "[SKIPPED]" : "";
        console.log(`${r.displayPath}${ctx}${status ? `,${status}` : ""}`);
      }
    } else if (format === "md") {
      for (const r of results) {
        console.log(`## ${r.displayPath}\n`);
        if (r.title && r.title !== r.displayPath) console.log(`**Title:** ${r.title}\n`);
        if (r.context) console.log(`**Context:** ${r.context}\n`);
        if (r.skipped) {
          console.log(`> ${r.skipReason}\n`);
        } else {
          console.log("```");
          console.log(r.body);
          console.log("```\n");
        }
      }
    } else if (format === "xml") {
      console.log('<?xml version="1.0" encoding="UTF-8"?>');
      console.log("<documents>");
      for (const r of results) {
        console.log(" <document>");
        console.log(` <file>${escapeXml(r.displayPath)}</file>`);
        console.log(` <title>${escapeXml(r.title)}</title>`);
        if (r.context) console.log(` <context>${escapeXml(r.context)}</context>`);
        if (r.skipped) {
          console.log(` <skipped>true</skipped>`);
          console.log(` <reason>${escapeXml(r.skipReason || "")}</reason>`);
        } else {
          console.log(` <body>${escapeXml(r.body)}</body>`);
        }
        console.log(" </document>");
      }
      console.log("</documents>");
    } else {
      // CLI format (default)
      for (const r of results) {
        console.log(`\n${'='.repeat(60)}`);
        console.log(`File: ${r.displayPath}`);
        console.log(`${'='.repeat(60)}\n`);
        if (r.skipped) {
          console.log(`[SKIPPED: ${r.skipReason}]`);
          continue;
        }
        if (r.context) {
          console.log(`Folder Context: ${r.context}\n---\n`);
        }
        console.log(r.body);
      }
    }
  }
  /**
   * Delete the collection registered for (current directory, globPattern)
   * together with its document rows. Does nothing when no such collection
   * exists. The database handle is deliberately left open.
   *
   * @param globPattern Glob pattern that identifies the collection within the
   *                    current working directory.
   */
  async function dropCollection(globPattern: string): Promise<void> {
    const db = getDb();
    const pwd = getPwd();

    const match = db
      .prepare(`SELECT id FROM collections WHERE pwd = ? AND glob_pattern = ?`)
      .get(pwd, globPattern) as { id: number } | null;

    if (match) {
      const { id } = match;

      // Documents go first, then the collection row itself.
      const docsResult = db.prepare(`DELETE FROM documents WHERE collection_id = ?`).run(id);
      db.prepare(`DELETE FROM collections WHERE id = ?`).run(id);

      console.log(`Dropped collection: ${pwd} (${globPattern})`);
      console.log(`Removed ${docsResult.changes} documents`);
      console.log(`(Vectors kept for potential reuse)`);
    }
    // Nothing to drop is fine - a collection is created during indexing.
    // Don't close db - indexFiles will use it and close at the end.
  }
  /**
   * Scan the current directory for files matching `globPattern` and bring the
   * collection for (current directory, globPattern) up to date:
   *   - new paths are inserted,
   *   - changed content deactivates the old row and inserts a new one,
   *   - an unchanged hash with a different extracted title updates the title,
   *   - active rows whose path was not seen in this scan are deactivated,
   *   - content rows no longer referenced by an active document are deleted.
   * Prints a summary and closes the database when done.
   *
   * @param globPattern Glob evaluated relative to the current working directory.
   */
  async function indexFiles(globPattern: string = DEFAULT_GLOB): Promise<void> {
    const db = getDb();
    const pwd = getPwd();
    const now = new Date().toISOString();
    const excludeDirs = ["node_modules", ".git", ".cache", "vendor", "dist", "build"];

    // Clear Ollama cache on index
    clearCache(db);

    // Get or create collection for this (pwd, glob)
    const collectionId = getOrCreateCollection(db, pwd, globPattern);
    console.log(`Collection: ${pwd} (${globPattern})`);
    progress.indeterminate();

    // A path is skipped when any segment is hidden (.*) or a known exclude.
    const isExcludedSegment = (segment: string): boolean =>
      segment === "node_modules" || segment.startsWith(".") || excludeDirs.includes(segment);

    const files: string[] = [];
    const scanner = new Glob(globPattern);
    for await (const candidate of scanner.scan({ cwd: pwd, onlyFiles: true, followSymlinks: true })) {
      if (candidate.split("/").some(isExcludedSegment)) continue;
      files.push(candidate);
    }

    const total = files.length;
    if (total === 0) {
      progress.clear();
      console.log("No files found matching pattern.");
      closeDb();
      return;
    }

    // Prepared statements for new schema
    const insertContentStmt = db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`);
    const insertDocStmt = db.prepare(`INSERT INTO documents (collection_id, path, title, hash, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, 1)`);
    const deactivateStmt = db.prepare(`UPDATE documents SET active = 0 WHERE collection_id = ? AND path = ? AND active = 1`);
    const findActiveStmt = db.prepare(`SELECT id, hash, title FROM documents WHERE collection_id = ? AND path = ? AND active = 1`);
    const updateTitleStmt = db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`);

    let indexed = 0;
    let updated = 0;
    let unchanged = 0;
    let processed = 0;
    const seenPaths = new Set<string>();
    const startTime = Date.now();

    for (const relativeFile of files) {
      const filepath = getRealPath(resolve(pwd, relativeFile));
      // The collection-relative path is stored as-is
      seenPaths.add(relativeFile);

      const content = await Bun.file(filepath).text();
      const hash = await hashContent(content);
      const title = extractTitle(content, relativeFile);

      const existing = findActiveStmt.get(collectionId, relativeFile) as { id: number; hash: string; title: string } | null;

      if (existing && existing.hash === hash) {
        // Same content; only the extracted title may have drifted
        if (existing.title === title) {
          unchanged++;
        } else {
          updateTitleStmt.run(title, now, existing.id);
          updated++;
        }
      } else {
        if (existing) {
          // Content changed - store the new content and retire the old row
          insertContentStmt.run(hash, content, now);
          deactivateStmt.run(collectionId, relativeFile);
          updated++;
        } else {
          // New document
          indexed++;
          insertContentStmt.run(hash, content, now);
        }
        // Both cases end by inserting a fresh active row stamped from the file's stat
        const stat = await Bun.file(filepath).stat();
        const createdAt = stat ? new Date(stat.birthtime).toISOString() : now;
        const modifiedAt = stat ? new Date(stat.mtime).toISOString() : now;
        insertDocStmt.run(collectionId, relativeFile, title, hash, createdAt, modifiedAt);
      }

      processed++;
      progress.set((processed / total) * 100);

      const elapsed = (Date.now() - startTime) / 1000;
      const rate = processed / elapsed;
      const remaining = (total - processed) / rate;
      // No ETA is shown for the first two files
      const eta = processed > 2 ? ` ETA: ${formatETA(remaining)}` : "";
      process.stderr.write(`\rIndexing: ${processed}/${total}${eta} `);
    }

    // Deactivate documents in this collection that no longer exist
    const allActive = db.prepare(`SELECT path FROM documents WHERE collection_id = ? AND active = 1`).all(collectionId) as { path: string }[];
    let removed = 0;
    for (const { path: activePath } of allActive) {
      if (seenPaths.has(activePath)) continue;
      deactivateStmt.run(collectionId, activePath);
      removed++;
    }

    // Clean up orphaned content hashes (content not referenced by any document)
    const orphanedContent = db.prepare(`
      DELETE FROM content
      WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
    `).run().changes;

    // Check if vector index needs updating
    const needsEmbedding = getHashesNeedingEmbedding(db);

    progress.clear();
    console.log(`\nIndexed: ${indexed} new, ${updated} updated, ${unchanged} unchanged, ${removed} removed`);
    if (orphanedContent > 0) {
      console.log(`Cleaned up ${orphanedContent} orphaned content hash(es)`);
    }
    if (needsEmbedding > 0) {
      console.log(`\nRun 'qmd embed' to update embeddings (${needsEmbedding} unique hashes need vectors)`);
    }
    closeDb();
  }
  /**
   * Render a fixed-width text progress bar.
   *
   * `percent` is clamped to the 0-100 range (NaN counts as 0) and a negative
   * `width` is treated as 0, so out-of-range input can never reach
   * String.prototype.repeat with a negative count, which throws a RangeError.
   *
   * @param percent Completion percentage.
   * @param width   Total number of bar cells (default 30).
   * @returns `width` characters: filled cells followed by empty cells.
   */
  function renderProgressBar(percent: number, width: number = 30): string {
    const safeWidth = Math.max(0, width);
    const safePercent = Number.isNaN(percent) ? 0 : Math.min(100, Math.max(0, percent));
    const filled = Math.round((safePercent / 100) * safeWidth);
    const empty = safeWidth - filled;
    return "█".repeat(filled) + "░".repeat(empty);
  }
  /**
   * Embed every active content hash that has no vector yet and store the
   * resulting chunk embeddings.
   *
   * Document bodies are read from `content.doc` and paths from
   * `documents.path`, the same columns the indexing and retrieval code in this
   * file uses. Each body is split with chunkDocument; every chunk is embedded
   * and written to `vectors_vec` and `content_vectors`. A failure on any chunk
   * after the first is reported and counted without stopping the run. A
   * failure on the first chunk propagates to the caller, but the cursor and
   * progress indicator are restored first.
   *
   * @param model Embedding model name passed to getEmbedding.
   * @param force When true, all existing vectors are cleared before embedding.
   */
  async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = false): Promise<void> {
    const db = getDb();
    const now = new Date().toISOString();

    // If force, clear all vectors
    if (force) {
      console.log(`${c.yellow}Force re-indexing: clearing all vectors...${c.reset}`);
      db.exec(`DELETE FROM content_vectors`);
      db.exec(`DROP TABLE IF EXISTS vectors_vec`);
    }

    // Find unique hashes that need embedding (from active documents).
    // MIN(d.path) picks one representative path per hash for titles and messages.
    const hashesToEmbed = db.prepare(`
      SELECT d.hash, content.doc as body, MIN(d.path) as path
      FROM documents d
      JOIN content ON content.hash = d.hash
      LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
      WHERE d.active = 1 AND v.hash IS NULL
      GROUP BY d.hash
    `).all() as { hash: string; body: string; path: string }[];

    if (hashesToEmbed.length === 0) {
      console.log(`${c.green}✓ All content hashes already have embeddings.${c.reset}`);
      closeDb();
      return;
    }

    // Prepare documents with chunks
    type ChunkItem = { hash: string; title: string; text: string; seq: number; pos: number; bytes: number; displayName: string };
    const allChunks: ChunkItem[] = [];
    let multiChunkDocs = 0;
    const encoder = new TextEncoder();

    for (const item of hashesToEmbed) {
      const bodyBytes = encoder.encode(item.body).length;
      if (bodyBytes === 0) continue; // Skip empty

      const title = extractTitle(item.body, item.path);
      const chunks = chunkDocument(item.body, CHUNK_BYTE_SIZE);
      if (chunks.length > 1) multiChunkDocs++;

      for (let seq = 0; seq < chunks.length; seq++) {
        allChunks.push({
          hash: item.hash,
          title,
          text: chunks[seq].text,
          seq,
          pos: chunks[seq].pos,
          bytes: encoder.encode(chunks[seq].text).length,
          displayName: item.path,
        });
      }
    }

    if (allChunks.length === 0) {
      console.log(`${c.green}✓ No non-empty documents to embed.${c.reset}`);
      closeDb();
      return;
    }

    // Callback parameter is named `chunk` so it does not shadow the colour table `c`.
    const totalBytes = allChunks.reduce((sum, chunk) => sum + chunk.bytes, 0);
    const totalChunks = allChunks.length;
    const totalDocs = hashesToEmbed.length;

    console.log(`${c.bold}Embedding ${totalDocs} documents${c.reset} ${c.dim}(${totalChunks} chunks, ${formatBytes(totalBytes)})${c.reset}`);
    if (multiChunkDocs > 0) {
      console.log(`${c.dim}${multiChunkDocs} documents split into multiple chunks${c.reset}`);
    }
    console.log(`${c.dim}Model: ${model}${c.reset}\n`);

    // Hide cursor during embedding
    cursor.hide();
    progress.indeterminate();

    let chunksEmbedded = 0, errors = 0, bytesProcessed = 0;
    let startTime = Date.now();

    try {
      // The first chunk's embedding determines the vector dimensions
      const first = allChunks[0];
      const firstEmbedding = await getEmbedding(first.text, model, false, first.title);
      ensureVecTable(db, firstEmbedding.length);

      const insertVecStmt = db.prepare(`INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
      const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`);

      // Throughput is measured from here, after the first embedding call
      startTime = Date.now();

      // Insert first chunk
      insertVecStmt.run(`${first.hash}_${first.seq}`, new Float32Array(firstEmbedding));
      insertContentVectorStmt.run(first.hash, first.seq, first.pos, model, now);
      chunksEmbedded++;
      bytesProcessed += first.bytes;

      for (let i = 1; i < allChunks.length; i++) {
        const chunk = allChunks[i];
        try {
          const embedding = await getEmbedding(chunk.text, model, false, chunk.title);
          const hashSeq = `${chunk.hash}_${chunk.seq}`;
          insertVecStmt.run(hashSeq, new Float32Array(embedding));
          insertContentVectorStmt.run(chunk.hash, chunk.seq, chunk.pos, model, now);
          chunksEmbedded++;
          bytesProcessed += chunk.bytes;
        } catch (err) {
          // Best effort: count the failure, keep progress moving, continue with the next chunk
          errors++;
          bytesProcessed += chunk.bytes;
          progress.error();
          console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}: ${err}${c.reset}`);
        }

        const percent = (bytesProcessed / totalBytes) * 100;
        progress.set(percent);

        const elapsed = (Date.now() - startTime) / 1000;
        const bytesPerSec = bytesProcessed / elapsed;
        const remainingBytes = totalBytes - bytesProcessed;
        const etaSec = remainingBytes / bytesPerSec;

        const bar = renderProgressBar(percent);
        const percentStr = percent.toFixed(0).padStart(3);
        const throughput = `${formatBytes(bytesPerSec)}/s`;
        const eta = elapsed > 2 ? formatETA(etaSec) : "...";
        const errStr = errors > 0 ? ` ${c.yellow}${errors} err${c.reset}` : "";

        process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}%${c.reset} ${c.dim}${chunksEmbedded}/${totalChunks}${c.reset}${errStr} ${c.dim}${throughput} ETA ${eta}${c.reset} `);
      }
    } finally {
      // Always restore the terminal, even when the first embedding call throws
      progress.clear();
      cursor.show();
    }

    const totalTimeSec = (Date.now() - startTime) / 1000;
    const avgThroughput = formatBytes(totalBytes / totalTimeSec);

    console.log(`\r${c.green}${renderProgressBar(100)}${c.reset} ${c.bold}100%${c.reset} `);
    console.log(`\n${c.green}✓ Done!${c.reset} Embedded ${c.bold}${chunksEmbedded}${c.reset} chunks from ${c.bold}${totalDocs}${c.reset} documents in ${c.bold}${formatETA(totalTimeSec)}${c.reset} ${c.dim}(${avgThroughput}/s)${c.reset}`);
    if (errors > 0) {
      console.log(`${c.yellow}⚠ ${errors} chunks failed${c.reset}`);
    }
    closeDb();
  }
  // Sanitize a term for FTS5: keep letters, digits, combining marks, underscores
  // and apostrophes (for contractions like "don't"); drop everything else.
  // Unicode property escapes are used because \w is ASCII-only in JavaScript and
  // would strip accented and non-Latin letters from the term.
  function sanitizeFTS5Term(term: string): string {
    return term.replace(/[^\p{L}\p{N}\p{M}_']/gu, '');
  }

  // Build FTS5 query: phrase-aware with fallback to individual terms.
  //
  // Returns "" when no term of at least two characters survives sanitizing, a
  // single quoted term when exactly one survives, and otherwise
  //   ("exact phrase") OR (NEAR("t1" "t2" ..., 10)) OR ("t1" OR "t2" ...)
  // so that exact phrase matches rank highest, then close proximity, then any term.
  function buildFTS5Query(query: string): string {
    // Sanitize the full query for phrase matching (whitespace is kept)
    const sanitizedQuery = query.replace(/[^\p{L}\p{N}\p{M}_\s']/gu, '').trim();

    const terms = query
      .split(/\s+/)
      .map(sanitizeFTS5Term)
      .filter(term => term.length >= 2); // Skip single chars and empty

    if (terms.length === 0) return "";

    // Double any embedded quote so each value is a valid FTS5 string
    const quote = (value: string): string => `"${value.replace(/"/g, '""')}"`;

    if (terms.length === 1) return quote(terms[0]);

    const phrase = quote(sanitizedQuery);
    const quotedTerms = terms.map(quote);
    // FTS5 NEAR syntax: NEAR(term1 term2, distance)
    const nearPhrase = `NEAR(${quotedTerms.join(' ')}, 10)`;
    const orTerms = quotedTerms.join(' OR ');

    // Exact phrase > proximity > any term
    return `(${phrase}) OR (${nearPhrase}) OR (${orTerms})`;
  }
  /**
   * Squash a BM25 score into the open interval (0, 1), higher meaning better.
   *
   * Only the magnitude of the score is used. It is passed through a logistic
   * curve centred at 5 with a scale of 3, so a magnitude of 5 maps to exactly
   * 0.5, 2 maps to about 0.27 and 15 maps to about 0.97.
   *
   * @param score Raw BM25 score.
   * @returns Normalized relevance between 0 and 1.
   */
  function normalizeBM25(score: number): number {
    const MIDPOINT = 5;
    const SCALE = 3;
    const magnitude = Math.abs(score);
    const exponent = (MIDPOINT - magnitude) / SCALE;
    return 1 / (1 + Math.exp(exponent));
  }
  /**
   * Get a collection ID by name.
   *
   * The name is matched as a literal substring of either the `pwd` or the
   * `glob_pattern` column. When several collections match, the one with the
   * longest `pwd` wins.
   *
   * `%`, `_` and `\` in the name are escaped so they are compared literally
   * instead of acting as LIKE wildcards (otherwise "my_notes" would also match
   * "my-notes" or "myXnotes").
   *
   * @param db - open database handle
   * @param name - user-supplied collection name fragment
   * @returns the matching collection id, or null when nothing matches
   */
  function getCollectionIdByName(db: Database, name: string): number | null {
    const escapedName = name.replace(/[\\%_]/g, '\\$&');
    const pattern = `%${escapedName}%`;
    // Search both pwd and glob_pattern columns for the name
    const result = db.prepare(`
      SELECT id FROM collections
      WHERE pwd LIKE ? ESCAPE '\\' OR glob_pattern LIKE ? ESCAPE '\\'
      ORDER BY LENGTH(pwd) DESC
      LIMIT 1
    `).get(pattern, pattern) as { id: number } | null;
    // `??` rather than `||` so that only a missing row maps to null.
    return result?.id ?? null;
  }
  // searchFTS and searchVec are now imported from store.ts with updated schema.
  // Removed duplicate searchFTS and searchVec functions - using store.ts versions instead.

  /**
   * Retired local copy of the vector search, kept under a REMOVED_ name.
   *
   * Embeds the query, asks the `vectors_vec` table for the nearest chunks,
   * joins them back to active documents and collapses the chunks to one result
   * per file. Three times `limit` chunks are requested so that several chunks
   * of the same document do not starve the final list.
   *
   * @param db - open database handle
   * @param query - text to embed and search for
   * @param model - embedding model name passed to getEmbedding
   * @param limit - maximum number of documents returned (default 20)
   * @param collectionId - optional collection filter
   * @returns documents sorted by descending score; [] when the vector table is missing
   */
  async function REMOVED_searchVec(db: Database, query: string, model: string, limit: number = 20, collectionId?: number): Promise<SearchResult[]> {
    const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (!tableExists) return [];

    const queryEmbedding = await getEmbedding(query, model, true);
    const queryVec = new Float32Array(queryEmbedding);

    // Join: vectors_vec -> content_vectors -> documents
    // Over-retrieve to handle multiple chunks per document, then dedupe
    let sql = `
      SELECT d.filepath, d.display_path, d.title, d.body, vec.distance, cv.pos
      FROM vectors_vec vec
      JOIN content_vectors cv ON vec.hash_seq = cv.hash || '_' || cv.seq
      JOIN documents d ON d.hash = cv.hash AND d.active = 1
      WHERE vec.embedding MATCH ? AND k = ?
    `;
    const params: (Float32Array | number)[] = [queryVec, limit * 3];
    if (collectionId !== undefined) {
      // Bound as a parameter instead of being spliced into the SQL text.
      sql += ` AND d.collection_id = ?`;
      params.push(collectionId);
    }
    sql += ` ORDER BY vec.distance`;

    const stmt = db.prepare(sql);
    const rawResults = stmt.all(...params) as { filepath: string; display_path: string; title: string; body: string; distance: number; pos: number }[];

    // Aggregate chunks per document: best (smallest) distance plus a count of matching chunks
    const byFile = new Map<string, { filepath: string; displayPath: string; title: string; body: string; chunkCount: number; bestPos: number; bestDist: number }>();
    for (const r of rawResults) {
      const existing = byFile.get(r.filepath);
      if (!existing) {
        byFile.set(r.filepath, { filepath: r.filepath, displayPath: r.display_path, title: r.title, body: r.body, chunkCount: 1, bestPos: r.pos, bestDist: r.distance });
      } else {
        existing.chunkCount++;
        if (r.distance < existing.bestDist) {
          existing.bestDist = r.distance;
          existing.bestPos = r.pos;
        }
      }
    }

    // Score = max chunk score + 0.02 bonus per additional chunk (capped at +0.1)
    return Array.from(byFile.values())
      .map(r => {
        const maxScore = 1 / (1 + r.bestDist);
        const bonusChunks = Math.min(r.chunkCount - 1, 5);
        const bonus = bonusChunks * 0.02;
        return {
          file: r.filepath,
          displayPath: r.displayPath,
          title: r.title,
          body: r.body,
          score: maxScore + bonus,
          source: "vec" as const,
          chunkPos: r.bestPos,
        };
      })
      .sort((a, b) => b.score - a.score)
      .slice(0, limit);
  }
  /**
   * Rescale result scores linearly so the lowest becomes 0 and the highest 1.
   *
   * When every score is identical the range falls back to 1, so all results
   * come out as 0. The input array is not mutated; each result is copied.
   *
   * Min and max are found with a plain loop rather than
   * `Math.max(...scores)`: spreading a very large array into a call can exceed
   * the engine's argument limit and throw a RangeError.
   *
   * @param results - search results to normalize
   * @returns new result objects with rescaled scores (the same array when empty)
   */
  function normalizeScores(results: SearchResult[]): SearchResult[] {
    if (results.length === 0) return results;

    let maxScore = -Infinity;
    let minScore = Infinity;
    for (const r of results) {
      if (r.score > maxScore) maxScore = r.score;
      if (r.score < minScore) minScore = r.score;
    }

    const range = maxScore - minScore || 1;
    return results.map(r => ({ ...r, score: (r.score - minScore) / range }));
  }
  // Reciprocal Rank Fusion merges several ranked lists into one.
  // A document's fused score is the sum of weight / (k + rank) over every list
  // it appears in, with rank counted from 1. k = 60 is the standard constant
  // and balances top positions against lower ones.
  export type RankedResult = { file: string; displayPath: string; title: string; body: string; score: number };

  /**
   * Fuse ranked result lists with weighted Reciprocal Rank Fusion.
   *
   * Documents are identified by `file`; display metadata is taken from the
   * first list in which a document is seen. After summing, a document that was
   * ranked #1 in any list gets +0.05 and one ranked in the top three gets
   * +0.02, so exact matches are not diluted by expansion queries.
   *
   * @param resultLists - ranked lists, best result first
   * @param weights - weight per list, by index (missing entries default to 1.0)
   * @param k - RRF smoothing constant
   * @returns fused results sorted by descending score
   */
  function reciprocalRankFusion(
    resultLists: RankedResult[][],
    weights: number[] = [],
    k: number = 60
  ): RankedResult[] {
    type Tally = { score: number; displayPath: string; title: string; body: string; bestRank: number };
    const tallies = new Map<string, Tally>();

    for (const [listIdx, list] of resultLists.entries()) {
      const listWeight = weights[listIdx] ?? 1.0;

      for (const [rank, doc] of list.entries()) {
        const contribution = listWeight / (k + rank + 1);
        const tally = tallies.get(doc.file);

        if (tally === undefined) {
          tallies.set(doc.file, {
            score: contribution,
            displayPath: doc.displayPath,
            title: doc.title,
            body: doc.body,
            bestRank: rank,
          });
          continue;
        }

        tally.score += contribution;
        tally.bestRank = Math.min(tally.bestRank, rank);
      }
    }

    const fused: RankedResult[] = [];
    for (const [file, tally] of tallies) {
      // bestRank is 0-based: 0 means "ranked #1 somewhere", 1-2 means top three.
      const bonus = tally.bestRank === 0 ? 0.05 : tally.bestRank <= 2 ? 0.02 : 0;
      fused.push({
        file,
        displayPath: tally.displayPath,
        title: tally.title,
        body: tally.body,
        score: tally.score + bonus,
      });
    }

    return fused.sort((a, b) => b.score - a.score);
  }
  /** Options shared by the search commands and the result printer. */
  type OutputOptions = {
    /** Output format selected on the command line. */
    format: OutputFormat;
    /** Emit the whole document body instead of a snippet. */
    full: boolean;
    /** Maximum number of results to print. */
    limit: number;
    /** Results scoring below this value are dropped before printing. */
    minScore: number;
    /** Set when every match was requested rather than a limited list. */
    all?: boolean;
    /** Filter by collection name (pwd suffix match). */
    collection?: string;
  };
  /**
   * Extract a snippet with surrounding context lines for CLI display.
   *
   * The line containing the most query terms (case-insensitive substring
   * match; the first such line wins ties) becomes the centre of the snippet,
   * with `contextLines` lines on each side. When `chunkPos` is a positive
   * offset, the search starts 200 characters before it and the reported line
   * number is adjusted back to the full document.
   *
   * @param body - full document text
   * @param query - user query, split on whitespace into terms
   * @param contextLines - lines of context on each side of the best line
   * @param chunkPos - optional character offset of the matching chunk
   * @returns 1-based line number, the trimmed snippet, and whether any term
   *          matched; without a match the first `contextLines * 2` lines of
   *          the searched area are returned instead
   */
  function extractSnippetWithContext(body: string, query: string, contextLines = 3, chunkPos?: number): { line: number; snippet: string; hasMatch: boolean } {
    // Start of the searched window: 200 chars before the chunk, never negative.
    const windowStart = chunkPos && chunkPos > 0 ? Math.max(0, chunkPos - 200) : 0;
    const searchBody = windowStart > 0 ? body.slice(windowStart) : body;
    // Number of newlines skipped by the window, used to restore real line numbers.
    const lineOffset = windowStart > 0 ? body.slice(0, windowStart).split('\n').length - 1 : 0;

    const lines = searchBody.split('\n');
    const needles = query.toLowerCase().split(/\s+/).filter(t => t.length > 0);

    const hitCounts = lines.map(text => {
      const lowered = text.toLowerCase();
      return needles.reduce((hits, needle) => (lowered.includes(needle) ? hits + 1 : hits), 0);
    });

    let bestLine = 0;
    let bestScore = -1;
    hitCounts.forEach((hits, idx) => {
      if (hits > bestScore) {
        bestScore = hits;
        bestLine = idx;
      }
    });

    if (bestScore > 0) {
      const first = Math.max(0, bestLine - contextLines);
      const last = Math.min(lines.length, bestLine + contextLines + 1);
      return {
        line: lineOffset + bestLine + 1,
        snippet: lines.slice(first, last).join('\n').trim(),
        hasMatch: true,
      };
    }

    // No query match found - return beginning of chunk area or file
    return {
      line: lineOffset + 1,
      snippet: lines.slice(0, contextLines * 2).join('\n').trim(),
      hasMatch: false,
    };
  }
  /**
   * Highlight query terms in text (terms shorter than 3 chars are skipped).
   *
   * All terms are combined into one case-insensitive alternation and applied
   * in a single pass, longest term first. Replacing term by term would let a
   * later term match inside the colour codes inserted for an earlier one, and
   * would split a longer term ("testing") once a shorter one ("test") had
   * already been wrapped.
   *
   * @param text - text to decorate
   * @param query - user query, split on whitespace into terms
   * @returns the text with matches wrapped in colour codes, or unchanged when
   *          colour is disabled or no term is long enough
   */
  function highlightTerms(text: string, query: string): string {
    if (!useColor) return text;

    const terms = Array.from(
      new Set(query.toLowerCase().split(/\s+/).filter(t => t.length >= 3))
    ).sort((a, b) => b.length - a.length);
    if (terms.length === 0) return text;

    // Escape regex metacharacters so terms are matched literally.
    const alternation = terms
      .map(term => term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
      .join('|');
    const regex = new RegExp(`(${alternation})`, 'gi');

    return text.replace(regex, `${c.yellow}${c.bold}$1${c.reset}`);
  }
  /**
   * Format a score as a percentage, coloured by value.
   *
   * The percentage is rounded to a whole number and left-padded to three
   * characters. With colour enabled, scores >= 0.7 are green, >= 0.4 yellow,
   * and anything lower is dimmed.
   *
   * @param score - score to display, where 1.0 is shown as 100%
   * @returns the formatted (and possibly coloured) percentage string
   */
  function formatScore(score: number): string {
    const label = `${(score * 100).toFixed(0).padStart(3)}%`;
    if (!useColor) return label;

    const tint = score >= 0.7 ? c.green : score >= 0.4 ? c.yellow : c.dim;
    return `${tint}${label}${c.reset}`;
  }
  /**
   * Shorten a directory path for display by replacing the home directory with
   * `~` (used for context paths, not documents).
   *
   * The prefix is only replaced on a path-component boundary, so a sibling
   * directory that merely shares the prefix (home `/home/bob`, path
   * `/home/bobby/x`) is returned unchanged instead of becoming `~by/x`.
   *
   * @param dirpath - directory path to shorten
   * @returns the path with a leading home directory replaced by `~`, or the
   *          original path when it is not inside the home directory
   */
  function shortPath(dirpath: string): string {
    // Drop trailing separators so the boundary check below is uniform.
    const home = homedir().replace(/[\\/]+$/, '');
    if (home.length === 0 || !dirpath.startsWith(home)) return dirpath;

    const rest = dirpath.slice(home.length);
    const onBoundary = rest === '' || rest.startsWith('/') || rest.startsWith('\\');
    return onBoundary ? '~' + rest : dirpath;
  }
  /**
   * Print search results in the requested output format.
   *
   * Results below `opts.minScore` are dropped and the remainder is capped at
   * `opts.limit`. Supported formats are "json", "files", "cli", "md" and
   * "xml"; any other value is printed as CSV. If nothing survives the filter a
   * single notice line is printed instead.
   *
   * In XML output the `name` and `title` attribute values are escaped
   * (`&`, `<`, `>`, `"`) so paths or titles containing those characters do not
   * produce malformed markup. Element content (body/snippet) is emitted as-is.
   *
   * @param results - scored results, already sorted by the caller
   * @param query - original query, used for snippet extraction and highlighting
   * @param opts - output options (format, full, limit, minScore)
   */
  function outputResults(results: { file: string; displayPath: string; title: string; body: string; score: number; context?: string | null; chunkPos?: number }[], query: string, opts: OutputOptions): void {
    const filtered = results.filter(r => r.score >= opts.minScore).slice(0, opts.limit);
    if (filtered.length === 0) {
      console.log("No results found above minimum score threshold.");
      return;
    }

    // Escape text for use inside a double-quoted XML attribute value.
    // `&` must be replaced first so the entities added afterwards stay intact.
    const escapeXmlAttr = (value: string): string =>
      value
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');

    if (opts.format === "json") {
      // JSON output for LLM consumption
      const output = filtered.map(row => ({
        score: Math.round(row.score * 100) / 100,
        file: row.displayPath,
        title: row.title,
        ...(row.context && { context: row.context }),
        ...(opts.full && { body: row.body }),
        ...(!opts.full && { snippet: extractSnippet(row.body, query, 300, row.chunkPos).snippet }),
      }));
      console.log(JSON.stringify(output, null, 2));
    } else if (opts.format === "files") {
      // Simple score,filepath,context output
      for (const row of filtered) {
        const ctx = row.context ? `,"${row.context.replace(/"/g, '""')}"` : "";
        console.log(`${row.score.toFixed(2)},${row.displayPath}${ctx}`);
      }
    } else if (opts.format === "cli") {
      for (let i = 0; i < filtered.length; i++) {
        const row = filtered[i];
        const { line, snippet, hasMatch } = extractSnippetWithContext(row.body, query, 2, row.chunkPos);

        // Line 1: filepath (with the matching line number when there is one)
        const path = row.displayPath;
        const lineInfo = hasMatch ? `:${line}` : "";
        console.log(`${c.cyan}${path}${c.dim}${lineInfo}${c.reset}`);

        // Line 2: Title (if available)
        if (row.title) {
          console.log(`${c.bold}Title: ${row.title}${c.reset}`);
        }

        // Line 3: Context (if available)
        if (row.context) {
          console.log(`${c.dim}Context: ${row.context}${c.reset}`);
        }

        // Line 4: Score
        const score = formatScore(row.score);
        console.log(`Score: ${c.bold}${score}${c.reset}`);
        console.log();

        // Snippet with highlighting (no leading | chars for better word wrap)
        const highlighted = highlightTerms(snippet, query);
        console.log(highlighted);

        // Double empty line between results
        if (i < filtered.length - 1) console.log('\n');
      }
    } else if (opts.format === "md") {
      for (const row of filtered) {
        const heading = row.title || row.displayPath;
        if (opts.full) {
          console.log(`---\n# ${heading}\n\n${row.body}\n`);
        } else {
          const { snippet } = extractSnippet(row.body, query, 500, row.chunkPos);
          console.log(`---\n# ${heading}\n\n${snippet}\n`);
        }
      }
    } else if (opts.format === "xml") {
      for (const row of filtered) {
        const nameAttr = escapeXmlAttr(row.displayPath);
        const titleAttr = row.title ? ` title="${escapeXmlAttr(row.title)}"` : "";
        if (opts.full) {
          console.log(`<file name="${nameAttr}"${titleAttr}>\n${row.body}\n</file>\n`);
        } else {
          const { snippet } = extractSnippet(row.body, query, 500, row.chunkPos);
          console.log(`<file name="${nameAttr}"${titleAttr}>\n${snippet}\n</file>\n`);
        }
      }
    } else {
      // CSV format
      console.log("score,file,title,line,snippet");
      for (const row of filtered) {
        const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos);
        const content = opts.full ? row.body : snippet;
        console.log(`${row.score.toFixed(4)},${escapeCSV(row.displayPath)},${escapeCSV(row.title)},${line},${escapeCSV(content)}`);
      }
    }
  }
  /**
   * Run a full-text search and print the results.
   *
   * Resolves the optional collection filter (exiting with status 1 when the
   * collection is unknown), queries via searchFTS, attaches each file's
   * context, closes the database and hands the rows to outputResults.
   *
   * @param query - user query
   * @param opts - output options; `collection`, `all` and `limit` also shape the lookup
   */
  function search(query: string, opts: OutputOptions): void {
    const db = getDb();

    // Resolve collection filter if specified
    const collectionId: number | undefined = opts.collection
      ? (getCollectionIdByName(db, opts.collection) ?? undefined)
      : undefined;
    if (opts.collection && collectionId === undefined) {
      console.error(`Collection not found: ${opts.collection}`);
      closeDb();
      process.exit(1);
    }

    // --all uses a very large limit; otherwise over-fetch and let outputResults trim
    const fetchLimit = opts.all ? 100000 : Math.max(50, opts.limit * 2);
    const hits = searchFTS(db, query, fetchLimit, collectionId);

    // Attach context while the database is still open
    const hitsWithContext = hits.map(hit => ({
      ...hit,
      context: getContextForFile(db, hit.file),
    }));
    closeDb();

    if (hitsWithContext.length > 0) {
      outputResults(hitsWithContext, query, opts);
      return;
    }
    console.log("No results found.");
  }
  /**
   * Run a vector similarity search over expanded query variations and print
   * the results.
   *
   * Each variation returned by expandQuery is searched with searchVec; a file
   * keeps the highest score seen across variations. The merged list is sorted,
   * cut to `opts.limit`, given context and passed to outputResults.
   *
   * Exits with status 1 when the requested collection is unknown, and returns
   * after an error message when the `vectors_vec` table does not exist.
   *
   * @param query - user query
   * @param opts - output options
   * @param model - embedding model name
   */
  async function vectorSearch(query: string, opts: OutputOptions, model: string = DEFAULT_EMBED_MODEL): Promise<void> {
    const db = getDb();

    // Resolve collection filter if specified
    const collectionId: number | undefined = opts.collection
      ? (getCollectionIdByName(db, opts.collection) ?? undefined)
      : undefined;
    if (opts.collection && collectionId === undefined) {
      console.error(`Collection not found: ${opts.collection}`);
      closeDb();
      process.exit(1);
    }

    const vectorTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (!vectorTable) {
      console.error("Vector index not found. Run 'qmd embed' first to create embeddings.");
      closeDb();
      return;
    }

    // Check index health and warn about issues
    checkIndexHealth(db);

    // Expand query to multiple variations (with caching)
    const variations = await expandQuery(query, DEFAULT_QUERY_MODEL, db);
    process.stderr.write(`Searching with ${variations.length} query variations...\n`);

    // For --all, fetch more results per query
    const perQueryLimit = opts.all ? 500 : 20;

    // Best-scoring hit per file across all variations
    type Hit = { file: string; displayPath: string; title: string; body: string; score: number };
    const bestByFile = new Map<string, Hit>();
    for (const variation of variations) {
      const hits = await searchVec(db, variation, model, perQueryLimit, collectionId);
      for (const hit of hits) {
        const prior = bestByFile.get(hit.file);
        if (prior && !(hit.score > prior.score)) continue;
        bestByFile.set(hit.file, {
          file: hit.file,
          displayPath: hit.displayPath,
          title: hit.title,
          body: hit.body,
          score: hit.score,
        });
      }
    }

    // Sort by max score and limit to requested count
    const ranked = Array.from(bestByFile.values())
      .sort((a, b) => b.score - a.score)
      .slice(0, opts.limit);
    const results = ranked.map(hit => ({ ...hit, context: getContextForFile(db, hit.file) }));
    closeDb();

    if (results.length === 0) {
      console.log("No results found.");
      return;
    }

    // Already limited above, so pass the actual count as the limit
    outputResults(results, query, { ...opts, limit: results.length });
  }
  /**
   * Ask the generation model for two alternative phrasings of a search query.
   *
   * The request is cached by URL and request body. On success the result is
   * the original query followed by up to two cleaned-up variations.
   *
   * Any failure to obtain usable text degrades to `[query]`: a non-OK response,
   * a network error from `fetch`, or a body that is not JSON with a string
   * `response` field. If the error text says the model is missing,
   * ensureModelAvailable is awaited and the request is retried.
   *
   * @param query - original user query
   * @param model - generation model name
   * @param db - optional database handle for the cache; when omitted, one is
   *             obtained from getDb() and closed again before returning
   * @returns the original query first, then up to two variations
   */
  async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db?: Database): Promise<string[]> {
    process.stderr.write("Generating query variations...\n");

    const prompt = `You are a search query expander. Given a search query, generate 2 alternative queries that would help find relevant documents.
Rules:
- Use synonyms and related terminology (e.g., "craft" → "craftsmanship", "quality", "excellence")
- Rephrase to capture different angles (e.g., "engineering culture" → "technical excellence", "developer practices")
- Keep proper nouns and named concepts exactly as written (e.g., "Build a Business", "Stripe", "Shopify")
- Each variation should be 3-8 words, natural search terms
- Do NOT just append words like "search" or "find" or "documents"
Query: "${query}"
Output exactly 2 variations, one per line, no numbering or bullets:`;

    const requestBody = {
      model,
      prompt,
      stream: false,
      think: false,
      options: { num_predict: 150 },
    };

    // Check cache
    const cacheDb = db || getDb();
    // Only close the handle when this call obtained it itself.
    const releaseCacheDb = (): void => {
      if (!db) cacheDb.close();
    };
    const cacheKey = getCacheKey(`${OLLAMA_URL}/api/generate`, requestBody);
    const cached = getCachedResult(cacheDb, cacheKey);

    let responseText: string;
    if (cached) {
      responseText = cached;
    } else {
      let response: Response;
      try {
        response = await fetch(`${OLLAMA_URL}/api/generate`, {
          method: "POST",
          headers: { "Content-Type": "application/json" },
          body: JSON.stringify(requestBody),
        });
      } catch {
        // Server unreachable: fall back exactly like a non-OK response does.
        releaseCacheDb();
        return [query];
      }

      if (!response.ok) {
        const errorText = await response.text();
        if (errorText.includes("not found") || errorText.includes("does not exist")) {
          await ensureModelAvailable(model);
          // Release before retrying; the retry obtains its own handle if needed.
          releaseCacheDb();
          return expandQuery(query, model, db);
        }
        releaseCacheDb();
        return [query];
      }

      let generated: unknown;
      try {
        generated = (await response.json() as { response?: unknown }).response;
      } catch {
        generated = undefined;
      }
      if (typeof generated !== "string") {
        // Malformed body: do not cache it, just search with the original query.
        releaseCacheDb();
        return [query];
      }

      responseText = generated;
      setCachedResult(cacheDb, cacheKey, responseText);
    }
    releaseCacheDb();

    // Strip list markers and quotes, drop markup/boilerplate lines, keep two.
    const lines = responseText.trim().split('\n')
      .map(l => l.replace(/^[\d\.\-\*\"\s]+/, '').replace(/["\s]+$/, '').trim())
      .filter(l => l.length > 2 && l.length < 100 && !l.startsWith('<') && !l.toLowerCase().includes('variation'))
      .slice(0, 2);

    const allQueries = [query, ...lines];
    process.stderr.write(`${c.dim}Queries: ${allQueries.join(' | ')}${c.reset}\n`);
    return allQueries;
  }
  /**
   * Combined search: query expansion, FTS + vector retrieval, Reciprocal Rank
   * Fusion, reranking, then position-aware score blending.
   *
   * Each ranked list is weighted by the query it came from: lists produced by
   * the original query (the first entry returned by expandQuery) get 2x, lists
   * from expansion queries get 1x. The weight is recorded when the list is
   * added, so an empty or skipped list for the original query cannot shift the
   * 2x boost onto an expansion query.
   *
   * Exits with status 1 when the requested collection is unknown.
   *
   * @param query - user query
   * @param opts - output options
   * @param embedModel - embedding model for vector search
   * @param rerankModel - model used to rerank fused candidates
   */
  async function querySearch(query: string, opts: OutputOptions, embedModel: string = DEFAULT_EMBED_MODEL, rerankModel: string = DEFAULT_RERANK_MODEL): Promise<void> {
    const db = getDb();

    // Resolve collection filter if specified
    let collectionId: number | undefined;
    if (opts.collection) {
      collectionId = getCollectionIdByName(db, opts.collection) ?? undefined;
      if (collectionId === undefined) {
        console.error(`Collection not found: ${opts.collection}`);
        closeDb();
        process.exit(1);
      }
    }

    // Check index health and warn about issues
    checkIndexHealth(db);

    // Expand query to multiple variations (with caching)
    const queries = await expandQuery(query, DEFAULT_QUERY_MODEL, db);
    process.stderr.write(`Searching with ${queries.length} query variations...\n`);

    // Collect ranked result lists for RRF fusion, with one weight per list
    const rankedLists: RankedResult[][] = [];
    const weights: number[] = [];
    const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();

    for (let queryIdx = 0; queryIdx < queries.length; queryIdx++) {
      const q = queries[queryIdx];
      // Give 2x weight to results of the original query (always first in `queries`)
      const listWeight = queryIdx === 0 ? 2.0 : 1.0;

      // FTS search - get ranked results
      const ftsResults = searchFTS(db, q, 20, collectionId);
      if (ftsResults.length > 0) {
        rankedLists.push(ftsResults.map(r => ({ file: r.file, displayPath: r.displayPath, title: r.title, body: r.body, score: r.score })));
        weights.push(listWeight);
      }

      // Vector search - get ranked results
      if (hasVectors) {
        const vecResults = await searchVec(db, q, embedModel, 20, collectionId);
        if (vecResults.length > 0) {
          rankedLists.push(vecResults.map(r => ({ file: r.file, displayPath: r.displayPath, title: r.title, body: r.body, score: r.score })));
          weights.push(listWeight);
        }
      }
    }

    // Apply Reciprocal Rank Fusion to combine all ranked lists
    const fused = reciprocalRankFusion(rankedLists, weights);
    const candidates = fused.slice(0, 30); // Over-retrieve for reranking

    if (candidates.length === 0) {
      console.log("No results found.");
      closeDb();
      return;
    }

    // Rerank with the original query (with caching)
    const reranked = await rerank(
      query,
      candidates.map(cand => ({ file: cand.file, text: cand.body })),
      rerankModel,
      db
    );

    // Blend RRF position score with reranker score using position-aware weights
    // Top retrieval results get more protection from reranker disagreement
    const candidateMap = new Map(candidates.map(cand => [cand.file, { displayPath: cand.displayPath, title: cand.title, body: cand.body }]));
    const rrfRankMap = new Map(candidates.map((cand, i) => [cand.file, i + 1])); // 1-indexed rank

    const finalResults = reranked.map(r => {
      // Files missing from the candidate list are treated as rank 30
      const rrfRank = rrfRankMap.get(r.file) || 30;

      // Position-aware blending: top retrieval results preserved more
      // Rank 1-3: 75% RRF, 25% reranker (trust retrieval for exact matches)
      // Rank 4-10: 60% RRF, 40% reranker
      // Rank 11+: 40% RRF, 60% reranker (trust reranker for lower-ranked)
      let rrfWeight: number;
      if (rrfRank <= 3) {
        rrfWeight = 0.75;
      } else if (rrfRank <= 10) {
        rrfWeight = 0.60;
      } else {
        rrfWeight = 0.40;
      }

      const rrfScore = 1 / rrfRank; // Position-based: 1, 0.5, 0.33...
      const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * r.score;
      const candidate = candidateMap.get(r.file);

      return {
        file: r.file,
        displayPath: candidate?.displayPath || "",
        title: candidate?.title || "",
        body: candidate?.body || "",
        score: blendedScore,
        context: getContextForFile(db, r.file),
      };
    }).sort((a, b) => b.score - a.score);

    closeDb();
    outputResults(finalResults, query, opts);
  }
  /**
   * Parse CLI arguments using util.parseArgs.
   *
   * Unknown options are tolerated (`strict: false`). A given `--index` value
   * is applied to the store immediately via setCustomIndexName. The first
   * positional is the command; the remaining positionals are returned both as
   * an array (`args`) and joined with spaces (`query`).
   *
   * @returns the command, its arguments, the joined query, the derived output
   *          options, and the raw parsed option values
   */
  function parseCLI() {
    const { values, positionals } = parseArgs({
      args: Bun.argv.slice(2), // Skip bun and script path
      options: {
        // Global options
        index: { type: "string" },
        help: { type: "boolean", short: "h" },
        // Search options
        n: { type: "string" },
        "min-score": { type: "string" },
        all: { type: "boolean" },
        full: { type: "boolean" },
        csv: { type: "boolean" },
        md: { type: "boolean" },
        xml: { type: "boolean" },
        files: { type: "boolean" },
        json: { type: "boolean" },
        collection: { type: "string", short: "c" }, // Filter by collection
        // Add options
        drop: { type: "boolean" },
        // Embed options
        force: { type: "boolean", short: "f" },
        // Get options
        l: { type: "string" }, // max lines
        from: { type: "string" }, // start line
        "max-bytes": { type: "string" }, // max bytes for multi-get
      },
      allowPositionals: true,
      strict: false, // Allow unknown options to pass through
    });

    // Set global index name in store
    if (values.index) {
      setCustomIndexName(values.index);
    }

    // Output format: the first flag set, in this priority order, else "cli"
    const formatFlags = ["csv", "md", "xml", "files", "json"] as const;
    const format: OutputFormat = formatFlags.find(flag => values[flag]) ?? "cli";

    // Default limit: 20 for --files/--json, 5 otherwise
    const defaultLimit = (format === "files" || format === "json") ? 20 : 5;
    const isAll = values.all || false;

    // --all means return all results (use very large limit);
    // a missing, zero or unparsable -n falls back to the default.
    let limit = defaultLimit;
    if (isAll) {
      limit = 100000;
    } else if (values.n) {
      limit = parseInt(values.n, 10) || defaultLimit;
    }

    let minScore = 0;
    if (values["min-score"]) {
      minScore = parseFloat(values["min-score"]) || 0;
    }

    const opts: OutputOptions = {
      format,
      full: values.full || false,
      limit,
      minScore,
      all: isAll,
      collection: values.collection as string | undefined,
    };

    const rest = positionals.slice(1);
    return {
      command: positionals[0] || "",
      args: rest,
      query: rest.join(" "),
      opts,
      values,
    };
  }
  /**
   * Print the CLI usage text: commands, options, environment variables, the
   * default model names and the index database path.
   */
  function showHelp(): void {
    // Static part of the help text, one entry per printed line.
    const staticLines = [
      "Usage:",
      " qmd add [--drop] [glob] - Add/update collection from $PWD (default: **/*.md)",
      " qmd context add [path] \"text\" - Add context for path (defaults to current dir)",
      " qmd context list - List all contexts",
      " qmd context rm <path> - Remove context",
      " qmd get <file>[:line] [-l N] [--from N] - Get document (optionally from line, max N lines)",
      " qmd multi-get <pattern> [-l N] [--max-bytes N] - Get multiple docs by glob or comma-separated list",
      " qmd status - Show index status and collections",
      " qmd update - Re-index all collections",
      " qmd embed [-f] - Create vector embeddings (chunks ~6KB each)",
      " qmd cleanup - Remove cache and orphaned data, vacuum DB",
      " qmd search <query> - Full-text search (BM25)",
      " qmd vsearch <query> - Vector similarity search",
      " qmd query <query> - Combined search with query expansion + reranking",
      " qmd mcp - Start MCP server (for AI agent integration)",
      "",
      "Global options:",
      " --index <name> - Use custom index name (default: index)",
      "",
      "Search options:",
      " -n <num> - Number of results (default: 5, or 20 for --files)",
      " --all - Return all matches (use with --min-score to filter)",
      " --min-score <num> - Minimum similarity score",
      " --full - Output full document instead of snippet",
      " --files - Output score,filepath,context (default: 20 results)",
      " --json - JSON output with snippets (default: 20 results)",
      " --csv - CSV output with snippets",
      " --md - Markdown output",
      " --xml - XML output",
      " -c, --collection <name> - Filter results to a specific collection",
      "",
      "Multi-get options:",
      " -l <num> - Maximum lines per file",
      " --max-bytes <num> - Skip files larger than N bytes (default: 10240)",
      " --json/--csv/--md/--xml/--files - Output format (same as search)",
      "",
      "Environment:",
      " OLLAMA_URL - Ollama server URL (default: http://localhost:11434)",
      "",
      "Models:",
    ];
    for (const helpLine of staticLines) {
      console.log(helpLine);
    }

    // Values resolved at call time are printed last, in the original order.
    console.log(` Embedding: ${DEFAULT_EMBED_MODEL}`);
    console.log(` Reranking: ${DEFAULT_RERANK_MODEL}`);
    console.log("");
    console.log(`Index: ${getDbPath()}`);
  }
  1841. // Main CLI - only run if this is the main module
  1842. if (import.meta.main) {
  1843. const cli = parseCLI();
  1844. if (!cli.command || cli.values.help) {
  1845. showHelp();
  1846. process.exit(cli.values.help ? 0 : 1);
  1847. }
  1848. switch (cli.command) {
  1849. case "add": {
  1850. const globArg = cli.args[0];
  1851. // Treat "." as "use default glob in current directory"
  1852. const globPattern = (!globArg || globArg === ".") ? DEFAULT_GLOB : globArg;
  1853. if (cli.values.drop) {
  1854. await dropCollection(globPattern);
  1855. }
  1856. await indexFiles(globPattern);
  1857. break;
  1858. }
  1859. case "context": {
  1860. const subcommand = cli.args[0];
  1861. if (!subcommand) {
  1862. console.error("Usage: qmd context <add|list|rm>");
  1863. console.error("");
  1864. console.error("Commands:");
  1865. console.error(" qmd context add [path] \"text\" - Add context (defaults to current dir)");
  1866. console.error(" qmd context add / \"text\" - Add global context to all collections");
  1867. console.error(" qmd context list - List all contexts");
  1868. console.error(" qmd context rm <path> - Remove context");
  1869. process.exit(1);
  1870. }
  1871. switch (subcommand) {
  1872. case "add": {
  1873. if (cli.args.length < 2) {
  1874. console.error("Usage: qmd context add [path] \"text\"");
  1875. console.error("Examples:");
  1876. console.error(" qmd context add \"Context for current directory\"");
  1877. console.error(" qmd context add . \"Context for current directory\"");
  1878. console.error(" qmd context add /subfolder \"Context for subfolder\"");
  1879. console.error(" qmd context add / \"Global context for all collections\"");
  1880. console.error(" qmd context add qmd://journals/2024 \"Context for 2024 journals\"");
  1881. process.exit(1);
  1882. }
  1883. let pathArg: string | undefined;
  1884. let contextText: string;
  1885. // Check if first arg looks like a path or if it's the context text
  1886. const firstArg = cli.args[1];
  1887. const secondArg = cli.args[2];
  1888. if (secondArg) {
  1889. // Two args: path + context
  1890. pathArg = firstArg;
  1891. contextText = cli.args.slice(2).join(" ");
  1892. } else {
  1893. // One arg: context only (use current directory)
  1894. pathArg = undefined;
  1895. contextText = firstArg;
  1896. }
  1897. await contextAdd(pathArg, contextText);
  1898. break;
  1899. }
  1900. case "list": {
  1901. contextList();
  1902. break;
  1903. }
  1904. case "rm":
  1905. case "remove": {
  1906. if (cli.args.length < 2) {
  1907. console.error("Usage: qmd context rm <path>");
  1908. console.error("Examples:");
  1909. console.error(" qmd context rm /");
  1910. console.error(" qmd context rm qmd://journals/2024");
  1911. process.exit(1);
  1912. }
  1913. contextRemove(cli.args[1]);
  1914. break;
  1915. }
  1916. default:
  1917. console.error(`Unknown subcommand: ${subcommand}`);
  1918. console.error("Available: add, list, rm");
  1919. process.exit(1);
  1920. }
  1921. break;
  1922. }
  1923. // Legacy alias for backwards compatibility
  1924. case "add-context": {
  1925. console.error(`${c.yellow}Note: 'qmd add-context' is deprecated. Use 'qmd context add' instead.${c.reset}`);
  1926. if (cli.args.length === 0) {
  1927. console.error("Usage: qmd context add [path] \"text\"");
  1928. process.exit(1);
  1929. }
  1930. let pathArg: string | undefined;
  1931. let contextText: string;
  1932. if (cli.args.length === 1) {
  1933. pathArg = undefined;
  1934. contextText = cli.args[0];
  1935. } else {
  1936. pathArg = cli.args[0];
  1937. contextText = cli.args.slice(1).join(" ");
  1938. }
  1939. await contextAdd(pathArg, contextText);
  1940. break;
  1941. }
  case "get": {
    // `qmd get <filepath>[:line] [--from <line>] [-l <lines>]`
    // The first positional argument (the file path) is mandatory.
    if (!cli.args[0]) {
      console.error("Usage: qmd get <filepath>[:line] [--from <line>] [-l <lines>]");
      process.exit(1);
    }
    // Both numeric options are optional; an absent option is passed on as undefined.
    const fromLine = cli.values.from ? parseInt(cli.values.from as string, 10) : undefined;
    const maxLines = cli.values.l ? parseInt(cli.values.l as string, 10) : undefined;
    // parseInt yields NaN for non-numeric input (e.g. `--from abc`). Reject it
    // here instead of handing NaN to getDocument.
    if (fromLine !== undefined && Number.isNaN(fromLine)) {
      console.error(`Invalid value for --from: '${cli.values.from}' (expected an integer)`);
      process.exit(1);
    }
    if (maxLines !== undefined && Number.isNaN(maxLines)) {
      console.error(`Invalid value for -l: '${cli.values.l}' (expected an integer)`);
      process.exit(1);
    }
    getDocument(cli.args[0], fromLine, maxLines);
    break;
  }
  case "multi-get": {
    // `qmd multi-get <pattern> [-l <lines>] [--max-bytes <bytes>] [format flag]`
    // The pattern (first positional argument) is mandatory.
    if (!cli.args[0]) {
      console.error("Usage: qmd multi-get <pattern> [-l <lines>] [--max-bytes <bytes>] [--json|--csv|--md|--xml|--files]");
      console.error(" pattern: glob (e.g., 'journals/2025-05*.md') or comma-separated list");
      process.exit(1);
    }
    // -l is optional (undefined when absent); --max-bytes falls back to
    // DEFAULT_MULTI_GET_MAX_BYTES when not supplied.
    const maxLinesMulti = cli.values.l ? parseInt(cli.values.l as string, 10) : undefined;
    const maxBytes = cli.values["max-bytes"] ? parseInt(cli.values["max-bytes"] as string, 10) : DEFAULT_MULTI_GET_MAX_BYTES;
    // parseInt yields NaN for non-numeric input. Reject it here instead of
    // handing NaN to multiGet.
    if (maxLinesMulti !== undefined && Number.isNaN(maxLinesMulti)) {
      console.error(`Invalid value for -l: '${cli.values.l}' (expected an integer)`);
      process.exit(1);
    }
    if (Number.isNaN(maxBytes)) {
      console.error(`Invalid value for --max-bytes: '${cli.values["max-bytes"]}' (expected an integer)`);
      process.exit(1);
    }
    multiGet(cli.args[0], maxLinesMulti, maxBytes, cli.opts.format);
    break;
  }
  case "status": {
    showStatus();
    break;
  }
  case "update": {
    await updateCollections();
    break;
  }
  case "embed": {
    // The --force value is forwarded as given; when the flag is absent
    // (or otherwise falsy) an explicit `false` is passed instead.
    const forceFlag = cli.values.force || false;
    await vectorIndex(DEFAULT_EMBED_MODEL, forceFlag);
    break;
  }
  case "search": {
    // A query is required; without one, print usage and exit with status 1.
    const query = cli.query;
    if (!query) {
      console.error("Usage: qmd search [options] <query>");
      process.exit(1);
    }
    // Not awaited, matching the existing call style for this command.
    search(query, cli.opts);
    break;
  }
  case "vsearch": {
    // A query is required; without one, print usage and exit with status 1.
    const query = cli.query;
    if (!query) {
      console.error("Usage: qmd vsearch [options] <query>");
      process.exit(1);
    }
    // When --min-score was not supplied, vector search uses 0.3 as its
    // minimum score. This mutates cli.opts in place.
    const minScoreGiven = Boolean(cli.values["min-score"]);
    if (!minScoreGiven) {
      cli.opts.minScore = 0.3;
    }
    await vectorSearch(query, cli.opts);
    break;
  }
  case "query": {
    // A query is required; without one, print usage and exit with status 1.
    const query = cli.query;
    if (!query) {
      console.error("Usage: qmd query [options] <query>");
      process.exit(1);
    }
    await querySearch(query, cli.opts);
    break;
  }
  case "mcp": {
    // ./mcp.js is imported dynamically, so the module is only loaded when
    // this command is actually invoked.
    const { startMcpServer: launchMcpServer } = await import("./mcp.js");
    await launchMcpServer();
    break;
  }
  case "cleanup": {
    // Database maintenance, in four steps:
    //   1. empty the ollama_cache table,
    //   2. delete embedding rows whose hash has no active document,
    //   3. delete document rows flagged inactive,
    //   4. VACUUM.
    // closeDb() sits in `finally` so it also runs when a statement throws.
    const db = getDb();
    try {
      // 1. Clear ollama_cache (count first so the number removed can be reported)
      const cacheCount = db.prepare(`SELECT COUNT(*) as c FROM ollama_cache`).get() as { c: number };
      db.exec(`DELETE FROM ollama_cache`);
      console.log(`${c.green}✓${c.reset} Cleared ${cacheCount.c} cached API responses`);

      // 2. Remove orphaned vectors (no active document with that hash)
      const orphanedVecs = db.prepare(`
        SELECT COUNT(*) as c FROM content_vectors cv
        WHERE NOT EXISTS (
          SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
        )
      `).get() as { c: number };
      if (orphanedVecs.c > 0) {
        // vectors_vec rows are keyed by "<hash>_<seq>". They are removed first,
        // while the matching content_vectors rows still exist to derive the keys.
        db.exec(`
          DELETE FROM vectors_vec WHERE hash_seq IN (
            SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
            WHERE NOT EXISTS (
              SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
            )
          )
        `);
        // Same NOT EXISTS predicate as the count and the delete above.
        // `hash NOT IN (SELECT hash ...)` matches no rows at all if the subquery
        // returns a NULL, which would strand content_vectors rows whose
        // vectors_vec entries were just removed.
        db.exec(`
          DELETE FROM content_vectors
          WHERE NOT EXISTS (
            SELECT 1 FROM documents d
            WHERE d.hash = content_vectors.hash AND d.active = 1
          )
        `);
        console.log(`${c.green}✓${c.reset} Removed ${orphanedVecs.c} orphaned embedding chunks`);
      } else {
        console.log(`${c.dim}No orphaned embeddings to remove${c.reset}`);
      }

      // 3. Delete inactive document records (silent when there are none)
      const inactiveDocs = db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 0`).get() as { c: number };
      if (inactiveDocs.c > 0) {
        db.exec(`DELETE FROM documents WHERE active = 0`);
        console.log(`${c.green}✓${c.reset} Removed ${inactiveDocs.c} inactive document records`);
      }

      // 4. Vacuum to reclaim space
      db.exec(`VACUUM`);
      console.log(`${c.green}✓${c.reset} Database vacuumed`);
    } finally {
      closeDb();
    }
    break;
  }
  default: {
    // No case above matched cli.command: report it, point at --help, exit 1.
    console.error(`Unknown command: ${cli.command}`);
    console.error("Run 'qmd --help' for usage.");
    process.exit(1);
  }
  2049. }
  2050. } // end if (import.meta.main)