qmd.ts 74 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087
  1. #!/usr/bin/env bun
  2. import { Database } from "bun:sqlite";
  3. import { Glob, $ } from "bun";
  4. import { parseArgs } from "util";
  5. import * as sqliteVec from "sqlite-vec";
  // Home directory captured from the environment once at module load.
  // "/tmp" is used when HOME is unset or empty.
  const HOME = Bun.env.HOME ? Bun.env.HOME : "/tmp";

  // Return the home directory captured in HOME.
  function homedir(): string {
    const home = HOME;
    return home;
  }
  // Resolve path segments into a normalized absolute POSIX-style path.
  //
  // - A segment starting with "/" discards everything accumulated before it.
  // - When there are no segments, or the first one is relative, resolution
  //   starts from Bun.env.PWD (falling back to process.cwd()).
  // - Empty segments and "." are dropped; ".." removes the previous segment
  //   and is ignored at the root.
  //
  // Returns a path that always starts with "/" and has no trailing slash
  // (except for the root itself).
  function resolve(...paths: string[]): string {
    // Guard the empty call: indexing paths[0] would throw on `resolve()`.
    const startsAbsolute = paths.length > 0 && paths[0].startsWith('/');
    let result = startsAbsolute ? '' : (Bun.env.PWD || process.cwd());
    for (const p of paths) {
      result = p.startsWith('/') ? p : `${result}/${p}`;
    }
    // Normalize: collapse "//" and resolve "." and ".."
    const normalized: string[] = [];
    for (const part of result.split('/')) {
      if (part === '' || part === '.') continue;
      if (part === '..') normalized.pop();
      else normalized.push(part);
    }
    return '/' + normalized.join('/');
  }
  // On macOS, switch bun:sqlite to Homebrew's SQLite build, which supports
  // loading extensions. Nothing happens when the library file is missing
  // (Bun.file(...).size is not greater than 0).
  if (process.platform === "darwin") {
    const brewSqliteLib = "/opt/homebrew/opt/sqlite/lib/libsqlite3.dylib";
    const libraryPresent = Bun.file(brewSqliteLib).size > 0;
    if (libraryPresent) Database.setCustomSQLite(brewSqliteLib);
  }
  // Default Ollama model names for embedding, reranking and query generation.
  const DEFAULT_EMBED_MODEL = "embeddinggemma";
  const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
  const DEFAULT_QUERY_MODEL = "qwen3:0.6b";

  // Glob used when no pattern is given.
  const DEFAULT_GLOB = "**/*.md";

  // Base URL of the Ollama server; overridable through the OLLAMA_URL env var.
  const OLLAMA_URL = process.env.OLLAMA_URL ? process.env.OLLAMA_URL : "http://localhost:11434";

  // Chunking: ~2000 tokens per chunk at ~3 bytes/token = 6 KB (6 * 1024 bytes).
  const CHUNK_TOKEN_LENGTH = 2000;
  const CHUNK_BYTE_SIZE = 6144;
  // Terminal colors (respects NO_COLOR env). Color is enabled only when
  // NO_COLOR is unset/empty and stdout is a TTY; otherwise every entry of `c`
  // is the empty string, so callers can interpolate it unconditionally.
  const useColor = !process.env.NO_COLOR && process.stdout.isTTY;
  const c = (() => {
    // SGR escape sequence for `code`, or "" when color output is disabled.
    const sgr = (code = 0) => (useColor ? `\x1b[${code}m` : "");
    return {
      reset: sgr(0),
      dim: sgr(2),
      bold: sgr(1),
      cyan: sgr(36),
      yellow: sgr(33),
      green: sgr(32),
      magenta: sgr(35),
      blue: sgr(34),
    };
  })();
  // Global state for the --index option. While null, getDbPath() uses the
  // default database name "index".
  let customIndexName: string | null = null;

  // Terminal cursor control: writes the hide/show escape sequences to stderr.
  const cursor = {
    hide() {
      process.stderr.write('\x1b[?25l');
    },
    show() {
      process.stderr.write('\x1b[?25h');
    },
  };

  // Ensure the cursor is restored when the process is interrupted or
  // terminated, then exit with the paired exit code.
  for (const [signal, exitCode] of [["SIGINT", 130], ["SIGTERM", 143]] as const) {
    process.on(signal, () => {
      cursor.show();
      process.exit(exitCode);
    });
  }
  // Terminal progress reporting using the OSC 9;4 escape sequence, written to
  // stderr. Each method emits "ESC ] 9;4;<state>[;<value>] BEL".
  const progress = {
    // Report a percentage; the value is rounded to the nearest integer.
    set(percent: number) {
      const rounded = Math.round(percent);
      process.stderr.write("\x1b]9;4;1;" + rounded + "\x07");
    },
    // Emit state 0 (remove the progress indicator).
    clear() {
      process.stderr.write("\x1b]9;4;0\x07");
    },
    // Emit state 3 (progress without a known percentage).
    indeterminate() {
      process.stderr.write("\x1b]9;4;3\x07");
    },
    // Emit state 2 (error).
    error() {
      process.stderr.write("\x1b]9;4;2\x07");
    },
  };
  // Format a duration in seconds as a short human-readable ETA:
  //   under a minute -> "42s"
  //   under an hour  -> "3m 7s"
  //   otherwise      -> "2h 15m"
  //
  // The value is rounded to whole seconds BEFORE being split into units, so
  // a fractional input can never render as "60s" or "1m 60s"; it carries
  // into the next unit instead (59.6 -> "1m 0s").
  function formatETA(seconds: number): string {
    const total = Math.round(seconds);
    if (total < 60) return `${total}s`;
    if (total < 3600) return `${Math.floor(total / 60)}m ${total % 60}s`;
    return `${Math.floor(total / 3600)}h ${Math.floor((total % 3600) / 60)}m`;
  }
  // Compute the path of the SQLite index file:
  //   <XDG_CACHE_HOME or ~/.cache>/qmd/<customIndexName or "index">.sqlite
  // The qmd cache directory is created on a best-effort basis first.
  function getDbPath(): string {
    const baseCacheDir = Bun.env.XDG_CACHE_HOME || resolve(homedir(), ".cache");
    const qmdDir = resolve(baseCacheDir, "qmd");
    try {
      // Ensure cache directory exists
      Bun.spawnSync(["mkdir", "-p", qmdDir]);
    } catch {
      // Best effort: a failure to spawn mkdir is ignored here.
    }
    const fileName = `${customIndexName || "index"}.sqlite`;
    return resolve(qmdDir, fileName);
  }
  // Return the working directory: the PWD environment variable when it is
  // set and non-empty, otherwise process.cwd().
  function getPwd(): string {
    const fromEnv = process.env.PWD;
    return fromEnv ? fromEnv : process.cwd();
  }
  // Get the canonical path by running the external `realpath` command.
  // Falls back to resolve(path) when the command does not succeed (e.g. the
  // file doesn't exist) or cannot be spawned.
  function getRealPath(path: string): string {
    let canonical: string | null = null;
    try {
      const proc = Bun.spawnSync(["realpath", path]);
      if (proc.success) {
        canonical = proc.stdout.toString().trim();
      }
    } catch {
      // Spawn failure: use the fallback below.
    }
    return canonical !== null ? canonical : resolve(path);
  }
  /*
  Schema (as created by getDb and ensureVecTable):

  CREATE TABLE collections (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    pwd TEXT NOT NULL,
    glob_pattern TEXT NOT NULL,
    created_at TEXT NOT NULL,
    context TEXT,
    UNIQUE(pwd, glob_pattern)
  );
  CREATE TABLE path_contexts (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    path_prefix TEXT NOT NULL UNIQUE,
    context TEXT NOT NULL,
    created_at TEXT NOT NULL
  );
  CREATE TABLE ollama_cache (
    hash TEXT PRIMARY KEY,
    result TEXT NOT NULL,
    created_at TEXT NOT NULL
  );
  CREATE TABLE documents (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    collection_id INTEGER NOT NULL,
    name TEXT NOT NULL,
    title TEXT NOT NULL,
    hash TEXT NOT NULL,
    filepath TEXT NOT NULL,
    display_path TEXT NOT NULL DEFAULT '',
    body TEXT NOT NULL,
    created_at TEXT NOT NULL,
    modified_at TEXT NOT NULL,
    active INTEGER NOT NULL DEFAULT 1,
    FOREIGN KEY (collection_id) REFERENCES collections(id)
  );
  CREATE TABLE content_vectors (
    hash TEXT NOT NULL,
    seq INTEGER NOT NULL DEFAULT 0,  -- chunk sequence (0, 1, 2...)
    pos INTEGER NOT NULL DEFAULT 0,  -- character position in document
    model TEXT NOT NULL,
    embedded_at TEXT NOT NULL,
    PRIMARY KEY (hash, seq)
  );
  CREATE VIRTUAL TABLE vectors_vec USING vec0(
    hash_seq TEXT PRIMARY KEY,  -- "{hash}_{seq}"
    embedding float[N]
  );
  CREATE VIRTUAL TABLE documents_fts USING fts5(name, body, content='documents', ...);
  */

  // Open the index database at getDbPath(), load the sqlite-vec extension,
  // switch to WAL journaling, and create or migrate every table, trigger and
  // index listed above except vectors_vec (see ensureVecTable).
  //
  // Migrations applied to pre-existing databases:
  //   - collections.context is added when missing
  //   - documents.display_path is added when missing
  //   - content_vectors without a `seq` column is dropped together with
  //     vectors_vec and recreated empty
  //
  // Returns the open Database; the caller owns it. If any setup step throws,
  // the handle is closed before the error is rethrown.
  function getDb(): Database {
    const db = new Database(getDbPath());
    try {
      sqliteVec.load(db);
      db.exec("PRAGMA journal_mode = WAL");

      // Collections table
      db.exec(`
        CREATE TABLE IF NOT EXISTS collections (
          id INTEGER PRIMARY KEY AUTOINCREMENT,
          pwd TEXT NOT NULL,
          glob_pattern TEXT NOT NULL,
          created_at TEXT NOT NULL,
          context TEXT,
          UNIQUE(pwd, glob_pattern)
        )
      `);
      // Migration: add context column if missing. CREATE TABLE IF NOT EXISTS
      // leaves a pre-existing table untouched, so the column is added here.
      const collectionInfo = db.prepare(`PRAGMA table_info(collections)`).all() as { name: string }[];
      const hasContext = collectionInfo.some(col => col.name === 'context');
      if (!hasContext) {
        db.exec(`ALTER TABLE collections ADD COLUMN context TEXT`);
      }

      // Path-based context (more flexible than collection-level)
      db.exec(`
        CREATE TABLE IF NOT EXISTS path_contexts (
          id INTEGER PRIMARY KEY AUTOINCREMENT,
          path_prefix TEXT NOT NULL UNIQUE,
          context TEXT NOT NULL,
          created_at TEXT NOT NULL
        )
      `);
      db.exec(`CREATE INDEX IF NOT EXISTS idx_path_contexts_prefix ON path_contexts(path_prefix)`);

      // Cache table for Ollama API calls (not embeddings)
      db.exec(`
        CREATE TABLE IF NOT EXISTS ollama_cache (
          hash TEXT PRIMARY KEY,
          result TEXT NOT NULL,
          created_at TEXT NOT NULL
        )
      `);

      // Documents table with collection_id and full filepath
      db.exec(`
        CREATE TABLE IF NOT EXISTS documents (
          id INTEGER PRIMARY KEY AUTOINCREMENT,
          collection_id INTEGER NOT NULL,
          name TEXT NOT NULL,
          title TEXT NOT NULL,
          hash TEXT NOT NULL,
          filepath TEXT NOT NULL,
          display_path TEXT NOT NULL DEFAULT '',
          body TEXT NOT NULL,
          created_at TEXT NOT NULL,
          modified_at TEXT NOT NULL,
          active INTEGER NOT NULL DEFAULT 1,
          FOREIGN KEY (collection_id) REFERENCES collections(id)
        )
      `);
      // Migration: add display_path column if missing
      const docInfo = db.prepare(`PRAGMA table_info(documents)`).all() as { name: string }[];
      const hasDisplayPath = docInfo.some(col => col.name === 'display_path');
      if (!hasDisplayPath) {
        db.exec(`ALTER TABLE documents ADD COLUMN display_path TEXT NOT NULL DEFAULT ''`);
      }
      // Unique index on display_path (only for non-empty values of active rows)
      db.exec(`CREATE UNIQUE INDEX IF NOT EXISTS idx_documents_display_path ON documents(display_path) WHERE display_path != '' AND active = 1`);

      // Content vectors keyed by (hash, seq) for chunked embeddings
      // Migration: check if old schema (no seq column) and recreate
      const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
      const hasSeqColumn = cvInfo.some(col => col.name === 'seq');
      if (cvInfo.length > 0 && !hasSeqColumn) {
        // Old schema without chunking - drop and recreate (embeddings need regenerating anyway)
        db.exec(`DROP TABLE IF EXISTS content_vectors`);
        db.exec(`DROP TABLE IF EXISTS vectors_vec`);
      }
      db.exec(`
        CREATE TABLE IF NOT EXISTS content_vectors (
          hash TEXT NOT NULL,
          seq INTEGER NOT NULL DEFAULT 0,
          pos INTEGER NOT NULL DEFAULT 0,
          model TEXT NOT NULL,
          embedded_at TEXT NOT NULL,
          PRIMARY KEY (hash, seq)
        )
      `);

      // FTS on documents (external-content table kept in sync by the triggers below)
      db.exec(`
        CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
          name, body,
          content='documents',
          content_rowid='id',
          tokenize='porter unicode61'
        )
      `);
      db.exec(`
        CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents BEGIN
          INSERT INTO documents_fts(rowid, name, body) VALUES (new.id, new.name, new.body);
        END
      `);
      db.exec(`
        CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
          INSERT INTO documents_fts(documents_fts, rowid, name, body) VALUES('delete', old.id, old.name, old.body);
        END
      `);
      db.exec(`
        CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents BEGIN
          INSERT INTO documents_fts(documents_fts, rowid, name, body) VALUES('delete', old.id, old.name, old.body);
          INSERT INTO documents_fts(rowid, name, body) VALUES (new.id, new.name, new.body);
        END
      `);

      db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents(collection_id, active)`);
      db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(hash)`);
      db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_filepath ON documents(filepath, active)`);
      // Ensure only one active document per filepath
      db.exec(`CREATE UNIQUE INDEX IF NOT EXISTS idx_documents_filepath_active ON documents(filepath) WHERE active = 1`);
      return db;
    } catch (err) {
      // Do not leak the open handle when schema setup or migration fails.
      db.close();
      throw err;
    }
  }
  // Make sure the vectors_vec virtual table exists with the given embedding
  // width and the "hash_seq" primary key.
  //
  // If a table already exists with a different float[N] width, or without
  // the hash_seq key (old 'hash' key), it is dropped and recreated empty.
  //
  // Throws an Error when `dimensions` is not a positive integer; the value
  // is interpolated into the CREATE statement, so it is validated first.
  //
  // NOTE(review): dropping vectors_vec here leaves content_vectors rows in
  // place — confirm that callers clear them so documents get re-embedded.
  function ensureVecTable(db: Database, dimensions: number): void {
    if (!Number.isInteger(dimensions) || dimensions <= 0) {
      throw new Error(`Invalid embedding dimensions: ${dimensions}`);
    }
    const tableInfo = db.prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get() as { sql: string } | null;
    if (tableInfo) {
      // Check for correct dimensions and hash_seq key (not old 'hash' key)
      const match = tableInfo.sql.match(/float\[(\d+)\]/);
      const hasHashSeq = tableInfo.sql.includes('hash_seq');
      if (match && Number.parseInt(match[1], 10) === dimensions && hasHashSeq) return;
      db.exec("DROP TABLE IF EXISTS vectors_vec");
    }
    // Use hash_seq as composite key: "{hash}_{seq}" (e.g., "abc123_0", "abc123_1")
    db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}])`);
  }
  // Count the distinct content hashes of active documents that have no
  // content_vectors row for their first chunk (seq = 0).
  function getHashesNeedingEmbedding(db: Database): number {
    const statement = db.prepare(`
      SELECT COUNT(DISTINCT d.hash) as count
      FROM documents d
      LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
      WHERE d.active = 1 AND v.hash IS NULL
    `);
    const { count } = statement.get() as { count: number };
    return count;
  }
  // Compute the SHA-256 digest of `content` as a lowercase hex string.
  // Declared async, so callers receive a Promise.
  async function hashContent(content: string): Promise<string> {
    return new Bun.CryptoHasher("sha256").update(content).digest("hex");
  }
  // Cache helpers for Ollama API calls (not embeddings)

  // Build the cache key for a request: the SHA-256 hex digest of the URL
  // followed by the JSON-serialized request body.
  function getCacheKey(url: string, body: object): string {
    const serializedBody = JSON.stringify(body);
    return new Bun.CryptoHasher("sha256").update(url).update(serializedBody).digest("hex");
  }
  // Look up a cached Ollama result by key. Returns null when there is no row
  // for the key or when the stored result is empty.
  function getCachedResult(db: Database, cacheKey: string): string | null {
    const lookup = db.prepare(`SELECT result FROM ollama_cache WHERE hash = ?`);
    const row = lookup.get(cacheKey) as { result: string } | null;
    if (!row || !row.result) return null;
    return row.result;
  }
  // Store (or overwrite) a cached Ollama result under `cacheKey`, stamped
  // with the current time. On roughly 1 in 100 calls the cache is also
  // truncated to its 1000 most recently created entries.
  function setCachedResult(db: Database, cacheKey: string, result: string): void {
    const timestamp = new Date().toISOString();
    const upsert = db.prepare(`INSERT OR REPLACE INTO ollama_cache (hash, result, created_at) VALUES (?, ?, ?)`);
    upsert.run(cacheKey, result, timestamp);

    // Probabilistic pruning keeps the table bounded without pruning on every write.
    const shouldPrune = Math.random() < 0.01;
    if (!shouldPrune) return;
    db.exec(`DELETE FROM ollama_cache WHERE hash NOT IN (SELECT hash FROM ollama_cache ORDER BY created_at DESC LIMIT 1000)`);
  }
  // Remove every row from the ollama_cache table.
  function clearCache(db: Database): void {
    const deleteAll = `DELETE FROM ollama_cache`;
    db.exec(deleteAll);
  }
  // Extract a title from the first markdown headline ("# ..." or "## ..."),
  // or use the filename as fallback.
  //
  // A generic "Notes" / "📝 Notes" headline is skipped in favour of the next
  // "## ..." headline that appears AFTER it; the search continues from the
  // end of the matched headline so that a leading "## Notes" cannot match
  // itself. If no later headline exists, the generic title is returned.
  //
  // Fallback: the last path component of `filename` with a trailing ".md"
  // removed (or `filename` itself if that is empty).
  function extractTitle(content: string, filename: string): string {
    const match = /^##?\s+(.+)$/m.exec(content);
    if (match) {
      const title = match[1].trim();
      if (title === "📝 Notes" || title === "Notes") {
        const remainder = content.slice(match.index + match[0].length);
        const nextMatch = /^##\s+(.+)$/m.exec(remainder);
        if (nextMatch) return nextMatch[1].trim();
      }
      return title;
    }
    return filename.replace(/\.md$/, "").split("/").pop() || filename;
  }
  // Format text for EmbeddingGemma

  // Wrap a search query in the "task: ... | query: ..." prompt form.
  function formatQueryForEmbedding(query: string): string {
    const prefix = "task: search result | query: ";
    return prefix + query;
  }
  // Wrap document text in the "title: ... | text: ..." prompt form.
  // A missing or empty title is rendered as "none".
  function formatDocForEmbedding(text: string, title?: string): string {
    const label = title ? title : "none";
    return "title: " + label + " | text: " + text;
  }
  // Chunk a document into pieces of at most `maxBytes` UTF-8 bytes (~6KB by
  // default), preferring to break at natural text boundaries.
  //
  // Returns one entry per chunk: `text` is the chunk and `pos` is its start
  // offset in `content` (a string index, i.e. UTF-16 code units).
  // Concatenating every `text` in order reproduces `content` exactly.
  //
  // Byte counting walks whole code points, so a surrogate pair (e.g. an
  // emoji) is counted as its real 4 UTF-8 bytes and is never split across
  // two chunks. A chunk always contains at least one code point, even if
  // that alone exceeds `maxBytes`.
  function chunkDocument(content: string, maxBytes: number = CHUNK_BYTE_SIZE): { text: string; pos: number }[] {
    const totalBytes = new TextEncoder().encode(content).length;
    // Single chunk if small enough
    if (totalBytes <= maxBytes) {
      return [{ text: content, pos: 0 }];
    }

    // UTF-8 width of one code point. A lone surrogate falls in the 3-byte
    // range, matching TextEncoder, which encodes it as U+FFFD.
    const utf8Width = (codePoint: number): number => {
      if (codePoint < 0x80) return 1;
      if (codePoint < 0x800) return 2;
      if (codePoint < 0x10000) return 3;
      return 4;
    };

    const chunks: { text: string; pos: number }[] = [];
    let charPos = 0;
    while (charPos < content.length) {
      // Find chunk boundary at ~maxBytes, advancing one code point at a time
      let endPos = charPos;
      let byteCount = 0;
      while (endPos < content.length && byteCount < maxBytes) {
        const codePoint = content.codePointAt(endPos) ?? 0;
        const charBytes = utf8Width(codePoint);
        if (byteCount + charBytes > maxBytes) break;
        byteCount += charBytes;
        endPos += codePoint > 0xffff ? 2 : 1;
      }

      // Back up to a natural boundary unless this is the final chunk
      if (endPos < content.length && endPos > charPos) {
        const slice = content.slice(charPos, endPos);
        // Prefer paragraph break, then sentence end, then newline, then space
        const paragraphBreak = slice.lastIndexOf('\n\n');
        const sentenceEnd = Math.max(
          slice.lastIndexOf('. '),
          slice.lastIndexOf('.\n'),
          slice.lastIndexOf('? '),
          slice.lastIndexOf('?\n'),
          slice.lastIndexOf('! '),
          slice.lastIndexOf('!\n')
        );
        const lineBreak = slice.lastIndexOf('\n');
        const spaceBreak = slice.lastIndexOf(' ');

        // Only accept a boundary that keeps the chunk reasonably full:
        // past 50% for paragraph/sentence, past 30% for newline/space.
        let breakPoint = -1;
        if (paragraphBreak > slice.length * 0.5) {
          breakPoint = paragraphBreak + 2; // Include the double newline
        } else if (sentenceEnd > slice.length * 0.5) {
          breakPoint = sentenceEnd + 2; // Include punctuation and following whitespace
        } else if (lineBreak > slice.length * 0.3) {
          breakPoint = lineBreak + 1;
        } else if (spaceBreak > slice.length * 0.3) {
          breakPoint = spaceBreak + 1;
        }
        if (breakPoint > 0) {
          endPos = charPos + breakPoint;
        }
      }

      // Ensure we make progress (at least one whole code point)
      if (endPos <= charPos) {
        const codePoint = content.codePointAt(charPos) ?? 0;
        endPos = charPos + (codePoint > 0xffff ? 2 : 1);
      }

      chunks.push({ text: content.slice(charPos, endPos), pos: charPos });
      charPos = endPos;
    }
    return chunks;
  }
  // Compute a short display path for a document that is not yet present in
  // `existingPaths`.
  //
  // The path is taken relative to the collection directory and prefixed with
  // the collection directory's own name; a file outside the collection uses
  // its full path instead. Candidates are the trailing path segments,
  // starting with parent folder + filename (2 segments when available) and
  // growing by one parent at a time. The first candidate not in
  // `existingPaths` wins; if all are taken, `filepath` is returned as is.
  function computeDisplayPath(
    filepath: string,
    collectionPath: string,
    existingPaths: Set<string>
  ): string {
    const collectionRoot = collectionPath.replace(/\/$/, '');
    const rootName = collectionRoot.split('/').pop() || '';
    const isInsideCollection = filepath.startsWith(collectionRoot + '/');
    const relativePath = isInsideCollection
      ? rootName + filepath.slice(collectionRoot.length)
      : filepath;

    const segments = relativePath.split('/').filter(Boolean);
    const minimumSegments = Math.min(2, segments.length);
    for (let keep = minimumSegments; keep <= segments.length; keep++) {
      const candidate = segments.slice(segments.length - keep).join('/');
      if (!existingPaths.has(candidate)) return candidate;
    }
    // Absolute fallback: use full path (should be unique)
    return filepath;
  }
  // Auto-pull a model if Ollama does not have it.
  //
  // First asks /api/show; an OK response means the model is present and
  // nothing else happens. Any other outcome (non-OK status or a failed
  // request) leads to a blocking, non-streaming /api/pull.
  //
  // Throws when the pull request cannot be sent or returns a non-OK status.
  // In both cases the terminal progress indicator is switched to its error
  // state first, so it is never left stuck as "indeterminate".
  async function ensureModelAvailable(model: string): Promise<void> {
    try {
      const response = await fetch(`${OLLAMA_URL}/api/show`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ name: model }),
      });
      if (response.ok) return;
    } catch {
      // The probe is best effort: continue to the pull attempt.
    }

    console.log(`Model ${model} not found. Pulling...`);
    progress.indeterminate();

    let pullResponse: Response;
    try {
      pullResponse = await fetch(`${OLLAMA_URL}/api/pull`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ name: model, stream: false }),
      });
    } catch (err) {
      // Request never completed (e.g. connection refused): reflect that in
      // the progress indicator before propagating the original error.
      progress.error();
      throw err;
    }
    if (!pullResponse.ok) {
      progress.error();
      throw new Error(`Failed to pull model ${model}: ${pullResponse.status} - ${await pullResponse.text()}`);
    }
    progress.clear();
    console.log(`Model ${model} pulled successfully.`);
  }
  // Request an embedding vector for `text` from Ollama's /api/embed.
  //
  // text    - query or document text
  // model   - Ollama model name
  // isQuery - true to apply the query prompt form, false for the document
  //           form (which also embeds `title`)
  // title   - optional document title, used only when isQuery is false
  // retried - internal flag: set on the single retry made after pulling a
  //           missing model, so the retry cannot recurse again
  //
  // Returns the first embedding in the response.
  // Throws on a non-OK response (after at most one pull-and-retry when the
  // error text says the model is missing) and when the response contains no
  // embedding, so callers never receive `undefined` in place of a vector.
  async function getEmbedding(text: string, model: string, isQuery: boolean = false, title?: string, retried: boolean = false): Promise<number[]> {
    const input = isQuery ? formatQueryForEmbedding(text) : formatDocForEmbedding(text, title);
    const response = await fetch(`${OLLAMA_URL}/api/embed`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ model, input }),
    });
    if (!response.ok) {
      const errorText = await response.text();
      if (!retried && (errorText.includes("not found") || errorText.includes("does not exist"))) {
        await ensureModelAvailable(model);
        return getEmbedding(text, model, isQuery, title, true);
      }
      throw new Error(`Ollama API error: ${response.status} - ${errorText}`);
    }
    const data = await response.json() as { embeddings?: number[][] };
    const embedding = data.embeddings?.[0];
    if (!Array.isArray(embedding) || embedding.length === 0) {
      throw new Error(`Ollama API returned no embedding for model ${model}`);
    }
    return embedding;
  }
  // Qwen3-Reranker prompt format (trained for yes/no relevance classification)
  const RERANK_SYSTEM = `Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".`;

  // Build the user part of the reranker prompt: four newline-separated
  // sections (<Instruct>, <Query>, <Document Title>, <Document>) in that order.
  function formatRerankPrompt(query: string, title: string, doc: string): string {
    const sections = [
      `<Instruct>: Determine if this document from a Shopify knowledge base is relevant to the search query. The query may reference specific Shopify programs, competitions, features, or named concepts (e.g., "Build a Business" competition, "Shop Pay", "Polaris"). Match documents that discuss the queried topic, even if phrasing differs.`,
      `<Query>: ${query}`,
      `<Document Title>: ${title}`,
      `<Document>: ${doc}`,
    ];
    return sections.join("\n");
  }
  // One generated token and its log-probability.
  type LogProb = { token: string; logprob: number };

  // The fields of a generate response that the reranker reads.
  type RerankResponse = {
    response: string;
    logprobs?: LogProb[];
  };

  // Turn a reranker response into a relevance score using the first
  // generated token (compared case-insensitively, whitespace trimmed):
  //   "yes" -> p, the token's probability exp(logprob)
  //   "no"  -> (1 - p) * 0.3
  // Throws when logprobs are missing/empty or the token is neither.
  function parseRerankResponse(data: RerankResponse): number {
    const logprobs = data.logprobs;
    if (!logprobs || logprobs.length === 0) {
      throw new Error("Reranker response missing logprobs");
    }
    const [first] = logprobs;
    const token = first.token.toLowerCase().trim();
    const confidence = Math.exp(first.logprob);
    switch (token) {
      case "yes":
        return confidence;
      case "no":
        return (1 - confidence) * 0.3;
      default:
        throw new Error(`Unexpected reranker token: "${token}"`);
    }
  }
  // Score one reranker prompt with Ollama's /api/generate and return the
  // relevance score computed by parseRerankResponse.
  //
  // prompt  - user section produced by formatRerankPrompt
  // model   - reranker model name
  // db      - optional database; when given, responses are cached in it
  //           under a key derived from the request URL and body
  // retried - internal flag: set on the single retry made after pulling a
  //           missing model
  //
  // Only responses that parse into a score are written to the cache, and a
  // cached entry that cannot be parsed is ignored and re-queried. This keeps
  // one malformed response (e.g. missing logprobs) from failing every later
  // call with the same prompt.
  //
  // Throws on a non-OK response (after at most one pull-and-retry) and when
  // the fresh response cannot be scored.
  async function rerankSingle(prompt: string, model: string, db?: Database, retried: boolean = false): Promise<number> {
    // Use generate with raw template for qwen3-reranker format
    // Include empty <think> tags as per HuggingFace reference implementation
    const fullPrompt = [
      "<|im_start|>system",
      `${RERANK_SYSTEM}<|im_end|>`,
      "<|im_start|>user",
      `${prompt}<|im_end|>`,
      "<|im_start|>assistant",
      "<think>",
      "</think>",
      "",
    ].join("\n");
    const requestBody = {
      model,
      prompt: fullPrompt,
      raw: true,
      stream: false,
      logprobs: true,
      options: { num_predict: 1 }, // only the first token is scored
    };

    // Check cache
    const cacheKey = db ? getCacheKey(`${OLLAMA_URL}/api/generate`, requestBody) : "";
    if (db) {
      const cached = getCachedResult(db, cacheKey);
      if (cached) {
        try {
          return parseRerankResponse(JSON.parse(cached) as RerankResponse);
        } catch {
          // Unusable cache entry: fall through and query the model again.
        }
      }
    }

    const response = await fetch(`${OLLAMA_URL}/api/generate`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(requestBody),
    });
    if (!response.ok) {
      const errorText = await response.text();
      if (!retried && (errorText.includes("not found") || errorText.includes("does not exist"))) {
        await ensureModelAvailable(model);
        return rerankSingle(prompt, model, db, true);
      }
      throw new Error(`Ollama API error: ${response.status} - ${errorText}`);
    }

    const data = await response.json() as RerankResponse;
    // Validate before caching so only scoreable responses are stored.
    const score = parseRerankResponse(data);
    if (db) {
      setCachedResult(db, cacheKey, JSON.stringify(data));
    }
    return score;
  }
  // Score every document against `query` with the reranker model and return
  // the results sorted by descending score.
  //
  // query     - search query
  // documents - candidates; `file` is used both as the result key and (its
  //             basename without ".md") as the title shown to the model;
  //             only the first 4000 characters of `text` are sent
  // model     - reranker model name
  // db        - optional database forwarded to rerankSingle for caching
  //
  // Documents are processed in batches of 5 concurrent requests, with
  // progress reported on stderr. A document whose scoring fails does not
  // abort the run: it gets score 0, and a single warning with the failure
  // count and the first error message is written to stderr at the end, so a
  // systematic failure (e.g. Ollama unreachable) is visible.
  async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db?: Database): Promise<{ file: string; score: number }[]> {
    const results: { file: string; score: number }[] = [];
    const total = documents.length;
    const PARALLEL = 5;
    let failures = 0;
    let firstError: unknown = null;

    process.stderr.write(`Reranking ${total} documents with ${model} (parallel: ${PARALLEL})...\n`);
    progress.indeterminate();

    // Process in parallel batches
    for (let i = 0; i < documents.length; i += PARALLEL) {
      const batch = documents.slice(i, i + PARALLEL);
      const batchResults = await Promise.all(
        batch.map(async (doc) => {
          try {
            // Extract title from filename for reranker context
            const title = doc.file.split('/').pop()?.replace(/\.md$/, '') || doc.file;
            const prompt = formatRerankPrompt(query, title, doc.text.slice(0, 4000));
            const score = await rerankSingle(prompt, model, db);
            return { file: doc.file, score };
          } catch (err) {
            // Best effort: keep the document with the lowest score, but
            // remember the failure for the summary below.
            if (failures === 0) firstError = err;
            failures++;
            return { file: doc.file, score: 0 };
          }
        })
      );
      results.push(...batchResults);
      const processed = Math.min(i + PARALLEL, total);
      progress.set((processed / total) * 100);
      process.stderr.write(`\rReranking: ${processed}/${total}`);
    }

    progress.clear();
    process.stderr.write("\n");
    if (failures > 0) {
      const reason = firstError instanceof Error ? firstError.message : String(firstError);
      process.stderr.write(`Warning: reranking failed for ${failures}/${total} documents (scored 0): ${reason}\n`);
    }
    return results.sort((a, b) => b.score - a.score);
  }
  // Return the id of the collection for (pwd, globPattern), inserting the
  // row first when it does not exist yet.
  function getOrCreateCollection(db: Database, pwd: string, globPattern: string): number {
    // INSERT OR IGNORE followed by SELECT copes with the row already being
    // there (including a concurrent insert).
    const insert = db.prepare(`INSERT OR IGNORE INTO collections (pwd, glob_pattern, created_at) VALUES (?, ?, ?)`);
    insert.run(pwd, globPattern, new Date().toISOString());

    const lookup = db.prepare(`SELECT id FROM collections WHERE pwd = ? AND glob_pattern = ?`);
    const row = lookup.get(pwd, globPattern) as { id: number };
    return row.id;
  }
  // Housekeeping for the collections table, in two steps:
  //   1. for each (pwd, glob_pattern) pair keep only the row with the
  //      smallest id (the oldest one) and delete the rest
  //   2. delete bogus entries whose glob pattern is "." (from earlier bug)
  function cleanupDuplicateCollections(db: Database): void {
    const removeDuplicates = `
      DELETE FROM collections WHERE id NOT IN (
        SELECT MIN(id) FROM collections GROUP BY pwd, glob_pattern
      )
    `;
    const removeDotGlobs = `DELETE FROM collections WHERE glob_pattern = '.'`;
    for (const statement of [removeDuplicates, removeDotGlobs]) {
      db.exec(statement);
    }
  }
  /**
   * Format the time elapsed since `date` as a short relative string such as
   * "42s ago", "5m ago", "3h ago" or "12d ago".
   *
   * Timestamps in the future (e.g. clock skew on file mtimes) are clamped to
   * "0s ago" instead of producing a negative age.
   *
   * @param date - The moment to measure from.
   * @returns The elapsed time in the largest unit that fits (seconds, minutes, hours, days).
   */
  function formatTimeAgo(date: Date): string {
    const elapsedMs = Date.now() - date.getTime();
    // Clamp so a future timestamp never renders as e.g. "-5s ago".
    const seconds = Math.max(0, Math.floor(elapsedMs / 1000));
    if (seconds < 60) return `${seconds}s ago`;

    const minutes = Math.floor(seconds / 60);
    if (minutes < 60) return `${minutes}m ago`;

    const hours = Math.floor(minutes / 60);
    if (hours < 24) return `${hours}h ago`;

    const days = Math.floor(hours / 24);
    return `${days}d ago`;
  }
  /**
   * Format a byte count using binary units (B, KB, MB, GB).
   *
   * Values below 1 KB are shown as whole bytes; fractional inputs (such as a
   * bytes-per-second rate) are floored so the output never contains a long
   * decimal tail. Larger values are shown with one decimal place.
   *
   * @param bytes - Number of bytes (may be fractional).
   * @returns Human-readable size, e.g. "512 B", "1.5 KB", "3.2 MB", "1.0 GB".
   */
  function formatBytes(bytes: number): string {
    const KB = 1024;
    const MB = KB * 1024;
    const GB = MB * 1024;

    if (bytes < KB) return `${Math.floor(bytes)} B`;
    if (bytes < MB) return `${(bytes / KB).toFixed(1)} KB`;
    if (bytes < GB) return `${(bytes / MB).toFixed(1)} MB`;
    return `${(bytes / GB).toFixed(1)} GB`;
  }
  /**
   * Print an overview of the index to stdout: database path and size,
   * document / vector counts, pending embeddings, the most recent update,
   * and every collection together with the path contexts related to it.
   *
   * Duplicate collections are cleaned up before anything is reported. The
   * database handle is closed when the function finishes, including when a
   * query throws.
   */
  function showStatus(): void {
    // True when `child` equals `parent` or lies beneath it on a "/" boundary,
    // so "/a/project2" is not considered to be inside "/a/proj".
    const isSameOrInside = (child: string, parent: string): boolean =>
      child === parent || child.startsWith(parent.endsWith("/") ? parent : `${parent}/`);

    const dbPath = getDbPath();
    const db = getDb();
    try {
      // Cleanup any duplicate collections
      cleanupDuplicateCollections(db);

      // Index size (best effort: an unreadable file is reported as 0 bytes)
      let indexSize = 0;
      try {
        indexSize = Bun.file(dbPath).size;
      } catch {
        // Leave indexSize at 0; the rest of the status is still useful.
      }

      // Collections info
      const collections = db.prepare(`
        SELECT c.id, c.pwd, c.glob_pattern, c.created_at,
        COUNT(d.id) as doc_count,
        SUM(CASE WHEN d.active = 1 THEN 1 ELSE 0 END) as active_count,
        MAX(d.modified_at) as last_modified
        FROM collections c
        LEFT JOIN documents d ON d.collection_id = c.id
        GROUP BY c.id
        ORDER BY c.created_at DESC
      `).all() as { id: number; pwd: string; glob_pattern: string; created_at: string; doc_count: number; active_count: number; last_modified: string | null }[];

      // Overall stats
      const totalDocs = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number };
      const vectorCount = db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get() as { count: number };
      const needsEmbedding = getHashesNeedingEmbedding(db);

      // Most recent update across all collections
      const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };

      console.log(`${c.bold}QMD Status${c.reset}\n`);
      console.log(`Index: ${dbPath}`);
      console.log(`Size: ${formatBytes(indexSize)}\n`);
      console.log(`${c.bold}Documents${c.reset}`);
      console.log(` Total: ${totalDocs.count} files indexed`);
      console.log(` Vectors: ${vectorCount.count} embedded`);
      if (needsEmbedding > 0) {
        console.log(` ${c.yellow}Pending: ${needsEmbedding} need embedding${c.reset} (run 'qmd embed')`);
      }
      if (mostRecent.latest) {
        const lastUpdate = new Date(mostRecent.latest);
        console.log(` Updated: ${formatTimeAgo(lastUpdate)}`);
      }

      // Get all path contexts
      const pathContexts = db.prepare(`SELECT path_prefix, context FROM path_contexts ORDER BY path_prefix`).all() as { path_prefix: string; context: string }[];

      if (collections.length === 0) {
        console.log(`\n${c.dim}No collections. Run 'qmd add .' to index markdown files.${c.reset}`);
        return;
      }

      console.log(`\n${c.bold}Collections${c.reset}`);
      for (const col of collections) {
        const lastMod = col.last_modified ? formatTimeAgo(new Date(col.last_modified)) : "never";
        console.log(` ${c.cyan}${col.pwd}${c.reset}`);
        console.log(` ${col.glob_pattern} → ${col.active_count} docs (updated ${lastMod})`);

        // Show contexts located inside this collection, or that enclose it.
        const matchingContexts = pathContexts.filter(ctx =>
          isSameOrInside(ctx.path_prefix, col.pwd) || isSameOrInside(col.pwd, ctx.path_prefix)
        );
        for (const ctx of matchingContexts) {
          const displayPath = shortPath(ctx.path_prefix);
          console.log(` ${c.dim}context: ${displayPath} → "${ctx.context}"${c.reset}`);
        }
      }
    } finally {
      db.close();
    }
  }
  /**
   * Fill in `display_path` for every active document that has none.
   *
   * Each new path is computed with `computeDisplayPath` against the set of
   * display paths already in use, and is added to that set immediately so
   * later documents in the same run see it.
   *
   * @param db - Open database handle.
   * @returns Number of documents whose display path was written.
   */
  function updateDisplayPaths(db: Database): number {
    // Active documents with a NULL or empty display_path, with their collection root.
    const missing = db.prepare(`
      SELECT d.id, d.filepath, c.pwd
      FROM documents d
      JOIN collections c ON d.collection_id = c.id
      WHERE d.active = 1 AND (d.display_path IS NULL OR d.display_path = '')
    `).all() as { id: number; filepath: string; pwd: string }[];
    if (missing.length === 0) return 0;

    // Display paths currently taken by active documents.
    const takenRows = db.prepare(`SELECT display_path FROM documents WHERE active = 1 AND display_path != ''`).all() as { display_path: string }[];
    const taken = new Set<string>();
    for (const row of takenRows) {
      taken.add(row.display_path);
    }

    const setDisplayPath = db.prepare(`UPDATE documents SET display_path = ? WHERE id = ?`);
    let written = 0;
    for (const { id, filepath, pwd } of missing) {
      const displayPath = computeDisplayPath(filepath, pwd, taken);
      setDisplayPath.run(displayPath, id);
      taken.add(displayPath);
      written += 1;
    }
    return written;
  }
  /**
   * Re-index every registered collection.
   *
   * Steps: remove duplicate collections, clear the cache, back-fill missing
   * display paths, then run `indexFiles` for each collection with
   * `process.env.PWD` temporarily pointing at that collection's directory.
   *
   * `PWD` is always restored afterwards, including when indexing throws and
   * when it was unset to begin with (assigning `undefined` to an env variable
   * would store the string "undefined"). The database handle opened here is
   * closed before indexing starts, including when a query throws.
   */
  async function updateAllCollections(): Promise<void> {
    let collections: { id: number; pwd: string; glob_pattern: string }[] = [];

    const db = getDb();
    try {
      cleanupDuplicateCollections(db);

      // Clear Ollama cache on update
      clearCache(db);

      collections = db.prepare(`SELECT id, pwd, glob_pattern FROM collections`).all() as { id: number; pwd: string; glob_pattern: string }[];
      if (collections.length === 0) {
        console.log(`${c.dim}No collections found. Run 'qmd add .' to index markdown files.${c.reset}`);
        return;
      }

      // Update display_paths for any documents missing them (migration)
      const pathsUpdated = updateDisplayPaths(db);
      if (pathsUpdated > 0) {
        console.log(`${c.green}✓${c.reset} Updated ${pathsUpdated} display paths`);
      }
    } finally {
      // indexFiles opens its own handle, so this one is released first.
      db.close();
    }

    console.log(`${c.bold}Updating ${collections.length} collection(s)...${c.reset}\n`);
    for (const [index, col] of collections.entries()) {
      console.log(`${c.cyan}[${index + 1}/${collections.length}]${c.reset} ${c.bold}${col.pwd}${c.reset}`);
      console.log(`${c.dim} Pattern: ${col.glob_pattern}${c.reset}`);

      // Temporarily set PWD for indexing
      const originalPwd = process.env.PWD;
      process.env.PWD = col.pwd;
      try {
        await indexFiles(col.glob_pattern);
      } finally {
        if (originalPwd === undefined) {
          delete process.env.PWD;
        } else {
          process.env.PWD = originalPwd;
        }
      }
      console.log("");
    }
    console.log(`${c.green}✓ All collections updated.${c.reset}`);
  }
  /**
   * Attach a context description to a path prefix, replacing any context
   * already stored for the same prefix.
   *
   * `pathArg` may be ".", "./", "~", "~/...", an absolute path, or a path
   * relative to the current working directory. It is resolved to a real path
   * and stored without a trailing slash.
   *
   * @param pathArg - Path the context applies to.
   * @param contextText - Context text to store.
   */
  async function addContext(pathArg: string, contextText: string): Promise<void> {
    const db = getDb();
    try {
      const now = new Date().toISOString();

      // Resolve path - could be relative, absolute, or use ~
      let pathPrefix = pathArg;
      if (pathPrefix === '.' || pathPrefix === './') {
        pathPrefix = getPwd();
      } else if (pathPrefix === '~' || pathPrefix.startsWith('~/')) {
        // A bare "~" is the home directory itself, not a file named "~".
        pathPrefix = homedir() + pathPrefix.slice(1);
      } else if (!pathPrefix.startsWith('/')) {
        pathPrefix = resolve(getPwd(), pathPrefix);
      }

      // Get realpath and normalize: remove trailing slash
      pathPrefix = getRealPath(pathPrefix).replace(/\/$/, '');

      // Insert or update
      db.prepare(`INSERT INTO path_contexts (path_prefix, context, created_at) VALUES (?, ?, ?)
        ON CONFLICT(path_prefix) DO UPDATE SET context = excluded.context`).run(pathPrefix, contextText, now);

      console.log(`${c.green}✓${c.reset} Added context for: ${shortPath(pathPrefix)}`);
      console.log(`${c.dim}Context: ${contextText}${c.reset}`);
    } finally {
      db.close();
    }
  }
  /**
   * Print the body of an indexed document to stdout, optionally restricted to
   * a range of lines, preceded by its folder context when one exists.
   *
   * The document is looked up by exact `filepath` first, then by path suffix.
   * A trailing ":<number>" on `filename` (e.g. "file.md:100") is always
   * removed before the lookup; it supplies the starting line only when
   * `fromLine` was not given explicitly.
   *
   * Exits the process with status 1 when no document matches.
   *
   * @param filename - Full path, "~/"-prefixed path, or path suffix, optionally ending in ":<line>".
   * @param fromLine - 1-based first line to print.
   * @param maxLines - Maximum number of lines to print.
   */
  function getDocument(filename: string, fromLine?: number, maxLines?: number): void {
    const db = getDb();

    // Parse :linenum suffix from filename (e.g., "file.md:100")
    let filepath = filename;
    let startLine = fromLine;
    const colonMatch = filepath.match(/:(\d+)$/);
    if (colonMatch) {
      if (!startLine) {
        startLine = parseInt(colonMatch[1], 10);
      }
      // Strip the suffix even when an explicit fromLine wins, otherwise the
      // lookup below would search for a path that still ends in ":<n>".
      filepath = filepath.slice(0, -colonMatch[0].length);
    }

    // Expand ~ to home directory
    if (filepath.startsWith('~/')) {
      filepath = homedir() + filepath.slice(1);
    }

    // Try exact match first
    let doc = db.prepare(`SELECT filepath, body FROM documents WHERE filepath = ? AND active = 1`).get(filepath) as { filepath: string; body: string } | null;

    // Try matching by filename ending (allows partial paths)
    if (!doc) {
      // Escape LIKE wildcards so "_" and "%" in the name match only themselves.
      const likePattern = `%${filepath.replace(/[\\%_]/g, "\\$&")}`;
      doc = db.prepare(`SELECT filepath, body FROM documents WHERE filepath LIKE ? ESCAPE '\\' AND active = 1 LIMIT 1`).get(likePattern) as { filepath: string; body: string } | null;
    }

    if (!doc) {
      console.error(`Document not found: ${filename}`);
      db.close();
      process.exit(1);
    }

    // Get context for this file
    const context = getContextForFile(db, doc.filepath);

    let output = doc.body;

    // Apply line filtering if specified
    if (startLine !== undefined || maxLines !== undefined) {
      const lines = output.split('\n');
      // Convert to 0-indexed; clamp so a negative line never slices from the end.
      const start = Math.max(0, (startLine || 1) - 1);
      const end = maxLines !== undefined ? start + maxLines : lines.length;
      output = lines.slice(start, end).join('\n');
    }

    // Output context header if exists
    if (context) {
      console.log(`Folder Context: ${context}\n---\n`);
    }
    console.log(output);
    db.close();
  }
  /**
   * Find the context attached to the most specific (longest) path prefix that
   * contains `filepath`.
   *
   * A prefix matches when it equals `filepath` or when `filepath` continues
   * with "/" right after it. The comparison is literal and case-sensitive:
   * "_" and "%" in a stored prefix are ordinary characters, and a prefix such
   * as "/a/proj" does not match "/a/project2/x.md".
   *
   * @param db - Open database handle.
   * @param filepath - Path of the document to look up.
   * @returns The stored context text, or `null` when no prefix matches or the
   *          matching context is empty.
   */
  function getContextForFile(db: Database, filepath: string): string | null {
    // rtrim() tolerates prefixes stored with a trailing slash; the longest
    // matching prefix wins via ORDER BY ... DESC LIMIT 1.
    const result = db.prepare(`
      SELECT context FROM path_contexts
      WHERE ? = path_prefix
         OR substr(?, 1, length(rtrim(path_prefix, '/')) + 1) = rtrim(path_prefix, '/') || '/'
      ORDER BY LENGTH(path_prefix) DESC
      LIMIT 1
    `).get(filepath, filepath) as { context: string } | null;
    return result?.context || null;
  }
  /**
   * Remove the collection registered for the current working directory and
   * the given glob pattern, along with all of its documents.
   *
   * Prints a notice and changes nothing when no such collection exists.
   * Only `documents` and `collections` rows are deleted here.
   *
   * @param globPattern - Glob pattern of the collection to drop.
   */
  async function dropCollection(globPattern: string): Promise<void> {
    const db = getDb();
    const pwd = getPwd();

    const row = db.prepare(`SELECT id FROM collections WHERE pwd = ? AND glob_pattern = ?`).get(pwd, globPattern) as { id: number } | null;

    if (row) {
      // Documents first, then the collection row itself.
      const { changes } = db.prepare(`DELETE FROM documents WHERE collection_id = ?`).run(row.id);
      db.prepare(`DELETE FROM collections WHERE id = ?`).run(row.id);

      console.log(`Dropped collection: ${pwd} (${globPattern})`);
      console.log(`Removed ${changes} documents`);
      console.log(`(Vectors kept for potential reuse)`);
    } else {
      console.log(`No collection found for ${pwd} with pattern ${globPattern}`);
    }

    db.close();
  }
  /**
   * Scan the current working directory for files matching `globPattern` and
   * synchronise the `documents` table of the corresponding collection.
   *
   * For each file found:
   * - unchanged content: only the title and an empty display path are refreshed;
   * - changed content: the active row is deactivated and a new row inserted;
   * - new file: a row is inserted, unless the same file is already active in
   *   another collection, in which case it is skipped.
   *
   * Active documents of the collection that were not seen in the scan are
   * deactivated. Path segments starting with "." and the directories listed
   * in `excludeDirs` are skipped. Progress is written to stderr and a summary
   * to stdout. The database handle is closed when the function finishes,
   * including when it throws.
   *
   * @param globPattern - Glob pattern, relative to the working directory.
   */
  async function indexFiles(globPattern: string = DEFAULT_GLOB): Promise<void> {
    const db = getDb();
    try {
      const pwd = getPwd();
      const now = new Date().toISOString();
      const excludeDirs = ["node_modules", ".git", ".cache", "vendor", "dist", "build"];

      // Clear Ollama cache on index
      clearCache(db);

      // Get or create collection for this (pwd, glob)
      const collectionId = getOrCreateCollection(db, pwd, globPattern);
      console.log(`Collection: ${pwd} (${globPattern})`);

      progress.indeterminate();
      const glob = new Glob(globPattern);
      const files: string[] = [];
      for await (const file of glob.scan({ cwd: pwd, onlyFiles: true, followSymlinks: true })) {
        // Skip hidden folders/files (.*) and other common excludes
        const shouldSkip = file
          .split("/")
          .some(part => part.startsWith(".") || excludeDirs.includes(part));
        if (!shouldSkip) {
          files.push(file);
        }
      }

      const total = files.length;
      if (total === 0) {
        progress.clear();
        console.log("No files found matching pattern.");
        return;
      }

      const insertStmt = db.prepare(`INSERT INTO documents (collection_id, name, title, hash, filepath, display_path, body, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 1)`);
      const deactivateStmt = db.prepare(`UPDATE documents SET active = 0 WHERE collection_id = ? AND filepath = ? AND active = 1`);
      const findActiveStmt = db.prepare(`SELECT id, hash, title, display_path FROM documents WHERE collection_id = ? AND filepath = ? AND active = 1`);
      const findActiveAnyCollectionStmt = db.prepare(`SELECT id, collection_id, hash, title, display_path FROM documents WHERE filepath = ? AND active = 1`);
      const updateTitleStmt = db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`);
      const updateDisplayPathStmt = db.prepare(`UPDATE documents SET display_path = ? WHERE id = ?`);

      // Collect all existing display_paths for uniqueness check
      const existingDisplayPaths = new Set<string>(
        (db.prepare(`SELECT display_path FROM documents WHERE active = 1 AND display_path != ''`).all() as { display_path: string }[])
          .map(r => r.display_path)
      );

      // Insert a fresh active row for a file, using its filesystem timestamps
      // when available and `now` otherwise, and reserve its display path.
      const insertDocument = async (name: string, title: string, hash: string, filepath: string, content: string): Promise<void> => {
        const stat = await Bun.file(filepath).stat();
        const displayPath = computeDisplayPath(filepath, pwd, existingDisplayPaths);
        insertStmt.run(
          collectionId, name, title, hash, filepath, displayPath, content,
          stat ? new Date(stat.birthtime).toISOString() : now,
          stat ? new Date(stat.mtime).toISOString() : now
        );
        existingDisplayPaths.add(displayPath);
      };

      let indexed = 0;
      let updated = 0;
      let unchanged = 0;
      let processed = 0;
      const seenFiles = new Set<string>();
      const startTime = Date.now();

      for (const relativeFile of files) {
        const filepath = getRealPath(resolve(pwd, relativeFile));
        seenFiles.add(filepath);

        const content = await Bun.file(filepath).text();
        const hash = await hashContent(content);
        const name = relativeFile.replace(/\.md$/, "").split("/").pop() || relativeFile;
        const title = extractTitle(content, relativeFile);

        // First check if file exists in THIS collection
        const existing = findActiveStmt.get(collectionId, filepath) as { id: number; hash: string; title: string; display_path: string } | null;

        if (existing && existing.hash === hash) {
          // Hash unchanged, but check if title needs updating
          if (existing.title !== title) {
            updateTitleStmt.run(title, now, existing.id);
            updated++;
          } else {
            unchanged++;
          }
          // Update display_path if empty
          if (!existing.display_path) {
            const displayPath = computeDisplayPath(filepath, pwd, existingDisplayPaths);
            updateDisplayPathStmt.run(displayPath, existing.id);
            existingDisplayPaths.add(displayPath);
          }
        } else if (existing) {
          // Content changed - deactivate old, insert new. The old display path
          // is released first so the new row may reuse it.
          existingDisplayPaths.delete(existing.display_path);
          deactivateStmt.run(collectionId, filepath);
          updated++;
          await insertDocument(name, title, hash, filepath, content);
        } else if (findActiveAnyCollectionStmt.get(filepath)) {
          // File already indexed in another collection (inserting would
          // violate the unique constraint) - skip it
          unchanged++;
        } else {
          indexed++;
          await insertDocument(name, title, hash, filepath, content);
        }

        processed++;
        progress.set((processed / total) * 100);
        const elapsed = (Date.now() - startTime) / 1000;
        const rate = processed / elapsed;
        const remaining = (total - processed) / rate;
        const eta = processed > 2 ? ` ETA: ${formatETA(remaining)}` : "";
        process.stderr.write(`\rIndexing: ${processed}/${total}${eta} `);
      }

      // Deactivate documents in this collection that no longer exist
      const allActive = db.prepare(`SELECT filepath FROM documents WHERE collection_id = ? AND active = 1`).all(collectionId) as { filepath: string }[];
      let removed = 0;
      for (const row of allActive) {
        if (!seenFiles.has(row.filepath)) {
          deactivateStmt.run(collectionId, row.filepath);
          removed++;
        }
      }

      // Check if vector index needs updating
      const needsEmbedding = getHashesNeedingEmbedding(db);

      progress.clear();
      console.log(`\nIndexed: ${indexed} new, ${updated} updated, ${unchanged} unchanged, ${removed} removed`);
      if (needsEmbedding > 0) {
        console.log(`\nRun 'qmd embed' to update embeddings (${needsEmbedding} unique hashes need vectors)`);
      }
    } finally {
      db.close();
    }
  }
  /**
   * Render a textual progress bar made of "█" (done) and "░" (remaining).
   *
   * `percent` is clamped to the 0-100 range and NaN is treated as 0, so an
   * out-of-range value can never make `String.prototype.repeat` throw.
   *
   * @param percent - Completion percentage.
   * @param width - Total number of characters in the bar (default 30).
   * @returns A string of exactly `width` bar characters.
   */
  function renderProgressBar(percent: number, width: number = 30): string {
    const clamped = Number.isNaN(percent) ? 0 : Math.min(100, Math.max(0, percent));
    const filled = Math.round((clamped / 100) * width);
    return "█".repeat(filled) + "░".repeat(width - filled);
  }
  /**
   * Compute and store embeddings for every active document whose content hash
   * has no vector yet.
   *
   * Documents are split into chunks with `chunkDocument`; each chunk is
   * embedded with `getEmbedding` and written to `vectors_vec` and
   * `content_vectors`. The first chunk is embedded up front because its length
   * determines the vector table dimensions; a failure there aborts the run.
   * Failures on later chunks are counted and reported, and the run continues.
   *
   * The terminal cursor is restored and the database handle closed even when
   * the run aborts with an exception.
   *
   * @param model - Embedding model name.
   * @param force - When true, all existing vectors are deleted first.
   */
  async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = false): Promise<void> {
    const db = getDb();
    try {
      const now = new Date().toISOString();

      // If force, clear all vectors
      if (force) {
        console.log(`${c.yellow}Force re-indexing: clearing all vectors...${c.reset}`);
        db.exec(`DELETE FROM content_vectors`);
        db.exec(`DROP TABLE IF EXISTS vectors_vec`);
      }

      // Find unique hashes that need embedding (from active documents)
      // Use MIN(filepath) to get one representative filepath per hash
      const hashesToEmbed = db.prepare(`
        SELECT d.hash, d.body, MIN(d.filepath) as filepath, MIN(d.display_path) as display_path
        FROM documents d
        LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
        WHERE d.active = 1 AND v.hash IS NULL
        GROUP BY d.hash
      `).all() as { hash: string; body: string; filepath: string; display_path: string }[];

      if (hashesToEmbed.length === 0) {
        console.log(`${c.green}✓ All content hashes already have embeddings.${c.reset}`);
        return;
      }

      // Prepare documents with chunks
      type ChunkItem = { hash: string; title: string; text: string; seq: number; pos: number; bytes: number; displayName: string };
      const allChunks: ChunkItem[] = [];
      let multiChunkDocs = 0;
      const encoder = new TextEncoder();

      for (const item of hashesToEmbed) {
        const bodyBytes = encoder.encode(item.body).length;
        if (bodyBytes === 0) continue; // Skip empty

        const title = extractTitle(item.body, item.filepath);
        const displayName = item.display_path || item.filepath;
        const chunks = chunkDocument(item.body, CHUNK_BYTE_SIZE);
        if (chunks.length > 1) multiChunkDocs++;

        for (let seq = 0; seq < chunks.length; seq++) {
          allChunks.push({
            hash: item.hash,
            title,
            text: chunks[seq].text,
            seq,
            pos: chunks[seq].pos,
            bytes: encoder.encode(chunks[seq].text).length,
            displayName,
          });
        }
      }

      if (allChunks.length === 0) {
        console.log(`${c.green}✓ No non-empty documents to embed.${c.reset}`);
        return;
      }

      const totalBytes = allChunks.reduce((sum, chunk) => sum + chunk.bytes, 0);
      const totalChunks = allChunks.length;
      const totalDocs = hashesToEmbed.length;

      console.log(`${c.bold}Embedding ${totalDocs} documents${c.reset} ${c.dim}(${totalChunks} chunks, ${formatBytes(totalBytes)})${c.reset}`);
      if (multiChunkDocs > 0) {
        console.log(`${c.dim}${multiChunkDocs} documents split into multiple chunks${c.reset}`);
      }
      console.log(`${c.dim}Model: ${model}${c.reset}\n`);

      let chunksEmbedded = 0;
      let errors = 0;
      let bytesProcessed = 0;
      let startTime = Date.now();

      // Hide cursor during embedding; the finally below always restores it.
      cursor.hide();
      try {
        // Get embedding dimensions from first chunk
        progress.indeterminate();
        const firstChunk = allChunks[0];
        const firstEmbedding = await getEmbedding(firstChunk.text, model, false, firstChunk.title);
        ensureVecTable(db, firstEmbedding.length);

        const insertVecStmt = db.prepare(`INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
        const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`);

        // Persist one chunk's embedding in both tables and count it.
        const storeEmbedding = (chunk: ChunkItem, embedding: number[]): void => {
          insertVecStmt.run(`${chunk.hash}_${chunk.seq}`, new Float32Array(embedding));
          insertContentVectorStmt.run(chunk.hash, chunk.seq, chunk.pos, model, now);
          chunksEmbedded++;
        };

        // Timing starts after the first embedding call.
        startTime = Date.now();

        // Insert first chunk
        storeEmbedding(firstChunk, firstEmbedding);
        bytesProcessed += firstChunk.bytes;

        for (let i = 1; i < allChunks.length; i++) {
          const chunk = allChunks[i];
          try {
            const embedding = await getEmbedding(chunk.text, model, false, chunk.title);
            storeEmbedding(chunk, embedding);
          } catch (err) {
            errors++;
            progress.error();
            console.error(`\n${c.yellow}⚠ Error embedding "${chunk.displayName}" chunk ${chunk.seq}: ${err}${c.reset}`);
          }
          // Failed chunks still count as processed so the bar reaches 100%.
          bytesProcessed += chunk.bytes;

          const percent = (bytesProcessed / totalBytes) * 100;
          progress.set(percent);

          const elapsed = (Date.now() - startTime) / 1000;
          // Guard against a zero elapsed time, which would yield Infinity.
          const bytesPerSec = elapsed > 0 ? bytesProcessed / elapsed : 0;
          const remainingBytes = totalBytes - bytesProcessed;
          const etaSec = bytesPerSec > 0 ? remainingBytes / bytesPerSec : 0;

          const bar = renderProgressBar(percent);
          const percentStr = percent.toFixed(0).padStart(3);
          const throughput = `${formatBytes(bytesPerSec)}/s`;
          const eta = elapsed > 2 ? formatETA(etaSec) : "...";
          const errStr = errors > 0 ? ` ${c.yellow}${errors} err${c.reset}` : "";

          process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}%${c.reset} ${c.dim}${chunksEmbedded}/${totalChunks}${c.reset}${errStr} ${c.dim}${throughput} ETA ${eta}${c.reset} `);
        }

        progress.clear();
      } finally {
        cursor.show();
      }

      const totalTimeSec = (Date.now() - startTime) / 1000;
      const avgThroughput = formatBytes(totalTimeSec > 0 ? totalBytes / totalTimeSec : 0);

      console.log(`\r${c.green}${renderProgressBar(100)}${c.reset} ${c.bold}100%${c.reset} `);
      console.log(`\n${c.green}✓ Done!${c.reset} Embedded ${c.bold}${chunksEmbedded}${c.reset} chunks from ${c.bold}${totalDocs}${c.reset} documents in ${c.bold}${formatETA(totalTimeSec)}${c.reset} ${c.dim}(${avgThroughput}/s)${c.reset}`);
      if (errors > 0) {
        console.log(`${c.yellow}⚠ ${errors} chunks failed${c.reset}`);
      }
    } finally {
      db.close();
    }
  }
  /**
   * Escape a value for use as a single CSV field.
   *
   * A field containing a double quote, comma, line feed or carriage return is
   * wrapped in double quotes, with embedded double quotes doubled. Any other
   * value is returned unchanged.
   *
   * @param value - Raw field text.
   * @returns The field text, quoted when necessary.
   */
  function escapeCSV(value: string): string {
    // "\r" is included so values with CRLF or bare CR line endings cannot
    // break a row apart.
    const needsQuoting = /[",\r\n]/.test(value);
    return needsQuoting ? `"${value.replace(/"/g, '""')}"` : value;
  }
  /**
   * Pick the line of `body` that contains the most query terms and return it
   * with one line of context before and after.
   *
   * When `chunkPos` is given, the search starts 200 characters before that
   * position (or at the start of the body if that is closer), and the
   * reported line number is adjusted by the number of lines skipped.
   *
   * @param body - Full document text.
   * @param query - Search query; split on whitespace and matched case-insensitively.
   * @param maxLen - Maximum snippet length; longer snippets are cut and end in "...".
   * @param chunkPos - Optional character offset in `body` to focus the search on.
   * @returns The 1-based line number of the best line in `body` and the snippet text.
   */
  function extractSnippet(body: string, query: string, maxLen = 500, chunkPos?: number): { line: number; snippet: string } {
    let lineOffset = 0;
    let searchBody = body;

    if (chunkPos && chunkPos > 0) {
      // Focus search on the chunk area (with some context before)
      const contextStart = Math.max(0, chunkPos - 200);
      searchBody = body.slice(contextStart);
      // The offset must count only the lines actually cut off, i.e. those
      // before contextStart - not those before chunkPos - so that it stays 0
      // when the search still begins at the top of the body.
      if (contextStart > 0) {
        lineOffset = body.slice(0, contextStart).split('\n').length - 1;
      }
    }

    const lines = searchBody.split('\n');
    const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 0);

    // First line with the highest number of matching terms wins; with no
    // matches at all this is the first line.
    let bestLine = 0;
    let bestScore = -1;
    for (let i = 0; i < lines.length; i++) {
      const lineLower = lines[i].toLowerCase();
      let score = 0;
      for (const term of queryTerms) {
        if (lineLower.includes(term)) score++;
      }
      if (score > bestScore) {
        bestScore = score;
        bestLine = i;
      }
    }

    const startLine = Math.max(0, bestLine - 1);
    const endLine = Math.min(lines.length, bestLine + 2);
    let snippet = lines.slice(startLine, endLine).join('\n');
    if (snippet.length > maxLen) snippet = snippet.substring(0, maxLen - 3) + "...";

    return { line: lineOffset + bestLine + 1, snippet };
  }
  /** A single hit produced by one of the search back ends. */
  type SearchResult = {
    /** Path of the document (filled from the `filepath` column by the search functions in this file). */
    file: string;
    /** Value of the document's `display_path` column. */
    displayPath: string;
    title: string;
    body: string;
    /** Relevance score; higher is better. */
    score: number;
    /** Which back end produced the hit: full-text ("fts") or vector ("vec"). */
    source: "fts" | "vec";
    /** Position of the best-matching chunk; set by vector search only. */
    chunkPos?: number;
  };
  /**
   * Reduce a search term to the characters that are safe inside an FTS5
   * query: word characters (letters, digits, underscore) and apostrophes
   * (kept for contractions like "don't"). Everything else is dropped.
   *
   * @param term - Raw term taken from the user's query.
   * @returns The term with all other characters removed; may be empty.
   */
  function sanitizeFTS5Term(term: string): string {
    const kept = term.match(/[\w']/g);
    return kept === null ? "" : kept.join("");
  }
  /**
   * Build an FTS5 MATCH expression from a free-text query.
   *
   * - No usable terms (after sanitizing and dropping terms shorter than two
   *   characters): returns "".
   * - One term: returns that term as a quoted string.
   * - Several terms: returns three alternatives joined with OR - the whole
   *   sanitized query as an exact phrase, a NEAR group over all terms with a
   *   distance of 10, and the individual terms OR-ed together.
   *
   * @param query - Raw user query.
   * @returns An FTS5 query string, or "" when nothing searchable remains.
   */
  function buildFTS5Query(query: string): string {
    // Wrap text in double quotes, doubling any embedded double quote.
    const quote = (text: string): string => `"${text.replace(/"/g, '""')}"`;

    const terms: string[] = [];
    for (const rawTerm of query.split(/\s+/)) {
      const cleaned = sanitizeFTS5Term(rawTerm);
      // Skip single chars and empty
      if (cleaned.length >= 2) terms.push(cleaned);
    }

    if (terms.length === 0) return "";
    if (terms.length === 1) return quote(terms[0]);

    // Whole query, stripped of punctuation except apostrophes, as one phrase.
    const phrase = quote(query.replace(/[^\w\s']/g, '').trim());
    const quotedTerms = terms.map(quote);
    // FTS5 NEAR syntax: NEAR(term1 term2, distance)
    const nearPhrase = `NEAR(${quotedTerms.join(' ')}, 10)`;
    const orTerms = quotedTerms.join(' OR ');

    // Exact phrase > proximity > any term
    return `(${phrase}) OR (${nearPhrase}) OR (${orTerms})`;
  }
  /**
   * Map a raw SQLite BM25 score onto the 0-1 range, where higher is better.
   *
   * SQLite reports BM25 as negative numbers (lower = better), so the absolute
   * value is taken and passed through a logistic curve centred at 5 with a
   * scale of 3. A magnitude of 5 maps to exactly 0.5.
   *
   * @param score - Raw `bm25()` value.
   * @returns Normalized score between 0 and 1.
   */
  function normalizeBM25(score: number): number {
    const MIDPOINT = 5;
    const SCALE = 3;
    const magnitude = Math.abs(score);
    const exponent = -(magnitude - MIDPOINT) / SCALE;
    return 1 / (1 + Math.exp(exponent));
  }
  /**
   * Run a full-text search over active documents.
   *
   * The user query is converted with `buildFTS5Query`; rows come back ordered
   * by raw BM25 (column weights 10.0 and 1.0) and each raw score is converted
   * with `normalizeBM25`.
   *
   * @param db - Open database handle.
   * @param query - Raw user query.
   * @param limit - Maximum number of rows to return (default 20).
   * @returns Matching documents tagged with source "fts"; empty when the
   *          query has no searchable terms.
   */
  function searchFTS(db: Database, query: string, limit: number = 20): SearchResult[] {
    type FtsRow = { filepath: string; display_path: string; title: string; body: string; score: number };

    const ftsQuery = buildFTS5Query(query);
    if (!ftsQuery) return [];

    // BM25 weights: name=10, body=1 (title matches ranked higher)
    const rows = db.prepare(`
      SELECT d.filepath, d.display_path, d.title, d.body, bm25(documents_fts, 10.0, 1.0) as score
      FROM documents_fts f
      JOIN documents d ON d.id = f.rowid
      WHERE documents_fts MATCH ? AND d.active = 1
      ORDER BY score
      LIMIT ?
    `).all(ftsQuery, limit) as FtsRow[];

    const hits: SearchResult[] = [];
    for (const row of rows) {
      hits.push({
        file: row.filepath,
        displayPath: row.display_path,
        title: row.title,
        body: row.body,
        score: normalizeBM25(row.score),
        source: "fts" as const,
      });
    }
    return hits;
  }
  /**
   * Run a vector similarity search over active documents.
   *
   * The query is embedded with `getEmbedding`, the nearest `limit * 3` chunks
   * are fetched (over-retrieval, because one document can contribute several
   * chunks), and the chunks are then collapsed to one result per file.
   *
   * Score per file = 1 / (1 + best chunk distance) + 0.02 for every
   * additional matching chunk, with the bonus capped at 5 chunks (+0.1).
   *
   * @param db - Open database handle.
   * @param query - Raw user query.
   * @param model - Embedding model name.
   * @param limit - Maximum number of documents to return (default 20).
   * @returns Documents tagged with source "vec", best first; empty when the
   *          `vectors_vec` table does not exist.
   */
  async function searchVec(db: Database, query: string, model: string, limit: number = 20): Promise<SearchResult[]> {
    type ChunkRow = { filepath: string; display_path: string; title: string; body: string; distance: number; pos: number };
    type FileAggregate = { filepath: string; displayPath: string; title: string; body: string; chunkCount: number; bestPos: number; bestDist: number };

    const vecTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (!vecTable) return [];

    const queryVec = new Float32Array(await getEmbedding(query, model, true));

    // Join: vectors_vec -> content_vectors -> documents
    const chunkRows = db.prepare(`
      SELECT d.filepath, d.display_path, d.title, d.body, vec.distance, cv.pos
      FROM vectors_vec vec
      JOIN content_vectors cv ON vec.hash_seq = cv.hash || '_' || cv.seq
      JOIN documents d ON d.hash = cv.hash AND d.active = 1
      WHERE vec.embedding MATCH ? AND k = ?
      ORDER BY vec.distance
    `).all(queryVec, limit * 3) as ChunkRow[];

    // Collapse chunks: remember the closest chunk per file and count the rest.
    const perFile = new Map<string, FileAggregate>();
    for (const row of chunkRows) {
      const seen = perFile.get(row.filepath);
      if (seen === undefined) {
        perFile.set(row.filepath, {
          filepath: row.filepath,
          displayPath: row.display_path,
          title: row.title,
          body: row.body,
          chunkCount: 1,
          bestPos: row.pos,
          bestDist: row.distance,
        });
        continue;
      }
      seen.chunkCount++;
      if (row.distance < seen.bestDist) {
        seen.bestDist = row.distance;
        seen.bestPos = row.pos;
      }
    }

    const ranked: SearchResult[] = [];
    for (const agg of perFile.values()) {
      const maxScore = 1 / (1 + agg.bestDist);
      const bonus = Math.min(agg.chunkCount - 1, 5) * 0.02;
      ranked.push({
        file: agg.filepath,
        displayPath: agg.displayPath,
        title: agg.title,
        body: agg.body,
        score: maxScore + bonus,
        source: "vec" as const,
        chunkPos: agg.bestPos,
      });
    }
    ranked.sort((a, b) => b.score - a.score);
    return ranked.slice(0, limit);
  }
  /**
   * Min-max normalize the scores of a result list into the 0-1 range.
   *
   * The lowest score becomes 0 and the highest 1. When all scores are equal
   * the range falls back to 1, so every score becomes 0. The input objects
   * are not modified; an empty input array is returned as is.
   *
   * @param results - Results to rescale.
   * @returns New result objects with rescaled `score` values.
   */
  function normalizeScores(results: SearchResult[]): SearchResult[] {
    if (results.length === 0) return results;

    const scores = results.map(result => result.score);
    const lowest = Math.min(...scores);
    const highest = Math.max(...scores);
    const span = highest - lowest || 1;

    return results.map(result => {
      const rescaled = (result.score - lowest) / span;
      return { ...result, score: rescaled };
    });
  }
  /** Minimal shape of a ranked hit accepted and produced by `reciprocalRankFusion`. */
  type RankedResult = { file: string; displayPath: string; title: string; body: string; score: number };

  /**
   * Reciprocal Rank Fusion: merge several ranked lists into one.
   *
   * Each appearance of a document at 0-based position `rank` in a list adds
   * `weight / (k + rank + 1)` to that document's score. Documents are
   * identified by `file`; display path, title and body are taken from the
   * first appearance. After summing, a document that ranked first in any list
   * gets +0.05 and one that ranked second or third gets +0.02, so strong hits
   * in one list are not diluted by the other lists.
   *
   * @param resultLists - Ranked lists, best hit first. The incoming `score` values are ignored.
   * @param weights - Weight per list, by index; a missing entry counts as 1.0.
   * @param k - Smoothing constant (default 60).
   * @returns One entry per distinct file, sorted by fused score, highest first.
   */
  function reciprocalRankFusion(
    resultLists: RankedResult[][],
    weights: number[] = [], // Weight per result list (default 1.0)
    k: number = 60
  ): RankedResult[] {
    type Fused = { score: number; displayPath: string; title: string; body: string; bestRank: number };
    const fusedByFile = new Map<string, Fused>();

    resultLists.forEach((list, listIdx) => {
      const weight = weights[listIdx] ?? 1.0;
      list.forEach((doc, rank) => {
        const contribution = weight / (k + rank + 1);
        const entry = fusedByFile.get(doc.file);
        if (!entry) {
          fusedByFile.set(doc.file, {
            score: contribution,
            displayPath: doc.displayPath,
            title: doc.title,
            body: doc.body,
            bestRank: rank,
          });
          return;
        }
        entry.score += contribution;
        entry.bestRank = Math.min(entry.bestRank, rank);
      });
    });

    const merged: RankedResult[] = [];
    for (const [file, entry] of fusedByFile) {
      // Bonus for best rank: #1 somewhere, otherwise top-3 somewhere.
      let bonus = 0;
      if (entry.bestRank === 0) {
        bonus = 0.05;
      } else if (entry.bestRank <= 2) {
        bonus = 0.02;
      }
      merged.push({
        file,
        displayPath: entry.displayPath,
        title: entry.title,
        body: entry.body,
        score: entry.score + bonus,
      });
    }
    merged.sort((a, b) => b.score - a.score);
    return merged;
  }
  // Renderings supported by outputResults.
  type OutputFormat =
    | "cli"
    | "csv"
    | "md"
    | "xml"
    | "files"
    | "json";

  // Options controlling how search results are filtered and rendered.
  type OutputOptions = {
    // Which renderer to use.
    format: OutputFormat;
    // Emit the whole document body instead of a snippet.
    full: boolean;
    // Maximum number of results to emit.
    limit: number;
    // Results scoring below this value are dropped.
    minScore: number;
    // Set when --all was passed on the command line.
    all?: boolean;
  };
  // Extract a multi-line snippet around the line of `body` that contains the
  // most query terms (case-insensitive substring match per whitespace-separated
  // term).
  //
  // body:         full document text.
  // query:        raw user query.
  // contextLines: lines to include before and after the best line (default 3).
  // chunkPos:     optional character offset; when > 0 the search starts roughly
  //               200 characters before it instead of at the top of the document.
  //
  // Returns the 1-based line number (relative to the whole body), the snippet,
  // and whether any term matched. Without a match the snippet is the first
  // contextLines * 2 lines of the searched area.
  function extractSnippetWithContext(body: string, query: string, contextLines = 3, chunkPos?: number): { line: number; snippet: string; hasMatch: boolean } {
    let lineOffset = 0;
    let searchBody = body;
    if (chunkPos && chunkPos > 0) {
      const approxStart = Math.max(0, chunkPos - 200);
      // Snap back to the beginning of the containing line so the snippet never
      // opens with a truncated word and a term straddling the cut still matches.
      const contextStart = approxStart > 0 ? body.lastIndexOf('\n', approxStart - 1) + 1 : 0;
      if (contextStart > 0) {
        searchBody = body.slice(contextStart);
        lineOffset = body.slice(0, contextStart).split('\n').length - 1;
      }
    }

    const lines = searchBody.split('\n');
    const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 0);

    // Pick the first line containing the highest number of distinct query terms.
    let bestLine = 0;
    let bestScore = -1;
    for (let i = 0; i < lines.length; i++) {
      const lineLower = lines[i].toLowerCase();
      let score = 0;
      for (const term of queryTerms) {
        if (lineLower.includes(term)) score++;
      }
      if (score > bestScore) {
        bestScore = score;
        bestLine = i;
      }
    }

    // No query term found anywhere: fall back to the start of the searched area.
    if (bestScore <= 0) {
      const preview = lines.slice(0, contextLines * 2).join('\n').trim();
      return { line: lineOffset + 1, snippet: preview, hasMatch: false };
    }

    const startLine = Math.max(0, bestLine - contextLines);
    const endLine = Math.min(lines.length, bestLine + contextLines + 1);
    const snippet = lines.slice(startLine, endLine).join('\n').trim();
    return { line: lineOffset + bestLine + 1, snippet, hasMatch: true };
  }
  // Highlight query terms in `text` with the yellow + bold color codes.
  // Terms are the whitespace-separated words of `query`, lower-cased; words
  // shorter than 3 characters are skipped. Matching is case-insensitive.
  // Returns `text` unchanged when color output is disabled or no term qualifies.
  function highlightTerms(text: string, query: string): string {
    if (!useColor) return text;
    const terms = Array.from(new Set(query.toLowerCase().split(/\s+/).filter(t => t.length >= 3)));
    if (terms.length === 0) return text;
    // All terms are matched in ONE pass over the original text. Replacing term
    // by term would let later terms match inside the escape codes inserted for
    // earlier ones. Longer terms come first so "testing" wins over "test".
    const alternatives = terms
      .sort((a, b) => b.length - a.length)
      .map(term => term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
    const pattern = new RegExp(`(${alternatives.join('|')})`, 'gi');
    return text.replace(pattern, match => `${c.yellow}${c.bold}${match}${c.reset}`);
  }
  // Render a score as a whole-number percentage padded to three characters,
  // e.g. " 85%". With color enabled the text is green from 0.7 upward, yellow
  // from 0.4 upward, and dimmed below that.
  function formatScore(score: number): string {
    const label = `${(score * 100).toFixed(0).padStart(3)}%`;
    if (!useColor) return label;

    let tint = c.dim;
    if (score >= 0.7) {
      tint = c.green;
    } else if (score >= 0.4) {
      tint = c.yellow;
    }
    return `${tint}${label}${c.reset}`;
  }
  // Shorten a directory path for display by replacing the user's home directory
  // prefix with "~" (used for context paths, not documents).
  // The prefix is only replaced at a path boundary: with home "/home/bob",
  // "/home/bob/notes" becomes "~/notes" but "/home/bobby" is left untouched.
  function shortPath(dirpath: string): string {
    const home = homedir();
    if (!home || !dirpath.startsWith(home)) return dirpath;
    const rest = dirpath.slice(home.length);
    // Exact match, or the remainder starts with a separator (POSIX or Windows).
    if (rest === '' || rest.startsWith('/') || rest.startsWith('\\')) {
      return '~' + rest;
    }
    return dirpath;
  }
  // Print search results to stdout in the format selected by opts.format.
  //
  // results: result rows, expected to be sorted best-first by the caller.
  // query:   the user's query, used for snippet extraction and highlighting.
  // opts:    rows below opts.minScore are dropped, then at most opts.limit rows
  //          are printed; opts.full switches snippets to full document bodies.
  //
  // Formats: "json" (array of objects), "files" (score,path[,context] lines),
  // "cli" (colored, human readable), "md", "xml", and CSV for anything else.
  function outputResults(results: { file: string; displayPath: string; title: string; body: string; score: number; context?: string | null; chunkPos?: number }[], query: string, opts: OutputOptions): void {
    // Escape a value for use inside a double-quoted XML attribute.
    const escapeXmlAttr = (value: string): string =>
      value
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');

    const filtered = results.filter(r => r.score >= opts.minScore).slice(0, opts.limit);
    if (filtered.length === 0) {
      console.log("No results found above minimum score threshold.");
      return;
    }

    if (opts.format === "json") {
      // JSON output for LLM consumption
      const output = filtered.map(row => ({
        score: Math.round(row.score * 100) / 100,
        file: row.displayPath,
        title: row.title,
        ...(row.context && { context: row.context }),
        ...(opts.full && { body: row.body }),
        ...(!opts.full && { snippet: extractSnippet(row.body, query, 300, row.chunkPos).snippet }),
      }));
      console.log(JSON.stringify(output, null, 2));
    } else if (opts.format === "files") {
      // Simple score,filepath,context output (context quoted CSV-style)
      for (const row of filtered) {
        const ctx = row.context ? `,"${row.context.replace(/"/g, '""')}"` : "";
        console.log(`${row.score.toFixed(2)},${row.displayPath}${ctx}`);
      }
    } else if (opts.format === "cli") {
      for (let i = 0; i < filtered.length; i++) {
        const row = filtered[i];
        const { line, snippet, hasMatch } = extractSnippetWithContext(row.body, query, 2, row.chunkPos);

        // Line 1: filepath, with the matching line number when there is one
        const path = row.displayPath;
        const lineInfo = hasMatch ? `:${line}` : "";
        console.log(`${c.cyan}${path}${c.dim}${lineInfo}${c.reset}`);

        // Line 2: Title (if available)
        if (row.title) {
          console.log(`${c.bold}Title: ${row.title}${c.reset}`);
        }

        // Line 3: Context (if available)
        if (row.context) {
          console.log(`${c.dim}Context: ${row.context}${c.reset}`);
        }

        // Line 4: Score
        const score = formatScore(row.score);
        console.log(`Score: ${c.bold}${score}${c.reset}`);
        console.log();

        // Snippet with highlighting (no leading | chars for better word wrap)
        const highlighted = highlightTerms(snippet, query);
        console.log(highlighted);

        // Double empty line between results
        if (i < filtered.length - 1) console.log('\n');
      }
    } else if (opts.format === "md") {
      for (const row of filtered) {
        const heading = row.title || row.displayPath;
        if (opts.full) {
          console.log(`---\n# ${heading}\n\n${row.body}\n`);
        } else {
          const { snippet } = extractSnippet(row.body, query, 500, row.chunkPos);
          console.log(`---\n# ${heading}\n\n${snippet}\n`);
        }
      }
    } else if (opts.format === "xml") {
      for (const row of filtered) {
        // Attribute values are escaped so paths/titles containing &, <, > or "
        // cannot break the tag. The element content is emitted verbatim.
        const nameAttr = escapeXmlAttr(row.displayPath);
        const titleAttr = row.title ? ` title="${escapeXmlAttr(row.title)}"` : "";
        if (opts.full) {
          console.log(`<file name="${nameAttr}"${titleAttr}>\n${row.body}\n</file>\n`);
        } else {
          const { snippet } = extractSnippet(row.body, query, 500, row.chunkPos);
          console.log(`<file name="${nameAttr}"${titleAttr}>\n${snippet}\n</file>\n`);
        }
      }
    } else {
      // CSV format
      console.log("score,file,title,line,snippet");
      for (const row of filtered) {
        const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos);
        const content = opts.full ? row.body : snippet;
        console.log(`${row.score.toFixed(4)},${escapeCSV(row.displayPath)},${escapeCSV(row.title)},${line},${escapeCSV(content)}`);
      }
    }
  }
  // Full-text search command: runs searchFTS for `query`, attaches the context
  // returned by getContextForFile to every hit, and prints via outputResults.
  // The database handle is closed before printing, including when the search
  // itself throws.
  function search(query: string, opts: OutputOptions): void {
    const db = getDb();
    let resultsWithContext;
    try {
      // Use large limit for --all, otherwise fetch more than needed and let
      // outputResults apply the min-score filter and the final limit.
      const fetchLimit = opts.all ? 100000 : Math.max(50, opts.limit * 2);
      resultsWithContext = searchFTS(db, query, fetchLimit).map(r => ({
        ...r,
        context: getContextForFile(db, r.file),
      }));
    } finally {
      db.close();
    }

    if (resultsWithContext.length === 0) {
      console.log("No results found.");
      return;
    }
    outputResults(resultsWithContext, query, opts);
  }
  // Vector similarity search command.
  // Expands the query into variations, runs searchVec for each one, keeps the
  // best-scoring hit per file, and prints the top opts.limit files.
  // Prints an error and returns if the vectors_vec table does not exist yet.
  // The database handle is always closed before results are printed.
  async function vectorSearch(query: string, opts: OutputOptions, model: string = DEFAULT_EMBED_MODEL): Promise<void> {
    const db = getDb();
    let results;
    try {
      const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
      if (!tableExists) {
        console.error("Vector index not found. Run 'qmd embed' first to create embeddings.");
        return;
      }

      // Expand query to multiple variations (with caching)
      const queries = await expandQuery(query, DEFAULT_QUERY_MODEL, db);
      process.stderr.write(`Searching with ${queries.length} query variations...\n`);

      // Collect results from all query variations; for --all, fetch more per query
      const perQueryLimit = opts.all ? 500 : 20;
      const bestByFile = new Map<string, { file: string; displayPath: string; title: string; body: string; score: number; chunkPos?: number }>();
      for (const q of queries) {
        const vecResults = await searchVec(db, q, model, perQueryLimit);
        for (const r of vecResults) {
          const existing = bestByFile.get(r.file);
          if (!existing || r.score > existing.score) {
            // Keep chunkPos so outputResults can centre the snippet on the
            // matching chunk.
            // NOTE(review): assumes searchVec results expose `chunkPos` — confirm.
            bestByFile.set(r.file, { file: r.file, displayPath: r.displayPath, title: r.title, body: r.body, score: r.score, chunkPos: r.chunkPos });
          }
        }
      }

      // Sort by max score and limit to requested count
      results = Array.from(bestByFile.values())
        .sort((a, b) => b.score - a.score)
        .slice(0, opts.limit)
        .map(r => ({ ...r, context: getContextForFile(db, r.file) }));
    } finally {
      db.close();
    }

    if (results.length === 0) {
      console.log("No results found.");
      return;
    }
    outputResults(results, query, { ...opts, limit: results.length }); // Already limited
  }
  // Ask the Ollama generate endpoint for 2 alternative phrasings of `query`.
  //
  // query: the user's search query.
  // model: generation model name.
  // db:    optional open database used for the response cache; when omitted a
  //        handle is opened via getDb() and closed again before returning.
  //
  // Returns [query, ...variations] with at most 2 variations. Falls back to
  // [query] when the server answers with a non-OK status. If the error text
  // says the model is missing, ensureModelAvailable(model) is awaited and the
  // expansion is retried.
  async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db?: Database): Promise<string[]> {
    process.stderr.write("Generating query variations...\n");
    const prompt = `You are a search query expander. Given a search query, generate 2 alternative queries that would help find relevant documents.
Rules:
- Use synonyms and related terminology (e.g., "craft" → "craftsmanship", "quality", "excellence")
- Rephrase to capture different angles (e.g., "engineering culture" → "technical excellence", "developer practices")
- Keep proper nouns and named concepts exactly as written (e.g., "Build a Business", "Stripe", "Shopify")
- Each variation should be 3-8 words, natural search terms
- Do NOT just append words like "search" or "find" or "documents"
Query: "${query}"
Output exactly 2 variations, one per line, no numbering or bullets:`;
    const requestBody = {
      model,
      prompt,
      stream: false,
      think: false,
      options: { num_predict: 150 },
    };
    const endpoint = `${OLLAMA_URL}/api/generate`;

    const cacheDb = db || getDb();
    let responseText = "";
    let modelMissing = false;
    try {
      // Check cache
      const cacheKey = getCacheKey(endpoint, requestBody);
      const cached = getCachedResult(cacheDb, cacheKey);
      if (cached) {
        responseText = cached;
      } else {
        const response = await fetch(endpoint, {
          method: "POST",
          headers: { "Content-Type": "application/json" },
          body: JSON.stringify(requestBody),
        });
        if (response.ok) {
          const data = await response.json() as { response?: string };
          // Tolerate a payload without a `response` field instead of crashing
          // on .trim() below; an empty answer is not worth caching.
          responseText = data.response ?? "";
          if (responseText) setCachedResult(cacheDb, cacheKey, responseText);
        } else {
          const errorText = await response.text();
          modelMissing = errorText.includes("not found") || errorText.includes("does not exist");
          if (!modelMissing) return [query];
        }
      }
    } finally {
      // Only close the handle we opened ourselves; this also runs when fetch
      // or JSON parsing throws.
      if (!db) cacheDb.close();
    }

    if (modelMissing) {
      await ensureModelAvailable(model);
      return expandQuery(query, model, db);
    }

    // Strip list markers/quotes and drop lines that do not look like a query.
    const lines = responseText.trim().split('\n')
      .map(l => l.replace(/^[\d\.\-\*\"\s]+/, '').replace(/["\s]+$/, '').trim())
      .filter(l => l.length > 2 && l.length < 100 && !l.startsWith('<') && !l.toLowerCase().includes('variation'))
      .slice(0, 2);
    const allQueries = [query, ...lines];
    process.stderr.write(`${c.dim}Queries: ${allQueries.join(' | ')}${c.reset}\n`);
    return allQueries;
  }
  // Combined search command: query expansion, FTS + vector retrieval, Reciprocal
  // Rank Fusion, then reranking of the top 30 fused candidates.
  //
  // query:       the user's query (also used as the rerank query).
  // opts:        output options forwarded to outputResults.
  // embedModel:  embedding model passed to searchVec.
  // rerankModel: model passed to rerank.
  //
  // The database handle is always closed before results are printed.
  async function querySearch(query: string, opts: OutputOptions, embedModel: string = DEFAULT_EMBED_MODEL, rerankModel: string = DEFAULT_RERANK_MODEL): Promise<void> {
    const db = getDb();
    let finalResults;
    try {
      // Expand query to multiple variations (with caching); queries[0] is the
      // original query.
      const queries = await expandQuery(query, DEFAULT_QUERY_MODEL, db);
      process.stderr.write(`Searching with ${queries.length} query variations...\n`);

      // Collect ranked result lists for RRF fusion, with one weight per list.
      const rankedLists: RankedResult[][] = [];
      const weights: number[] = [];
      const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
      const toRanked = (r: RankedResult): RankedResult => ({ file: r.file, displayPath: r.displayPath, title: r.title, body: r.body, score: r.score });

      for (let queryIdx = 0; queryIdx < queries.length; queryIdx++) {
        const q = queries[queryIdx];
        // Lists produced by the original query count double. The weight is
        // recorded next to each pushed list because empty lists are skipped, so
        // a list's index does not identify the query it came from.
        const listWeight = queryIdx === 0 ? 2.0 : 1.0;

        // FTS search - get ranked results
        const ftsResults = searchFTS(db, q, 20);
        if (ftsResults.length > 0) {
          rankedLists.push(ftsResults.map(toRanked));
          weights.push(listWeight);
        }

        // Vector search - get ranked results
        if (hasVectors) {
          const vecResults = await searchVec(db, q, embedModel, 20);
          if (vecResults.length > 0) {
            rankedLists.push(vecResults.map(toRanked));
            weights.push(listWeight);
          }
        }
      }

      // Apply Reciprocal Rank Fusion to combine all ranked lists
      const fused = reciprocalRankFusion(rankedLists, weights);
      const candidates = fused.slice(0, 30); // Over-retrieve for reranking
      if (candidates.length === 0) {
        console.log("No results found.");
        return;
      }

      // Rerank with the original query (with caching)
      const reranked = await rerank(
        query,
        candidates.map(cand => ({ file: cand.file, text: cand.body })),
        rerankModel,
        db
      );

      // Blend RRF position score with reranker score using position-aware weights
      // Top retrieval results get more protection from reranker disagreement
      const candidateMap = new Map(candidates.map(cand => [cand.file, { displayPath: cand.displayPath, title: cand.title, body: cand.body }]));
      const rrfRankMap = new Map(candidates.map((cand, i) => [cand.file, i + 1])); // 1-indexed rank
      finalResults = reranked.map(r => {
        const rrfRank = rrfRankMap.get(r.file) || 30;
        // Position-aware blending: top retrieval results preserved more
        //   Rank 1-3:  75% RRF, 25% reranker (trust retrieval for exact matches)
        //   Rank 4-10: 60% RRF, 40% reranker
        //   Rank 11+:  40% RRF, 60% reranker (trust reranker for lower-ranked)
        let rrfWeight: number;
        if (rrfRank <= 3) {
          rrfWeight = 0.75;
        } else if (rrfRank <= 10) {
          rrfWeight = 0.60;
        } else {
          rrfWeight = 0.40;
        }
        const rrfScore = 1 / rrfRank; // Position-based: 1, 0.5, 0.33...
        const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * r.score;
        const candidate = candidateMap.get(r.file);
        return {
          file: r.file,
          displayPath: candidate?.displayPath || "",
          title: candidate?.title || "",
          body: candidate?.body || "",
          score: blendedScore,
          context: getContextForFile(db, r.file),
        };
      }).sort((a, b) => b.score - a.score);
    } finally {
      db.close();
    }

    outputResults(finalResults, query, opts);
  }
  // Parse CLI arguments using util.parseArgs.
  // Side effect: sets the global customIndexName when --index is given.
  // Returns the command (first positional), the remaining positionals both as
  // an array (`args`) and joined with spaces (`query`), the derived output
  // options, and the raw parsed option values.
  function parseCLI() {
    const { values, positionals } = parseArgs({
      args: Bun.argv.slice(2), // Skip bun and script path
      options: {
        // Global options
        index: { type: "string" },
        help: { type: "boolean", short: "h" },
        // Search options
        n: { type: "string" },
        "min-score": { type: "string" },
        all: { type: "boolean" },
        full: { type: "boolean" },
        csv: { type: "boolean" },
        md: { type: "boolean" },
        xml: { type: "boolean" },
        files: { type: "boolean" },
        json: { type: "boolean" },
        // Add options
        drop: { type: "boolean" },
        // Embed options
        force: { type: "boolean", short: "f" },
        // Get options
        l: { type: "string" }, // max lines
        from: { type: "string" }, // start line
      },
      allowPositionals: true,
      strict: false, // Allow unknown options to pass through
    });

    // Set global index name
    if (values.index) {
      customIndexName = values.index;
    }

    // Determine output format (first matching flag wins)
    let format: OutputFormat = "cli";
    if (values.csv) format = "csv";
    else if (values.md) format = "md";
    else if (values.xml) format = "xml";
    else if (values.files) format = "files";
    else if (values.json) format = "json";

    // Default limit: 20 for --files/--json, 5 otherwise
    // --all means return all results (use very large limit)
    const defaultLimit = (format === "files" || format === "json") ? 20 : 5;
    const isAll = values.all || false;

    // Only a positive integer is accepted for -n. Zero, negative or non-numeric
    // input falls back to the default; a negative limit would otherwise make
    // slice(0, limit) drop results from the end of the list.
    const requestedLimit = values.n ? parseInt(String(values.n), 10) : NaN;
    const limit = requestedLimit > 0 ? requestedLimit : defaultLimit;

    const opts: OutputOptions = {
      format,
      full: values.full || false,
      limit: isAll ? 100000 : limit,
      minScore: values["min-score"] ? parseFloat(String(values["min-score"])) || 0 : 0,
      all: isAll,
    };

    return {
      command: positionals[0] || "",
      args: positionals.slice(1),
      query: positionals.slice(1).join(" "),
      opts,
      values,
    };
  }
  // Print the usage text to stdout: commands, global and search options, the
  // OLLAMA_URL environment variable, the default models, and the index path.
  function showHelp(): void {
    const fixedLines = [
      "Usage:",
      " qmd add [--drop] [glob] - Add/update collection from $PWD (default: **/*.md)",
      " qmd add-context <path> <text> - Add context description for files under path",
      " qmd get <file>[:line] [-l N] [--from N] - Get document (optionally from line, max N lines)",
      " qmd status - Show index status and collections",
      " qmd update-all - Re-index all collections",
      " qmd embed [-f] - Create vector embeddings (chunks ~6KB each)",
      " qmd cleanup - Remove cache and orphaned data, vacuum DB",
      " qmd search <query> - Full-text search (BM25)",
      " qmd vsearch <query> - Vector similarity search",
      " qmd query <query> - Combined search with query expansion + reranking",
      "",
      "Global options:",
      " --index <name> - Use custom index name (default: index)",
      "",
      "Search options:",
      " -n <num> - Number of results (default: 5, or 20 for --files)",
      " --all - Return all matches (use with --min-score to filter)",
      " --min-score <num> - Minimum similarity score",
      " --full - Output full document instead of snippet",
      " --files - Output score,filepath,context (default: 20 results)",
      " --json - JSON output with snippets (default: 20 results)",
      " --csv - CSV output with snippets",
      " --md - Markdown output",
      " --xml - XML output",
      "",
      "Environment:",
      " OLLAMA_URL - Ollama server URL (default: http://localhost:11434)",
      "",
      "Models:",
    ];
    for (const line of fixedLines) {
      console.log(line);
    }

    // Lines that depend on configuration are printed last, one call each.
    console.log(` Embedding: ${DEFAULT_EMBED_MODEL}`);
    console.log(` Reranking: ${DEFAULT_RERANK_MODEL}`);
    console.log("");
    console.log(`Index: ${getDbPath()}`);
  }
  // Main CLI: parse the command line and dispatch to the matching command.
  // With no command the help text is printed and the process exits with 1;
  // with --help it exits with 0. Unknown commands exit with 1.
  const cli = parseCLI();
  if (!cli.command || cli.values.help) {
    showHelp();
    process.exit(cli.values.help ? 0 : 1);
  }

  switch (cli.command) {
    case "add": {
      const globArg = cli.args[0];
      // Treat "." as "use default glob in current directory"
      const globPattern = (!globArg || globArg === ".") ? DEFAULT_GLOB : globArg;
      if (cli.values.drop) {
        await dropCollection(globPattern);
      } else {
        await indexFiles(globPattern);
      }
      break;
    }

    case "add-context": {
      // qmd add-context <path> <context> OR qmd add-context <context> (uses .)
      if (cli.args.length === 0) {
        console.error("Usage: qmd add-context <path> <context>");
        console.error(" qmd add-context . \"Description of files in current directory\"");
        process.exit(1);
      }
      let pathArg: string;
      let contextText: string;
      if (cli.args.length === 1) {
        // Single arg = context for current directory
        pathArg = ".";
        contextText = cli.args[0];
      } else {
        pathArg = cli.args[0];
        contextText = cli.args.slice(1).join(" ");
      }
      await addContext(pathArg, contextText);
      break;
    }

    case "get": {
      if (!cli.args[0]) {
        console.error("Usage: qmd get <filepath>[:line] [--from <line>] [-l <lines>]");
        process.exit(1);
      }
      const fromLine = cli.values.from ? parseInt(cli.values.from as string, 10) : undefined;
      const maxLines = cli.values.l ? parseInt(cli.values.l as string, 10) : undefined;
      getDocument(cli.args[0], fromLine, maxLines);
      break;
    }

    case "status":
      showStatus();
      break;

    case "update-all":
      await updateAllCollections();
      break;

    case "embed":
      await vectorIndex(DEFAULT_EMBED_MODEL, cli.values.force || false);
      break;

    case "search":
      if (!cli.query) {
        console.error("Usage: qmd search [options] <query>");
        process.exit(1);
      }
      search(cli.query, cli.opts);
      break;

    case "vsearch":
      if (!cli.query) {
        console.error("Usage: qmd vsearch [options] <query>");
        process.exit(1);
      }
      // Default min-score for vector search is 0.3
      if (!cli.values["min-score"]) {
        cli.opts.minScore = 0.3;
      }
      await vectorSearch(cli.query, cli.opts);
      break;

    case "query":
      if (!cli.query) {
        console.error("Usage: qmd query [options] <query>");
        process.exit(1);
      }
      await querySearch(cli.query, cli.opts);
      break;

    case "cleanup": {
      const db = getDb();
      // try/finally so the handle is closed even when one of the statements
      // below throws.
      try {
        // 1. Clear ollama_cache
        const cacheCount = db.prepare(`SELECT COUNT(*) as c FROM ollama_cache`).get() as { c: number };
        db.exec(`DELETE FROM ollama_cache`);
        console.log(`${c.green}✓${c.reset} Cleared ${cacheCount.c} cached API responses`);

        // 2. Remove orphaned vectors (no active document with that hash)
        const orphanedVecs = db.prepare(`
SELECT COUNT(*) as c FROM content_vectors cv
WHERE NOT EXISTS (
SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
)
`).get() as { c: number };
        if (orphanedVecs.c > 0) {
          // vectors_vec rows are addressed by "<hash>_<seq>"; delete them
          // before the content_vectors rows they are derived from.
          db.exec(`
DELETE FROM vectors_vec WHERE hash_seq IN (
SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
WHERE NOT EXISTS (
SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
)
)
`);
          db.exec(`
DELETE FROM content_vectors WHERE hash NOT IN (
SELECT hash FROM documents WHERE active = 1
)
`);
          console.log(`${c.green}✓${c.reset} Removed ${orphanedVecs.c} orphaned embedding chunks`);
        } else {
          console.log(`${c.dim}No orphaned embeddings to remove${c.reset}`);
        }

        // 3. Remove inactive documents
        const inactiveDocs = db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 0`).get() as { c: number };
        if (inactiveDocs.c > 0) {
          db.exec(`DELETE FROM documents WHERE active = 0`);
          console.log(`${c.green}✓${c.reset} Removed ${inactiveDocs.c} inactive document records`);
        }

        // 4. Vacuum to reclaim space
        db.exec(`VACUUM`);
        console.log(`${c.green}✓${c.reset} Database vacuumed`);
      } finally {
        db.close();
      }
      break;
    }

    default:
      console.error(`Unknown command: ${cli.command}`);
      console.error("Run 'qmd --help' for usage.");
      process.exit(1);
  }