/** * QMD Store - Core data access and retrieval functions * * This module provides all database operations, search functions, and document * retrieval for QMD. It returns raw data structures that can be formatted by * CLI or MCP consumers. * * Usage: * const store = createStore("/path/to/db.sqlite"); * // or use default path: * const store = createStore(); */ import { openDatabase, loadSqliteVec } from "./db.js"; import picomatch from "picomatch"; import { createHash } from "crypto"; import { readFileSync, realpathSync, statSync, mkdirSync } from "node:fs"; // Note: node:path resolve is not imported — we export our own cross-platform resolve() import fastGlob from "fast-glob"; import { LlamaCpp, getDefaultLlamaCpp, formatQueryForEmbedding, formatDocForEmbedding, withLLMSessionForLlm, } from "./llm.js"; import { assertModelCompatible, } from "./embedding/provider.js"; // ============================================================================= // Configuration // ============================================================================= const HOME = process.env.HOME || "/tmp"; export const DEFAULT_EMBED_MODEL = "embeddinggemma"; export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0"; export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B"; export const DEFAULT_GLOB = "**/*.md"; export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB export const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64; export const DEFAULT_EMBED_MAX_BATCH_BYTES = 64 * 1024 * 1024; // 64MB // Chunking: 900 tokens per chunk with 15% overlap // Increased from 800 to accommodate smart chunking finding natural break points export const CHUNK_SIZE_TOKENS = 900; export const CHUNK_OVERLAP_TOKENS = Math.floor(CHUNK_SIZE_TOKENS * 0.15); // 135 tokens (15% overlap) // Fallback char-based approximation for sync chunking (~4 chars per token) export const CHUNK_SIZE_CHARS = CHUNK_SIZE_TOKENS * 4; // 3600 chars export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4; // 540 chars // Search window for finding optimal break points (in tokens, ~200 tokens) export const CHUNK_WINDOW_TOKENS = 200; export const CHUNK_WINDOW_CHARS = CHUNK_WINDOW_TOKENS * 4; // 800 chars /** * Get the LlamaCpp instance for a store — prefers the store's own instance, * falls back to the global singleton. */ function getLlm(store) { return store.llm ?? getDefaultLlamaCpp(); } /** * Patterns for detecting break points in markdown documents. * Higher scores indicate better places to split. * Scores are spread wide so headings decisively beat lower-quality breaks. * Order matters for scoring - more specific patterns first. */ export const BREAK_PATTERNS = [ [/\n#{1}(?!#)/g, 100, 'h1'], // # but not ## [/\n#{2}(?!#)/g, 90, 'h2'], // ## but not ### [/\n#{3}(?!#)/g, 80, 'h3'], // ### but not #### [/\n#{4}(?!#)/g, 70, 'h4'], // #### but not ##### [/\n#{5}(?!#)/g, 60, 'h5'], // ##### but not ###### [/\n#{6}(?!#)/g, 50, 'h6'], // ###### [/\n```/g, 80, 'codeblock'], // code block boundary (same as h3) [/\n(?:---|\*\*\*|___)\s*\n/g, 60, 'hr'], // horizontal rule [/\n\n+/g, 20, 'blank'], // paragraph boundary [/\n[-*]\s/g, 5, 'list'], // unordered list item [/\n\d+\.\s/g, 5, 'numlist'], // ordered list item [/\n/g, 1, 'newline'], // minimal break ]; /** * Scan text for all potential break points. * Returns sorted array of break points with higher-scoring patterns taking precedence * when multiple patterns match the same position. */ export function scanBreakPoints(text) { const points = []; const seen = new Map(); // pos -> best break point at that pos for (const [pattern, score, type] of BREAK_PATTERNS) { for (const match of text.matchAll(pattern)) { const pos = match.index; const existing = seen.get(pos); // Keep higher score if position already seen if (!existing || score > existing.score) { const bp = { pos, score, type }; seen.set(pos, bp); } } } // Convert to array and sort by position for (const bp of seen.values()) { points.push(bp); } return points.sort((a, b) => a.pos - b.pos); } /** * Find all code fence regions in the text. * Code fences are delimited by ``` and we should never split inside them. */ export function findCodeFences(text) { const regions = []; const fencePattern = /\n```/g; let inFence = false; let fenceStart = 0; for (const match of text.matchAll(fencePattern)) { if (!inFence) { fenceStart = match.index; inFence = true; } else { regions.push({ start: fenceStart, end: match.index + match[0].length }); inFence = false; } } // Handle unclosed fence - extends to end of document if (inFence) { regions.push({ start: fenceStart, end: text.length }); } return regions; } /** * Check if a position is inside a code fence region. */ export function isInsideCodeFence(pos, fences) { return fences.some(f => pos > f.start && pos < f.end); } /** * Find the best cut position using scored break points with distance decay. * * Uses squared distance for gentler early decay - headings far back still win * over low-quality breaks near the target. * * @param breakPoints - Pre-scanned break points from scanBreakPoints() * @param targetCharPos - The ideal cut position (e.g., maxChars boundary) * @param windowChars - How far back to search for break points (default ~200 tokens) * @param decayFactor - How much to penalize distance (0.7 = 30% score at window edge) * @param codeFences - Code fence regions to avoid splitting inside * @returns The best position to cut at */ export function findBestCutoff(breakPoints, targetCharPos, windowChars = CHUNK_WINDOW_CHARS, decayFactor = 0.7, codeFences = []) { const windowStart = targetCharPos - windowChars; let bestScore = -1; let bestPos = targetCharPos; for (const bp of breakPoints) { if (bp.pos < windowStart) continue; if (bp.pos > targetCharPos) break; // sorted, so we can stop // Skip break points inside code fences if (isInsideCodeFence(bp.pos, codeFences)) continue; const distance = targetCharPos - bp.pos; // Squared distance decay: gentle early, steep late // At target: multiplier = 1.0 // At 25% back: multiplier = 0.956 // At 50% back: multiplier = 0.825 // At 75% back: multiplier = 0.606 // At window edge: multiplier = 0.3 const normalizedDist = distance / windowChars; const multiplier = 1.0 - (normalizedDist * normalizedDist) * decayFactor; const finalScore = bp.score * multiplier; if (finalScore > bestScore) { bestScore = finalScore; bestPos = bp.pos; } } return bestPos; } /** * Merge two sets of break points (e.g. regex + AST), keeping the highest * score at each position. Result is sorted by position. */ export function mergeBreakPoints(a, b) { const seen = new Map(); for (const bp of a) { const existing = seen.get(bp.pos); if (!existing || bp.score > existing.score) { seen.set(bp.pos, bp); } } for (const bp of b) { const existing = seen.get(bp.pos); if (!existing || bp.score > existing.score) { seen.set(bp.pos, bp); } } return Array.from(seen.values()).sort((a, b) => a.pos - b.pos); } /** * Core chunk algorithm that operates on precomputed break points and code fences. * This is the shared implementation used by both regex-only and AST-aware chunking. */ export function chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS) { if (content.length <= maxChars) { return [{ text: content, pos: 0 }]; } const chunks = []; let charPos = 0; while (charPos < content.length) { const targetEndPos = Math.min(charPos + maxChars, content.length); let endPos = targetEndPos; if (endPos < content.length) { const bestCutoff = findBestCutoff(breakPoints, targetEndPos, windowChars, 0.7, codeFences); if (bestCutoff > charPos && bestCutoff <= targetEndPos) { endPos = bestCutoff; } } if (endPos <= charPos) { endPos = Math.min(charPos + maxChars, content.length); } chunks.push({ text: content.slice(charPos, endPos), pos: charPos }); if (endPos >= content.length) { break; } charPos = endPos - overlapChars; const lastChunkPos = chunks.at(-1).pos; if (charPos <= lastChunkPos) { charPos = endPos; } } return chunks; } // Hybrid query: strong BM25 signal detection thresholds // Skip expensive LLM expansion when top result is strong AND clearly separated from runner-up export const STRONG_SIGNAL_MIN_SCORE = 0.85; export const STRONG_SIGNAL_MIN_GAP = 0.15; // Max candidates to pass to reranker — balances quality vs latency. // 40 keeps rank 31-40 visible to the reranker (matters for recall on broad queries). export const RERANK_CANDIDATE_LIMIT = 40; // ============================================================================= // Path utilities // ============================================================================= export function homedir() { return HOME; } /** * Check if a path is absolute. * Supports: * - Unix paths: /path/to/file * - Windows native: C:\path or C:/path * - Git Bash: /c/path or /C/path (C-Z drives, excluding A/B floppy drives) * * Note: /c without trailing slash is treated as Unix path (directory named "c"), * while /c/ or /c/path are treated as Git Bash paths (C: drive). */ export function isAbsolutePath(path) { if (!path) return false; // Unix absolute path if (path.startsWith('/')) { // Check if it's a Git Bash style path like /c/ or /c/Users (C-Z only, not A or B) // Requires path[2] === '/' to distinguish from Unix paths like /c or /cache // Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter if (!isWSL() && path.length >= 3 && path[2] === '/') { const driveLetter = path[1]; if (driveLetter && /[c-zC-Z]/.test(driveLetter)) { return true; } } // Any other path starting with / is Unix absolute return true; } // Windows native path: C:\ or C:/ (any letter A-Z) if (path.length >= 2 && /[a-zA-Z]/.test(path[0]) && path[1] === ':') { return true; } return false; } /** * Normalize path separators to forward slashes. * Converts Windows backslashes to forward slashes. */ export function normalizePathSeparators(path) { return path.replace(/\\/g, '/'); } /** * Detect if running inside WSL (Windows Subsystem for Linux). * On WSL, paths like /c/work/... are valid drvfs mount points, not Git Bash paths. */ function isWSL() { return !!(process.env.WSL_DISTRO_NAME || process.env.WSL_INTEROP); } /** * Get the relative path from a prefix. * Returns null if path is not under prefix. * Returns empty string if path equals prefix. */ export function getRelativePathFromPrefix(path, prefix) { // Empty prefix is invalid if (!prefix) { return null; } const normalizedPath = normalizePathSeparators(path); const normalizedPrefix = normalizePathSeparators(prefix); // Ensure prefix ends with / for proper matching const prefixWithSlash = !normalizedPrefix.endsWith('/') ? normalizedPrefix + '/' : normalizedPrefix; // Exact match if (normalizedPath === normalizedPrefix) { return ''; } // Check if path starts with prefix if (normalizedPath.startsWith(prefixWithSlash)) { return normalizedPath.slice(prefixWithSlash.length); } return null; } export function resolve(...paths) { if (paths.length === 0) { throw new Error("resolve: at least one path segment is required"); } // Normalize all paths to use forward slashes const normalizedPaths = paths.map(normalizePathSeparators); let result = ''; let windowsDrive = ''; // Check if first path is absolute const firstPath = normalizedPaths[0]; if (isAbsolutePath(firstPath)) { result = firstPath; // Extract Windows drive letter if present if (firstPath.length >= 2 && /[a-zA-Z]/.test(firstPath[0]) && firstPath[1] === ':') { windowsDrive = firstPath.slice(0, 2); result = firstPath.slice(2); } else if (!isWSL() && firstPath.startsWith('/') && firstPath.length >= 3 && firstPath[2] === '/') { // Git Bash style: /c/ -> C: (C-Z drives only, not A or B) // Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter const driveLetter = firstPath[1]; if (driveLetter && /[c-zC-Z]/.test(driveLetter)) { windowsDrive = driveLetter.toUpperCase() + ':'; result = firstPath.slice(2); } } } else { // Start with PWD or cwd, then append the first relative path const pwd = normalizePathSeparators(process.env.PWD || process.cwd()); // Extract Windows drive from PWD if present if (pwd.length >= 2 && /[a-zA-Z]/.test(pwd[0]) && pwd[1] === ':') { windowsDrive = pwd.slice(0, 2); result = pwd.slice(2) + '/' + firstPath; } else { result = pwd + '/' + firstPath; } } // Process remaining paths for (let i = 1; i < normalizedPaths.length; i++) { const p = normalizedPaths[i]; if (isAbsolutePath(p)) { // Absolute path replaces everything result = p; // Update Windows drive if present if (p.length >= 2 && /[a-zA-Z]/.test(p[0]) && p[1] === ':') { windowsDrive = p.slice(0, 2); result = p.slice(2); } else if (!isWSL() && p.startsWith('/') && p.length >= 3 && p[2] === '/') { // Git Bash style (C-Z drives only, not A or B) // Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter const driveLetter = p[1]; if (driveLetter && /[c-zC-Z]/.test(driveLetter)) { windowsDrive = driveLetter.toUpperCase() + ':'; result = p.slice(2); } else { windowsDrive = ''; } } else { windowsDrive = ''; } } else { // Relative path - append result = result + '/' + p; } } // Normalize . and .. components const parts = result.split('/').filter(Boolean); const normalized = []; for (const part of parts) { if (part === '..') { normalized.pop(); } else if (part !== '.') { normalized.push(part); } } // Build final path const finalPath = '/' + normalized.join('/'); // Prepend Windows drive if present if (windowsDrive) { return windowsDrive + finalPath; } return finalPath; } // Flag to indicate production mode (set by qmd.ts at startup) let _productionMode = false; export function enableProductionMode() { _productionMode = true; } /** Reset production mode flag — only for testing. */ export function _resetProductionModeForTesting() { _productionMode = false; } export function getDefaultDbPath(indexName = "index") { // Always allow override via INDEX_PATH (for testing) if (process.env.INDEX_PATH) { return process.env.INDEX_PATH; } // In non-production mode (tests), require explicit path if (!_productionMode) { throw new Error("Database path not set. Tests must set INDEX_PATH env var or use createStore() with explicit path. " + "This prevents tests from accidentally writing to the global index."); } const cacheDir = process.env.XDG_CACHE_HOME || resolve(homedir(), ".cache"); const qmdCacheDir = resolve(cacheDir, "qmd"); try { mkdirSync(qmdCacheDir, { recursive: true }); } catch { } return resolve(qmdCacheDir, `${indexName}.sqlite`); } export function getPwd() { return process.env.PWD || process.cwd(); } export function getRealPath(path) { try { return realpathSync(path); } catch { return resolve(path); } } /** * Normalize explicit virtual path formats to standard qmd:// format. * Only handles paths that are already explicitly virtual: * - qmd://collection/path.md (already normalized) * - qmd:////collection/path.md (extra slashes - normalize) * - //collection/path.md (missing qmd: prefix - add it) * * Does NOT handle: * - collection/path.md (bare paths - could be filesystem relative) * - :linenum suffix (should be parsed separately before calling this) */ export function normalizeVirtualPath(input) { let path = input.trim(); // Handle qmd:// with extra slashes: qmd:////collection/path -> qmd://collection/path if (path.startsWith('qmd:')) { // Remove qmd: prefix and normalize slashes path = path.slice(4); // Remove leading slashes and re-add exactly two path = path.replace(/^\/+/, ''); return `qmd://${path}`; } // Handle //collection/path (missing qmd: prefix) if (path.startsWith('//')) { path = path.replace(/^\/+/, ''); return `qmd://${path}`; } // Return as-is for other cases (filesystem paths, docids, bare collection/path, etc.) return path; } /** * Parse a virtual path like "qmd://collection-name/path/to/file.md" * into its components. * Also supports collection root: "qmd://collection-name/" or "qmd://collection-name" */ export function parseVirtualPath(virtualPath) { // Normalize the path first const normalized = normalizeVirtualPath(virtualPath); // Match: qmd://collection-name[/optional-path] // Allows: qmd://name, qmd://name/, qmd://name/path const match = normalized.match(/^qmd:\/\/([^\/]+)\/?(.*)$/); if (!match?.[1]) return null; return { collectionName: match[1], path: match[2] ?? '', // Empty string for collection root }; } /** * Build a virtual path from collection name and relative path. */ export function buildVirtualPath(collectionName, path) { return `qmd://${collectionName}/${path}`; } /** * Check if a path is explicitly a virtual path. * Only recognizes explicit virtual path formats: * - qmd://collection/path.md * - //collection/path.md * * Does NOT consider bare collection/path.md as virtual - that should be * handled separately by checking if the first component is a collection name. */ export function isVirtualPath(path) { const trimmed = path.trim(); // Explicit qmd:// prefix (with any number of slashes) if (trimmed.startsWith('qmd:')) return true; // //collection/path format (missing qmd: prefix) if (trimmed.startsWith('//')) return true; return false; } /** * Resolve a virtual path to absolute filesystem path. */ export function resolveVirtualPath(db, virtualPath) { const parsed = parseVirtualPath(virtualPath); if (!parsed) return null; const coll = getCollectionByName(db, parsed.collectionName); if (!coll) return null; return resolve(coll.pwd, parsed.path); } /** * Convert an absolute filesystem path to a virtual path. * Returns null if the file is not in any indexed collection. */ export function toVirtualPath(db, absolutePath) { // Get all collections from DB const collections = getStoreCollections(db); // Find which collection this absolute path belongs to for (const coll of collections) { if (absolutePath.startsWith(coll.path + '/') || absolutePath === coll.path) { // Extract relative path const relativePath = absolutePath.startsWith(coll.path + '/') ? absolutePath.slice(coll.path.length + 1) : ''; // Verify this document exists in the database const doc = db.prepare(` SELECT d.path FROM documents d WHERE d.collection = ? AND d.path = ? AND d.active = 1 LIMIT 1 `).get(coll.name, relativePath); if (doc) { return buildVirtualPath(coll.name, relativePath); } } } return null; } // ============================================================================= // Database initialization // ============================================================================= function createSqliteVecUnavailableError(reason) { return new Error("sqlite-vec extension is unavailable. " + `${reason}. ` + "Install Homebrew SQLite so the sqlite-vec extension can be loaded, " + "and set BREW_PREFIX if Homebrew is installed in a non-standard location."); } function getErrorMessage(err) { return err instanceof Error ? err.message : String(err); } export function verifySqliteVecLoaded(db) { try { const row = db.prepare(`SELECT vec_version() AS version`).get(); if (!row?.version || typeof row.version !== "string") { throw new Error("vec_version() returned no version"); } } catch (err) { const message = getErrorMessage(err); throw createSqliteVecUnavailableError(`sqlite-vec probe failed (${message})`); } } let _sqliteVecAvailable = null; function initializeDatabase(db) { try { loadSqliteVec(db); verifySqliteVecLoaded(db); _sqliteVecAvailable = true; } catch (err) { // sqlite-vec is optional — vector search won't work but FTS is fine _sqliteVecAvailable = false; console.warn(getErrorMessage(err)); } db.exec("PRAGMA journal_mode = WAL"); db.exec("PRAGMA foreign_keys = ON"); // Drop legacy tables that are now managed in YAML db.exec(`DROP TABLE IF EXISTS path_contexts`); db.exec(`DROP TABLE IF EXISTS collections`); // Content-addressable storage - the source of truth for document content db.exec(` CREATE TABLE IF NOT EXISTS content ( hash TEXT PRIMARY KEY, doc TEXT NOT NULL, created_at TEXT NOT NULL ) `); // Documents table - file system layer mapping virtual paths to content hashes // Collections are now managed in ~/.config/qmd/index.yml db.exec(` CREATE TABLE IF NOT EXISTS documents ( id INTEGER PRIMARY KEY AUTOINCREMENT, collection TEXT NOT NULL, path TEXT NOT NULL, title TEXT NOT NULL, hash TEXT NOT NULL, created_at TEXT NOT NULL, modified_at TEXT NOT NULL, active INTEGER NOT NULL DEFAULT 1, FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE, UNIQUE(collection, path) ) `); db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents(collection, active)`); db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(hash)`); db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_path ON documents(path, active)`); // Cache table for LLM API calls db.exec(` CREATE TABLE IF NOT EXISTS llm_cache ( hash TEXT PRIMARY KEY, result TEXT NOT NULL, created_at TEXT NOT NULL ) `); // Content vectors const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all(); const hasSeqColumn = cvInfo.some(col => col.name === 'seq'); if (cvInfo.length > 0 && !hasSeqColumn) { db.exec(`DROP TABLE IF EXISTS content_vectors`); db.exec(`DROP TABLE IF EXISTS vectors_vec`); } db.exec(` CREATE TABLE IF NOT EXISTS content_vectors ( hash TEXT NOT NULL, seq INTEGER NOT NULL DEFAULT 0, pos INTEGER NOT NULL DEFAULT 0, model TEXT NOT NULL, embedded_at TEXT NOT NULL, PRIMARY KEY (hash, seq) ) `); // Store collections — makes the DB self-contained (no external config needed) db.exec(` CREATE TABLE IF NOT EXISTS store_collections ( name TEXT PRIMARY KEY, path TEXT NOT NULL, pattern TEXT NOT NULL DEFAULT '**/*.md', ignore_patterns TEXT, include_by_default INTEGER DEFAULT 1, update_command TEXT, context TEXT ) `); // Store config — key-value metadata (e.g. config_hash for sync optimization) db.exec(` CREATE TABLE IF NOT EXISTS store_config ( key TEXT PRIMARY KEY, value TEXT ) `); // FTS - index filepath (collection/path), title, and content db.exec(` CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5( filepath, title, body, tokenize='porter unicode61' ) `); // Triggers to keep FTS in sync db.exec(` CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents WHEN new.active = 1 BEGIN INSERT INTO documents_fts(rowid, filepath, title, body) SELECT new.id, new.collection || '/' || new.path, new.title, (SELECT doc FROM content WHERE hash = new.hash) WHERE new.active = 1; END `); db.exec(` CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN DELETE FROM documents_fts WHERE rowid = old.id; END `); db.exec(` CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents BEGIN -- Delete from FTS if no longer active DELETE FROM documents_fts WHERE rowid = old.id AND new.active = 0; -- Update FTS if still/newly active INSERT OR REPLACE INTO documents_fts(rowid, filepath, title, body) SELECT new.id, new.collection || '/' || new.path, new.title, (SELECT doc FROM content WHERE hash = new.hash) WHERE new.active = 1; END `); } function rowToNamedCollection(row) { return { name: row.name, path: row.path, pattern: row.pattern, ...(row.ignore_patterns ? { ignore: JSON.parse(row.ignore_patterns) } : {}), ...(row.include_by_default === 0 ? { includeByDefault: false } : {}), ...(row.update_command ? { update: row.update_command } : {}), ...(row.context ? { context: JSON.parse(row.context) } : {}), }; } export function getStoreCollections(db) { const rows = db.prepare(`SELECT * FROM store_collections`).all(); return rows.map(rowToNamedCollection); } export function getStoreCollection(db, name) { const row = db.prepare(`SELECT * FROM store_collections WHERE name = ?`).get(name); if (row == null) return null; return rowToNamedCollection(row); } export function getStoreGlobalContext(db) { const row = db.prepare(`SELECT value FROM store_config WHERE key = 'global_context'`).get(); if (row == null) return undefined; return row.value || undefined; } export function getStoreContexts(db) { const results = []; // Global context const globalCtx = getStoreGlobalContext(db); if (globalCtx) { results.push({ collection: "*", path: "/", context: globalCtx }); } // Collection contexts const rows = db.prepare(`SELECT name, context FROM store_collections WHERE context IS NOT NULL`).all(); for (const row of rows) { const ctxMap = JSON.parse(row.context); for (const [path, context] of Object.entries(ctxMap)) { results.push({ collection: row.name, path, context }); } } return results; } export function upsertStoreCollection(db, name, collection) { db.prepare(` INSERT INTO store_collections (name, path, pattern, ignore_patterns, include_by_default, update_command, context) VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(name) DO UPDATE SET path = excluded.path, pattern = excluded.pattern, ignore_patterns = excluded.ignore_patterns, include_by_default = excluded.include_by_default, update_command = excluded.update_command, context = excluded.context `).run(name, collection.path, collection.pattern || '**/*.md', collection.ignore ? JSON.stringify(collection.ignore) : null, collection.includeByDefault === false ? 0 : 1, collection.update || null, collection.context ? JSON.stringify(collection.context) : null); } export function deleteStoreCollection(db, name) { const result = db.prepare(`DELETE FROM store_collections WHERE name = ?`).run(name); return result.changes > 0; } export function renameStoreCollection(db, oldName, newName) { // Check target doesn't exist const existing = db.prepare(`SELECT name FROM store_collections WHERE name = ?`).get(newName); if (existing != null) { throw new Error(`Collection '${newName}' already exists`); } const result = db.prepare(`UPDATE store_collections SET name = ? WHERE name = ?`).run(newName, oldName); return result.changes > 0; } export function updateStoreContext(db, collectionName, path, text) { const row = db.prepare(`SELECT context FROM store_collections WHERE name = ?`).get(collectionName); if (row == null) return false; const ctxMap = row.context ? JSON.parse(row.context) : {}; ctxMap[path] = text; db.prepare(`UPDATE store_collections SET context = ? WHERE name = ?`).run(JSON.stringify(ctxMap), collectionName); return true; } export function removeStoreContext(db, collectionName, path) { const row = db.prepare(`SELECT context FROM store_collections WHERE name = ?`).get(collectionName); if (row == null) return false; if (!row.context) return false; const ctxMap = JSON.parse(row.context); if (!(path in ctxMap)) return false; delete ctxMap[path]; const newCtx = Object.keys(ctxMap).length > 0 ? JSON.stringify(ctxMap) : null; db.prepare(`UPDATE store_collections SET context = ? WHERE name = ?`).run(newCtx, collectionName); return true; } export function setStoreGlobalContext(db, value) { if (value === undefined) { db.prepare(`DELETE FROM store_config WHERE key = 'global_context'`).run(); } else { db.prepare(`INSERT INTO store_config (key, value) VALUES ('global_context', ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value`).run(value); } } /** * Sync external config (YAML/inline) into SQLite store_collections. * External config always wins. Skips sync if config hash hasn't changed. */ export function syncConfigToDb(db, config) { // Check config hash — skip sync if unchanged const configJson = JSON.stringify(config); const hash = createHash('sha256').update(configJson).digest('hex'); const existingHash = db.prepare(`SELECT value FROM store_config WHERE key = 'config_hash'`).get(); if (existingHash != null && existingHash.value === hash) { return; // Config unchanged, skip sync } // Sync collections const configNames = new Set(Object.keys(config.collections)); for (const [name, coll] of Object.entries(config.collections)) { upsertStoreCollection(db, name, coll); } // Delete collections not in config const dbCollections = db.prepare(`SELECT name FROM store_collections`).all(); for (const row of dbCollections) { if (!configNames.has(row.name)) { db.prepare(`DELETE FROM store_collections WHERE name = ?`).run(row.name); } } // Sync global context if (config.global_context !== undefined) { setStoreGlobalContext(db, config.global_context); } else { setStoreGlobalContext(db, undefined); } // Save config hash db.prepare(`INSERT INTO store_config (key, value) VALUES ('config_hash', ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value`).run(hash); } export function isSqliteVecAvailable() { return _sqliteVecAvailable === true; } function ensureVecTableInternal(db, dimensions) { if (!_sqliteVecAvailable) { throw new Error("sqlite-vec is not available. Vector operations require a SQLite build with extension loading support."); } const tableInfo = db.prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get(); if (tableInfo) { const match = tableInfo.sql.match(/float\[(\d+)\]/); const hasHashSeq = tableInfo.sql.includes('hash_seq'); const hasCosine = tableInfo.sql.includes('distance_metric=cosine'); const existingDims = match?.[1] ? parseInt(match[1], 10) : null; if (existingDims === dimensions && hasHashSeq && hasCosine) return; if (existingDims !== null && existingDims !== dimensions) { throw new Error(`Embedding dimension mismatch: existing vectors are ${existingDims}d but the current model produces ${dimensions}d. ` + `Run 'qmd embed -f' to re-embed with the new model.`); } db.exec("DROP TABLE IF EXISTS vectors_vec"); } db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}] distance_metric=cosine)`); } /** * Re-index a single collection by scanning the filesystem and updating the database. * Pure function — no console output, no db lifecycle management. */ export async function reindexCollection(store, collectionPath, globPattern, collectionName, options) { const db = store.db; const now = new Date().toISOString(); const excludeDirs = ["node_modules", ".git", ".cache", "vendor", "dist", "build"]; const allIgnore = [ ...excludeDirs.map(d => `**/${d}/**`), ...(options?.ignorePatterns || []), ]; const allFiles = await fastGlob(globPattern, { cwd: collectionPath, onlyFiles: true, followSymbolicLinks: false, dot: false, ignore: allIgnore, }); // Filter hidden files/folders const files = allFiles.filter(file => { const parts = file.split("/"); return !parts.some(part => part.startsWith(".")); }); const total = files.length; let indexed = 0, updated = 0, unchanged = 0, processed = 0; const seenPaths = new Set(); for (const relativeFile of files) { const filepath = getRealPath(resolve(collectionPath, relativeFile)); const path = handelize(relativeFile); seenPaths.add(path); let content; try { content = readFileSync(filepath, "utf-8"); } catch { processed++; options?.onProgress?.({ file: relativeFile, current: processed, total }); continue; } if (!content.trim()) { processed++; continue; } const hash = await hashContent(content); const title = extractTitle(content, relativeFile); const existing = findActiveDocument(db, collectionName, path); if (existing) { if (existing.hash === hash) { if (existing.title !== title) { updateDocumentTitle(db, existing.id, title, now); updated++; } else { unchanged++; } } else { insertContent(db, hash, content, now); const stat = statSync(filepath); updateDocument(db, existing.id, title, hash, stat ? new Date(stat.mtime).toISOString() : now); updated++; } } else { indexed++; insertContent(db, hash, content, now); const stat = statSync(filepath); insertDocument(db, collectionName, path, title, hash, stat ? new Date(stat.birthtime).toISOString() : now, stat ? new Date(stat.mtime).toISOString() : now); } processed++; options?.onProgress?.({ file: relativeFile, current: processed, total }); } // Deactivate documents that no longer exist const allActive = getActiveDocumentPaths(db, collectionName); let removed = 0; for (const path of allActive) { if (!seenPaths.has(path)) { deactivateDocument(db, collectionName, path); removed++; } } const orphanedCleaned = cleanupOrphanedContent(db); return { indexed, updated, unchanged, removed, orphanedCleaned }; } function validatePositiveIntegerOption(name, value, fallback) { if (value === undefined) return fallback; if (!Number.isInteger(value) || value < 1) { throw new Error(`${name} must be a positive integer`); } return value; } function resolveEmbedOptions(options) { return { maxDocsPerBatch: validatePositiveIntegerOption("maxDocsPerBatch", options?.maxDocsPerBatch, DEFAULT_EMBED_MAX_DOCS_PER_BATCH), maxBatchBytes: validatePositiveIntegerOption("maxBatchBytes", options?.maxBatchBytes, DEFAULT_EMBED_MAX_BATCH_BYTES), }; } function getPendingEmbeddingDocs(db) { // `MIN(d.collection)` deterministically picks one collection per hash when // the same content is indexed in multiple collections (SQLite tie-breaks // alphabetically). The identical bytes produce identical chunks regardless // of which collection wins; the chunkStrategy lookup still resolves via // that collection's YAML config. See Phase 2 design notes (i-bud0h8vu). return db.prepare(` SELECT d.hash, MIN(d.path) as path, MIN(d.collection) as collection, length(CAST(c.doc AS BLOB)) as bytes FROM documents d JOIN content c ON d.hash = c.hash LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0 WHERE d.active = 1 AND v.hash IS NULL GROUP BY d.hash ORDER BY MIN(d.path) `).all(); } function buildEmbeddingBatches(docs, maxDocsPerBatch, maxBatchBytes) { const batches = []; let currentBatch = []; let currentBytes = 0; for (const doc of docs) { const docBytes = Math.max(0, doc.bytes); const wouldExceedDocs = currentBatch.length >= maxDocsPerBatch; const wouldExceedBytes = currentBatch.length > 0 && (currentBytes + docBytes) > maxBatchBytes; if (wouldExceedDocs || wouldExceedBytes) { batches.push(currentBatch); currentBatch = []; currentBytes = 0; } currentBatch.push(doc); currentBytes += docBytes; } if (currentBatch.length > 0) { batches.push(currentBatch); } return batches; } function getEmbeddingDocsForBatch(db, batch) { if (batch.length === 0) return []; const placeholders = batch.map(() => "?").join(","); const rows = db.prepare(` SELECT hash, doc as body FROM content WHERE hash IN (${placeholders}) `).all(...batch.map(doc => doc.hash)); const bodyByHash = new Map(rows.map(row => [row.hash, row.body])); return batch.map((doc) => ({ ...doc, body: bodyByHash.get(doc.hash) ?? "", })); } /** * Run `body` with a session-shaped argument that supplies an AbortSignal + * isValid flag. When `provider` is supplied, the session is a lightweight * AbortController-backed stub — `getLlm(store)` is never called and * `withLLMSessionForLlm` is bypassed entirely, so node-llama-cpp is not * warmed up on remote-only deployments (i-08ovbvtb, follow-up to i-qkarfffa). * * When `provider` is undefined, behavior is unchanged: a real `LLMSession` * is created via `withLLMSessionForLlm(getLlm(store), ...)` so that the * body can use `session.embed`/`session.embedBatch` for the local path. * * The fake session's LLM-only methods (embed/embedBatch/expandQuery/rerank) * throw if called — they MUST NOT be reached when `provider` is set, since * the embed path is supposed to route through the provider instead. */ async function withEmbedSession(store, provider, body, options) { if (provider) { const ac = new AbortController(); const fakeSession = { get signal() { return ac.signal; }, get isValid() { return !ac.signal.aborted; }, embed: async () => { throw new Error("withEmbedSession: provider supplied — session.embed must not be called"); }, embedBatch: async () => { throw new Error("withEmbedSession: provider supplied — session.embedBatch must not be called"); }, expandQuery: async () => { throw new Error("withEmbedSession: provider supplied — session.expandQuery must not be called"); }, rerank: async () => { throw new Error("withEmbedSession: provider supplied — session.rerank must not be called"); }, }; try { return await body(fakeSession); } finally { ac.abort(); } } return withLLMSessionForLlm(getLlm(store), body, options); } /** * Generate vector embeddings for documents that need them. * Pure function — no console output, no db lifecycle management. * Uses the store's LlamaCpp instance if set, otherwise the global singleton. */ export async function generateEmbeddings(store, options) { const db = store.db; const model = options?.model ?? DEFAULT_EMBED_MODEL; const now = new Date().toISOString(); const { maxDocsPerBatch, maxBatchBytes } = resolveEmbedOptions(options); const encoder = new TextEncoder(); // Migration safety: if an embedProvider is supplied, verify its model id // matches the existing content_vectors rows (unless we're about to clear // them via `force`). This must happen BEFORE we clear vectors so users // who pass `--force` aren't blocked. if (options?.embedProvider && !options.force) { const existing = getDistinctEmbeddingModels(db); assertModelCompatible(options.embedProvider.getModelId(), existing); } if (options?.force) { clearAllEmbeddings(db); } const docsToEmbed = getPendingEmbeddingDocs(db); if (docsToEmbed.length === 0) { return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 }; } const totalBytes = docsToEmbed.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0); const totalDocs = docsToEmbed.length; const startTime = Date.now(); // Per-collection chunkStrategy lookup (Phase 2 — i-bud0h8vu). YAML // `chunkStrategy` on a collection wins over `options.chunkStrategy` // (global CLI flag); falls back to the global option, then to // chunkDocumentByTokens' own "regex" default when neither is set. // Opt-in per collection — collections without the field are untouched. const collectionStrategies = new Map(); try { const { listCollections: listYamlCollections } = await import("./collections.js"); for (const c of listYamlCollections()) { if (c.chunkStrategy) collectionStrategies.set(c.name, c.chunkStrategy); } } catch { // If YAML config is missing/unreadable, fall back silently to the // global strategy — no collection overrides. Keeps SDK/inline // callers that never touch ~/.config/qmd working. } // Provider routing — when an EmbeddingProvider is supplied, embed calls go // through it (HTTP, GPU worker, etc.). Otherwise, use the LLM session path. // The outer session is still created for its abort signal (chunking uses // `session.signal` for cooperative cancellation). const provider = options?.embedProvider; const providerModel = provider?.getModelId() ?? model; // Resolve `embedModelUri` (used for formatting prefixes etc.) lazily — // when `provider` is set, take it from the provider; otherwise fall back // to the local LlamaCpp's embed model name. Accessing `getLlm(store)` is // deferred to the non-provider branch so remote-only deployments do not // construct a `LlamaCpp` instance just to read its embedModelName. const embedModelUri = provider ? provider.getModelId() : getLlm(store).embedModelName; // Run the embedding loop inside a session-scoped wrapper. When `provider` // is set, this short-circuits the local LLM warm-up entirely (i-08ovbvtb). const result = await withEmbedSession(store, provider, async (session) => { let chunksEmbedded = 0; let errors = 0; let bytesProcessed = 0; let totalChunks = 0; let vectorTableInitialized = false; const BATCH_SIZE = 32; const batches = buildEmbeddingBatches(docsToEmbed, maxDocsPerBatch, maxBatchBytes); // Embedding helpers — single point of provider/session selection. // Both return the same shape as ILLMSession.embed/embedBatch so the // rest of the loop is unchanged. const embedOne = async (text, modelArg) => { if (provider) { const sig = provider.kind === 'local' ? session.signal : undefined; const r = await provider.embed(text, { model: modelArg, signal: sig }); return r ? { embedding: r.embedding, model: r.model } : null; } return session.embed(text, { model: modelArg }); }; const embedMany = async (texts, modelArg) => { if (provider) { const sig = provider.kind === 'local' ? session.signal : undefined; const r = await provider.embedBatch(texts, { model: modelArg, signal: sig }); return r.map((x) => (x ? { embedding: x.embedding, model: x.model } : null)); } return session.embedBatch(texts, { model: modelArg }); }; // JS-only token estimator for the provider path. Char-based with // avgCharsPerToken=3 — matches the heuristic the chunker already // uses for its initial char-space pass, so the safety re-split is a // near no-op while populating the `tokens` field with a stable // estimate. CRITICAL: avoids loading node-llama-cpp on remote-only // deployments (`QMD_EMBED_ENDPOINT=...`). i-1rqixh6m DoD #1. const chunkTokenizer = provider ? (text) => Math.ceil(text.length / 3) : undefined; for (const batchMeta of batches) { // Abort early if session has been invalidated if (!session.isValid) { console.warn(`⚠ Session expired — skipping remaining document batches`); break; } const batchDocs = getEmbeddingDocsForBatch(db, batchMeta); const batchChunks = []; const batchBytes = batchMeta.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0); for (const doc of batchDocs) { if (!doc.body.trim()) continue; const title = extractTitle(doc.body, doc.path); const perCollectionStrategy = collectionStrategies.get(doc.collection); const chunkStrategy = perCollectionStrategy ?? options?.chunkStrategy; const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, chunkStrategy, session.signal, chunkTokenizer); for (let seq = 0; seq < chunks.length; seq++) { batchChunks.push({ hash: doc.hash, title, text: chunks[seq].text, seq, pos: chunks[seq].pos, tokens: chunks[seq].tokens, bytes: encoder.encode(chunks[seq].text).length, }); } } totalChunks += batchChunks.length; if (batchChunks.length === 0) { bytesProcessed += batchBytes; options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors }); continue; } if (!vectorTableInitialized) { const firstChunk = batchChunks[0]; const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title, embedModelUri); // Single retry on transient failure (issue i-vm1lxwry). The provider // swallows per-chunk errors per its contract — `getLastError?.()` // surfaces the actual cause (HTTP status / abort / parse error) so we // can include it in the thrown message instead of the cryptic // "Failed to get embedding dimensions from first chunk". let firstResult = await embedOne(firstText, providerModel); if (!firstResult && session.isValid) { const firstErr = provider?.getLastError?.(); // Brief backoff before retry — embedding worker may be re-warming // a model or the GPU host may be transiently busy. 250ms is short // enough to be invisible on the happy path and long enough to // clear most "thundering-herd" race conditions. await new Promise((resolve) => setTimeout(resolve, 250)); if (process.env.QMD_EMBED_DEBUG) { process.stderr.write(`qmd embed: first-chunk dimension probe failed, retrying once${firstErr ? ` (last error: ${firstErr})` : ""}\n`); } firstResult = await embedOne(firstText, providerModel); } if (!firstResult) { const lastErr = provider?.getLastError?.(); const providerHint = provider ? `provider=${provider.kind}` : "provider=session"; const errSuffix = lastErr ? ` — underlying: ${lastErr}` : ""; const debugHint = process.env.QMD_EMBED_DEBUG ? "" : " (set QMD_EMBED_DEBUG=1 for per-chunk traces)"; throw new Error(`Failed to get embedding dimensions from first chunk after retry [${providerHint}]${errSuffix}${debugHint}`); } store.ensureVecTable(firstResult.embedding.length); vectorTableInitialized = true; } const totalBatchChunkBytes = batchChunks.reduce((sum, chunk) => sum + chunk.bytes, 0); let batchChunkBytesProcessed = 0; for (let batchStart = 0; batchStart < batchChunks.length; batchStart += BATCH_SIZE) { // Abort early if session has been invalidated (e.g. max duration exceeded) if (!session.isValid) { const remaining = batchChunks.length - batchStart; errors += remaining; console.warn(`⚠ Session expired — skipping ${remaining} remaining chunks`); break; } // Abort early if error rate is too high (>80% of processed chunks failed) const processed = chunksEmbedded + errors; if (processed >= BATCH_SIZE && errors > processed * 0.8) { const remaining = batchChunks.length - batchStart; errors += remaining; console.warn(`⚠ Error rate too high (${errors}/${processed}) — aborting embedding`); break; } const batchEnd = Math.min(batchStart + BATCH_SIZE, batchChunks.length); const chunkBatch = batchChunks.slice(batchStart, batchEnd); const texts = chunkBatch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title, embedModelUri)); try { const embeddings = await embedMany(texts, providerModel); for (let i = 0; i < chunkBatch.length; i++) { const chunk = chunkBatch[i]; const embedding = embeddings[i]; if (embedding) { insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), providerModel, now); chunksEmbedded++; } else { errors++; } batchChunkBytesProcessed += chunk.bytes; } } catch { // Batch failed — try individual embeddings as fallback // But skip if session is already invalid (avoids N doomed retries) if (!session.isValid) { errors += chunkBatch.length; batchChunkBytesProcessed += chunkBatch.reduce((sum, c) => sum + c.bytes, 0); } else { for (const chunk of chunkBatch) { try { const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri); const result = await embedOne(text, providerModel); if (result) { insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), providerModel, now); chunksEmbedded++; } else { errors++; } } catch { errors++; } batchChunkBytesProcessed += chunk.bytes; } } } const proportionalBytes = totalBatchChunkBytes === 0 ? batchBytes : Math.min(batchBytes, Math.round((batchChunkBytesProcessed / totalBatchChunkBytes) * batchBytes)); options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed: bytesProcessed + proportionalBytes, totalBytes, errors, }); } bytesProcessed += batchBytes; options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors }); } return { chunksEmbedded, errors }; }, { maxDuration: 30 * 60 * 1000, name: 'generateEmbeddings' }); return { docsProcessed: totalDocs, chunksEmbedded: result.chunksEmbedded, errors: result.errors, durationMs: Date.now() - startTime, }; } /** * Create a new store instance with the given database path. * If no path is provided, uses the default path (~/.cache/qmd/index.sqlite). * * @param dbPath - Path to the SQLite database file * @returns Store instance with all methods bound to the database */ export function createStore(dbPath) { const resolvedPath = dbPath || getDefaultDbPath(); const db = openDatabase(resolvedPath); initializeDatabase(db); const store = { db, dbPath: resolvedPath, close: () => db.close(), ensureVecTable: (dimensions) => ensureVecTableInternal(db, dimensions), // Index health getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db), getIndexHealth: () => getIndexHealth(db), getStatus: () => getStatus(db), // Caching getCacheKey, getCachedResult: (cacheKey) => getCachedResult(db, cacheKey), setCachedResult: (cacheKey, result) => setCachedResult(db, cacheKey, result), clearCache: () => clearCache(db), // Cleanup and maintenance deleteLLMCache: () => deleteLLMCache(db), deleteInactiveDocuments: () => deleteInactiveDocuments(db), cleanupOrphanedContent: () => cleanupOrphanedContent(db), cleanupOrphanedVectors: () => cleanupOrphanedVectors(db), vacuumDatabase: () => vacuumDatabase(db), // Context getContextForFile: (filepath) => getContextForFile(db, filepath), getContextForPath: (collectionName, path) => getContextForPath(db, collectionName, path), getCollectionByName: (name) => getCollectionByName(db, name), getCollectionsWithoutContext: () => getCollectionsWithoutContext(db), getTopLevelPathsWithoutContext: (collectionName) => getTopLevelPathsWithoutContext(db, collectionName), // Virtual paths parseVirtualPath, buildVirtualPath, isVirtualPath, resolveVirtualPath: (virtualPath) => resolveVirtualPath(db, virtualPath), toVirtualPath: (absolutePath) => toVirtualPath(db, absolutePath), // Search searchFTS: (query, limit, collectionName) => searchFTS(db, query, limit, collectionName), searchVec: (query, model, limit, collectionName, session, precomputedEmbedding, embedProvider) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding, embedProvider), // Query expansion & reranking expandQuery: (query, model, intent) => expandQuery(query, model, db, intent, store.llm), rerank: (query, documents, model, intent) => rerank(query, documents, model, db, intent, store.llm), // Document retrieval findDocument: (filename, options) => findDocument(db, filename, options), getDocumentBody: (doc, fromLine, maxLines) => getDocumentBody(db, doc, fromLine, maxLines), findDocuments: (pattern, options) => findDocuments(db, pattern, options), // Fuzzy matching and docid lookup findSimilarFiles: (query, maxDistance, limit) => findSimilarFiles(db, query, maxDistance, limit), matchFilesByGlob: (pattern) => matchFilesByGlob(db, pattern), findDocumentByDocid: (docid) => findDocumentByDocid(db, docid), // Document indexing operations insertContent: (hash, content, createdAt) => insertContent(db, hash, content, createdAt), insertDocument: (collectionName, path, title, hash, createdAt, modifiedAt) => insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt), findActiveDocument: (collectionName, path) => findActiveDocument(db, collectionName, path), updateDocumentTitle: (documentId, title, modifiedAt) => updateDocumentTitle(db, documentId, title, modifiedAt), updateDocument: (documentId, title, hash, modifiedAt) => updateDocument(db, documentId, title, hash, modifiedAt), deactivateDocument: (collectionName, path) => deactivateDocument(db, collectionName, path), getActiveDocumentPaths: (collectionName) => getActiveDocumentPaths(db, collectionName), // Vector/embedding operations getHashesForEmbedding: () => getHashesForEmbedding(db), clearAllEmbeddings: () => clearAllEmbeddings(db), insertEmbedding: (hash, seq, pos, embedding, model, embeddedAt) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt), }; return store; } /** * Extract short docid from a full hash (first 6 characters). */ export function getDocid(hash) { return hash.slice(0, 6); } /** * Handelize a filename to be more token-friendly. * - Convert triple underscore `___` to `/` (folder separator) * - Convert to lowercase * - Replace sequences of non-word chars (except /) with single dash * - Remove leading/trailing dashes from path segments * - Preserve folder structure (a/b/c/d.md stays structured) * - Preserve file extension */ /** Replace emoji/symbol codepoints with their hex representation (e.g. 🐘 → 1f418) */ function emojiToHex(str) { return str.replace(/(?:\p{So}\p{Mn}?|\p{Sk})+/gu, (run) => { // Split the run into individual emoji and convert each to hex, dash-separated return [...run].filter(c => /\p{So}|\p{Sk}/u.test(c)) .map(c => c.codePointAt(0).toString(16)).join('-'); }); } export function handelize(path) { if (!path || path.trim() === '') { throw new Error('handelize: path cannot be empty'); } // Allow route-style "$" filenames while still rejecting paths with no usable content. // Emoji (\p{So}) counts as valid content — they get converted to hex codepoints below. const segments = path.split('/').filter(Boolean); const lastSegment = segments[segments.length - 1] || ''; const filenameWithoutExt = lastSegment.replace(/\.[^.]+$/, ''); const hasValidContent = /[\p{L}\p{N}\p{So}\p{Sk}$]/u.test(filenameWithoutExt); if (!hasValidContent) { throw new Error(`handelize: path "${path}" has no valid filename content`); } const result = path .replace(/___/g, '/') // Triple underscore becomes folder separator .toLowerCase() .split('/') .map((segment, idx, arr) => { const isLastSegment = idx === arr.length - 1; // Convert emoji to hex codepoints before cleaning segment = emojiToHex(segment); if (isLastSegment) { // For the filename (last segment), preserve the extension const extMatch = segment.match(/(\.[a-z0-9]+)$/i); const ext = extMatch ? extMatch[1] : ''; const nameWithoutExt = ext ? segment.slice(0, -ext.length) : segment; const cleanedName = nameWithoutExt .replace(/[^\p{L}\p{N}$]+/gu, '-') // Keep letters, numbers, "$"; dash-separate rest (including dots) .replace(/^-+|-+$/g, ''); // Remove leading/trailing dashes return cleanedName + ext; } else { // For directories, just clean normally return segment .replace(/[^\p{L}\p{N}$]+/gu, '-') .replace(/^-+|-+$/g, ''); } }) .filter(Boolean) .join('/'); if (!result) { throw new Error(`handelize: path "${path}" resulted in empty string after processing`); } return result; } // ============================================================================= // Index health // ============================================================================= export function getHashesNeedingEmbedding(db) { const result = db.prepare(` SELECT COUNT(DISTINCT d.hash) as count FROM documents d LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0 WHERE d.active = 1 AND v.hash IS NULL `).get(); return result.count; } export function getIndexHealth(db) { const needsEmbedding = getHashesNeedingEmbedding(db); const totalDocs = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get().count; const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get(); let daysStale = null; if (mostRecent?.latest) { const lastUpdate = new Date(mostRecent.latest); daysStale = Math.floor((Date.now() - lastUpdate.getTime()) / (24 * 60 * 60 * 1000)); } return { needsEmbedding, totalDocs, daysStale }; } // ============================================================================= // Caching // ============================================================================= export function getCacheKey(url, body) { const hash = createHash("sha256"); hash.update(url); hash.update(JSON.stringify(body)); return hash.digest("hex"); } export function getCachedResult(db, cacheKey) { const row = db.prepare(`SELECT result FROM llm_cache WHERE hash = ?`).get(cacheKey); return row?.result || null; } export function setCachedResult(db, cacheKey, result) { const now = new Date().toISOString(); db.prepare(`INSERT OR REPLACE INTO llm_cache (hash, result, created_at) VALUES (?, ?, ?)`).run(cacheKey, result, now); if (Math.random() < 0.01) { db.exec(`DELETE FROM llm_cache WHERE hash NOT IN (SELECT hash FROM llm_cache ORDER BY created_at DESC LIMIT 1000)`); } } export function clearCache(db) { db.exec(`DELETE FROM llm_cache`); } // ============================================================================= // Cleanup and maintenance operations // ============================================================================= /** * Delete cached LLM API responses. * Returns the number of cached responses deleted. */ export function deleteLLMCache(db) { const result = db.prepare(`DELETE FROM llm_cache`).run(); return result.changes; } /** * Remove inactive document records (active = 0). * Returns the number of inactive documents deleted. */ export function deleteInactiveDocuments(db) { const result = db.prepare(`DELETE FROM documents WHERE active = 0`).run(); return result.changes; } /** * Remove orphaned content hashes that are not referenced by any active document. * Returns the number of orphaned content hashes deleted. */ export function cleanupOrphanedContent(db) { const result = db.prepare(` DELETE FROM content WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1) `).run(); return result.changes; } /** * Remove orphaned vector embeddings that are not referenced by any active document. * Returns the number of orphaned embedding chunks deleted. */ export function cleanupOrphanedVectors(db) { // sqlite-vec may not be loaded (e.g. Bun's bun:sqlite lacks loadExtension). // The vectors_vec virtual table can appear in sqlite_master from a prior // session, but querying it without the vec0 module loaded will crash (#380). if (!isSqliteVecAvailable()) { return 0; } // The schema entry can exist even when sqlite-vec itself is unavailable // (for example when reopening a DB without vec0 loaded). In that case, // touching the virtual table throws "no such module: vec0" and cleanup // should degrade gracefully like the rest of the vector features. try { db.prepare(`SELECT 1 FROM vectors_vec LIMIT 0`).get(); } catch { return 0; } // Count orphaned vectors first const countResult = db.prepare(` SELECT COUNT(*) as c FROM content_vectors cv WHERE NOT EXISTS ( SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1 ) `).get(); if (countResult.c === 0) { return 0; } // Delete from vectors_vec first db.exec(` DELETE FROM vectors_vec WHERE hash_seq IN ( SELECT cv.hash || '_' || cv.seq FROM content_vectors cv WHERE NOT EXISTS ( SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1 ) ) `); // Delete from content_vectors db.exec(` DELETE FROM content_vectors WHERE hash NOT IN ( SELECT hash FROM documents WHERE active = 1 ) `); return countResult.c; } /** * Run VACUUM to reclaim unused space in the database. * This operation rebuilds the database file to eliminate fragmentation. */ export function vacuumDatabase(db) { db.exec(`VACUUM`); } // ============================================================================= // Document helpers // ============================================================================= export async function hashContent(content) { const hash = createHash("sha256"); hash.update(content); return hash.digest("hex"); } const titleExtractors = { '.md': (content) => { const match = content.match(/^##?\s+(.+)$/m); if (match) { const title = (match[1] ?? "").trim(); if (title === "📝 Notes" || title === "Notes") { const nextMatch = content.match(/^##\s+(.+)$/m); if (nextMatch?.[1]) return nextMatch[1].trim(); } return title; } return null; }, '.org': (content) => { const titleProp = content.match(/^#\+TITLE:\s*(.+)$/im); if (titleProp?.[1]) return titleProp[1].trim(); const heading = content.match(/^\*+\s+(.+)$/m); if (heading?.[1]) return heading[1].trim(); return null; }, }; export function extractTitle(content, filename) { const ext = filename.slice(filename.lastIndexOf('.')).toLowerCase(); const extractor = titleExtractors[ext]; if (extractor) { const title = extractor(content); if (title) return title; } return filename.replace(/\.[^.]+$/, "").split("/").pop() || filename; } // ============================================================================= // Document indexing operations // ============================================================================= /** * Insert content into the content table (content-addressable storage). * Uses INSERT OR IGNORE so duplicate hashes are skipped. */ export function insertContent(db, hash, content, createdAt) { db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`) .run(hash, content, createdAt); } /** * Insert a new document into the documents table. */ export function insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt) { db.prepare(` INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, 1) ON CONFLICT(collection, path) DO UPDATE SET title = excluded.title, hash = excluded.hash, modified_at = excluded.modified_at, active = 1 `).run(collectionName, path, title, hash, createdAt, modifiedAt); } /** * Find an active document by collection name and path. */ export function findActiveDocument(db, collectionName, path) { const row = db.prepare(` SELECT id, hash, title FROM documents WHERE collection = ? AND path = ? AND active = 1 `).get(collectionName, path); return row ?? null; } /** * Update the title and modified_at timestamp for a document. */ export function updateDocumentTitle(db, documentId, title, modifiedAt) { db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`) .run(title, modifiedAt, documentId); } /** * Update an existing document's hash, title, and modified_at timestamp. * Used when content changes but the file path stays the same. */ export function updateDocument(db, documentId, title, hash, modifiedAt) { db.prepare(`UPDATE documents SET title = ?, hash = ?, modified_at = ? WHERE id = ?`) .run(title, hash, modifiedAt, documentId); } /** * Deactivate a document (mark as inactive but don't delete). */ export function deactivateDocument(db, collectionName, path) { db.prepare(`UPDATE documents SET active = 0 WHERE collection = ? AND path = ? AND active = 1`) .run(collectionName, path); } /** * Get all active document paths for a collection. */ export function getActiveDocumentPaths(db, collectionName) { const rows = db.prepare(` SELECT path FROM documents WHERE collection = ? AND active = 1 `).all(collectionName); return rows.map(r => r.path); } export { formatQueryForEmbedding, formatDocForEmbedding }; /** * Chunk a document using regex-only break point detection. * This is the sync, backward-compatible API used by tests and legacy callers. */ export function chunkDocument(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS) { const breakPoints = scanBreakPoints(content); const codeFences = findCodeFences(content); return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars); } /** * Async AST-aware chunking. Detects language from filepath, computes AST * break points for supported code files, merges with regex break points, * and delegates to the shared chunk algorithm. * * Strategies: * - "regex" (default) — char-based chunking with regex break points only. * - "auto" — regex break points merged with AST break points (soft hints). * - "function" — one chunk per AST function range (Phase 2); inter-range * gaps (imports, top-level code) are char-chunked with AST * hints. Falls back to "auto" when zero ranges are detected. */ export async function chunkDocumentAsync(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS, filepath, chunkStrategy = "regex") { const regexPoints = scanBreakPoints(content); const codeFences = findCodeFences(content); // "function" strategy: delegate to the function-level chunker. If no // ranges are detected (markdown, unsupported lang, parse failure), fall // back to "auto" behavior (AST-break-point-assisted char chunking). if (chunkStrategy === "function" && filepath) { const { getASTFunctionRanges, getASTBreakPoints } = await import("./ast.js"); const ranges = await getASTFunctionRanges(content, filepath); if (ranges.length > 0) { return chunkByFunctionRanges(content, ranges, regexPoints, codeFences, maxChars, overlapChars, windowChars); } // Zero ranges — fall through to auto behavior so break points still help. const astPoints = await getASTBreakPoints(content, filepath); const merged = astPoints.length > 0 ? mergeBreakPoints(regexPoints, astPoints) : regexPoints; return chunkDocumentWithBreakPoints(content, merged, codeFences, maxChars, overlapChars, windowChars); } let breakPoints = regexPoints; if (chunkStrategy === "auto" && filepath) { const { getASTBreakPoints } = await import("./ast.js"); const astPoints = await getASTBreakPoints(content, filepath); if (astPoints.length > 0) { breakPoints = mergeBreakPoints(regexPoints, astPoints); } } return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars); } /** * Produce one chunk per AST function range, plus char-chunks for the gaps * between ranges (imports, top-level code). Ranges that exceed `maxChars` * are further split using the existing char-based algorithm so we never * emit a single oversized chunk. * * Preconditions: `ranges` is non-empty, sorted by `startIndex`, and the * ranges are non-overlapping (as produced by `getASTFunctionRanges`). */ function chunkByFunctionRanges(content, ranges, regexPoints, codeFences, maxChars, overlapChars, windowChars) { const out = []; let cursor = 0; const emitGap = (start, end) => { if (start >= end) return; const gap = content.slice(start, end); // Whitespace-only gaps are dropped — they carry no embeddable signal. if (!gap.trim()) return; if (gap.length <= maxChars) { out.push({ text: gap, pos: start }); return; } // Reuse char-based algorithm for oversized gaps. Restrict break // points and code fences to the gap window and rebase positions so // chunkDocumentWithBreakPoints operates on a standalone slice. const subPoints = regexPoints .filter(p => p.pos >= start && p.pos < end) .map(p => ({ ...p, pos: p.pos - start })); const subFences = codeFences .filter(f => f.end > start && f.start < end) .map(f => ({ start: Math.max(0, f.start - start), end: Math.max(0, Math.min(end, f.end) - start), })); const sub = chunkDocumentWithBreakPoints(gap, subPoints, subFences, maxChars, overlapChars, windowChars); for (const c of sub) out.push({ text: c.text, pos: start + c.pos }); }; for (const range of ranges) { // Emit any leading / inter-range gap (imports, top-level code). emitGap(cursor, range.startIndex); const body = content.slice(range.startIndex, range.endIndex); if (body.length === 0) { cursor = range.endIndex; continue; } if (body.length <= maxChars) { out.push({ text: body, pos: range.startIndex }); } else { // Oversized function/class — split with char algorithm so we stay // under the embed token budget. Break points inside the range are // reused to keep splits at syntactically-sensible positions. const subPoints = regexPoints .filter(p => p.pos >= range.startIndex && p.pos < range.endIndex) .map(p => ({ ...p, pos: p.pos - range.startIndex })); const subFences = codeFences .filter(f => f.end > range.startIndex && f.start < range.endIndex) .map(f => ({ start: Math.max(0, f.start - range.startIndex), end: Math.max(0, Math.min(range.endIndex, f.end) - range.startIndex), })); const sub = chunkDocumentWithBreakPoints(body, subPoints, subFences, maxChars, overlapChars, windowChars); for (const c of sub) out.push({ text: c.text, pos: range.startIndex + c.pos }); } cursor = range.endIndex; } // Trailing gap after the last range. emitGap(cursor, content.length); // Edge case: content consisted entirely of whitespace-only gaps (zero // emitted chunks). Preserve the invariant that non-empty content yields // at least one chunk. if (out.length === 0 && content.length > 0) { return [{ text: content, pos: 0 }]; } return out; } /** * Chunk a document by actual token count using the LLM tokenizer. * More accurate than character-based chunking but requires async. * * When `tokenizer` is supplied, it is used in place of the local * `llm.tokenize(...)` call — neither `getDefaultLlamaCpp()` nor * `llm.tokenize(...)` is invoked. This lets remote-only deployments * (`QMD_EMBED_ENDPOINT=...`) chunk documents without warming up * node-llama-cpp (DoD #1 of i-1rqixh6m / i-qkarfffa). * * When `filepath` and `chunkStrategy` are provided, uses AST-aware break * points for supported code files. */ export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKENS, overlapTokens = CHUNK_OVERLAP_TOKENS, windowTokens = CHUNK_WINDOW_TOKENS, filepath, chunkStrategy = "regex", signal, tokenizer) { // Resolve token counter lazily so callers that supply `tokenizer` never // touch the local LlamaCpp instance — `getDefaultLlamaCpp()` is only // invoked from inside the default closure when it is actually called // (i.e. when no tokenizer is supplied). let llm; const countTokens = tokenizer ?? (async (text) => { if (!llm) llm = getDefaultLlamaCpp(); return (await llm.tokenize(text)).length; }); // Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3) // If chunks exceed limit, they'll be re-split with actual ratio const avgCharsPerToken = 3; const maxChars = maxTokens * avgCharsPerToken; const overlapChars = overlapTokens * avgCharsPerToken; const windowChars = windowTokens * avgCharsPerToken; // Chunk in character space with conservative estimate // Use AST-aware chunking for the first pass when filepath/strategy provided let charChunks = await chunkDocumentAsync(content, maxChars, overlapChars, windowChars, filepath, chunkStrategy); // Tokenize and split any chunks that still exceed limit const results = []; for (const chunk of charChunks) { // Respect abort signal to avoid runaway tokenization if (signal?.aborted) break; const tokenCount = await countTokens(chunk.text); if (tokenCount <= maxTokens) { results.push({ text: chunk.text, pos: chunk.pos, tokens: tokenCount }); } else { // Chunk is still too large - split it further // Use actual token count to estimate better char limit const actualCharsPerToken = chunk.text.length / tokenCount; const safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); // 5% safety margin const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2)); for (const subChunk of subChunks) { if (signal?.aborted) break; const subCount = await countTokens(subChunk.text); results.push({ text: subChunk.text, pos: chunk.pos + subChunk.pos, tokens: subCount, }); } } } return results; } // ============================================================================= // Fuzzy matching // ============================================================================= function levenshtein(a, b) { const m = a.length, n = b.length; if (m === 0) return n; if (n === 0) return m; const dp = Array.from({ length: m + 1 }, () => Array(n + 1).fill(0)); for (let i = 0; i <= m; i++) dp[i][0] = i; for (let j = 0; j <= n; j++) dp[0][j] = j; for (let i = 1; i <= m; i++) { for (let j = 1; j <= n; j++) { const cost = a[i - 1] === b[j - 1] ? 0 : 1; dp[i][j] = Math.min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost); } } return dp[m][n]; } /** * Normalize a docid input by stripping surrounding quotes and leading #. * Handles: "#abc123", 'abc123', "abc123", #abc123, abc123 * Returns the bare hex string. */ export function normalizeDocid(docid) { let normalized = docid.trim(); // Strip surrounding quotes (single or double) if ((normalized.startsWith('"') && normalized.endsWith('"')) || (normalized.startsWith("'") && normalized.endsWith("'"))) { normalized = normalized.slice(1, -1); } // Strip leading # if present if (normalized.startsWith('#')) { normalized = normalized.slice(1); } return normalized; } /** * Check if a string looks like a docid reference. * Accepts: #abc123, abc123, "#abc123", "abc123", '#abc123', 'abc123' * Returns true if the normalized form is a valid hex string of 6+ chars. */ export function isDocid(input) { const normalized = normalizeDocid(input); // Must be at least 6 hex characters return normalized.length >= 6 && /^[a-f0-9]+$/i.test(normalized); } /** * Find a document by its short docid (first 6 characters of hash). * Returns the document's virtual path if found, null otherwise. * If multiple documents match the same short hash (collision), returns the first one. * * Accepts lenient input: #abc123, abc123, "#abc123", "abc123" */ export function findDocumentByDocid(db, docid) { const shortHash = normalizeDocid(docid); if (shortHash.length < 1) return null; // Look up documents where hash starts with the short hash const doc = db.prepare(` SELECT 'qmd://' || d.collection || '/' || d.path as filepath, d.hash FROM documents d WHERE d.hash LIKE ? AND d.active = 1 LIMIT 1 `).get(`${shortHash}%`); return doc; } export function findSimilarFiles(db, query, maxDistance = 3, limit = 5) { const allFiles = db.prepare(` SELECT d.path FROM documents d WHERE d.active = 1 `).all(); const queryLower = query.toLowerCase(); const scored = allFiles .map(f => ({ path: f.path, dist: levenshtein(f.path.toLowerCase(), queryLower) })) .filter(f => f.dist <= maxDistance) .sort((a, b) => a.dist - b.dist) .slice(0, limit); return scored.map(f => f.path); } export function matchFilesByGlob(db, pattern) { const allFiles = db.prepare(` SELECT 'qmd://' || d.collection || '/' || d.path as virtual_path, LENGTH(content.doc) as body_length, d.path, d.collection FROM documents d JOIN content ON content.hash = d.hash WHERE d.active = 1 `).all(); const isMatch = picomatch(pattern); return allFiles .filter(f => isMatch(f.virtual_path) || isMatch(f.path) || isMatch(f.collection + '/' + f.path)) .map(f => ({ filepath: f.virtual_path, // Virtual path for precise lookup displayPath: f.path, // Relative path for display bodyLength: f.body_length })); } // ============================================================================= // Context // ============================================================================= /** * Get context for a file path using hierarchical inheritance. * Contexts are collection-scoped and inherit from parent directories. * For example, context at "/talks" applies to "/talks/2024/keynote.md". * * @param db Database instance (unused - kept for compatibility) * @param collectionName Collection name * @param path Relative path within the collection * @returns Context string or null if no context is defined */ export function getContextForPath(db, collectionName, path) { const coll = getStoreCollection(db, collectionName); if (!coll) return null; // Collect ALL matching contexts (global + all path prefixes) const contexts = []; // Add global context if present const globalCtx = getStoreGlobalContext(db); if (globalCtx) { contexts.push(globalCtx); } // Add all matching path contexts (from most general to most specific) if (coll.context) { const normalizedPath = path.startsWith("/") ? path : `/${path}`; // Collect all matching prefixes const matchingContexts = []; for (const [prefix, context] of Object.entries(coll.context)) { const normalizedPrefix = prefix.startsWith("/") ? prefix : `/${prefix}`; if (normalizedPath.startsWith(normalizedPrefix)) { matchingContexts.push({ prefix: normalizedPrefix, context }); } } // Sort by prefix length (shortest/most general first) matchingContexts.sort((a, b) => a.prefix.length - b.prefix.length); // Add all matching contexts for (const match of matchingContexts) { contexts.push(match.context); } } // Join all contexts with double newline return contexts.length > 0 ? contexts.join('\n\n') : null; } /** * Get context for a file path (virtual or filesystem). * Resolves the collection and relative path from the DB store_collections table. */ export function getContextForFile(db, filepath) { // Handle undefined or null filepath if (!filepath) return null; // Get all collections from DB const collections = getStoreCollections(db); // Parse virtual path format: qmd://collection/path let collectionName = null; let relativePath = null; const parsedVirtual = filepath.startsWith('qmd://') ? parseVirtualPath(filepath) : null; if (parsedVirtual) { collectionName = parsedVirtual.collectionName; relativePath = parsedVirtual.path; } else { // Filesystem path: find which collection this absolute path belongs to for (const coll of collections) { // Skip collections with missing paths if (!coll || !coll.path) continue; if (filepath.startsWith(coll.path + '/') || filepath === coll.path) { collectionName = coll.name; // Extract relative path relativePath = filepath.startsWith(coll.path + '/') ? filepath.slice(coll.path.length + 1) : ''; break; } } if (!collectionName || relativePath === null) return null; } // Get the collection from DB const coll = getStoreCollection(db, collectionName); if (!coll) return null; // Verify this document exists in the database const doc = db.prepare(` SELECT d.path FROM documents d WHERE d.collection = ? AND d.path = ? AND d.active = 1 LIMIT 1 `).get(collectionName, relativePath); if (!doc) return null; // Collect ALL matching contexts (global + all path prefixes) const contexts = []; // Add global context if present const globalCtx = getStoreGlobalContext(db); if (globalCtx) { contexts.push(globalCtx); } // Add all matching path contexts (from most general to most specific) if (coll.context) { const normalizedPath = relativePath.startsWith("/") ? relativePath : `/${relativePath}`; // Collect all matching prefixes const matchingContexts = []; for (const [prefix, context] of Object.entries(coll.context)) { const normalizedPrefix = prefix.startsWith("/") ? prefix : `/${prefix}`; if (normalizedPath.startsWith(normalizedPrefix)) { matchingContexts.push({ prefix: normalizedPrefix, context }); } } // Sort by prefix length (shortest/most general first) matchingContexts.sort((a, b) => a.prefix.length - b.prefix.length); // Add all matching contexts for (const match of matchingContexts) { contexts.push(match.context); } } // Join all contexts with double newline return contexts.length > 0 ? contexts.join('\n\n') : null; } /** * Get collection by name from DB store_collections table. */ export function getCollectionByName(db, name) { const collection = getStoreCollection(db, name); if (!collection) return null; return { name: collection.name, pwd: collection.path, glob_pattern: collection.pattern, }; } /** * List all collections with document counts from database. * Merges store_collections config with database statistics. */ export function listCollections(db) { const collections = getStoreCollections(db); // Get document counts from database for each collection const result = collections.map(coll => { const stats = db.prepare(` SELECT COUNT(d.id) as doc_count, SUM(CASE WHEN d.active = 1 THEN 1 ELSE 0 END) as active_count, MAX(d.modified_at) as last_modified FROM documents d WHERE d.collection = ? `).get(coll.name); return { name: coll.name, pwd: coll.path, glob_pattern: coll.pattern, doc_count: stats?.doc_count || 0, active_count: stats?.active_count || 0, last_modified: stats?.last_modified || null, includeByDefault: coll.includeByDefault !== false, }; }); return result; } /** * Remove a collection and clean up its documents. * Uses collections.ts to remove from YAML config and cleans up database. */ export function removeCollection(db, collectionName) { // Delete documents from database const docResult = db.prepare(`DELETE FROM documents WHERE collection = ?`).run(collectionName); // Clean up orphaned content hashes const cleanupResult = db.prepare(` DELETE FROM content WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1) `).run(); // Remove from store_collections deleteStoreCollection(db, collectionName); return { deletedDocs: docResult.changes, cleanedHashes: cleanupResult.changes }; } /** * Rename a collection. * Updates both YAML config and database documents table. */ export function renameCollection(db, oldName, newName) { // Update all documents with the new collection name in database db.prepare(`UPDATE documents SET collection = ? WHERE collection = ?`) .run(newName, oldName); // Rename in store_collections renameStoreCollection(db, oldName, newName); } // ============================================================================= // Context Management Operations // ============================================================================= /** * Insert or update a context for a specific collection and path prefix. */ export function insertContext(db, collectionId, pathPrefix, context) { // Get collection name from ID const coll = db.prepare(`SELECT name FROM collections WHERE id = ?`).get(collectionId); if (!coll) { throw new Error(`Collection with id ${collectionId} not found`); } // Add context to store_collections updateStoreContext(db, coll.name, pathPrefix, context); } /** * Delete a context for a specific collection and path prefix. * Returns the number of contexts deleted. */ export function deleteContext(db, collectionName, pathPrefix) { // Remove context from store_collections const success = removeStoreContext(db, collectionName, pathPrefix); return success ? 1 : 0; } /** * Delete all global contexts (contexts with empty path_prefix). * Returns the number of contexts deleted. */ export function deleteGlobalContexts(db) { let deletedCount = 0; // Remove global context setStoreGlobalContext(db, undefined); deletedCount++; // Remove root context (empty string) from all collections const collections = getStoreCollections(db); for (const coll of collections) { const success = removeStoreContext(db, coll.name, ''); if (success) { deletedCount++; } } return deletedCount; } /** * List all contexts, grouped by collection. * Returns contexts ordered by collection name, then by path prefix length (longest first). */ export function listPathContexts(db) { const allContexts = getStoreContexts(db); // Convert to expected format and sort return allContexts.map(ctx => ({ collection_name: ctx.collection, path_prefix: ctx.path, context: ctx.context, })).sort((a, b) => { // Sort by collection name first if (a.collection_name !== b.collection_name) { return a.collection_name.localeCompare(b.collection_name); } // Then by path prefix length (longest first) if (a.path_prefix.length !== b.path_prefix.length) { return b.path_prefix.length - a.path_prefix.length; } // Then alphabetically return a.path_prefix.localeCompare(b.path_prefix); }); } /** * Get all collections (name only - from YAML config). */ export function getAllCollections(db) { const collections = getStoreCollections(db); return collections.map(c => ({ name: c.name })); } /** * Check which collections don't have any context defined. * Returns collections that have no context entries at all (not even root context). */ export function getCollectionsWithoutContext(db) { // Get all collections from DB const allCollections = getStoreCollections(db); // Filter to those without context const collectionsWithoutContext = []; for (const coll of allCollections) { // Check if collection has any context if (!coll.context || Object.keys(coll.context).length === 0) { // Get doc count from database const stats = db.prepare(` SELECT COUNT(d.id) as doc_count FROM documents d WHERE d.collection = ? AND d.active = 1 `).get(coll.name); collectionsWithoutContext.push({ name: coll.name, pwd: coll.path, doc_count: stats?.doc_count || 0, }); } } return collectionsWithoutContext.sort((a, b) => a.name.localeCompare(b.name)); } /** * Get top-level directories in a collection that don't have context. * Useful for suggesting where context might be needed. */ export function getTopLevelPathsWithoutContext(db, collectionName) { // Get all paths in the collection from database const paths = db.prepare(` SELECT DISTINCT path FROM documents WHERE collection = ? AND active = 1 `).all(collectionName); // Get existing contexts for this collection from DB const dbColl = getStoreCollection(db, collectionName); if (!dbColl) return []; const contextPrefixes = new Set(); if (dbColl.context) { for (const prefix of Object.keys(dbColl.context)) { contextPrefixes.add(prefix); } } // Extract top-level directories (first path component) const topLevelDirs = new Set(); for (const { path } of paths) { const parts = path.split('/').filter(Boolean); if (parts.length > 1) { const dir = parts[0]; if (dir) topLevelDirs.add(dir); } } // Filter out directories that already have context (exact or parent) const missing = []; for (const dir of topLevelDirs) { let hasContext = false; // Check if this dir or any parent has context for (const prefix of contextPrefixes) { if (prefix === '' || prefix === dir || dir.startsWith(prefix + '/')) { hasContext = true; break; } } if (!hasContext) { missing.push(dir); } } return missing.sort(); } // ============================================================================= // FTS Search // ============================================================================= export function sanitizeFTS5Term(term) { return term.replace(/[^\p{L}\p{N}'_]/gu, '').toLowerCase(); } /** * Check if a token is a hyphenated compound word (e.g., multi-agent, DEC-0054, gpt-4). * Returns true if the token contains internal hyphens between word/digit characters. */ function isHyphenatedToken(token) { return /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u.test(token); } /** * Sanitize a hyphenated term into an FTS5 phrase by splitting on hyphens * and sanitizing each part. Returns the parts joined by spaces for use * inside FTS5 quotes: "multi agent" matches "multi-agent" in porter tokenizer. */ function sanitizeHyphenatedTerm(term) { return term.split('-').map(t => sanitizeFTS5Term(t)).filter(t => t).join(' '); } /** * Parse lex query syntax into FTS5 query. * * Supports: * - Quoted phrases: "exact phrase" → "exact phrase" (exact match) * - Negation: -term or -"phrase" → uses FTS5 NOT operator * - Hyphenated tokens: multi-agent, DEC-0054, gpt-4 → treated as phrases * - Plain terms: term → "term"* (prefix match) * * FTS5 NOT is a binary operator: `term1 NOT term2` means "match term1 but not term2". * So `-term` only works when there are also positive terms. * * Hyphen disambiguation: `-sports` at a word boundary is negation, but `multi-agent` * (where `-` is between word characters) is treated as a hyphenated phrase. * When a leading `-` is followed by what looks like a hyphenated compound word * (e.g., `-multi-agent`), the entire token is treated as a negated phrase. * * Examples: * performance -sports → "performance"* NOT "sports"* * "machine learning" → "machine learning" * multi-agent memory → "multi agent" AND "memory"* * DEC-0054 → "dec 0054" * -multi-agent → NOT "multi agent" */ function buildFTS5Query(query) { const positive = []; const negative = []; let i = 0; const s = query.trim(); while (i < s.length) { // Skip whitespace while (i < s.length && /\s/.test(s[i])) i++; if (i >= s.length) break; // Check for negation prefix const negated = s[i] === '-'; if (negated) i++; // Check for quoted phrase if (s[i] === '"') { const start = i + 1; i++; while (i < s.length && s[i] !== '"') i++; const phrase = s.slice(start, i).trim(); i++; // skip closing quote if (phrase.length > 0) { const sanitized = phrase.split(/\s+/).map(t => sanitizeFTS5Term(t)).filter(t => t).join(' '); if (sanitized) { const ftsPhrase = `"${sanitized}"`; // Exact phrase, no prefix match if (negated) { negative.push(ftsPhrase); } else { positive.push(ftsPhrase); } } } } else { // Plain term (until whitespace or quote) const start = i; while (i < s.length && !/[\s"]/.test(s[i])) i++; const term = s.slice(start, i); // Handle hyphenated tokens: multi-agent, DEC-0054, gpt-4 // These get split into phrase queries so FTS5 porter tokenizer matches them. if (isHyphenatedToken(term)) { const sanitized = sanitizeHyphenatedTerm(term); if (sanitized) { const ftsPhrase = `"${sanitized}"`; // Phrase match (no prefix) if (negated) { negative.push(ftsPhrase); } else { positive.push(ftsPhrase); } } } else { const sanitized = sanitizeFTS5Term(term); if (sanitized) { const ftsTerm = `"${sanitized}"*`; // Prefix match if (negated) { negative.push(ftsTerm); } else { positive.push(ftsTerm); } } } } } if (positive.length === 0 && negative.length === 0) return null; // If only negative terms, we can't search (FTS5 NOT is binary) if (positive.length === 0) return null; // Join positive terms with AND let result = positive.join(' AND '); // Add NOT clause for negative terms for (const neg of negative) { result = `${result} NOT ${neg}`; } return result; } /** * Validate that a vec/hyde query doesn't use lex-only syntax. * Returns error message if invalid, null if valid. * * Negation is detected ONLY when `-` is preceded by whitespace or sits at * the start of the query. Hyphens inside words (e.g. `auto-archived`, * `pre-commit`, `multi-session`, `state-of-the-art`) carry no negation * semantics in natural English and must pass through unchanged. */ export function validateSemanticQuery(query) { // `-term` or `-"phrase"` only counts as negation at SOS or after whitespace. if (/(?:^|\s)-\w/.test(query) || /(?:^|\s)-"/.test(query)) { return 'Negation (-term) is not supported in vec/hyde queries. Use lex for exclusions.'; } return null; } export function validateLexQuery(query) { if (/[\r\n]/.test(query)) { return 'Lex queries must be a single line. Remove newline characters or split into separate lex: lines.'; } const quoteCount = (query.match(/"/g) ?? []).length; if (quoteCount % 2 === 1) { return 'Lex query has an unmatched double quote ("). Add the closing quote or remove it.'; } return null; } export function searchFTS(db, query, limit = 20, collectionName) { const ftsQuery = buildFTS5Query(query); if (!ftsQuery) return []; // Use a CTE to force FTS5 to run first, then filter by collection. // Without the CTE, SQLite's query planner combines FTS5 MATCH with the // collection filter in a single WHERE clause, which can cause it to // abandon the FTS5 index and fall back to a full scan — turning an 8ms // query into a 17-second query on large collections. const params = [ftsQuery]; // When filtering by collection, fetch extra candidates from the FTS index // since some will be filtered out. Without a collection filter we can // fetch exactly the requested limit. const ftsLimit = collectionName ? limit * 10 : limit; let sql = ` WITH fts_matches AS ( SELECT rowid, bm25(documents_fts, 1.5, 4.0, 1.0) as bm25_score FROM documents_fts WHERE documents_fts MATCH ? ORDER BY bm25_score ASC LIMIT ${ftsLimit} ) SELECT 'qmd://' || d.collection || '/' || d.path as filepath, d.collection || '/' || d.path as display_path, d.title, content.doc as body, d.hash, fm.bm25_score FROM fts_matches fm JOIN documents d ON d.id = fm.rowid JOIN content ON content.hash = d.hash WHERE d.active = 1 `; if (collectionName) { sql += ` AND d.collection = ?`; params.push(String(collectionName)); } // bm25 lower is better; sort ascending. sql += ` ORDER BY fm.bm25_score ASC LIMIT ?`; params.push(limit); const rows = db.prepare(sql).all(...params); return rows.map(row => { const collectionName = row.filepath.split('//')[1]?.split('/')[0] || ""; // Convert bm25 (negative, lower is better) into a stable [0..1) score where higher is better. // FTS5 BM25 scores are negative (e.g., -10 is strong, -2 is weak). // |x| / (1 + |x|) maps: strong(-10)→0.91, medium(-2)→0.67, weak(-0.5)→0.33, none(0)→0. // Monotonic and query-independent — no per-query normalization needed. const score = Math.abs(row.bm25_score) / (1 + Math.abs(row.bm25_score)); return { filepath: row.filepath, displayPath: row.display_path, title: row.title, hash: row.hash, docid: getDocid(row.hash), collectionName, modifiedAt: "", // Not available in FTS query bodyLength: row.body.length, body: row.body, context: getContextForFile(db, row.filepath), score, source: "fts", }; }); } // ============================================================================= // Vector Search // ============================================================================= export async function searchVec(db, query, model, limit = 20, collectionName, session, precomputedEmbedding, embedProvider) { const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get(); if (!tableExists) return []; const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session, undefined, embedProvider); if (!embedding) return []; // IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables // hang indefinitely when combined with JOINs in the same query. Do NOT try to // "optimize" this by combining into a single query with JOINs - it will break. // See: https://github.com/tobi/qmd/pull/23 // Step 1: Get vector matches from sqlite-vec (no JOINs allowed) const vecResults = db.prepare(` SELECT hash_seq, distance FROM vectors_vec WHERE embedding MATCH ? AND k = ? `).all(new Float32Array(embedding), limit * 3); if (vecResults.length === 0) return []; // Step 2: Get chunk info and document data const hashSeqs = vecResults.map(r => r.hash_seq); const distanceMap = new Map(vecResults.map(r => [r.hash_seq, r.distance])); // Build query for document lookup const placeholders = hashSeqs.map(() => '?').join(','); let docSql = ` SELECT cv.hash || '_' || cv.seq as hash_seq, cv.hash, cv.pos, 'qmd://' || d.collection || '/' || d.path as filepath, d.collection || '/' || d.path as display_path, d.title, content.doc as body FROM content_vectors cv JOIN documents d ON d.hash = cv.hash AND d.active = 1 JOIN content ON content.hash = d.hash WHERE cv.hash || '_' || cv.seq IN (${placeholders}) `; const params = [...hashSeqs]; if (collectionName) { docSql += ` AND d.collection = ?`; params.push(collectionName); } const docRows = db.prepare(docSql).all(...params); // Combine with distances and dedupe by filepath const seen = new Map(); for (const row of docRows) { const distance = distanceMap.get(row.hash_seq) ?? 1; const existing = seen.get(row.filepath); if (!existing || distance < existing.bestDist) { seen.set(row.filepath, { row, bestDist: distance }); } } return Array.from(seen.values()) .sort((a, b) => a.bestDist - b.bestDist) .slice(0, limit) .map(({ row, bestDist }) => { const collectionName = row.filepath.split('//')[1]?.split('/')[0] || ""; return { filepath: row.filepath, displayPath: row.display_path, title: row.title, hash: row.hash, docid: getDocid(row.hash), collectionName, modifiedAt: "", // Not available in vec query bodyLength: row.body.length, body: row.body, context: getContextForFile(db, row.filepath), score: 1 - bestDist, // Cosine similarity = 1 - cosine distance source: "vec", chunkPos: row.pos, }; }); } // ============================================================================= // Embeddings // ============================================================================= async function getEmbedding(text, model, isQuery, session, llmOverride, embedProvider) { // When an EmbeddingProvider is supplied, route the encoding through it // (HTTP / GPU worker / fallback chain) instead of touching local // node-llama-cpp at all. The provider sees the raw text + the desired // model id; query-formatting prefixes are still applied via // formatQueryForEmbedding so embedding parity with the index is preserved. if (embedProvider) { const providerModel = embedProvider.getModelId(); const formattedText = isQuery ? formatQueryForEmbedding(text, providerModel) : formatDocForEmbedding(text, undefined, providerModel); // Only forward an AbortSignal when the provider is local-backed; // remote providers manage their own timeouts and an LLM-session signal // would abort their HTTP request prematurely (i-08ovbvtb). const sig = embedProvider.kind === "local" ? session?.signal : undefined; const result = await embedProvider.embed(formattedText, sig ? { model: providerModel, signal: sig } : { model: providerModel }); return result?.embedding ?? null; } // Format text using the appropriate prompt template const formattedText = isQuery ? formatQueryForEmbedding(text, model) : formatDocForEmbedding(text, undefined, model); const result = session ? await session.embed(formattedText, { model, isQuery }) : await (llmOverride ?? getDefaultLlamaCpp()).embed(formattedText, { model, isQuery }); return result?.embedding || null; } /** * Get all unique content hashes that need embeddings (from active documents). * Returns hash, document body, and a sample path for display purposes. */ export function getHashesForEmbedding(db) { return db.prepare(` SELECT d.hash, c.doc as body, MIN(d.path) as path FROM documents d JOIN content c ON d.hash = c.hash LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0 WHERE d.active = 1 AND v.hash IS NULL GROUP BY d.hash `).all(); } /** * Clear all embeddings from the database (force re-index). * Deletes all rows from content_vectors and drops the vectors_vec table. */ export function clearAllEmbeddings(db) { db.exec(`DELETE FROM content_vectors`); db.exec(`DROP TABLE IF EXISTS vectors_vec`); } /** * Get the distinct set of model identifiers present in `content_vectors`. * * Used by the embedding migration-safety guard: if a configured provider's * `getModelId()` does not appear in this list (and the table is non-empty), * we refuse to embed and ask the user to run `qmd embed -f` to rebuild. * * Returns `[]` when the table is empty (fresh DB) — in which case any * provider is allowed. */ export function getDistinctEmbeddingModels(db) { const rows = db.prepare(`SELECT DISTINCT model FROM content_vectors WHERE model IS NOT NULL`).all(); return rows.map((r) => r.model).filter((m) => typeof m === "string" && m.length > 0); } /** * Insert a single embedding into both content_vectors and vectors_vec tables. * The hash_seq key is formatted as "hash_seq" for the vectors_vec table. * * content_vectors is inserted first so that getHashesForEmbedding (which checks * only content_vectors) won't re-select the hash on a crash between the two inserts. * * vectors_vec uses DELETE + INSERT instead of INSERT OR REPLACE because sqlite-vec's * vec0 virtual tables silently ignore the OR REPLACE conflict clause. */ export function insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt) { const hashSeq = `${hash}_${seq}`; // Insert content_vectors first — crash-safe ordering (see getHashesForEmbedding) const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`); insertContentVectorStmt.run(hash, seq, pos, model, embeddedAt); // vec0 virtual tables don't support OR REPLACE — use DELETE + INSERT const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`); const insertVecStmt = db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`); deleteVecStmt.run(hashSeq); insertVecStmt.run(hashSeq, embedding); } // ============================================================================= // Query expansion // ============================================================================= export async function expandQuery(query, model = DEFAULT_QUERY_MODEL, db, intent, llmOverride) { // Check cache first — stored as JSON preserving types const cacheKey = getCacheKey("expandQuery", { query, model, ...(intent && { intent }) }); const cached = getCachedResult(db, cacheKey); if (cached) { try { const parsed = JSON.parse(cached); // Migrate old cache format: { type, text } → { type, query } if (parsed.length > 0 && parsed[0].query) { return parsed; } else if (parsed.length > 0 && parsed[0].text) { return parsed.map((r) => ({ type: r.type, query: r.text })); } } catch { // Old cache format (pre-typed, newline-separated text) — re-expand } } const llm = llmOverride ?? getDefaultLlamaCpp(); // Note: LlamaCpp uses hardcoded model, model parameter is ignored const results = await llm.expandQuery(query, { intent }); // Map Queryable[] → ExpandedQuery[] (same shape, decoupled from llm.ts internals). // Filter out entries that duplicate the original query text. const expanded = results .filter(r => r.text !== query) .map(r => ({ type: r.type, query: r.text })); if (expanded.length > 0) { setCachedResult(db, cacheKey, JSON.stringify(expanded)); } return expanded; } // ============================================================================= // Reranking // ============================================================================= export async function rerank(query, documents, model = DEFAULT_RERANK_MODEL, db, intent, llmOverride) { // Prepend intent to rerank query so the reranker scores with domain context const rerankQuery = intent ? `${intent}\n\n${query}` : query; const cachedResults = new Map(); const uncachedDocsByChunk = new Map(); // Check cache for each document // Cache key includes chunk text — different queries can select different chunks // from the same file, and the reranker score depends on which chunk was sent. // File path is excluded from the new cache key because the reranker score // depends on the chunk content, not where it came from. for (const doc of documents) { const cacheKey = getCacheKey("rerank", { query: rerankQuery, model, chunk: doc.text }); const legacyCacheKey = getCacheKey("rerank", { query, file: doc.file, model, chunk: doc.text }); const cached = getCachedResult(db, cacheKey) ?? getCachedResult(db, legacyCacheKey); if (cached !== null) { cachedResults.set(doc.text, parseFloat(cached)); } else { uncachedDocsByChunk.set(doc.text, { file: doc.file, text: doc.text }); } } // Rerank uncached documents using LlamaCpp if (uncachedDocsByChunk.size > 0) { const llm = llmOverride ?? getDefaultLlamaCpp(); const uncachedDocs = [...uncachedDocsByChunk.values()]; const rerankResult = await llm.rerank(rerankQuery, uncachedDocs, { model }); // Cache results by chunk text so identical chunks across files are scored once. const textByFile = new Map(uncachedDocs.map(d => [d.file, d.text])); for (const result of rerankResult.results) { const chunk = textByFile.get(result.file) || ""; const cacheKey = getCacheKey("rerank", { query: rerankQuery, model, chunk }); setCachedResult(db, cacheKey, result.score.toString()); cachedResults.set(chunk, result.score); } } // Return all results sorted by score return documents .map(doc => ({ file: doc.file, score: cachedResults.get(doc.text) || 0 })) .sort((a, b) => b.score - a.score); } // ============================================================================= // Reciprocal Rank Fusion // ============================================================================= export function reciprocalRankFusion(resultLists, weights = [], k = 60) { const scores = new Map(); for (let listIdx = 0; listIdx < resultLists.length; listIdx++) { const list = resultLists[listIdx]; if (!list) continue; const weight = weights[listIdx] ?? 1.0; for (let rank = 0; rank < list.length; rank++) { const result = list[rank]; if (!result) continue; const rrfContribution = weight / (k + rank + 1); const existing = scores.get(result.file); if (existing) { existing.rrfScore += rrfContribution; existing.topRank = Math.min(existing.topRank, rank); } else { scores.set(result.file, { result, rrfScore: rrfContribution, topRank: rank, }); } } } // Top-rank bonus for (const entry of scores.values()) { if (entry.topRank === 0) { entry.rrfScore += 0.05; } else if (entry.topRank <= 2) { entry.rrfScore += 0.02; } } return Array.from(scores.values()) .sort((a, b) => b.rrfScore - a.rrfScore) .map(e => ({ ...e.result, score: e.rrfScore })); } /** * Build per-document RRF contribution traces for explain/debug output. */ export function buildRrfTrace(resultLists, weights = [], listMeta = [], k = 60) { const traces = new Map(); for (let listIdx = 0; listIdx < resultLists.length; listIdx++) { const list = resultLists[listIdx]; if (!list) continue; const weight = weights[listIdx] ?? 1.0; const meta = listMeta[listIdx] ?? { source: "fts", queryType: "original", query: "", }; for (let rank0 = 0; rank0 < list.length; rank0++) { const result = list[rank0]; if (!result) continue; const rank = rank0 + 1; // 1-indexed rank for explain output const contribution = weight / (k + rank); const existing = traces.get(result.file); const detail = { listIndex: listIdx, source: meta.source, queryType: meta.queryType, query: meta.query, rank, weight, backendScore: result.score, rrfContribution: contribution, }; if (existing) { existing.baseScore += contribution; existing.topRank = Math.min(existing.topRank, rank); existing.contributions.push(detail); } else { traces.set(result.file, { contributions: [detail], baseScore: contribution, topRank: rank, topRankBonus: 0, totalScore: 0, }); } } } for (const trace of traces.values()) { let bonus = 0; if (trace.topRank === 1) bonus = 0.05; else if (trace.topRank <= 3) bonus = 0.02; trace.topRankBonus = bonus; trace.totalScore = trace.baseScore + bonus; } return traces; } /** * Find a document by filename/path, docid (#hash), or with fuzzy matching. * Returns document metadata without body by default. * * Supports: * - Virtual paths: qmd://collection/path/to/file.md * - Absolute paths: /path/to/file.md * - Relative paths: path/to/file.md * - Short docid: #abc123 (first 6 chars of hash) */ export function findDocument(db, filename, options = {}) { let filepath = filename; const colonMatch = filepath.match(/:(\d+)$/); if (colonMatch) { filepath = filepath.slice(0, -colonMatch[0].length); } // Check if this is a docid lookup (#abc123, abc123, "#abc123", "abc123", etc.) if (isDocid(filepath)) { const docidMatch = findDocumentByDocid(db, filepath); if (docidMatch) { filepath = docidMatch.filepath; } else { return { error: "not_found", query: filename, similarFiles: [] }; } } if (filepath.startsWith('~/')) { filepath = homedir() + filepath.slice(1); } const bodyCol = options.includeBody ? `, content.doc as body` : ``; // Build computed columns // Note: absoluteFilepath is computed from YAML collections after query const selectCols = ` 'qmd://' || d.collection || '/' || d.path as virtual_path, d.collection || '/' || d.path as display_path, d.title, d.hash, d.collection, d.modified_at, LENGTH(content.doc) as body_length ${bodyCol} `; // Try to match by virtual path first let doc = db.prepare(` SELECT ${selectCols} FROM documents d JOIN content ON content.hash = d.hash WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1 `).get(filepath); // Try fuzzy match by virtual path if (!doc) { doc = db.prepare(` SELECT ${selectCols} FROM documents d JOIN content ON content.hash = d.hash WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? AND d.active = 1 LIMIT 1 `).get(`%${filepath}`); } // Try to match by absolute path (requires looking up collection paths from DB) if (!doc && !filepath.startsWith('qmd://')) { const collections = getStoreCollections(db); for (const coll of collections) { let relativePath = null; // If filepath is absolute and starts with collection path, extract relative part if (filepath.startsWith(coll.path + '/')) { relativePath = filepath.slice(coll.path.length + 1); } // Otherwise treat filepath as relative to collection else if (!filepath.startsWith('/')) { relativePath = filepath; } if (relativePath) { doc = db.prepare(` SELECT ${selectCols} FROM documents d JOIN content ON content.hash = d.hash WHERE d.collection = ? AND d.path = ? AND d.active = 1 `).get(coll.name, relativePath); if (doc) break; } } } if (!doc) { const similar = findSimilarFiles(db, filepath, 5, 5); return { error: "not_found", query: filename, similarFiles: similar }; } // Get context using virtual path const virtualPath = doc.virtual_path || `qmd://${doc.collection}/${doc.display_path}`; const context = getContextForFile(db, virtualPath); return { filepath: virtualPath, displayPath: doc.display_path, title: doc.title, context, hash: doc.hash, docid: getDocid(doc.hash), collectionName: doc.collection, modifiedAt: doc.modified_at, bodyLength: doc.body_length, ...(options.includeBody && doc.body !== undefined && { body: doc.body }), }; } /** * Get the body content for a document * Optionally slice by line range */ export function getDocumentBody(db, doc, fromLine, maxLines) { const filepath = doc.filepath; // Try to resolve document by filepath (absolute or virtual) let row = null; // Try virtual path first if (filepath.startsWith('qmd://')) { row = db.prepare(` SELECT content.doc as body FROM documents d JOIN content ON content.hash = d.hash WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1 `).get(filepath); } // Try absolute path by looking up in DB store_collections if (!row) { const collections = getStoreCollections(db); for (const coll of collections) { if (filepath.startsWith(coll.path + '/')) { const relativePath = filepath.slice(coll.path.length + 1); row = db.prepare(` SELECT content.doc as body FROM documents d JOIN content ON content.hash = d.hash WHERE d.collection = ? AND d.path = ? AND d.active = 1 `).get(coll.name, relativePath); if (row) break; } } } if (!row) return null; let body = row.body; if (fromLine !== undefined || maxLines !== undefined) { const lines = body.split('\n'); const start = (fromLine || 1) - 1; const end = maxLines !== undefined ? start + maxLines : lines.length; body = lines.slice(start, end).join('\n'); } return body; } /** * Find multiple documents by glob pattern or comma-separated list * Returns documents without body by default (use getDocumentBody to load) */ export function findDocuments(db, pattern, options = {}) { const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?') && !pattern.includes('{'); const errors = []; const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES; const bodyCol = options.includeBody ? `, content.doc as body` : ``; const selectCols = ` 'qmd://' || d.collection || '/' || d.path as virtual_path, d.collection || '/' || d.path as display_path, d.title, d.hash, d.collection, d.modified_at, LENGTH(content.doc) as body_length ${bodyCol} `; let fileRows; if (isCommaSeparated) { const names = pattern.split(',').map(s => s.trim()).filter(Boolean); fileRows = []; for (const name of names) { let doc = db.prepare(` SELECT ${selectCols} FROM documents d JOIN content ON content.hash = d.hash WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1 `).get(name); if (!doc) { doc = db.prepare(` SELECT ${selectCols} FROM documents d JOIN content ON content.hash = d.hash WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? AND d.active = 1 LIMIT 1 `).get(`%${name}`); } if (doc) { fileRows.push(doc); } else { const similar = findSimilarFiles(db, name, 5, 3); let msg = `File not found: ${name}`; if (similar.length > 0) { msg += ` (did you mean: ${similar.join(', ')}?)`; } errors.push(msg); } } } else { // Glob pattern match const matched = matchFilesByGlob(db, pattern); if (matched.length === 0) { errors.push(`No files matched pattern: ${pattern}`); return { docs: [], errors }; } const virtualPaths = matched.map(m => m.filepath); const placeholders = virtualPaths.map(() => '?').join(','); fileRows = db.prepare(` SELECT ${selectCols} FROM documents d JOIN content ON content.hash = d.hash WHERE 'qmd://' || d.collection || '/' || d.path IN (${placeholders}) AND d.active = 1 `).all(...virtualPaths); } const results = []; for (const row of fileRows) { // Get context using virtual path const virtualPath = row.virtual_path || `qmd://${row.collection}/${row.display_path}`; const context = getContextForFile(db, virtualPath); if (row.body_length > maxBytes) { results.push({ doc: { filepath: virtualPath, displayPath: row.display_path }, skipped: true, skipReason: `File too large (${Math.round(row.body_length / 1024)}KB > ${Math.round(maxBytes / 1024)}KB)`, }); continue; } results.push({ doc: { filepath: virtualPath, displayPath: row.display_path, title: row.title || row.display_path.split('/').pop() || row.display_path, context, hash: row.hash, docid: getDocid(row.hash), collectionName: row.collection, modifiedAt: row.modified_at, bodyLength: row.body_length, ...(options.includeBody && row.body !== undefined && { body: row.body }), }, skipped: false, }); } return { docs: results, errors }; } // ============================================================================= // Status // ============================================================================= export function getStatus(db) { // DB is source of truth for collections — config provides supplementary metadata const dbCollections = db.prepare(` SELECT collection as name, COUNT(*) as active_count, MAX(modified_at) as last_doc_update FROM documents WHERE active = 1 GROUP BY collection `).all(); // Build a lookup from store_collections for path/pattern metadata const storeCollections = getStoreCollections(db); const configLookup = new Map(storeCollections.map(c => [c.name, { path: c.path, pattern: c.pattern }])); const collections = dbCollections.map(row => { const config = configLookup.get(row.name); return { name: row.name, path: config?.path ?? null, pattern: config?.pattern ?? null, documents: row.active_count, lastUpdated: row.last_doc_update || new Date().toISOString(), }; }); // Sort by last update time (most recent first) collections.sort((a, b) => { if (!a.lastUpdated) return 1; if (!b.lastUpdated) return -1; return new Date(b.lastUpdated).getTime() - new Date(a.lastUpdated).getTime(); }); const totalDocs = db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get().c; const needsEmbedding = getHashesNeedingEmbedding(db); const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get(); return { totalDocuments: totalDocs, needsEmbedding, hasVectorIndex: hasVectors, collections, }; } /** Weight for intent terms relative to query terms (1.0) in snippet scoring */ export const INTENT_WEIGHT_SNIPPET = 0.3; /** Weight for intent terms relative to query terms (1.0) in chunk selection */ export const INTENT_WEIGHT_CHUNK = 0.5; // Common stop words filtered from intent strings before tokenization. // Seeded from finetune/reward.py KEY_TERM_STOPWORDS, extended with common // 2-3 char function words so the length threshold can drop to >1 and let // short domain terms (API, SQL, LLM, CPU, CDN, …) survive. const INTENT_STOP_WORDS = new Set([ // 2-char function words "am", "an", "as", "at", "be", "by", "do", "he", "if", "in", "is", "it", "me", "my", "no", "of", "on", "or", "so", "to", "up", "us", "we", // 3-char function words "all", "and", "any", "are", "but", "can", "did", "for", "get", "has", "her", "him", "his", "how", "its", "let", "may", "not", "our", "out", "the", "too", "was", "who", "why", "you", // 4+ char common words "also", "does", "find", "from", "have", "into", "more", "need", "show", "some", "tell", "that", "them", "this", "want", "what", "when", "will", "with", "your", // Search-context noise "about", "looking", "notes", "search", "where", "which", ]); /** * Extract meaningful terms from an intent string, filtering stop words and punctuation. * Uses Unicode-aware punctuation stripping so domain terms like "API" survive. * Returns lowercase terms suitable for text matching. */ export function extractIntentTerms(intent) { return intent.toLowerCase().split(/\s+/) .map(t => t.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, "")) .filter(t => t.length > 1 && !INTENT_STOP_WORDS.has(t)); } export function extractSnippet(body, query, maxLen = 500, chunkPos, chunkLen, intent) { const totalLines = body.split('\n').length; let searchBody = body; let lineOffset = 0; if (chunkPos && chunkPos > 0) { // Search within the chunk region, with some padding for context // Use provided chunkLen or fall back to max chunk size (covers variable-length chunks) const searchLen = chunkLen || CHUNK_SIZE_CHARS; const contextStart = Math.max(0, chunkPos - 100); const contextEnd = Math.min(body.length, chunkPos + searchLen + 100); searchBody = body.slice(contextStart, contextEnd); if (contextStart > 0) { lineOffset = body.slice(0, contextStart).split('\n').length - 1; } } const lines = searchBody.split('\n'); const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 0); const intentTerms = intent ? extractIntentTerms(intent) : []; let bestLine = 0, bestScore = -1; for (let i = 0; i < lines.length; i++) { const lineLower = (lines[i] ?? "").toLowerCase(); let score = 0; for (const term of queryTerms) { if (lineLower.includes(term)) score += 1.0; } for (const term of intentTerms) { if (lineLower.includes(term)) score += INTENT_WEIGHT_SNIPPET; } if (score > bestScore) { bestScore = score; bestLine = i; } } const start = Math.max(0, bestLine - 1); const end = Math.min(lines.length, bestLine + 3); const snippetLines = lines.slice(start, end); let snippetText = snippetLines.join('\n'); // If we focused on a chunk window and it produced an empty/whitespace-only snippet, // fall back to a full-document snippet so we always show something useful. if (chunkPos && chunkPos > 0 && snippetText.trim().length === 0) { return extractSnippet(body, query, maxLen, undefined, undefined, intent); } if (snippetText.length > maxLen) snippetText = snippetText.substring(0, maxLen - 3) + "..."; const absoluteStart = lineOffset + start + 1; // 1-indexed const snippetLineCount = snippetLines.length; const linesBefore = absoluteStart - 1; const linesAfter = totalLines - (absoluteStart + snippetLineCount - 1); // Format with diff-style header: @@ -start,count @@ (linesBefore before, linesAfter after) const header = `@@ -${absoluteStart},${snippetLineCount} @@ (${linesBefore} before, ${linesAfter} after)`; const snippet = `${header}\n${snippetText}`; return { line: lineOffset + bestLine + 1, snippet, linesBefore, linesAfter, snippetLines: snippetLineCount, }; } // ============================================================================= // Shared helpers (used by both CLI and MCP) // ============================================================================= /** * Add line numbers to text content. * Each line becomes: "{lineNum}: {content}" */ export function addLineNumbers(text, startLine = 1) { const lines = text.split('\n'); return lines.map((line, i) => `${startLine + i}: ${line}`).join('\n'); } /** * Hybrid search: BM25 + vector + query expansion + RRF + chunked reranking. * * Pipeline: * 1. BM25 probe → skip expansion if strong signal * 2. expandQuery() → typed query variants (lex/vec/hyde) * 3. Type-routed search: original→vector, lex→FTS, vec/hyde→vector * 4. RRF fusion → slice to candidateLimit * 5. chunkDocument() + keyword-best-chunk selection * 6. rerank on chunks (NOT full bodies — O(tokens) trap) * 7. Position-aware score blending (RRF rank × reranker score) * 8. Dedup by file, filter by minScore, slice to limit */ export async function hybridQuery(store, query, options) { const limit = options?.limit ?? 10; const minScore = options?.minScore ?? 0; const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT; const collection = options?.collection; const explain = options?.explain ?? false; const intent = options?.intent; const skipRerank = options?.skipRerank ?? false; const hooks = options?.hooks; const embedProvider = options?.embedProvider; const rankedLists = []; const rankedListMeta = []; const docidMap = new Map(); // filepath -> docid const hasVectors = !!store.db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get(); // Step 1: BM25 probe — strong signal skips expensive LLM expansion // When intent is provided, disable strong-signal bypass — the obvious BM25 // match may not be what the caller wants (e.g. "performance" with intent // "web page load times" should NOT shortcut to a sports-performance doc). // Pass collection directly into FTS query (filter at SQL level, not post-hoc) const initialFts = store.searchFTS(query, 20, collection); const topScore = initialFts[0]?.score ?? 0; const secondScore = initialFts[1]?.score ?? 0; const hasStrongSignal = !intent && initialFts.length > 0 && topScore >= STRONG_SIGNAL_MIN_SCORE && (topScore - secondScore) >= STRONG_SIGNAL_MIN_GAP; if (hasStrongSignal) hooks?.onStrongSignal?.(topScore); // Step 2: Expand query (or skip if strong signal) hooks?.onExpandStart?.(); const expandStart = Date.now(); const expanded = hasStrongSignal ? [] : await store.expandQuery(query, undefined, intent); hooks?.onExpand?.(query, expanded, Date.now() - expandStart); // Seed with initial FTS results (avoid re-running original query FTS) if (initialFts.length > 0) { for (const r of initialFts) docidMap.set(r.filepath, r.docid); rankedLists.push(initialFts.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score, }))); rankedListMeta.push({ source: "fts", queryType: "original", query }); } // Step 3: Route searches by query type // // Strategy: run all FTS queries immediately (they're sync/instant), then // batch-embed all vector queries in one embedBatch() call, then run // sqlite-vec lookups with pre-computed embeddings. // 3a: Run FTS for all lex expansions right away (no LLM needed) for (const q of expanded) { if (q.type === 'lex') { const ftsResults = store.searchFTS(q.query, 20, collection); if (ftsResults.length > 0) { for (const r of ftsResults) docidMap.set(r.filepath, r.docid); rankedLists.push(ftsResults.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score, }))); rankedListMeta.push({ source: "fts", queryType: "lex", query: q.query }); } } } // 3b: Collect all texts that need vector search (original query + vec/hyde expansions) if (hasVectors) { const vecQueries = [ { text: query, queryType: "original" }, ]; for (const q of expanded) { if (q.type === 'vec' || q.type === 'hyde') { vecQueries.push({ text: q.query, queryType: q.type }); } } // Batch embed all vector queries in a single call. // When `embedProvider` is supplied (i-loazq6ze), route the encode through // it (HTTP / GPU worker / AutoFallback chain) instead of warming the // local llama-cpp model — this is the whole point of the GPU worker. const embedModelName = embedProvider ? embedProvider.getModelId() : getLlm(store).embedModelName; const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, embedModelName)); hooks?.onEmbedStart?.(textsToEmbed.length); const embedStart = Date.now(); const embeddings = embedProvider ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName }) : await getLlm(store).embedBatch(textsToEmbed); hooks?.onEmbedDone?.(Date.now() - embedStart); // Run sqlite-vec lookups with pre-computed embeddings for (let i = 0; i < vecQueries.length; i++) { const embedding = embeddings[i]?.embedding; if (!embedding) continue; const vecResults = await store.searchVec(vecQueries[i].text, DEFAULT_EMBED_MODEL, 20, collection, undefined, embedding); if (vecResults.length > 0) { for (const r of vecResults) docidMap.set(r.filepath, r.docid); rankedLists.push(vecResults.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score, }))); rankedListMeta.push({ source: "vec", queryType: vecQueries[i].queryType, query: vecQueries[i].text, }); } } } // Step 4: RRF fusion — first 2 lists (original FTS + first vec) get 2x weight const weights = rankedLists.map((_, i) => i < 2 ? 2.0 : 1.0); const fused = reciprocalRankFusion(rankedLists, weights); const rrfTraceByFile = explain ? buildRrfTrace(rankedLists, weights, rankedListMeta) : null; const candidates = fused.slice(0, candidateLimit); if (candidates.length === 0) return []; // Step 5: Chunk documents, pick best chunk per doc for reranking. // Reranking full bodies is O(tokens) — the critical perf lesson that motivated this refactor. const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2); const intentTerms = intent ? extractIntentTerms(intent) : []; const docChunkMap = new Map(); const chunkStrategy = options?.chunkStrategy; for (const cand of candidates) { const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, chunkStrategy); if (chunks.length === 0) continue; // Pick chunk with most keyword overlap (fallback: first chunk) // Intent terms contribute at INTENT_WEIGHT_CHUNK (0.5) relative to query terms (1.0) let bestIdx = 0; let bestScore = -1; for (let i = 0; i < chunks.length; i++) { const chunkLower = chunks[i].text.toLowerCase(); let score = queryTerms.reduce((acc, term) => acc + (chunkLower.includes(term) ? 1 : 0), 0); for (const term of intentTerms) { if (chunkLower.includes(term)) score += INTENT_WEIGHT_CHUNK; } if (score > bestScore) { bestScore = score; bestIdx = i; } } docChunkMap.set(cand.file, { chunks, bestIdx }); } if (skipRerank) { // Skip LLM reranking — return candidates scored by RRF only const seenFiles = new Set(); return candidates .map((cand, i) => { const chunkInfo = docChunkMap.get(cand.file); const bestIdx = chunkInfo?.bestIdx ?? 0; const bestChunk = chunkInfo?.chunks[bestIdx]?.text || cand.body || ""; const bestChunkPos = chunkInfo?.chunks[bestIdx]?.pos || 0; const rrfRank = i + 1; const rrfScore = 1 / rrfRank; const trace = rrfTraceByFile?.get(cand.file); const explainData = explain ? { ftsScores: trace?.contributions.filter(c => c.source === "fts").map(c => c.backendScore) ?? [], vectorScores: trace?.contributions.filter(c => c.source === "vec").map(c => c.backendScore) ?? [], rrf: { rank: rrfRank, positionScore: rrfScore, weight: 1.0, baseScore: trace?.baseScore ?? 0, topRankBonus: trace?.topRankBonus ?? 0, totalScore: trace?.totalScore ?? 0, contributions: trace?.contributions ?? [], }, rerankScore: 0, blendedScore: rrfScore, } : undefined; return { file: cand.file, displayPath: cand.displayPath, title: cand.title, body: cand.body, bestChunk, bestChunkPos, score: rrfScore, context: store.getContextForFile(cand.file), docid: docidMap.get(cand.file) || "", ...(explainData ? { explain: explainData } : {}), }; }) .filter(r => { if (seenFiles.has(r.file)) return false; seenFiles.add(r.file); return true; }) .filter(r => r.score >= minScore) .slice(0, limit); } // Step 6: Rerank chunks (NOT full bodies) const chunksToRerank = []; for (const cand of candidates) { const chunkInfo = docChunkMap.get(cand.file); if (chunkInfo) { chunksToRerank.push({ file: cand.file, text: chunkInfo.chunks[chunkInfo.bestIdx].text }); } } hooks?.onRerankStart?.(chunksToRerank.length); const rerankStart = Date.now(); const reranked = await store.rerank(query, chunksToRerank, undefined, intent); hooks?.onRerankDone?.(Date.now() - rerankStart); // Step 7: Blend RRF position score with reranker score // Position-aware weights: top retrieval results get more protection from reranker disagreement const candidateMap = new Map(candidates.map(c => [c.file, { displayPath: c.displayPath, title: c.title, body: c.body, }])); const rrfRankMap = new Map(candidates.map((c, i) => [c.file, i + 1])); const blended = reranked.map(r => { const rrfRank = rrfRankMap.get(r.file) || candidateLimit; let rrfWeight; if (rrfRank <= 3) rrfWeight = 0.75; else if (rrfRank <= 10) rrfWeight = 0.60; else rrfWeight = 0.40; const rrfScore = 1 / rrfRank; const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * r.score; const candidate = candidateMap.get(r.file); const chunkInfo = docChunkMap.get(r.file); const bestIdx = chunkInfo?.bestIdx ?? 0; const bestChunk = chunkInfo?.chunks[bestIdx]?.text || candidate?.body || ""; const bestChunkPos = chunkInfo?.chunks[bestIdx]?.pos || 0; const trace = rrfTraceByFile?.get(r.file); const explainData = explain ? { ftsScores: trace?.contributions.filter(c => c.source === "fts").map(c => c.backendScore) ?? [], vectorScores: trace?.contributions.filter(c => c.source === "vec").map(c => c.backendScore) ?? [], rrf: { rank: rrfRank, positionScore: rrfScore, weight: rrfWeight, baseScore: trace?.baseScore ?? 0, topRankBonus: trace?.topRankBonus ?? 0, totalScore: trace?.totalScore ?? 0, contributions: trace?.contributions ?? [], }, rerankScore: r.score, blendedScore, } : undefined; return { file: r.file, displayPath: candidate?.displayPath || "", title: candidate?.title || "", body: candidate?.body || "", bestChunk, bestChunkPos, score: blendedScore, context: store.getContextForFile(r.file), docid: docidMap.get(r.file) || "", ...(explainData ? { explain: explainData } : {}), }; }).sort((a, b) => b.score - a.score); // Step 8: Dedup by file (safety net — prevents duplicate output) const seenFiles = new Set(); return blended .filter(r => { if (seenFiles.has(r.file)) return false; seenFiles.add(r.file); return true; }) .filter(r => r.score >= minScore) .slice(0, limit); } /** * Vector-only semantic search with query expansion. * * Pipeline: * 1. expandQuery() → typed variants, filter to vec/hyde only (lex irrelevant here) * 2. searchVec() for original + vec/hyde variants (sequential — node-llama-cpp embed limitation) * 3. Dedup by filepath (keep max score) * 4. Sort by score descending, filter by minScore, slice to limit */ export async function vectorSearchQuery(store, query, options) { const limit = options?.limit ?? 10; const minScore = options?.minScore ?? 0.3; const collection = options?.collection; const intent = options?.intent; const embedProvider = options?.embedProvider; const hasVectors = !!store.db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get(); if (!hasVectors) return []; // Expand query — filter to vec/hyde only (lex queries target FTS, not vector) const expandStart = Date.now(); const allExpanded = await store.expandQuery(query, undefined, intent); const vecExpanded = allExpanded.filter(q => q.type !== 'lex'); options?.hooks?.onExpand?.(query, vecExpanded, Date.now() - expandStart); // Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs. // When `embedProvider` is supplied (i-loazq6ze), query encoding is routed // through it; the per-call signature `searchVec(...)` accepts the provider // as the trailing argument so existing tests / callers stay untouched. const queryTexts = [query, ...vecExpanded.map(q => q.query)]; const allResults = new Map(); for (const q of queryTexts) { const vecResults = await store.searchVec(q, DEFAULT_EMBED_MODEL, limit, collection, undefined, undefined, embedProvider); for (const r of vecResults) { const existing = allResults.get(r.filepath); if (!existing || r.score > existing.score) { allResults.set(r.filepath, { file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score, context: store.getContextForFile(r.filepath), docid: r.docid, }); } } } return Array.from(allResults.values()) .sort((a, b) => b.score - a.score) .filter(r => r.score >= minScore) .slice(0, limit); } /** * Structured search: execute pre-expanded queries without LLM query expansion. * * Designed for LLM callers (MCP/HTTP) that generate their own query expansions. * Skips the internal expandQuery() step — goes directly to: * * Pipeline: * 1. Route searches: lex→FTS, vec/hyde→vector (batch embed) * 2. RRF fusion across all result lists * 3. Chunk documents + keyword-best-chunk selection * 4. Rerank on chunks * 5. Position-aware score blending * 6. Dedup, filter, slice * * This is the recommended endpoint for capable LLMs — they can generate * better query variations than our small local model, especially for * domain-specific or nuanced queries. */ export async function structuredSearch(store, searches, options) { const limit = options?.limit ?? 10; const minScore = options?.minScore ?? 0; const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT; const explain = options?.explain ?? false; const intent = options?.intent; const skipRerank = options?.skipRerank ?? false; const hooks = options?.hooks; const embedProvider = options?.embedProvider; const collections = options?.collections; if (searches.length === 0) return []; // Validate queries before executing for (const search of searches) { const location = search.line ? `Line ${search.line}` : 'Structured search'; if (/[\r\n]/.test(search.query)) { throw new Error(`${location} (${search.type}): queries must be single-line. Remove newline characters.`); } if (search.type === 'lex') { const error = validateLexQuery(search.query); if (error) { throw new Error(`${location} (lex): ${error}`); } } else if (search.type === 'vec' || search.type === 'hyde') { const error = validateSemanticQuery(search.query); if (error) { throw new Error(`${location} (${search.type}): ${error}`); } } } const rankedLists = []; const rankedListMeta = []; const docidMap = new Map(); // filepath -> docid const hasVectors = !!store.db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get(); // Helper to run search across collections (or all if undefined) const collectionList = collections ?? [undefined]; // undefined = all collections // Step 1: Run FTS for all lex searches (sync, instant) for (const search of searches) { if (search.type === 'lex') { for (const coll of collectionList) { const ftsResults = store.searchFTS(search.query, 20, coll); if (ftsResults.length > 0) { for (const r of ftsResults) docidMap.set(r.filepath, r.docid); rankedLists.push(ftsResults.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score, }))); rankedListMeta.push({ source: "fts", queryType: "lex", query: search.query, }); } } } } // Step 2: Batch embed and run vector searches for vec/hyde if (hasVectors) { const vecSearches = searches.filter((s) => s.type === 'vec' || s.type === 'hyde'); if (vecSearches.length > 0) { // Route batch encoding through the supplied EmbeddingProvider when // present (i-loazq6ze). Otherwise fall back to the local llama-cpp // singleton — preserves pre-patch behavior for callers that don't // configure a provider. const embedModelName = embedProvider ? embedProvider.getModelId() : getLlm(store).embedModelName; const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, embedModelName)); hooks?.onEmbedStart?.(textsToEmbed.length); const embedStart = Date.now(); const embeddings = embedProvider ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName }) : await getLlm(store).embedBatch(textsToEmbed); hooks?.onEmbedDone?.(Date.now() - embedStart); for (let i = 0; i < vecSearches.length; i++) { const embedding = embeddings[i]?.embedding; if (!embedding) continue; for (const coll of collectionList) { const vecResults = await store.searchVec(vecSearches[i].query, DEFAULT_EMBED_MODEL, 20, coll, undefined, embedding); if (vecResults.length > 0) { for (const r of vecResults) docidMap.set(r.filepath, r.docid); rankedLists.push(vecResults.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score, }))); rankedListMeta.push({ source: "vec", queryType: vecSearches[i].type, query: vecSearches[i].query, }); } } } } } if (rankedLists.length === 0) return []; // Step 3: RRF fusion — first list gets 2x weight (assume caller ordered by importance) const weights = rankedLists.map((_, i) => i === 0 ? 2.0 : 1.0); const fused = reciprocalRankFusion(rankedLists, weights); const rrfTraceByFile = explain ? buildRrfTrace(rankedLists, weights, rankedListMeta) : null; const candidates = fused.slice(0, candidateLimit); if (candidates.length === 0) return []; hooks?.onExpand?.("", [], 0); // Signal no expansion (pre-expanded) // Step 4: Chunk documents, pick best chunk per doc for reranking // Use first lex query as the "query" for keyword matching, or first vec if no lex const primaryQuery = searches.find(s => s.type === 'lex')?.query || searches.find(s => s.type === 'vec')?.query || searches[0]?.query || ""; const queryTerms = primaryQuery.toLowerCase().split(/\s+/).filter(t => t.length > 2); const intentTerms = intent ? extractIntentTerms(intent) : []; const docChunkMap = new Map(); const ssChunkStrategy = options?.chunkStrategy; for (const cand of candidates) { const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, ssChunkStrategy); if (chunks.length === 0) continue; // Pick chunk with most keyword overlap // Intent terms contribute at INTENT_WEIGHT_CHUNK (0.5) relative to query terms (1.0) let bestIdx = 0; let bestScore = -1; for (let i = 0; i < chunks.length; i++) { const chunkLower = chunks[i].text.toLowerCase(); let score = queryTerms.reduce((acc, term) => acc + (chunkLower.includes(term) ? 1 : 0), 0); for (const term of intentTerms) { if (chunkLower.includes(term)) score += INTENT_WEIGHT_CHUNK; } if (score > bestScore) { bestScore = score; bestIdx = i; } } docChunkMap.set(cand.file, { chunks, bestIdx }); } if (skipRerank) { // Skip LLM reranking — return candidates scored by RRF only const seenFiles = new Set(); return candidates .map((cand, i) => { const chunkInfo = docChunkMap.get(cand.file); const bestIdx = chunkInfo?.bestIdx ?? 0; const bestChunk = chunkInfo?.chunks[bestIdx]?.text || cand.body || ""; const bestChunkPos = chunkInfo?.chunks[bestIdx]?.pos || 0; const rrfRank = i + 1; const rrfScore = 1 / rrfRank; const trace = rrfTraceByFile?.get(cand.file); const explainData = explain ? { ftsScores: trace?.contributions.filter(c => c.source === "fts").map(c => c.backendScore) ?? [], vectorScores: trace?.contributions.filter(c => c.source === "vec").map(c => c.backendScore) ?? [], rrf: { rank: rrfRank, positionScore: rrfScore, weight: 1.0, baseScore: trace?.baseScore ?? 0, topRankBonus: trace?.topRankBonus ?? 0, totalScore: trace?.totalScore ?? 0, contributions: trace?.contributions ?? [], }, rerankScore: 0, blendedScore: rrfScore, } : undefined; return { file: cand.file, displayPath: cand.displayPath, title: cand.title, body: cand.body, bestChunk, bestChunkPos, score: rrfScore, context: store.getContextForFile(cand.file), docid: docidMap.get(cand.file) || "", ...(explainData ? { explain: explainData } : {}), }; }) .filter(r => { if (seenFiles.has(r.file)) return false; seenFiles.add(r.file); return true; }) .filter(r => r.score >= minScore) .slice(0, limit); } // Step 5: Rerank chunks const chunksToRerank = []; for (const cand of candidates) { const chunkInfo = docChunkMap.get(cand.file); if (chunkInfo) { chunksToRerank.push({ file: cand.file, text: chunkInfo.chunks[chunkInfo.bestIdx].text }); } } hooks?.onRerankStart?.(chunksToRerank.length); const rerankStart2 = Date.now(); const reranked = await store.rerank(primaryQuery, chunksToRerank, undefined, intent); hooks?.onRerankDone?.(Date.now() - rerankStart2); // Step 6: Blend RRF position score with reranker score const candidateMap = new Map(candidates.map(c => [c.file, { displayPath: c.displayPath, title: c.title, body: c.body, }])); const rrfRankMap = new Map(candidates.map((c, i) => [c.file, i + 1])); const blended = reranked.map(r => { const rrfRank = rrfRankMap.get(r.file) || candidateLimit; let rrfWeight; if (rrfRank <= 3) rrfWeight = 0.75; else if (rrfRank <= 10) rrfWeight = 0.60; else rrfWeight = 0.40; const rrfScore = 1 / rrfRank; const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * r.score; const candidate = candidateMap.get(r.file); const chunkInfo = docChunkMap.get(r.file); const bestIdx = chunkInfo?.bestIdx ?? 0; const bestChunk = chunkInfo?.chunks[bestIdx]?.text || candidate?.body || ""; const bestChunkPos = chunkInfo?.chunks[bestIdx]?.pos || 0; const trace = rrfTraceByFile?.get(r.file); const explainData = explain ? { ftsScores: trace?.contributions.filter(c => c.source === "fts").map(c => c.backendScore) ?? [], vectorScores: trace?.contributions.filter(c => c.source === "vec").map(c => c.backendScore) ?? [], rrf: { rank: rrfRank, positionScore: rrfScore, weight: rrfWeight, baseScore: trace?.baseScore ?? 0, topRankBonus: trace?.topRankBonus ?? 0, totalScore: trace?.totalScore ?? 0, contributions: trace?.contributions ?? [], }, rerankScore: r.score, blendedScore, } : undefined; return { file: r.file, displayPath: candidate?.displayPath || "", title: candidate?.title || "", body: candidate?.body || "", bestChunk, bestChunkPos, score: blendedScore, context: store.getContextForFile(r.file), docid: docidMap.get(r.file) || "", ...(explainData ? { explain: explainData } : {}), }; }).sort((a, b) => b.score - a.score); // Step 7: Dedup by file const seenFiles = new Set(); return blended .filter(r => { if (seenFiles.has(r.file)) return false; seenFiles.add(r.file); return true; }) .filter(r => r.score >= minScore) .slice(0, limit); }