| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221 |
- /**
- * QMD Store - Core data access and retrieval functions
- *
- * This module provides all database operations, search functions, and document
- * retrieval for QMD. It returns raw data structures that can be formatted by
- * CLI or MCP consumers.
- *
- * Usage:
- * const store = createStore("/path/to/db.sqlite");
- * // or use default path:
- * const store = createStore();
- */
- import { Database } from "bun:sqlite";
- import { Glob } from "bun";
- import * as sqliteVec from "sqlite-vec";
- import {
- Ollama,
- getDefaultOllama,
- formatQueryForEmbedding,
- formatDocForEmbedding,
- type RerankDocument,
- } from "./llm";
- // =============================================================================
- // Configuration
- // =============================================================================
/** Home directory read from the HOME environment variable, or "/tmp" when unset. */
const HOME = Bun.env.HOME || "/tmp";

/** Name of the default embedding model. */
export const DEFAULT_EMBED_MODEL = "embeddinggemma";

/** Default model argument of `rerank()`. */
export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";

/** Default model argument of `expandQuery()`. */
export const DEFAULT_QUERY_MODEL = "qwen3:0.6b";

/** Default glob pattern (all Markdown files, recursively). */
export const DEFAULT_GLOB = "**/*.md";

/** Default byte budget for multi-get operations: 10 KB. */
export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024;

/** Base URL of the default Ollama client; re-exported for backwards compatibility. */
export const OLLAMA_URL = getDefaultOllama().getBaseUrl();

/** Chunk size budget: ~2000 tokens per chunk at ~3 bytes/token = 6 KB. */
const CHUNK_BYTE_SIZE = 6 * 1024;
- // =============================================================================
- // Path utilities
- // =============================================================================
/**
 * Returns the home directory captured in the module-level `HOME` constant.
 *
 * @returns The value of `HOME`.
 */
export function homedir(): string {
  const home: string = HOME;
  return home;
}
/**
 * Joins path segments into a normalized absolute path using "/" separators.
 *
 * Segments are applied left to right; a segment starting with "/" discards
 * everything before it. "." segments and empty segments are dropped and ".."
 * removes the preceding segment (it never climbs above the root).
 *
 * When the first segment is relative, or when no segments are given at all,
 * resolution starts from the PWD environment variable, falling back to
 * `process.cwd()`.
 *
 * @param paths - Path segments to combine.
 * @returns Normalized absolute path, always starting with "/".
 */
export function resolve(...paths: string[]): string {
  const first = paths[0];
  // Calling with no arguments used to throw on `paths[0].startsWith`;
  // it now resolves to the working directory.
  let result = first !== undefined && first.startsWith('/')
    ? ''
    : process.env.PWD || process.cwd();

  for (const segment of paths) {
    result = segment.startsWith('/') ? segment : `${result}/${segment}`;
  }

  const normalized: string[] = [];
  for (const part of result.split('/')) {
    if (part === '' || part === '.') continue;
    if (part === '..') normalized.pop();
    else normalized.push(part);
  }
  return '/' + normalized.join('/');
}
/**
 * Builds the default database path `<cache>/qmd/<indexName>.sqlite`, where
 * `<cache>` is XDG_CACHE_HOME or `<home>/.cache`.
 *
 * As a side effect it attempts to create the `qmd` cache directory with
 * `mkdir -p`; any failure of that attempt is ignored.
 *
 * @param indexName - Base name of the index file (default "index").
 * @returns Absolute path of the SQLite file.
 */
export function getDefaultDbPath(indexName: string = "index"): string {
  const cacheRoot = Bun.env.XDG_CACHE_HOME || resolve(homedir(), ".cache");
  const qmdDir = resolve(cacheRoot, "qmd");
  try {
    Bun.spawnSync(["mkdir", "-p", qmdDir]);
  } catch {
    // Best effort only: directory creation problems are not reported here.
  }
  const fileName = `${indexName}.sqlite`;
  return resolve(qmdDir, fileName);
}
/**
 * Returns the current working directory, preferring the PWD environment
 * variable over `process.cwd()`.
 */
export function getPwd(): string {
  const fromEnv = process.env.PWD;
  return fromEnv ? fromEnv : process.cwd();
}
/**
 * Canonicalizes a path by running the external `realpath` command.
 *
 * When the command cannot be run or reports failure, the path is normalized
 * with `resolve()` instead.
 *
 * @param path - Path to canonicalize.
 * @returns Trimmed `realpath` output, or the `resolve()` fallback.
 */
export function getRealPath(path: string): string {
  try {
    const proc = Bun.spawnSync(["realpath", path]);
    if (proc.success) {
      const output = proc.stdout.toString();
      return output.trim();
    }
  } catch {
    // Spawning failed; use the fallback below.
  }
  return resolve(path);
}
- // =============================================================================
- // Database initialization
- // =============================================================================
// On macOS, switch to Homebrew's SQLite build (it supports loadable
// extensions) when the library file is present. Any error is ignored.
if (process.platform === "darwin") {
  const brewSqlite = "/opt/homebrew/opt/sqlite/lib/libsqlite3.dylib";
  try {
    const isPresent = Bun.file(brewSqlite).size > 0;
    if (isPresent) {
      Database.setCustomSQLite(brewSqlite);
    }
  } catch {}
}
/**
 * Prepares an opened database: loads the sqlite-vec extension, switches to
 * WAL journaling, then creates or migrates every table, trigger and index.
 *
 * Statement order matters. The `display_path` migration runs before the index
 * that references that column, and a `content_vectors` table lacking the `seq`
 * column is dropped (together with `vectors_vec`) before being recreated.
 *
 * @param db - Database connection to initialize.
 */
function initializeDatabase(db: Database): void {
  /** Column names of `table`; empty when the table does not exist. */
  const columnsOf = (table: string): string[] => {
    const info = db.prepare(`PRAGMA table_info(${table})`).all() as { name: string }[];
    return info.map(col => col.name);
  };

  sqliteVec.load(db);
  db.exec("PRAGMA journal_mode = WAL");

  // Collections
  db.exec(`
    CREATE TABLE IF NOT EXISTS collections (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      pwd TEXT NOT NULL,
      glob_pattern TEXT NOT NULL,
      created_at TEXT NOT NULL,
      context TEXT,
      UNIQUE(pwd, glob_pattern)
    )
  `);

  // Context descriptions keyed by path prefix
  db.exec(`
    CREATE TABLE IF NOT EXISTS path_contexts (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      path_prefix TEXT NOT NULL UNIQUE,
      context TEXT NOT NULL,
      created_at TEXT NOT NULL
    )
  `);
  db.exec(`CREATE INDEX IF NOT EXISTS idx_path_contexts_prefix ON path_contexts(path_prefix)`);

  // Cache of Ollama API results
  db.exec(`
    CREATE TABLE IF NOT EXISTS ollama_cache (
      hash TEXT PRIMARY KEY,
      result TEXT NOT NULL,
      created_at TEXT NOT NULL
    )
  `);

  // Documents
  db.exec(`
    CREATE TABLE IF NOT EXISTS documents (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      collection_id INTEGER NOT NULL,
      name TEXT NOT NULL,
      title TEXT NOT NULL,
      hash TEXT NOT NULL,
      filepath TEXT NOT NULL,
      display_path TEXT NOT NULL DEFAULT '',
      body TEXT NOT NULL,
      created_at TEXT NOT NULL,
      modified_at TEXT NOT NULL,
      active INTEGER NOT NULL DEFAULT 1,
      FOREIGN KEY (collection_id) REFERENCES collections(id)
    )
  `);

  // Migration: databases created before display_path existed get the column added.
  if (!columnsOf('documents').includes('display_path')) {
    db.exec(`ALTER TABLE documents ADD COLUMN display_path TEXT NOT NULL DEFAULT ''`);
  }
  db.exec(`CREATE UNIQUE INDEX IF NOT EXISTS idx_documents_display_path ON documents(display_path) WHERE display_path != '' AND active = 1`);

  // Content vectors: an existing table without the seq column is discarded.
  const vectorColumns = columnsOf('content_vectors');
  if (vectorColumns.length > 0 && !vectorColumns.includes('seq')) {
    db.exec(`DROP TABLE IF EXISTS content_vectors`);
    db.exec(`DROP TABLE IF EXISTS vectors_vec`);
  }
  db.exec(`
    CREATE TABLE IF NOT EXISTS content_vectors (
      hash TEXT NOT NULL,
      seq INTEGER NOT NULL DEFAULT 0,
      pos INTEGER NOT NULL DEFAULT 0,
      model TEXT NOT NULL,
      embedded_at TEXT NOT NULL,
      PRIMARY KEY (hash, seq)
    )
  `);

  // Full-text index backed by the documents table
  db.exec(`
    CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
      name, body,
      content='documents',
      content_rowid='id',
      tokenize='porter unicode61'
    )
  `);

  // Triggers that mirror inserts, deletes and updates into the FTS index
  const ftsTriggers = [
    `
    CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents BEGIN
      INSERT INTO documents_fts(rowid, name, body) VALUES (new.id, new.name, new.body);
    END
  `,
    `
    CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
      INSERT INTO documents_fts(documents_fts, rowid, name, body) VALUES('delete', old.id, old.name, old.body);
    END
  `,
    `
    CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents BEGIN
      INSERT INTO documents_fts(documents_fts, rowid, name, body) VALUES('delete', old.id, old.name, old.body);
      INSERT INTO documents_fts(rowid, name, body) VALUES (new.id, new.name, new.body);
    END
  `,
  ];
  for (const trigger of ftsTriggers) {
    db.exec(trigger);
  }

  // Lookup indexes on documents
  const documentIndexes = [
    `CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents(collection_id, active)`,
    `CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(hash)`,
    `CREATE INDEX IF NOT EXISTS idx_documents_filepath ON documents(filepath, active)`,
    `CREATE UNIQUE INDEX IF NOT EXISTS idx_documents_filepath_active ON documents(filepath) WHERE active = 1`,
  ];
  for (const index of documentIndexes) {
    db.exec(index);
  }
}
/**
 * Makes sure the `vectors_vec` virtual table exists with the requested
 * embedding width and a `hash_seq` key.
 *
 * The stored CREATE statement is inspected; when its `float[N]` width differs
 * from `dimensions` or it does not mention `hash_seq`, the table is dropped
 * and recreated (its previous contents are lost).
 *
 * @param db - Database connection.
 * @param dimensions - Number of floats per embedding.
 */
function ensureVecTableInternal(db: Database, dimensions: number): void {
  const existing = db
    .prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`)
    .get() as { sql: string } | null;

  if (existing) {
    const widthMatch = existing.sql.match(/float\[(\d+)\]/);
    const keyedByHashSeq = existing.sql.includes('hash_seq');
    const sameWidth = widthMatch !== null && parseInt(widthMatch[1]) === dimensions;
    if (sameWidth && keyedByHashSeq) {
      return;
    }
    db.exec("DROP TABLE IF EXISTS vectors_vec");
  }

  db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}])`);
}
- // =============================================================================
- // Store Factory
- // =============================================================================
/**
 * Handle returned by `createStore()`: the open database plus the module's
 * operations pre-bound to that database.
 */
export type Store = {
  /** Underlying database connection. */
  db: Database;
  /** Path the database was opened from. */
  dbPath: string;
  /** Closes the database connection. */
  close: () => void;
  /** Creates or recreates the vector table for the given embedding width. */
  ensureVecTable: (dimensions: number) => void;

  // --- Index health ---
  getHashesNeedingEmbedding: () => number;
  getIndexHealth: () => IndexHealthInfo;
  getStatus: () => IndexStatus;

  // --- Caching ---
  getCacheKey: typeof getCacheKey;
  getCachedResult: (cacheKey: string) => string | null;
  setCachedResult: (cacheKey: string, result: string) => void;
  clearCache: () => void;

  // --- Context ---
  getContextForFile: (filepath: string) => string | null;
  getCollectionIdByName: (name: string) => number | null;

  // --- Search ---
  searchFTS: (query: string, limit?: number, collectionId?: number) => SearchResult[];
  searchVec: (query: string, model: string, limit?: number, collectionId?: number) => Promise<SearchResult[]>;

  // --- Query expansion & reranking ---
  expandQuery: (query: string, model?: string) => Promise<string[]>;
  rerank: (query: string, documents: { file: string; text: string }[], model?: string) => Promise<{ file: string; score: number }[]>;

  // --- Document retrieval ---
  findDocument: (filename: string, options?: { includeBody?: boolean }) => DocumentResult | DocumentNotFound;
  getDocumentBody: (doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number) => string | null;
  findDocuments: (pattern: string, options?: { includeBody?: boolean; maxBytes?: number }) => { docs: MultiGetResult[]; errors: string[] };

  // --- Legacy compatibility ---
  getDocument: (filename: string, fromLine?: number, maxLines?: number) => (DocumentResult & { body: string }) | DocumentNotFound;
  getMultipleDocuments: (pattern: string, maxLines?: number, maxBytes?: number) => { files: MultiGetFile[]; errors: string[] };

  // --- Fuzzy matching ---
  findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => string[];
  matchFilesByGlob: (pattern: string) => { filepath: string; displayPath: string; bodyLength: number }[];
};
/**
 * Opens (and initializes) a database and returns a store whose methods are
 * all bound to it.
 *
 * @param dbPath - SQLite file to open; when omitted or empty, the path from
 *   `getDefaultDbPath()` is used.
 * @returns Store instance wrapping the opened database.
 */
export function createStore(dbPath?: string): Store {
  const resolvedPath = dbPath || getDefaultDbPath();
  const db = new Database(resolvedPath);
  initializeDatabase(db);

  // Parameter types are supplied contextually by the Store annotation.
  const store: Store = {
    db,
    dbPath: resolvedPath,
    close: () => db.close(),
    ensureVecTable: dimensions => ensureVecTableInternal(db, dimensions),

    // Index health
    getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db),
    getIndexHealth: () => getIndexHealth(db),
    getStatus: () => getStatus(db),

    // Caching
    getCacheKey,
    getCachedResult: cacheKey => getCachedResult(db, cacheKey),
    setCachedResult: (cacheKey, result) => setCachedResult(db, cacheKey, result),
    clearCache: () => clearCache(db),

    // Context
    getContextForFile: filepath => getContextForFile(db, filepath),
    getCollectionIdByName: name => getCollectionIdByName(db, name),

    // Search
    searchFTS: (query, limit, collectionId) => searchFTS(db, query, limit, collectionId),
    searchVec: (query, model, limit, collectionId) => searchVec(db, query, model, limit, collectionId),

    // Query expansion & reranking
    expandQuery: (query, model) => expandQuery(query, model, db),
    rerank: (query, documents, model) => rerank(query, documents, model, db),

    // Document retrieval
    findDocument: (filename, options) => findDocument(db, filename, options),
    getDocumentBody: (doc, fromLine, maxLines) => getDocumentBody(db, doc, fromLine, maxLines),
    findDocuments: (pattern, options) => findDocuments(db, pattern, options),

    // Legacy compatibility
    getDocument: (filename, fromLine, maxLines) => getDocument(db, filename, fromLine, maxLines),
    getMultipleDocuments: (pattern, maxLines, maxBytes) => getMultipleDocuments(db, pattern, maxLines, maxBytes),

    // Fuzzy matching
    findSimilarFiles: (query, maxDistance, limit) => findSimilarFiles(db, query, maxDistance, limit),
    matchFilesByGlob: pattern => matchFilesByGlob(db, pattern),
  };
  return store;
}
- // =============================================================================
- // Legacy compatibility - will be removed
- // =============================================================================
/** Lazily created connection shared by the deprecated module-level API. */
let _legacyDb: Database | null = null;
/** Database path override for the deprecated API; null means "use the default". */
let _legacyDbPath: string | null = null;

/**
 * Selects which named index the deprecated API opens.
 *
 * The cached connection reference is cleared so the next `getDb()` call opens
 * a new one; the previous connection, if any, is not closed here.
 *
 * @param name - Index name, or null (or empty) to revert to the default path.
 * @deprecated Use createStore() instead
 */
export function setCustomIndexName(name: string | null): void {
  if (name) {
    _legacyDbPath = getDefaultDbPath(name);
  } else {
    _legacyDbPath = null;
  }
  _legacyDb = null;
}
/**
 * Returns the database path used by the deprecated API: the override set via
 * `setCustomIndexName()`, or the default path when none is set.
 *
 * @deprecated Use createStore() instead
 */
export function getDbPath(): string {
  const override = _legacyDbPath;
  return override ? override : getDefaultDbPath();
}
/**
 * Returns the shared legacy connection, opening and initializing it on the
 * first call (or on the first call after `setCustomIndexName()`).
 *
 * @deprecated Use createStore() instead
 */
export function getDb(): Database {
  if (_legacyDb) {
    return _legacyDb;
  }
  // Stored before initialization, so the reference is kept even if setup throws.
  _legacyDb = new Database(getDbPath());
  initializeDatabase(_legacyDb);
  return _legacyDb;
}
/**
 * Thin exported wrapper around `ensureVecTableInternal()`.
 *
 * @param db - Database connection.
 * @param dimensions - Number of floats per embedding.
 * @deprecated Use store.ensureVecTable() instead
 */
export function ensureVecTable(db: Database, dimensions: number): void {
  return ensureVecTableInternal(db, dimensions);
}
- // =============================================================================
- // Core Document Type
- // =============================================================================
/**
 * Document record with all of its metadata.
 *
 * `body` is optional; it can be loaded separately with getDocumentBody().
 */
export type DocumentResult = {
  /** Full filesystem path. */
  filepath: string;
  /** Short display path (e.g. "docs/readme.md"). */
  displayPath: string;
  /** Document title (from the first heading or the filename). */
  title: string;
  /** Folder context description, if one is configured. */
  context: string | null;
  /** Content hash used for caching and change detection. */
  hash: string;
  /** ID of the parent collection. */
  collectionId: number;
  /** Last modification timestamp. */
  modifiedAt: string;
  /** Body length, available without loading the body. */
  bodyLength: number;
  /** Document body, when it has been loaded. */
  body?: string;
};
/**
 * A DocumentResult annotated with its relevance score and the search backend
 * that produced it.
 */
export type SearchResult = DocumentResult & {
  /** Relevance score (0-1). */
  score: number;
  /** Search source: full-text ("fts") or vector ("vec"). */
  source: "fts" | "vec";
  /** Character position of the matching chunk (vector search only). */
  chunkPos?: number;
};
/**
 * Simplified result shape consumed and produced by reciprocal rank fusion.
 */
export type RankedResult = {
  /** Key identifying the document across result lists. */
  file: string;
  displayPath: string;
  title: string;
  body: string;
  score: number;
};
/**
 * Returned in place of a document when a lookup finds nothing.
 */
export type DocumentNotFound = {
  /** Discriminant marking this value as a failed lookup. */
  error: "not_found";
  /** The lookup string that produced no match. */
  query: string;
  /** Suggested alternative files. */
  similarFiles: string[];
};
/**
 * One entry of a multi-get operation, discriminated by `skipped`: either a
 * full document, or just the document's paths plus the reason it was skipped.
 */
export type MultiGetResult =
  | {
      doc: DocumentResult;
      skipped: false;
    }
  | {
      doc: Pick<DocumentResult, "filepath" | "displayPath">;
      skipped: true;
      skipReason: string;
    };
/**
 * Summary of one collection, as listed in IndexStatus.
 */
export type CollectionInfo = {
  id: number;
  path: string;
  pattern: string;
  /** Document count for the collection. */
  documents: number;
  lastUpdated: string;
};
/**
 * Index-wide status: overall counters plus a per-collection summary.
 */
export type IndexStatus = {
  totalDocuments: number;
  needsEmbedding: number;
  hasVectorIndex: boolean;
  collections: CollectionInfo[];
};
- // =============================================================================
- // Index health
- // =============================================================================
/**
 * Counts the distinct content hashes of active documents that have no
 * `content_vectors` row with `seq = 0`.
 *
 * @param db - Database connection.
 * @returns Number of distinct hashes without such a row.
 */
export function getHashesNeedingEmbedding(db: Database): number {
  const statement = db.prepare(`
    SELECT COUNT(DISTINCT d.hash) as count
    FROM documents d
    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
    WHERE d.active = 1 AND v.hash IS NULL
  `);
  const { count } = statement.get() as { count: number };
  return count;
}
/**
 * Health summary produced by getIndexHealth().
 */
export type IndexHealthInfo = {
  /** Distinct content hashes still lacking an embedding. */
  needsEmbedding: number;
  /** Number of active documents. */
  totalDocs: number;
  /** Whole days since the newest `modified_at`; null when there is none. */
  daysStale: number | null;
};
/**
 * Gathers index health figures: pending embeddings, the active document count
 * and how many whole days have passed since the newest `modified_at`.
 *
 * @param db - Database connection.
 * @returns Health summary; `daysStale` is null when no active document has a
 *   `modified_at` value.
 */
export function getIndexHealth(db: Database): IndexHealthInfo {
  const needsEmbedding = getHashesNeedingEmbedding(db);

  const countRow = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number };
  const totalDocs = countRow.count;

  const newest = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };

  const latest = newest?.latest;
  if (!latest) {
    return { needsEmbedding, totalDocs, daysStale: null };
  }

  const elapsedMs = Date.now() - new Date(latest).getTime();
  const daysStale = Math.floor(elapsedMs / (24 * 60 * 60 * 1000));
  return { needsEmbedding, totalDocs, daysStale };
}
- // =============================================================================
- // Caching
- // =============================================================================
/**
 * Derives a cache key: the SHA-256 hex digest of `url` followed by the JSON
 * serialization of `body`.
 *
 * @param url - First component of the key.
 * @param body - Object serialized with JSON.stringify; property order affects the key.
 * @returns Hex digest string.
 */
export function getCacheKey(url: string, body: object): string {
  const serializedBody = JSON.stringify(body);
  const hasher = new Bun.CryptoHasher("sha256");
  hasher.update(url);
  hasher.update(serializedBody);
  return hasher.digest("hex");
}
/**
 * Looks up a cached result in `ollama_cache`.
 *
 * @param db - Database connection.
 * @param cacheKey - Key as produced by getCacheKey().
 * @returns The stored result, or null when no row exists or the stored
 *   result is an empty string.
 */
export function getCachedResult(db: Database, cacheKey: string): string | null {
  const lookup = db.prepare(`SELECT result FROM ollama_cache WHERE hash = ?`);
  const row = lookup.get(cacheKey) as { result: string } | null;
  if (!row || !row.result) {
    return null;
  }
  return row.result;
}
/**
 * Stores (or overwrites) a cache entry stamped with the current time.
 *
 * On roughly 1% of calls, chosen at random, the cache is also pruned down to
 * the 1000 most recently created entries.
 *
 * @param db - Database connection.
 * @param cacheKey - Key as produced by getCacheKey().
 * @param result - Value to store.
 */
export function setCachedResult(db: Database, cacheKey: string, result: string): void {
  const createdAt = new Date().toISOString();
  const upsert = db.prepare(`INSERT OR REPLACE INTO ollama_cache (hash, result, created_at) VALUES (?, ?, ?)`);
  upsert.run(cacheKey, result, createdAt);

  const shouldPrune = Math.random() < 0.01;
  if (!shouldPrune) return;
  db.exec(`DELETE FROM ollama_cache WHERE hash NOT IN (SELECT hash FROM ollama_cache ORDER BY created_at DESC LIMIT 1000)`);
}
/**
 * Deletes every row from `ollama_cache`.
 *
 * @param db - Database connection.
 */
export function clearCache(db: Database): void {
  const wipeAll = `DELETE FROM ollama_cache`;
  db.exec(wipeAll);
}
- // =============================================================================
- // Document helpers
- // =============================================================================
/**
 * Computes the SHA-256 hex digest of a document's content.
 *
 * The work is synchronous; the function is `async` only so that callers
 * receive a Promise.
 *
 * @param content - Text to hash.
 * @returns Promise resolving to the hex digest.
 */
export async function hashContent(content: string): Promise<string> {
  const hasher = new Bun.CryptoHasher("sha256");
  hasher.update(content);
  const digest = hasher.digest("hex");
  return digest;
}
/**
 * Derives a document title from Markdown content.
 *
 * The first level-1 or level-2 heading is used. If that heading is the
 * generic "Notes" / "📝 Notes", the next level-2 heading *after it* is used
 * instead, when one exists. Without any heading, the title is the filename's
 * last path segment with a trailing ".md" removed.
 *
 * @param content - Markdown text of the document.
 * @param filename - Filename used as the fallback title.
 * @returns The extracted title.
 */
export function extractTitle(content: string, filename: string): string {
  const match = content.match(/^##?\s+(.+)$/m);
  if (!match) {
    return filename.replace(/\.md$/, "").split("/").pop() || filename;
  }

  const title = match[1].trim();
  if (title === "📝 Notes" || title === "Notes") {
    // Search only the text after the matched heading. Scanning from the top
    // would re-match a level-2 "Notes" heading and return "Notes" again.
    const rest = content.slice((match.index ?? 0) + match[0].length);
    const nextMatch = rest.match(/^##\s+(.+)$/m);
    if (nextMatch) return nextMatch[1].trim();
  }
  return title;
}
- // Re-export from llm.ts for backwards compatibility
- export { formatQueryForEmbedding, formatDocForEmbedding };
/**
 * Splits a document into chunks of at most `maxBytes` UTF-8 bytes each.
 *
 * Content that already fits is returned as a single chunk. Otherwise each
 * chunk is filled up to the byte budget and then, if possible, shortened to
 * end at a natural boundary, preferring (in order) a paragraph break or a
 * sentence end in the second half of the chunk, then a line break or a space
 * past the first 30%.
 *
 * Sizes are measured per Unicode code point, so characters outside the BMP
 * (e.g. emoji) count as 4 bytes and a surrogate pair is never split across
 * two chunks. If a single code point exceeds `maxBytes`, it is emitted alone
 * so that progress is always made.
 *
 * @param content - Text to split.
 * @param maxBytes - Byte budget per chunk (defaults to CHUNK_BYTE_SIZE).
 * @returns Chunks in document order; `pos` is the chunk's starting index in
 *   `content` (UTF-16 code units, as used by `String.prototype.slice`).
 */
export function chunkDocument(content: string, maxBytes: number = CHUNK_BYTE_SIZE): { text: string; pos: number }[] {
  const totalBytes = new TextEncoder().encode(content).length;
  if (totalBytes <= maxBytes) {
    return [{ text: content, pos: 0 }];
  }

  // UTF-8 length of one code point. A lone surrogate is encoded as U+FFFD,
  // which is 3 bytes, the same as every other code point below 0x10000
  // that needs three bytes.
  const utf8Length = (codePoint: number): number => {
    if (codePoint < 0x80) return 1;
    if (codePoint < 0x800) return 2;
    if (codePoint < 0x10000) return 3;
    return 4;
  };
  // Number of UTF-16 code units occupied by the code point at `index`.
  const unitsAt = (index: number): number =>
    (content.codePointAt(index) ?? 0) > 0xffff ? 2 : 1;

  const chunks: { text: string; pos: number }[] = [];
  let charPos = 0;
  while (charPos < content.length) {
    // Greedily take whole code points while they fit in the byte budget.
    let endPos = charPos;
    let byteCount = 0;
    while (endPos < content.length) {
      const charBytes = utf8Length(content.codePointAt(endPos) ?? 0);
      if (byteCount + charBytes > maxBytes) break;
      byteCount += charBytes;
      endPos += unitsAt(endPos);
    }

    // Not the final chunk: try to back up to a natural boundary.
    if (endPos < content.length && endPos > charPos) {
      const slice = content.slice(charPos, endPos);
      const paragraphBreak = slice.lastIndexOf('\n\n');
      const sentenceEnd = Math.max(
        slice.lastIndexOf('. '),
        slice.lastIndexOf('.\n'),
        slice.lastIndexOf('? '),
        slice.lastIndexOf('?\n'),
        slice.lastIndexOf('! '),
        slice.lastIndexOf('!\n')
      );
      const lineBreak = slice.lastIndexOf('\n');
      const spaceBreak = slice.lastIndexOf(' ');

      let breakPoint = -1;
      if (paragraphBreak > slice.length * 0.5) {
        breakPoint = paragraphBreak + 2;
      } else if (sentenceEnd > slice.length * 0.5) {
        breakPoint = sentenceEnd + 2;
      } else if (lineBreak > slice.length * 0.3) {
        breakPoint = lineBreak + 1;
      } else if (spaceBreak > slice.length * 0.3) {
        breakPoint = spaceBreak + 1;
      }
      if (breakPoint > 0) {
        endPos = charPos + breakPoint;
      }
    }

    // Nothing fit within the budget: emit one whole code point anyway.
    if (endPos <= charPos) {
      endPos = charPos + unitsAt(charPos);
    }

    chunks.push({ text: content.slice(charPos, endPos), pos: charPos });
    charPos = endPos;
  }
  return chunks;
}
- // =============================================================================
- // Fuzzy matching
- // =============================================================================
/**
 * Computes the Levenshtein edit distance between two strings, comparing
 * UTF-16 code units, with unit cost for insertion, deletion and substitution.
 *
 * @param a - First string.
 * @param b - Second string.
 * @returns Minimum number of single-unit edits turning `a` into `b`.
 */
function levenshtein(a: string, b: string): number {
  const rows = a.length;
  const cols = b.length;
  if (rows === 0) return cols;
  if (cols === 0) return rows;

  // Only the previous row of the DP table is needed to compute the next one.
  let previous: number[] = Array.from({ length: cols + 1 }, (_, j) => j);
  for (let i = 1; i <= rows; i++) {
    const current: number[] = [i];
    for (let j = 1; j <= cols; j++) {
      const substitution = previous[j - 1] + (a[i - 1] === b[j - 1] ? 0 : 1);
      const deletion = previous[j] + 1;
      const insertion = current[j - 1] + 1;
      current[j] = Math.min(deletion, insertion, substitution);
    }
    previous = current;
  }
  return previous[cols];
}
/**
 * Suggests active documents whose display path is within a given edit
 * distance of `query` (case-insensitive), closest first.
 *
 * @param db - Database connection.
 * @param query - Path the caller was looking for.
 * @param maxDistance - Largest Levenshtein distance to accept (default 3).
 * @param limit - Maximum number of suggestions (default 5).
 * @returns Display paths ordered by ascending distance.
 */
export function findSimilarFiles(db: Database, query: string, maxDistance: number = 3, limit: number = 5): string[] {
  const rows = db.prepare(`SELECT display_path FROM documents WHERE active = 1`).all() as { display_path: string }[];
  const needle = query.toLowerCase();

  const candidates: { path: string; dist: number }[] = [];
  for (const { display_path } of rows) {
    const dist = levenshtein(display_path.toLowerCase(), needle);
    if (dist <= maxDistance) {
      candidates.push({ path: display_path, dist });
    }
  }

  candidates.sort((x, y) => x.dist - y.dist);
  return candidates.slice(0, limit).map(candidate => candidate.path);
}
/**
 * Lists active documents whose display path matches a glob pattern.
 *
 * @param db - Database connection.
 * @param pattern - Glob pattern tested against each `display_path`.
 * @returns Matching documents with their paths and SQL `LENGTH(body)`.
 */
export function matchFilesByGlob(db: Database, pattern: string): { filepath: string; displayPath: string; bodyLength: number }[] {
  const rows = db.prepare(`SELECT filepath, display_path, LENGTH(body) as body_length FROM documents WHERE active = 1`).all() as { filepath: string; display_path: string; body_length: number }[];
  const matcher = new Glob(pattern);

  const matches: { filepath: string; displayPath: string; bodyLength: number }[] = [];
  for (const row of rows) {
    if (!matcher.match(row.display_path)) continue;
    matches.push({
      filepath: row.filepath,
      displayPath: row.display_path,
      bodyLength: row.body_length,
    });
  }
  return matches;
}
- // =============================================================================
- // Context
- // =============================================================================
/**
 * Finds the context description for a file: among the `path_contexts` rows
 * whose prefix matches the start of `filepath` (via SQL LIKE), the one with
 * the longest prefix wins.
 *
 * @param db - Database connection.
 * @param filepath - Path to look up.
 * @returns The context text, or null when nothing matches or the stored
 *   context is empty.
 */
export function getContextForFile(db: Database, filepath: string): string | null {
  const lookup = db.prepare(`
    SELECT context FROM path_contexts
    WHERE ? LIKE path_prefix || '%'
    ORDER BY LENGTH(path_prefix) DESC
    LIMIT 1
  `);
  const row = lookup.get(filepath) as { context: string } | null;
  if (!row || !row.context) {
    return null;
  }
  return row.context;
}
/**
 * Finds the collection whose `pwd` ends with `name`, preferring the longest
 * `pwd` when several match.
 *
 * `name` is matched literally: the LIKE wildcards `%` and `_` (and the escape
 * character itself) are escaped, so a name such as "my_notes" no longer also
 * matches "myXnotes".
 *
 * @param db - Database connection.
 * @param name - Trailing portion of the collection directory to match.
 * @returns The collection id, or null when no collection matches.
 */
export function getCollectionIdByName(db: Database, name: string): number | null {
  const escapedName = name.replace(/[\\%_]/g, ch => `\\${ch}`);
  const row = db
    .prepare(`SELECT id FROM collections WHERE pwd LIKE ? ESCAPE '\\' ORDER BY LENGTH(pwd) DESC LIMIT 1`)
    .get(`%${escapedName}`) as { id: number } | null;
  return row?.id ?? null;
}
- // =============================================================================
- // FTS Search
- // =============================================================================
/**
 * Normalizes one search term for use inside an FTS5 query: every character
 * that is not a Unicode letter, a Unicode number or an apostrophe is removed,
 * and the remainder is lower-cased.
 *
 * @param term - Raw term.
 * @returns Cleaned term, possibly empty.
 */
function sanitizeFTS5Term(term: string): string {
  const disallowed = /[^\p{L}\p{N}']/gu;
  const stripped = term.replace(disallowed, '');
  return stripped.toLowerCase();
}
/**
 * Converts free-form user input into an FTS5 MATCH expression in which every
 * sanitized term is a quoted prefix match (`"term"*`) and all terms are
 * combined with AND.
 *
 * @param query - Raw user query.
 * @returns The MATCH expression, or null when no usable term remains.
 */
function buildFTS5Query(query: string): string | null {
  const terms: string[] = [];
  for (const raw of query.split(/\s+/)) {
    const cleaned = sanitizeFTS5Term(raw);
    if (cleaned.length > 0) {
      terms.push(cleaned);
    }
  }
  if (terms.length === 0) return null;
  // A single term joins to just `"term"*`, so no special case is needed.
  return terms.map(term => `"${term}"*`).join(' AND ');
}
/**
 * Runs a full-text (BM25) search over active documents.
 *
 * Scores are the absolute BM25 values divided by the largest absolute value
 * among the returned rows, so the top hit scores 1.
 *
 * NOTE(review): the returned objects carry `file` (not `filepath`) and omit
 * several DocumentResult fields declared on SearchResult — confirm callers
 * before changing either side.
 *
 * @param db - Database connection.
 * @param query - Raw user query; sanitized via buildFTS5Query().
 * @param limit - Maximum number of rows (default 20).
 * @param collectionId - When given, restricts results to that collection.
 * @returns Matches in the order returned by SQLite (ascending raw BM25 score);
 *   empty when the query has no usable terms.
 */
export function searchFTS(db: Database, query: string, limit: number = 20, collectionId?: number): SearchResult[] {
  const ftsQuery = buildFTS5Query(query);
  if (!ftsQuery) return [];

  const scoped = collectionId !== undefined;
  const sql = `
    SELECT d.filepath, d.display_path, d.title, d.body, bm25(documents_fts, 10.0, 1.0) as score
    FROM documents_fts f
    JOIN documents d ON d.id = f.rowid
    WHERE documents_fts MATCH ? AND d.active = 1
  ` + (scoped ? ` AND d.collection_id = ?` : '') + ` ORDER BY score LIMIT ?`;

  const params: (string | number)[] = scoped
    ? [ftsQuery, collectionId as number, limit]
    : [ftsQuery, limit];

  const rows = db.prepare(sql).all(...params) as { filepath: string; display_path: string; title: string; body: string; score: number }[];

  let maxScore = 1;
  if (rows.length > 0) {
    maxScore = rows.reduce((best, r) => Math.max(best, Math.abs(r.score)), -Infinity);
  }

  return rows.map(row => ({
    file: row.filepath,
    displayPath: row.display_path,
    title: row.title,
    body: row.body,
    score: Math.abs(row.score) / maxScore,
    source: "fts" as const,
  }));
}
- // =============================================================================
- // Vector Search
- // =============================================================================
/**
 * Runs a vector similarity (KNN) search over embedded document chunks.
 *
 * The query is embedded with `model`, the `limit * 3` nearest chunks are
 * requested from `vectors_vec`, and the closest chunk per file is kept.
 * The score is `1 / (1 + distance)`.
 *
 * NOTE(review): the returned objects carry `file` (not `filepath`) and omit
 * several DocumentResult fields declared on SearchResult — confirm callers
 * before changing either side.
 *
 * @param db - Database connection.
 * @param query - Text to embed and search for.
 * @param model - Embedding model name.
 * @param limit - Maximum number of files to return (default 20).
 * @param collectionId - When given, restricts results to that collection.
 * @returns Best match per file, nearest first; empty when the vector table is
 *   missing or no embedding could be obtained.
 */
export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionId?: number): Promise<SearchResult[]> {
  const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
  if (!tableExists) return [];

  const embedding = await getEmbedding(query, model, true);
  if (!embedding) return [];

  // sqlite-vec requires "k = ?" for KNN queries
  let sql = `
    SELECT v.hash_seq, v.distance, d.filepath, d.display_path, d.title, d.body, cv.pos
    FROM vectors_vec v
    JOIN content_vectors cv ON cv.hash || '_' || cv.seq = v.hash_seq
    JOIN documents d ON d.hash = cv.hash AND d.active = 1
    WHERE v.embedding MATCH ? AND k = ?
  `;
  const params: (Float32Array | number)[] = [new Float32Array(embedding), limit * 3];
  if (collectionId !== undefined) {
    // Bound as a parameter (as searchFTS does) rather than interpolated into the SQL text.
    sql += ` AND d.collection_id = ?`;
    params.push(collectionId);
  }
  sql += ` ORDER BY v.distance`;

  const rows = db.prepare(sql).all(...params) as { hash_seq: string; distance: number; filepath: string; display_path: string; title: string; body: string; pos: number }[];

  // Several chunks of one file may match; keep only the nearest chunk per file.
  const seen = new Map<string, { row: typeof rows[0]; bestDist: number }>();
  for (const row of rows) {
    const existing = seen.get(row.filepath);
    if (!existing || row.distance < existing.bestDist) {
      seen.set(row.filepath, { row, bestDist: row.distance });
    }
  }

  return Array.from(seen.values())
    .sort((a, b) => a.bestDist - b.bestDist)
    .slice(0, limit)
    .map(({ row }) => ({
      file: row.filepath,
      displayPath: row.display_path,
      title: row.title,
      body: row.body,
      score: 1 / (1 + row.distance),
      source: "vec" as const,
      chunkPos: row.pos,
    }));
}
- // =============================================================================
- // Embeddings
- // =============================================================================
/**
 * Requests an embedding for `text` from the default Ollama client.
 *
 * @param text - Text to embed.
 * @param model - Embedding model name.
 * @param isQuery - Passed through to the client's `embed` options.
 * @returns The embedding vector, or null when the client returns none.
 */
async function getEmbedding(text: string, model: string, isQuery: boolean): Promise<number[] | null> {
  const response = await getDefaultOllama().embed(text, { model, isQuery });
  const vector = response?.embedding;
  return vector ? vector : null;
}
- // =============================================================================
- // Query expansion
- // =============================================================================
/**
 * Expands a query into alternative phrasings via the default Ollama client,
 * caching the alternatives in the database.
 *
 * @param query - Original user query.
 * @param model - Model used for expansion (default DEFAULT_QUERY_MODEL).
 * @param db - Database holding the cache.
 * @returns On a cache hit, the original query followed by up to two cached
 *   alternatives; otherwise whatever the client returned.
 */
export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database): Promise<string[]> {
  const cacheKey = getCacheKey("expandQuery", { query, model });
  const cached = getCachedResult(db, cacheKey);

  if (!cached) {
    const expanded = await getDefaultOllama().expandQuery(query, model, 2);
    // Only the alternatives (everything after the first entry) are cached.
    if (expanded.length > 1) {
      setCachedResult(db, cacheKey, expanded.slice(1).join('\n'));
    }
    return expanded;
  }

  const alternatives = cached
    .split('\n')
    .map(line => line.trim())
    .filter(line => line.length > 0)
    .slice(0, 2);
  return [query, ...alternatives];
}
- // =============================================================================
- // Reranking
- // =============================================================================
/**
 * Scores documents against a query with the reranker, using cached scores
 * where available and caching newly computed ones.
 *
 * Cache keys depend on the query, the file and the model, but not on the
 * document text.
 *
 * @param query - Query to score against.
 * @param documents - Candidate documents (`file` identifies each one).
 * @param model - Reranker model (default DEFAULT_RERANK_MODEL).
 * @param db - Database holding the cache.
 * @returns One entry per input document, highest score first; documents with
 *   no score get 0.
 */
export async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db: Database): Promise<{ file: string; score: number }[]> {
  const scoreByFile = new Map<string, number>();
  const pending: RerankDocument[] = [];
  const keyFor = (file: string): string => getCacheKey("rerank", { query, file, model });

  // Split the input into cached scores and documents still to be scored.
  for (const { file, text } of documents) {
    const hit = getCachedResult(db, keyFor(file));
    if (hit === null) {
      pending.push({ file, text });
    } else {
      scoreByFile.set(file, parseFloat(hit));
    }
  }

  // Score the remainder with Ollama and remember the results.
  if (pending.length > 0) {
    const { results } = await getDefaultOllama().rerank(query, pending, { model });
    for (const { file, score } of results) {
      setCachedResult(db, keyFor(file), score.toString());
      scoreByFile.set(file, score);
    }
  }

  const scored = documents.map(doc => ({ file: doc.file, score: scoreByFile.get(doc.file) || 0 }));
  scored.sort((x, y) => y.score - x.score);
  return scored;
}
- // =============================================================================
- // Reciprocal Rank Fusion
- // =============================================================================
/**
 * Merge several ranked result lists into one using Reciprocal Rank Fusion.
 *
 * A result at 0-based position `rank` in list `i` contributes
 * `weights[i] / (k + rank + 1)` to the fused score of its `file`. After
 * summing, a bonus is added based on the best position the file reached in
 * any list: +0.05 for position 0, +0.02 for positions 1 or 2.
 *
 * @param resultLists - Ranked lists, best result first in each.
 * @param weights - Per-list weights; a missing entry defaults to 1.0.
 * @param k - RRF damping constant.
 * @returns One entry per distinct `file` (the first occurrence seen, with
 *   `score` replaced by the fused score), sorted by fused score descending.
 */
export function reciprocalRankFusion(
  resultLists: RankedResult[][],
  weights: number[] = [],
  k: number = 60
): RankedResult[] {
  type Fused = { result: RankedResult; rrfScore: number; topRank: number };
  const fused = new Map<string, Fused>();

  for (const [listIdx, list] of resultLists.entries()) {
    const weight = weights[listIdx] ?? 1.0;
    for (const [rank, result] of list.entries()) {
      const contribution = weight / (k + rank + 1);
      const entry = fused.get(result.file);
      if (!entry) {
        fused.set(result.file, { result, rrfScore: contribution, topRank: rank });
        continue;
      }
      entry.rrfScore += contribution;
      if (rank < entry.topRank) entry.topRank = rank;
    }
  }

  // Top-rank bonus: reward files that reached the first three positions
  // of at least one list.
  for (const entry of fused.values()) {
    if (entry.topRank > 2) continue;
    entry.rrfScore += entry.topRank === 0 ? 0.05 : 0.02;
  }

  return [...fused.values()]
    .sort((a, b) => b.rrfScore - a.rrfScore)
    .map(({ result, rrfScore }) => ({ ...result, score: rrfScore }));
}
- // =============================================================================
- // Document retrieval
- // =============================================================================
/**
 * Raw row shape returned by the `documents` table queries in this section.
 * Column names are kept in the database's snake_case form.
 */
type DbDocRow = {
  /** Value of the `filepath` column; used as the lookup key by other queries. */
  filepath: string;
  /** Value of the `display_path` column. */
  display_path: string;
  /** Value of the `title` column. */
  title: string;
  /** Value of the `hash` column. */
  hash: string;
  /** Value of the `collection_id` column. */
  collection_id: number;
  /** Value of the `modified_at` column. */
  modified_at: string;
  /** Result of `LENGTH(body)` as selected by the queries. */
  body_length: number;
  /** Document body; only present when the query selected the `body` column. */
  body?: string;
};
/**
 * Look up a single active document by file name or path.
 *
 * A trailing `:<digits>` suffix is dropped and a leading `~/` is expanded
 * to the home directory before matching. Lookup strategies are tried in
 * order, stopping at the first hit:
 *   1. exact `filepath`
 *   2. exact `display_path`
 *   3. `filepath` ending with the given string (LIKE `%name`)
 *   4. `display_path` ending with the given string (LIKE `%name`)
 *
 * @param db - Database handle.
 * @param filename - File name or path, optionally with a `:<line>` suffix.
 * @param options - `includeBody` also selects and returns the document body.
 * @returns The document metadata (plus `body` when requested), or a
 *   `not_found` error object carrying the original query and the output of
 *   `findSimilarFiles`.
 */
export function findDocument(db: Database, filename: string, options: { includeBody?: boolean } = {}): DocumentResult | DocumentNotFound {
  // Strip ":<line>" first so "~/notes.md:12" still gets tilde expansion.
  let filepath = filename.replace(/:(\d+)$/, '');
  if (filepath.startsWith('~/')) {
    filepath = homedir() + filepath.slice(1);
  }

  const baseCols = `filepath, display_path, title, hash, collection_id, modified_at, LENGTH(body) as body_length`;
  const selectCols = options.includeBody ? `${baseCols}, body` : baseCols;

  // Ordered from most to least specific; the first match wins.
  const attempts: { where: string; arg: string }[] = [
    { where: `filepath = ? AND active = 1`, arg: filepath },
    { where: `display_path = ? AND active = 1`, arg: filepath },
    { where: `filepath LIKE ? AND active = 1 LIMIT 1`, arg: `%${filepath}` },
    { where: `display_path LIKE ? AND active = 1 LIMIT 1`, arg: `%${filepath}` },
  ];

  let doc: DbDocRow | null = null;
  for (const { where, arg } of attempts) {
    doc = db.prepare(`SELECT ${selectCols} FROM documents WHERE ${where}`).get(arg) as DbDocRow | null;
    if (doc) break;
  }

  if (!doc) {
    return {
      error: "not_found",
      query: filename,
      similarFiles: findSimilarFiles(db, filepath, 5, 5),
    };
  }

  const context = getContextForFile(db, doc.filepath);
  const withBody = Boolean(options.includeBody) && doc.body !== undefined;
  return {
    filepath: doc.filepath,
    displayPath: doc.display_path,
    title: doc.title,
    context,
    hash: doc.hash,
    collectionId: doc.collection_id,
    modifiedAt: doc.modified_at,
    bodyLength: doc.body_length,
    ...(withBody ? { body: doc.body } : {}),
  };
}
/**
 * Get the body content for an active document, optionally restricted to a
 * line range.
 *
 * @param db - Database handle.
 * @param doc - Any object carrying the document's `filepath`.
 * @param fromLine - 1-indexed first line to return. Omitted, zero or
 *   negative values all mean "start at the first line".
 * @param maxLines - Maximum number of lines to return. Negative values are
 *   treated as 0.
 * @returns The (possibly sliced) body, or `null` when no active document
 *   has that `filepath`.
 */
export function getDocumentBody(db: Database, doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number): string | null {
  const row = db.prepare(`SELECT body FROM documents WHERE filepath = ? AND active = 1`).get(doc.filepath) as { body: string } | null;
  if (!row) return null;
  if (fromLine === undefined && maxLines === undefined) return row.body;

  const lines = row.body.split('\n');
  // Clamp both bounds: Array.prototype.slice interprets negative indices as
  // offsets from the end, which would silently return the wrong lines.
  const start = Math.max(0, (fromLine || 1) - 1);
  const end = maxLines !== undefined ? start + Math.max(0, maxLines) : lines.length;
  return lines.slice(start, end).join('\n');
}
/**
 * Legacy function for backwards compatibility.
 * Combines findDocument (with body) and line slicing in one call.
 *
 * A `:<digits>` suffix on `filename` is used as the starting line only when
 * `fromLine` is not given (or is 0); an explicit non-zero `fromLine` wins.
 *
 * @param db - Database handle.
 * @param filename - File name or path, optionally with a `:<line>` suffix.
 * @param fromLine - 1-indexed first line to return. Zero or negative values
 *   mean "start at the first line".
 * @param maxLines - Maximum number of lines to return. Negative values are
 *   treated as 0.
 * @returns The document with its (possibly sliced) body, or the not-found
 *   result produced by findDocument.
 */
export function getDocument(db: Database, filename: string, fromLine?: number, maxLines?: number): (DocumentResult & { body: string }) | DocumentNotFound {
  // Parse :line suffix
  let startLine = fromLine;
  let filepath = filename;
  const suffix = filepath.match(/:(\d+)$/);
  if (suffix && !startLine) {
    startLine = parseInt(suffix[1], 10);
    filepath = filepath.slice(0, -suffix[0].length);
  }

  const result = findDocument(db, filepath, { includeBody: true });
  if ("error" in result) return result;

  let body = result.body || "";
  if (startLine !== undefined || maxLines !== undefined) {
    const lines = body.split('\n');
    // Clamp both bounds: Array.prototype.slice interprets negative indices
    // as offsets from the end, which would silently return the wrong lines.
    const start = Math.max(0, (startLine || 1) - 1);
    const end = maxLines !== undefined ? start + Math.max(0, maxLines) : lines.length;
    body = lines.slice(start, end).join('\n');
  }
  return { ...result, body };
}
/**
 * Find multiple active documents by glob pattern or comma-separated list.
 *
 * A pattern that contains a comma and neither `*` nor `?` is treated as a
 * list of names; each name is matched against `display_path` exactly, then
 * as a suffix (LIKE `%name`). Any other pattern is handed to
 * `matchFilesByGlob`.
 *
 * Rows whose `body_length` exceeds `maxBytes` are reported as skipped
 * entries carrying only `filepath`, `displayPath` and a reason.
 *
 * @param db - Database handle.
 * @param pattern - Glob pattern or comma-separated list of names.
 * @param options - `includeBody` also returns each body; `maxBytes`
 *   overrides the default size limit (`DEFAULT_MULTI_GET_MAX_BYTES`).
 * @returns The matched documents plus human-readable error messages for
 *   names or patterns that matched nothing.
 */
export function findDocuments(
  db: Database,
  pattern: string,
  options: { includeBody?: boolean; maxBytes?: number } = {}
): { docs: MultiGetResult[]; errors: string[] } {
  const hasWildcard = pattern.includes('*') || pattern.includes('?');
  const isNameList = pattern.includes(',') && !hasWildcard;
  const errors: string[] = [];
  const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES;

  const baseCols = `filepath, display_path, title, hash, collection_id, modified_at, LENGTH(body) as body_length`;
  const selectCols = options.includeBody ? `${baseCols}, body` : baseCols;

  let fileRows: DbDocRow[];
  if (isNameList) {
    fileRows = [];
    const names = pattern.split(',').map(part => part.trim()).filter(Boolean);
    for (const name of names) {
      const exact = db.prepare(`SELECT ${selectCols} FROM documents WHERE display_path = ? AND active = 1`).get(name) as DbDocRow | null;
      const doc = exact
        || (db.prepare(`SELECT ${selectCols} FROM documents WHERE display_path LIKE ? AND active = 1 LIMIT 1`).get(`%${name}`) as DbDocRow | null);
      if (doc) {
        fileRows.push(doc);
        continue;
      }
      const similar = findSimilarFiles(db, name, 5, 3);
      const hint = similar.length > 0 ? ` (did you mean: ${similar.join(', ')}?)` : '';
      errors.push(`File not found: ${name}${hint}`);
    }
  } else {
    // Glob pattern match
    const matched = matchFilesByGlob(db, pattern);
    if (matched.length === 0) {
      errors.push(`No files matched pattern: ${pattern}`);
      return { docs: [], errors };
    }
    const filepaths = matched.map(m => m.filepath);
    const placeholders = filepaths.map(() => '?').join(',');
    fileRows = db.prepare(`SELECT ${selectCols} FROM documents WHERE filepath IN (${placeholders}) AND active = 1`).all(...filepaths) as DbDocRow[];
  }

  const docs = fileRows.map((row): MultiGetResult => {
    const context = getContextForFile(db, row.filepath);

    if (row.body_length > maxBytes) {
      return {
        doc: { filepath: row.filepath, displayPath: row.display_path },
        skipped: true,
        skipReason: `File too large (${Math.round(row.body_length / 1024)}KB > ${Math.round(maxBytes / 1024)}KB)`,
      };
    }

    const withBody = Boolean(options.includeBody) && row.body !== undefined;
    return {
      doc: {
        filepath: row.filepath,
        displayPath: row.display_path,
        // Fall back to the last path segment when the row has no title.
        title: row.title || row.display_path.split('/').pop() || row.display_path,
        context,
        hash: row.hash,
        collectionId: row.collection_id,
        modifiedAt: row.modified_at,
        bodyLength: row.body_length,
        ...(withBody ? { body: row.body } : {}),
      },
      skipped: false,
    };
  });

  return { docs, errors };
}
/**
 * Legacy function for backwards compatibility.
 * Wraps findDocuments (with bodies) and flattens each result into a
 * `MultiGetFile`, optionally truncating bodies to `maxLines` lines.
 *
 * @param db - Database handle.
 * @param pattern - Glob pattern or comma-separated list of names.
 * @param maxLines - When given, bodies are cut to this many lines and a
 *   `[... truncated N more lines]` marker is appended if anything was cut.
 * @param maxBytes - Size limit forwarded to findDocuments.
 * @returns The flattened files and the error messages from findDocuments.
 *   Skipped entries have empty `title`/`body` and a `null` context.
 */
export function getMultipleDocuments(db: Database, pattern: string, maxLines?: number, maxBytes: number = DEFAULT_MULTI_GET_MAX_BYTES): { files: MultiGetFile[]; errors: string[] } {
  const found = findDocuments(db, pattern, { includeBody: true, maxBytes });
  const files: MultiGetFile[] = [];

  for (const entry of found.docs) {
    if (entry.skipped) {
      files.push({
        filepath: entry.doc.filepath,
        displayPath: entry.doc.displayPath,
        title: "",
        body: "",
        context: null,
        skipped: true as const,
        skipReason: entry.skipReason,
      });
      continue;
    }

    let body = entry.doc.body || "";
    if (maxLines !== undefined) {
      const lines = body.split('\n');
      const hidden = lines.length - maxLines;
      body = lines.slice(0, maxLines).join('\n');
      if (hidden > 0) {
        body += `\n\n[... truncated ${hidden} more lines]`;
      }
    }

    files.push({
      filepath: entry.doc.filepath,
      displayPath: entry.doc.displayPath,
      title: entry.doc.title,
      body,
      context: entry.doc.context,
      skipped: false as const,
    });
  }

  return { files, errors: found.errors };
}
/**
 * Flattened per-file result of the legacy multi-get API, kept for
 * backwards compatibility.
 *
 * Discriminated on `skipped`: skipped files additionally carry a
 * `skipReason`.
 */
export type MultiGetFile = {
  filepath: string;
  displayPath: string;
  title: string;
  body: string;
  context: string | null;
} & (
  | { skipped: false }
  | { skipped: true; skipReason: string }
);
- // =============================================================================
- // Status
- // =============================================================================
/**
 * Summarize the state of the index.
 *
 * @param db - Database handle.
 * @returns Total active documents, the value reported by
 *   `getHashesNeedingEmbedding`, whether a `vectors_vec` table exists, and
 *   per-collection statistics ordered by most recent document update.
 *   A collection's `lastUpdated` falls back to its creation time when it
 *   has no active documents.
 */
export function getStatus(db: Database): IndexStatus {
  type CollectionRow = {
    id: number;
    pwd: string;
    glob_pattern: string;
    created_at: string;
    active_count: number;
    last_doc_update: string | null;
  };

  const collectionRows = db.prepare(`
    SELECT c.id, c.pwd, c.glob_pattern, c.created_at,
    COUNT(d.id) as active_count,
    MAX(d.modified_at) as last_doc_update
    FROM collections c
    LEFT JOIN documents d ON d.collection_id = c.id AND d.active = 1
    GROUP BY c.id
    ORDER BY last_doc_update DESC
  `).all() as CollectionRow[];

  const totalRow = db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get() as { c: number };
  const needsEmbedding = getHashesNeedingEmbedding(db);
  const vectorTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();

  return {
    totalDocuments: totalRow.c,
    needsEmbedding,
    hasVectorIndex: Boolean(vectorTable),
    collections: collectionRows.map(row => ({
      id: row.id,
      path: row.pwd,
      pattern: row.glob_pattern,
      documents: row.active_count,
      lastUpdated: row.last_doc_update || row.created_at,
    })),
  };
}
- // =============================================================================
- // Snippet extraction
- // =============================================================================
/** Result of extracting a query-relevant snippet from a document body. */
export type SnippetResult = {
  /** 1-indexed line number of the best-matching line. */
  line: number;
  /** Snippet text, preceded by a diff-style `@@ ... @@` header line. */
  snippet: string;
  /** Number of document lines before the snippet. */
  linesBefore: number;
  /** Number of document lines after the snippet. */
  linesAfter: number;
  /** Number of lines in the snippet. */
  snippetLines: number;
};
/**
 * Pick the most query-relevant lines of a document and format them as a
 * snippet with a diff-style header.
 *
 * Each line is scored by how many whitespace-separated query terms it
 * contains (case-insensitive substring match); the first line with the
 * highest score wins. The snippet spans one line before the winner through
 * two lines after it, and is cut to `maxLen` characters (ending in "...")
 * when longer.
 *
 * @param body - Full document text.
 * @param query - Search query.
 * @param maxLen - Maximum snippet length in characters (header excluded).
 * @param chunkPos - Optional character offset of a matching chunk. When
 *   positive, only the window from 100 characters before it to
 *   `maxLen + 100` characters after it is searched.
 * @returns The snippet and its position; line numbers refer to the full
 *   document, not the search window.
 */
export function extractSnippet(body: string, query: string, maxLen = 500, chunkPos?: number): SnippetResult {
  const totalLines = body.split('\n').length;

  // Narrow the search to a window around the chunk, remembering how many
  // full-document lines precede the window so line numbers stay absolute.
  let windowText = body;
  let lineOffset = 0;
  if (chunkPos && chunkPos > 0) {
    const windowStart = Math.max(0, chunkPos - 100);
    const windowEnd = Math.min(body.length, chunkPos + maxLen + 100);
    windowText = body.slice(windowStart, windowEnd);
    if (windowStart > 0) {
      lineOffset = body.slice(0, windowStart).split('\n').length - 1;
    }
  }

  const lines = windowText.split('\n');
  const terms = query.toLowerCase().split(/\s+/).filter(term => term.length > 0);

  // Score every line, then take the earliest line with the top score.
  const scores = lines.map(line => {
    const lowered = line.toLowerCase();
    return terms.filter(term => lowered.includes(term)).length;
  });
  const bestLine = scores.reduce((best, score, idx) => (score > scores[best] ? idx : best), 0);

  const first = Math.max(0, bestLine - 1);
  const last = Math.min(lines.length, bestLine + 3);
  const picked = lines.slice(first, last);

  let text = picked.join('\n');
  if (text.length > maxLen) {
    text = text.substring(0, maxLen - 3) + "...";
  }

  const absoluteStart = lineOffset + first + 1; // 1-indexed
  const pickedCount = picked.length;
  const linesBefore = absoluteStart - 1;
  const linesAfter = totalLines - (absoluteStart + pickedCount - 1);

  // Format with diff-style header: @@ -start,count @@ (linesBefore before, linesAfter after)
  const header = `@@ -${absoluteStart},${pickedCount} @@ (${linesBefore} before, ${linesAfter} after)`;

  return {
    line: lineOffset + bestLine + 1,
    snippet: `${header}\n${text}`,
    linesBefore,
    linesAfter,
    snippetLines: pickedCount,
  };
}
|