Jelajahi Sumber

Move embedding/vector DB operations to store.ts

Refactor vector indexing by extracting database operations from vectorIndex()
in qmd.ts into three new store methods:

- getHashesForEmbedding(): Returns content hashes needing embeddings
- clearAllEmbeddings(): Clears all vectors for force re-indexing
- insertEmbedding(): Inserts a single embedding into both tables

This continues the refactoring effort to consolidate all database operations
in store.ts, making the codebase more maintainable and testable.

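Changes:
- Add new embedding operation methods to Store type and factory
- Export getHashesForEmbedding(), clearAllEmbeddings(), insertEmbedding()
- Update vectorIndex() to use new store methods instead of direct SQL
- Remove inline SQL from embedding logic in qmd.ts
- Also included in this commit: document indexing, cleanup/maintenance, and
  context helpers (new context-ops.ts) replace inline SQL in qmd.ts, and the
  updateDisplayPaths() migration step is removed from updateCollections()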

Related: qmd-4u4

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
Tobi Lutke 5 bulan lalu
induk
melakukan
ab9396e675
4 mengubah file dengan 464 tambahan dan 127 penghapusan
  1. 8 4
      .beads/issues.jsonl
  2. 64 0
      context-ops.ts
  3. 48 123
      qmd.ts
  4. 344 0
      store.ts

+ 8 - 4
.beads/issues.jsonl

@@ -1,7 +1,9 @@
-{"id":"qmd-18s","title":"Move cleanup/maintenance DB operations to store.ts","description":"Move cleanup operations from cleanup() command to store.ts. Create methods like deleteInactiveDocuments(), vacuumDatabase(), cleanupOrphanedContent(), etc.","status":"open","priority":2,"issue_type":"task","created_at":"2025-12-12T16:36:21.815781-05:00","updated_at":"2025-12-12T16:36:21.815781-05:00","dependencies":[{"issue_id":"qmd-18s","depends_on_id":"qmd-29c","type":"parent-child","created_at":"2025-12-12T16:37:03.014111-05:00","created_by":"daemon"}]}
+{"id":"qmd-0ic","title":"in qmd status, list all the additional contexts under the collections that match","description":"","status":"open","priority":2,"issue_type":"task","created_at":"2025-12-12T16:41:42.126194-05:00","updated_at":"2025-12-12T16:41:42.126194-05:00"}
+{"id":"qmd-18s","title":"Move cleanup/maintenance DB operations to store.ts","description":"Move cleanup operations from cleanup() command to store.ts. Create methods like deleteInactiveDocuments(), vacuumDatabase(), cleanupOrphanedContent(), etc.","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-12T16:36:21.815781-05:00","updated_at":"2025-12-12T16:42:36.896806-05:00","closed_at":"2025-12-12T16:42:36.896806-05:00","dependencies":[{"issue_id":"qmd-18s","depends_on_id":"qmd-29c","type":"parent-child","created_at":"2025-12-12T16:37:03.014111-05:00","created_by":"daemon"}]}
 {"id":"qmd-29c","title":"Move all database operations from qmd.ts to store.ts","description":"Currently qmd.ts has ~70 direct database operations (db.prepare, db.exec). All database operations should be moved to store.ts to improve separation of concerns. qmd.ts should only use high-level methods from store.ts that don't require direct SQL knowledge.","notes":"Phase 1 complete: Moved collection operations (listCollections, removeCollection, renameCollection) to store.ts. Created 4 subtasks for remaining work: document indexing, context management, embeddings, and cleanup operations.","status":"in_progress","priority":2,"issue_type":"task","created_at":"2025-12-12T16:32:13.722223-05:00","updated_at":"2025-12-12T16:37:39.863558-05:00"}
 {"id":"qmd-4ru","title":"Update document retrieval for new schema","description":"Functions like getDocument, findDocument, getMultipleDocuments need to work with new schema (path instead of filepath, content joins, virtual paths).","status":"closed","priority":0,"issue_type":"task","created_at":"2025-12-12T15:29:53.911881-05:00","updated_at":"2025-12-12T15:56:11.054888-05:00","closed_at":"2025-12-12T15:56:11.054888-05:00","dependencies":[{"issue_id":"qmd-4ru","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.912607-05:00","created_by":"daemon"}]}
-{"id":"qmd-4u4","title":"Move embedding/vector DB operations to store.ts","description":"Move vector indexing DB operations from vectorIndex() to store.ts. Create methods like getHashesForEmbedding(), insertEmbedding(), clearEmbeddings(), etc.","status":"open","priority":2,"issue_type":"task","created_at":"2025-12-12T16:36:21.683434-05:00","updated_at":"2025-12-12T16:36:21.683434-05:00","dependencies":[{"issue_id":"qmd-4u4","depends_on_id":"qmd-29c","type":"parent-child","created_at":"2025-12-12T16:37:02.944591-05:00","created_by":"daemon"}]}
+{"id":"qmd-4u4","title":"Move embedding/vector DB operations to store.ts","description":"Move vector indexing DB operations from vectorIndex() to store.ts. Create methods like getHashesForEmbedding(), insertEmbedding(), clearEmbeddings(), etc.","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-12T16:36:21.683434-05:00","updated_at":"2025-12-12T16:42:40.42653-05:00","closed_at":"2025-12-12T16:42:40.42653-05:00","dependencies":[{"issue_id":"qmd-4u4","depends_on_id":"qmd-29c","type":"parent-child","created_at":"2025-12-12T16:37:02.944591-05:00","created_by":"daemon"}]}
+{"id":"qmd-7ss","title":"remove all the symlinks and stuff in the git repo, clean up the root directory","description":"","status":"open","priority":4,"issue_type":"task","created_at":"2025-12-12T16:40:00.744982-05:00","updated_at":"2025-12-12T16:40:00.744982-05:00"}
 {"id":"qmd-afe","title":"implement qmd collection rename, which changes the global path prefix for the collection","description":"","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-12T15:55:54.779325-05:00","updated_at":"2025-12-12T16:29:24.153196-05:00","closed_at":"2025-12-12T16:29:24.153196-05:00"}
 {"id":"qmd-ama","title":"Refactor database system","description":"All documents should be stored as content addressable hash, e.g. hash, doc, created_at,\n┃ updated_at. documents should be a file system layer on top e.g. collection, path, hash,\n┃ created_at, updated_at. (collection,path)\n┃\n┃\n\n┃ All documents should be stored as content addressable hash, e.g. hash, doc, created_at,\n┃ updated_at. documents should be a file system layer on top e.g. collection_id, path, hash,\n┃ created_at, updated_at. (collection,path) is unique. There is also collection which stores PWD\n┃ + glob pattern, name (\\w+). Every document is treated as path qmd://collection.name/","notes":"## Completed\n- ✅ Implemented content-addressable storage (content table with hash→doc mapping)\n- ✅ Refactored documents table as file system layer (collection_id, path, hash)\n- ✅ Added collection names (e.g., \"pages\", \"journals\", \"archive\")\n- ✅ Implemented virtual paths (qmd://collection-name/path/to/file.md)\n- ✅ Added hierarchical context support (collection-scoped)\n- ✅ Successfully migrated existing database\n- ✅ Updated search functions to work with new schema\n- ✅ Updated indexing logic to use content-addressable storage\n- ✅ Orphaned content hash cleanup\n\n## Still TODO\n- Fix migration SQL to properly extract basename (currently needs manual fix)\n- Implement `qmd collection add . --name \u003cname\u003e --mask '**/*.md'`\n- Implement `qmd ls [path]` for exploring virtual file tree","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-10T10:57:35.497489-05:00","updated_at":"2025-12-12T15:39:48.879143-05:00","closed_at":"2025-12-12T15:39:48.879143-05:00"}
 {"id":"qmd-bx1","title":"Fix migration SQL for proper basename extraction","description":"The migration currently generates collection names incorrectly (uses full path instead of basename). Need to fix the SQL in migrateToContentAddressable to properly extract the directory basename.","status":"closed","priority":1,"issue_type":"bug","created_at":"2025-12-12T15:29:53.757723-05:00","updated_at":"2025-12-12T15:50:29.349134-05:00","closed_at":"2025-12-12T15:50:29.349134-05:00","dependencies":[{"issue_id":"qmd-bx1","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.758524-05:00","created_by":"daemon"}]}
@@ -10,10 +12,12 @@
 {"id":"qmd-deh","title":"Refactor database introduce qmd collection *","description":"","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-10T10:56:04.516137-05:00","updated_at":"2025-12-12T16:12:12.349428-05:00","closed_at":"2025-12-12T16:12:12.349428-05:00"}
 {"id":"qmd-dmi","title":"Implement 'qmd collection' commands","description":"Add explicit collection management:\n- qmd collection add . --name \u003cname\u003e --mask '**/*.md'\n- qmd collection list\n- qmd collection remove \u003cname\u003e\n\nThis gives users control over collection names and patterns.","status":"closed","priority":1,"issue_type":"feature","created_at":"2025-12-12T15:29:53.810666-05:00","updated_at":"2025-12-12T16:02:08.079158-05:00","closed_at":"2025-12-12T16:02:08.079158-05:00","dependencies":[{"issue_id":"qmd-dmi","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.811294-05:00","created_by":"daemon"}]}
 {"id":"qmd-e2c","title":"Implement 'qmd ls' command","description":"Add command to explore virtual file tree:\n- qmd ls → list all collections\n- qmd ls \u003ccollection\u003e → list files in collection\n- qmd ls \u003ccollection\u003e/\u003cpath\u003e → list files under path\nOutput: flat list of qmd:// paths","status":"closed","priority":1,"issue_type":"feature","created_at":"2025-12-12T15:29:53.859804-05:00","updated_at":"2025-12-12T15:55:12.777701-05:00","closed_at":"2025-12-12T15:55:12.777701-05:00","dependencies":[{"issue_id":"qmd-e2c","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.860535-05:00","created_by":"daemon"}]}
-{"id":"qmd-i3t","title":"Move context management DB operations to store.ts","description":"Move path_contexts INSERT/DELETE/SELECT operations from addContext(), listContexts(), removeContext() to store.ts. Create methods like insertContext(), deleteContext(), etc.","status":"open","priority":2,"issue_type":"task","created_at":"2025-12-12T16:36:21.561746-05:00","updated_at":"2025-12-12T16:36:21.561746-05:00","dependencies":[{"issue_id":"qmd-i3t","depends_on_id":"qmd-29c","type":"parent-child","created_at":"2025-12-12T16:37:02.866006-05:00","created_by":"daemon"}]}
+{"id":"qmd-i3t","title":"Move context management DB operations to store.ts","description":"Move path_contexts INSERT/DELETE/SELECT operations from addContext(), listContexts(), removeContext() to store.ts. Create methods like insertContext(), deleteContext(), etc.","status":"in_progress","priority":2,"issue_type":"task","created_at":"2025-12-12T16:36:21.561746-05:00","updated_at":"2025-12-12T16:39:16.024705-05:00","dependencies":[{"issue_id":"qmd-i3t","depends_on_id":"qmd-29c","type":"parent-child","created_at":"2025-12-12T16:37:02.866006-05:00","created_by":"daemon"}]}
 {"id":"qmd-j9z","title":"Add unit tests for content addressable hashes","description":"add same file from multiple places and verify that they both point at same hash. drop one collection and the content stays.","status":"closed","priority":3,"issue_type":"task","created_at":"2025-12-12T15:39:15.459504-05:00","updated_at":"2025-12-12T16:21:35.473776-05:00","closed_at":"2025-12-12T16:21:35.473776-05:00"}
-{"id":"qmd-kf8","title":"Move document indexing DB operations to store.ts","description":"Move INSERT/UPDATE/DELETE operations for documents and content tables from indexFiles() to store.ts. Create methods like insertDocument(), updateDocument(), deactivateDocuments(), etc.","status":"open","priority":2,"issue_type":"task","created_at":"2025-12-12T16:36:14.558702-05:00","updated_at":"2025-12-12T16:36:14.558702-05:00","dependencies":[{"issue_id":"qmd-kf8","depends_on_id":"qmd-29c","type":"parent-child","created_at":"2025-12-12T16:37:02.770251-05:00","created_by":"daemon"}]}
+{"id":"qmd-kf8","title":"Move document indexing DB operations to store.ts","description":"Move INSERT/UPDATE/DELETE operations for documents and content tables from indexFiles() to store.ts. Create methods like insertDocument(), updateDocument(), deactivateDocuments(), etc.","status":"in_progress","priority":2,"issue_type":"task","created_at":"2025-12-12T16:36:14.558702-05:00","updated_at":"2025-12-12T16:39:14.859951-05:00","dependencies":[{"issue_id":"qmd-kf8","depends_on_id":"qmd-29c","type":"parent-child","created_at":"2025-12-12T16:37:02.770251-05:00","created_by":"daemon"}]}
+{"id":"qmd-ltg","title":"look for missing context","description":"i ran qmd context list and thats only one bit of context, i had a lot more. i think the path matching isn't quite working right","status":"open","priority":2,"issue_type":"task","created_at":"2025-12-12T16:42:57.324769-05:00","updated_at":"2025-12-12T16:42:57.324769-05:00"}
 {"id":"qmd-p1h","title":"Create collection add|remove","description":"","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-10T10:57:00.717864-05:00","updated_at":"2025-12-12T16:12:00.557003-05:00","closed_at":"2025-12-12T16:12:00.557003-05:00"}
+{"id":"qmd-rck","title":"move the source files to src/*, clean up the directory","description":"","status":"open","priority":2,"issue_type":"task","created_at":"2025-12-12T16:40:19.198119-05:00","updated_at":"2025-12-12T16:40:19.198119-05:00"}
 {"id":"qmd-rhd","title":"Fix 'qmd status' output for new schema","description":"Update status to show collections by name, cleaner context display, virtual path examples.","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-12T15:29:54.020596-05:00","updated_at":"2025-12-12T16:13:28.08389-05:00","closed_at":"2025-12-12T16:13:28.08389-05:00","dependencies":[{"issue_id":"qmd-rhd","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:54.021095-05:00","created_by":"daemon"}]}
 {"id":"qmd-s1y","title":"Update 'qmd add-context' for collection scoping","description":"Update add-context to work with collection-scoped contexts using new path_contexts schema.","notes":"Refactoring to:\n- qmd context add [path] \"text\" (defaults to current collection if in one)\n- qmd context list\n- qmd context rm \u003cpath\u003e\n- Support \"/\" for global/system context\n- Auto-detect collection from pwd","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-12T15:29:54.076582-05:00","updated_at":"2025-12-12T15:37:47.683263-05:00","closed_at":"2025-12-12T15:37:47.683263-05:00"}
 {"id":"qmd-vro","title":"Update 'qmd get' to support virtual paths","description":"Allow qmd get to accept both virtual paths (qmd://journals/...) and filesystem paths, plus fuzzy matching by filename.","status":"closed","priority":0,"issue_type":"task","created_at":"2025-12-12T15:29:53.963113-05:00","updated_at":"2025-12-12T15:47:29.178955-05:00","closed_at":"2025-12-12T15:47:29.178955-05:00","dependencies":[{"issue_id":"qmd-vro","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.963641-05:00","created_by":"daemon"}]}

+ 64 - 0
context-ops.ts

@@ -0,0 +1,64 @@
+/**
+ * Context management operations for store.ts
+ * These will be integrated into store.ts
+ */
+
+import { Database } from "bun:sqlite";
+
+// =============================================================================
+// Context Management Operations
+// =============================================================================
+
+/**
+ * Insert or update a context for a specific collection and path prefix.
+ */
+export function insertContext(db: Database, collectionId: number, pathPrefix: string, context: string): void {
+  const now = new Date().toISOString();
+  db.prepare(`
+    INSERT INTO path_contexts (collection_id, path_prefix, context, created_at)
+    VALUES (?, ?, ?, ?)
+    ON CONFLICT(collection_id, path_prefix) DO UPDATE SET context = excluded.context
+  `).run(collectionId, pathPrefix, context, now);
+}
+
+/**
+ * Delete a context for a specific collection and path prefix.
+ * Returns the number of contexts deleted.
+ */
+export function deleteContext(db: Database, collectionId: number, pathPrefix: string): number {
+  const result = db.prepare(`
+    DELETE FROM path_contexts
+    WHERE collection_id = ? AND path_prefix = ?
+  `).run(collectionId, pathPrefix);
+  return result.changes;
+}
+
+/**
+ * Delete all global contexts (contexts with empty path_prefix).
+ * Returns the number of contexts deleted.
+ */
+export function deleteGlobalContexts(db: Database): number {
+  const result = db.prepare(`DELETE FROM path_contexts WHERE path_prefix = ''`).run();
+  return result.changes;
+}
+
+/**
+ * List all contexts, grouped by collection.
+ * Returns contexts ordered by collection name, then by path prefix length (longest first).
+ */
+export function listPathContexts(db: Database): { collection_name: string; path_prefix: string; context: string }[] {
+  const contexts = db.prepare(`
+    SELECT c.name as collection_name, pc.path_prefix, pc.context
+    FROM path_contexts pc
+    JOIN collections c ON c.id = pc.collection_id
+    ORDER BY c.name, LENGTH(pc.path_prefix) DESC, pc.path_prefix
+  `).all() as { collection_name: string; path_prefix: string; context: string }[];
+  return contexts;
+}
+
+/**
+ * Get all collections (id and name).
+ */
+export function getAllCollections(db: Database): { id: number; name: string }[] {
+  return db.prepare(`SELECT id, name FROM collections`).all() as { id: number; name: string }[];
+}

+ 48 - 123
qmd.ts

@@ -26,6 +26,9 @@ import {
   findSimilarFiles,
   matchFilesByGlob,
   getHashesNeedingEmbedding,
+  getHashesForEmbedding,
+  clearAllEmbeddings,
+  insertEmbedding,
   getDocument as storeGetDocument,
   getMultipleDocuments as storeMultiGetDocuments,
   getStatus,
@@ -45,6 +48,23 @@ import {
   isVirtualPath,
   resolveVirtualPath,
   toVirtualPath,
+  insertContent,
+  insertDocument,
+  findActiveDocument,
+  updateDocumentTitle,
+  deactivateDocument,
+  getActiveDocumentPaths,
+  cleanupOrphanedContent,
+  deleteOllamaCache,
+  deleteInactiveDocuments,
+  cleanupOrphanedVectors,
+  cleanupDuplicateCollections,
+  vacuumDatabase,
+  insertContext,
+  deleteContext,
+  deleteGlobalContexts,
+  listPathContexts,
+  getAllCollections,
   OLLAMA_URL,
   DEFAULT_EMBED_MODEL,
   DEFAULT_QUERY_MODEL,
@@ -379,16 +399,6 @@ function getOrCreateCollection(db: Database, pwd: string, globPattern: string, n
   }
 }
 
-function cleanupDuplicateCollections(db: Database): void {
-  // Remove duplicate collections keeping the oldest one
-  db.exec(`
-    DELETE FROM collections WHERE id NOT IN (
-      SELECT MIN(id) FROM collections GROUP BY pwd, glob_pattern
-    )
-  `);
-  // Remove bogus "." glob pattern entries (from earlier bug)
-  db.exec(`DELETE FROM collections WHERE glob_pattern = '.'`);
-}
 
 function formatTimeAgo(date: Date): string {
   const seconds = Math.floor((Date.now() - date.getTime()) / 1000);
@@ -501,37 +511,6 @@ function showStatus(): void {
   closeDb();
 }
 
-// Update display_paths for all documents that have empty display_path
-function updateDisplayPaths(db: Database): number {
-  // Get all docs with empty display_path, grouped by collection
-  const emptyDocs = db.prepare(`
-    SELECT d.id, d.filepath, c.pwd
-    FROM documents d
-    JOIN collections c ON d.collection_id = c.id
-    WHERE d.active = 1 AND (d.display_path IS NULL OR d.display_path = '')
-  `).all() as { id: number; filepath: string; pwd: string }[];
-
-  if (emptyDocs.length === 0) return 0;
-
-  // Collect existing display_paths
-  const existingPaths = new Set<string>(
-    (db.prepare(`SELECT display_path FROM documents WHERE active = 1 AND display_path != ''`).all() as { display_path: string }[])
-      .map(r => r.display_path)
-  );
-
-  const updateStmt = db.prepare(`UPDATE documents SET display_path = ? WHERE id = ?`);
-  let updated = 0;
-
-  for (const doc of emptyDocs) {
-    const displayPath = computeDisplayPath(doc.filepath, doc.pwd, existingPaths);
-    updateStmt.run(displayPath, doc.id);
-    existingPaths.add(displayPath);
-    updated++;
-  }
-
-  return updated;
-}
-
 async function updateCollections(): Promise<void> {
   const db = getDb();
   cleanupDuplicateCollections(db);
@@ -547,12 +526,6 @@ async function updateCollections(): Promise<void> {
     return;
   }
 
-  // Update display_paths for any documents missing them (migration)
-  const pathsUpdated = updateDisplayPaths(db);
-  if (pathsUpdated > 0) {
-    console.log(`${c.green}✓${c.reset} Updated ${pathsUpdated} display paths`);
-  }
-
   // Don't close db here - indexFiles will reuse it and close at the end
   console.log(`${c.bold}Updating ${collections.length} collection(s)...${c.reset}\n`);
 
@@ -1430,13 +1403,6 @@ async function indexFiles(pwd?: string, globPattern: string = DEFAULT_GLOB, name
     return;
   }
 
-  // Prepared statements for new schema
-  const insertContentStmt = db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`);
-  const insertDocStmt = db.prepare(`INSERT INTO documents (collection_id, path, title, hash, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, 1)`);
-  const deactivateStmt = db.prepare(`UPDATE documents SET active = 0 WHERE collection_id = ? AND path = ? AND active = 1`);
-  const findActiveStmt = db.prepare(`SELECT id, hash, title FROM documents WHERE collection_id = ? AND path = ? AND active = 1`);
-  const updateTitleStmt = db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`);
-
   let indexed = 0, updated = 0, unchanged = 0, processed = 0;
   const seenPaths = new Set<string>();
   const startTime = Date.now();
@@ -1451,33 +1417,33 @@ async function indexFiles(pwd?: string, globPattern: string = DEFAULT_GLOB, name
     const title = extractTitle(content, relativeFile);
 
     // Check if document exists in this collection with this path
-    const existing = findActiveStmt.get(collectionId, path) as { id: number; hash: string; title: string } | null;
+    const existing = findActiveDocument(db, collectionId, path);
 
     if (existing) {
       if (existing.hash === hash) {
         // Hash unchanged, but check if title needs updating
         if (existing.title !== title) {
-          updateTitleStmt.run(title, now, existing.id);
+          updateDocumentTitle(db, existing.id, title, now);
           updated++;
         } else {
           unchanged++;
         }
       } else {
         // Content changed - insert new content hash and update document
-        insertContentStmt.run(hash, content, now);
-        deactivateStmt.run(collectionId, path);
+        insertContent(db, hash, content, now);
+        deactivateDocument(db, collectionId, path);
         updated++;
         const stat = await Bun.file(filepath).stat();
-        insertDocStmt.run(collectionId, path, title, hash,
+        insertDocument(db, collectionId, path, title, hash,
           stat ? new Date(stat.birthtime).toISOString() : now,
           stat ? new Date(stat.mtime).toISOString() : now);
       }
     } else {
       // New document - insert content and document
       indexed++;
-      insertContentStmt.run(hash, content, now);
+      insertContent(db, hash, content, now);
       const stat = await Bun.file(filepath).stat();
-      insertDocStmt.run(collectionId, path, title, hash,
+      insertDocument(db, collectionId, path, title, hash,
         stat ? new Date(stat.birthtime).toISOString() : now,
         stat ? new Date(stat.mtime).toISOString() : now);
     }
@@ -1492,21 +1458,17 @@ async function indexFiles(pwd?: string, globPattern: string = DEFAULT_GLOB, name
   }
 
   // Deactivate documents in this collection that no longer exist
-  const allActive = db.prepare(`SELECT path FROM documents WHERE collection_id = ? AND active = 1`).all(collectionId) as { path: string }[];
+  const allActive = getActiveDocumentPaths(db, collectionId);
   let removed = 0;
-  for (const row of allActive) {
-    if (!seenPaths.has(row.path)) {
-      deactivateStmt.run(collectionId, row.path);
+  for (const path of allActive) {
+    if (!seenPaths.has(path)) {
+      deactivateDocument(db, collectionId, path);
       removed++;
     }
   }
 
   // Clean up orphaned content hashes (content not referenced by any document)
-  const cleanupResult = db.prepare(`
-    DELETE FROM content
-    WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
-  `).run();
-  const orphanedContent = cleanupResult.changes;
+  const orphanedContent = cleanupOrphanedContent(db);
 
   // Check if vector index needs updating
   const needsEmbedding = getHashesNeedingEmbedding(db);
@@ -1538,20 +1500,11 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
   // If force, clear all vectors
   if (force) {
     console.log(`${c.yellow}Force re-indexing: clearing all vectors...${c.reset}`);
-    db.exec(`DELETE FROM content_vectors`);
-    db.exec(`DROP TABLE IF EXISTS vectors_vec`);
+    clearAllEmbeddings(db);
   }
 
   // Find unique hashes that need embedding (from active documents)
-  // Join with content table to get document body
-  const hashesToEmbed = db.prepare(`
-    SELECT d.hash, c.doc as body, MIN(d.path) as path
-    FROM documents d
-    JOIN content c ON d.hash = c.hash
-    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
-    WHERE d.active = 1 AND v.hash IS NULL
-    GROUP BY d.hash
-  `).all() as { hash: string; body: string; path: string }[];
+  const hashesToEmbed = getHashesForEmbedding(db);
 
   if (hashesToEmbed.length === 0) {
     console.log(`${c.green}✓ All content hashes already have embeddings.${c.reset}`);
@@ -1612,16 +1565,11 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
   const firstEmbedding = await getEmbedding(allChunks[0].text, model, false, allChunks[0].title);
   ensureVecTable(db, firstEmbedding.length);
 
-  const insertVecStmt = db.prepare(`INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
-  const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`);
-
   let chunksEmbedded = 0, errors = 0, bytesProcessed = 0;
   const startTime = Date.now();
 
   // Insert first chunk
-  const firstHashSeq = `${allChunks[0].hash}_${allChunks[0].seq}`;
-  insertVecStmt.run(firstHashSeq, new Float32Array(firstEmbedding));
-  insertContentVectorStmt.run(allChunks[0].hash, allChunks[0].seq, allChunks[0].pos, model, now);
+  insertEmbedding(db, allChunks[0].hash, allChunks[0].seq, allChunks[0].pos, new Float32Array(firstEmbedding), model, now);
   chunksEmbedded++;
   bytesProcessed += allChunks[0].bytes;
 
@@ -1629,9 +1577,7 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
     const chunk = allChunks[i];
     try {
       const embedding = await getEmbedding(chunk.text, model, false, chunk.title);
-      const hashSeq = `${chunk.hash}_${chunk.seq}`;
-      insertVecStmt.run(hashSeq, new Float32Array(embedding));
-      insertContentVectorStmt.run(chunk.hash, chunk.seq, chunk.pos, model, now);
+      insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding), model, now);
       chunksEmbedded++;
       bytesProcessed += chunk.bytes;
     } catch (err) {
@@ -2607,46 +2553,25 @@ switch (cli.command) {
     const db = getDb();
 
     // 1. Clear ollama_cache
-    const cacheCount = db.prepare(`SELECT COUNT(*) as c FROM ollama_cache`).get() as { c: number };
-    db.exec(`DELETE FROM ollama_cache`);
-    console.log(`${c.green}✓${c.reset} Cleared ${cacheCount.c} cached API responses`);
-
-    // 2. Remove orphaned vectors (no active document with that hash)
-    const orphanedVecs = db.prepare(`
-      SELECT COUNT(*) as c FROM content_vectors cv
-      WHERE NOT EXISTS (
-        SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
-      )
-    `).get() as { c: number };
-
-    if (orphanedVecs.c > 0) {
-      db.exec(`
-        DELETE FROM vectors_vec WHERE hash_seq IN (
-          SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
-          WHERE NOT EXISTS (
-            SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
-          )
-        )
-      `);
-      db.exec(`
-        DELETE FROM content_vectors WHERE hash NOT IN (
-          SELECT hash FROM documents WHERE active = 1
-        )
-      `);
-      console.log(`${c.green}✓${c.reset} Removed ${orphanedVecs.c} orphaned embedding chunks`);
+    const cacheCount = deleteOllamaCache(db);
+    console.log(`${c.green}✓${c.reset} Cleared ${cacheCount} cached API responses`);
+
+    // 2. Remove orphaned vectors
+    const orphanedVecs = cleanupOrphanedVectors(db);
+    if (orphanedVecs > 0) {
+      console.log(`${c.green}✓${c.reset} Removed ${orphanedVecs} orphaned embedding chunks`);
     } else {
       console.log(`${c.dim}No orphaned embeddings to remove${c.reset}`);
     }
 
-    // 3. Count inactive documents
-    const inactiveDocs = db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 0`).get() as { c: number };
-    if (inactiveDocs.c > 0) {
-      db.exec(`DELETE FROM documents WHERE active = 0`);
-      console.log(`${c.green}✓${c.reset} Removed ${inactiveDocs.c} inactive document records`);
+    // 3. Remove inactive documents
+    const inactiveDocs = deleteInactiveDocuments(db);
+    if (inactiveDocs > 0) {
+      console.log(`${c.green}✓${c.reset} Removed ${inactiveDocs} inactive document records`);
     }
 
     // 4. Vacuum to reclaim space
-    db.exec(`VACUUM`);
+    vacuumDatabase(db);
     console.log(`${c.green}✓${c.reset} Database vacuumed`);
 
     closeDb();

+ 344 - 0
store.ts

@@ -589,6 +589,14 @@ export type Store = {
   setCachedResult: (cacheKey: string, result: string) => void;
   clearCache: () => void;
 
+  // Cleanup and maintenance
+  deleteOllamaCache: () => number;
+  deleteInactiveDocuments: () => number;
+  cleanupOrphanedContent: () => number;
+  cleanupOrphanedVectors: () => number;
+  cleanupDuplicateCollections: () => number;
+  vacuumDatabase: () => void;
+
   // Context
   getContextForFile: (filepath: string) => string | null;
   getContextForPath: (collectionId: number, path: string) => string | null;
@@ -622,6 +630,19 @@ export type Store = {
   // Fuzzy matching
   findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => string[];
   matchFilesByGlob: (pattern: string) => { filepath: string; displayPath: string; bodyLength: number }[];
+
+  // Document indexing operations
+  insertContent: (hash: string, content: string, createdAt: string) => void;
+  insertDocument: (collectionId: number, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => void;
+  findActiveDocument: (collectionId: number, path: string) => { id: number; hash: string; title: string } | null;
+  updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => void;
+  deactivateDocument: (collectionId: number, path: string) => void;
+  getActiveDocumentPaths: (collectionId: number) => string[];
+
+  // Vector/embedding operations
+  getHashesForEmbedding: () => { hash: string; body: string; path: string }[];
+  clearAllEmbeddings: () => void;
+  insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => void;
 };
 
 /**
@@ -653,6 +674,14 @@ export function createStore(dbPath?: string): Store {
     setCachedResult: (cacheKey: string, result: string) => setCachedResult(db, cacheKey, result),
     clearCache: () => clearCache(db),
 
+    // Cleanup and maintenance
+    deleteOllamaCache: () => deleteOllamaCache(db),
+    deleteInactiveDocuments: () => deleteInactiveDocuments(db),
+    cleanupOrphanedContent: () => cleanupOrphanedContent(db),
+    cleanupOrphanedVectors: () => cleanupOrphanedVectors(db),
+    cleanupDuplicateCollections: () => cleanupDuplicateCollections(db),
+    vacuumDatabase: () => vacuumDatabase(db),
+
     // Context
     getContextForFile: (filepath: string) => getContextForFile(db, filepath),
     getContextForPath: (collectionId: number, path: string) => getContextForPath(db, collectionId, path),
@@ -686,6 +715,19 @@ export function createStore(dbPath?: string): Store {
     // Fuzzy matching
     findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => findSimilarFiles(db, query, maxDistance, limit),
     matchFilesByGlob: (pattern: string) => matchFilesByGlob(db, pattern),
+
+    // Document indexing operations
+    insertContent: (hash: string, content: string, createdAt: string) => insertContent(db, hash, content, createdAt),
+    insertDocument: (collectionId: number, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => insertDocument(db, collectionId, path, title, hash, createdAt, modifiedAt),
+    findActiveDocument: (collectionId: number, path: string) => findActiveDocument(db, collectionId, path),
+    updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => updateDocumentTitle(db, documentId, title, modifiedAt),
+    deactivateDocument: (collectionId: number, path: string) => deactivateDocument(db, collectionId, path),
+    getActiveDocumentPaths: (collectionId: number) => getActiveDocumentPaths(db, collectionId),
+
+    // Vector/embedding operations
+    getHashesForEmbedding: () => getHashesForEmbedding(db),
+    clearAllEmbeddings: () => clearAllEmbeddings(db),
+    insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt),
   };
 }
 
@@ -867,6 +909,117 @@ export function clearCache(db: Database): void {
   db.exec(`DELETE FROM ollama_cache`);
 }
 
+// =============================================================================
+// Cleanup and maintenance operations
+// =============================================================================
+
+/**
+ * Empty the ollama_cache table.
+ *
+ * @returns the row count reported by the DELETE statement, i.e. how many
+ *          cached responses were removed.
+ */
+export function deleteOllamaCache(db: Database): number {
+  const purge = db.prepare(`DELETE FROM ollama_cache`);
+  const { changes } = purge.run();
+  return changes;
+}
+
+/**
+ * Permanently delete every `documents` row whose `active` flag is 0.
+ *
+ * @returns how many rows the DELETE removed.
+ */
+export function deleteInactiveDocuments(db: Database): number {
+  const purgeInactive = db.prepare(`DELETE FROM documents WHERE active = 0`);
+  return purgeInactive.run().changes;
+}
+
+/**
+ * Delete rows from `content` whose hash is not used by any document with
+ * active = 1.
+ *
+ * @returns how many content rows were deleted.
+ */
+export function cleanupOrphanedContent(db: Database): number {
+  const sweep = db.prepare(`
+    DELETE FROM content
+    WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
+  `);
+  const { changes } = sweep.run();
+  return changes;
+}
+
+/**
+ * Delete embedding rows whose hash is not used by any document with active = 1.
+ *
+ * Rows are removed from vectors_vec first and from content_vectors second,
+ * because the vectors_vec delete derives its keys (`hash || '_' || seq`) from
+ * the content_vectors rows that are about to go away.
+ *
+ * If the vectors_vec table does not exist the function returns 0 immediately
+ * and leaves content_vectors untouched.
+ *
+ * @returns the number of orphaned content_vectors rows counted before the
+ *          deletes ran (0 when there was nothing to do).
+ */
+export function cleanupOrphanedVectors(db: Database): number {
+  // Bail out when the vector table has not been created.
+  const vecTable = db.prepare(`
+    SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'
+  `).get();
+  if (!vecTable) return 0;
+
+  // Snapshot the number of orphaned chunks; this is the value reported back.
+  const { c: orphanCount } = db.prepare(`
+    SELECT COUNT(*) as c FROM content_vectors cv
+    WHERE NOT EXISTS (
+      SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
+    )
+  `).get() as { c: number };
+  if (orphanCount === 0) return 0;
+
+  const dropVecRows = `
+    DELETE FROM vectors_vec WHERE hash_seq IN (
+      SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
+      WHERE NOT EXISTS (
+        SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
+      )
+    )
+  `;
+  const dropMetaRows = `
+    DELETE FROM content_vectors WHERE hash NOT IN (
+      SELECT hash FROM documents WHERE active = 1
+    )
+  `;
+
+  // Order matters: dropVecRows reads content_vectors, so it must run first.
+  db.exec(dropVecRows);
+  db.exec(dropMetaRows);
+
+  return orphanCount;
+}
+
+/**
+ * Collapse duplicate collections and drop bogus "." glob entries.
+ *
+ * For every (pwd, glob_pattern) pair only the row with the smallest id is
+ * kept. Afterwards any collection whose glob_pattern is exactly '.' is removed.
+ *
+ * @returns the difference between the collection count before and after,
+ *          i.e. the total number of rows removed by both deletes.
+ */
+export function cleanupDuplicateCollections(db: Database): number {
+  const countCollections = (): number =>
+    (db.prepare(`SELECT COUNT(*) as c FROM collections`).get() as { c: number }).c;
+
+  const before = countCollections();
+
+  // Keep the lowest id of each (pwd, glob_pattern) group, delete the rest.
+  db.exec(`
+    DELETE FROM collections WHERE id NOT IN (
+      SELECT MIN(id) FROM collections GROUP BY pwd, glob_pattern
+    )
+  `);
+
+  // Entries with a "." glob pattern were produced by an earlier bug.
+  db.exec(`DELETE FROM collections WHERE glob_pattern = '.'`);
+
+  return before - countCollections();
+}
+
+/**
+ * Execute SQLite's VACUUM statement on the given database so that unused
+ * space in the database file is reclaimed.
+ */
+export function vacuumDatabase(db: Database): void {
+  const statement = `VACUUM`;
+  db.exec(statement);
+}
+
 // =============================================================================
 // Document helpers
 // =============================================================================
@@ -890,6 +1043,94 @@ export function extractTitle(content: string, filename: string): string {
   return filename.replace(/\.md$/, "").split("/").pop() || filename;
 }
 
+// =============================================================================
+// Document indexing operations
+// =============================================================================
+
+/**
+ * Store a document body in the `content` table, keyed by its hash.
+ * The statement is INSERT OR IGNORE, so a row that conflicts with an existing
+ * one (e.g. the same hash) is silently skipped rather than overwritten.
+ */
+export function insertContent(db: Database, hash: string, content: string, createdAt: string): void {
+  const insert = db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`);
+  insert.run(hash, content, createdAt);
+}
+
+/**
+ * Add a row to the `documents` table. The new row is always written with
+ * active = 1; all other columns come straight from the arguments.
+ */
+export function insertDocument(
+  db: Database,
+  collectionId: number,
+  path: string,
+  title: string,
+  hash: string,
+  createdAt: string,
+  modifiedAt: string
+): void {
+  const insert = db.prepare(`
+    INSERT INTO documents (collection_id, path, title, hash, created_at, modified_at, active)
+    VALUES (?, ?, ?, ?, ?, ?, 1)
+  `);
+  insert.run(collectionId, path, title, hash, createdAt, modifiedAt);
+}
+
+/**
+ * Look up the active document (active = 1) stored under `path` in the given
+ * collection.
+ *
+ * @returns the document's id, hash and title, or `null` when no active row
+ *          matches. The result is never `undefined`.
+ */
+export function findActiveDocument(
+  db: Database,
+  collectionId: number,
+  path: string
+): { id: number; hash: string; title: string } | null {
+  const row = db.prepare(`
+    SELECT id, hash, title FROM documents
+    WHERE collection_id = ? AND path = ? AND active = 1
+  `).get(collectionId, path) as { id: number; hash: string; title: string } | null | undefined;
+
+  // SQLite drivers differ on how "no row" is reported (null vs undefined).
+  // Normalize so callers can rely on the declared `| null` contract.
+  return row ?? null;
+}
+
+/**
+ * Overwrite the `title` and `modified_at` columns of the document whose id is
+ * `documentId`. No other columns are touched.
+ */
+export function updateDocumentTitle(
+  db: Database,
+  documentId: number,
+  title: string,
+  modifiedAt: string
+): void {
+  const update = db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`);
+  update.run(title, modifiedAt, documentId);
+}
+
+/**
+ * Soft-delete a document: set active = 0 on the currently active row(s) that
+ * match the collection id and path. The row itself is kept.
+ */
+export function deactivateDocument(db: Database, collectionId: number, path: string): void {
+  const deactivate = db.prepare(`UPDATE documents SET active = 0 WHERE collection_id = ? AND path = ? AND active = 1`);
+  deactivate.run(collectionId, path);
+}
+
+/**
+ * Return the `path` of every document in the collection that has active = 1.
+ * The query has no ORDER BY, so no particular ordering is guaranteed.
+ */
+export function getActiveDocumentPaths(db: Database, collectionId: number): string[] {
+  const activeRows = db.prepare(`
+    SELECT path FROM documents WHERE collection_id = ? AND active = 1
+  `).all(collectionId) as { path: string }[];
+
+  const paths: string[] = [];
+  for (const { path } of activeRows) {
+    paths.push(path);
+  }
+  return paths;
+}
+
+// cleanupOrphanedContent() is declared in the "Cleanup and maintenance
+// operations" section of this file. It must not be declared again here: a
+// module cannot contain two declarations of the same exported function.
+
 // Re-export from llm.ts for backwards compatibility
 export { formatQueryForEmbedding, formatDocForEmbedding };
 
@@ -1118,6 +1359,64 @@ export function renameCollection(db: Database, collectionId: number, newName: st
     .run(newName, now, collectionId);
 }
 
+// =============================================================================
+// Context Management Operations
+// =============================================================================
+
+/**
+ * Upsert the context text for a (collection, path prefix) pair.
+ *
+ * A new row is stamped with the current time in `created_at`. When a row for
+ * the same (collection_id, path_prefix) already exists, only its `context`
+ * column is replaced; the existing `created_at` is left as it was.
+ */
+export function insertContext(db: Database, collectionId: number, pathPrefix: string, context: string): void {
+  const createdAt = new Date().toISOString();
+  const upsert = db.prepare(`
+    INSERT INTO path_contexts (collection_id, path_prefix, context, created_at)
+    VALUES (?, ?, ?, ?)
+    ON CONFLICT(collection_id, path_prefix) DO UPDATE SET context = excluded.context
+  `);
+  upsert.run(collectionId, pathPrefix, context, createdAt);
+}
+
+/**
+ * Remove the context stored for an exact (collection id, path prefix) match.
+ *
+ * @returns how many path_contexts rows were deleted.
+ */
+export function deleteContext(db: Database, collectionId: number, pathPrefix: string): number {
+  const remove = db.prepare(`
+    DELETE FROM path_contexts
+    WHERE collection_id = ? AND path_prefix = ?
+  `);
+  const { changes } = remove.run(collectionId, pathPrefix);
+  return changes;
+}
+
+/**
+ * Remove every context whose path_prefix is the empty string (the "global"
+ * contexts), across all collections.
+ *
+ * @returns how many path_contexts rows were deleted.
+ */
+export function deleteGlobalContexts(db: Database): number {
+  const removeGlobals = db.prepare(`DELETE FROM path_contexts WHERE path_prefix = ''`);
+  return removeGlobals.run().changes;
+}
+
+/**
+ * List every stored context together with the name of its collection.
+ *
+ * Rows are sorted by collection name, then by path prefix length (longest
+ * first), then by the prefix itself. Contexts are inner-joined to
+ * `collections`, so a context whose collection row is missing is not returned.
+ */
+export function listPathContexts(db: Database): { collection_name: string; path_prefix: string; context: string }[] {
+  const query = db.prepare(`
+    SELECT c.name as collection_name, pc.path_prefix, pc.context
+    FROM path_contexts pc
+    JOIN collections c ON c.id = pc.collection_id
+    ORDER BY c.name, LENGTH(pc.path_prefix) DESC, pc.path_prefix
+  `);
+  return query.all() as { collection_name: string; path_prefix: string; context: string }[];
+}
+
+/**
+ * Fetch the id and name of every row in `collections`.
+ * The query has no ORDER BY, so no particular ordering is guaranteed.
+ */
+export function getAllCollections(db: Database): { id: number; name: string }[] {
+  const collections = db.prepare(`SELECT id, name FROM collections`).all();
+  return collections as { id: number; name: string }[];
+}
+
 // =============================================================================
 // FTS Search
 // =============================================================================
@@ -1244,6 +1543,51 @@ async function getEmbedding(text: string, model: string, isQuery: boolean): Prom
   return result?.embedding || null;
 }
 
+/**
+ * Find the content that still has to be embedded.
+ *
+ * A hash qualifies when at least one active document uses it and there is no
+ * content_vectors row for that hash with seq = 0. The result holds one row per
+ * hash: the hash, the stored document body, and the smallest path
+ * (`MIN(d.path)`) among the documents sharing that hash, which serves as a
+ * representative path for display.
+ */
+export function getHashesForEmbedding(db: Database): { hash: string; body: string; path: string }[] {
+  const pending = db.prepare(`
+    SELECT d.hash, c.doc as body, MIN(d.path) as path
+    FROM documents d
+    JOIN content c ON d.hash = c.hash
+    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
+    WHERE d.active = 1 AND v.hash IS NULL
+    GROUP BY d.hash
+  `);
+  return pending.all() as { hash: string; body: string; path: string }[];
+}
+
+/**
+ * Wipe all stored embeddings so that everything can be re-embedded.
+ * First deletes every content_vectors row, then drops the vectors_vec table
+ * if it exists. This function does not recreate vectors_vec.
+ */
+export function clearAllEmbeddings(db: Database): void {
+  const wipeStatements = [`DELETE FROM content_vectors`, `DROP TABLE IF EXISTS vectors_vec`];
+  for (const sql of wipeStatements) {
+    db.exec(sql);
+  }
+}
+
+/**
+ * Store one embedding chunk.
+ *
+ * Two rows are written, in this order:
+ *   1. vectors_vec     - the vector itself, keyed by `<hash>_<seq>`
+ *   2. content_vectors - the chunk metadata (hash, seq, pos, model, embedded_at)
+ *
+ * Both statements are prepared before either one runs, and both use
+ * INSERT OR REPLACE. The two writes are not wrapped in a transaction here.
+ *
+ * NOTE(review): confirm that vectors_vec honors OR REPLACE; if it is a virtual
+ * table, conflict clauses may not behave as they do on ordinary tables.
+ */
+export function insertEmbedding(
+  db: Database,
+  hash: string,
+  seq: number,
+  pos: number,
+  embedding: Float32Array,
+  model: string,
+  embeddedAt: string
+): void {
+  const vectorWrite = db.prepare(`INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
+  const metadataWrite = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`);
+
+  // Key shared with the cleanup queries: hash and chunk sequence joined by "_".
+  const vectorKey = `${hash}_${seq}`;
+
+  vectorWrite.run(vectorKey, embedding);
+  metadataWrite.run(hash, seq, pos, model, embeddedAt);
+}
+
 // =============================================================================
 // Query expansion
 // =============================================================================