Pārlūkot izejas kodu

Add content-addressable storage tests and fix test helpers

- Updated createTestCollection() to support new schema with name field
- Updated insertTestDocument() to match new schema (path, not filepath)
- Content is now stored in separate content table with hash deduplication
- Added comprehensive test suite for content-addressable storage:
  * Same content gets same hash from multiple collections
  * Removing one collection preserves shared content
  * Deduplication works across many collections
  * Different content gets different hashes
- All 4 content-addressable tests passing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
Tobi Lutke 5 mēneši atpakaļ
vecāks
revīzija
d4f971c230
2 mainīti faili ar 213 papildinājumiem un 21 dzēšanām
  1. 2 1
      .beads/issues.jsonl
  2. 211 20
      store.test.ts

+ 2 - 1
.beads/issues.jsonl

@@ -3,10 +3,11 @@
 {"id":"qmd-ama","title":"Refactor database system","description":"All documents should be stored as content addressable hash, e.g. hash, doc, created_at,\n┃ updated_at. documents should be a file system layer on top e.g. collection, path, hash,\n┃ created_at, updated_at. (collection,path)\n┃\n┃\n\n┃ All documents should be stored as content addressable hash, e.g. hash, doc, created_at,\n┃ updated_at. documents should be a file system layer on top e.g. collection_id, path, hash,\n┃ created_at, updated_at. (collection,path) is unique. There is also collection which stores PWD\n┃ + glob pattern, name (\\w+). Every document is treated as path qmd://collection.name/","notes":"## Completed\n- ✅ Implemented content-addressable storage (content table with hash→doc mapping)\n- ✅ Refactored documents table as file system layer (collection_id, path, hash)\n- ✅ Added collection names (e.g., \"pages\", \"journals\", \"archive\")\n- ✅ Implemented virtual paths (qmd://collection-name/path/to/file.md)\n- ✅ Added hierarchical context support (collection-scoped)\n- ✅ Successfully migrated existing database\n- ✅ Updated search functions to work with new schema\n- ✅ Updated indexing logic to use content-addressable storage\n- ✅ Orphaned content hash cleanup\n\n## Still TODO\n- Fix migration SQL to properly extract basename (currently needs manual fix)\n- Implement `qmd collection add . --name \u003cname\u003e --mask '**/*.md'`\n- Implement `qmd ls [path]` for exploring virtual file tree","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-10T10:57:35.497489-05:00","updated_at":"2025-12-12T15:39:48.879143-05:00","closed_at":"2025-12-12T15:39:48.879143-05:00"}
 {"id":"qmd-bx1","title":"Fix migration SQL for proper basename extraction","description":"The migration currently generates collection names incorrectly (uses full path instead of basename). Need to fix the SQL in migrateToContentAddressable to properly extract the directory basename.","status":"closed","priority":1,"issue_type":"bug","created_at":"2025-12-12T15:29:53.757723-05:00","updated_at":"2025-12-12T15:50:29.349134-05:00","closed_at":"2025-12-12T15:50:29.349134-05:00","dependencies":[{"issue_id":"qmd-bx1","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.758524-05:00","created_by":"daemon"}]}
 {"id":"qmd-c0m","title":"Comprehensive CLI review and consistency pass","description":"Review entire CLI command structure:\n- Consistent naming (add vs create, remove vs delete)\n- Consistent flag usage (--name, --mask, etc)\n- Update help text for all commands\n- Ensure virtual paths work everywhere\n- Test all commands end-to-end","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-12T15:29:38.083564-05:00","updated_at":"2025-12-12T16:06:51.544695-05:00","closed_at":"2025-12-12T16:06:51.544695-05:00"}
+{"id":"qmd-clr","title":"fix embed","description":"","status":"open","priority":2,"issue_type":"task","created_at":"2025-12-12T16:14:55.292114-05:00","updated_at":"2025-12-12T16:14:55.292114-05:00"}
 {"id":"qmd-deh","title":"Refactor database introduce qmd collection *","description":"","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-10T10:56:04.516137-05:00","updated_at":"2025-12-12T16:12:12.349428-05:00","closed_at":"2025-12-12T16:12:12.349428-05:00"}
 {"id":"qmd-dmi","title":"Implement 'qmd collection' commands","description":"Add explicit collection management:\n- qmd collection add . --name \u003cname\u003e --mask '**/*.md'\n- qmd collection list\n- qmd collection remove \u003cname\u003e\n\nThis gives users control over collection names and patterns.","status":"closed","priority":1,"issue_type":"feature","created_at":"2025-12-12T15:29:53.810666-05:00","updated_at":"2025-12-12T16:02:08.079158-05:00","closed_at":"2025-12-12T16:02:08.079158-05:00","dependencies":[{"issue_id":"qmd-dmi","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.811294-05:00","created_by":"daemon"}]}
 {"id":"qmd-e2c","title":"Implement 'qmd ls' command","description":"Add command to explore virtual file tree:\n- qmd ls → list all collections\n- qmd ls \u003ccollection\u003e → list files in collection\n- qmd ls \u003ccollection\u003e/\u003cpath\u003e → list files under path\nOutput: flat list of qmd:// paths","status":"closed","priority":1,"issue_type":"feature","created_at":"2025-12-12T15:29:53.859804-05:00","updated_at":"2025-12-12T15:55:12.777701-05:00","closed_at":"2025-12-12T15:55:12.777701-05:00","dependencies":[{"issue_id":"qmd-e2c","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.860535-05:00","created_by":"daemon"}]}
-{"id":"qmd-j9z","title":"Add unit tests for content addressable hashes","description":"add same file from multiple places and verify that they both point at same hash. drop one collection and the content stays.","status":"open","priority":3,"issue_type":"task","created_at":"2025-12-12T15:39:15.459504-05:00","updated_at":"2025-12-12T15:39:15.459504-05:00"}
+{"id":"qmd-j9z","title":"Add unit tests for content addressable hashes","description":"add same file from multiple places and verify that they both point at same hash. drop one collection and the content stays.","status":"closed","priority":3,"issue_type":"task","created_at":"2025-12-12T15:39:15.459504-05:00","updated_at":"2025-12-12T16:21:35.473776-05:00","closed_at":"2025-12-12T16:21:35.473776-05:00"}
 {"id":"qmd-p1h","title":"Create collection add|remove","description":"","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-10T10:57:00.717864-05:00","updated_at":"2025-12-12T16:12:00.557003-05:00","closed_at":"2025-12-12T16:12:00.557003-05:00"}
 {"id":"qmd-rhd","title":"Fix 'qmd status' output for new schema","description":"Update status to show collections by name, cleaner context display, virtual path examples.","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-12T15:29:54.020596-05:00","updated_at":"2025-12-12T16:13:28.08389-05:00","closed_at":"2025-12-12T16:13:28.08389-05:00","dependencies":[{"issue_id":"qmd-rhd","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:54.021095-05:00","created_by":"daemon"}]}
 {"id":"qmd-s1y","title":"Update 'qmd add-context' for collection scoping","description":"Update add-context to work with collection-scoped contexts using new path_contexts schema.","notes":"Refactoring to:\n- qmd context add [path] \"text\" (defaults to current collection if in one)\n- qmd context list\n- qmd context rm \u003cpath\u003e\n- Support \"/\" for global/system context\n- Auto-detect collection from pwd","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-12T15:29:54.076582-05:00","updated_at":"2025-12-12T15:37:47.683263-05:00","closed_at":"2025-12-12T15:37:47.683263-05:00"}

+ 211 - 20
store.test.ts

@@ -142,43 +142,57 @@ async function cleanupTestDb(store: Store): Promise<void> {
 }
 
 // Helper to insert a test document directly into the database
-function insertTestDocument(
+async function insertTestDocument(
   db: Database,
   collectionId: number,
   opts: {
     name?: string;
     title?: string;
     hash?: string;
-    filepath?: string;
     displayPath?: string;
     body?: string;
     active?: number;
   }
-): number {
+): Promise<number> {
   const now = new Date().toISOString();
   const name = opts.name || "test-doc";
   const title = opts.title || "Test Document";
-  const hash = opts.hash || `hash-${Date.now()}-${Math.random().toString(36).slice(2)}`;
-  const filepath = opts.filepath || `/test/path/${name}.md`;
-  const displayPath = opts.displayPath || `test/${name}.md`;
+  const path = opts.displayPath || `test/${name}.md`;
   const body = opts.body || "# Test Document\n\nThis is test content.";
   const active = opts.active ?? 1;
 
+  // Generate hash from body if not provided
+  const hash = opts.hash || await hashContent(body);
+
+  // Insert content (with OR IGNORE for deduplication)
+  db.prepare(`
+    INSERT OR IGNORE INTO content (hash, doc, created_at)
+    VALUES (?, ?, ?)
+  `).run(hash, body, now);
+
+  // Insert document
   const result = db.prepare(`
-    INSERT INTO documents (collection_id, name, title, hash, filepath, display_path, body, created_at, modified_at, active)
-    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
-  `).run(collectionId, name, title, hash, filepath, displayPath, body, now, now, active);
+    INSERT INTO documents (collection_id, path, title, hash, created_at, modified_at, active)
+    VALUES (?, ?, ?, ?, ?, ?, ?)
+  `).run(collectionId, path, title, hash, now, now, active);
 
   return Number(result.lastInsertRowid);
 }
 
 // Helper to create a test collection
-function createTestCollection(db: Database, pwd: string = "/test/collection", glob: string = "**/*.md"): number {
+function createTestCollection(
+  db: Database,
+  options: { pwd?: string; glob?: string; name?: string } = {}
+): number {
+  const pwd = options.pwd || "/test/collection";
+  const glob = options.glob || "**/*.md";
+  const name = options.name || pwd.split('/').filter(Boolean).pop() || 'test';
   const now = new Date().toISOString();
+
   const result = db.prepare(`
-    INSERT INTO collections (pwd, glob_pattern, created_at)
-    VALUES (?, ?, ?)
-  `).run(pwd, glob, now);
+    INSERT INTO collections (name, pwd, glob_pattern, created_at, updated_at)
+    VALUES (?, ?, ?, ?, ?)
+  `).run(name, pwd, glob, now, now);
   return Number(result.lastInsertRowid);
 }
 
@@ -522,7 +536,7 @@ describe("Path Context", () => {
 describe("Collections", () => {
   test("getCollectionIdByName finds collection by path suffix", async () => {
     const store = await createTestStore();
-    const collectionId = createTestCollection(store.db, "/home/user/projects/myapp", "**/*.md");
+    const collectionId = createTestCollection(store.db, { pwd: "/home/user/projects/myapp", glob: "**/*.md" });
 
     const found = store.getCollectionIdByName("myapp");
     expect(found).toBe(collectionId);
@@ -624,8 +638,8 @@ describe("FTS Search", () => {
 
   test("searchFTS filters by collectionId", async () => {
     const store = await createTestStore();
-    const collection1 = createTestCollection(store.db, "/path/one", "**/*.md");
-    const collection2 = createTestCollection(store.db, "/path/two", "**/*.md");
+    const collection1 = createTestCollection(store.db, { pwd: "/path/one", glob: "**/*.md" });
+    const collection2 = createTestCollection(store.db, { pwd: "/path/two", glob: "**/*.md" });
 
     insertTestDocument(store.db, collection1, {
       name: "doc1",
@@ -1272,7 +1286,7 @@ describe("Index Status", () => {
 
   test("getStatus reports collection info", async () => {
     const store = await createTestStore();
-    const collectionId = createTestCollection(store.db, "/test/path", "**/*.md");
+    const collectionId = createTestCollection(store.db, { pwd: "/test/path", glob: "**/*.md" });
     insertTestDocument(store.db, collectionId, { name: "doc1" });
 
     const status = store.getStatus();
@@ -1439,7 +1453,7 @@ describe("Vector Table", () => {
 describe("Integration", () => {
   test("full document lifecycle: create, search, retrieve", async () => {
     const store = await createTestStore();
-    const collectionId = createTestCollection(store.db, "/test/notes", "**/*.md");
+    const collectionId = createTestCollection(store.db, { pwd: "/test/notes", glob: "**/*.md" });
 
     // Add context
     addPathContext(store.db, "/test/notes", "Personal notes");
@@ -1491,8 +1505,8 @@ describe("Integration", () => {
     const store1 = await createTestStore();
     const store2 = await createTestStore();
 
-    const col1 = createTestCollection(store1.db, "/store1", "**/*.md");
-    const col2 = createTestCollection(store2.db, "/store2", "**/*.md");
+    const col1 = createTestCollection(store1.db, { pwd: "/store1", glob: "**/*.md" });
+    const col2 = createTestCollection(store2.db, { pwd: "/store2", glob: "**/*.md" });
 
     insertTestDocument(store1.db, col1, {
       name: "doc1",
@@ -1806,3 +1820,180 @@ describe("Edge Cases", () => {
     await cleanupTestDb(store);
   });
 });
+
+// =============================================================================
+// Content-Addressable Storage Tests
+// =============================================================================
+
+describe("Content-Addressable Storage", () => {
+  test("same content gets same hash from multiple collections", async () => {
+    const store = await createTestStore();
+
+    // Create two collections
+    const collection1 = createTestCollection(store.db, { pwd: "/path/collection1" });
+    const collection2 = createTestCollection(store.db, { pwd: "/path/collection2" });
+
+    // Add same content to both collections
+    const content = "# Same Content\n\nThis is the same content in two places.";
+    const hash1 = await hashContent(content);
+
+    const doc1 = await insertTestDocument(store.db, collection1, {
+      name: "doc1",
+      body: content,
+      displayPath: "doc1.md",
+    });
+
+    const doc2 = await insertTestDocument(store.db, collection2, {
+      name: "doc2",
+      body: content,
+      displayPath: "doc2.md",
+    });
+
+    // Both should have the same hash
+    const hash1Db = store.db.prepare(`SELECT hash FROM documents WHERE id = ?`).get(doc1) as { hash: string };
+    const hash2Db = store.db.prepare(`SELECT hash FROM documents WHERE id = ?`).get(doc2) as { hash: string };
+
+    expect(hash1Db.hash).toBe(hash2Db.hash);
+    expect(hash1Db.hash).toBe(hash1);
+
+    // There should only be one entry in the content table
+    const contentCount = store.db.prepare(`SELECT COUNT(*) as count FROM content WHERE hash = ?`).get(hash1) as { count: number };
+    expect(contentCount.count).toBe(1);
+
+    await cleanupTestDb(store);
+  });
+
+  test("removing one collection preserves content used by another", async () => {
+    const store = await createTestStore();
+
+    // Create two collections
+    const collection1 = createTestCollection(store.db, { pwd: "/path/collection1" });
+    const collection2 = createTestCollection(store.db, { pwd: "/path/collection2" });
+
+    // Add same content to both collections
+    const sharedContent = "# Shared Content\n\nThis is shared.";
+    const sharedHash = await hashContent(sharedContent);
+
+    await insertTestDocument(store.db, collection1, {
+      name: "shared1",
+      body: sharedContent,
+      displayPath: "shared1.md",
+    });
+
+    await insertTestDocument(store.db, collection2, {
+      name: "shared2",
+      body: sharedContent,
+      displayPath: "shared2.md",
+    });
+
+    // Add unique content to collection1
+    const uniqueContent = "# Unique Content\n\nThis is unique to collection1.";
+    const uniqueHash = await hashContent(uniqueContent);
+
+    await insertTestDocument(store.db, collection1, {
+      name: "unique",
+      body: uniqueContent,
+      displayPath: "unique.md",
+    });
+
+    // Verify both hashes exist in content table
+    const sharedExists1 = store.db.prepare(`SELECT hash FROM content WHERE hash = ?`).get(sharedHash);
+    const uniqueExists1 = store.db.prepare(`SELECT hash FROM content WHERE hash = ?`).get(uniqueHash);
+    expect(sharedExists1).toBeTruthy();
+    expect(uniqueExists1).toBeTruthy();
+
+    // Remove collection1 (this should NOT remove shared content)
+    store.db.prepare(`DELETE FROM documents WHERE collection_id = ?`).run(collection1);
+    store.db.prepare(`DELETE FROM collections WHERE id = ?`).run(collection1);
+
+    // Clean up orphaned content (mimics what the CLI does)
+    store.db.prepare(`
+      DELETE FROM content
+      WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
+    `).run();
+
+    // Shared content should still exist (used by collection2)
+    const sharedExists2 = store.db.prepare(`SELECT hash FROM content WHERE hash = ?`).get(sharedHash);
+    expect(sharedExists2).toBeTruthy();
+
+    // Unique content should be removed (only used by collection1)
+    const uniqueExists2 = store.db.prepare(`SELECT hash FROM content WHERE hash = ?`).get(uniqueHash);
+    expect(uniqueExists2).toBeFalsy();
+
+    await cleanupTestDb(store);
+  });
+
+  test("deduplicates content across many collections", async () => {
+    const store = await createTestStore();
+
+    const sharedContent = "# Common Header\n\nThis appears everywhere.";
+    const sharedHash = await hashContent(sharedContent);
+
+    // Create 5 collections with the same content
+    const collectionIds = [];
+    for (let i = 0; i < 5; i++) {
+      const collId = createTestCollection(store.db, { pwd: `/path/collection${i}` });
+      collectionIds.push(collId);
+
+      await insertTestDocument(store.db, collId, {
+        name: `doc${i}`,
+        body: sharedContent,
+        displayPath: `doc${i}.md`,
+      });
+    }
+
+    // Should have 5 documents
+    const docCount = store.db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number };
+    expect(docCount.count).toBe(5);
+
+    // But only 1 content entry
+    const contentCount = store.db.prepare(`SELECT COUNT(*) as count FROM content WHERE hash = ?`).get(sharedHash) as { count: number };
+    expect(contentCount.count).toBe(1);
+
+    // All documents should point to the same hash
+    const hashes = store.db.prepare(`SELECT DISTINCT hash FROM documents WHERE active = 1`).all() as { hash: string }[];
+    expect(hashes).toHaveLength(1);
+    expect(hashes[0].hash).toBe(sharedHash);
+
+    await cleanupTestDb(store);
+  });
+
+  test("different content gets different hashes", async () => {
+    const store = await createTestStore();
+    const collectionId = createTestCollection(store.db);
+
+    const content1 = "# Content One";
+    const content2 = "# Content Two";
+    const hash1 = await hashContent(content1);
+    const hash2 = await hashContent(content2);
+
+    // Hashes should be different
+    expect(hash1).not.toBe(hash2);
+
+    const doc1 = await insertTestDocument(store.db, collectionId, {
+      name: "doc1",
+      body: content1,
+      displayPath: "doc1.md",
+    });
+
+    const doc2 = await insertTestDocument(store.db, collectionId, {
+      name: "doc2",
+      body: content2,
+      displayPath: "doc2.md",
+    });
+
+    // Each document row should reference the hash of its own content
+    const hash1Db = store.db.prepare(`SELECT hash FROM documents WHERE id = ?`).get(doc1) as { hash: string };
+    const hash2Db = store.db.prepare(`SELECT hash FROM documents WHERE id = ?`).get(doc2) as { hash: string };
+
+    expect(hash1Db.hash).toBe(hash1);
+    expect(hash2Db.hash).toBe(hash2);
+    expect(hash1Db.hash).not.toBe(hash2Db.hash);
+
+    // Should have 2 entries in content table
+    const contentCount = store.db.prepare(`SELECT COUNT(*) as count FROM content`).get() as { count: number };
+    expect(contentCount.count).toBe(2);
+
+    await cleanupTestDb(store);
+  });
+});