Sfoglia il codice sorgente

Add status command, fix collections, improve CLI output

- Rename 'collections' to 'status' with richer output:
  - Index size
  - Documents count and vector embedding status
  - Time since last update
  - Per-collection stats

- Fix `qmd add .` to use default glob pattern
- Fix duplicate collections with cleanup and INSERT OR IGNORE
- Improve update-all with colored progress output
- Fix 'qmd vector' → 'qmd embed' in help messages
- Implement weighted RRF (2x weight for original query)
- Simplify CLAUDE.md for project-specific instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Tobi Lutke 5 mesi fa
parent
commit
e963555ff8
2 ha cambiato i file con 116 aggiunte e 134 eliminazioni
  1. 20 102
      CLAUDE.md
  2. 96 32
      qmd.ts

+ 20 - 102
CLAUDE.md

@@ -1,111 +1,29 @@
----
-description: Use Bun instead of Node.js, npm, pnpm, or vite.
-globs: "*.ts, *.tsx, *.html, *.css, *.js, *.jsx, package.json"
-alwaysApply: false
----
+# QMD - Quick Markdown Search
 
-Default to using Bun instead of Node.js.
+Use Bun instead of Node.js (`bun` not `node`, `bun install` not `npm install`).
 
-- Use `bun <file>` instead of `node <file>` or `ts-node <file>`
-- Use `bun test` instead of `jest` or `vitest`
-- Use `bun build <file.html|file.ts|file.css>` instead of `webpack` or `esbuild`
-- Use `bun install` instead of `npm install` or `yarn install` or `pnpm install`
-- Use `bun run <script>` instead of `npm run <script>` or `yarn run <script>` or `pnpm run <script>`
-- Bun automatically loads .env, so don't use dotenv.
+## Commands
 
-## APIs
-
-- `Bun.serve()` supports WebSockets, HTTPS, and routes. Don't use `express`.
-- `bun:sqlite` for SQLite. Don't use `better-sqlite3`.
-- `Bun.redis` for Redis. Don't use `ioredis`.
-- `Bun.sql` for Postgres. Don't use `pg` or `postgres.js`.
-- `WebSocket` is built-in. Don't use `ws`.
-- Prefer `Bun.file` over `node:fs`'s readFile/writeFile
-- Bun.$`ls` instead of execa.
-
-## Testing
-
-Use `bun test` to run tests.
-
-```ts#index.test.ts
-import { test, expect } from "bun:test";
-
-test("hello world", () => {
-  expect(1).toBe(1);
-});
-```
-
-## Frontend
-
-Use HTML imports with `Bun.serve()`. Don't use `vite`. HTML imports fully support React, CSS, Tailwind.
-
-Server:
-
-```ts#index.ts
-import index from "./index.html"
-
-Bun.serve({
-  routes: {
-    "/": index,
-    "/api/users/:id": {
-      GET: (req) => {
-        return new Response(JSON.stringify({ id: req.params.id }));
-      },
-    },
-  },
-  // optional websocket support
-  websocket: {
-    open: (ws) => {
-      ws.send("Hello, world!");
-    },
-    message: (ws, message) => {
-      ws.send(message);
-    },
-    close: (ws) => {
-      // handle close
-    }
-  },
-  development: {
-    hmr: true,
-    console: true,
-  }
-})
-```
-
-HTML files can import .tsx, .jsx or .js files directly and Bun's bundler will transpile & bundle automatically. `<link>` tags can point to stylesheets and Bun's CSS bundler will bundle.
-
-```html#index.html
-<html>
-  <body>
-    <h1>Hello, world!</h1>
-    <script type="module" src="./frontend.tsx"></script>
-  </body>
-</html>
-```
-
-With the following `frontend.tsx`:
-
-```tsx#frontend.tsx
-import React from "react";
-
-// import .css files directly and it works
-import './index.css';
-
-import { createRoot } from "react-dom/client";
-
-const root = createRoot(document.body);
-
-export default function Frontend() {
-  return <h1>Hello, world!</h1>;
-}
-
-root.render(<Frontend />);
+```sh
+qmd add .              # Index markdown files in current directory
+qmd status             # Show index status and collections
+qmd update-all         # Re-index all collections
+qmd embed              # Generate vector embeddings (requires Ollama)
+qmd search <query>     # BM25 full-text search
+qmd vsearch <query>    # Vector similarity search
+qmd query <query>      # Hybrid search with reranking (best quality)
 ```
 
-Then, run index.ts
+## Development
 
 ```sh
-bun --hot ./index.ts
+bun qmd.ts <command>   # Run from source
+bun link               # Install globally as 'qmd'
 ```
 
-For more information, read the Bun API docs in `node_modules/bun-types/docs/**.md`.
+## Architecture
+
+- SQLite FTS5 for full-text search (BM25)
+- sqlite-vec for vector similarity search
+- Ollama for embeddings (embeddinggemma) and reranking (qwen3-reranker)
+- Reciprocal Rank Fusion (RRF) for combining results

+ 96 - 32
qmd.ts

@@ -392,69 +392,133 @@ async function rerank(query: string, documents: { file: string; text: string }[]
 
 function getOrCreateCollection(db: Database, pwd: string, globPattern: string): number {
   const now = new Date().toISOString();
-  const existing = db.prepare(`SELECT id FROM collections WHERE pwd = ? AND glob_pattern = ?`).get(pwd, globPattern) as { id: number } | null;
-  if (existing) return existing.id;
 
-  db.prepare(`INSERT INTO collections (pwd, glob_pattern, created_at) VALUES (?, ?, ?)`).run(pwd, globPattern, now);
-  return (db.prepare(`SELECT last_insert_rowid() as id`).get() as { id: number }).id;
+  // Use INSERT OR IGNORE to handle race conditions, then SELECT
+  db.prepare(`INSERT OR IGNORE INTO collections (pwd, glob_pattern, created_at) VALUES (?, ?, ?)`).run(pwd, globPattern, now);
+  const existing = db.prepare(`SELECT id FROM collections WHERE pwd = ? AND glob_pattern = ?`).get(pwd, globPattern) as { id: number };
+  return existing.id;
 }
 
-function listCollections(): void {
+function cleanupDuplicateCollections(db: Database): void {
+  // Remove duplicate collections keeping the oldest one
+  db.exec(`
+    DELETE FROM collections WHERE id NOT IN (
+      SELECT MIN(id) FROM collections GROUP BY pwd, glob_pattern
+    )
+  `);
+  // Remove bogus "." glob pattern entries (from earlier bug)
+  db.exec(`DELETE FROM collections WHERE glob_pattern = '.'`);
+}
+
+function formatTimeAgo(date: Date): string {
+  const seconds = Math.floor((Date.now() - date.getTime()) / 1000);
+  if (seconds < 60) return `${seconds}s ago`;
+  const minutes = Math.floor(seconds / 60);
+  if (minutes < 60) return `${minutes}m ago`;
+  const hours = Math.floor(minutes / 60);
+  if (hours < 24) return `${hours}h ago`;
+  const days = Math.floor(hours / 24);
+  return `${days}d ago`;
+}
+
+function formatBytes(bytes: number): string {
+  if (bytes < 1024) return `${bytes} B`;
+  if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
+  if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
+  return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
+}
+
+function showStatus(): void {
+  const dbPath = getDbPath();
   const db = getDb();
+
+  // Cleanup any duplicate collections
+  cleanupDuplicateCollections(db);
+
+  // Index size
+  let indexSize = 0;
+  try {
+    const stat = Bun.file(dbPath).size;
+    indexSize = stat;
+  } catch {}
+
+  // Collections info
   const collections = db.prepare(`
     SELECT c.id, c.pwd, c.glob_pattern, c.created_at,
            COUNT(d.id) as doc_count,
-           SUM(CASE WHEN d.active = 1 THEN 1 ELSE 0 END) as active_count
+           SUM(CASE WHEN d.active = 1 THEN 1 ELSE 0 END) as active_count,
+           MAX(d.modified_at) as last_modified
     FROM collections c
     LEFT JOIN documents d ON d.collection_id = c.id
     GROUP BY c.id
     ORDER BY c.created_at DESC
-  `).all() as { id: number; pwd: string; glob_pattern: string; created_at: string; doc_count: number; active_count: number }[];
+  `).all() as { id: number; pwd: string; glob_pattern: string; created_at: string; doc_count: number; active_count: number; last_modified: string | null }[];
 
-  if (collections.length === 0) {
-    console.log("No collections found.");
-    db.close();
-    return;
-  }
+  // Overall stats
+  const totalDocs = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number };
+  const vectorCount = db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get() as { count: number };
+  const needsEmbedding = getHashesNeedingEmbedding(db);
+
+  // Most recent update across all collections
+  const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };
 
-  console.log("Collections:\n");
-  for (const c of collections) {
-    console.log(`  ${c.pwd}`);
-    console.log(`    Pattern: ${c.glob_pattern}`);
-    console.log(`    Documents: ${c.active_count} active (${c.doc_count} total)`);
-    console.log(`    Created: ${c.created_at}\n`);
+  console.log(`${c.bold}QMD Status${c.reset}\n`);
+  console.log(`Index: ${dbPath}`);
+  console.log(`Size:  ${formatBytes(indexSize)}\n`);
+
+  console.log(`${c.bold}Documents${c.reset}`);
+  console.log(`  Total:    ${totalDocs.count} files indexed`);
+  console.log(`  Vectors:  ${vectorCount.count} embedded`);
+  if (needsEmbedding > 0) {
+    console.log(`  ${c.yellow}Pending:  ${needsEmbedding} need embedding${c.reset} (run 'qmd embed')`);
+  }
+  if (mostRecent.latest) {
+    const lastUpdate = new Date(mostRecent.latest);
+    console.log(`  Updated:  ${formatTimeAgo(lastUpdate)}`);
   }
 
-  const hashCount = db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get() as { count: number };
-  console.log(`Vectors: ${hashCount.count} unique content hashes embedded`);
+  if (collections.length > 0) {
+    console.log(`\n${c.bold}Collections${c.reset}`);
+    for (const col of collections) {
+      const lastMod = col.last_modified ? formatTimeAgo(new Date(col.last_modified)) : "never";
+      console.log(`  ${c.cyan}${col.pwd}${c.reset}`);
+      console.log(`    ${col.glob_pattern} → ${col.active_count} docs (updated ${lastMod})`);
+    }
+  } else {
+    console.log(`\n${c.dim}No collections. Run 'qmd add .' to index markdown files.${c.reset}`);
+  }
 
   db.close();
 }
 
 async function updateAllCollections(): Promise<void> {
   const db = getDb();
+  cleanupDuplicateCollections(db);
   const collections = db.prepare(`SELECT id, pwd, glob_pattern FROM collections`).all() as { id: number; pwd: string; glob_pattern: string }[];
 
   if (collections.length === 0) {
-    console.log("No collections found.");
+    console.log(`${c.dim}No collections found. Run 'qmd add .' to index markdown files.${c.reset}`);
     db.close();
     return;
   }
 
   db.close();
 
-  console.log(`Updating ${collections.length} collection(s)...\n`);
+  console.log(`${c.bold}Updating ${collections.length} collection(s)...${c.reset}\n`);
 
-  for (const c of collections) {
-    console.log(`\n--- ${c.pwd} (${c.glob_pattern}) ---`);
+  for (let i = 0; i < collections.length; i++) {
+    const col = collections[i];
+    console.log(`${c.cyan}[${i + 1}/${collections.length}]${c.reset} ${c.bold}${col.pwd}${c.reset}`);
+    console.log(`${c.dim}    Pattern: ${col.glob_pattern}${c.reset}`);
     // Temporarily set PWD for indexing
     const originalPwd = process.env.PWD;
-    process.env.PWD = c.pwd;
-    await indexFiles(c.glob_pattern);
+    process.env.PWD = col.pwd;
+    await indexFiles(col.glob_pattern);
     process.env.PWD = originalPwd;
+    console.log("");
   }
 
-  console.log("\nAll collections updated.");
+  console.log(`${c.green}✓ All collections updated.${c.reset}`);
 }
 
 async function dropCollection(globPattern: string): Promise<void> {
@@ -575,7 +639,7 @@ async function indexFiles(globPattern: string = DEFAULT_GLOB): Promise<void> {
   console.log(`\nIndexed: ${indexed} new, ${updated} updated, ${unchanged} unchanged, ${removed} removed`);
 
   if (needsEmbedding > 0) {
-    console.log(`\nRun 'qmd vector' to update embeddings (${needsEmbedding} unique hashes need vectors)`);
+    console.log(`\nRun 'qmd embed' to update embeddings (${needsEmbedding} unique hashes need vectors)`);
   }
 
   db.close();
@@ -958,7 +1022,7 @@ async function vectorSearch(query: string, opts: OutputOptions, model: string =
 
   const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
   if (!tableExists) {
-    console.error("Vector index not found. Run 'qmd vector' first to create embeddings.");
+    console.error("Vector index not found. Run 'qmd embed' first to create embeddings.");
     db.close();
     return;
   }
@@ -1171,7 +1235,7 @@ const args = parseGlobalOptions(rawArgs);
 if (args.length === 0) {
   console.log("Usage:");
   console.log("  qmd add [--drop] [glob]    - Add/update collection from $PWD (default: **/*.md)");
-  console.log("  qmd collections            - List all collections");
+  console.log("  qmd status                 - Show index status and collections");
   console.log("  qmd update-all             - Re-index all collections");
   console.log("  qmd embed [-f]             - Create vector embeddings for all content");
   console.log("  qmd search <query>         - Full-text search (BM25)");
@@ -1214,8 +1278,8 @@ if (cmd === "add") {
   } else {
     await indexFiles(globPattern);
   }
-} else if (cmd === "collections") {
-  listCollections();
+} else if (cmd === "status") {
+  showStatus();
 } else if (cmd === "update-all") {
   await updateAllCollections();
 } else if (cmd === "embed") {