index.ts 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541
  1. /**
  2. * QMD SDK - Library mode for programmatic access to QMD search and indexing.
  3. *
  4. * Usage:
  5. * import { createStore } from '@tobilu/qmd'
  6. *
  7. * const store = await createStore({
  8. * dbPath: './my-index.sqlite',
  9. * config: {
  10. * collections: {
  11. * docs: { path: '/path/to/docs', pattern: '**\/*.md' }
  12. * }
  13. * }
  14. * })
  15. *
  16. * const results = await store.search({ query: "how does auth work?" })
  17. * await store.close()
  18. */
  19. import {
  20. createStore as createStoreInternal,
  21. hybridQuery,
  22. structuredSearch,
  23. extractSnippet,
  24. addLineNumbers,
  25. DEFAULT_EMBED_MODEL,
  26. DEFAULT_MULTI_GET_MAX_BYTES,
  27. reindexCollection,
  28. generateEmbeddings,
  29. listCollections as storeListCollections,
  30. syncConfigToDb,
  31. getStoreCollections,
  32. getStoreCollection,
  33. getStoreGlobalContext,
  34. getStoreContexts,
  35. upsertStoreCollection,
  36. deleteStoreCollection,
  37. renameStoreCollection,
  38. updateStoreContext,
  39. removeStoreContext,
  40. setStoreGlobalContext,
  41. vacuumDatabase,
  42. cleanupOrphanedContent,
  43. cleanupOrphanedVectors,
  44. deleteLLMCache,
  45. deleteInactiveDocuments,
  46. clearAllEmbeddings,
  47. type Store as InternalStore,
  48. type DocumentResult,
  49. type DocumentNotFound,
  50. type SearchResult,
  51. type HybridQueryResult,
  52. type HybridQueryOptions,
  53. type HybridQueryExplain,
  54. type ExpandedQuery,
  55. type StructuredSearchOptions,
  56. type MultiGetResult,
  57. type IndexStatus,
  58. type IndexHealthInfo,
  59. type SearchHooks,
  60. type ReindexProgress,
  61. type ReindexResult,
  62. type EmbedProgress,
  63. type EmbedResult,
  64. type ChunkStrategy,
  65. } from "./store.js";
  66. import {
  67. LlamaCpp,
  68. } from "./llm.js";
  69. import {
  70. setConfigSource,
  71. loadConfig,
  72. addCollection as collectionsAddCollection,
  73. removeCollection as collectionsRemoveCollection,
  74. renameCollection as collectionsRenameCollection,
  75. addContext as collectionsAddContext,
  76. removeContext as collectionsRemoveContext,
  77. setGlobalContext as collectionsSetGlobalContext,
  78. type Collection,
  79. type CollectionConfig,
  80. type NamedCollection,
  81. type ContextMap,
  82. } from "./collections.js";
  83. // Re-export types for SDK consumers
  84. export type {
  85. DocumentResult,
  86. DocumentNotFound,
  87. SearchResult,
  88. HybridQueryResult,
  89. HybridQueryOptions,
  90. HybridQueryExplain,
  91. ExpandedQuery,
  92. StructuredSearchOptions,
  93. MultiGetResult,
  94. IndexStatus,
  95. IndexHealthInfo,
  96. SearchHooks,
  97. ReindexProgress,
  98. ReindexResult,
  99. EmbedProgress,
  100. EmbedResult,
  101. Collection,
  102. CollectionConfig,
  103. NamedCollection,
  104. ContextMap,
  105. };
  106. // Re-export the internal Store type for advanced consumers
  107. export type { InternalStore };
  108. // Re-export utility functions and types used by frontends
  109. export { extractSnippet, addLineNumbers, DEFAULT_MULTI_GET_MAX_BYTES };
  110. export type { ChunkStrategy } from "./store.js";
  111. // Re-export getDefaultDbPath for CLI/MCP that need the default database location
  112. export { getDefaultDbPath } from "./store.js";
  113. // Re-export Maintenance class for CLI housekeeping operations
  114. export { Maintenance } from "./maintenance.js";
  /**
   * Progress event delivered to the `onProgress` callback of `update()`.
   *
   * `createStore` builds each event by adding the collection name to the
   * progress payload it receives from `reindexCollection`.
   */
  export type UpdateProgress = {
    /** Name of the collection currently being re-indexed. */
    collection: string;
    /** File the event refers to (forwarded from the re-index progress payload). */
    file: string;
    /** Progress counter for the current collection — presumably 1-based; confirm against `reindexCollection`. */
    current: number;
    /** Total the counter runs towards — presumably the number of files in the collection; confirm against `reindexCollection`. */
    total: number;
  };
  /**
   * Summary returned by `update()`: the per-collection re-index results added
   * together, plus the embedding backlog reported by the store afterwards.
   */
  export type UpdateResult = {
    /** Number of collections that were re-indexed by this call. */
    collections: number;
    /** Sum of the `indexed` counts reported for each collection. */
    indexed: number;
    /** Sum of the `updated` counts reported for each collection. */
    updated: number;
    /** Sum of the `unchanged` counts reported for each collection. */
    unchanged: number;
    /** Sum of the `removed` counts reported for each collection. */
    removed: number;
    /** Value of the store's `getHashesNeedingEmbedding()` once re-indexing has finished. */
    needsEmbedding: number;
  };
  /**
   * Options accepted by the unified `search()` method.
   *
   * At least one of `query` or `queries` must be supplied. When `queries` is
   * present it takes precedence and `query` is not used.
   */
  export interface SearchOptions {
    /** Plain query string; it is expanded automatically via the LLM. */
    query?: string;
    /** Queries that were already expanded (see `expandQuery`); auto-expansion is skipped. */
    queries?: ExpandedQuery[];
    /** Domain intent hint that steers expansion and reranking. */
    intent?: string;
    /** Rerank results with the LLM. Reranking is on unless this is exactly `false`. */
    rerank?: boolean;
    /** Restrict the search to a single collection. */
    collection?: string;
    /**
     * Restrict the search to several collections; combined with `collection`.
     *
     * NOTE: when searching with `query` (rather than `queries`), `createStore`
     * forwards only the first of the combined collections to the hybrid search.
     */
    collections?: string[];
    /** Maximum number of results (default: 10). */
    limit?: number;
    /** Results scoring below this threshold are dropped. */
    minScore?: number;
    /** Attach explain traces to the results. */
    explain?: boolean;
    /** Chunk strategy: "auto" (default, uses AST for code files) or "regex" (legacy). */
    chunkStrategy?: ChunkStrategy;
  }
  /**
   * Optional settings for `searchLex()`, the BM25 keyword search.
   * Both fields are passed straight through to the store's full-text search.
   */
  export interface LexSearchOptions {
    /** Maximum number of results to return. */
    limit?: number;
    /** Restrict the search to this collection. */
    collection?: string;
  }
  /**
   * Optional settings for `searchVector()`, the vector similarity search.
   * Both fields are passed straight through to the store's vector search.
   */
  export interface VectorSearchOptions {
    /** Maximum number of results to return. */
    limit?: number;
    /** Restrict the search to this collection. */
    collection?: string;
  }
  /**
   * Optional settings for `expandQuery()`, the manual query expansion.
   */
  export interface ExpandQueryOptions {
    /** Domain intent hint forwarded to the store's query expansion. */
    intent?: string;
  }
  /**
   * Options for `createStore()`.
   *
   * `dbPath` is mandatory. A configuration may come from a YAML file
   * (`configPath`) or from an inline object (`config`), but not from both:
   * `createStore` throws when both are given. With neither, the store works
   * from the state already recorded in the database, which is useful for
   * reopening a store that was configured earlier.
   */
  export interface StoreOptions {
    /** Path to the SQLite database file. */
    dbPath: string;
    /** Path to a YAML config file (mutually exclusive with `config`). */
    configPath?: string;
    /** Inline collection config (mutually exclusive with `configPath`). */
    config?: CollectionConfig;
  }
  /**
   * The QMD SDK store: search, document retrieval, collection management,
   * context management and indexing behind a single object.
   *
   * Every method returns a Promise. Each store created by `createStore` owns a
   * separate LlamaCpp instance, which loads models lazily and unloads them after
   * a period of inactivity.
   *
   * NOTE(review): the config source used for write-through is registered via
   * `setConfigSource`, which looks like module-level state shared by all stores;
   * confirm before running several config-backed stores side by side.
   */
  export interface QMDStore {
    /** The underlying internal store (for advanced use). */
    readonly internal: InternalStore;

    /** Path to the SQLite database. */
    readonly dbPath: string;

    // ── Search ──────────────────────────────────────────────────────────

    /**
     * Full search: query expansion + multi-signal retrieval + LLM reranking.
     * Rejects when neither `query` nor `queries` is supplied.
     */
    search(options: SearchOptions): Promise<Array<HybridQueryResult>>;

    /** BM25 keyword search (fast, no LLM). */
    searchLex(query: string, options?: LexSearchOptions): Promise<Array<SearchResult>>;

    /** Vector similarity search (embedding model, no reranking). */
    searchVector(query: string, options?: VectorSearchOptions): Promise<Array<SearchResult>>;

    /** Expand a query into typed sub-searches (lex/vec/hyde) for manual control. */
    expandQuery(query: string, options?: ExpandQueryOptions): Promise<Array<ExpandedQuery>>;

    // ── Document Retrieval ──────────────────────────────────────────────

    /** Look up a single document by path or docid. */
    get(
      pathOrDocid: string,
      options?: { includeBody?: boolean },
    ): Promise<DocumentResult | DocumentNotFound>;

    /**
     * Fetch the body of a document, optionally limited to a line range.
     * Resolves to `null` when the document lookup reports an error.
     */
    getDocumentBody(
      pathOrDocid: string,
      opts?: { fromLine?: number; maxLines?: number },
    ): Promise<string | null>;

    /** Fetch several documents by glob pattern or comma-separated list. */
    multiGet(
      pattern: string,
      options?: { includeBody?: boolean; maxBytes?: number },
    ): Promise<{ docs: Array<MultiGetResult>; errors: Array<string> }>;

    // ── Collection Management ───────────────────────────────────────────

    /** Add a collection, or update it if the name already exists. */
    addCollection(
      name: string,
      opts: { path: string; pattern?: string; ignore?: string[] },
    ): Promise<void>;

    /** Remove a collection; resolves to the result of the database delete. */
    removeCollection(name: string): Promise<boolean>;

    /** Rename a collection; resolves to the result of the database rename. */
    renameCollection(oldName: string, newName: string): Promise<boolean>;

    /** List every collection together with its document statistics. */
    listCollections(): Promise<
      Array<{
        name: string;
        pwd: string;
        glob_pattern: string;
        doc_count: number;
        active_count: number;
        last_modified: string | null;
        includeByDefault: boolean;
      }>
    >;

    /** Names of the collections whose `includeByDefault` flag is set. */
    getDefaultCollectionNames(): Promise<Array<string>>;

    // ── Context Management ──────────────────────────────────────────────

    /** Attach context text to a path prefix inside a collection. */
    addContext(collectionName: string, pathPrefix: string, contextText: string): Promise<boolean>;

    /** Remove the context attached to a path prefix inside a collection. */
    removeContext(collectionName: string, pathPrefix: string): Promise<boolean>;

    /** Set the global context (applies to all collections). */
    setGlobalContext(context: string | undefined): Promise<void>;

    /** Read the global context. */
    getGlobalContext(): Promise<string | undefined>;

    /** List every context across all collections. */
    listContexts(): Promise<{ collection: string; path: string; context: string }[]>;

    // ── Indexing ────────────────────────────────────────────────────────

    /**
     * Re-index collections by scanning the filesystem.
     * `collections` limits the run to the named collections; names that match
     * no stored collection are ignored.
     */
    update(options?: {
      collections?: string[];
      onProgress?: (info: UpdateProgress) => void;
    }): Promise<UpdateResult>;

    /** Generate vector embeddings for documents that need them. */
    embed(options?: {
      force?: boolean;
      model?: string;
      maxDocsPerBatch?: number;
      maxBatchBytes?: number;
      chunkStrategy?: ChunkStrategy;
      onProgress?: (info: EmbedProgress) => void;
    }): Promise<EmbedResult>;

    // ── Index Health ────────────────────────────────────────────────────

    /** Index status (document counts, collections, embedding state). */
    getStatus(): Promise<IndexStatus>;

    /** Index health info (stale embeddings, etc.). */
    getIndexHealth(): Promise<IndexHealthInfo>;

    // ── Lifecycle ───────────────────────────────────────────────────────

    /** Close the store and release all resources (LLM models, DB connection). */
    close(): Promise<void>;
  }
  /**
   * Create a QMD store for programmatic access to search and indexing.
   *
   * The store runs in one of three modes, selected by `options`:
   * - `configPath` — the YAML config is loaded and synced into the database;
   * - `config` — the inline config object is synced into the database;
   * - neither — the collections already recorded in the database are used.
   *
   * In the first two modes every collection/context mutation is written to the
   * database first and then forwarded to the registered config source.
   *
   * NOTE(review): the config source is registered through `setConfigSource`,
   * which looks like module-level state. Two config-backed stores open at the
   * same time would presumably overwrite each other's source — confirm.
   *
   * @param options - Database path plus an optional YAML path or inline config.
   * @returns A store facade; call `close()` on it to release the LLM models and
   *   the database connection.
   * @throws Error when `dbPath` is empty or when both `configPath` and `config`
   *   are given. Errors raised while loading or syncing the config are rethrown
   *   after the freshly opened database has been closed again.
   *
   * @example
   * ```typescript
   * // With a YAML config file
   * const store = await createStore({
   *   dbPath: './index.sqlite',
   *   configPath: './qmd.yml',
   * })
   *
   * // With inline config (no files needed besides the DB)
   * const store = await createStore({
   *   dbPath: './index.sqlite',
   *   config: {
   *     collections: {
   *       docs: { path: '/path/to/docs', pattern: '**\/*.md' }
   *     }
   *   }
   * })
   *
   * const results = await store.search({ query: "authentication flow" })
   * await store.close()
   * ```
   */
  export async function createStore(options: StoreOptions): Promise<QMDStore> {
    if (!options.dbPath) {
      throw new Error("dbPath is required");
    }
    if (options.configPath && options.config) {
      throw new Error("Provide either configPath or config, not both");
    }

    // True when a YAML file or an inline config backs this store. Mutations are
    // then written through to that source in addition to SQLite.
    const hasConfigSource = Boolean(options.configPath || options.config);

    // Create the internal store (opens DB, creates tables).
    const internal = createStoreInternal(options.dbPath);
    const db = internal.db;

    // Sync config into SQLite store_collections.
    let config: CollectionConfig | undefined;
    try {
      if (options.configPath) {
        // YAML mode: inject config source for write-through, sync to DB.
        setConfigSource({ configPath: options.configPath });
        config = loadConfig();
        syncConfigToDb(db, config);
      } else if (options.config) {
        // Inline config mode: inject config source for mutations, sync to DB.
        setConfigSource({ config: options.config });
        config = options.config;
        syncConfigToDb(db, config);
      }
      // else: DB-only mode — no external config, use existing store_collections.
    } catch (err: unknown) {
      // The caller never receives a store it could close, so undo what has been
      // set up so far: unregister the config source and release the DB handle.
      try {
        if (hasConfigSource) {
          setConfigSource(undefined);
        }
        internal.close();
      } catch {
        // Best effort only: the configuration error below is the one to report.
      }
      throw err;
    }

    // Create a per-store LlamaCpp instance — lazy-loads models on first use,
    // auto-unloads after 5 min inactivity to free VRAM.
    const llm = new LlamaCpp({
      embedModel: config?.models?.embed,
      generateModel: config?.models?.generate,
      rerankModel: config?.models?.rerank,
      inactivityTimeoutMs: 5 * 60 * 1000,
      disposeModelsOnInactivity: true,
    });
    internal.llm = llm;

    // Runs `apply` only when a config source is registered (YAML or inline).
    const writeThrough = (apply: () => void): void => {
      if (hasConfigSource) {
        apply();
      }
    };

    const store: QMDStore = {
      internal,
      dbPath: internal.dbPath,

      // ── Search ────────────────────────────────────────────────────────
      search: async (opts) => {
        const { query, queries } = opts;

        // Merge `collection` and `collections`; a name given through both is
        // kept once so it is not searched twice.
        const collections = [
          ...new Set([
            ...(opts.collection ? [opts.collection] : []),
            ...(opts.collections ?? []),
          ]),
        ];
        const skipRerank = opts.rerank === false;

        if (queries) {
          // Pre-expanded queries — use structuredSearch.
          return structuredSearch(internal, queries, {
            collections: collections.length > 0 ? collections : undefined,
            limit: opts.limit,
            minScore: opts.minScore,
            explain: opts.explain,
            intent: opts.intent,
            skipRerank,
            chunkStrategy: opts.chunkStrategy,
          });
        }

        if (!query) {
          throw new Error("search() requires either 'query' or 'queries'");
        }

        // Simple query string — use hybridQuery (expand + search + rerank).
        // NOTE(review): only the first collection is forwarded here, so any
        // further entries are ignored on this path. Check whether hybridQuery
        // can take a list before changing this.
        return hybridQuery(internal, query, {
          collection: collections[0],
          limit: opts.limit,
          minScore: opts.minScore,
          explain: opts.explain,
          intent: opts.intent,
          skipRerank,
          chunkStrategy: opts.chunkStrategy,
        });
      },

      searchLex: async (q, opts) => internal.searchFTS(q, opts?.limit, opts?.collection),

      // NOTE(review): always passes DEFAULT_EMBED_MODEL, even when the config
      // names a different embed model for the LlamaCpp instance — confirm this
      // is intended.
      searchVector: async (q, opts) =>
        internal.searchVec(q, DEFAULT_EMBED_MODEL, opts?.limit, opts?.collection),

      expandQuery: async (q, opts) => internal.expandQuery(q, undefined, opts?.intent),

      // ── Document Retrieval ────────────────────────────────────────────
      get: async (pathOrDocid, opts) => internal.findDocument(pathOrDocid, opts),

      getDocumentBody: async (pathOrDocid, opts) => {
        // Resolve the document first; the body is read in a second step.
        const found = internal.findDocument(pathOrDocid, { includeBody: false });
        if ("error" in found) {
          return null;
        }
        return internal.getDocumentBody(found, opts?.fromLine, opts?.maxLines);
      },

      multiGet: async (pattern, opts) => internal.findDocuments(pattern, opts),

      // ── Collection Management ─────────────────────────────────────────
      // Write to SQLite, then write through to YAML/inline config if configured.
      addCollection: async (name, opts) => {
        upsertStoreCollection(db, name, {
          path: opts.path,
          pattern: opts.pattern,
          ignore: opts.ignore,
        });
        // NOTE(review): `ignore` is stored in SQLite but not forwarded to the
        // config source — confirm whether the config-side helper accepts it.
        writeThrough(() => collectionsAddCollection(name, opts.path, opts.pattern));
      },

      removeCollection: async (name) => {
        const removed = deleteStoreCollection(db, name);
        writeThrough(() => collectionsRemoveCollection(name));
        return removed;
      },

      renameCollection: async (oldName, newName) => {
        const renamed = renameStoreCollection(db, oldName, newName);
        writeThrough(() => collectionsRenameCollection(oldName, newName));
        return renamed;
      },

      listCollections: async () => storeListCollections(db),

      getDefaultCollectionNames: async () =>
        storeListCollections(db)
          .filter((c) => c.includeByDefault)
          .map((c) => c.name),

      // ── Context Management ────────────────────────────────────────────
      // Write to SQLite, then write through to YAML/inline config if configured.
      addContext: async (collectionName, pathPrefix, contextText) => {
        const added = updateStoreContext(db, collectionName, pathPrefix, contextText);
        writeThrough(() => collectionsAddContext(collectionName, pathPrefix, contextText));
        return added;
      },

      removeContext: async (collectionName, pathPrefix) => {
        const removed = removeStoreContext(db, collectionName, pathPrefix);
        writeThrough(() => collectionsRemoveContext(collectionName, pathPrefix));
        return removed;
      },

      setGlobalContext: async (context) => {
        setStoreGlobalContext(db, context);
        writeThrough(() => collectionsSetGlobalContext(context));
      },

      getGlobalContext: async () => getStoreGlobalContext(db),

      listContexts: async () => getStoreContexts(db),

      // ── Indexing ──────────────────────────────────────────────────────
      // Collections are read from SQLite, not from the config source.
      update: async (updateOpts) => {
        const wanted = updateOpts?.collections;
        const onProgress = updateOpts?.onProgress;

        const all = getStoreCollections(db);
        const filtered = wanted ? all.filter((c) => wanted.includes(c.name)) : all;

        internal.clearCache();

        let totalIndexed = 0;
        let totalUpdated = 0;
        let totalUnchanged = 0;
        let totalRemoved = 0;

        // Collections are re-indexed one after another; each is awaited before
        // the next one starts.
        for (const col of filtered) {
          const result = await reindexCollection(internal, col.path, col.pattern || "**/*.md", col.name, {
            ignorePatterns: col.ignore,
            // Tag each progress event with the collection it belongs to.
            onProgress: onProgress
              ? (info) => onProgress({ collection: col.name, ...info })
              : undefined,
          });
          totalIndexed += result.indexed;
          totalUpdated += result.updated;
          totalUnchanged += result.unchanged;
          totalRemoved += result.removed;
        }

        return {
          collections: filtered.length,
          indexed: totalIndexed,
          updated: totalUpdated,
          unchanged: totalUnchanged,
          removed: totalRemoved,
          needsEmbedding: internal.getHashesNeedingEmbedding(),
        };
      },

      embed: async (embedOpts) =>
        generateEmbeddings(internal, {
          force: embedOpts?.force,
          model: embedOpts?.model,
          maxDocsPerBatch: embedOpts?.maxDocsPerBatch,
          maxBatchBytes: embedOpts?.maxBatchBytes,
          chunkStrategy: embedOpts?.chunkStrategy,
          onProgress: embedOpts?.onProgress,
        }),

      // ── Index Health ──────────────────────────────────────────────────
      getStatus: async () => internal.getStatus(),
      getIndexHealth: async () => internal.getIndexHealth(),

      // ── Lifecycle ─────────────────────────────────────────────────────
      close: async () => {
        // Each release step runs even if an earlier one fails, so a failing
        // model dispose can no longer leave the DB open or the config source
        // registered. The first error still propagates to the caller.
        try {
          await llm.dispose();
        } finally {
          try {
            internal.close();
          } finally {
            if (hasConfigSource) {
              setConfigSource(undefined); // Reset config source
            }
          }
        }
      },
    };

    return store;
  }