store.d.ts 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989
  1. /**
  2. * QMD Store - Core data access and retrieval functions
  3. *
  4. * This module provides all database operations, search functions, and document
  5. * retrieval for QMD. It returns raw data structures that can be formatted by
  6. * CLI or MCP consumers.
  7. *
  8. * Usage:
  9. * const store = createStore("/path/to/db.sqlite");
  10. * // or use default path:
  11. * const store = createStore();
  12. */
  13. import type { Database } from "./db.js";
  14. import { LlamaCpp, formatQueryForEmbedding, formatDocForEmbedding, type ILLMSession } from "./llm.js";
  15. import type { NamedCollection, Collection, CollectionConfig } from "./collections.js";
  16. import { type EmbeddingProvider } from "./embedding/provider.js";
  // Module-level defaults and chunk-sizing constants.
  // Declarations written with a literal initializer expose that exact value in
  // their type; the ones typed as plain `number` are computed in the
  // implementation, so their values cannot be read from this declaration file.
  // NOTE(review): the *_CHARS constants are presumably derived from the matching
  // *_TOKENS constants via a chars-per-token ratio — confirm in store.ts.
  17. export declare const DEFAULT_EMBED_MODEL = "embeddinggemma";
  18. export declare const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
  19. export declare const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
  20. export declare const DEFAULT_GLOB = "**/*.md";
  21. export declare const DEFAULT_MULTI_GET_MAX_BYTES: number;
  22. export declare const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
  23. export declare const DEFAULT_EMBED_MAX_BATCH_BYTES: number;
  24. export declare const CHUNK_SIZE_TOKENS = 900;
  25. export declare const CHUNK_OVERLAP_TOKENS: number;
  26. export declare const CHUNK_SIZE_CHARS: number;
  27. export declare const CHUNK_OVERLAP_CHARS: number;
  28. export declare const CHUNK_WINDOW_TOKENS = 200;
  29. export declare const CHUNK_WINDOW_CHARS: number;
  30. /**
  31. * A potential break point in the document with a base score indicating quality.
  32. */
  33. export interface BreakPoint {
  /** Location of the break in the scanned text. NOTE(review): presumably a character offset — confirm. */
  34. pos: number;
  /** Base quality score; the BREAK_PATTERNS notes state that higher scores mark better places to split. */
  35. score: number;
  /** Label for the kind of break. NOTE(review): presumably the string element of the matching BREAK_PATTERNS tuple — confirm. */
  36. type: string;
  37. }
  38. /**
  39. * A region where a code fence exists (between ``` markers).
  40. * We should never split inside a code fence.
  * NOTE(review): `start` and `end` are presumably character offsets into the
  * scanned text; whether `end` is inclusive is not visible from this declaration.
  41. */
  42. export interface CodeFenceRegion {
  43. start: number;
  44. end: number;
  45. }
  46. /**
  47. * Patterns for detecting break points in markdown documents.
  48. * Higher scores indicate better places to split.
  49. * Scores are spread wide so headings decisively beat lower-quality breaks.
  50. * Order matters for scoring - more specific patterns first.
  *
  * Each entry is a `[RegExp, number, string]` tuple.
  * NOTE(review): presumably [pattern, score, type label], lining up with
  * BreakPoint's `score` and `type` fields — confirm in store.ts.
  51. */
  52. export declare const BREAK_PATTERNS: [RegExp, number, string][];
  53. /**
  54. * Scan text for all potential break points.
  55. * Returns sorted array of break points with higher-scoring patterns taking precedence
  56. * when multiple patterns match the same position.
  *
  * @param text - The text to scan
  * @returns The break points found in `text`
  57. */
  58. export declare function scanBreakPoints(text: string): BreakPoint[];
  59. /**
  60. * Find all code fence regions in the text.
  61. * Code fences are delimited by ``` and we should never split inside them.
  *
  * @param text - The text to scan for fences
  * @returns One CodeFenceRegion per fenced region found
  62. */
  63. export declare function findCodeFences(text: string): CodeFenceRegion[];
  64. /**
  65. * Check if a position is inside a code fence region.
  *
  * @param pos - Position to test
  * @param fences - Regions to test against (e.g. the result of findCodeFences())
  * @returns true when `pos` lies inside one of `fences`. NOTE(review): treatment
  * of a position exactly on a region boundary is not visible here — confirm.
  66. */
  67. export declare function isInsideCodeFence(pos: number, fences: CodeFenceRegion[]): boolean;
  68. /**
  69. * Find the best cut position using scored break points with distance decay.
  70. *
  71. * Uses squared distance for gentler early decay - headings far back still win
  72. * over low-quality breaks near the target.
  73. *
  * `windowChars`, `decayFactor` and `codeFences` are optional; their default
  * values are set in the implementation and are not part of this declaration.
  *
  74. * @param breakPoints - Pre-scanned break points from scanBreakPoints()
  75. * @param targetCharPos - The ideal cut position (e.g., maxChars boundary)
  76. * @param windowChars - How far back to search for break points (default ~200 tokens)
  77. * @param decayFactor - How much to penalize distance (0.7 = 30% score at window edge)
  78. * @param codeFences - Code fence regions to avoid splitting inside
  79. * @returns The best position to cut at
  80. */
  81. export declare function findBestCutoff(breakPoints: BreakPoint[], targetCharPos: number, windowChars?: number, decayFactor?: number, codeFences?: CodeFenceRegion[]): number;
  /**
  * Chunking strategy selector accepted by chunkDocumentAsync(),
  * chunkDocumentByTokens() and EmbedOptions.chunkStrategy.
  * The behavior of each value is described in the notes on chunkDocumentAsync().
  */
  82. export type ChunkStrategy = "auto" | "regex" | "function";
  83. /**
  84. * Merge two sets of break points (e.g. regex + AST), keeping the highest
  85. * score at each position. Result is sorted by position.
  *
  * @param a - First set of break points
  * @param b - Second set of break points
  * @returns The merged break points
  86. */
  87. export declare function mergeBreakPoints(a: BreakPoint[], b: BreakPoint[]): BreakPoint[];
  88. /**
  89. * Core chunk algorithm that operates on precomputed break points and code fences.
  90. * This is the shared implementation used by both regex-only and AST-aware chunking.
  *
  * @param content - The document text to chunk
  * @param breakPoints - Precomputed break points for `content`
  * @param codeFences - Precomputed code fence regions for `content`
  * @param maxChars - Optional chunk size limit in characters (default set in the implementation)
  * @param overlapChars - Optional overlap between chunks in characters (default set in the implementation)
  * @param windowChars - Optional search window in characters (default set in the implementation)
  * @returns The chunks as `{ text, pos }` objects. NOTE(review): `pos` is
  * presumably the offset of the chunk's start within `content` — confirm.
  91. */
  92. export declare function chunkDocumentWithBreakPoints(content: string, breakPoints: BreakPoint[], codeFences: CodeFenceRegion[], maxChars?: number, overlapChars?: number, windowChars?: number): {
  93. text: string;
  94. pos: number;
  95. }[];
  // Search-tuning constants; the literal values are exposed in the types.
  // NOTE(review): how these thresholds are applied is not visible from this
  // declaration file — see the search/rerank code in store.ts.
  96. export declare const STRONG_SIGNAL_MIN_SCORE = 0.85;
  97. export declare const STRONG_SIGNAL_MIN_GAP = 0.15;
  98. export declare const RERANK_CANDIDATE_LIMIT = 40;
  99. /**
  100. * A typed query expansion result. Decoupled from llm.ts internal Queryable —
  101. * same shape, but store.ts owns its own public API type.
  102. *
  103. * - lex: keyword variant → routes to FTS only
  104. * - vec: semantic variant → routes to vector only
  105. * - hyde: hypothetical document → routes to vector only
  106. */
  107. export type ExpandedQuery = {
  /** Variant kind; selects the routing listed above. */
  108. type: 'lex' | 'vec' | 'hyde';
  /** The query text for this variant. */
  109. query: string;
  110. /** Optional line number for error reporting (CLI parser) */
  111. line?: number;
  112. };
  /**
  * Returns a home directory path as a string.
  * NOTE(review): how the path is resolved (OS lookup vs. an override) is not
  * visible from this declaration — confirm in store.ts.
  */
  113. export declare function homedir(): string;
  114. /**
  115. * Check if a path is absolute.
  116. * Supports:
  117. * - Unix paths: /path/to/file
  118. * - Windows native: C:\path or C:/path
  119. * - Git Bash: /c/path or /C/path (C-Z drives, excluding A/B floppy drives)
  120. *
  121. * Note: /c without trailing slash is treated as Unix path (directory named "c"),
  122. * while /c/ or /c/path are treated as Git Bash paths (C: drive).
  *
  * @param path - The path string to classify
  * @returns true when `path` matches one of the absolute forms listed above
  123. */
  124. export declare function isAbsolutePath(path: string): boolean;
  125. /**
  126. * Normalize path separators to forward slashes.
  127. * Converts Windows backslashes to forward slashes.
  *
  * @param path - The path to normalize
  * @returns The path with forward-slash separators
  128. */
  129. export declare function normalizePathSeparators(path: string): string;
  130. /**
  131. * Get the relative path from a prefix.
  132. * Returns null if path is not under prefix.
  133. * Returns empty string if path equals prefix.
  *
  * @param path - The full path
  * @param prefix - The prefix to strip from `path`
  * @returns The remainder of `path` under `prefix`, `""` when they are equal, or null
  134. */
  135. export declare function getRelativePathFromPrefix(path: string, prefix: string): string | null;
  // Path and environment helpers.
  // NOTE(review): the comments below are inferred from names and signatures only;
  // confirm each against the implementation in store.ts.
  /** Combines the given path segments into a single path string. NOTE(review): presumably resolves to an absolute path — confirm. */
  136. export declare function resolve(...paths: string[]): string;
  /** Turns on "production mode" for this module (see the reset helper below). NOTE(review): what the flag gates is not visible here. */
  137. export declare function enableProductionMode(): void;
  138. /** Reset production mode flag — only for testing. */
  139. export declare function _resetProductionModeForTesting(): void;
  /**
  * Returns a database file path; `indexName` is optional.
  * NOTE(review): presumably the default location mentioned in the createStore()
  * notes, with `indexName` selecting the index file — confirm.
  */
  140. export declare function getDefaultDbPath(indexName?: string): string;
  /** Returns a directory path string. NOTE(review): presumably the current working directory — confirm. */
  141. export declare function getPwd(): string;
  /** Returns a path string for `path`. NOTE(review): presumably with symlinks resolved — confirm. */
  142. export declare function getRealPath(path: string): string;
  /**
  * Components of a virtual path of the form "qmd://collection-name/path/to/file.md",
  * as produced by parseVirtualPath().
  */
  143. export type VirtualPath = {
  /** The collection-name component. */
  144. collectionName: string;
  /** The path component that follows the collection name. */
  145. path: string;
  146. };
  147. /**
  148. * Normalize explicit virtual path formats to standard qmd:// format.
  149. * Only handles paths that are already explicitly virtual:
  150. * - qmd://collection/path.md (already normalized)
  151. * - qmd:////collection/path.md (extra slashes - normalize)
  152. * - //collection/path.md (missing qmd: prefix - add it)
  153. *
  154. * Does NOT handle:
  155. * - collection/path.md (bare paths - could be filesystem relative)
  156. * - :linenum suffix (should be parsed separately before calling this)
  *
  * @param input - The path string to normalize
  * @returns The normalized string. NOTE(review): the result for inputs outside
  * the handled forms is not visible from this declaration — confirm.
  157. */
  158. export declare function normalizeVirtualPath(input: string): string;
  159. /**
  160. * Parse a virtual path like "qmd://collection-name/path/to/file.md"
  161. * into its components.
  162. * Also supports collection root: "qmd://collection-name/" or "qmd://collection-name"
  *
  * @param virtualPath - The virtual path string to parse
  * @returns The parsed components, or null. NOTE(review): null is presumably
  * returned when the input is not a virtual path — confirm.
  163. */
  164. export declare function parseVirtualPath(virtualPath: string): VirtualPath | null;
  165. /**
  166. * Build a virtual path from collection name and relative path.
  *
  * @param collectionName - Name of the collection
  * @param path - Path relative to the collection
  * @returns The virtual path string
  167. */
  168. export declare function buildVirtualPath(collectionName: string, path: string): string;
  169. /**
  170. * Check if a path is explicitly a virtual path.
  171. * Only recognizes explicit virtual path formats:
  172. * - qmd://collection/path.md
  173. * - //collection/path.md
  174. *
  175. * Does NOT consider bare collection/path.md as virtual - that should be
  176. * handled separately by checking if the first component is a collection name.
  *
  * @param path - The path string to test
  * @returns true when `path` uses one of the explicit formats listed above
  177. */
  178. export declare function isVirtualPath(path: string): boolean;
  179. /**
  180. * Resolve a virtual path to absolute filesystem path.
  *
  * @param db - Database handle used for the lookup
  * @param virtualPath - The virtual path to resolve
  * @returns The filesystem path, or null. NOTE(review): null is presumably
  * returned for an unparsable path or unknown collection — confirm.
  181. */
  182. export declare function resolveVirtualPath(db: Database, virtualPath: string): string | null;
  183. /**
  184. * Convert an absolute filesystem path to a virtual path.
  185. * Returns null if the file is not in any indexed collection.
  *
  * @param db - Database handle used for the lookup
  * @param absolutePath - The filesystem path to convert
  186. */
  187. export declare function toVirtualPath(db: Database, absolutePath: string): string | null;
  /**
  * Checks the given database handle for the sqlite-vec extension; returns nothing.
  * NOTE(review): presumably throws when the extension is not loaded — confirm.
  */
  188. export declare function verifySqliteVecLoaded(db: Database): void;
  189. /**
  190. * Apply concurrency pragmas with env-var override support. Exported for
  191. * unit tests; consumers should rely on `initializeDatabase` instead.
  *
  * @param db - Database handle to configure
  192. */
  193. export declare function applyConcurrencyPragmas(db: Database): void;
  // Accessors and mutators for collection and context configuration held in the
  // database. NOTE(review): presumably backed by the store_collections table
  // named in the syncConfigToDb() notes — confirm. The meaning of the boolean
  // results below is not visible from this declaration file.
  /** Returns every stored collection as a NamedCollection. */
  194. export declare function getStoreCollections(db: Database): NamedCollection[];
  /** Returns the stored collection called `name`, or null. */
  195. export declare function getStoreCollection(db: Database, name: string): NamedCollection | null;
  /** Returns the stored global context string, or undefined. */
  196. export declare function getStoreGlobalContext(db: Database): string | undefined;
  /** Returns stored contexts as `{ collection, path, context }` records. */
  197. export declare function getStoreContexts(db: Database): Array<{
  198. collection: string;
  199. path: string;
  200. context: string;
  201. }>;
  /** Writes `collection` under `name`; `pattern` may be omitted by the caller (see the parameter type). */
  202. export declare function upsertStoreCollection(db: Database, name: string, collection: Omit<Collection, 'pattern'> & {
  203. pattern?: string;
  204. }): void;
  /** NOTE(review): the boolean presumably reports whether a collection was removed — confirm. */
  205. export declare function deleteStoreCollection(db: Database, name: string): boolean;
  /** NOTE(review): the boolean presumably reports whether the rename happened — confirm. */
  206. export declare function renameStoreCollection(db: Database, oldName: string, newName: string): boolean;
  /** NOTE(review): presumably sets the context text for `path` in the collection; boolean meaning to be confirmed. */
  207. export declare function updateStoreContext(db: Database, collectionName: string, path: string, text: string): boolean;
  /** NOTE(review): presumably removes the context for `path` in the collection; boolean meaning to be confirmed. */
  208. export declare function removeStoreContext(db: Database, collectionName: string, path: string): boolean;
  /** Accepts a string or undefined. NOTE(review): undefined presumably clears the stored global context — confirm. */
  209. export declare function setStoreGlobalContext(db: Database, value: string | undefined): void;
  210. /**
  211. * Sync external config (YAML/inline) into SQLite store_collections.
  212. * External config always wins. Skips sync if config hash hasn't changed.
  *
  * @param db - Database handle to write into
  * @param config - The external collection configuration
  213. */
  214. export declare function syncConfigToDb(db: Database, config: CollectionConfig): void;
  /** Reports sqlite-vec availability as a boolean. NOTE(review): when and how availability is determined is not visible here. */
  215. export declare function isSqliteVecAvailable(): boolean;
  /**
  * The object returned by createStore(): a database handle, the database path,
  * and the store's operations as methods.
  *
  * Many members have a same-named standalone function in this module whose
  * signature differs only by a leading `db: Database` parameter; the
  * createStore() notes describe the methods as "bound to the database".
  */
  216. export type Store = {
  // --- Connection ---
  217. db: Database;
  218. dbPath: string;
  219. /** Optional LlamaCpp instance for this store (overrides the global singleton) */
  220. llm?: LlamaCpp;
  221. close: () => void;
  // --- Index state ---
  222. ensureVecTable: (dimensions: number) => void;
  223. getHashesNeedingEmbedding: () => number;
  224. getIndexHealth: () => IndexHealthInfo;
  225. getStatus: () => IndexStatus;
  // --- Result cache ---
  226. getCacheKey: typeof getCacheKey;
  227. getCachedResult: (cacheKey: string) => string | null;
  228. setCachedResult: (cacheKey: string, result: string) => void;
  229. clearCache: () => void;
  // --- Cleanup; the number-returning members mirror standalone functions
  // --- documented as returning the count of deleted rows ---
  230. deleteLLMCache: () => number;
  231. deleteInactiveDocuments: () => number;
  232. cleanupOrphanedContent: () => number;
  233. cleanupOrphanedVectors: () => number;
  234. vacuumDatabase: () => void;
  // --- Context and collection lookup ---
  235. getContextForFile: (filepath: string) => string | null;
  236. getContextForPath: (collectionName: string, path: string) => string | null;
  237. getCollectionByName: (name: string) => {
  238. name: string;
  239. pwd: string;
  240. glob_pattern: string;
  241. } | null;
  242. getCollectionsWithoutContext: () => {
  243. name: string;
  244. pwd: string;
  245. doc_count: number;
  246. }[];
  247. getTopLevelPathsWithoutContext: (collectionName: string) => string[];
  // --- Virtual paths ---
  248. parseVirtualPath: typeof parseVirtualPath;
  249. buildVirtualPath: typeof buildVirtualPath;
  250. isVirtualPath: typeof isVirtualPath;
  251. resolveVirtualPath: (virtualPath: string) => string | null;
  252. toVirtualPath: (absolutePath: string) => string | null;
  // --- Search; searchFTS is synchronous, the other three return Promises ---
  253. searchFTS: (query: string, limit?: number, collectionName?: string) => SearchResult[];
  254. searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider) => Promise<SearchResult[]>;
  255. expandQuery: (query: string, model?: string, intent?: string) => Promise<ExpandedQuery[]>;
  256. rerank: (query: string, documents: {
  257. file: string;
  258. text: string;
  259. }[], model?: string, intent?: string) => Promise<{
  260. file: string;
  261. score: number;
  262. }[]>;
  // --- Document retrieval ---
  // findDocument returns either the document or a DocumentNotFound record
  // (distinguishable by its `error: "not_found"` field).
  263. findDocument: (filename: string, options?: {
  264. includeBody?: boolean;
  265. }) => DocumentResult | DocumentNotFound;
  266. getDocumentBody: (doc: DocumentResult | {
  267. filepath: string;
  268. }, fromLine?: number, maxLines?: number) => string | null;
  269. findDocuments: (pattern: string, options?: {
  270. includeBody?: boolean;
  271. maxBytes?: number;
  272. }) => {
  273. docs: MultiGetResult[];
  274. errors: string[];
  275. };
  276. findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => string[];
  277. matchFilesByGlob: (pattern: string) => {
  278. filepath: string;
  279. displayPath: string;
  280. bodyLength: number;
  281. }[];
  282. findDocumentByDocid: (docid: string) => {
  283. filepath: string;
  284. hash: string;
  285. } | null;
  // --- Indexing writes ---
  286. insertContent: (hash: string, content: string, createdAt: string) => void;
  287. insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => void;
  288. findActiveDocument: (collectionName: string, path: string) => {
  289. id: number;
  290. hash: string;
  291. title: string;
  292. } | null;
  293. updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => void;
  294. updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => void;
  295. deactivateDocument: (collectionName: string, path: string) => void;
  296. getActiveDocumentPaths: (collectionName: string) => string[];
  // --- Embeddings ---
  297. getHashesForEmbedding: () => {
  298. hash: string;
  299. body: string;
  300. path: string;
  301. }[];
  302. clearAllEmbeddings: () => void;
  303. insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => void;
  304. };
  /** Payload passed to the `onProgress` callback of reindexCollection(). */
  305. export type ReindexProgress = {
  /** NOTE(review): presumably the file currently being processed — confirm. */
  306. file: string;
  /** NOTE(review): presumably the position of `file` within the run — confirm whether it is 0- or 1-based. */
  307. current: number;
  /** NOTE(review): presumably the number of files in the run — confirm. */
  308. total: number;
  309. };
  /**
  * Value resolved by reindexCollection().
  * NOTE(review): each field is presumably a count of documents in that outcome
  * category for the run — confirm the exact definitions in store.ts.
  */
  310. export type ReindexResult = {
  311. indexed: number;
  312. updated: number;
  313. unchanged: number;
  314. removed: number;
  315. orphanedCleaned: number;
  316. };
  317. /**
  318. * Re-index a single collection by scanning the filesystem and updating the database.
  319. * Pure function — no console output, no db lifecycle management.
  *
  * @param store - The store to update
  * @param collectionPath - Filesystem path of the collection to scan
  * @param globPattern - Glob selecting the files to index
  * @param collectionName - Name of the collection being indexed
  * @param options - Optional `ignorePatterns` (NOTE(review): presumably globs to
  * exclude — confirm) and an `onProgress` callback receiving ReindexProgress
  * @returns A promise of the run's ReindexResult
  320. */
  321. export declare function reindexCollection(store: Store, collectionPath: string, globPattern: string, collectionName: string, options?: {
  322. ignorePatterns?: string[];
  323. onProgress?: (info: ReindexProgress) => void;
  324. }): Promise<ReindexResult>;
  /**
  * Payload passed to EmbedOptions.onProgress.
  * NOTE(review): the fields are presumably running totals for the current
  * generateEmbeddings() call — confirm.
  */
  325. export type EmbedProgress = {
  326. chunksEmbedded: number;
  327. totalChunks: number;
  328. bytesProcessed: number;
  329. totalBytes: number;
  330. errors: number;
  331. };
  /** Value resolved by generateEmbeddings(). */
  332. export type EmbedResult = {
  333. docsProcessed: number;
  334. chunksEmbedded: number;
  335. errors: number;
  /** NOTE(review): presumably wall-clock duration of the run in milliseconds — confirm. */
  336. durationMs: number;
  337. };
  /** Options accepted by generateEmbeddings(); every field is optional. */
  338. export type EmbedOptions = {
  /** When set, a provider/model mismatch no longer throws (see `embedProvider`). NOTE(review): any further effect is not visible here. */
  339. force?: boolean;
  340. model?: string;
  /** NOTE(review): defaults are presumably DEFAULT_EMBED_MAX_DOCS_PER_BATCH and DEFAULT_EMBED_MAX_BATCH_BYTES — confirm. */
  341. maxDocsPerBatch?: number;
  342. maxBatchBytes?: number;
  343. chunkStrategy?: ChunkStrategy;
  /** Progress callback; receives an EmbedProgress value. */
  344. onProgress?: (info: EmbedProgress) => void;
  345. /**
  346. * Optional embedding provider. When supplied, embeddings are routed through
  347. * this provider (HTTP, GPU worker, etc.) instead of the local llama.cpp
  348. * session path. The provider's `getModelId()` is verified against existing
  349. * `content_vectors.model` rows; mismatch throws unless `force` is set.
  350. *
  351. * When omitted, behavior is identical to pre-patch: embeddings come from
  352. * the store's `LlamaCpp` (or the global singleton).
  353. */
  354. embedProvider?: EmbeddingProvider;
  355. };
  356. /**
  357. * Generate vector embeddings for documents that need them.
  358. * Pure function — no console output, no db lifecycle management.
  359. * Uses the store's LlamaCpp instance if set, otherwise the global singleton.
  *
  * @param store - The store whose documents are embedded
  * @param options - Optional settings; see EmbedOptions
  * @returns A promise of the run's EmbedResult
  360. */
  361. export declare function generateEmbeddings(store: Store, options?: EmbedOptions): Promise<EmbedResult>;
  362. /**
  363. * Create a new store instance with the given database path.
  364. * If no path is provided, uses the default path (~/.cache/qmd/index.sqlite).
  365. *
  366. * @param dbPath - Path to the SQLite database file (optional; see the default above)
  367. * @returns Store instance with all methods bound to the database
  368. */
  369. export declare function createStore(dbPath?: string): Store;
  370. /**
  371. * Unified document result type with all metadata.
  372. * Body is optional - use getDocumentBody() to load it separately if needed.
  373. */
  374. export type DocumentResult = {
  /** NOTE(review): presumably the document's virtual path — confirm. */
  375. filepath: string;
  376. displayPath: string;
  377. title: string;
  /** Context text for the document, or null. */
  378. context: string | null;
  379. hash: string;
  /** Short id; the getDocid() notes describe it as the first 6 characters of the hash. */
  380. docid: string;
  381. collectionName: string;
  382. modifiedAt: string;
  /** NOTE(review): unit (characters vs. bytes) is not visible here — confirm. */
  383. bodyLength: number;
  /** Present only when the body was loaded. */
  384. body?: string;
  385. };
  386. /**
  387. * Extract short docid from a full hash (first 6 characters).
  *
  * @param hash - The full content hash
  * @returns The short docid
  388. */
  389. export declare function getDocid(hash: string): string;
  /**
  * Maps a path string to another string.
  * NOTE(review): the transformation is not visible from this declaration;
  * presumably it produces a normalized "handle" form of the path — confirm.
  */
  390. export declare function handelize(path: string): string;
  391. /**
  392. * Search result extends DocumentResult with score and source info
  393. */
  394. export type SearchResult = DocumentResult & {
  395. score: number;
  /** Which search backend produced the hit: "fts" or "vec". */
  396. source: "fts" | "vec";
  /** Optional. NOTE(review): presumably the position of the matching chunk within the document — confirm. */
  397. chunkPos?: number;
  398. };
  399. /**
  400. * Ranked result for RRF fusion (simplified, used internally)
  *
  * Unlike DocumentResult, `body` is required here and the path field is
  * named `file`.
  401. */
  402. export type RankedResult = {
  403. file: string;
  404. displayPath: string;
  405. title: string;
  406. body: string;
  407. score: number;
  408. };
  /**
  * One ranked list's contribution to a fused RRF score.
  * NOTE(review): field meanings below are inferred from names and types only —
  * confirm against the fusion code in store.ts.
  */
  409. export type RRFContributionTrace = {
  /** NOTE(review): presumably the index of the source list among the fused lists. */
  410. listIndex: number;
  /** Backend of the source list: "fts" or "vec". */
  411. source: "fts" | "vec";
  /** "original", or one of the ExpandedQuery variant kinds. */
  412. queryType: "original" | "lex" | "vec" | "hyde";
  413. query: string;
  414. rank: number;
  415. weight: number;
  416. backendScore: number;
  417. rrfContribution: number;
  418. };
  /** Score breakdown for one fused result: the per-list contributions plus score components. */
  419. export type RRFScoreTrace = {
  420. contributions: RRFContributionTrace[];
  421. baseScore: number;
  422. topRank: number;
  423. topRankBonus: number;
  424. totalScore: number;
  425. };
  /**
  * Explanation record for one hybrid-query result: raw backend scores, the RRF
  * breakdown, and the rerank and blended scores.
  * NOTE(review): how `blendedScore` combines the other values is not visible here.
  */
  426. export type HybridQueryExplain = {
  427. ftsScores: number[];
  428. vectorScores: number[];
  429. rrf: {
  430. rank: number;
  431. positionScore: number;
  432. weight: number;
  433. baseScore: number;
  434. topRankBonus: number;
  435. totalScore: number;
  436. contributions: RRFContributionTrace[];
  437. };
  438. rerankScore: number;
  439. blendedScore: number;
  440. };
  441. /**
  442. * Error result when document is not found
  443. */
  444. export type DocumentNotFound = {
  /** Literal tag; a DocumentResult has no `error` field, so this distinguishes the two. */
  445. error: "not_found";
  446. query: string;
  /** NOTE(review): presumably suggestions from findSimilarFiles() — confirm. */
  447. similarFiles: string[];
  448. };
  449. /**
  450. * Result from multi-get operations
  *
  * A union discriminated by `skipped`: when false, `doc` is a full
  * DocumentResult; when true, `doc` carries only `filepath` and `displayPath`
  * and `skipReason` gives the reason as a string.
  451. */
  452. export type MultiGetResult = {
  453. doc: DocumentResult;
  454. skipped: false;
  455. } | {
  456. doc: Pick<DocumentResult, "filepath" | "displayPath">;
  457. skipped: true;
  458. skipReason: string;
  459. };
  /** Per-collection summary used in IndexStatus.collections. */
  460. export type CollectionInfo = {
  461. name: string;
  462. path: string | null;
  463. pattern: string | null;
  /** NOTE(review): presumably the collection's document count — confirm whether inactive documents are included. */
  464. documents: number;
  465. lastUpdated: string;
  466. };
  /** Index-wide summary; this is the return type of Store.getStatus(). */
  467. export type IndexStatus = {
  468. totalDocuments: number;
  469. needsEmbedding: number;
  470. hasVectorIndex: boolean;
  471. collections: CollectionInfo[];
  472. };
  /** Returns a count. NOTE(review): presumably the number of content hashes that still lack embeddings — confirm. */
  473. export declare function getHashesNeedingEmbedding(db: Database): number;
  /** Health summary returned by getIndexHealth(). */
  474. export type IndexHealthInfo = {
  475. needsEmbedding: number;
  476. totalDocs: number;
  /** A number of days, or null. NOTE(review): what it is measured from, and when it is null, are not visible here. */
  477. daysStale: number | null;
  478. };
  /** Returns the IndexHealthInfo for the given database. */
  479. export declare function getIndexHealth(db: Database): IndexHealthInfo;
  // Result-cache helpers keyed by a string cache key.
  /** Builds a cache key string from a URL and a body object. NOTE(review): presumably a hash of both — confirm. */
  480. export declare function getCacheKey(url: string, body: object): string;
  /** Returns the cached string for `cacheKey`, or null. */
  481. export declare function getCachedResult(db: Database, cacheKey: string): string | null;
  /** Stores `result` under `cacheKey`. */
  482. export declare function setCachedResult(db: Database, cacheKey: string, result: string): void;
  /** Clears the cache. NOTE(review): scope relative to deleteLLMCache() is not visible here — confirm. */
  483. export declare function clearCache(db: Database): void;
  484. /**
  485. * Delete cached LLM API responses.
  486. * Returns the number of cached responses deleted.
  *
  * @param db - Database handle to operate on
  487. */
  488. export declare function deleteLLMCache(db: Database): number;
  489. /**
  490. * Remove inactive document records (active = 0).
  491. * Returns the number of inactive documents deleted.
  *
  * @param db - Database handle to operate on
  492. */
  493. export declare function deleteInactiveDocuments(db: Database): number;
  494. /**
  495. * Remove orphaned content hashes that are not referenced by any active document.
  496. * Returns the number of orphaned content hashes deleted.
  *
  * @param db - Database handle to operate on
  497. */
  498. export declare function cleanupOrphanedContent(db: Database): number;
  499. /**
  500. * Remove orphaned vector embeddings that are not referenced by any active document.
  501. * Returns the number of orphaned embedding chunks deleted.
  *
  * @param db - Database handle to operate on
  502. */
  503. export declare function cleanupOrphanedVectors(db: Database): number;
  504. /**
  505. * Run VACUUM to reclaim unused space in the database.
  506. * This operation rebuilds the database file to eliminate fragmentation.
  *
  * @param db - Database handle to operate on
  507. */
  508. export declare function vacuumDatabase(db: Database): void;
  /** Asynchronously computes a hash string for `content`. NOTE(review): algorithm and encoding are not visible here — confirm. */
  509. export declare function hashContent(content: string): Promise<string>;
  /**
  * Returns a title string for a document given its content and filename.
  * NOTE(review): presumably prefers a title found in `content` and falls back
  * to `filename` — confirm.
  */
  510. export declare function extractTitle(content: string, filename: string): string;
  511. /**
  512. * Insert content into the content table (content-addressable storage).
  513. * Uses INSERT OR IGNORE so duplicate hashes are skipped.
  *
  * @param db - Database handle to write to
  * @param hash - Hash identifying the content
  * @param content - The content to store
  * @param createdAt - Creation timestamp string. NOTE(review): format not visible here.
  514. */
  515. export declare function insertContent(db: Database, hash: string, content: string, createdAt: string): void;
  516. /**
  517. * Insert a new document into the documents table.
  *
  * @param db - Database handle to write to
  * @param collectionName - Collection the document belongs to
  * @param path - Document path. NOTE(review): presumably relative to the collection — confirm.
  * @param title - Document title
  * @param hash - Hash of the document's content
  * @param createdAt - Creation timestamp string
  * @param modifiedAt - Modification timestamp string
  518. */
  519. export declare function insertDocument(db: Database, collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string): void;
  520. /**
  521. * Find an active document by collection name and path.
  *
  * @returns `{ id, hash, title }` for the document, or null
  522. */
  523. export declare function findActiveDocument(db: Database, collectionName: string, path: string): {
  524. id: number;
  525. hash: string;
  526. title: string;
  527. } | null;
  528. /**
  529. * Update the title and modified_at timestamp for a document.
  *
  * @param documentId - Numeric document id (the `id` returned by findActiveDocument())
  530. */
  531. export declare function updateDocumentTitle(db: Database, documentId: number, title: string, modifiedAt: string): void;
  532. /**
  533. * Update an existing document's hash, title, and modified_at timestamp.
  534. * Used when content changes but the file path stays the same.
  *
  * @param documentId - Numeric document id (the `id` returned by findActiveDocument())
  535. */
  536. export declare function updateDocument(db: Database, documentId: number, title: string, hash: string, modifiedAt: string): void;
  537. /**
  538. * Deactivate a document (mark as inactive but don't delete).
  *
  * @param collectionName - Collection the document belongs to
  * @param path - Path of the document within the collection
  539. */
  540. export declare function deactivateDocument(db: Database, collectionName: string, path: string): void;
  541. /**
  542. * Get all active document paths for a collection.
  *
  * @param collectionName - Collection to list
  * @returns The paths of the collection's active documents
  543. */
  544. export declare function getActiveDocumentPaths(db: Database, collectionName: string): string[];
  // Re-export the two embedding-format helpers imported from "./llm.js" above.
  545. export { formatQueryForEmbedding, formatDocForEmbedding };
  546. /**
  547. * Chunk a document using regex-only break point detection.
  548. * This is the sync, backward-compatible API used by tests and legacy callers.
  *
  * @param content - The document text to chunk
  * @param maxChars - Optional chunk size limit in characters (default set in the implementation)
  * @param overlapChars - Optional overlap between chunks in characters (default set in the implementation)
  * @param windowChars - Optional search window in characters (default set in the implementation)
  * @returns The chunks as `{ text, pos }` objects
  549. */
  550. export declare function chunkDocument(content: string, maxChars?: number, overlapChars?: number, windowChars?: number): {
  551. text: string;
  552. pos: number;
  553. }[];
  554. /**
  555. * Async AST-aware chunking. Detects language from filepath, computes AST
  556. * break points for supported code files, merges with regex break points,
  557. * and delegates to the shared chunk algorithm.
  558. *
  559. * Strategies:
  560. * - "regex" (default) — char-based chunking with regex break points only.
  561. * - "auto" — regex break points merged with AST break points (soft hints).
  562. * - "function" — one chunk per AST function range (Phase 2); inter-range
  563. * gaps (imports, top-level code) are char-chunked with AST
  564. * hints. Falls back to "auto" when zero ranges are detected.
  *
  * @param content - The document text to chunk
  * @param maxChars - Optional chunk size limit in characters
  * @param overlapChars - Optional overlap between chunks in characters
  * @param windowChars - Optional search window in characters
  * @param filepath - Optional path used for language detection
  * @param chunkStrategy - Optional strategy; see the list above
  * @returns A promise of the chunks as `{ text, pos }` objects
  565. */
  566. export declare function chunkDocumentAsync(content: string, maxChars?: number, overlapChars?: number, windowChars?: number, filepath?: string, chunkStrategy?: ChunkStrategy): Promise<{
  567. text: string;
  568. pos: number;
  569. }[]>;
  570. /**
  571. * Counts the tokens in `text`. Used by `chunkDocumentByTokens` for the
  572. * safety re-split that splits chunks exceeding `maxTokens`.
  573. *
  574. * When `chunkDocumentByTokens` is called WITHOUT a tokenizer (default),
  575. * it lazily resolves `getDefaultLlamaCpp()` and uses `llm.tokenize` —
  576. * accurate but expensive (loads the local GGUF embed model + initialises
  577. * llama.cpp, ~22s on cold cache).
  578. *
  579. * Provider-mode callers (HTTP embed providers like the GPU worker on
  580. * `models` LXC) MUST pass a JS-only approximator to avoid loading the
  581. * local model entirely. A char-based estimate like
  582. * `Math.ceil(text.length / 3)` is a reasonable default — it matches the
  583. * `avgCharsPerToken=3` heuristic used for the initial char-space chunk
  584. * step, so the safety re-split stays a near no-op while populating the
  585. * `tokens` field with a stable estimate.
  *
  * An implementation may return the count directly or as a Promise; both
  * forms are allowed by the return type.
  586. */
  587. export type TokenCounter = (text: string) => number | Promise<number>;
  588. /**
  589. * Chunk a document by actual token count using the LLM tokenizer.
  590. * More accurate than character-based chunking but requires async.
  591. *
  592. * When `tokenizer` is supplied, it is used in place of the local
  593. * `llm.tokenize(...)` call — neither `getDefaultLlamaCpp()` nor
  594. * `llm.tokenize(...)` is invoked. This lets remote-only deployments
  595. * (`QMD_EMBED_ENDPOINT=...`) chunk documents without warming up
  596. * node-llama-cpp (DoD #1 of i-1rqixh6m / i-qkarfffa).
  597. *
  598. * When `filepath` and `chunkStrategy` are provided, uses AST-aware break
  599. * points for supported code files.
  *
  * @param content - The document text to chunk
  * @param maxTokens - Optional chunk size limit in tokens
  * @param overlapTokens - Optional overlap between chunks in tokens
  * @param windowTokens - Optional search window in tokens
  * @param filepath - Optional path of the document
  * @param chunkStrategy - Optional chunking strategy
  * @param signal - Optional AbortSignal. NOTE(review): presumably lets the
  * caller cancel chunking — confirm how abort is surfaced.
  * @param tokenizer - Optional token counter used instead of the local tokenizer
  * @returns A promise of the chunks as `{ text, pos, tokens }` objects
  600. */
  601. export declare function chunkDocumentByTokens(content: string, maxTokens?: number, overlapTokens?: number, windowTokens?: number, filepath?: string, chunkStrategy?: ChunkStrategy, signal?: AbortSignal, tokenizer?: TokenCounter): Promise<{
  602. text: string;
  603. pos: number;
  604. tokens: number;
  605. }[]>;
  606. /**
  607. * Normalize a docid input by stripping surrounding quotes and leading #.
  608. * Handles: "#abc123", 'abc123', "abc123", #abc123, abc123
  609. * Returns the bare hex string.
  *
  * @param docid - The raw docid input
  610. */
  611. export declare function normalizeDocid(docid: string): string;
  612. /**
  613. * Check if a string looks like a docid reference.
  614. * Accepts: #abc123, abc123, "#abc123", "abc123", '#abc123', 'abc123'
  615. * Returns true if the normalized form is a valid hex string of 6+ chars.
  *
  * @param input - The string to test
  616. */
  617. export declare function isDocid(input: string): boolean;
  618. /**
  619. * Find a document by its short docid (first 6 characters of hash).
  620. * Returns `{ filepath, hash }` for the match, or null when none is found.
  * NOTE(review): `filepath` is presumably the document's virtual path, which is
  * what the earlier wording of this note described as the return value — confirm.
  621. * If multiple documents match the same short hash (collision), returns the first one.
  622. *
  623. * Accepts lenient input: #abc123, abc123, "#abc123", "abc123"
  *
  * @param db - Database handle used for the lookup
  * @param docid - The docid to look up, in any of the accepted forms above
  624. */
  625. export declare function findDocumentByDocid(db: Database, docid: string): {
  626. filepath: string;
  627. hash: string;
  628. } | null;
  /**
  * Returns file path strings related to `query`.
  * NOTE(review): `maxDistance` presumably bounds an edit-distance match and
  * `limit` caps the number of results; defaults are not visible here — confirm.
  */
  629. export declare function findSimilarFiles(db: Database, query: string, maxDistance?: number, limit?: number): string[];
  /**
  * Returns `{ filepath, displayPath, bodyLength }` records for `pattern`.
  * NOTE(review): presumably one record per indexed document whose path matches
  * the glob — confirm which path form the glob is applied to.
  */
  630. export declare function matchFilesByGlob(db: Database, pattern: string): {
  631. filepath: string;
  632. displayPath: string;
  633. bodyLength: number;
  634. }[];
  635. /**
  636. * Get context for a file path using hierarchical inheritance.
  637. * Contexts are collection-scoped and inherit from parent directories.
  638. * For example, context at "/talks" applies to "/talks/2024/keynote.md".
  639. *
  640. * @param db Database instance (unused - kept for compatibility)
  641. * @param collectionName Collection name
  642. * @param path Relative path within the collection
  643. * @returns Context string or null if no context is defined
  644. */
  645. export declare function getContextForPath(db: Database, collectionName: string, path: string): string | null;
  646. /**
  647. * Get context for a file path (virtual or filesystem).
  648. * Resolves the collection and relative path from the DB store_collections table.
       *
       * @param db Database holding the store_collections table.
       * @param filepath Virtual or filesystem path of the file.
       * @returns Context string, or null (presumably when the path cannot be resolved
       *          to a collection or no context applies — confirm).
  649. */
  650. export declare function getContextForFile(db: Database, filepath: string): string | null;
  651. /**
  652. * Get collection by name from DB store_collections table.
       *
       * @param db Database holding the store_collections table.
       * @param name Collection name to look up.
       * @returns The collection's `name`, `pwd` and `glob_pattern`, or null
       *          (presumably when no collection has that name — confirm).
  653. */
  654. export declare function getCollectionByName(db: Database, name: string): {
  655. name: string;
  656. pwd: string;
  657. glob_pattern: string;
  658. } | null;
  659. /**
  660. * List all collections with document counts from database.
  661. * Merges store_collections config with database statistics.
       *
       * @param db Database to read from.
       * @returns One entry per collection. `name`, `pwd` and `glob_pattern` mirror the
       *          shape returned by getCollectionByName(); `doc_count`, `active_count` and
       *          `last_modified` are the database statistics (`last_modified` may be null).
       *          NOTE(review): the exact difference between `doc_count` and `active_count`,
       *          and the meaning of `includeByDefault`, are not visible here — confirm.
  662. */
  663. export declare function listCollections(db: Database): {
  664. name: string;
  665. pwd: string;
  666. glob_pattern: string;
  667. doc_count: number;
  668. active_count: number;
  669. last_modified: string | null;
  670. includeByDefault: boolean;
  671. }[];
  672. /**
  673. * Remove a collection and clean up its documents.
  674. * Uses collections.ts to remove from YAML config and cleans up database.
       *
       * @param db Database to clean up.
       * @param collectionName Name of the collection to remove.
       * @returns Counters for the cleanup: `deletedDocs` and `cleanedHashes`
       *          (presumably removed document rows and orphaned content hashes — confirm).
  675. */
  676. export declare function removeCollection(db: Database, collectionName: string): {
  677. deletedDocs: number;
  678. cleanedHashes: number;
  679. };
  680. /**
  681. * Rename a collection.
  682. * Updates both YAML config and database documents table.
       *
       * NOTE(review): behavior when `oldName` does not exist or `newName` is already
       * taken is not visible from this declaration — confirm.
       *
       * @param db Database whose documents table is updated.
       * @param oldName Current collection name.
       * @param newName New collection name.
  683. */
  684. export declare function renameCollection(db: Database, oldName: string, newName: string): void;
  685. /**
  686. * Insert or update a context for a specific collection and path prefix.
       *
       * NOTE(review): this takes a numeric `collectionId`, whereas deleteContext() and the
       * other context helpers declared in this file identify a collection by name —
       * confirm that callers pass the right kind of key.
       *
       * @param db Database to write to.
       * @param collectionId Numeric identifier of the collection (see note above).
       * @param pathPrefix Path prefix the context applies to.
       * @param context Context text to store.
  687. */
  688. export declare function insertContext(db: Database, collectionId: number, pathPrefix: string, context: string): void;
  689. /**
  690. * Delete a context for a specific collection and path prefix.
  691. * Returns the number of contexts deleted.
       *
       * @param db Database to delete from.
       * @param collectionName Name of the collection that owns the context.
       * @param pathPrefix Path prefix whose context should be removed.
       * @returns Number of contexts deleted.
  692. */
  693. export declare function deleteContext(db: Database, collectionName: string, pathPrefix: string): number;
  694. /**
  695. * Delete all global contexts (contexts with empty path_prefix).
  696. * Returns the number of contexts deleted.
       *
       * @param db Database to delete from.
       * @returns Number of contexts deleted.
  697. */
  698. export declare function deleteGlobalContexts(db: Database): number;
  699. /**
  700. * List all contexts, grouped by collection.
  701. * Returns contexts ordered by collection name, then by path prefix length (longest first).
       *
       * @param db Database to read from.
       * @returns One entry per context with its `collection_name`, `path_prefix` and `context` text.
  702. */
  703. export declare function listPathContexts(db: Database): {
  704. collection_name: string;
  705. path_prefix: string;
  706. context: string;
  707. }[];
  708. /**
  709. * Get all collections (name only - from YAML config).
       *
       * NOTE(review): the signature takes a Database, and the neighbouring collection
       * helpers describe reading the DB store_collections table — confirm whether
       * "from YAML config" is still accurate.
       *
       * @param db Database instance.
       * @returns One `{ name }` entry per collection.
  710. */
  711. export declare function getAllCollections(db: Database): {
  712. name: string;
  713. }[];
  714. /**
  715. * Check which collections don't have any context defined.
  716. * Returns collections that have no context entries at all (not even root context).
       *
       * @param db Database to read from.
       * @returns One entry per context-less collection with its `name`, `pwd` and `doc_count`.
  717. */
  718. export declare function getCollectionsWithoutContext(db: Database): {
  719. name: string;
  720. pwd: string;
  721. doc_count: number;
  722. }[];
  723. /**
  724. * Get top-level directories in a collection that don't have context.
  725. * Useful for suggesting where context might be needed.
       *
       * @param db Database to read from.
       * @param collectionName Collection to inspect.
       * @returns The top-level directory paths that have no context.
  726. */
  727. export declare function getTopLevelPathsWithoutContext(db: Database, collectionName: string): string[];
  /**
   * Presumably cleans a single search term so it is safe to embed in an FTS5 query — confirm.
   *
   * NOTE(review): which characters are removed or escaped is not visible from this
   * declaration; check the implementation before relying on specific output.
   *
   * @param term Raw term.
   * @returns The sanitized term.
   */
  728. export declare function sanitizeFTS5Term(term: string): string;
  729. /**
  730. * Validate that a vec/hyde query doesn't use lex-only syntax.
  731. * Returns error message if invalid, null if valid.
  732. *
  733. * Negation is detected ONLY when `-` is preceded by whitespace or sits at
  734. * the start of the query. Hyphens inside words (e.g. `auto-archived`,
  735. * `pre-commit`, `multi-session`, `state-of-the-art`) carry no negation
  736. * semantics in natural English and must pass through unchanged.
       *
       * @param query Text of a vec or hyde query.
       * @returns An error message when the query uses lex-only syntax, otherwise null.
  737. */
  738. export declare function validateSemanticQuery(query: string): string | null;
  /**
   * Validate a lex query.
   *
   * NOTE(review): presumably follows the same convention as validateSemanticQuery()
   * (error message when invalid, null when valid); the rules it enforces are not
   * visible from this declaration — confirm.
   *
   * @param query Text of a lex query.
   * @returns A string or null (see note above).
   */
  739. export declare function validateLexQuery(query: string): string | null;
  /**
   * Run a full-text (FTS) search and return the matching results.
   *
   * NOTE(review): score scale, ordering and the default `limit` are not visible
   * from this declaration — confirm against the implementation.
   *
   * @param db Database to search.
   * @param query Query text.
   * @param limit Optional maximum number of results.
   * @param collectionName Optional collection to restrict the search to.
   * @returns Matching results.
   */
  740. export declare function searchFTS(db: Database, query: string, limit?: number, collectionName?: string): SearchResult[];
  /**
   * Run a vector (embedding-based) search and resolve with the matching results.
   *
   * NOTE(review): only the signature is visible here. Presumably `precomputedEmbedding`
   * lets the caller skip embedding `query`, and `session` / `embedProvider` select how
   * the query is embedded when no vector is supplied — confirm precedence and defaults.
   *
   * @param db Database to search.
   * @param query Query text.
   * @param model Embedding model identifier.
   * @param limit Optional maximum number of results.
   * @param collectionName Optional collection to restrict the search to.
   * @param session Optional LLM session (see note above).
   * @param precomputedEmbedding Optional ready-made query vector (see note above).
   * @param embedProvider Optional embedding provider (see note above).
   * @returns Promise of the matching results.
   */
  741. export declare function searchVec(db: Database, query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider): Promise<SearchResult[]>;
  742. /**
  743. * Get all unique content hashes that need embeddings (from active documents).
  744. * Returns hash, document body, and a sample path for display purposes.
       *
       * @param db Database to read from.
       * @returns One entry per hash: `hash`, the document `body`, and a sample `path` for display.
  745. */
  746. export declare function getHashesForEmbedding(db: Database): {
  747. hash: string;
  748. body: string;
  749. path: string;
  750. }[];
  751. /**
  752. * Clear all embeddings from the database (force re-index).
  753. * Deletes all rows from content_vectors and drops the vectors_vec table.
       *
       * @param db Database whose embeddings are removed.
  754. */
  755. export declare function clearAllEmbeddings(db: Database): void;
  756. /**
  757. * Get the distinct set of model identifiers present in `content_vectors`.
  758. *
  759. * Used by the embedding migration-safety guard: if a configured provider's
  760. * `getModelId()` does not appear in this list (and the table is non-empty),
  761. * we refuse to embed and ask the user to run `qmd embed -f` to rebuild.
  762. *
  763. * Returns `[]` when the table is empty (fresh DB) — in which case any
  764. * provider is allowed.
       *
       * @param db Database to read from.
       * @returns The distinct model identifiers, or an empty array for an empty table.
  765. */
  766. export declare function getDistinctEmbeddingModels(db: Database): string[];
  767. /**
  768. * Insert a single embedding into both content_vectors and vectors_vec tables.
  769. * The hash_seq key is formatted as "hash_seq" for the vectors_vec table.
  770. *
  771. * content_vectors is inserted first so that getHashesForEmbedding (which checks
  772. * only content_vectors) won't re-select the hash on a crash between the two inserts.
  773. *
  774. * vectors_vec uses DELETE + INSERT instead of INSERT OR REPLACE because sqlite-vec's
  775. * vec0 virtual tables silently ignore the OR REPLACE conflict clause.
       *
       * @param db Database to write to.
       * @param hash Content hash the embedding belongs to.
       * @param seq Sequence number combined with `hash` to form the vectors_vec key
       *            (presumably the chunk index within the document — confirm).
       * @param pos NOTE(review): presumably the chunk's position within the document — confirm unit.
       * @param embedding The embedding vector.
       * @param model Identifier of the model that produced the embedding.
       * @param embeddedAt Timestamp string recorded with the embedding (format not visible here).
  776. */
  777. export declare function insertEmbedding(db: Database, hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string): void;
  /**
   * Expand a query into typed query variants (lex/vec/hyde), as used by the
   * hybridQuery() and vectorSearchQuery() pipelines declared in this file.
   *
   * NOTE(review): only the signature is visible here. `db` is presumably used for
   * caching, and `llmOverride` presumably replaces the default LLM — confirm both,
   * and what happens when `model` is undefined.
   *
   * @param query Original query text.
   * @param model Model identifier, or undefined (see note above).
   * @param db Database instance (see note above).
   * @param intent Optional intent hint.
   * @param llmOverride Optional LLM instance (see note above).
   * @returns Promise of the expanded query variants.
   */
  778. export declare function expandQuery(query: string, model: string | undefined, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise<ExpandedQuery[]>;
  /**
   * Score a list of documents against a query and resolve with one
   * `{ file, score }` entry per scored document.
   *
   * NOTE(review): score range, output ordering, and the roles of `db` and
   * `llmOverride` are not visible from this declaration — confirm.
   *
   * @param query Query text to rank against.
   * @param documents Candidates, each identified by `file` with the `text` to score.
   * @param model Model identifier, or undefined.
   * @param db Database instance.
   * @param intent Optional intent hint.
   * @param llmOverride Optional LLM instance.
   * @returns Promise of `{ file, score }` entries.
   */
  779. export declare function rerank(query: string, documents: {
  780. file: string;
  781. text: string;
  782. }[], model: string | undefined, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise<{
  783. file: string;
  784. score: number;
  785. }[]>;
  /**
   * Fuse several ranked result lists into a single ranked list (RRF).
   *
   * NOTE(review): presumably `weights` holds one weight per entry of `resultLists`
   * and `k` is the RRF rank-smoothing constant; their defaults are not visible
   * from this declaration — confirm.
   *
   * @param resultLists Ranked lists to fuse.
   * @param weights Optional weights (see note above).
   * @param k Optional constant (see note above).
   * @returns The fused ranked list.
   */
  786. export declare function reciprocalRankFusion(resultLists: RankedResult[][], weights?: number[], k?: number): RankedResult[];
  787. /**
  788. * Build per-document RRF contribution traces for explain/debug output.
       *
       * The first two and the last parameter mirror reciprocalRankFusion(); `listMeta`
       * additionally describes each ranked list (see RankedListMeta).
       *
       * @param resultLists Ranked lists that were (or will be) fused.
       * @param weights Optional weights, as for reciprocalRankFusion().
       * @param listMeta Optional metadata, presumably one entry per list in `resultLists` — confirm.
       * @param k Optional constant, as for reciprocalRankFusion().
       * @returns A map of traces keyed by a string (presumably the document's file path — confirm).
  789. */
  790. export declare function buildRrfTrace(resultLists: RankedResult[][], weights?: number[], listMeta?: RankedListMeta[], k?: number): Map<string, RRFScoreTrace>;
  791. /**
  792. * Find a document by filename/path, docid (#hash), or with fuzzy matching.
  793. * Returns document metadata without body by default.
  794. *
  795. * Supports:
  796. * - Virtual paths: qmd://collection/path/to/file.md
  797. * - Absolute paths: /path/to/file.md
  798. * - Relative paths: path/to/file.md
  799. * - Short docid: #abc123 (first 6 chars of hash)
       *
       * @param db Database to search.
       * @param filename Path or docid in any of the supported forms above.
       * @param options Set `includeBody` to also load the document body.
       * @returns The document metadata on success, or a DocumentNotFound value when nothing matches.
  800. */
  801. export declare function findDocument(db: Database, filename: string, options?: {
  802. includeBody?: boolean;
  803. }): DocumentResult | DocumentNotFound;
  804. /**
  805. * Get the body content for a document
  806. * Optionally slice by line range
       *
       * @param db Database to read from.
       * @param doc A DocumentResult, or any object carrying the document's `filepath`.
       * @param fromLine Optional first line of the slice
       *                 (NOTE(review): whether this is 0- or 1-based is not visible here — confirm).
       * @param maxLines Optional maximum number of lines to return.
       * @returns The (optionally sliced) body, or null (presumably when the document is not found — confirm).
  807. */
  808. export declare function getDocumentBody(db: Database, doc: DocumentResult | {
  809. filepath: string;
  810. }, fromLine?: number, maxLines?: number): string | null;
  811. /**
  812. * Find multiple documents by glob pattern or comma-separated list
  813. * Returns documents without body by default (use getDocumentBody to load)
       *
       * @param db Database to search.
       * @param pattern Glob pattern or comma-separated list of documents.
       * @param options `includeBody` loads bodies with the results; `maxBytes` is presumably
       *                a size cap applied per document — NOTE(review): confirm what happens
       *                to documents that exceed it.
       * @returns `docs` with the resolved documents and `errors` with messages for
       *          entries that could not be resolved (presumed from the field name — confirm).
  814. */
  815. export declare function findDocuments(db: Database, pattern: string, options?: {
  816. includeBody?: boolean;
  817. maxBytes?: number;
  818. }): {
  819. docs: MultiGetResult[];
  820. errors: string[];
  821. };
  /**
   * Return the current index status for a database.
   *
   * NOTE(review): the contents of IndexStatus are declared elsewhere and are not
   * visible in this part of the file.
   *
   * @param db Database to inspect.
   * @returns The index status.
   */
  822. export declare function getStatus(db: Database): IndexStatus;
  /**
   * Shape returned by extractSnippet().
   *
   * NOTE(review): field meanings below are inferred from their names — confirm:
   * `line` is presumably the line number where the snippet starts, `snippet` the
   * extracted text, `linesBefore` / `linesAfter` the number of body lines outside
   * the snippet on either side, and `snippetLines` the number of lines in the snippet.
   */
  823. export type SnippetResult = {
  824. line: number;
  825. snippet: string;
  826. linesBefore: number;
  827. linesAfter: number;
  828. snippetLines: number;
  829. };
  830. /** Weight for intent terms relative to query terms (1.0) in snippet scoring, i.e. an intent term counts 0.3 of a query term. */
  831. export declare const INTENT_WEIGHT_SNIPPET = 0.3;
  832. /** Weight for intent terms relative to query terms (1.0) in chunk selection, i.e. an intent term counts 0.5 of a query term. */
  833. export declare const INTENT_WEIGHT_CHUNK = 0.5;
  834. /**
  835. * Extract meaningful terms from an intent string, filtering stop words and punctuation.
  836. * Uses Unicode-aware punctuation stripping so domain terms like "API" survive.
  837. * Returns lowercase terms suitable for text matching.
       *
       * @param intent Free-text intent string.
       * @returns Lowercase terms with stop words and punctuation removed.
  838. */
  839. export declare function extractIntentTerms(intent: string): string[];
  /**
   * Extract a snippet of `body` relevant to `query` and describe where it sits.
   *
   * Intent terms, when supplied, take part in snippet scoring at a reduced weight
   * (see INTENT_WEIGHT_SNIPPET).
   *
   * NOTE(review): only the signature is visible here. `maxLen` is presumably the
   * maximum snippet length, and `chunkPos` / `chunkLen` presumably point at a
   * pre-selected chunk within `body` — confirm units and defaults.
   *
   * @param body Full document text.
   * @param query Query text used to choose the snippet.
   * @param maxLen Optional length limit (see note above).
   * @param chunkPos Optional chunk position (see note above).
   * @param chunkLen Optional chunk length (see note above).
   * @param intent Optional intent string.
   * @returns The snippet and its line bookkeeping.
   */
  840. export declare function extractSnippet(body: string, query: string, maxLen?: number, chunkPos?: number, chunkLen?: number, intent?: string): SnippetResult;
  841. /**
  842. * Add line numbers to text content.
  843. * Each line becomes: "{lineNum}: {content}"
       *
       * @param text Text whose lines should be numbered.
       * @param startLine Optional number given to the first line
       *                  (NOTE(review): default not visible here, presumably 1 — confirm).
       * @returns The text with every line prefixed by its number.
  844. */
  845. export declare function addLineNumbers(text: string, startLine?: number): string;
  846. /**
  847. * Optional progress hooks for search orchestration.
  848. * CLI wires these to stderr for user feedback; MCP leaves them unset.
       *
       * Every hook is optional; callers supply only the ones they need.
  849. */
  850. export interface SearchHooks {
  851. /** BM25 probe found strong signal — expansion will be skipped */
  852. onStrongSignal?: (topScore: number) => void;
  853. /** Query expansion starting */
  854. onExpandStart?: () => void;
  855. /** Query expansion complete. Empty array = strong signal skip. elapsedMs = time taken. */
  856. onExpand?: (original: string, expanded: ExpandedQuery[], elapsedMs: number) => void;
  857. /** Embedding starting (vec/hyde queries). `count` is presumably the number of queries to embed — confirm. */
  858. onEmbedStart?: (count: number) => void;
  859. /** Embedding complete. elapsedMs = time taken. */
  860. onEmbedDone?: (elapsedMs: number) => void;
  861. /** Reranking is about to start. chunkCount = number of chunks handed to the reranker. */
  862. onRerankStart?: (chunkCount: number) => void;
  863. /** Reranking finished. elapsedMs = time taken. */
  864. onRerankDone?: (elapsedMs: number) => void;
  865. }
  /**
   * Options accepted by hybridQuery(). Every field is optional.
   */
  866. export interface HybridQueryOptions {
       /** Restrict the search to one collection (presumably by name — confirm). */
  867. collection?: string;
       /** Maximum number of results kept by the final slice of the hybridQuery() pipeline. */
  868. limit?: number;
       /** Score threshold used by the pipeline's minScore filter. */
  869. minScore?: number;
       /** Number of fused candidates kept after RRF fusion, before chunking and reranking. */
  870. candidateLimit?: number;
       /** NOTE(review): presumably requests that HybridQueryResult.explain be populated — confirm. */
  871. explain?: boolean;
       /** Optional intent hint for the search. */
  872. intent?: string;
       /** NOTE(review): presumably skips the rerank step of the pipeline — confirm. */
  873. skipRerank?: boolean;
       /** Chunking strategy (ChunkStrategy is declared elsewhere in this file). */
  874. chunkStrategy?: ChunkStrategy;
       /** Progress callbacks; see SearchHooks. */
  875. hooks?: SearchHooks;
  876. /**
  877. * Optional embedding provider for query-side encoding (i-loazq6ze).
  878. * When supplied, the original-query vector AND any vec/hyde expansion
  879. * variants are encoded through this provider (HTTP, GPU worker,
  880. * AutoFallback chain) instead of `getLlm(store).embedBatch(...)`. Skip
  881. * to keep pre-patch behavior (uses local LlamaCpp).
  882. */
  883. embedProvider?: EmbeddingProvider;
  884. }
  /**
   * One result entry returned by hybridQuery() and structuredSearch().
   *
   * NOTE(review): field meanings are inferred from their names and from the
   * pipeline description on hybridQuery() — confirm: `bestChunk` / `bestChunkPos`
   * presumably hold the chunk selected for reranking and its position in `body`,
   * `score` the final blended score, `context` the context text for the file
   * (or null), and `explain` the optional explain/debug payload.
   */
  885. export interface HybridQueryResult {
  886. file: string;
  887. displayPath: string;
  888. title: string;
  889. body: string;
  890. bestChunk: string;
  891. bestChunkPos: number;
  892. score: number;
  893. context: string | null;
  894. docid: string;
  895. explain?: HybridQueryExplain;
  896. }
  /**
   * Describes one ranked result list, as passed to buildRrfTrace() via `listMeta`.
   *
   * `source` names the backend that produced the list ("fts" or "vec"),
   * `queryType` the kind of query variant ("original", "lex", "vec" or "hyde"),
   * and `query` the query text (presumably the variant's own text — confirm).
   */
  897. export type RankedListMeta = {
  898. source: "fts" | "vec";
  899. queryType: "original" | "lex" | "vec" | "hyde";
  900. query: string;
  901. };
  902. /**
  903. * Hybrid search: BM25 + vector + query expansion + RRF + chunked reranking.
  904. *
  905. * Pipeline:
  906. * 1. BM25 probe → skip expansion if strong signal
  907. * 2. expandQuery() → typed query variants (lex/vec/hyde)
  908. * 3. Type-routed search: original→vector, lex→FTS, vec/hyde→vector
  909. * 4. RRF fusion → slice to candidateLimit
  910. * 5. chunkDocument() + keyword-best-chunk selection
  911. * 6. rerank on chunks (NOT full bodies — O(tokens) trap)
  912. * 7. Position-aware score blending (RRF rank × reranker score)
  913. * 8. Dedup by file, filter by minScore, slice to limit
       *
       * @param store Store to search.
       * @param query Query text.
       * @param options Optional tuning and hooks; see HybridQueryOptions.
       * @returns Promise of the results left after dedup, minScore filtering and the limit slice.
  914. */
  915. export declare function hybridQuery(store: Store, query: string, options?: HybridQueryOptions): Promise<HybridQueryResult[]>;
  /**
   * Options accepted by vectorSearchQuery(). Every field is optional.
   */
  916. export interface VectorSearchOptions {
       /** Restrict the search to one collection (presumably by name — confirm). */
  917. collection?: string;
       /** Maximum number of results kept by the final slice. */
  918. limit?: number;
       /** Score threshold used by the minScore filter. */
  919. minScore?: number;
       /** Optional intent hint for the search. */
  920. intent?: string;
       /** Only the `onExpand` hook of SearchHooks is accepted here. */
  921. hooks?: Pick<SearchHooks, 'onExpand'>;
  922. /**
  923. * Optional embedding provider for query-side encoding (i-loazq6ze).
  924. * When supplied, query vectors are encoded via the provider (HTTP /
  925. * GPU worker / fallback chain) instead of the local llama-cpp model.
  926. */
  927. embedProvider?: EmbeddingProvider;
  928. }
  /**
   * One result entry returned by vectorSearchQuery().
   *
   * Carries the same fields as HybridQueryResult except `bestChunk`,
   * `bestChunkPos` and `explain`.
   * NOTE(review): `context` is presumably the context text for the file, or null
   * when none is defined — confirm.
   */
  929. export interface VectorSearchResult {
  930. file: string;
  931. displayPath: string;
  932. title: string;
  933. body: string;
  934. score: number;
  935. context: string | null;
  936. docid: string;
  937. }
  938. /**
  939. * Vector-only semantic search with query expansion.
  940. *
  941. * Pipeline:
  942. * 1. expandQuery() → typed variants, filter to vec/hyde only (lex irrelevant here)
  943. * 2. searchVec() for original + vec/hyde variants (sequential — node-llama-cpp embed limitation)
  944. * 3. Dedup by filepath (keep max score)
  945. * 4. Sort by score descending, filter by minScore, slice to limit
       *
       * @param store Store to search.
       * @param query Query text.
       * @param options Optional tuning and hooks; see VectorSearchOptions.
       * @returns Promise of the results, sorted by score descending.
  946. */
  947. export declare function vectorSearchQuery(store: Store, query: string, options?: VectorSearchOptions): Promise<VectorSearchResult[]>;
  948. /**
  949. * Options accepted by structuredSearch(). Every field is optional.
  950. *
       * NOTE(review): this comment previously read "A single sub-search in a structured
       * search request. Matches the format used in QMD training data." That wording
       * describes an individual sub-search entry rather than this options bag, so it
       * was most likely carried over from a neighbouring declaration — confirm where
       * that description belongs.
  951. */
  952. export interface StructuredSearchOptions {
       /** Restrict the search to these collections (presumably by name — confirm). */
  953. collections?: string[];
  954. limit?: number;
  955. minScore?: number;
  956. candidateLimit?: number;
  957. explain?: boolean;
  958. /** Domain intent hint for disambiguation — steers reranking and chunk selection */
  959. intent?: string;
  960. /** Skip LLM reranking, use only RRF scores */
  961. skipRerank?: boolean;
  962. chunkStrategy?: ChunkStrategy;
       /** Progress callbacks; see SearchHooks. */
  963. hooks?: SearchHooks;
  964. /**
  965. * Optional embedding provider for query-side encoding (i-loazq6ze).
  966. * When supplied, vec/hyde sub-queries are batch-encoded via the provider
  967. * (HTTP / GPU worker / fallback chain) instead of `getLlm(store).embedBatch`.
  968. */
  969. embedProvider?: EmbeddingProvider;
  970. }
  971. /**
  972. * Structured search: execute pre-expanded queries without LLM query expansion.
  973. *
  974. * Designed for LLM callers (MCP/HTTP) that generate their own query expansions.
  975. * Skips the internal expandQuery() step — goes directly to:
  976. *
  977. * Pipeline:
  978. * 1. Route searches: lex→FTS, vec/hyde→vector (batch embed)
  979. * 2. RRF fusion across all result lists
  980. * 3. Chunk documents + keyword-best-chunk selection
  981. * 4. Rerank on chunks
  982. * 5. Position-aware score blending
  983. * 6. Dedup, filter, slice
  984. *
  985. * This is the recommended endpoint for capable LLMs — they can generate
  986. * better query variations than our small local model, especially for
  987. * domain-specific or nuanced queries.
       *
       * @param store Store to search.
       * @param searches Pre-expanded sub-queries supplied by the caller (lex, vec or hyde).
       * @param options Optional tuning and hooks; see StructuredSearchOptions.
       * @returns Promise of the results left after dedup, filtering and slicing.
  988. */
  989. export declare function structuredSearch(store: Store, searches: ExpandedQuery[], options?: StructuredSearchOptions): Promise<HybridQueryResult[]>;