store.d.ts 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909
  1. /**
  2. * QMD Store - Core data access and retrieval functions
  3. *
  4. * This module provides all database operations, search functions, and document
  5. * retrieval for QMD. It returns raw data structures that can be formatted by
  6. * CLI or MCP consumers.
  7. *
  8. * Usage:
  9. * const store = createStore("/path/to/db.sqlite");
  10. * // or use default path:
  11. * const store = createStore();
  12. */
  13. import type { Database } from "./db.js";
  14. import { LlamaCpp, formatQueryForEmbedding, formatDocForEmbedding, type ILLMSession } from "./llm.js";
  15. import type { NamedCollection, Collection, CollectionConfig } from "./collections.js";
  /**
  * Module-level defaults: model identifiers, the default glob, multi-get and
  * embedding batch limits, and chunk sizing.
  *
  * Constants declared with a literal (e.g. CHUNK_SIZE_TOKENS = 900) expose their
  * value here; those typed as plain `number` do not, so their values must be
  * read from the implementation.
  * NOTE(review): the CHARS constants presumably derive from the matching
  * TOKENS constants — confirm the conversion in the implementation.
  */
  16. export declare const DEFAULT_EMBED_MODEL = "embeddinggemma";
  17. export declare const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
  18. export declare const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
  19. export declare const DEFAULT_GLOB = "**/*.md";
  20. export declare const DEFAULT_MULTI_GET_MAX_BYTES: number;
  21. export declare const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
  22. export declare const DEFAULT_EMBED_MAX_BATCH_BYTES: number;
  23. export declare const CHUNK_SIZE_TOKENS = 900;
  24. export declare const CHUNK_OVERLAP_TOKENS: number;
  25. export declare const CHUNK_SIZE_CHARS: number;
  26. export declare const CHUNK_OVERLAP_CHARS: number;
  27. export declare const CHUNK_WINDOW_TOKENS = 200;
  28. export declare const CHUNK_WINDOW_CHARS: number;
  29. /**
  30. * A potential break point in the document with a base score indicating quality.
  31. */
  32. export interface BreakPoint {
  /** Position of the break in the text — presumably a character offset; confirm against scanBreakPoints(). */
  33. pos: number;
  /** Base quality score; per BREAK_PATTERNS, higher scores indicate better places to split. */
  34. score: number;
  /** Label for the kind of break — presumably the string from the matching BREAK_PATTERNS entry; TODO confirm. */
  35. type: string;
  36. }
  37. /**
  38. * A region where a code fence exists (between ``` markers).
  39. * We should never split inside a code fence.
  40. */
  41. export interface CodeFenceRegion {
  /** Where the fenced region begins — presumably a character offset into the text; TODO confirm. */
  42. start: number;
  /** Where the fenced region ends. NOTE(review): inclusive vs. exclusive is not visible here. */
  43. end: number;
  44. }
  45. /**
  46. * Patterns for detecting break points in markdown documents.
  47. * Higher scores indicate better places to split.
  48. * Scores are spread wide so headings decisively beat lower-quality breaks.
  49. * Order matters for scoring - more specific patterns first.
  *
  * Each entry is a [RegExp, number, string] tuple — presumably
  * [pattern, score, type label], feeding BreakPoint's `score` and `type`;
  * confirm against the implementation.
  50. */
  51. export declare const BREAK_PATTERNS: [RegExp, number, string][];
  52. /**
  53. * Scan text for all potential break points.
  54. * Returns sorted array of break points with higher-scoring patterns taking precedence
  55. * when multiple patterns match the same position.
  *
  * @param text - The text to scan
  * @returns The break points found in `text`
  * NOTE(review): the sort key is not stated — presumably ascending `pos`; confirm.
  56. */
  57. export declare function scanBreakPoints(text: string): BreakPoint[];
  58. /**
  59. * Find all code fence regions in the text.
  60. * Code fences are delimited by ``` and we should never split inside them.
  *
  * @param text - The text to scan
  * @returns One region per code fence found
  * NOTE(review): handling of an unterminated fence is not visible in this declaration.
  61. */
  62. export declare function findCodeFences(text: string): CodeFenceRegion[];
  63. /**
  64. * Check if a position is inside a code fence region.
  *
  * @param pos - The position to test
  * @param fences - The regions to test against (e.g. from findCodeFences())
  * @returns true when `pos` is inside one of `fences`
  * NOTE(review): whether the region boundaries themselves count as inside is not visible here.
  65. */
  66. export declare function isInsideCodeFence(pos: number, fences: CodeFenceRegion[]): boolean;
  67. /**
  68. * Find the best cut position using scored break points with distance decay.
  69. *
  70. * Uses squared distance for gentler early decay - headings far back still win
  71. * over low-quality breaks near the target.
  72. *
  73. * @param breakPoints - Pre-scanned break points from scanBreakPoints()
  74. * @param targetCharPos - The ideal cut position (e.g., maxChars boundary)
  75. * @param windowChars - How far back to search for break points (default ~200 tokens)
  76. * @param decayFactor - How much to penalize distance (0.7 = 30% score at window edge)
  77. * @param codeFences - Code fence regions to avoid splitting inside
  78. * @returns The best position to cut at
  *
  * The last three parameters are optional; their defaults are applied by the
  * implementation and the exact values are not visible in this declaration.
  * NOTE(review): the result when no break point falls inside the window is not
  * stated — presumably `targetCharPos`; confirm.
  79. */
  80. export declare function findBestCutoff(breakPoints: BreakPoint[], targetCharPos: number, windowChars?: number, decayFactor?: number, codeFences?: CodeFenceRegion[]): number;
  /**
  * Selects how break points are detected when chunking.
  * Per chunkDocumentAsync(), "regex" forces regex-only detection. "auto" presumably
  * enables AST-aware break points for supported code files — confirm in the implementation.
  */
  81. export type ChunkStrategy = "auto" | "regex";
  82. /**
  83. * Merge two sets of break points (e.g. regex + AST), keeping the highest
  84. * score at each position. Result is sorted by position.
  *
  * @param a - First set of break points
  * @param b - Second set of break points
  * @returns The merged break points, sorted by position
  85. */
  86. export declare function mergeBreakPoints(a: BreakPoint[], b: BreakPoint[]): BreakPoint[];
  87. /**
  88. * Core chunk algorithm that operates on precomputed break points and code fences.
  89. * This is the shared implementation used by both regex-only and AST-aware chunking.
  *
  * @param content - The document text to chunk
  * @param breakPoints - Precomputed break points for `content`
  * @param codeFences - Precomputed code fence regions for `content`
  * @param maxChars - Optional; presumably the chunk size limit in characters (default not visible here)
  * @param overlapChars - Optional; presumably the overlap between consecutive chunks in characters (default not visible here)
  * @param windowChars - Optional; presumably the break point search window in characters (default not visible here)
  * @returns Chunks as `{ text, pos }`; `pos` is presumably the chunk's start offset in `content` — confirm
  90. */
  91. export declare function chunkDocumentWithBreakPoints(content: string, breakPoints: BreakPoint[], codeFences: CodeFenceRegion[], maxChars?: number, overlapChars?: number, windowChars?: number): {
  92. text: string;
  93. pos: number;
  94. }[];
  /**
  * Search tuning constants.
  * NOTE(review): how these are applied is not visible in this declaration file. The names
  * suggest a minimum score and a minimum score gap for treating a result as a strong signal,
  * and a cap on the number of candidates passed to reranking — confirm in the implementation.
  */
  95. export declare const STRONG_SIGNAL_MIN_SCORE = 0.85;
  96. export declare const STRONG_SIGNAL_MIN_GAP = 0.15;
  97. export declare const RERANK_CANDIDATE_LIMIT = 40;
  98. /**
  99. * A typed query expansion result. Decoupled from llm.ts internal Queryable —
  100. * same shape, but store.ts owns its own public API type.
  101. *
  102. * - lex: keyword variant → routes to FTS only
  103. * - vec: semantic variant → routes to vector only
  104. * - hyde: hypothetical document → routes to vector only
  105. */
  106. export type ExpandedQuery = {
  /** Which variant this is; determines the routing described above. */
  107. type: 'lex' | 'vec' | 'hyde';
  /** The query text for this variant. */
  108. query: string;
  109. /** Optional line number for error reporting (CLI parser) */
  110. line?: number;
  111. };
  /**
  * Returns a home directory path — presumably the current user's; confirm.
  * NOTE(review): how it is determined (environment variable vs. OS lookup) is not visible here.
  */
  112. export declare function homedir(): string;
  113. /**
  114. * Check if a path is absolute.
  115. * Supports:
  116. * - Unix paths: /path/to/file
  117. * - Windows native: C:\path or C:/path
  118. * - Git Bash: /c/path or /C/path (C-Z drives, excluding A/B floppy drives)
  119. *
  120. * Note: /c without trailing slash is treated as Unix path (directory named "c"),
  121. * while /c/ or /c/path are treated as Git Bash paths (C: drive).
  *
  * @param path - The path to test
  * @returns true if `path` matches one of the absolute forms listed above
  122. */
  123. export declare function isAbsolutePath(path: string): boolean;
  124. /**
  125. * Normalize path separators to forward slashes.
  126. * Converts Windows backslashes to forward slashes.
  *
  * @param path - The path to normalize
  * @returns `path` with forward-slash separators
  127. */
  128. export declare function normalizePathSeparators(path: string): string;
  129. /**
  130. * Get the relative path from a prefix.
  131. * Returns null if path is not under prefix.
  132. * Returns empty string if path equals prefix.
  *
  * @param path - The full path
  * @param prefix - The prefix that `path` is expected to be under
  * @returns The part of `path` relative to `prefix`, "" when they are equal, or null when `path` is not under `prefix`
  133. */
  134. export declare function getRelativePathFromPrefix(path: string, prefix: string): string | null;
  /** Resolves the given path segments into a single path string. NOTE(review): presumably mirrors node:path resolve() while accepting the path forms listed on isAbsolutePath() — confirm. */
  135. export declare function resolve(...paths: string[]): string;
  /** Turns on the production mode flag. NOTE(review): what the flag changes is not visible in this declaration. */
  136. export declare function enableProductionMode(): void;
  137. /** Reset production mode flag — only for testing. */
  138. export declare function _resetProductionModeForTesting(): void;
  /** Returns the default database path; createStore() documents it as ~/.cache/qmd/index.sqlite. NOTE(review): the effect of `indexName` is not visible here. */
  139. export declare function getDefaultDbPath(indexName?: string): string;
  /** Returns a working-directory path — presumably the process's current directory; confirm. */
  140. export declare function getPwd(): string;
  /** Returns the real path for `path` — presumably with symlinks resolved; confirm. */
  141. export declare function getRealPath(path: string): string;
  /**
  * Components of a virtual path such as "qmd://collection-name/path/to/file.md",
  * as produced by parseVirtualPath().
  */
  142. export type VirtualPath = {
  /** The collection name component. */
  143. collectionName: string;
  /** The path component within the collection. NOTE(review): presumably empty for a collection root — confirm. */
  144. path: string;
  145. };
  146. /**
  147. * Normalize explicit virtual path formats to standard qmd:// format.
  148. * Only handles paths that are already explicitly virtual:
  149. * - qmd://collection/path.md (already normalized)
  150. * - qmd:////collection/path.md (extra slashes - normalize)
  151. * - //collection/path.md (missing qmd: prefix - add it)
  152. *
  153. * Does NOT handle:
  154. * - collection/path.md (bare paths - could be filesystem relative)
  155. * - :linenum suffix (should be parsed separately before calling this)
  *
  * @param input - The path to normalize
  * @returns The path in qmd:// form
  * NOTE(review): the result for input that is not explicitly virtual is not stated —
  * presumably returned unchanged; confirm.
  156. */
  157. export declare function normalizeVirtualPath(input: string): string;
  158. /**
  159. * Parse a virtual path like "qmd://collection-name/path/to/file.md"
  160. * into its components.
  161. * Also supports collection root: "qmd://collection-name/" or "qmd://collection-name"
  *
  * @param virtualPath - The virtual path to parse
  * @returns The parsed components, or null — presumably when the input is not a virtual path; confirm
  162. */
  163. export declare function parseVirtualPath(virtualPath: string): VirtualPath | null;
  164. /**
  165. * Build a virtual path from collection name and relative path.
  *
  * @param collectionName - The collection name
  * @param path - The path relative to the collection
  * @returns The virtual path string
  166. */
  167. export declare function buildVirtualPath(collectionName: string, path: string): string;
  168. /**
  169. * Check if a path is explicitly a virtual path.
  170. * Only recognizes explicit virtual path formats:
  171. * - qmd://collection/path.md
  172. * - //collection/path.md
  173. *
  174. * Does NOT consider bare collection/path.md as virtual - that should be
  175. * handled separately by checking if the first component is a collection name.
  *
  * @param path - The path to test
  * @returns true if `path` uses one of the explicit virtual formats above
  176. */
  177. export declare function isVirtualPath(path: string): boolean;
  178. /**
  179. * Resolve a virtual path to absolute filesystem path.
  *
  * @param db - Database to look the collection up in
  * @param virtualPath - The virtual path to resolve
  * @returns The filesystem path, or null — presumably when the path cannot be parsed or its collection is unknown; confirm
  180. */
  181. export declare function resolveVirtualPath(db: Database, virtualPath: string): string | null;
  182. /**
  183. * Convert an absolute filesystem path to a virtual path.
  184. * Returns null if the file is not in any indexed collection.
  *
  * @param db - Database holding the indexed collections
  * @param absolutePath - The absolute filesystem path to convert
  * @returns The virtual path, or null if the file is not in any indexed collection
  185. */
  186. export declare function toVirtualPath(db: Database, absolutePath: string): string | null;
  /**
  * Checks that the sqlite-vec extension is usable on `db`.
  * NOTE(review): returns void, so a failed check is presumably reported by throwing — confirm.
  */
  187. export declare function verifySqliteVecLoaded(db: Database): void;
  /**
  * Accessors and mutators for collection configuration and contexts held in the
  * database. syncConfigToDb() names the backing table as store_collections.
  * NOTE(review): the boolean results of the delete/rename/update/remove functions
  * presumably report whether a matching entry existed or changed — confirm.
  * NOTE(review): passing `undefined` to setStoreGlobalContext() presumably clears
  * the global context — confirm.
  */
  188. export declare function getStoreCollections(db: Database): NamedCollection[];
  189. export declare function getStoreCollection(db: Database, name: string): NamedCollection | null;
  190. export declare function getStoreGlobalContext(db: Database): string | undefined;
  191. export declare function getStoreContexts(db: Database): Array<{
  192. collection: string;
  193. path: string;
  194. context: string;
  195. }>;
  196. export declare function upsertStoreCollection(db: Database, name: string, collection: Omit<Collection, 'pattern'> & {
  197. pattern?: string;
  198. }): void;
  199. export declare function deleteStoreCollection(db: Database, name: string): boolean;
  200. export declare function renameStoreCollection(db: Database, oldName: string, newName: string): boolean;
  201. export declare function updateStoreContext(db: Database, collectionName: string, path: string, text: string): boolean;
  202. export declare function removeStoreContext(db: Database, collectionName: string, path: string): boolean;
  203. export declare function setStoreGlobalContext(db: Database, value: string | undefined): void;
  204. /**
  205. * Sync external config (YAML/inline) into SQLite store_collections.
  206. * External config always wins. Skips sync if config hash hasn't changed.
  *
  * @param db - Database to sync into
  * @param config - The external collection config to apply
  207. */
  208. export declare function syncConfigToDb(db: Database, config: CollectionConfig): void;
  /**
  * Reports whether sqlite-vec can be used.
  * NOTE(review): takes no database argument, so it presumably reflects module-level
  * state set when the extension was loaded — confirm.
  */
  209. export declare function isSqliteVecAvailable(): boolean;
  /**
  * A store instance as produced by createStore(): the database handle, its path,
  * and the store's operations. Most members mirror a same-named module-level
  * function in this file with the leading `db` argument removed; createStore()
  * documents them as bound to the store's database.
  */
  210. export type Store = {
  /** The underlying database handle. */
  211. db: Database;
  /** Path of the database file (see createStore()). */
  212. dbPath: string;
  213. /** Optional LlamaCpp instance for this store (overrides the global singleton) */
  214. llm?: LlamaCpp;
  /** Closes the store. NOTE(review): presumably closes `db` — confirm. */
  215. close: () => void;
  /** NOTE(review): presumably creates the vector table for the given embedding dimensionality if missing — confirm. */
  216. ensureVecTable: (dimensions: number) => void;
  217. getHashesNeedingEmbedding: () => number;
  218. getIndexHealth: () => IndexHealthInfo;
  219. getStatus: () => IndexStatus;
  220. getCacheKey: typeof getCacheKey;
  221. getCachedResult: (cacheKey: string) => string | null;
  222. setCachedResult: (cacheKey: string, result: string) => void;
  223. clearCache: () => void;
  224. deleteLLMCache: () => number;
  225. deleteInactiveDocuments: () => number;
  226. cleanupOrphanedContent: () => number;
  227. cleanupOrphanedVectors: () => number;
  228. vacuumDatabase: () => void;
  229. getContextForFile: (filepath: string) => string | null;
  230. getContextForPath: (collectionName: string, path: string) => string | null;
  231. getCollectionByName: (name: string) => {
  232. name: string;
  233. pwd: string;
  234. glob_pattern: string;
  235. } | null;
  236. getCollectionsWithoutContext: () => {
  237. name: string;
  238. pwd: string;
  239. doc_count: number;
  240. }[];
  241. getTopLevelPathsWithoutContext: (collectionName: string) => string[];
  242. parseVirtualPath: typeof parseVirtualPath;
  243. buildVirtualPath: typeof buildVirtualPath;
  244. isVirtualPath: typeof isVirtualPath;
  245. resolveVirtualPath: (virtualPath: string) => string | null;
  246. toVirtualPath: (absolutePath: string) => string | null;
  247. searchFTS: (query: string, limit?: number, collectionName?: string) => SearchResult[];
  248. searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => Promise<SearchResult[]>;
  /** Expands `query` into typed variants (see ExpandedQuery). NOTE(review): the roles of `model` and `intent` are not visible here. */
  249. expandQuery: (query: string, model?: string, intent?: string) => Promise<ExpandedQuery[]>;
  /** Scores `documents` against `query`, returning one `{ file, score }` per result. NOTE(review): result ordering and score range are not visible here. */
  250. rerank: (query: string, documents: {
  251. file: string;
  252. text: string;
  253. }[], model?: string, intent?: string) => Promise<{
  254. file: string;
  255. score: number;
  256. }[]>;
  /** Looks up a single document; yields DocumentNotFound (with `similarFiles`) instead of throwing when absent. */
  257. findDocument: (filename: string, options?: {
  258. includeBody?: boolean;
  259. }) => DocumentResult | DocumentNotFound;
  260. getDocumentBody: (doc: DocumentResult | {
  261. filepath: string;
  262. }, fromLine?: number, maxLines?: number) => string | null;
  263. findDocuments: (pattern: string, options?: {
  264. includeBody?: boolean;
  265. maxBytes?: number;
  266. }) => {
  267. docs: MultiGetResult[];
  268. errors: string[];
  269. };
  270. findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => string[];
  271. matchFilesByGlob: (pattern: string) => {
  272. filepath: string;
  273. displayPath: string;
  274. bodyLength: number;
  275. }[];
  276. findDocumentByDocid: (docid: string) => {
  277. filepath: string;
  278. hash: string;
  279. } | null;
  280. insertContent: (hash: string, content: string, createdAt: string) => void;
  281. insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => void;
  282. findActiveDocument: (collectionName: string, path: string) => {
  283. id: number;
  284. hash: string;
  285. title: string;
  286. } | null;
  287. updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => void;
  288. updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => void;
  289. deactivateDocument: (collectionName: string, path: string) => void;
  290. getActiveDocumentPaths: (collectionName: string) => string[];
  291. getHashesForEmbedding: () => {
  292. hash: string;
  293. body: string;
  294. path: string;
  295. }[];
  296. clearAllEmbeddings: () => void;
  297. insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => void;
  298. };
  /**
  * Progress info passed to the `onProgress` callback of reindexCollection().
  * NOTE(review): `current`/`total` are presumably the index of the file being
  * processed and the number of files — confirm, including whether `current` is 0- or 1-based.
  */
  299. export type ReindexProgress = {
  300. file: string;
  301. current: number;
  302. total: number;
  303. };
  /**
  * Summary counts returned by reindexCollection().
  * NOTE(review): the field names suggest per-outcome document counts (new, changed,
  * untouched, removed) plus a count of orphaned items cleaned up — confirm the exact
  * definitions in the implementation.
  */
  304. export type ReindexResult = {
  305. indexed: number;
  306. updated: number;
  307. unchanged: number;
  308. removed: number;
  309. orphanedCleaned: number;
  310. };
  311. /**
  312. * Re-index a single collection by scanning the filesystem and updating the database.
  313. * Pure function — no console output, no db lifecycle management.
  *
  * @param store - The store whose database is updated
  * @param collectionPath - Filesystem path of the collection to scan
  * @param globPattern - Glob selecting the files to index
  * @param collectionName - Name of the collection being re-indexed
  * @param options - Optional `ignorePatterns` and an `onProgress` callback receiving ReindexProgress
  * @returns Counts summarizing the re-index (see ReindexResult)
  314. */
  315. export declare function reindexCollection(store: Store, collectionPath: string, globPattern: string, collectionName: string, options?: {
  316. ignorePatterns?: string[];
  317. onProgress?: (info: ReindexProgress) => void;
  318. }): Promise<ReindexResult>;
  /**
  * Progress info passed to the `onProgress` callback of EmbedOptions.
  * NOTE(review): the counters are presumably cumulative for the current
  * generateEmbeddings() run — confirm.
  */
  319. export type EmbedProgress = {
  320. chunksEmbedded: number;
  321. totalChunks: number;
  322. bytesProcessed: number;
  323. totalBytes: number;
  324. errors: number;
  325. };
  /**
  * Summary returned by generateEmbeddings(): documents processed, chunks embedded,
  * error count, and elapsed time in milliseconds.
  */
  326. export type EmbedResult = {
  327. docsProcessed: number;
  328. chunksEmbedded: number;
  329. errors: number;
  330. durationMs: number;
  331. };
  /**
  * Options for generateEmbeddings(). All fields are optional.
  * NOTE(review): defaults are not visible here — presumably DEFAULT_EMBED_MODEL,
  * DEFAULT_EMBED_MAX_DOCS_PER_BATCH and DEFAULT_EMBED_MAX_BATCH_BYTES; confirm.
  */
  332. export type EmbedOptions = {
  /** NOTE(review): presumably re-embeds everything rather than only what is missing — confirm. */
  333. force?: boolean;
  334. model?: string;
  335. maxDocsPerBatch?: number;
  336. maxBatchBytes?: number;
  337. chunkStrategy?: ChunkStrategy;
  /** Callback receiving EmbedProgress updates. */
  338. onProgress?: (info: EmbedProgress) => void;
  339. };
  340. /**
  341. * Generate vector embeddings for documents that need them.
  342. * Pure function — no console output, no db lifecycle management.
  343. * Uses the store's LlamaCpp instance if set, otherwise the global singleton.
  *
  * @param store - The store to read documents from and write embeddings to
  * @param options - Optional settings (see EmbedOptions)
  * @returns Summary of the run (see EmbedResult)
  344. */
  345. export declare function generateEmbeddings(store: Store, options?: EmbedOptions): Promise<EmbedResult>;
  346. /**
  347. * Create a new store instance with the given database path.
  348. * If no path is provided, uses the default path (~/.cache/qmd/index.sqlite).
  349. *
  350. * @param dbPath - Path to the SQLite database file
  351. * @returns Store instance with all methods bound to the database
  *
  * NOTE(review): the default path is presumably supplied by getDefaultDbPath() — confirm.
  352. */
  353. export declare function createStore(dbPath?: string): Store;
  354. /**
  355. * Unified document result type with all metadata.
  356. * Body is optional - use getDocumentBody() to load it separately if needed.
  357. */
  358. export type DocumentResult = {
  359. filepath: string;
  360. displayPath: string;
  361. title: string;
  /** Context text for the document, or null when none applies (see getContextForFile()). */
  362. context: string | null;
  363. hash: string;
  /** Short docid; getDocid() documents it as the first 6 characters of the hash. */
  364. docid: string;
  365. collectionName: string;
  366. modifiedAt: string;
  /** Length of the body. NOTE(review): unit (characters vs. bytes) is not visible here. */
  367. bodyLength: number;
  /** The document body; only present when it was loaded. */
  368. body?: string;
  369. };
  370. /**
  371. * Extract short docid from a full hash (first 6 characters).
  *
  * @param hash - The full content hash
  * @returns The short docid
  372. */
  373. export declare function getDocid(hash: string): string;
  /**
  * Maps a path string to another string.
  * NOTE(review): the transformation is not visible in this declaration — presumably
  * produces a normalized "handle" form of the path; confirm in the implementation.
  */
  374. export declare function handelize(path: string): string;
  375. /**
  376. * Search result extends DocumentResult with score and source info
  377. */
  378. export type SearchResult = DocumentResult & {
  /** Relevance score. NOTE(review): scale and direction are not visible here. */
  379. score: number;
  /** Which search produced the result: "fts" or "vec" (cf. searchFTS() / searchVec()). */
  380. source: "fts" | "vec";
  /** NOTE(review): presumably the position of the matching chunk within the document — confirm. */
  381. chunkPos?: number;
  382. };
  383. /**
  384. * Ranked result for RRF fusion (simplified, used internally)
  *
  * Unlike DocumentResult, `body` is required here and the path field is named `file`.
  385. */
  386. export type RankedResult = {
  387. file: string;
  388. displayPath: string;
  389. title: string;
  390. body: string;
  391. score: number;
  392. };
  /**
  * Trace of one result list's contribution to a document's fused (RRF) score.
  * NOTE(review): field meanings are inferred from names only — `listIndex`/`source`/
  * `queryType`/`query` presumably identify the result list, `rank` and `backendScore`
  * the document's standing in it, and `weight`/`rrfContribution` the amount added to
  * the fused score. Confirm against the implementation.
  */
  393. export type RRFContributionTrace = {
  394. listIndex: number;
  395. source: "fts" | "vec";
  396. queryType: "original" | "lex" | "vec" | "hyde";
  397. query: string;
  398. rank: number;
  399. weight: number;
  400. backendScore: number;
  401. rrfContribution: number;
  402. };
  /**
  * Score breakdown for one fused result: the per-list contributions plus totals.
  * NOTE(review): presumably totalScore = baseScore + topRankBonus, with baseScore the
  * sum of the contributions — confirm.
  */
  403. export type RRFScoreTrace = {
  404. contributions: RRFContributionTrace[];
  405. baseScore: number;
  406. topRank: number;
  407. topRankBonus: number;
  408. totalScore: number;
  409. };
  /**
  * Explanation data for one hybrid-query result: raw FTS and vector scores, the RRF
  * breakdown, the rerank score, and the final blended score.
  * NOTE(review): how `blendedScore` combines the RRF and rerank scores is not
  * visible in this declaration.
  */
  410. export type HybridQueryExplain = {
  411. ftsScores: number[];
  412. vectorScores: number[];
  413. rrf: {
  414. rank: number;
  415. positionScore: number;
  416. weight: number;
  417. baseScore: number;
  418. topRankBonus: number;
  419. totalScore: number;
  420. contributions: RRFContributionTrace[];
  421. };
  422. rerankScore: number;
  423. blendedScore: number;
  424. };
  425. /**
  426. * Error result when document is not found
  427. */
  428. export type DocumentNotFound = {
  /** Literal tag that distinguishes this shape from DocumentResult. */
  429. error: "not_found";
  /** NOTE(review): presumably the lookup string that failed to match — confirm. */
  430. query: string;
  /** Candidate files that resemble the query (cf. findSimilarFiles()). */
  431. similarFiles: string[];
  432. };
  433. /**
  434. * Result from multi-get operations
  *
  * Discriminated on `skipped`: when false, `doc` is a full DocumentResult; when true,
  * `doc` carries only `filepath` and `displayPath`, and `skipReason` says why.
  * NOTE(review): skipping presumably relates to the `maxBytes` option of
  * findDocuments — confirm.
  435. */
  436. export type MultiGetResult = {
  437. doc: DocumentResult;
  438. skipped: false;
  439. } | {
  440. doc: Pick<DocumentResult, "filepath" | "displayPath">;
  441. skipped: true;
  442. skipReason: string;
  443. };
  /**
  * Per-collection summary reported inside IndexStatus.
  * NOTE(review): `path` and `pattern` are nullable — presumably null when the
  * collection has documents in the index but no stored configuration; confirm.
  */
  444. export type CollectionInfo = {
  445. name: string;
  446. path: string | null;
  447. pattern: string | null;
  448. documents: number;
  449. lastUpdated: string;
  450. };
  /**
  * Overall index status as returned by Store.getStatus(): document totals, how many
  * items still need embedding, whether a vector index exists, and per-collection info.
  */
  451. export type IndexStatus = {
  452. totalDocuments: number;
  453. needsEmbedding: number;
  454. hasVectorIndex: boolean;
  455. collections: CollectionInfo[];
  456. };
  /**
  * Returns a count (not the hashes themselves) of content hashes that still need
  * embeddings; see getHashesForEmbedding() for the rows.
  */
  457. export declare function getHashesNeedingEmbedding(db: Database): number;
  /**
  * Health summary returned by getIndexHealth().
  * NOTE(review): `daysStale` is presumably the number of days since the index was
  * last updated, and null when that cannot be determined — confirm.
  */
  458. export type IndexHealthInfo = {
  459. needsEmbedding: number;
  460. totalDocs: number;
  461. daysStale: number | null;
  462. };
  /** Computes the IndexHealthInfo summary for `db`. */
  463. export declare function getIndexHealth(db: Database): IndexHealthInfo;
  /**
  * Result cache helpers.
  * getCacheKey() derives a string key from a URL and a request body and takes no
  * database; the other three read, write and clear cached string results in `db`.
  * NOTE(review): the key derivation (presumably a hash of both inputs) is not visible here.
  */
  464. export declare function getCacheKey(url: string, body: object): string;
  /** Returns the cached result for `cacheKey`, or null — presumably when there is no entry; confirm. */
  465. export declare function getCachedResult(db: Database, cacheKey: string): string | null;
  466. export declare function setCachedResult(db: Database, cacheKey: string, result: string): void;
  467. export declare function clearCache(db: Database): void;
  468. /**
  469. * Delete cached LLM API responses.
  470. * Returns the number of cached responses deleted.
  *
  * @param db - Database to operate on
  471. */
  472. export declare function deleteLLMCache(db: Database): number;
  473. /**
  474. * Remove inactive document records (active = 0).
  475. * Returns the number of inactive documents deleted.
  *
  * @param db - Database to operate on
  476. */
  477. export declare function deleteInactiveDocuments(db: Database): number;
  478. /**
  479. * Remove orphaned content hashes that are not referenced by any active document.
  480. * Returns the number of orphaned content hashes deleted.
  *
  * @param db - Database to operate on
  481. */
  482. export declare function cleanupOrphanedContent(db: Database): number;
  483. /**
  484. * Remove orphaned vector embeddings that are not referenced by any active document.
  485. * Returns the number of orphaned embedding chunks deleted.
  *
  * @param db - Database to operate on
  486. */
  487. export declare function cleanupOrphanedVectors(db: Database): number;
  488. /**
  489. * Run VACUUM to reclaim unused space in the database.
  490. * This operation rebuilds the database file to eliminate fragmentation.
  *
  * @param db - Database to vacuum
  491. */
  492. export declare function vacuumDatabase(db: Database): void;
  /**
  * Asynchronously computes a hash string for `content`.
  * NOTE(review): the hash algorithm and output encoding are not visible here.
  */
  493. export declare function hashContent(content: string): Promise<string>;
  /**
  * Derives a title string from a document's content and filename.
  * NOTE(review): presumably prefers a heading found in `content` and falls back to
  * `filename` — confirm in the implementation.
  */
  494. export declare function extractTitle(content: string, filename: string): string;
  495. /**
  496. * Insert content into the content table (content-addressable storage).
  497. * Uses INSERT OR IGNORE so duplicate hashes are skipped.
  *
  * @param db - Database to write to
  * @param hash - Hash identifying `content`
  * @param content - The content to store
  * @param createdAt - Creation timestamp string (format not visible here)
  498. */
  499. export declare function insertContent(db: Database, hash: string, content: string, createdAt: string): void;
  500. /**
  501. * Insert a new document into the documents table.
  *
  * NOTE(review): `hash` presumably refers to a row already added via insertContent() — confirm.
  502. */
  503. export declare function insertDocument(db: Database, collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string): void;
  504. /**
  505. * Find an active document by collection name and path.
  *
  * @returns The document's `id`, `hash` and `title`, or null — presumably when no active document matches; confirm
  506. */
  507. export declare function findActiveDocument(db: Database, collectionName: string, path: string): {
  508. id: number;
  509. hash: string;
  510. title: string;
  511. } | null;
  512. /**
  513. * Update the title and modified_at timestamp for a document.
  *
  * @param documentId - The document's `id` (as returned by findActiveDocument())
  514. */
  515. export declare function updateDocumentTitle(db: Database, documentId: number, title: string, modifiedAt: string): void;
  516. /**
  517. * Update an existing document's hash, title, and modified_at timestamp.
  518. * Used when content changes but the file path stays the same.
  *
  * @param documentId - The document's `id` (as returned by findActiveDocument())
  519. */
  520. export declare function updateDocument(db: Database, documentId: number, title: string, hash: string, modifiedAt: string): void;
  521. /**
  522. * Deactivate a document (mark as inactive but don't delete).
  *
  * Inactive records can later be removed with deleteInactiveDocuments().
  523. */
  524. export declare function deactivateDocument(db: Database, collectionName: string, path: string): void;
  525. /**
  526. * Get all active document paths for a collection.
  *
  * @returns The paths as strings. NOTE(review): presumably relative to the collection — confirm.
  527. */
  528. export declare function getActiveDocumentPaths(db: Database, collectionName: string): string[];
  529. export { formatQueryForEmbedding, formatDocForEmbedding };
  530. /**
  531. * Chunk a document using regex-only break point detection.
  532. * This is the sync, backward-compatible API used by tests and legacy callers.
  *
  * @param content - The document text to chunk
  * @param maxChars - Optional; presumably the chunk size limit in characters (default not visible here)
  * @param overlapChars - Optional; presumably the overlap between consecutive chunks (default not visible here)
  * @param windowChars - Optional; presumably the break point search window (default not visible here)
  * @returns Chunks as `{ text, pos }`; `pos` is presumably the chunk's start offset in `content` — confirm
  533. */
  534. export declare function chunkDocument(content: string, maxChars?: number, overlapChars?: number, windowChars?: number): {
  535. text: string;
  536. pos: number;
  537. }[];
  538. /**
  539. * Async AST-aware chunking. Detects language from filepath, computes AST
  540. * break points for supported code files, merges with regex break points,
  541. * and delegates to the shared chunk algorithm.
  542. *
  543. * Falls back to regex-only when strategy is "regex", filepath is absent,
  544. * or language is unsupported.
  *
  * @param content - The document text to chunk
  * @param filepath - Optional path used for language detection
  * @param chunkStrategy - Optional strategy; "regex" forces regex-only chunking
  * @returns A promise of chunks in the same `{ text, pos }` shape as chunkDocument()
  545. */
  546. export declare function chunkDocumentAsync(content: string, maxChars?: number, overlapChars?: number, windowChars?: number, filepath?: string, chunkStrategy?: ChunkStrategy): Promise<{
  547. text: string;
  548. pos: number;
  549. }[]>;
  550. /**
  551. * Chunk a document by actual token count using the LLM tokenizer.
  552. * More accurate than character-based chunking but requires async.
  553. *
  554. * When filepath and chunkStrategy are provided, uses AST-aware break points
  555. * for supported code files.
  *
  * @param maxTokens - Optional limit in tokens (default not visible here; cf. CHUNK_SIZE_TOKENS)
  * @param overlapTokens - Optional overlap in tokens (cf. CHUNK_OVERLAP_TOKENS)
  * @param windowTokens - Optional search window in tokens (cf. CHUNK_WINDOW_TOKENS)
  * @param signal - Optional AbortSignal. NOTE(review): presumably cancels the operation — confirm how abort is surfaced.
  * @returns Chunks as `{ text, pos, tokens }`; `tokens` is presumably each chunk's token count — confirm
  556. */
  557. export declare function chunkDocumentByTokens(content: string, maxTokens?: number, overlapTokens?: number, windowTokens?: number, filepath?: string, chunkStrategy?: ChunkStrategy, signal?: AbortSignal): Promise<{
  558. text: string;
  559. pos: number;
  560. tokens: number;
  561. }[]>;
  562. /**
  563. * Normalize a docid input by stripping surrounding quotes and leading #.
  564. * Handles: "#abc123", 'abc123', "abc123", #abc123, abc123
  565. * Returns the bare hex string.
  *
  * @param docid - The raw docid input
  * @returns The docid without quotes or leading #
  566. */
  567. export declare function normalizeDocid(docid: string): string;
  568. /**
  569. * Check if a string looks like a docid reference.
  570. * Accepts: #abc123, abc123, "#abc123", "abc123", '#abc123', 'abc123'
  571. * Returns true if the normalized form is a valid hex string of 6+ chars.
  *
  * @param input - The string to test
  * @returns true if `input` normalizes to a hex string of at least 6 characters
  572. */
  573. export declare function isDocid(input: string): boolean;
  574. /**
  575. * Find a document by its short docid (first 6 characters of hash).
  576. * Returns the document's virtual path (`filepath`) and `hash` if found, null otherwise.
  577. * If multiple documents match the same short hash (collision), returns the first one.
  578. *
  579. * Accepts lenient input: #abc123, abc123, "#abc123", "abc123"
  *
  * @param db - Database to search
  * @param docid - The docid to look up, in any of the lenient forms above
  580. */
  581. export declare function findDocumentByDocid(db: Database, docid: string): {
  582. filepath: string;
  583. hash: string;
  584. } | null;
  /**
  * Returns file paths that resemble `query`; used for the `similarFiles` field of
  * DocumentNotFound.
  * NOTE(review): the similarity measure is not visible here — `maxDistance` suggests
  * an edit-distance threshold; confirm. Defaults for `maxDistance` and `limit` are
  * not visible either.
  */
  585. export declare function findSimilarFiles(db: Database, query: string, maxDistance?: number, limit?: number): string[];
  /**
  * Returns the indexed files matching a glob pattern, with display path and body length.
  * NOTE(review): which path form the glob is matched against is not visible here.
  */
  586. export declare function matchFilesByGlob(db: Database, pattern: string): {
  587. filepath: string;
  588. displayPath: string;
  589. bodyLength: number;
  590. }[];
  591. /**
  592. * Get context for a file path using hierarchical inheritance.
  593. * Contexts are collection-scoped and inherit from parent directories.
  594. * For example, context at "/talks" applies to "/talks/2024/keynote.md".
  595. *
  596. * @param db Database instance (unused - kept for compatibility)
  597. * @param collectionName Collection name
  598. * @param path Relative path within the collection
  599. * @returns Context string or null if no context is defined
  *
  * NOTE(review): how contexts from several ancestor directories are combined
  * (nearest wins vs. concatenated) is not visible in this declaration.
  600. */
  601. export declare function getContextForPath(db: Database, collectionName: string, path: string): string | null;
  602. /**
  603. * Get context for a file path (virtual or filesystem).
  604. * Resolves the collection and relative path from the DB store_collections table.
  *
  * @param db - Database to resolve the collection from
  * @param filepath - A virtual or filesystem path
  * @returns The context string, or null — presumably when the file belongs to no collection or no context applies; confirm
  605. */
  606. export declare function getContextForFile(db: Database, filepath: string): string | null;
  607. /**
  608. * Get collection by name from DB store_collections table.
  *
  * @param db - Database to read from
  * @param name - The collection name
  * @returns The collection's `name`, `pwd` and `glob_pattern`, or null — presumably when no such collection exists; confirm
  609. */
  610. export declare function getCollectionByName(db: Database, name: string): {
  611. name: string;
  612. pwd: string;
  613. glob_pattern: string;
  614. } | null;
  615. /**
  616. * List all collections with document counts from database.
  617. * Merges store_collections config with database statistics.
  *
  * NOTE(review): `doc_count` vs. `active_count` presumably distinguishes all document
  * rows from active ones, and `last_modified` is presumably null for a collection
  * with no documents — confirm.
  618. */
  619. export declare function listCollections(db: Database): {
  620. name: string;
  621. pwd: string;
  622. glob_pattern: string;
  623. doc_count: number;
  624. active_count: number;
  625. last_modified: string | null;
  626. includeByDefault: boolean;
  627. }[];
  628. /**
  629. * Remove a collection and clean up its documents.
  630. * Uses collections.ts to remove from YAML config and cleans up database.
  *
  * @returns Counts of deleted documents and cleaned-up content hashes
  * NOTE(review): this comment mentions YAML config, while other declarations in this
  * file describe collections as stored in the store_collections table — confirm
  * which is current.
  631. */
  632. export declare function removeCollection(db: Database, collectionName: string): {
  633. deletedDocs: number;
  634. cleanedHashes: number;
  635. };
  636. /**
  637. * Rename a collection.
  638. * Updates both YAML config and database documents table.
  *
  * NOTE(review): returns void, unlike renameStoreCollection() which returns a boolean;
  * behavior when `oldName` does not exist or `newName` is taken is not visible here.
  639. */
  640. export declare function renameCollection(db: Database, oldName: string, newName: string): void;
  641. /**
  642. * Insert or update a context for a specific collection and path prefix.
  *
  * NOTE(review): this function identifies the collection by numeric `collectionId`,
  * whereas deleteContext() takes a `collectionName` string — confirm the asymmetry
  * is intended.
  643. */
  644. export declare function insertContext(db: Database, collectionId: number, pathPrefix: string, context: string): void;
  645. /**
  646. * Delete a context for a specific collection and path prefix.
  647. * Returns the number of contexts deleted.
  *
  * @param db - Database to operate on
  * @param collectionName - Name of the collection owning the context
  * @param pathPrefix - Path prefix whose context is deleted
  648. */
  649. export declare function deleteContext(db: Database, collectionName: string, pathPrefix: string): number;
  650. /**
  651. * Delete all global contexts (contexts with empty path_prefix).
  652. * Returns the number of contexts deleted.
  *
  * @param db - Database to operate on
  653. */
  654. export declare function deleteGlobalContexts(db: Database): number;
  655. /**
  656. * List all contexts, grouped by collection.
  657. * Returns contexts ordered by collection name, then by path prefix length (longest first).
  *
  * The result is a flat array; "grouped" refers to the ordering, not to nesting.
  658. */
  659. export declare function listPathContexts(db: Database): {
  660. collection_name: string;
  661. path_prefix: string;
  662. context: string;
  663. }[];
  664. /**
  665. * Get all collections (name only - from YAML config).
  *
  * NOTE(review): the comment says YAML config although the function takes `db` —
  * confirm which source is actually read.
  666. */
  667. export declare function getAllCollections(db: Database): {
  668. name: string;
  669. }[];
  /**
   * Check which collections don't have any context defined.
   *
   * A collection is reported only when it has no context entries at all — not
   * even a root context.
   *
   * @param db - Database handle.
   * @returns For each such collection its `name`, `pwd` and `doc_count`
   *   (presumably the collection's directory and its document count — confirm
   *   against the implementation).
   */
  export declare function getCollectionsWithoutContext(db: Database): {
    name: string;
    pwd: string;
    doc_count: number;
  }[];
  /**
   * Get the top-level directories in a collection that don't have context.
   * Useful for suggesting where context might be needed.
   *
   * @param db - Database handle.
   * @param collectionName - Name of the collection to inspect.
   * @returns The top-level directory paths lacking context.
   */
  export declare function getTopLevelPathsWithoutContext(db: Database, collectionName: string): string[];
  /**
   * Sanitize a single search term.
   *
   * NOTE(review): judging by the name, the result is meant to be safe to embed
   * in an SQLite FTS5 query; the exact characters stripped or escaped are
   * defined by the implementation and are not visible in this declaration.
   *
   * @param term - Raw term to sanitize.
   * @returns The sanitized term.
   */
  export declare function sanitizeFTS5Term(term: string): string;
  /**
   * Validate that a vec/hyde query doesn't use lex-only syntax.
   *
   * @param query - Semantic (vec/hyde) query text to check.
   * @returns An error message if the query is invalid, or `null` if it is valid.
   */
  export declare function validateSemanticQuery(query: string): string | null;
  /**
   * Validate a lexical (lex) query.
   *
   * NOTE(review): the return type mirrors `validateSemanticQuery`, so this
   * presumably returns an error message when the query is invalid and `null`
   * when it is valid — confirm against the implementation.
   *
   * @param query - Lexical query text to check.
   * @returns A string or `null`; see the note above for the presumed meaning.
   */
  export declare function validateLexQuery(query: string): string | null;
  /**
   * Run a full-text search (the hybrid pipeline in this file routes lex
   * queries to FTS).
   *
   * @param db - Database handle to search.
   * @param query - Query text.
   * @param limit - Optional maximum number of results; the default is set by
   *   the implementation.
   * @param collectionName - Optional collection name; presumably restricts the
   *   search to that collection — confirm against the implementation.
   * @returns The matching search results.
   */
  export declare function searchFTS(db: Database, query: string, limit?: number, collectionName?: string): SearchResult[];
  /**
   * Run a vector (embedding-based) search.
   *
   * @param db - Database handle to search.
   * @param query - Query text.
   * @param model - Name of the embedding model.
   * @param limit - Optional maximum number of results; the default is set by
   *   the implementation.
   * @param collectionName - Optional collection name; presumably restricts the
   *   search to that collection — confirm against the implementation.
   * @param session - Optional LLM session; presumably reused for embedding the
   *   query — confirm against the implementation.
   * @param precomputedEmbedding - Optional embedding vector; presumably used
   *   instead of embedding `query` again — confirm against the implementation.
   * @returns A promise resolving to the matching search results.
   */
  export declare function searchVec(db: Database, query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]): Promise<SearchResult[]>;
  /**
   * Get all unique content hashes that need embeddings (from active documents).
   *
   * @param db - Database handle.
   * @returns For each hash: the `hash` itself, the document `body`, and a
   *   sample `path` for display purposes.
   */
  export declare function getHashesForEmbedding(db: Database): {
    hash: string;
    body: string;
    path: string;
  }[];
  /**
   * Clear all embeddings from the database (forces a re-index).
   *
   * Deletes all rows from `content_vectors` and drops the `vectors_vec` table.
   *
   * @param db - Database handle whose embeddings are cleared.
   */
  export declare function clearAllEmbeddings(db: Database): void;
  /**
   * Insert a single embedding into both the `content_vectors` and
   * `vectors_vec` tables.
   *
   * The `vectors_vec` key is formatted as "hash_seq" (presumably the `hash` and
   * `seq` arguments joined by an underscore — confirm against the
   * implementation).
   *
   * `content_vectors` is inserted first so that `getHashesForEmbedding` (which
   * checks only `content_vectors`) won't re-select the hash after a crash
   * between the two inserts.
   *
   * `vectors_vec` uses DELETE + INSERT instead of INSERT OR REPLACE because
   * sqlite-vec's vec0 virtual tables silently ignore the OR REPLACE conflict
   * clause.
   *
   * @param db - Database handle.
   * @param hash - Content hash the embedding belongs to.
   * @param seq - Sequence number combined with `hash` to form the key.
   * @param pos - Position value stored with the embedding (presumably the
   *   chunk's offset within the document — confirm).
   * @param embedding - The embedding vector.
   * @param model - Name of the model that produced the embedding.
   * @param embeddedAt - Timestamp string recorded for the embedding.
   */
  export declare function insertEmbedding(db: Database, hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string): void;
  /**
   * Expand a query into typed query variants (lex/vec/hyde, per the pipeline
   * notes on `hybridQuery` in this file).
   *
   * @param query - Original query text.
   * @param model - Model name, or `undefined` to let the implementation choose.
   * @param db - Database handle (its role here is not visible in this
   *   declaration — confirm against the implementation).
   * @param intent - Optional intent hint.
   * @param llmOverride - Optional LLM instance to use in place of the default
   *   one (presumed from the name — confirm).
   * @returns A promise resolving to the expanded queries.
   */
  export declare function expandQuery(query: string, model: string | undefined, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise<ExpandedQuery[]>;
  /**
   * Score a set of documents against a query with a reranker.
   *
   * @param query - Query text the documents are scored against.
   * @param documents - Candidates to score, each with its `file` and the
   *   `text` to evaluate.
   * @param model - Model name, or `undefined` to let the implementation choose.
   * @param db - Database handle (its role here is not visible in this
   *   declaration — confirm against the implementation).
   * @param intent - Optional intent hint.
   * @param llmOverride - Optional LLM instance to use in place of the default
   *   one (presumed from the name — confirm).
   * @returns A promise resolving to one `{ file, score }` entry per scored
   *   document; ordering is defined by the implementation.
   */
  export declare function rerank(query: string, documents: {
    file: string;
    text: string;
  }[], model: string | undefined, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise<{
    file: string;
    score: number;
  }[]>;
  /**
   * Fuse several ranked result lists into one ranking using reciprocal rank
   * fusion (RRF).
   *
   * @param resultLists - The ranked lists to fuse, one inner array per list.
   * @param weights - Optional per-list weights (presumably aligned by index
   *   with `resultLists` — confirm against the implementation).
   * @param k - Optional RRF constant; the default is set by the implementation.
   * @returns The fused ranking.
   */
  export declare function reciprocalRankFusion(resultLists: RankedResult[][], weights?: number[], k?: number): RankedResult[];
  /**
   * Build per-document RRF contribution traces for explain/debug output.
   *
   * @param resultLists - The ranked lists that were fused.
   * @param weights - Optional per-list weights (presumably aligned by index
   *   with `resultLists` — confirm against the implementation).
   * @param listMeta - Optional metadata describing each list.
   * @param k - Optional RRF constant; the default is set by the implementation.
   * @returns A map from a string key (presumably the document's file
   *   identifier — confirm) to that document's score trace.
   */
  export declare function buildRrfTrace(resultLists: RankedResult[][], weights?: number[], listMeta?: RankedListMeta[], k?: number): Map<string, RRFScoreTrace>;
  /**
   * Find a document by filename/path, docid (#hash), or with fuzzy matching.
   * Returns document metadata without the body by default.
   *
   * Supported forms of `filename`:
   * - Virtual paths: qmd://collection/path/to/file.md
   * - Absolute paths: /path/to/file.md
   * - Relative paths: path/to/file.md
   * - Short docid: #abc123 (first 6 chars of hash)
   *
   * @param db - Database handle to look the document up in.
   * @param filename - Path, virtual path or short docid identifying the document.
   * @param options - Optional settings; set `includeBody` to also load the body.
   * @returns The document, or a `DocumentNotFound` value when nothing matches.
   */
  export declare function findDocument(db: Database, filename: string, options?: {
    includeBody?: boolean;
  }): DocumentResult | DocumentNotFound;
  /**
   * Get the body content for a document, optionally sliced by line range.
   *
   * @param db - Database handle.
   * @param doc - The document, or any object carrying its `filepath`.
   * @param fromLine - Optional first line of the slice (whether it is 0- or
   *   1-based is not visible in this declaration — confirm).
   * @param maxLines - Optional maximum number of lines to return.
   * @returns The body text, or `null` (presumably when no body is found —
   *   confirm against the implementation).
   */
  export declare function getDocumentBody(db: Database, doc: DocumentResult | {
    filepath: string;
  }, fromLine?: number, maxLines?: number): string | null;
  /**
   * Find multiple documents by glob pattern or comma-separated list.
   * Returns documents without the body by default (use `getDocumentBody` to
   * load it).
   *
   * @param db - Database handle.
   * @param pattern - Glob pattern or comma-separated list of documents.
   * @param options - Optional settings: `includeBody` to load bodies, and
   *   `maxBytes` (presumably a size cap on returned content — confirm against
   *   the implementation).
   * @returns The matched documents in `docs` plus any messages in `errors`.
   */
  export declare function findDocuments(db: Database, pattern: string, options?: {
    includeBody?: boolean;
    maxBytes?: number;
  }): {
    docs: MultiGetResult[];
    errors: string[];
  };
  /**
   * Get the current status of the index.
   *
   * @param db - Database handle to inspect.
   * @returns An `IndexStatus` value; its fields are declared elsewhere.
   */
  export declare function getStatus(db: Database): IndexStatus;
  /**
   * Result of extracting a snippet from a document body (the return type of
   * `extractSnippet`).
   *
   * NOTE(review): the field meanings below are inferred from the names —
   * confirm against the implementation.
   */
  export type SnippetResult = {
    /** Line number associated with the snippet (presumably where it starts). */
    line: number;
    /** The snippet text. */
    snippet: string;
    /** Presumably the number of body lines before the snippet. */
    linesBefore: number;
    /** Presumably the number of body lines after the snippet. */
    linesAfter: number;
    /** Presumably the number of lines in the snippet itself. */
    snippetLines: number;
  };
  /**
   * Weight for intent terms relative to query terms (1.0) in snippet scoring.
   */
  export declare const INTENT_WEIGHT_SNIPPET = 0.3;
  /**
   * Weight for intent terms relative to query terms (1.0) in chunk selection.
   */
  export declare const INTENT_WEIGHT_CHUNK = 0.5;
  /**
   * Extract meaningful terms from an intent string, filtering stop words and
   * punctuation.
   *
   * Uses Unicode-aware punctuation stripping so domain terms like "API"
   * survive.
   *
   * @param intent - Free-form intent string.
   * @returns Lowercase terms suitable for text matching.
   */
  export declare function extractIntentTerms(intent: string): string[];
  /**
   * Extract a snippet from a document body for a given query.
   *
   * @param body - Full document text to extract from.
   * @param query - Query text the snippet should be relevant to.
   * @param maxLen - Optional maximum snippet length; unit and default are set
   *   by the implementation.
   * @param chunkPos - Optional chunk position within `body` (presumably
   *   narrows the search to that chunk — confirm).
   * @param chunkLen - Optional chunk length accompanying `chunkPos`.
   * @param intent - Optional intent string; intent terms are weighted by
   *   `INTENT_WEIGHT_SNIPPET` in snippet scoring.
   * @returns The snippet together with its line information.
   */
  export declare function extractSnippet(body: string, query: string, maxLen?: number, chunkPos?: number, chunkLen?: number, intent?: string): SnippetResult;
  /**
   * Add line numbers to text content.
   * Each line becomes: "{lineNum}: {content}"
   *
   * @param text - Text whose lines should be numbered.
   * @param startLine - Optional number for the first line; the default is set
   *   by the implementation.
   * @returns The text with every line prefixed by its number.
   */
  export declare function addLineNumbers(text: string, startLine?: number): string;
  /**
   * Optional progress hooks for search orchestration.
   * The CLI wires these to stderr for user feedback; MCP leaves them unset.
   * Every hook is optional.
   */
  export interface SearchHooks {
    /** BM25 probe found strong signal — expansion will be skipped. */
    onStrongSignal?: (topScore: number) => void;
    /** Query expansion is starting. */
    onExpandStart?: () => void;
    /**
     * Query expansion is complete. An empty `expanded` array means expansion
     * was skipped because of a strong signal. `elapsedMs` is the time taken.
     */
    onExpand?: (original: string, expanded: ExpandedQuery[], elapsedMs: number) => void;
    /** Embedding is starting (vec/hyde queries). */
    onEmbedStart?: (count: number) => void;
    /** Embedding is complete. */
    onEmbedDone?: (elapsedMs: number) => void;
    /** Reranking is about to start. */
    onRerankStart?: (chunkCount: number) => void;
    /** Reranking has finished. */
    onRerankDone?: (elapsedMs: number) => void;
  }
  /**
   * Options accepted by `hybridQuery`. All fields are optional; defaults are
   * chosen by the implementation.
   */
  export interface HybridQueryOptions {
    /** Collection name (presumably restricts the search to it — confirm). */
    collection?: string;
    /** Maximum number of results to return (final "slice to limit" step). */
    limit?: number;
    /** Minimum score a result needs to be kept ("filter by minScore" step). */
    minScore?: number;
    /** Number of fused candidates kept after RRF ("slice to candidateLimit"). */
    candidateLimit?: number;
    /** Presumably requests the per-result `explain` data — confirm. */
    explain?: boolean;
    /** Intent hint passed along with the query. */
    intent?: string;
    /** Presumably skips the reranking step — confirm. */
    skipRerank?: boolean;
    /** Chunking strategy used when splitting documents. */
    chunkStrategy?: ChunkStrategy;
    /** Progress hooks invoked during the search. */
    hooks?: SearchHooks;
  }
  /**
   * A single result returned by `hybridQuery` and `structuredSearch`.
   *
   * NOTE(review): the field meanings below are inferred from the names and
   * from the pipeline notes in this file — confirm against the implementation.
   */
  export interface HybridQueryResult {
    /** Identifier of the matching file. */
    file: string;
    /** Path intended for display. */
    displayPath: string;
    /** Document title. */
    title: string;
    /** Document body. */
    body: string;
    /** Text of the chunk selected as the best match. */
    bestChunk: string;
    /** Position of `bestChunk` (presumably its offset within `body`). */
    bestChunkPos: number;
    /** Final score of the result. */
    score: number;
    /** Context associated with the document, or `null` when there is none. */
    context: string | null;
    /** Document id. */
    docid: string;
    /** Optional explain data for the result. */
    explain?: HybridQueryExplain;
  }
  /**
   * Metadata describing one ranked result list (accepted by `buildRrfTrace` as
   * `listMeta`).
   */
  export type RankedListMeta = {
    /** Which backend produced the list: full-text ("fts") or vector ("vec"). */
    source: "fts" | "vec";
    /** Kind of query that produced the list. */
    queryType: "original" | "lex" | "vec" | "hyde";
    /** The query text that was run. */
    query: string;
  };
  /**
   * Hybrid search: BM25 + vector + query expansion + RRF + chunked reranking.
   *
   * Pipeline:
   * 1. BM25 probe → skip expansion if strong signal
   * 2. expandQuery() → typed query variants (lex/vec/hyde)
   * 3. Type-routed search: original→vector, lex→FTS, vec/hyde→vector
   * 4. RRF fusion → slice to candidateLimit
   * 5. chunkDocument() + keyword-best-chunk selection
   * 6. rerank on chunks (NOT full bodies — O(tokens) trap)
   * 7. Position-aware score blending (RRF rank × reranker score)
   * 8. Dedup by file, filter by minScore, slice to limit
   *
   * @param store - Store to search.
   * @param query - Query text.
   * @param options - Optional limits, intent hint, chunking and progress hooks.
   * @returns A promise resolving to the final results.
   */
  export declare function hybridQuery(store: Store, query: string, options?: HybridQueryOptions): Promise<HybridQueryResult[]>;
  /**
   * Options accepted by `vectorSearchQuery`. All fields are optional; defaults
   * are chosen by the implementation.
   */
  export interface VectorSearchOptions {
    /** Collection name (presumably restricts the search to it — confirm). */
    collection?: string;
    /** Maximum number of results to return. */
    limit?: number;
    /** Minimum score a result needs to be kept. */
    minScore?: number;
    /** Intent hint passed along with the query. */
    intent?: string;
    /** Progress hooks; only `onExpand` is accepted here. */
    hooks?: Pick<SearchHooks, 'onExpand'>;
  }
  /**
   * A single result returned by `vectorSearchQuery`.
   *
   * NOTE(review): the field meanings below are inferred from the names —
   * confirm against the implementation.
   */
  export interface VectorSearchResult {
    /** Identifier of the matching file. */
    file: string;
    /** Path intended for display. */
    displayPath: string;
    /** Document title. */
    title: string;
    /** Document body. */
    body: string;
    /** Score of the result. */
    score: number;
    /** Context associated with the document, or `null` when there is none. */
    context: string | null;
    /** Document id. */
    docid: string;
  }
  /**
   * Vector-only semantic search with query expansion.
   *
   * Pipeline:
   * 1. expandQuery() → typed variants, filter to vec/hyde only (lex irrelevant here)
   * 2. searchVec() for original + vec/hyde variants (sequential — node-llama-cpp embed limitation)
   * 3. Dedup by filepath (keep max score)
   * 4. Sort by score descending, filter by minScore, slice to limit
   *
   * @param store - Store to search.
   * @param query - Query text.
   * @param options - Optional limits, intent hint and `onExpand` hook.
   * @returns A promise resolving to the results, sorted by score descending.
   */
  export declare function vectorSearchQuery(store: Store, query: string, options?: VectorSearchOptions): Promise<VectorSearchResult[]>;
  /**
   * Options accepted by `structuredSearch`. All fields are optional; defaults
   * are chosen by the implementation.
   *
   * NOTE(review): the comment previously attached here ("A single sub-search
   * in a structured search request. Matches the format used in QMD training
   * data.") describes a sub-search entry rather than this options object, so it
   * appears to have been left behind by a different declaration.
   */
  export interface StructuredSearchOptions {
    /** Collection names (presumably restrict the search to them — confirm). */
    collections?: string[];
    /** Maximum number of results to return. */
    limit?: number;
    /** Minimum score a result needs to be kept. */
    minScore?: number;
    /** Number of fused candidates to keep (presumed from the name — confirm). */
    candidateLimit?: number;
    /** Presumably requests the per-result `explain` data — confirm. */
    explain?: boolean;
    /** Domain intent hint for disambiguation — steers reranking and chunk selection. */
    intent?: string;
    /** Skip LLM reranking, use only RRF scores. */
    skipRerank?: boolean;
    /** Chunking strategy used when splitting documents. */
    chunkStrategy?: ChunkStrategy;
    /** Progress hooks invoked during the search. */
    hooks?: SearchHooks;
  }
  /**
   * Structured search: execute pre-expanded queries without LLM query expansion.
   *
   * Designed for LLM callers (MCP/HTTP) that generate their own query
   * expansions. Skips the internal expandQuery() step and goes directly to:
   *
   * Pipeline:
   * 1. Route searches: lex→FTS, vec/hyde→vector (batch embed)
   * 2. RRF fusion across all result lists
   * 3. Chunk documents + keyword-best-chunk selection
   * 4. Rerank on chunks
   * 5. Position-aware score blending
   * 6. Dedup, filter, slice
   *
   * This is the recommended endpoint for capable LLMs — they can generate
   * better query variations than our small local model, especially for
   * domain-specific or nuanced queries.
   *
   * @param store - Store to search.
   * @param searches - The caller-supplied, already expanded queries.
   * @param options - Optional limits, intent hint, chunking and progress hooks.
   * @returns A promise resolving to the final results.
   */
  export declare function structuredSearch(store: Store, searches: ExpandedQuery[], options?: StructuredSearchOptions): Promise<HybridQueryResult[]>;