| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984 |
- /**
- * QMD Store - Core data access and retrieval functions
- *
- * This module provides all database operations, search functions, and document
- * retrieval for QMD. It returns raw data structures that can be formatted by
- * CLI or MCP consumers.
- *
- * Usage:
- * const store = createStore("/path/to/db.sqlite");
- * // or use default path:
- * const store = createStore();
- */
- import type { Database } from "./db.js";
- import { LlamaCpp, formatQueryForEmbedding, formatDocForEmbedding, type ILLMSession } from "./llm.js";
- import type { NamedCollection, Collection, CollectionConfig } from "./collections.js";
- import { type EmbeddingProvider } from "./embedding/provider.js";
- export declare const DEFAULT_EMBED_MODEL = "embeddinggemma";
- export declare const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
- export declare const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
- export declare const DEFAULT_GLOB = "**/*.md";
- export declare const DEFAULT_MULTI_GET_MAX_BYTES: number;
- export declare const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
- export declare const DEFAULT_EMBED_MAX_BATCH_BYTES: number;
- export declare const CHUNK_SIZE_TOKENS = 900;
- export declare const CHUNK_OVERLAP_TOKENS: number;
- export declare const CHUNK_SIZE_CHARS: number;
- export declare const CHUNK_OVERLAP_CHARS: number;
- export declare const CHUNK_WINDOW_TOKENS = 200;
- export declare const CHUNK_WINDOW_CHARS: number;
- /**
- * A potential break point in the document with a base score indicating quality.
- */
- export interface BreakPoint {
- pos: number;
- score: number;
- type: string;
- }
- /**
- * A region where a code fence exists (between ``` markers).
- * We should never split inside a code fence.
- */
- export interface CodeFenceRegion {
- start: number;
- end: number;
- }
- /**
- * Patterns for detecting break points in markdown documents.
- * Higher scores indicate better places to split.
- * Scores are spread wide so headings decisively beat lower-quality breaks.
- * Order matters for scoring - more specific patterns first.
- */
- export declare const BREAK_PATTERNS: [RegExp, number, string][];
- /**
- * Scan text for all potential break points.
- * Returns sorted array of break points with higher-scoring patterns taking precedence
- * when multiple patterns match the same position.
- */
- export declare function scanBreakPoints(text: string): BreakPoint[];
- /**
- * Find all code fence regions in the text.
- * Code fences are delimited by ``` and we should never split inside them.
- */
- export declare function findCodeFences(text: string): CodeFenceRegion[];
- /**
- * Check if a position is inside a code fence region.
- */
- export declare function isInsideCodeFence(pos: number, fences: CodeFenceRegion[]): boolean;
- /**
- * Find the best cut position using scored break points with distance decay.
- *
- * Uses squared distance for gentler early decay - headings far back still win
- * over low-quality breaks near the target.
- *
- * @param breakPoints - Pre-scanned break points from scanBreakPoints()
- * @param targetCharPos - The ideal cut position, as a character position (e.g., maxChars boundary)
- * @param windowChars - How far back to search for break points, in characters (default corresponds to ~200 tokens)
- * @param decayFactor - How much to penalize distance (0.7 means a break point at the window edge keeps 30% of its score)
- * @param codeFences - Code fence regions to avoid splitting inside
- * @returns The best position to cut at
- */
- export declare function findBestCutoff(breakPoints: BreakPoint[], targetCharPos: number, windowChars?: number, decayFactor?: number, codeFences?: CodeFenceRegion[]): number;
- export type ChunkStrategy = "auto" | "regex" | "function";
- /**
- * Merge two sets of break points (e.g. regex + AST), keeping the highest
- * score at each position. Result is sorted by position.
- */
- export declare function mergeBreakPoints(a: BreakPoint[], b: BreakPoint[]): BreakPoint[];
- /**
- * Core chunk algorithm that operates on precomputed break points and code fences.
- * This is the shared implementation used by both regex-only and AST-aware chunking.
- */
- export declare function chunkDocumentWithBreakPoints(content: string, breakPoints: BreakPoint[], codeFences: CodeFenceRegion[], maxChars?: number, overlapChars?: number, windowChars?: number): {
- text: string;
- pos: number;
- }[];
- export declare const STRONG_SIGNAL_MIN_SCORE = 0.85;
- export declare const STRONG_SIGNAL_MIN_GAP = 0.15;
- export declare const RERANK_CANDIDATE_LIMIT = 40;
- /**
- * A typed query expansion result. Decoupled from llm.ts internal Queryable —
- * same shape, but store.ts owns its own public API type.
- *
- * - lex: keyword variant → routes to FTS only
- * - vec: semantic variant → routes to vector only
- * - hyde: hypothetical document → routes to vector only
- */
- export type ExpandedQuery = {
- type: 'lex' | 'vec' | 'hyde';
- query: string;
- /** Optional line number for error reporting (CLI parser) */
- line?: number;
- };
- export declare function homedir(): string;
- /**
- * Check if a path is absolute.
- * Supports:
- * - Unix paths: /path/to/file
- * - Windows native: C:\path or C:/path
- * - Git Bash: /c/path or /C/path (C-Z drives, excluding A/B floppy drives)
- *
- * Note: /c without trailing slash is treated as Unix path (directory named "c"),
- * while /c/ or /c/path are treated as Git Bash paths (C: drive).
- */
- export declare function isAbsolutePath(path: string): boolean;
- /**
- * Normalize path separators to forward slashes.
- * Converts Windows backslashes to forward slashes.
- */
- export declare function normalizePathSeparators(path: string): string;
- /**
- * Get the relative path from a prefix.
- * Returns null if path is not under prefix.
- * Returns empty string if path equals prefix.
- */
- export declare function getRelativePathFromPrefix(path: string, prefix: string): string | null;
- export declare function resolve(...paths: string[]): string;
- export declare function enableProductionMode(): void;
- /** Reset production mode flag — only for testing. */
- export declare function _resetProductionModeForTesting(): void;
- export declare function getDefaultDbPath(indexName?: string): string;
- export declare function getPwd(): string;
- export declare function getRealPath(path: string): string;
- export type VirtualPath = {
- collectionName: string;
- path: string;
- };
- /**
- * Normalize explicit virtual path formats to standard qmd:// format.
- * Only handles paths that are already explicitly virtual:
- * - qmd://collection/path.md (already normalized)
- * - qmd:////collection/path.md (extra slashes - normalize)
- * - //collection/path.md (missing qmd: prefix - add it)
- *
- * Does NOT handle:
- * - collection/path.md (bare paths - could be filesystem relative)
- * - :linenum suffix (should be parsed separately before calling this)
- */
- export declare function normalizeVirtualPath(input: string): string;
- /**
- * Parse a virtual path like "qmd://collection-name/path/to/file.md"
- * into its components.
- * Also supports collection root: "qmd://collection-name/" or "qmd://collection-name"
- */
- export declare function parseVirtualPath(virtualPath: string): VirtualPath | null;
- /**
- * Build a virtual path from collection name and relative path.
- */
- export declare function buildVirtualPath(collectionName: string, path: string): string;
- /**
- * Check if a path is explicitly a virtual path.
- * Only recognizes explicit virtual path formats:
- * - qmd://collection/path.md
- * - //collection/path.md
- *
- * Does NOT consider bare collection/path.md as virtual - that should be
- * handled separately by checking if the first component is a collection name.
- */
- export declare function isVirtualPath(path: string): boolean;
- /**
- * Resolve a virtual path to absolute filesystem path.
- */
- export declare function resolveVirtualPath(db: Database, virtualPath: string): string | null;
- /**
- * Convert an absolute filesystem path to a virtual path.
- * Returns null if the file is not in any indexed collection.
- */
- export declare function toVirtualPath(db: Database, absolutePath: string): string | null;
- export declare function verifySqliteVecLoaded(db: Database): void;
- export declare function getStoreCollections(db: Database): NamedCollection[];
- export declare function getStoreCollection(db: Database, name: string): NamedCollection | null;
- export declare function getStoreGlobalContext(db: Database): string | undefined;
- export declare function getStoreContexts(db: Database): Array<{
- collection: string;
- path: string;
- context: string;
- }>;
- export declare function upsertStoreCollection(db: Database, name: string, collection: Omit<Collection, 'pattern'> & {
- pattern?: string;
- }): void;
- export declare function deleteStoreCollection(db: Database, name: string): boolean;
- export declare function renameStoreCollection(db: Database, oldName: string, newName: string): boolean;
- export declare function updateStoreContext(db: Database, collectionName: string, path: string, text: string): boolean;
- export declare function removeStoreContext(db: Database, collectionName: string, path: string): boolean;
- export declare function setStoreGlobalContext(db: Database, value: string | undefined): void;
- /**
- * Sync external config (YAML/inline) into SQLite store_collections.
- * External config always wins. Skips sync if config hash hasn't changed.
- */
- export declare function syncConfigToDb(db: Database, config: CollectionConfig): void;
- export declare function isSqliteVecAvailable(): boolean;
- export type Store = {
- db: Database;
- dbPath: string;
- /** Optional LlamaCpp instance for this store (overrides the global singleton) */
- llm?: LlamaCpp;
- close: () => void;
- ensureVecTable: (dimensions: number) => void;
- getHashesNeedingEmbedding: () => number;
- getIndexHealth: () => IndexHealthInfo;
- getStatus: () => IndexStatus;
- getCacheKey: typeof getCacheKey;
- getCachedResult: (cacheKey: string) => string | null;
- setCachedResult: (cacheKey: string, result: string) => void;
- clearCache: () => void;
- deleteLLMCache: () => number;
- deleteInactiveDocuments: () => number;
- cleanupOrphanedContent: () => number;
- cleanupOrphanedVectors: () => number;
- vacuumDatabase: () => void;
- getContextForFile: (filepath: string) => string | null;
- getContextForPath: (collectionName: string, path: string) => string | null;
- getCollectionByName: (name: string) => {
- name: string;
- pwd: string;
- glob_pattern: string;
- } | null;
- getCollectionsWithoutContext: () => {
- name: string;
- pwd: string;
- doc_count: number;
- }[];
- getTopLevelPathsWithoutContext: (collectionName: string) => string[];
- parseVirtualPath: typeof parseVirtualPath;
- buildVirtualPath: typeof buildVirtualPath;
- isVirtualPath: typeof isVirtualPath;
- resolveVirtualPath: (virtualPath: string) => string | null;
- toVirtualPath: (absolutePath: string) => string | null;
- searchFTS: (query: string, limit?: number, collectionName?: string) => SearchResult[];
- searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider) => Promise<SearchResult[]>;
- expandQuery: (query: string, model?: string, intent?: string) => Promise<ExpandedQuery[]>;
- rerank: (query: string, documents: {
- file: string;
- text: string;
- }[], model?: string, intent?: string) => Promise<{
- file: string;
- score: number;
- }[]>;
- findDocument: (filename: string, options?: {
- includeBody?: boolean;
- }) => DocumentResult | DocumentNotFound;
- getDocumentBody: (doc: DocumentResult | {
- filepath: string;
- }, fromLine?: number, maxLines?: number) => string | null;
- findDocuments: (pattern: string, options?: {
- includeBody?: boolean;
- maxBytes?: number;
- }) => {
- docs: MultiGetResult[];
- errors: string[];
- };
- findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => string[];
- matchFilesByGlob: (pattern: string) => {
- filepath: string;
- displayPath: string;
- bodyLength: number;
- }[];
- findDocumentByDocid: (docid: string) => {
- filepath: string;
- hash: string;
- } | null;
- insertContent: (hash: string, content: string, createdAt: string) => void;
- insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => void;
- findActiveDocument: (collectionName: string, path: string) => {
- id: number;
- hash: string;
- title: string;
- } | null;
- updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => void;
- updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => void;
- deactivateDocument: (collectionName: string, path: string) => void;
- getActiveDocumentPaths: (collectionName: string) => string[];
- getHashesForEmbedding: () => {
- hash: string;
- body: string;
- path: string;
- }[];
- clearAllEmbeddings: () => void;
- insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => void;
- };
- export type ReindexProgress = {
- file: string;
- current: number;
- total: number;
- };
- export type ReindexResult = {
- indexed: number;
- updated: number;
- unchanged: number;
- removed: number;
- orphanedCleaned: number;
- };
- /**
- * Re-index a single collection by scanning the filesystem and updating the database.
- * Pure function — no console output, no db lifecycle management.
- */
- export declare function reindexCollection(store: Store, collectionPath: string, globPattern: string, collectionName: string, options?: {
- ignorePatterns?: string[];
- onProgress?: (info: ReindexProgress) => void;
- }): Promise<ReindexResult>;
- export type EmbedProgress = {
- chunksEmbedded: number;
- totalChunks: number;
- bytesProcessed: number;
- totalBytes: number;
- errors: number;
- };
- export type EmbedResult = {
- docsProcessed: number;
- chunksEmbedded: number;
- errors: number;
- durationMs: number;
- };
- export type EmbedOptions = {
- force?: boolean;
- model?: string;
- maxDocsPerBatch?: number;
- maxBatchBytes?: number;
- chunkStrategy?: ChunkStrategy;
- onProgress?: (info: EmbedProgress) => void;
- /**
- * Optional embedding provider. When supplied, embeddings are routed through
- * this provider (HTTP, GPU worker, etc.) instead of the local llama.cpp
- * session path. The provider's `getModelId()` is verified against existing
- * `content_vectors.model` rows; mismatch throws unless `force` is set.
- *
- * When omitted, behavior is the same as before this option existed:
- * embeddings come from the store's `LlamaCpp` (or the global singleton).
- */
- embedProvider?: EmbeddingProvider;
- };
- /**
- * Generate vector embeddings for documents that need them.
- * Pure function — no console output, no db lifecycle management.
- * Uses the store's LlamaCpp instance if set, otherwise the global singleton.
- */
- export declare function generateEmbeddings(store: Store, options?: EmbedOptions): Promise<EmbedResult>;
- /**
- * Create a new store instance with the given database path.
- * If no path is provided, uses the default path (~/.cache/qmd/index.sqlite).
- *
- * @param dbPath - Path to the SQLite database file
- * @returns Store instance with all methods bound to the database
- */
- export declare function createStore(dbPath?: string): Store;
- /**
- * Unified document result type with all metadata.
- * Body is optional - use getDocumentBody() to load it separately if needed.
- */
- export type DocumentResult = {
- filepath: string;
- displayPath: string;
- title: string;
- context: string | null;
- hash: string;
- docid: string;
- collectionName: string;
- modifiedAt: string;
- bodyLength: number;
- body?: string;
- };
- /**
- * Extract short docid from a full hash (first 6 characters).
- */
- export declare function getDocid(hash: string): string;
- export declare function handelize(path: string): string;
- /**
- * Search result extends DocumentResult with score and source info
- */
- export type SearchResult = DocumentResult & {
- score: number;
- source: "fts" | "vec";
- chunkPos?: number;
- };
- /**
- * Ranked result for RRF fusion (simplified, used internally)
- */
- export type RankedResult = {
- file: string;
- displayPath: string;
- title: string;
- body: string;
- score: number;
- };
- export type RRFContributionTrace = {
- listIndex: number;
- source: "fts" | "vec";
- queryType: "original" | "lex" | "vec" | "hyde";
- query: string;
- rank: number;
- weight: number;
- backendScore: number;
- rrfContribution: number;
- };
- export type RRFScoreTrace = {
- contributions: RRFContributionTrace[];
- baseScore: number;
- topRank: number;
- topRankBonus: number;
- totalScore: number;
- };
- export type HybridQueryExplain = {
- ftsScores: number[];
- vectorScores: number[];
- rrf: {
- rank: number;
- positionScore: number;
- weight: number;
- baseScore: number;
- topRankBonus: number;
- totalScore: number;
- contributions: RRFContributionTrace[];
- };
- rerankScore: number;
- blendedScore: number;
- };
- /**
- * Error result when document is not found
- */
- export type DocumentNotFound = {
- error: "not_found";
- query: string;
- similarFiles: string[];
- };
- /**
- * Result from multi-get operations
- */
- export type MultiGetResult = {
- doc: DocumentResult;
- skipped: false;
- } | {
- doc: Pick<DocumentResult, "filepath" | "displayPath">;
- skipped: true;
- skipReason: string;
- };
- export type CollectionInfo = {
- name: string;
- path: string | null;
- pattern: string | null;
- documents: number;
- lastUpdated: string;
- };
- export type IndexStatus = {
- totalDocuments: number;
- needsEmbedding: number;
- hasVectorIndex: boolean;
- collections: CollectionInfo[];
- };
- export declare function getHashesNeedingEmbedding(db: Database): number;
- export type IndexHealthInfo = {
- needsEmbedding: number;
- totalDocs: number;
- daysStale: number | null;
- };
- export declare function getIndexHealth(db: Database): IndexHealthInfo;
- export declare function getCacheKey(url: string, body: object): string;
- export declare function getCachedResult(db: Database, cacheKey: string): string | null;
- export declare function setCachedResult(db: Database, cacheKey: string, result: string): void;
- export declare function clearCache(db: Database): void;
- /**
- * Delete cached LLM API responses.
- * Returns the number of cached responses deleted.
- */
- export declare function deleteLLMCache(db: Database): number;
- /**
- * Remove inactive document records (active = 0).
- * Returns the number of inactive documents deleted.
- */
- export declare function deleteInactiveDocuments(db: Database): number;
- /**
- * Remove orphaned content hashes that are not referenced by any active document.
- * Returns the number of orphaned content hashes deleted.
- */
- export declare function cleanupOrphanedContent(db: Database): number;
- /**
- * Remove orphaned vector embeddings that are not referenced by any active document.
- * Returns the number of orphaned embedding chunks deleted.
- */
- export declare function cleanupOrphanedVectors(db: Database): number;
- /**
- * Run VACUUM to reclaim unused space in the database.
- * This operation rebuilds the database file to eliminate fragmentation.
- */
- export declare function vacuumDatabase(db: Database): void;
- export declare function hashContent(content: string): Promise<string>;
- export declare function extractTitle(content: string, filename: string): string;
- /**
- * Insert content into the content table (content-addressable storage).
- * Uses INSERT OR IGNORE so duplicate hashes are skipped.
- */
- export declare function insertContent(db: Database, hash: string, content: string, createdAt: string): void;
- /**
- * Insert a new document into the documents table.
- */
- export declare function insertDocument(db: Database, collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string): void;
- /**
- * Find an active document by collection name and path.
- */
- export declare function findActiveDocument(db: Database, collectionName: string, path: string): {
- id: number;
- hash: string;
- title: string;
- } | null;
- /**
- * Update the title and modified_at timestamp for a document.
- */
- export declare function updateDocumentTitle(db: Database, documentId: number, title: string, modifiedAt: string): void;
- /**
- * Update an existing document's hash, title, and modified_at timestamp.
- * Used when content changes but the file path stays the same.
- */
- export declare function updateDocument(db: Database, documentId: number, title: string, hash: string, modifiedAt: string): void;
- /**
- * Deactivate a document (mark as inactive but don't delete).
- */
- export declare function deactivateDocument(db: Database, collectionName: string, path: string): void;
- /**
- * Get all active document paths for a collection.
- */
- export declare function getActiveDocumentPaths(db: Database, collectionName: string): string[];
- export { formatQueryForEmbedding, formatDocForEmbedding };
- /**
- * Chunk a document using regex-only break point detection.
- * This is the sync, backward-compatible API used by tests and legacy callers.
- */
- export declare function chunkDocument(content: string, maxChars?: number, overlapChars?: number, windowChars?: number): {
- text: string;
- pos: number;
- }[];
- /**
- * Async AST-aware chunking. Detects language from filepath, computes AST
- * break points for supported code files, merges with regex break points,
- * and delegates to the shared chunk algorithm.
- *
- * Strategies:
- * - "regex" (default) — char-based chunking with regex break points only.
- * - "auto" — regex break points merged with AST break points (soft hints).
- * - "function" — one chunk per AST function range (Phase 2); inter-range
- * gaps (imports, top-level code) are char-chunked with AST
- * hints. Falls back to "auto" when zero ranges are detected.
- *
- * Resolves to an array of `{ text, pos }` chunks, the same element shape
- * that `chunkDocument` returns.
- */
- export declare function chunkDocumentAsync(content: string, maxChars?: number, overlapChars?: number, windowChars?: number, filepath?: string, chunkStrategy?: ChunkStrategy): Promise<{
- text: string;
- pos: number;
- }[]>;
- /**
- * Counts the tokens in `text`. Used by `chunkDocumentByTokens` for the
- * safety re-split that splits chunks exceeding `maxTokens`.
- *
- * When `chunkDocumentByTokens` is called WITHOUT a tokenizer (default),
- * it lazily resolves `getDefaultLlamaCpp()` and uses `llm.tokenize` —
- * accurate but expensive (loads the local GGUF embed model + initialises
- * llama.cpp, ~22s on cold cache).
- *
- * Provider-mode callers (HTTP embed providers like the GPU worker on
- * `models` LXC) MUST pass a JS-only approximator to avoid loading the
- * local model entirely. A char-based estimate like
- * `Math.ceil(text.length / 3)` is a reasonable default — it matches the
- * `avgCharsPerToken=3` heuristic used for the initial char-space chunk
- * step, so the safety re-split stays a near no-op while populating the
- * `tokens` field with a stable estimate.
- */
- export type TokenCounter = (text: string) => number | Promise<number>;
- /**
- * Chunk a document by actual token count using the LLM tokenizer.
- * More accurate than character-based chunking but requires async.
- *
- * When `tokenizer` is supplied, it is used in place of the local
- * `llm.tokenize(...)` call — neither `getDefaultLlamaCpp()` nor
- * `llm.tokenize(...)` is invoked. This lets remote-only deployments
- * (`QMD_EMBED_ENDPOINT=...`) chunk documents without warming up
- * node-llama-cpp (DoD #1 of i-1rqixh6m / i-qkarfffa).
- *
- * When `filepath` and `chunkStrategy` are provided, uses AST-aware break
- * points for supported code files.
- *
- * `signal` is an optional `AbortSignal`. NOTE(review): presumably it lets
- * callers cancel chunking early — confirm against the implementation.
- *
- * Resolves to an array of chunks, each carrying its `text`, its `pos`, and
- * its `tokens` count.
- */
- export declare function chunkDocumentByTokens(content: string, maxTokens?: number, overlapTokens?: number, windowTokens?: number, filepath?: string, chunkStrategy?: ChunkStrategy, signal?: AbortSignal, tokenizer?: TokenCounter): Promise<{
- text: string;
- pos: number;
- tokens: number;
- }[]>;
- /**
- * Normalize a docid input by stripping surrounding quotes and leading #.
- * Handles: "#abc123", 'abc123', "abc123", #abc123, abc123
- * Returns the bare hex string.
- */
- export declare function normalizeDocid(docid: string): string;
- /**
- * Check if a string looks like a docid reference.
- * Accepts: #abc123, abc123, "#abc123", "abc123", '#abc123', 'abc123'
- * Returns true if the normalized form is a valid hex string of 6+ chars.
- */
- export declare function isDocid(input: string): boolean;
- /**
- * Find a document by its short docid (first 6 characters of hash).
- * Returns an object with the document's `filepath` (its virtual path) and
- * `hash` if found, null otherwise.
- * If multiple documents match the same short hash (collision), returns the first one.
- *
- * Accepts lenient input: #abc123, abc123, "#abc123", "abc123"
- */
- export declare function findDocumentByDocid(db: Database, docid: string): {
- filepath: string;
- hash: string;
- } | null;
- export declare function findSimilarFiles(db: Database, query: string, maxDistance?: number, limit?: number): string[];
- export declare function matchFilesByGlob(db: Database, pattern: string): {
- filepath: string;
- displayPath: string;
- bodyLength: number;
- }[];
- /**
- * Get context for a file path using hierarchical inheritance.
- * Contexts are collection-scoped and inherit from parent directories.
- * For example, context at "/talks" applies to "/talks/2024/keynote.md".
- *
- * @param db Database instance (unused - kept for compatibility)
- * @param collectionName Collection name
- * @param path Relative path within the collection
- * @returns Context string or null if no context is defined
- */
- export declare function getContextForPath(db: Database, collectionName: string, path: string): string | null;
- /**
- * Get context for a file path (virtual or filesystem).
- * Resolves the collection and relative path from the DB store_collections table.
- */
- export declare function getContextForFile(db: Database, filepath: string): string | null;
- /**
- * Get collection by name from DB store_collections table.
- */
- export declare function getCollectionByName(db: Database, name: string): {
- name: string;
- pwd: string;
- glob_pattern: string;
- } | null;
- /**
- * List all collections with document counts from database.
- * Merges store_collections config with database statistics.
- */
- export declare function listCollections(db: Database): {
- name: string;
- pwd: string;
- glob_pattern: string;
- doc_count: number;
- active_count: number;
- last_modified: string | null;
- includeByDefault: boolean;
- }[];
- /**
- * Remove a collection and clean up its documents.
- * Uses collections.ts to remove from YAML config and cleans up database.
- */
- export declare function removeCollection(db: Database, collectionName: string): {
- deletedDocs: number;
- cleanedHashes: number;
- };
- /**
- * Rename a collection.
- * Updates both YAML config and database documents table.
- */
- export declare function renameCollection(db: Database, oldName: string, newName: string): void;
- /**
- * Insert or update a context for a specific collection and path prefix.
- *
- * The collection is identified by its numeric `collectionId` here, whereas
- * `deleteContext` identifies it by `collectionName`.
- */
- export declare function insertContext(db: Database, collectionId: number, pathPrefix: string, context: string): void;
- /**
- * Delete a context for a specific collection and path prefix.
- * Returns the number of contexts deleted.
- */
- export declare function deleteContext(db: Database, collectionName: string, pathPrefix: string): number;
- /**
- * Delete all global contexts (contexts with empty path_prefix).
- * Returns the number of contexts deleted.
- */
- export declare function deleteGlobalContexts(db: Database): number;
- /**
- * List all contexts, grouped by collection.
- * Returns contexts ordered by collection name, then by path prefix length (longest first).
- */
- export declare function listPathContexts(db: Database): {
- collection_name: string;
- path_prefix: string;
- context: string;
- }[];
- /**
- * Get all collections (name only - from YAML config).
- *
- * NOTE(review): the signature takes a `Database` while this comment cites the
- * YAML config as the source — confirm which one the implementation reads.
- */
- export declare function getAllCollections(db: Database): {
- name: string;
- }[];
- /**
- * Check which collections don't have any context defined.
- * Returns collections that have no context entries at all (not even root context).
- */
- export declare function getCollectionsWithoutContext(db: Database): {
- name: string;
- pwd: string;
- doc_count: number;
- }[];
- /**
- * Get top-level directories in a collection that don't have context.
- * Useful for suggesting where context might be needed.
- */
- export declare function getTopLevelPathsWithoutContext(db: Database, collectionName: string): string[];
- export declare function sanitizeFTS5Term(term: string): string;
- /**
- * Validate that a vec/hyde query doesn't use lex-only syntax.
- *
- * Negation is detected ONLY when `-` is preceded by whitespace or sits at
- * the start of the query. Hyphens inside words (e.g. `auto-archived`,
- * `pre-commit`, `multi-session`, `state-of-the-art`) carry no negation
- * semantics in natural English and must pass through unchanged.
- *
- * @param query - The vec/hyde query text to validate.
- * @returns An error message if the query is invalid, or `null` if it is valid.
- */
- export declare function validateSemanticQuery(query: string): string | null;
- /**
- * Validate a lex query.
- * NOTE(review): the return type mirrors `validateSemanticQuery`, so this
- * presumably yields an error message when the query is invalid and `null`
- * when it is valid - confirm against the implementation. The validation
- * rules themselves are not visible from this declaration.
- *
- * @param query - The lex query text to validate.
- * @returns A string or `null` (presumed meaning described in the review note).
- */
- export declare function validateLexQuery(query: string): string | null;
- /**
- * Run a full-text (FTS) search and return the matching results.
- * NOTE(review): semantics are inferred from the name and signature only -
- * confirm against the implementation.
- *
- * @param db - Database handle to search.
- * @param query - Query text.
- * @param limit - Optional cap on the number of results; the default is not visible here.
- * @param collectionName - Optional collection name, presumably restricting the search to it.
- * @returns The matching search results.
- */
- export declare function searchFTS(db: Database, query: string, limit?: number, collectionName?: string): SearchResult[];
- /**
- * Run a vector search and resolve to the matching results.
- * NOTE(review): semantics are inferred from the name and signature only -
- * confirm against the implementation.
- *
- * @param db - Database handle to search.
- * @param query - Query text.
- * @param model - Model identifier (presumably the embedding model).
- * @param limit - Optional cap on the number of results; the default is not visible here.
- * @param collectionName - Optional collection name, presumably restricting the search to it.
- * @param session - Optional LLM session.
- * @param precomputedEmbedding - Optional embedding vector, presumably used instead of
- * embedding `query` again.
- * @param embedProvider - Optional embedding provider.
- * @returns A promise of the matching search results.
- */
- export declare function searchVec(db: Database, query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider): Promise<SearchResult[]>;
- /**
- * Get all unique content hashes that need embeddings (from active documents).
- *
- * @param db - Database handle to read from.
- * @returns One entry per hash: the `hash`, the document `body`, and a sample
- * `path` intended for display purposes.
- */
- export declare function getHashesForEmbedding(db: Database): {
- hash: string;
- body: string;
- path: string;
- }[];
- /**
- * Clear all embeddings from the database (force re-index).
- * Deletes all rows from `content_vectors` and drops the `vectors_vec` table.
- *
- * @param db - Database handle whose embeddings are cleared.
- */
- export declare function clearAllEmbeddings(db: Database): void;
- /**
- * Get the distinct set of model identifiers present in `content_vectors`.
- *
- * Used by the embedding migration-safety guard: if a configured provider's
- * `getModelId()` does not appear in this list (and the table is non-empty),
- * we refuse to embed and ask the user to run `qmd embed -f` to rebuild.
- *
- * @param db - Database handle to read from.
- * @returns The distinct model identifiers, or `[]` when the table is empty
- * (fresh DB) — in which case any provider is allowed.
- */
- export declare function getDistinctEmbeddingModels(db: Database): string[];
- /**
- * Insert a single embedding into both content_vectors and vectors_vec tables.
- * The hash_seq key is formatted as "hash_seq" for the vectors_vec table.
- *
- * content_vectors is inserted first so that getHashesForEmbedding (which checks
- * only content_vectors) won't re-select the hash on a crash between the two inserts.
- *
- * vectors_vec uses DELETE + INSERT instead of INSERT OR REPLACE because sqlite-vec's
- * vec0 virtual tables silently ignore the OR REPLACE conflict clause.
- *
- * @param db - Database handle to write to.
- * @param hash - Content hash the embedding belongs to.
- * @param seq - Sequence number; combined with `hash` to form the vectors_vec key.
- * @param pos - Position value stored with the embedding (presumably the chunk's
- * offset within the document - TODO confirm).
- * @param embedding - The embedding vector.
- * @param model - Identifier of the model that produced the embedding.
- * @param embeddedAt - Timestamp string recorded with the embedding (format not
- * visible from this declaration).
- */
- export declare function insertEmbedding(db: Database, hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string): void;
- /**
- * Expand a query into typed query variants (lex/vec/hyde, per the
- * `hybridQuery` pipeline description in this file).
- * NOTE(review): the roles of `db`, `intent` and `llmOverride` are inferred
- * from their names only - confirm against the implementation.
- *
- * @param query - The original query text.
- * @param model - Model identifier, or `undefined` (presumably selecting a default).
- * @param db - Database handle.
- * @param intent - Optional intent string.
- * @param llmOverride - Optional `LlamaCpp` instance, presumably used instead of the default LLM.
- * @returns A promise of the expanded queries.
- */
- export declare function expandQuery(query: string, model: string | undefined, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise<ExpandedQuery[]>;
- /**
- * Score a set of documents against a query and resolve to one
- * `{ file, score }` entry per scored document.
- * NOTE(review): result ordering, score range, and the roles of `db`, `intent`
- * and `llmOverride` are not visible from this declaration - confirm against
- * the implementation.
- *
- * @param query - The query text.
- * @param documents - Candidates to score, each with a `file` identifier and its `text`.
- * @param model - Model identifier, or `undefined` (presumably selecting a default).
- * @param db - Database handle.
- * @param intent - Optional intent string.
- * @param llmOverride - Optional `LlamaCpp` instance, presumably used instead of the default LLM.
- * @returns A promise of `{ file, score }` entries.
- */
- export declare function rerank(query: string, documents: {
- file: string;
- text: string;
- }[], model: string | undefined, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise<{
- file: string;
- score: number;
- }[]>;
- /**
- * Fuse several ranked result lists into a single ranked list.
- * NOTE(review): the name indicates reciprocal rank fusion; the exact scoring
- * formula and the defaults for `weights` and `k` are not visible from this
- * declaration - confirm against the implementation.
- *
- * @param resultLists - The ranked lists to fuse, one inner array per list.
- * @param weights - Optional weights, presumably one per entry of `resultLists`.
- * @param k - Optional numeric constant, presumably the RRF smoothing constant.
- * @returns The fused ranked results.
- */
- export declare function reciprocalRankFusion(resultLists: RankedResult[][], weights?: number[], k?: number): RankedResult[];
- /**
- * Build per-document RRF contribution traces for explain/debug output.
- *
- * @param resultLists - The ranked lists that were fused, one inner array per list.
- * @param weights - Optional weights, presumably one per entry of `resultLists`.
- * @param listMeta - Optional metadata, presumably one `RankedListMeta` per entry of `resultLists`.
- * @param k - Optional numeric constant, presumably the RRF smoothing constant.
- * @returns A map from a string key to that document's `RRFScoreTrace`
- * (NOTE(review): the key is presumably the document's file path - confirm).
- */
- export declare function buildRrfTrace(resultLists: RankedResult[][], weights?: number[], listMeta?: RankedListMeta[], k?: number): Map<string, RRFScoreTrace>;
- /**
- * Find a document by filename/path, docid (#hash), or with fuzzy matching.
- * Returns document metadata without body by default.
- *
- * Supports:
- * - Virtual paths: qmd://collection/path/to/file.md
- * - Absolute paths: /path/to/file.md
- * - Relative paths: path/to/file.md
- * - Short docid: #abc123 (first 6 chars of hash)
- *
- * @param db - Database handle to search.
- * @param filename - Any of the supported identifiers listed above.
- * @param options - Optional settings; `includeBody` asks for the body to be
- * included in the result (it is omitted by default).
- * @returns The matching `DocumentResult`, or a `DocumentNotFound` value when
- * no document matches.
- */
- export declare function findDocument(db: Database, filename: string, options?: {
- includeBody?: boolean;
- }): DocumentResult | DocumentNotFound;
- /**
- * Get the body content for a document, optionally sliced by line range.
- *
- * @param db - Database handle to read from.
- * @param doc - A `DocumentResult`, or any object carrying the document's `filepath`.
- * @param fromLine - Optional first line of the slice (NOTE(review): whether this
- * is 0- or 1-based is not visible here - confirm).
- * @param maxLines - Optional maximum number of lines to return.
- * @returns The body text, or `null` (presumably when the document has no
- * stored body or cannot be found - confirm against the implementation).
- */
- export declare function getDocumentBody(db: Database, doc: DocumentResult | {
- filepath: string;
- }, fromLine?: number, maxLines?: number): string | null;
- /**
- * Find multiple documents by glob pattern or comma-separated list.
- * Returns documents without body by default (use getDocumentBody to load).
- *
- * @param db - Database handle to search.
- * @param pattern - A glob pattern or a comma-separated list of documents.
- * @param options - Optional settings: `includeBody` asks for bodies to be
- * included; `maxBytes` is a size limit (NOTE(review): presumably a per-document
- * cap on loaded bodies - confirm against the implementation).
- * @returns The documents found (`docs`) together with any error messages
- * collected along the way (`errors`).
- */
- export declare function findDocuments(db: Database, pattern: string, options?: {
- includeBody?: boolean;
- maxBytes?: number;
- }): {
- docs: MultiGetResult[];
- errors: string[];
- };
- /**
- * Get the index status for a database.
- * NOTE(review): the contents of `IndexStatus` are declared elsewhere and are
- * not visible from this chunk.
- *
- * @param db - Database handle to inspect.
- * @returns The index status.
- */
- export declare function getStatus(db: Database): IndexStatus;
- /**
- * Result of `extractSnippet`: the snippet text plus line bookkeeping.
- * NOTE(review): the field meanings below are inferred from their names only -
- * confirm against the implementation.
- */
- export type SnippetResult = {
- /** Presumably the line number at which the snippet starts. */
- line: number;
- /** The snippet text. */
- snippet: string;
- /** Presumably the number of body lines before the snippet. */
- linesBefore: number;
- /** Presumably the number of body lines after the snippet. */
- linesAfter: number;
- /** Presumably the number of lines in the snippet itself. */
- snippetLines: number;
- };
- /**
- * Weight for intent terms relative to query terms (1.0) in snippet scoring.
- * At 0.3, an intent term counts for less than a query term.
- */
- export declare const INTENT_WEIGHT_SNIPPET = 0.3;
- /**
- * Weight for intent terms relative to query terms (1.0) in chunk selection.
- * At 0.5, an intent term counts for half as much as a query term.
- */
- export declare const INTENT_WEIGHT_CHUNK = 0.5;
- /**
- * Extract meaningful terms from an intent string, filtering stop words and punctuation.
- * Uses Unicode-aware punctuation stripping so domain terms like "API" survive.
- *
- * @param intent - The intent string to extract terms from.
- * @returns Lowercase terms suitable for text matching.
- */
- export declare function extractIntentTerms(intent: string): string[];
- /**
- * Extract a snippet from a document body for a given query.
- * NOTE(review): how the snippet is selected, and the defaults of the optional
- * parameters, are not visible from this declaration - confirm against the
- * implementation.
- *
- * @param body - Full document text to extract from.
- * @param query - Query text.
- * @param maxLen - Optional maximum snippet length (unit not visible here).
- * @param chunkPos - Optional chunk position, presumably an offset into `body`.
- * @param chunkLen - Optional chunk length, presumably paired with `chunkPos`.
- * @param intent - Optional intent string (see `INTENT_WEIGHT_SNIPPET` for how
- * intent terms are weighted in snippet scoring).
- * @returns The snippet and its line bookkeeping.
- */
- export declare function extractSnippet(body: string, query: string, maxLen?: number, chunkPos?: number, chunkLen?: number, intent?: string): SnippetResult;
- /**
- * Add line numbers to text content.
- * Each line becomes: "{lineNum}: {content}"
- *
- * @param text - The text whose lines are numbered.
- * @param startLine - Optional number for the first line (the default is not
- * visible from this declaration).
- * @returns The text with every line prefixed by its line number.
- */
- export declare function addLineNumbers(text: string, startLine?: number): string;
- /**
- * Optional progress hooks for search orchestration.
- * Every hook is optional. The CLI wires these to stderr for user feedback;
- * MCP leaves them unset.
- */
- export interface SearchHooks {
- /** BM25 probe found strong signal — expansion will be skipped. Receives the top score. */
- onStrongSignal?: (topScore: number) => void;
- /** Query expansion is starting. */
- onExpandStart?: () => void;
- /** Query expansion complete. Empty `expanded` array = strong signal skip. `elapsedMs` = time taken. */
- onExpand?: (original: string, expanded: ExpandedQuery[], elapsedMs: number) => void;
- /** Embedding is starting (vec/hyde queries). Receives a count. */
- onEmbedStart?: (count: number) => void;
- /** Embedding complete. `elapsedMs` = time taken. */
- onEmbedDone?: (elapsedMs: number) => void;
- /** Reranking is about to start. Receives the chunk count. */
- onRerankStart?: (chunkCount: number) => void;
- /** Reranking finished. `elapsedMs` = time taken. */
- onRerankDone?: (elapsedMs: number) => void;
- }
- /**
- * Options accepted by `hybridQuery`. Every field is optional; default values
- * are not visible from this declaration.
- */
- export interface HybridQueryOptions {
- /** Collection name; presumably restricts the search to that collection (TODO confirm). */
- collection?: string;
- /** The final result list is sliced to this many entries (last pipeline step). */
- limit?: number;
- /** Results are filtered by this minimum score before the final slice. */
- minScore?: number;
- /** The RRF-fused candidate list is sliced to this size. */
- candidateLimit?: number;
- /** Presumably requests the `explain` payload on each `HybridQueryResult` (TODO confirm). */
- explain?: boolean;
- /** Intent string passed along to the search; exact effect not visible here. */
- intent?: string;
- /** Presumably skips the rerank step (TODO confirm against the implementation). */
- skipRerank?: boolean;
- /** Chunking strategy; `ChunkStrategy` is declared elsewhere. */
- chunkStrategy?: ChunkStrategy;
- /** Optional progress hooks. */
- hooks?: SearchHooks;
- /**
- * Optional embedding provider for query-side encoding (i-loazq6ze).
- * When supplied, the original-query vector AND any vec/hyde expansion
- * variants are encoded through this provider (HTTP, GPU worker,
- * AutoFallback chain) instead of `getLlm(store).embedBatch(...)`. Omit it
- * to keep pre-patch behavior (uses local LlamaCpp).
- */
- embedProvider?: EmbeddingProvider;
- }
- /**
- * One result entry as returned by `hybridQuery` and `structuredSearch`.
- * NOTE(review): the field meanings below are inferred from their names and
- * from the pipeline descriptions in this file - confirm against the
- * implementation.
- */
- export interface HybridQueryResult {
- /** Identifier of the document's file (results are deduplicated by file). */
- file: string;
- /** Presumably a user-facing form of the path. */
- displayPath: string;
- /** Document title. */
- title: string;
- /** Document body text. */
- body: string;
- /** Presumably the text of the chunk selected for this document. */
- bestChunk: string;
- /** Presumably the position of `bestChunk` within `body`. */
- bestChunkPos: number;
- /** Final score of this result. */
- score: number;
- /** Context text associated with the document, or `null` when there is none. */
- context: string | null;
- /** Document id. */
- docid: string;
- /** Optional explain payload; `HybridQueryExplain` is declared elsewhere. */
- explain?: HybridQueryExplain;
- }
- /**
- * Metadata describing one ranked result list (accepted by `buildRrfTrace`
- * through its `listMeta` parameter).
- */
- export type RankedListMeta = {
- /** Which search produced the list: full-text (`"fts"`) or vector (`"vec"`). */
- source: "fts" | "vec";
- /** Type of the query that was run: the original query or a lex/vec/hyde variant. */
- queryType: "original" | "lex" | "vec" | "hyde";
- /** The query text (presumably the text actually searched for - TODO confirm). */
- query: string;
- };
- /**
- * Hybrid search: BM25 + vector + query expansion + RRF + chunked reranking.
- *
- * Pipeline:
- * 1. BM25 probe → skip expansion if strong signal
- * 2. expandQuery() → typed query variants (lex/vec/hyde)
- * 3. Type-routed search: original→vector, lex→FTS, vec/hyde→vector
- * 4. RRF fusion → slice to candidateLimit
- * 5. chunkDocument() + keyword-best-chunk selection
- * 6. rerank on chunks (NOT full bodies — O(tokens) trap)
- * 7. Position-aware score blending (RRF rank × reranker score)
- * 8. Dedup by file, filter by minScore, slice to limit
- *
- * @param store - The store to search.
- * @param query - The query text.
- * @param options - Optional settings; see `HybridQueryOptions`.
- * @returns A promise of the final results, after the dedup/filter/slice step.
- */
- export declare function hybridQuery(store: Store, query: string, options?: HybridQueryOptions): Promise<HybridQueryResult[]>;
- /**
- * Options accepted by `vectorSearchQuery`. Every field is optional; default
- * values are not visible from this declaration.
- */
- export interface VectorSearchOptions {
- /** Collection name; presumably restricts the search to that collection (TODO confirm). */
- collection?: string;
- /** The final result list is sliced to this many entries. */
- limit?: number;
- /** Results are filtered by this minimum score. */
- minScore?: number;
- /** Intent string passed along to the search; exact effect not visible here. */
- intent?: string;
- /** Progress hooks; only `onExpand` is accepted here. */
- hooks?: Pick<SearchHooks, 'onExpand'>;
- /**
- * Optional embedding provider for query-side encoding (i-loazq6ze).
- * When supplied, query vectors are encoded via the provider (HTTP /
- * GPU worker / fallback chain) instead of the local llama-cpp model.
- */
- embedProvider?: EmbeddingProvider;
- }
- /**
- * One result entry as returned by `vectorSearchQuery`.
- * NOTE(review): the field meanings below are inferred from their names -
- * confirm against the implementation.
- */
- export interface VectorSearchResult {
- /** Identifier of the document's file. */
- file: string;
- /** Presumably a user-facing form of the path. */
- displayPath: string;
- /** Document title. */
- title: string;
- /** Document body text. */
- body: string;
- /** Score of this result (results are sorted by it, descending). */
- score: number;
- /** Context text associated with the document, or `null` when there is none. */
- context: string | null;
- /** Document id. */
- docid: string;
- }
- /**
- * Vector-only semantic search with query expansion.
- *
- * Pipeline:
- * 1. expandQuery() → typed variants, filter to vec/hyde only (lex irrelevant here)
- * 2. searchVec() for original + vec/hyde variants (sequential — node-llama-cpp embed limitation)
- * 3. Dedup by filepath (keep max score)
- * 4. Sort by score descending, filter by minScore, slice to limit
- *
- * @param store - The store to search.
- * @param query - The query text.
- * @param options - Optional settings; see `VectorSearchOptions`.
- * @returns A promise of the results, sorted by score descending.
- */
- export declare function vectorSearchQuery(store: Store, query: string, options?: VectorSearchOptions): Promise<VectorSearchResult[]>;
- /**
- * Options accepted by `structuredSearch`. Every field is optional; default
- * values are not visible from this declaration.
- *
- * NOTE(review): the comment previously attached here read "A single
- * sub-search in a structured search request. Matches the format used in QMD
- * training data." That text describes an individual sub-search (the
- * `ExpandedQuery` entries passed to `structuredSearch`), not this options
- * object, so it appears to have been left behind from another declaration.
- */
- export interface StructuredSearchOptions {
- /** Collection names; presumably restricts the search to those collections (TODO confirm). */
- collections?: string[];
- /** Maximum number of results (the pipeline ends with "dedup, filter, slice"). */
- limit?: number;
- /** Minimum score used by the final filter step. */
- minScore?: number;
- /** Presumably caps the fused candidate list before reranking (TODO confirm). */
- candidateLimit?: number;
- /** Presumably requests the `explain` payload on each result (TODO confirm). */
- explain?: boolean;
- /** Domain intent hint for disambiguation — steers reranking and chunk selection */
- intent?: string;
- /** Skip LLM reranking, use only RRF scores */
- skipRerank?: boolean;
- /** Chunking strategy; `ChunkStrategy` is declared elsewhere. */
- chunkStrategy?: ChunkStrategy;
- /** Optional progress hooks. */
- hooks?: SearchHooks;
- /**
- * Optional embedding provider for query-side encoding (i-loazq6ze).
- * When supplied, vec/hyde sub-queries are batch-encoded via the provider
- * (HTTP / GPU worker / fallback chain) instead of `getLlm(store).embedBatch`.
- */
- embedProvider?: EmbeddingProvider;
- }
- /**
- * Structured search: execute pre-expanded queries without LLM query expansion.
- *
- * Designed for LLM callers (MCP/HTTP) that generate their own query expansions.
- * Skips the internal expandQuery() step and goes directly to the pipeline below.
- *
- * Pipeline:
- * 1. Route searches: lex→FTS, vec/hyde→vector (batch embed)
- * 2. RRF fusion across all result lists
- * 3. Chunk documents + keyword-best-chunk selection
- * 4. Rerank on chunks
- * 5. Position-aware score blending
- * 6. Dedup, filter, slice
- *
- * This is the recommended endpoint for capable LLMs — they can generate
- * better query variations than our small local model, especially for
- * domain-specific or nuanced queries.
- *
- * @param store - The store to search.
- * @param searches - The pre-expanded sub-queries to execute.
- * @param options - Optional settings; see `StructuredSearchOptions`.
- * @returns A promise of the final results, after the dedup/filter/slice step.
- */
- export declare function structuredSearch(store: Store, searches: ExpandedQuery[], options?: StructuredSearchOptions): Promise<HybridQueryResult[]>;
|