| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762
27722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762
77727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763
27732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763
777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870 |
- /**
- * QMD Store - Core data access and retrieval functions
- *
- * This module provides all database operations, search functions, and document
- * retrieval for QMD. It returns raw data structures that can be formatted by
- * CLI or MCP consumers.
- *
- * Usage:
- * const store = createStore("/path/to/db.sqlite");
- * // or use default path:
- * const store = createStore();
- */
- import { openDatabase, loadSqliteVec } from "./db.js";
- import picomatch from "picomatch";
- import { createHash } from "crypto";
- import { readFileSync, realpathSync, statSync, mkdirSync } from "node:fs";
- // Note: node:path resolve is not imported — we export our own cross-platform resolve()
- import fastGlob from "fast-glob";
- import { LlamaCpp, getDefaultLlamaCpp, formatQueryForEmbedding, formatDocForEmbedding, withLLMSessionForLlm, } from "./llm.js";
- import { assertModelCompatible, } from "./embedding/provider.js";
// =============================================================================
// Configuration
// =============================================================================

// Home directory as reported by the environment; "/tmp" is used when HOME is
// unset or empty.
const HOME = process.env.HOME || "/tmp";

// Default model identifiers for embedding, reranking and query handling.
export const DEFAULT_EMBED_MODEL = "embeddinggemma";
export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";

// Default glob pattern: every Markdown file at any depth.
export const DEFAULT_GLOB = "**/*.md";

// Default size limits.
export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10 KB
export const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
export const DEFAULT_EMBED_MAX_BATCH_BYTES = 64 * 1024 * 1024; // 64 MB

// Chunk sizing. Each chunk targets 900 tokens and neighbouring chunks share a
// 15% overlap. The size was raised from 800 so smart chunking has room to land
// on a natural break point.
export const CHUNK_SIZE_TOKENS = 900;
export const CHUNK_OVERLAP_TOKENS = Math.floor(CHUNK_SIZE_TOKENS * 0.15); // 135 tokens

// Character-based equivalents for synchronous chunking, using the rough
// approximation of 4 characters per token.
export const CHUNK_SIZE_CHARS = CHUNK_SIZE_TOKENS * 4; // 3600 chars
export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4; // 540 chars

// How far back from the target cut position to look for a good break point.
export const CHUNK_WINDOW_TOKENS = 200;
export const CHUNK_WINDOW_CHARS = CHUNK_WINDOW_TOKENS * 4; // 800 chars
/**
 * Pick the LlamaCpp instance to use for a store.
 *
 * A store that carries its own `llm` uses it; when that property is null or
 * undefined the shared instance from getDefaultLlamaCpp() is returned instead.
 */
function getLlm(store) {
    const ownInstance = store.llm;
    if (ownInstance !== null && ownInstance !== undefined) {
        return ownInstance;
    }
    return getDefaultLlamaCpp();
}
/**
 * Break-point detectors for markdown text, as [pattern, score, type] tuples.
 *
 * A larger score marks a better place to split. The scores are deliberately
 * far apart so a heading clearly outranks weaker candidates such as a blank
 * line or a plain newline. More specific patterns are listed first.
 */
export const BREAK_PATTERNS = [
    // Headings: each level matches exactly that many '#' characters.
    [/\n#{1}(?!#)/g, 100, 'h1'],
    [/\n#{2}(?!#)/g, 90, 'h2'],
    [/\n#{3}(?!#)/g, 80, 'h3'],
    [/\n#{4}(?!#)/g, 70, 'h4'],
    [/\n#{5}(?!#)/g, 60, 'h5'],
    [/\n#{6}(?!#)/g, 50, 'h6'],
    // Code fence boundary, weighted the same as an h3.
    [/\n```/g, 80, 'codeblock'],
    // Horizontal rule on a line of its own.
    [/\n(?:---|\*\*\*|___)\s*\n/g, 60, 'hr'],
    // One or more blank lines (paragraph boundary).
    [/\n\n+/g, 20, 'blank'],
    // Unordered and ordered list items.
    [/\n[-*]\s/g, 5, 'list'],
    [/\n\d+\.\s/g, 5, 'numlist'],
    // Any newline: the weakest acceptable break.
    [/\n/g, 1, 'newline'],
];
/**
 * Collect every candidate break point in `text`.
 *
 * Each entry of BREAK_PATTERNS is run over the text. When several patterns
 * match at the same offset only the highest-scoring one is kept (the first
 * one seen wins a tie).
 *
 * @param text - The document text to scan
 * @returns Break points as { pos, score, type }, ordered by ascending pos
 */
export function scanBreakPoints(text) {
    // Offset -> strongest break point found at that offset so far.
    const bestAt = new Map();
    for (const [pattern, score, type] of BREAK_PATTERNS) {
        for (const { index } of text.matchAll(pattern)) {
            const current = bestAt.get(index);
            if (!current || score > current.score) {
                bestAt.set(index, { pos: index, score, type });
            }
        }
    }
    return [...bestAt.values()].sort((left, right) => left.pos - right.pos);
}
/**
 * Find all code fence regions in the text.
 *
 * Code fences are delimited by ``` at the start of a line, and chunking should
 * never split inside them. Fence markers are paired in order of appearance:
 * the first opens a region, the next closes it, and so on.
 *
 * A fence on the very first line of the document (no preceding newline) is
 * recognised as well. Without that, the document's closing fence would be
 * read as an opener and every following region would be inverted.
 *
 * @param text - The document text to scan
 * @returns Regions as { start, end }. `start` is the offset of the newline
 *          before the opening fence (or 0 for a fence on the first line);
 *          `end` is the offset just past the closing ```. An unclosed fence
 *          extends to the end of the text.
 */
export function findCodeFences(text) {
    const regions = [];
    // `^` has no `m` flag here, so it only matches at offset 0.
    const fencePattern = /(?:^|\n)```/g;
    // Offset of the currently open fence, or null when outside a fence.
    let openStart = null;
    for (const match of text.matchAll(fencePattern)) {
        if (openStart === null) {
            openStart = match.index;
        }
        else {
            regions.push({ start: openStart, end: match.index + match[0].length });
            openStart = null;
        }
    }
    // Handle unclosed fence - extends to end of document
    if (openStart !== null) {
        regions.push({ start: openStart, end: text.length });
    }
    return regions;
}
/**
 * Report whether `pos` lies strictly inside any of the given fence regions.
 * A position equal to a region's start or end counts as outside.
 */
export function isInsideCodeFence(pos, fences) {
    for (const fence of fences) {
        if (pos > fence.start && pos < fence.end) {
            return true;
        }
    }
    return false;
}
/**
 * Choose where to cut a chunk, given scored break points.
 *
 * Only break points in [targetCharPos - windowChars, targetCharPos] that are
 * not inside a code fence are considered. Each candidate's score is scaled by
 *
 *     1 - (distance / windowChars)^2 * decayFactor
 *
 * so the penalty grows slowly near the target and quickly toward the far edge
 * of the window. With the default decayFactor of 0.7 the multiplier is 1.0 at
 * the target, about 0.956 a quarter of the way back, 0.825 halfway, about
 * 0.606 three quarters back and 0.3 at the window edge. A distant heading can
 * therefore still beat a weak break right next to the target.
 *
 * @param breakPoints - Break points sorted by ascending pos (see scanBreakPoints())
 * @param targetCharPos - The ideal cut position (e.g., the maxChars boundary)
 * @param windowChars - How far back from the target to look
 * @param decayFactor - Share of the score lost at the window edge
 * @param codeFences - Code fence regions that must not be split
 * @returns The pos of the best candidate, or targetCharPos when there is none
 */
export function findBestCutoff(breakPoints, targetCharPos, windowChars = CHUNK_WINDOW_CHARS, decayFactor = 0.7, codeFences = []) {
    const earliestPos = targetCharPos - windowChars;
    const best = { pos: targetCharPos, score: -1 };
    for (const { pos, score } of breakPoints) {
        if (pos < earliestPos) {
            continue;
        }
        if (pos > targetCharPos) {
            // Input is sorted, so nothing later can qualify.
            break;
        }
        if (isInsideCodeFence(pos, codeFences)) {
            continue;
        }
        const relativeDistance = (targetCharPos - pos) / windowChars;
        const weighted = score * (1.0 - (relativeDistance * relativeDistance) * decayFactor);
        if (weighted > best.score) {
            best.score = weighted;
            best.pos = pos;
        }
    }
    return best.pos;
}
/**
 * Combine two break-point lists (e.g. regex-derived and AST-derived).
 *
 * When both lists, or one list twice, contain the same position, the entry
 * with the strictly higher score is kept; on a tie the one seen first wins,
 * with `a` scanned before `b`. The result is ordered by ascending pos.
 */
export function mergeBreakPoints(a, b) {
    const bestByPos = new Map();
    for (const source of [a, b]) {
        for (const candidate of source) {
            const incumbent = bestByPos.get(candidate.pos);
            if (!incumbent || candidate.score > incumbent.score) {
                bestByPos.set(candidate.pos, candidate);
            }
        }
    }
    return [...bestByPos.values()].sort((left, right) => left.pos - right.pos);
}
/**
 * Split `content` into overlapping chunks using precomputed break points and
 * code fence regions. Shared by the regex-only and AST-aware chunkers.
 *
 * Content no longer than maxChars is returned as a single chunk. Otherwise
 * each chunk ends at the position findBestCutoff() picks within the window
 * before the hard limit, and the next chunk starts overlapChars earlier than
 * that end, as long as doing so still moves forward.
 *
 * @param content - Full document text
 * @param breakPoints - Sorted break points for `content`
 * @param codeFences - Code fence regions for `content`
 * @param maxChars - Maximum chunk length in characters
 * @param overlapChars - Characters shared between consecutive chunks
 * @param windowChars - Search window handed to findBestCutoff()
 * @returns Chunks as { text, pos }, where pos is the chunk's start offset
 */
export function chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS) {
    const totalLength = content.length;
    if (totalLength <= maxChars) {
        return [{ text: content, pos: 0 }];
    }
    const chunks = [];
    let start = 0;
    while (start < totalLength) {
        const hardEnd = Math.min(start + maxChars, totalLength);
        let end = hardEnd;
        // Only look for a nicer cut when this is not the final chunk.
        if (hardEnd < totalLength) {
            const candidate = findBestCutoff(breakPoints, hardEnd, windowChars, 0.7, codeFences);
            if (candidate > start && candidate <= hardEnd) {
                end = candidate;
            }
        }
        // Safety net: never emit a chunk that fails to advance.
        if (end <= start) {
            end = Math.min(start + maxChars, totalLength);
        }
        chunks.push({ text: content.slice(start, end), pos: start });
        if (end >= totalLength) {
            break;
        }
        // Step back by the overlap, unless that would not move past the
        // chunk just emitted; in that case continue right at its end.
        const overlappedStart = end - overlapChars;
        start = overlappedStart <= start ? end : overlappedStart;
    }
    return chunks;
}
// Hybrid query: thresholds for recognising a strong BM25 signal.
// The expensive LLM expansion is skipped when the top result scores high
// enough AND leads the runner-up by a clear margin.
export const STRONG_SIGNAL_MIN_SCORE = 0.85;
export const STRONG_SIGNAL_MIN_GAP = 0.15;

// Upper bound on candidates handed to the reranker (quality vs latency).
// At 40, ranks 31-40 remain visible to the reranker, which matters for recall
// on broad queries.
export const RERANK_CANDIDATE_LIMIT = 40;
// =============================================================================
// Path utilities
// =============================================================================

/**
 * Return the home directory captured in the module-level HOME constant.
 */
export function homedir() {
    const home = HOME;
    return home;
}
/**
 * Check if a path is absolute.
 *
 * Recognised forms:
 * - Unix paths: /path/to/file
 * - Git Bash drive paths: /c/path or /C/path. These start with "/" and are
 *   therefore already covered by the Unix rule; no separate drive-letter or
 *   WSL check is needed to answer "is it absolute?".
 * - Windows native: C:\path or C:/path (any letter A-Z followed by ":")
 *
 * @param path - The path to test; empty or missing values are not absolute
 * @returns true when the path is absolute, false otherwise
 */
export function isAbsolutePath(path) {
    if (!path) {
        return false;
    }
    // Anything rooted at "/" is absolute, Git Bash style drive paths included.
    if (path.startsWith('/')) {
        return true;
    }
    // Windows native path: a drive letter followed by a colon.
    return path.length >= 2 && /[a-zA-Z]/.test(path[0]) && path[1] === ':';
}
/**
 * Return `path` with every backslash replaced by a forward slash, so Windows
 * style separators become POSIX style ones.
 */
export function normalizePathSeparators(path) {
    return path.split('\\').join('/');
}
/**
 * Detect WSL (Windows Subsystem for Linux) from the environment: true when
 * either WSL_DISTRO_NAME or WSL_INTEROP is set to a non-empty value.
 *
 * On WSL, paths like /c/work/... are valid drvfs mount points rather than
 * Git Bash drive paths, so callers use this to skip drive-letter handling.
 */
function isWSL() {
    const { WSL_DISTRO_NAME: distroName, WSL_INTEROP: interop } = process.env;
    return Boolean(distroName || interop);
}
/**
 * Express `path` relative to `prefix`.
 *
 * Both arguments have their separators normalized to forward slashes first.
 *
 * @returns '' when the two are identical, the remainder after the prefix
 *          directory when path lies under it, and null when prefix is empty
 *          or path is not under prefix.
 */
export function getRelativePathFromPrefix(path, prefix) {
    // An empty prefix cannot contain anything.
    if (!prefix) {
        return null;
    }
    const target = normalizePathSeparators(path);
    const base = normalizePathSeparators(prefix);
    if (target === base) {
        return '';
    }
    // Compare against "prefix/" so "/a/bc" is not treated as being under "/a/b".
    const baseDir = base.endsWith('/') ? base : base + '/';
    return target.startsWith(baseDir) ? target.slice(baseDir.length) : null;
}
/**
 * Cross-platform path resolution that always produces forward slashes.
 *
 * Segments are processed left to right. An absolute segment replaces whatever
 * was accumulated so far; a relative one is appended. If the first segment is
 * relative, it is resolved against process.env.PWD (or process.cwd() when PWD
 * is not set). Finally "." and ".." components and empty components are
 * collapsed.
 *
 * Drive handling:
 * - "C:..." segments contribute their drive prefix as written.
 * - Git Bash style "/c/..." segments (letters C-Z only) become "C:/..." unless
 *   running on WSL, where such paths are left untouched.
 * - An absolute segment without a drive clears any drive seen earlier.
 *
 * @param paths - One or more path segments
 * @returns The normalized absolute path, prefixed with a drive when one applies
 * @throws Error when called without any segment
 */
export function resolve(...paths) {
    if (paths.length === 0) {
        throw new Error("resolve: at least one path segment is required");
    }
    // True for "X:..." where X is an ASCII letter.
    const hasDrivePrefix = (p) => p.length >= 2 && /[a-zA-Z]/.test(p[0]) && p[1] === ':';
    // Split an absolute segment into [drive, remainder]; drive is '' when the
    // segment carries none.
    const splitAbsolute = (p) => {
        if (hasDrivePrefix(p)) {
            return [p.slice(0, 2), p.slice(2)];
        }
        // Git Bash style: /c/ -> C: (C-Z drives only, not A or B).
        // Skipped on WSL where /c/ is a valid drvfs mount point.
        if (!isWSL() && p.startsWith('/') && p.length >= 3 && p[2] === '/') {
            const letter = p[1];
            if (letter && /[c-zC-Z]/.test(letter)) {
                return [letter.toUpperCase() + ':', p.slice(2)];
            }
        }
        return ['', p];
    };
    const [head, ...rest] = paths.map(normalizePathSeparators);
    let drive = '';
    let joined = '';
    if (isAbsolutePath(head)) {
        [drive, joined] = splitAbsolute(head);
    }
    else {
        // Relative first segment: anchor it at the working directory.
        const workingDir = normalizePathSeparators(process.env.PWD || process.cwd());
        if (hasDrivePrefix(workingDir)) {
            drive = workingDir.slice(0, 2);
            joined = workingDir.slice(2) + '/' + head;
        }
        else {
            joined = workingDir + '/' + head;
        }
    }
    for (const segment of rest) {
        if (isAbsolutePath(segment)) {
            // An absolute segment discards everything before it.
            [drive, joined] = splitAbsolute(segment);
        }
        else {
            joined = joined + '/' + segment;
        }
    }
    // Collapse empty, "." and ".." components.
    const stack = [];
    for (const component of joined.split('/')) {
        if (component === '' || component === '.') {
            continue;
        }
        if (component === '..') {
            stack.pop();
        }
        else {
            stack.push(component);
        }
    }
    const rooted = '/' + stack.join('/');
    return drive ? drive + rooted : rooted;
}
// Production-mode flag. It starts off and is switched on at startup (by
// qmd.ts, per the original note); getDefaultDbPath() consults it.
let _productionMode = false;

/** Turn production mode on. */
export function enableProductionMode() {
    _productionMode = true;
}

/** Turn production mode back off. Intended for tests only. */
export function _resetProductionModeForTesting() {
    _productionMode = false;
}
/**
 * Work out where the index database lives.
 *
 * Resolution order:
 * 1. A non-empty INDEX_PATH environment variable is returned as-is.
 * 2. Outside production mode an error is thrown, so tests cannot write to the
 *    global index by accident.
 * 3. Otherwise the path is `<cache>/qmd/<indexName>.sqlite`, where `<cache>`
 *    is XDG_CACHE_HOME or `<home>/.cache`. The qmd cache directory is created
 *    if possible.
 *
 * @param indexName - Base name of the database file (without extension)
 * @returns Path to the SQLite database file
 * @throws Error when no INDEX_PATH is set and production mode is off
 */
export function getDefaultDbPath(indexName = "index") {
    const override = process.env.INDEX_PATH;
    if (override) {
        return override;
    }
    if (!_productionMode) {
        throw new Error("Database path not set. Tests must set INDEX_PATH env var or use createStore() with explicit path. " +
            "This prevents tests from accidentally writing to the global index.");
    }
    const cacheRoot = process.env.XDG_CACHE_HOME || resolve(homedir(), ".cache");
    const qmdCacheDir = resolve(cacheRoot, "qmd");
    try {
        mkdirSync(qmdCacheDir, { recursive: true });
    }
    catch {
        // Directory creation is best effort; a failure here is ignored and
        // the computed path is returned regardless.
    }
    return resolve(qmdCacheDir, `${indexName}.sqlite`);
}
/**
 * Return the current working directory, preferring the PWD environment
 * variable and falling back to process.cwd() when PWD is unset or empty.
 */
export function getPwd() {
    const fromEnv = process.env.PWD;
    return fromEnv ? fromEnv : process.cwd();
}
/**
 * Return the canonical path reported by realpathSync(). If that call throws,
 * fall back to this module's resolve() of the same path.
 */
export function getRealPath(path) {
    let canonical;
    try {
        canonical = realpathSync(path);
    }
    catch {
        canonical = resolve(path);
    }
    return canonical;
}
/**
 * Bring an explicitly virtual path into the canonical qmd:// form.
 *
 * Handled inputs (after trimming surrounding whitespace):
 * - qmd://collection/path.md    (already canonical)
 * - qmd:////collection/path.md  (surplus slashes are collapsed)
 * - //collection/path.md        (the missing "qmd:" is added)
 *
 * Everything else is returned trimmed but otherwise untouched, including:
 * - collection/path.md (bare paths, which could be filesystem relative)
 * - a :linenum suffix (parse that off before calling this function)
 */
export function normalizeVirtualPath(input) {
    const trimmed = input.trim();
    const hasScheme = trimmed.startsWith('qmd:');
    if (!hasScheme && !trimmed.startsWith('//')) {
        // Filesystem paths, docids, bare collection/path, etc.
        return trimmed;
    }
    // Drop the "qmd:" scheme when present, then any run of leading slashes,
    // and rebuild with exactly two.
    const withoutScheme = hasScheme ? trimmed.slice(4) : trimmed;
    return `qmd://${withoutScheme.replace(/^\/+/, '')}`;
}
/**
 * Split a virtual path such as "qmd://collection-name/path/to/file.md" into
 * its collection name and the path inside that collection.
 *
 * The input is normalized with normalizeVirtualPath() first. A collection
 * root ("qmd://collection-name" or "qmd://collection-name/") yields an empty
 * path.
 *
 * @returns { collectionName, path }, or null when the input is not of the
 *          form qmd://name[/path]
 */
export function parseVirtualPath(virtualPath) {
    // qmd://<collection>[/<anything>]
    const parts = /^qmd:\/\/([^\/]+)\/?(.*)$/.exec(normalizeVirtualPath(virtualPath));
    if (!parts?.[1]) {
        return null;
    }
    const [, collectionName, innerPath] = parts;
    return {
        collectionName,
        path: innerPath ?? '',
    };
}
/**
 * Compose the virtual path "qmd://<collectionName>/<path>" from a collection
 * name and a path relative to that collection.
 */
export function buildVirtualPath(collectionName, path) {
    return 'qmd://'.concat(collectionName, '/', path);
}
/**
 * Tell whether a path is written in an explicitly virtual form.
 *
 * After trimming, the accepted forms are:
 * - anything starting with "qmd:" (any number of slashes may follow)
 * - anything starting with "//" (the "qmd:" prefix was left off)
 *
 * A bare collection/path.md is NOT reported as virtual; deciding whether its
 * first component names a collection is left to the caller.
 */
export function isVirtualPath(path) {
    const candidate = path.trim();
    return candidate.startsWith('qmd:') || candidate.startsWith('//');
}
/**
 * Translate a virtual path into an absolute filesystem path.
 *
 * @param db - Database handle, forwarded to getCollectionByName
 * @param virtualPath - Virtual path in any form accepted by parseVirtualPath
 * @returns The collection's `pwd` joined with the parsed relative path, or
 *          null when the path cannot be parsed or the collection is unknown
 */
export function resolveVirtualPath(db, virtualPath) {
    const location = parseVirtualPath(virtualPath);
    // The collection lookup only happens once parsing has succeeded.
    const collection = location ? getCollectionByName(db, location.collectionName) : null;
    if (!location || !collection) {
        return null;
    }
    return resolve(collection.pwd, location.path);
}
/**
 * Convert an absolute filesystem path to a virtual path.
 *
 * Walks the collections returned by getStoreCollections(db) in order and
 * returns the virtual path for the first collection that both contains
 * `absolutePath` and has an active `documents` row for the resulting
 * relative path.
 *
 * @param db - Database handle exposing `prepare(sql).get(...)`
 * @param absolutePath - Absolute filesystem path of a file (or of a collection root)
 * @returns The `qmd://collection/relative/path` string, or null if the path is
 *          not an active document of any collection
 */
export function toVirtualPath(db, absolutePath) {
    // Prepared on first use, then reused for every further candidate collection.
    let activeDocLookup = null;
    for (const coll of getStoreCollections(db)) {
        // A root configured with trailing slashes ("/notes/") must still match
        // "/notes/a.md"; appending "/" to the raw value would give "/notes//"
        // and the collection would never match.
        const root = coll.path.replace(/\/+$/, '');
        let relativePath;
        if (absolutePath === root) {
            relativePath = '';
        }
        else if (absolutePath.startsWith(`${root}/`)) {
            relativePath = absolutePath.slice(root.length + 1);
        }
        else {
            continue;
        }
        // NOTE(review): reindexCollection stores `handelize(relativeFile)` in
        // documents.path, while this lookup uses the raw relative path —
        // confirm the two forms agree for every file name.
        if (activeDocLookup === null) {
            activeDocLookup = db.prepare(`
                SELECT d.path
                FROM documents d
                WHERE d.collection = ? AND d.path = ? AND d.active = 1
                LIMIT 1
            `);
        }
        // Only paths that are actually indexed and active get a virtual path.
        const doc = activeDocLookup.get(coll.name, relativePath);
        if (doc) {
            return buildVirtualPath(coll.name, relativePath);
        }
    }
    return null;
}
- // =============================================================================
- // Database initialization
- // =============================================================================
/**
 * Build the Error reported when the sqlite-vec extension cannot be used.
 *
 * @param reason - Short description of what went wrong; inserted after the
 *                 first sentence and followed by ". "
 * @returns A new Error carrying the reason plus installation guidance
 */
function createSqliteVecUnavailableError(reason) {
    const message = [
        "sqlite-vec extension is unavailable. ",
        `${reason}. `,
        "Install Homebrew SQLite so the sqlite-vec extension can be loaded, ",
        "and set BREW_PREFIX if Homebrew is installed in a non-standard location.",
    ].join("");
    return new Error(message);
}
/**
 * Turn any thrown value into a printable message.
 *
 * @param err - Whatever was thrown
 * @returns `err.message` for Error instances, `String(err)` for everything else
 */
function getErrorMessage(err) {
    if (err instanceof Error) {
        return err.message;
    }
    return String(err);
}
/**
 * Probe the connection for a working sqlite-vec extension.
 *
 * Runs `SELECT vec_version()` and requires a non-empty string result.
 *
 * @param db - Database handle exposing `prepare(sql).get()`
 * @throws Error built by createSqliteVecUnavailableError when the query fails
 *         or returns no usable version string
 */
export function verifySqliteVecLoaded(db) {
    let failure = null;
    try {
        const probe = db.prepare(`SELECT vec_version() AS version`).get();
        const version = probe?.version;
        if (typeof version !== "string" || !version) {
            failure = "vec_version() returned no version";
        }
    }
    catch (err) {
        failure = getErrorMessage(err);
    }
    if (failure !== null) {
        throw createSqliteVecUnavailableError(`sqlite-vec probe failed (${failure})`);
    }
}
// Tri-state flag for the sqlite-vec extension: null until initializeDatabase()
// has run, then true when the extension loaded and passed the probe, false
// when either step threw.
let _sqliteVecAvailable = null;
/**
 * Prepare an opened database for use by the store.
 *
 * In order: tries to load and verify sqlite-vec (recording the outcome in
 * `_sqliteVecAvailable`), sets the WAL and foreign-key pragmas, drops the
 * legacy `path_contexts` / `collections` tables, and creates every table,
 * index, FTS virtual table and trigger below. All CREATE statements use
 * IF NOT EXISTS.
 *
 * @param db - Database handle exposing `exec(sql)` and `prepare(sql).all()`
 */
function initializeDatabase(db) {
    try {
        loadSqliteVec(db);
        verifySqliteVecLoaded(db);
        _sqliteVecAvailable = true;
    }
    catch (err) {
        // sqlite-vec is optional — vector search won't work but FTS is fine
        _sqliteVecAvailable = false;
        console.warn(getErrorMessage(err));
    }
    db.exec("PRAGMA journal_mode = WAL");
    db.exec("PRAGMA foreign_keys = ON");
    // Drop legacy tables that are now managed in YAML
    db.exec(`DROP TABLE IF EXISTS path_contexts`);
    db.exec(`DROP TABLE IF EXISTS collections`);
    // Content-addressable storage - the source of truth for document content
    db.exec(`
        CREATE TABLE IF NOT EXISTS content (
        hash TEXT PRIMARY KEY,
        doc TEXT NOT NULL,
        created_at TEXT NOT NULL
        )
    `);
    // Documents table - file system layer mapping virtual paths to content hashes
    // Collections are now managed in ~/.config/qmd/index.yml
    db.exec(`
        CREATE TABLE IF NOT EXISTS documents (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        collection TEXT NOT NULL,
        path TEXT NOT NULL,
        title TEXT NOT NULL,
        hash TEXT NOT NULL,
        created_at TEXT NOT NULL,
        modified_at TEXT NOT NULL,
        active INTEGER NOT NULL DEFAULT 1,
        FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
        UNIQUE(collection, path)
        )
    `);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents(collection, active)`);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(hash)`);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_path ON documents(path, active)`);
    // Cache table for LLM API calls
    db.exec(`
        CREATE TABLE IF NOT EXISTS llm_cache (
        hash TEXT PRIMARY KEY,
        result TEXT NOT NULL,
        created_at TEXT NOT NULL
        )
    `);
    // Content vectors
    // Schema migration: an existing content_vectors table without a `seq`
    // column is dropped together with vectors_vec, then recreated below.
    const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all();
    const hasSeqColumn = cvInfo.some(col => col.name === 'seq');
    if (cvInfo.length > 0 && !hasSeqColumn) {
        db.exec(`DROP TABLE IF EXISTS content_vectors`);
        db.exec(`DROP TABLE IF EXISTS vectors_vec`);
    }
    db.exec(`
        CREATE TABLE IF NOT EXISTS content_vectors (
        hash TEXT NOT NULL,
        seq INTEGER NOT NULL DEFAULT 0,
        pos INTEGER NOT NULL DEFAULT 0,
        model TEXT NOT NULL,
        embedded_at TEXT NOT NULL,
        PRIMARY KEY (hash, seq)
        )
    `);
    // Store collections — makes the DB self-contained (no external config needed)
    db.exec(`
        CREATE TABLE IF NOT EXISTS store_collections (
        name TEXT PRIMARY KEY,
        path TEXT NOT NULL,
        pattern TEXT NOT NULL DEFAULT '**/*.md',
        ignore_patterns TEXT,
        include_by_default INTEGER DEFAULT 1,
        update_command TEXT,
        context TEXT
        )
    `);
    // Store config — key-value metadata (e.g. config_hash for sync optimization)
    db.exec(`
        CREATE TABLE IF NOT EXISTS store_config (
        key TEXT PRIMARY KEY,
        value TEXT
        )
    `);
    // FTS - index filepath (collection/path), title, and content
    db.exec(`
        CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
        filepath, title, body,
        tokenize='porter unicode61'
        )
    `);
    // Triggers to keep FTS in sync
    // Insert: only active documents get an FTS row; the body is read from the
    // content table via the new row's hash.
    db.exec(`
        CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents
        WHEN new.active = 1
        BEGIN
        INSERT INTO documents_fts(rowid, filepath, title, body)
        SELECT
        new.id,
        new.collection || '/' || new.path,
        new.title,
        (SELECT doc FROM content WHERE hash = new.hash)
        WHERE new.active = 1;
        END
    `);
    // Delete: remove the FTS row that shares the deleted document's id.
    db.exec(`
        CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
        DELETE FROM documents_fts WHERE rowid = old.id;
        END
    `);
    // Update: remove the FTS row when the document became inactive, otherwise
    // rewrite it from the updated columns.
    db.exec(`
        CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents
        BEGIN
        -- Delete from FTS if no longer active
        DELETE FROM documents_fts WHERE rowid = old.id AND new.active = 0;
        -- Update FTS if still/newly active
        INSERT OR REPLACE INTO documents_fts(rowid, filepath, title, body)
        SELECT
        new.id,
        new.collection || '/' || new.path,
        new.title,
        (SELECT doc FROM content WHERE hash = new.hash)
        WHERE new.active = 1;
        END
    `);
}
/**
 * Convert a `store_collections` row into a named-collection object.
 *
 * `name`, `path` and `pattern` are always copied. The optional keys are only
 * present when the row carries a value for them:
 * - `ignore`: JSON-decoded `ignore_patterns`
 * - `includeByDefault`: set to false only when `include_by_default` is 0
 * - `update`: `update_command`
 * - `context`: JSON-decoded `context`
 *
 * @param row - Row object read from store_collections
 * @returns The collection object described above
 */
function rowToNamedCollection(row) {
    const collection = {
        name: row.name,
        path: row.path,
        pattern: row.pattern,
    };
    if (row.ignore_patterns) {
        collection.ignore = JSON.parse(row.ignore_patterns);
    }
    if (row.include_by_default === 0) {
        collection.includeByDefault = false;
    }
    if (row.update_command) {
        collection.update = row.update_command;
    }
    if (row.context) {
        collection.context = JSON.parse(row.context);
    }
    return collection;
}
/**
 * Load every collection stored in `store_collections`.
 *
 * @param db - Database handle exposing `prepare(sql).all()`
 * @returns One named-collection object per row, in query order
 */
export function getStoreCollections(db) {
    const collections = [];
    for (const row of db.prepare(`SELECT * FROM store_collections`).all()) {
        collections.push(rowToNamedCollection(row));
    }
    return collections;
}
/**
 * Load a single collection from `store_collections` by name.
 *
 * @param db - Database handle exposing `prepare(sql).get(...)`
 * @param name - Collection name to look up
 * @returns The named-collection object, or null when no row matches
 */
export function getStoreCollection(db, name) {
    const found = db.prepare(`SELECT * FROM store_collections WHERE name = ?`).get(name);
    return found == null ? null : rowToNamedCollection(found);
}
/**
 * Read the global context text from `store_config`.
 *
 * @param db - Database handle exposing `prepare(sql).get()`
 * @returns The stored text, or undefined when the row is missing or its
 *          value is empty/NULL
 */
export function getStoreGlobalContext(db) {
    const stored = db.prepare(`SELECT value FROM store_config WHERE key = 'global_context'`).get();
    // A missing row, a NULL value and the empty string all mean "not set".
    return stored?.value || undefined;
}
/**
 * List every context entry held in the store.
 *
 * The global context (if set) comes first as
 * `{ collection: "*", path: "/", context }`, followed by one entry per
 * path key of each collection's JSON-encoded context map.
 *
 * @param db - Database handle exposing `prepare(sql).get()/all()`
 * @returns Array of `{ collection, path, context }` objects
 */
export function getStoreContexts(db) {
    const globalCtx = getStoreGlobalContext(db);
    const results = globalCtx
        ? [{ collection: "*", path: "/", context: globalCtx }]
        : [];
    const rows = db.prepare(`SELECT name, context FROM store_collections WHERE context IS NOT NULL`).all();
    const perCollection = rows.flatMap(row => Object.entries(JSON.parse(row.context))
        .map(([path, context]) => ({ collection: row.name, path, context })));
    results.push(...perCollection);
    return results;
}
/**
 * Insert a collection into `store_collections`, or overwrite every column of
 * the existing row with the same name.
 *
 * @param db - Database handle exposing `prepare(sql).run(...)`
 * @param name - Collection name (primary key)
 * @param collection - Collection settings; reads `path`, `pattern`, `ignore`,
 *                     `includeByDefault`, `update` and `context`
 */
export function upsertStoreCollection(db, name, collection) {
    const statement = db.prepare(`
        INSERT INTO store_collections (name, path, pattern, ignore_patterns, include_by_default, update_command, context)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(name) DO UPDATE SET
        path = excluded.path,
        pattern = excluded.pattern,
        ignore_patterns = excluded.ignore_patterns,
        include_by_default = excluded.include_by_default,
        update_command = excluded.update_command,
        context = excluded.context
    `);
    // Column values, with the same fallbacks the table defaults imply.
    const pattern = collection.pattern || '**/*.md';
    const ignorePatterns = collection.ignore ? JSON.stringify(collection.ignore) : null;
    const includeByDefault = collection.includeByDefault === false ? 0 : 1;
    const updateCommand = collection.update || null;
    const context = collection.context ? JSON.stringify(collection.context) : null;
    statement.run(name, collection.path, pattern, ignorePatterns, includeByDefault, updateCommand, context);
}
/**
 * Delete a collection row from `store_collections`.
 *
 * @param db - Database handle exposing `prepare(sql).run(...)`
 * @param name - Name of the collection to delete
 * @returns true when a row was deleted, false when none matched
 */
export function deleteStoreCollection(db, name) {
    const { changes } = db.prepare(`DELETE FROM store_collections WHERE name = ?`).run(name);
    return changes > 0;
}
/**
 * Rename a row in `store_collections`.
 *
 * Only the `store_collections` table is touched by this function.
 *
 * @param db - Database handle exposing `prepare(sql).get()/run()`
 * @param oldName - Current collection name
 * @param newName - Desired collection name
 * @returns true when a row was renamed, false when `oldName` did not exist
 * @throws Error when a collection called `newName` already exists
 */
export function renameStoreCollection(db, oldName, newName) {
    const clash = db.prepare(`SELECT name FROM store_collections WHERE name = ?`).get(newName);
    if (clash != null) {
        throw new Error(`Collection '${newName}' already exists`);
    }
    const { changes } = db.prepare(`UPDATE store_collections SET name = ? WHERE name = ?`).run(newName, oldName);
    return changes > 0;
}
/**
 * Set the context text for one path inside a collection's context map.
 *
 * @param db - Database handle exposing `prepare(sql).get()/run()`
 * @param collectionName - Collection whose context map is edited
 * @param path - Key inside the context map
 * @param text - Context text to store under `path`
 * @returns false when the collection does not exist, true after the write
 */
export function updateStoreContext(db, collectionName, path, text) {
    const current = db.prepare(`SELECT context FROM store_collections WHERE name = ?`).get(collectionName);
    if (current == null) {
        return false;
    }
    // Start from the stored map, or an empty one when nothing is stored yet.
    let contextByPath = {};
    if (current.context) {
        contextByPath = JSON.parse(current.context);
    }
    contextByPath[path] = text;
    const serialized = JSON.stringify(contextByPath);
    db.prepare(`UPDATE store_collections SET context = ? WHERE name = ?`).run(serialized, collectionName);
    return true;
}
/**
 * Remove the context entry for one path from a collection's context map.
 *
 * When the last entry is removed the column is set to NULL rather than "{}".
 *
 * @param db - Database handle exposing `prepare(sql).get()/run()`
 * @param collectionName - Collection whose context map is edited
 * @param path - Key to remove from the context map
 * @returns true when an entry was removed; false when the collection does not
 *          exist, has no context, or has no entry stored under `path`
 */
export function removeStoreContext(db, collectionName, path) {
    const row = db.prepare(`SELECT context FROM store_collections WHERE name = ?`).get(collectionName);
    if (row == null || !row.context) {
        return false;
    }
    const ctxMap = JSON.parse(row.context);
    // Own-property check: the `in` operator also matches inherited keys such
    // as "toString" or "constructor", which would report a removal (and
    // rewrite the row) even though nothing was stored under that path.
    if (!Object.prototype.hasOwnProperty.call(ctxMap, path)) {
        return false;
    }
    delete ctxMap[path];
    const newCtx = Object.keys(ctxMap).length > 0 ? JSON.stringify(ctxMap) : null;
    db.prepare(`UPDATE store_collections SET context = ? WHERE name = ?`).run(newCtx, collectionName);
    return true;
}
/**
 * Store or clear the global context text in `store_config`.
 *
 * @param db - Database handle exposing `prepare(sql).run(...)`
 * @param value - Text to store; `undefined` deletes the row instead
 */
export function setStoreGlobalContext(db, value) {
    const clearing = value === undefined;
    const sql = clearing
        ? `DELETE FROM store_config WHERE key = 'global_context'`
        : `INSERT INTO store_config (key, value) VALUES ('global_context', ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value`;
    const params = clearing ? [] : [value];
    db.prepare(sql).run(...params);
}
/**
 * Mirror an external config object (YAML/inline) into the SQLite store.
 *
 * The external config is authoritative: its collections are upserted, stored
 * collections it does not mention are deleted, and the global context is set
 * or cleared to match. A SHA-256 of the JSON-serialized config is kept under
 * the `config_hash` key so an unchanged config returns without any writes.
 *
 * @param db - Database handle exposing `prepare(sql).get()/all()/run()`
 * @param config - Config object; reads `collections` and `global_context`
 */
export function syncConfigToDb(db, config) {
    const hash = createHash('sha256').update(JSON.stringify(config)).digest('hex');
    const stored = db.prepare(`SELECT value FROM store_config WHERE key = 'config_hash'`).get();
    if (stored?.value === hash) {
        // Same config as the last sync: nothing to do.
        return;
    }
    // Upsert everything the config declares.
    const configNames = new Set(Object.keys(config.collections));
    Object.entries(config.collections).forEach(([name, coll]) => {
        upsertStoreCollection(db, name, coll);
    });
    // Drop stored collections the config no longer declares.
    db.prepare(`SELECT name FROM store_collections`).all()
        .filter(row => !configNames.has(row.name))
        .forEach(row => {
            db.prepare(`DELETE FROM store_collections WHERE name = ?`).run(row.name);
        });
    // An undefined global_context clears the stored value.
    setStoreGlobalContext(db, config.global_context);
    // Remember what was synced so the next call can short-circuit.
    db.prepare(`INSERT INTO store_config (key, value) VALUES ('config_hash', ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value`).run(hash);
}
/**
 * Report whether sqlite-vec was loaded and verified.
 *
 * @returns true only when the module-level flag is exactly `true`; false while
 *          it is still `null` or after a failed load
 */
export function isSqliteVecAvailable() {
    const loaded = _sqliteVecAvailable === true;
    return loaded;
}
/**
 * Make sure the `vectors_vec` virtual table exists with the expected shape.
 *
 * - An existing table with matching dimensions, a `hash_seq` column and the
 *   cosine distance metric is left alone.
 * - An existing table with a different dimension count is an error.
 * - Any other existing table is dropped and recreated.
 *
 * @param db - Database handle exposing `prepare(sql).get()` and `exec(sql)`
 * @param dimensions - Embedding vector length the table must hold
 * @throws Error when sqlite-vec is unavailable or the stored dimensions differ
 */
function ensureVecTableInternal(db, dimensions) {
    if (!_sqliteVecAvailable) {
        throw new Error("sqlite-vec is not available. Vector operations require a SQLite build with extension loading support.");
    }
    const existingTable = db.prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (existingTable) {
        const { sql } = existingTable;
        // The dimension count is read back from the table's own DDL text.
        const dimsText = sql.match(/float\[(\d+)\]/)?.[1];
        const schemaIsCurrent = sql.includes('hash_seq') && sql.includes('distance_metric=cosine');
        const existingDims = dimsText ? parseInt(dimsText, 10) : null;
        if (existingDims === dimensions && schemaIsCurrent) {
            return;
        }
        if (existingDims !== null && existingDims !== dimensions) {
            throw new Error(`Embedding dimension mismatch: existing vectors are ${existingDims}d but the current model produces ${dimensions}d. ` +
                `Run 'qmd embed -f' to re-embed with the new model.`);
        }
        // Right size (or unknown size) but outdated layout: rebuild it.
        db.exec("DROP TABLE IF EXISTS vectors_vec");
    }
    db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}] distance_metric=cosine)`);
}
/**
 * Re-index a single collection by scanning the filesystem and updating the database.
 * Produces no console output and does no db lifecycle management.
 *
 * Files matching `globPattern` under `collectionPath` are read and hashed;
 * new files are inserted, changed files are updated, and active documents
 * whose path was not seen in this scan are deactivated. Orphaned content is
 * cleaned up at the end.
 *
 * @param store - Store object; only `store.db` is used
 * @param collectionPath - Directory that is scanned
 * @param globPattern - Glob pattern handed to fastGlob, relative to `collectionPath`
 * @param collectionName - Collection name recorded on each document row
 * @param options - Optional `ignorePatterns` (extra glob ignores) and
 *                  `onProgress({ file, current, total })`, called once per scanned file
 * @returns `{ indexed, updated, unchanged, removed, orphanedCleaned }`
 */
export async function reindexCollection(store, collectionPath, globPattern, collectionName, options) {
    const db = store.db;
    const now = new Date().toISOString();
    const excludeDirs = ["node_modules", ".git", ".cache", "vendor", "dist", "build"];
    const allIgnore = [
        ...excludeDirs.map(d => `**/${d}/**`),
        ...(options?.ignorePatterns || []),
    ];
    const allFiles = await fastGlob(globPattern, {
        cwd: collectionPath,
        onlyFiles: true,
        followSymbolicLinks: false,
        dot: false,
        ignore: allIgnore,
    });
    // Filter hidden files/folders
    const files = allFiles.filter(file => !file.split("/").some(part => part.startsWith(".")));
    const total = files.length;
    let indexed = 0;
    let updated = 0;
    let unchanged = 0;
    let processed = 0;
    // Single place where a file is counted and reported, so every outcome
    // (skipped, unchanged, updated, inserted) advances the progress callback.
    const reportProgress = (file) => {
        processed++;
        options?.onProgress?.({ file, current: processed, total });
    };
    const seenPaths = new Set();
    for (const relativeFile of files) {
        const filepath = getRealPath(resolve(collectionPath, relativeFile));
        const path = handelize(relativeFile);
        // Recorded before reading: unreadable or empty files still count as
        // present and are therefore not deactivated below.
        seenPaths.add(path);
        let content;
        try {
            content = readFileSync(filepath, "utf-8");
        }
        catch {
            // Unreadable file: skip it, but keep the progress count moving.
            reportProgress(relativeFile);
            continue;
        }
        if (!content.trim()) {
            // Empty / whitespace-only file: nothing to index. Progress is
            // still reported so `current` can reach `total`.
            reportProgress(relativeFile);
            continue;
        }
        const hash = await hashContent(content);
        const title = extractTitle(content, relativeFile);
        const existing = findActiveDocument(db, collectionName, path);
        if (!existing) {
            // New document.
            indexed++;
            insertContent(db, hash, content, now);
            const stat = statSync(filepath);
            insertDocument(db, collectionName, path, title, hash, stat ? new Date(stat.birthtime).toISOString() : now, stat ? new Date(stat.mtime).toISOString() : now);
        }
        else if (existing.hash !== hash) {
            // Content changed: store the new content and repoint the document.
            insertContent(db, hash, content, now);
            const stat = statSync(filepath);
            updateDocument(db, existing.id, title, hash, stat ? new Date(stat.mtime).toISOString() : now);
            updated++;
        }
        else if (existing.title !== title) {
            // Same content, but the extracted title differs.
            updateDocumentTitle(db, existing.id, title, now);
            updated++;
        }
        else {
            unchanged++;
        }
        reportProgress(relativeFile);
    }
    // Deactivate documents that no longer exist
    let removed = 0;
    for (const path of getActiveDocumentPaths(db, collectionName)) {
        if (!seenPaths.has(path)) {
            deactivateDocument(db, collectionName, path);
            removed++;
        }
    }
    const orphanedCleaned = cleanupOrphanedContent(db);
    return { indexed, updated, unchanged, removed, orphanedCleaned };
}
/**
 * Validate an optional numeric option.
 *
 * @param name - Option name, used in the error message
 * @param value - Caller-supplied value; `undefined` means "not provided"
 * @param fallback - Value returned when `value` is undefined
 * @returns `fallback` or the validated `value`
 * @throws Error when `value` is provided but is not an integer >= 1
 */
function validatePositiveIntegerOption(name, value, fallback) {
    if (value === undefined) {
        return fallback;
    }
    const isPositiveInteger = Number.isInteger(value) && value >= 1;
    if (isPositiveInteger) {
        return value;
    }
    throw new Error(`${name} must be a positive integer`);
}
/**
 * Resolve the batching limits used by the embedding run.
 *
 * @param options - Optional object; reads `maxDocsPerBatch` and `maxBatchBytes`
 * @returns `{ maxDocsPerBatch, maxBatchBytes }`, each either the validated
 *          caller value or the module default
 * @throws Error when a provided limit is not a positive integer
 */
function resolveEmbedOptions(options) {
    const maxDocsPerBatch = validatePositiveIntegerOption("maxDocsPerBatch", options?.maxDocsPerBatch, DEFAULT_EMBED_MAX_DOCS_PER_BATCH);
    const maxBatchBytes = validatePositiveIntegerOption("maxBatchBytes", options?.maxBatchBytes, DEFAULT_EMBED_MAX_BATCH_BYTES);
    return { maxDocsPerBatch, maxBatchBytes };
}
/**
 * List active content hashes that have no embedding yet.
 *
 * A hash counts as pending when `content_vectors` has no row for it with
 * `seq = 0`. Each hash appears once even when several documents share it.
 *
 * @param db - Database handle exposing `prepare(sql).all()`
 * @returns Rows of `{ hash, path, collection, bytes }` ordered by path, where
 *          `bytes` is the byte length of the stored document text
 */
function getPendingEmbeddingDocs(db) {
    // When identical content is indexed under several collections, the
    // MIN() aggregates pick one path and one collection per hash in a
    // deterministic way. Each MIN() is evaluated independently. The bytes
    // (and therefore the chunks) are the same whichever collection wins;
    // the per-collection chunkStrategy lookup resolves through the
    // collection chosen here. See Phase 2 design notes (i-bud0h8vu).
    const pendingQuery = db.prepare(`
        SELECT d.hash, MIN(d.path) as path, MIN(d.collection) as collection, length(CAST(c.doc AS BLOB)) as bytes
        FROM documents d
        JOIN content c ON d.hash = c.hash
        LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
        WHERE d.active = 1 AND v.hash IS NULL
        GROUP BY d.hash
        ORDER BY MIN(d.path)
    `);
    return pendingQuery.all();
}
/**
 * Split documents into consecutive batches bounded by count and by size.
 *
 * A new batch is started before adding a document when the current batch
 * already holds `maxDocsPerBatch` documents, or when it is non-empty and the
 * document would push its byte total above `maxBatchBytes`. A single document
 * larger than `maxBatchBytes` therefore still gets a batch of its own.
 *
 * @param docs - Documents with a numeric `bytes` field (negative counts as 0)
 * @param maxDocsPerBatch - Maximum number of documents per batch
 * @param maxBatchBytes - Byte budget per batch
 * @returns Array of batches, preserving input order
 */
function buildEmbeddingBatches(docs, maxDocsPerBatch, maxBatchBytes) {
    const batches = [];
    let pending = [];
    let pendingBytes = 0;
    const sealPending = () => {
        batches.push(pending);
        pending = [];
        pendingBytes = 0;
    };
    for (const doc of docs) {
        const size = Math.max(0, doc.bytes);
        const batchIsFull = pending.length >= maxDocsPerBatch;
        const overBudget = pending.length > 0 && (pendingBytes + size) > maxBatchBytes;
        if (batchIsFull || overBudget) {
            sealPending();
        }
        pending.push(doc);
        pendingBytes += size;
    }
    // Flush whatever is left over after the last document.
    if (pending.length > 0) {
        batches.push(pending);
    }
    return batches;
}
/**
 * Attach the stored document text to each entry of a batch.
 *
 * @param db - Database handle exposing `prepare(sql).all(...)`
 * @param batch - Batch entries, each with a `hash`
 * @returns Copies of the batch entries in the same order, each with a `body`
 *          field ("" when the content table has no row for the hash)
 */
function getEmbeddingDocsForBatch(db, batch) {
    if (batch.length === 0) {
        return [];
    }
    const hashes = batch.map(doc => doc.hash);
    // One "?" per hash keeps the IN list fully parameterized.
    const placeholders = hashes.map(() => "?").join(",");
    const rows = db.prepare(`
        SELECT hash, doc as body
        FROM content
        WHERE hash IN (${placeholders})
    `).all(...hashes);
    const bodyByHash = new Map();
    for (const { hash, body } of rows) {
        bodyByHash.set(hash, body);
    }
    return batch.map(doc => ({ ...doc, body: bodyByHash.get(doc.hash) ?? "" }));
}
/**
 * Invoke `body` with a session-shaped object offering `signal` and `isValid`.
 *
 * Without a `provider`, a real session is obtained through
 * `withLLMSessionForLlm(getLlm(store), body, options)`, so `body` can use
 * `session.embed` / `session.embedBatch` for local embedding.
 *
 * With a `provider`, neither `getLlm` nor `withLLMSessionForLlm` is called
 * (so node-llama-cpp is not warmed up on remote-only deployments —
 * i-08ovbvtb, follow-up to i-qkarfffa). `body` receives a stub backed by an
 * AbortController: `signal` and `isValid` work, while `embed`, `embedBatch`,
 * `expandQuery` and `rerank` reject if called, because embedding is meant to
 * go through the provider. The stub's signal is aborted once `body` settles.
 *
 * @param store - Store passed to getLlm on the local path
 * @param provider - Embedding provider, or a falsy value for the local path
 * @param body - Async callback receiving the session
 * @param options - Forwarded to withLLMSessionForLlm on the local path
 * @returns Whatever `body` resolves to
 */
async function withEmbedSession(store, provider, body, options) {
    if (!provider) {
        return withLLMSessionForLlm(getLlm(store), body, options);
    }
    const controller = new AbortController();
    // Every LLM-only method on the stub rejects with the same message shape.
    const rejectLlmCall = (method) => async () => {
        throw new Error(`withEmbedSession: provider supplied — session.${method} must not be called`);
    };
    const stubSession = {
        get signal() { return controller.signal; },
        get isValid() { return !controller.signal.aborted; },
        embed: rejectLlmCall("embed"),
        embedBatch: rejectLlmCall("embedBatch"),
        expandQuery: rejectLlmCall("expandQuery"),
        rerank: rejectLlmCall("rerank"),
    };
    try {
        return await body(stubSession);
    }
    finally {
        controller.abort();
    }
}
/**
 * Generate vector embeddings for documents that need them.
 * Produces no db lifecycle management; the only console output is the
 * warnings emitted when the session expires or the error rate is too high.
 *
 * Embedding calls go through `options.embedProvider` when one is supplied,
 * otherwise through the LLM session created by withEmbedSession (the store's
 * LlamaCpp instance if set, otherwise the global singleton).
 *
 * Options read here: `model`, `maxDocsPerBatch`, `maxBatchBytes`,
 * `embedProvider`, `force`, `chunkStrategy`, `onProgress`.
 *
 * @param store - Store object; uses `store.db` and `store.ensureVecTable`
 * @param options - Optional settings listed above
 * @returns `{ docsProcessed, chunksEmbedded, errors, durationMs }`
 */
export async function generateEmbeddings(store, options) {
    const db = store.db;
    const model = options?.model ?? DEFAULT_EMBED_MODEL;
    // One timestamp is shared by every embedding row written in this run.
    const now = new Date().toISOString();
    const { maxDocsPerBatch, maxBatchBytes } = resolveEmbedOptions(options);
    const encoder = new TextEncoder();
    // Migration safety: if an embedProvider is supplied, verify its model id
    // matches the existing content_vectors rows (unless we're about to clear
    // them via `force`). This must happen BEFORE we clear vectors so users
    // who pass `--force` aren't blocked.
    if (options?.embedProvider && !options.force) {
        const existing = getDistinctEmbeddingModels(db);
        assertModelCompatible(options.embedProvider.getModelId(), existing);
    }
    if (options?.force) {
        clearAllEmbeddings(db);
    }
    const docsToEmbed = getPendingEmbeddingDocs(db);
    if (docsToEmbed.length === 0) {
        return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 };
    }
    const totalBytes = docsToEmbed.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
    const totalDocs = docsToEmbed.length;
    const startTime = Date.now();
    // Per-collection chunkStrategy lookup (Phase 2 — i-bud0h8vu). YAML
    // `chunkStrategy` on a collection wins over `options.chunkStrategy`
    // (global CLI flag); falls back to the global option, then to
    // chunkDocumentByTokens' own "regex" default when neither is set.
    // Opt-in per collection — collections without the field are untouched.
    const collectionStrategies = new Map();
    try {
        const { listCollections: listYamlCollections } = await import("./collections.js");
        for (const c of listYamlCollections()) {
            if (c.chunkStrategy)
                collectionStrategies.set(c.name, c.chunkStrategy);
        }
    }
    catch {
        // If YAML config is missing/unreadable, fall back silently to the
        // global strategy — no collection overrides. Keeps SDK/inline
        // callers that never touch ~/.config/qmd working.
    }
    // Provider routing — when an EmbeddingProvider is supplied, embed calls go
    // through it (HTTP, GPU worker, etc.). Otherwise, use the LLM session path.
    // The outer session is still created for its abort signal (chunking uses
    // `session.signal` for cooperative cancellation).
    const provider = options?.embedProvider;
    // Model id passed to every embed call and stored with each embedding row.
    const providerModel = provider?.getModelId() ?? model;
    // Resolve `embedModelUri` (used for formatting prefixes etc.) lazily —
    // when `provider` is set, take it from the provider; otherwise fall back
    // to the local LlamaCpp's embed model name. Accessing `getLlm(store)` is
    // deferred to the non-provider branch so remote-only deployments do not
    // construct a `LlamaCpp` instance just to read its embedModelName.
    const embedModelUri = provider
        ? provider.getModelId()
        : getLlm(store).embedModelName;
    // Run the embedding loop inside a session-scoped wrapper. When `provider`
    // is set, this short-circuits the local LLM warm-up entirely (i-08ovbvtb).
    const result = await withEmbedSession(store, provider, async (session) => {
        let chunksEmbedded = 0;
        let errors = 0;
        let bytesProcessed = 0;
        let totalChunks = 0;
        let vectorTableInitialized = false;
        // Number of chunks sent per embedMany call.
        const BATCH_SIZE = 32;
        const batches = buildEmbeddingBatches(docsToEmbed, maxDocsPerBatch, maxBatchBytes);
        // Embedding helpers — single point of provider/session selection.
        // Both return the same shape as ILLMSession.embed/embedBatch so the
        // rest of the loop is unchanged.
        const embedOne = async (text, modelArg) => {
            if (provider) {
                // The session signal is only forwarded to providers whose kind is 'local'.
                const sig = provider.kind === 'local' ? session.signal : undefined;
                const r = await provider.embed(text, { model: modelArg, signal: sig });
                return r ? { embedding: r.embedding, model: r.model } : null;
            }
            return session.embed(text, { model: modelArg });
        };
        const embedMany = async (texts, modelArg) => {
            if (provider) {
                const sig = provider.kind === 'local' ? session.signal : undefined;
                const r = await provider.embedBatch(texts, { model: modelArg, signal: sig });
                return r.map((x) => (x ? { embedding: x.embedding, model: x.model } : null));
            }
            return session.embedBatch(texts, { model: modelArg });
        };
        // JS-only token estimator for the provider path. Char-based with
        // avgCharsPerToken=3 — matches the heuristic the chunker already
        // uses for its initial char-space pass, so the safety re-split is a
        // near no-op while populating the `tokens` field with a stable
        // estimate. CRITICAL: avoids loading node-llama-cpp on remote-only
        // deployments (`QMD_EMBED_ENDPOINT=...`). i-1rqixh6m DoD #1.
        const chunkTokenizer = provider
            ? (text) => Math.ceil(text.length / 3)
            : undefined;
        for (const batchMeta of batches) {
            // Abort early if session has been invalidated
            if (!session.isValid) {
                console.warn(`⚠ Session expired — skipping remaining document batches`);
                break;
            }
            // Document bodies are loaded one batch at a time.
            const batchDocs = getEmbeddingDocsForBatch(db, batchMeta);
            const batchChunks = [];
            const batchBytes = batchMeta.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
            for (const doc of batchDocs) {
                if (!doc.body.trim())
                    continue;
                const title = extractTitle(doc.body, doc.path);
                const perCollectionStrategy = collectionStrategies.get(doc.collection);
                const chunkStrategy = perCollectionStrategy ?? options?.chunkStrategy;
                const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, chunkStrategy, session.signal, chunkTokenizer);
                for (let seq = 0; seq < chunks.length; seq++) {
                    batchChunks.push({
                        hash: doc.hash,
                        title,
                        text: chunks[seq].text,
                        seq,
                        pos: chunks[seq].pos,
                        tokens: chunks[seq].tokens,
                        bytes: encoder.encode(chunks[seq].text).length,
                    });
                }
            }
            totalChunks += batchChunks.length;
            if (batchChunks.length === 0) {
                // Nothing to embed in this batch; still count its bytes as done.
                bytesProcessed += batchBytes;
                options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors });
                continue;
            }
            if (!vectorTableInitialized) {
                // Embed the first chunk once up front, only to learn the vector
                // length needed to create/verify the vector table.
                const firstChunk = batchChunks[0];
                const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title, embedModelUri);
                const firstResult = await embedOne(firstText, providerModel);
                if (!firstResult) {
                    throw new Error("Failed to get embedding dimensions from first chunk");
                }
                store.ensureVecTable(firstResult.embedding.length);
                vectorTableInitialized = true;
            }
            const totalBatchChunkBytes = batchChunks.reduce((sum, chunk) => sum + chunk.bytes, 0);
            let batchChunkBytesProcessed = 0;
            for (let batchStart = 0; batchStart < batchChunks.length; batchStart += BATCH_SIZE) {
                // Abort early if session has been invalidated (e.g. max duration exceeded)
                if (!session.isValid) {
                    const remaining = batchChunks.length - batchStart;
                    errors += remaining;
                    console.warn(`⚠ Session expired — skipping ${remaining} remaining chunks`);
                    break;
                }
                // Abort early if error rate is too high (>80% of processed chunks failed)
                // NOTE(review): this `break` only leaves the inner chunk loop; the
                // outer document-batch loop keeps running. Also, `errors` in the
                // warning already includes `remaining` while `processed` does not.
                const processed = chunksEmbedded + errors;
                if (processed >= BATCH_SIZE && errors > processed * 0.8) {
                    const remaining = batchChunks.length - batchStart;
                    errors += remaining;
                    console.warn(`⚠ Error rate too high (${errors}/${processed}) — aborting embedding`);
                    break;
                }
                const batchEnd = Math.min(batchStart + BATCH_SIZE, batchChunks.length);
                const chunkBatch = batchChunks.slice(batchStart, batchEnd);
                const texts = chunkBatch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title, embedModelUri));
                try {
                    const embeddings = await embedMany(texts, providerModel);
                    for (let i = 0; i < chunkBatch.length; i++) {
                        const chunk = chunkBatch[i];
                        const embedding = embeddings[i];
                        if (embedding) {
                            insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), providerModel, now);
                            chunksEmbedded++;
                        }
                        else {
                            errors++;
                        }
                        batchChunkBytesProcessed += chunk.bytes;
                    }
                }
                catch {
                    // Batch failed — try individual embeddings as fallback
                    // But skip if session is already invalid (avoids N doomed retries)
                    if (!session.isValid) {
                        errors += chunkBatch.length;
                        batchChunkBytesProcessed += chunkBatch.reduce((sum, c) => sum + c.bytes, 0);
                    }
                    else {
                        for (const chunk of chunkBatch) {
                            try {
                                const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
                                const result = await embedOne(text, providerModel);
                                if (result) {
                                    insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), providerModel, now);
                                    chunksEmbedded++;
                                }
                                else {
                                    errors++;
                                }
                            }
                            catch {
                                errors++;
                            }
                            batchChunkBytesProcessed += chunk.bytes;
                        }
                    }
                }
                // Progress is reported in document bytes: scale the chunk bytes done
                // so far onto this batch's document byte total, capped at that total.
                const proportionalBytes = totalBatchChunkBytes === 0
                    ? batchBytes
                    : Math.min(batchBytes, Math.round((batchChunkBytesProcessed / totalBatchChunkBytes) * batchBytes));
                options?.onProgress?.({
                    chunksEmbedded,
                    totalChunks,
                    bytesProcessed: bytesProcessed + proportionalBytes,
                    totalBytes,
                    errors,
                });
            }
            // The whole document batch is counted as processed, whether or not
            // every chunk succeeded.
            bytesProcessed += batchBytes;
            options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors });
        }
        return { chunksEmbedded, errors };
    }, { maxDuration: 30 * 60 * 1000, name: 'generateEmbeddings' });
    return {
        // Number of pending documents found at the start of the run.
        docsProcessed: totalDocs,
        chunksEmbedded: result.chunksEmbedded,
        errors: result.errors,
        durationMs: Date.now() - startTime,
    };
}
/**
 * Create a new store instance with the given database path.
 * If no path is provided, the path returned by getDefaultDbPath() is used.
 *
 * The database is opened and passed through initializeDatabase() before the
 * store object is built. Most store methods are thin closures that forward to
 * the module-level function of the same name with `db` as the first argument.
 *
 * @param dbPath - Path to the SQLite database file
 * @returns Store instance with all methods bound to the database
 */
export function createStore(dbPath) {
    const resolvedPath = dbPath || getDefaultDbPath();
    const db = openDatabase(resolvedPath);
    initializeDatabase(db);
    const store = {
        db,
        dbPath: resolvedPath,
        close: () => db.close(),
        ensureVecTable: (dimensions) => ensureVecTableInternal(db, dimensions),
        // Index health
        getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db),
        getIndexHealth: () => getIndexHealth(db),
        getStatus: () => getStatus(db),
        // Caching
        getCacheKey,
        getCachedResult: (cacheKey) => getCachedResult(db, cacheKey),
        setCachedResult: (cacheKey, result) => setCachedResult(db, cacheKey, result),
        clearCache: () => clearCache(db),
        // Cleanup and maintenance
        deleteLLMCache: () => deleteLLMCache(db),
        deleteInactiveDocuments: () => deleteInactiveDocuments(db),
        cleanupOrphanedContent: () => cleanupOrphanedContent(db),
        cleanupOrphanedVectors: () => cleanupOrphanedVectors(db),
        vacuumDatabase: () => vacuumDatabase(db),
        // Context
        getContextForFile: (filepath) => getContextForFile(db, filepath),
        getContextForPath: (collectionName, path) => getContextForPath(db, collectionName, path),
        getCollectionByName: (name) => getCollectionByName(db, name),
        getCollectionsWithoutContext: () => getCollectionsWithoutContext(db),
        getTopLevelPathsWithoutContext: (collectionName) => getTopLevelPathsWithoutContext(db, collectionName),
        // Virtual paths
        // (the first three take no db and are exposed as-is)
        parseVirtualPath,
        buildVirtualPath,
        isVirtualPath,
        resolveVirtualPath: (virtualPath) => resolveVirtualPath(db, virtualPath),
        toVirtualPath: (absolutePath) => toVirtualPath(db, absolutePath),
        // Search
        searchFTS: (query, limit, collectionName) => searchFTS(db, query, limit, collectionName),
        searchVec: (query, model, limit, collectionName, session, precomputedEmbedding, embedProvider) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding, embedProvider),
        // Query expansion & reranking
        // `store.llm` is read at call time; it is not assigned in this function.
        expandQuery: (query, model, intent) => expandQuery(query, model, db, intent, store.llm),
        rerank: (query, documents, model, intent) => rerank(query, documents, model, db, intent, store.llm),
        // Document retrieval
        findDocument: (filename, options) => findDocument(db, filename, options),
        getDocumentBody: (doc, fromLine, maxLines) => getDocumentBody(db, doc, fromLine, maxLines),
        findDocuments: (pattern, options) => findDocuments(db, pattern, options),
        // Fuzzy matching and docid lookup
        findSimilarFiles: (query, maxDistance, limit) => findSimilarFiles(db, query, maxDistance, limit),
        matchFilesByGlob: (pattern) => matchFilesByGlob(db, pattern),
        findDocumentByDocid: (docid) => findDocumentByDocid(db, docid),
        // Document indexing operations
        insertContent: (hash, content, createdAt) => insertContent(db, hash, content, createdAt),
        insertDocument: (collectionName, path, title, hash, createdAt, modifiedAt) => insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt),
        findActiveDocument: (collectionName, path) => findActiveDocument(db, collectionName, path),
        updateDocumentTitle: (documentId, title, modifiedAt) => updateDocumentTitle(db, documentId, title, modifiedAt),
        updateDocument: (documentId, title, hash, modifiedAt) => updateDocument(db, documentId, title, hash, modifiedAt),
        deactivateDocument: (collectionName, path) => deactivateDocument(db, collectionName, path),
        getActiveDocumentPaths: (collectionName) => getActiveDocumentPaths(db, collectionName),
        // Vector/embedding operations
        getHashesForEmbedding: () => getHashesForEmbedding(db),
        clearAllEmbeddings: () => clearAllEmbeddings(db),
        insertEmbedding: (hash, seq, pos, embedding, model, embeddedAt) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt),
    };
    return store;
}
/**
 * Derive the short docid for a content hash: its first six characters.
 * Hashes shorter than six characters are returned unchanged.
 */
export function getDocid(hash) {
    const DOCID_LENGTH = 6;
    return hash.slice(0, DOCID_LENGTH);
}
/** Replace emoji/symbol codepoints with their hex representation (e.g. 🐘 → 1f418). */
function emojiToHex(str) {
    const symbolRun = /(?:\p{So}\p{Mn}?|\p{Sk})+/gu;
    const isSymbol = /\p{So}|\p{Sk}/u;
    return str.replace(symbolRun, (run) => {
        // Walk the run by code point; combining marks are dropped, every
        // symbol becomes its hex code point, and symbols are dash-separated.
        const hexParts = [];
        for (const ch of run) {
            if (isSymbol.test(ch)) {
                hexParts.push(ch.codePointAt(0).toString(16));
            }
        }
        return hexParts.join('-');
    });
}
/**
 * Handelize a path to be more token-friendly.
 * - Triple underscore `___` becomes `/` (folder separator)
 * - Everything is lowercased
 * - Emoji/symbols are replaced by their hex code points
 * - Runs of characters other than letters, digits and `$` collapse to one dash
 * - Leading/trailing dashes are trimmed from every path segment
 * - Folder structure is preserved; segments that end up empty are dropped
 * - The file extension of the last segment is preserved
 *
 * Throws when the path is empty/blank, when the filename (without extension)
 * contains no letter, digit, symbol or `$`, or when nothing is left after
 * cleaning.
 */
export function handelize(path) {
    if (!path || path.trim() === '') {
        throw new Error('handelize: path cannot be empty');
    }
    // Validate against the ORIGINAL last segment: route-style "$" filenames
    // and emoji-only names are acceptable, pure punctuation is not.
    const originalSegments = path.split('/').filter(Boolean);
    const filename = originalSegments[originalSegments.length - 1] || '';
    const stem = filename.replace(/\.[^.]+$/, '');
    if (!/[\p{L}\p{N}\p{So}\p{Sk}$]/u.test(stem)) {
        throw new Error(`handelize: path "${path}" has no valid filename content`);
    }
    // Keep letters, numbers and "$"; dash-separate everything else (dots too).
    const scrub = (text) => text
        .replace(/[^\p{L}\p{N}$]+/gu, '-')
        .replace(/^-+|-+$/g, '');
    const parts = path.replace(/___/g, '/').toLowerCase().split('/');
    const lastIndex = parts.length - 1;
    const cleaned = [];
    parts.forEach((raw, idx) => {
        // Emoji must be converted before scrubbing or they would be dashed out.
        const segment = emojiToHex(raw);
        let piece;
        if (idx === lastIndex) {
            // Filename: split off the extension so its dot survives scrubbing.
            const extMatch = segment.match(/(\.[a-z0-9]+)$/i);
            const ext = extMatch ? extMatch[1] : '';
            const namePart = ext ? segment.slice(0, -ext.length) : segment;
            piece = scrub(namePart) + ext;
        }
        else {
            piece = scrub(segment);
        }
        if (piece) {
            cleaned.push(piece);
        }
    });
    const result = cleaned.join('/');
    if (!result) {
        throw new Error(`handelize: path "${path}" resulted in empty string after processing`);
    }
    return result;
}
- // =============================================================================
- // Index health
- // =============================================================================
/**
 * Count the distinct content hashes of active documents that have no
 * `content_vectors` row with `seq = 0`, i.e. that still need embedding.
 */
export function getHashesNeedingEmbedding(db) {
    const sql = `
        SELECT COUNT(DISTINCT d.hash) as count
        FROM documents d
        LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
        WHERE d.active = 1 AND v.hash IS NULL
    `;
    const { count } = db.prepare(sql).get();
    return count;
}
/**
 * Summarize index health.
 *
 * @returns {{needsEmbedding: number, totalDocs: number, daysStale: number|null}}
 *   `needsEmbedding` comes from getHashesNeedingEmbedding, `totalDocs` is the
 *   number of active documents, and `daysStale` is the whole number of days
 *   since the newest `modified_at` among active documents (null when there
 *   is no such timestamp).
 */
export function getIndexHealth(db) {
    const MS_PER_DAY = 24 * 60 * 60 * 1000;
    const needsEmbedding = getHashesNeedingEmbedding(db);
    const { count: totalDocs } = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get();
    const newest = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get();
    const daysStale = newest?.latest
        ? Math.floor((Date.now() - new Date(newest.latest).getTime()) / MS_PER_DAY)
        : null;
    return { needsEmbedding, totalDocs, daysStale };
}
- // =============================================================================
- // Caching
- // =============================================================================
/**
 * Build the cache key for a request: the hex SHA-256 of `url` followed by the
 * JSON serialization of `body`.
 *
 * @param {string} url Request URL (hashed verbatim).
 * @param {*} body Request payload; serialized with JSON.stringify.
 * @returns {string} 64-character lowercase hex digest.
 */
export function getCacheKey(url, body) {
    const hasher = createHash("sha256");
    hasher.update(url);
    // JSON.stringify yields `undefined` (not a string) for an undefined body,
    // which hash.update() rejects with a TypeError. Hash nothing extra in that
    // case: JSON text is never empty, so this cannot collide with a real body,
    // and keys for defined bodies are unchanged.
    hasher.update(JSON.stringify(body) ?? "");
    return hasher.digest("hex");
}
/**
 * Look up a cached result in `llm_cache` by key.
 * Returns null when there is no row or when the stored result is falsy.
 */
export function getCachedResult(db, cacheKey) {
    const hit = db.prepare(`SELECT result FROM llm_cache WHERE hash = ?`).get(cacheKey);
    if (!hit) {
        return null;
    }
    return hit.result || null;
}
/**
 * Store (or overwrite) a cached result under `cacheKey`, stamped with the
 * current time. Roughly one call in a hundred also prunes the table down to
 * the 1000 most recently created entries.
 */
export function setCachedResult(db, cacheKey, result) {
    const PRUNE_PROBABILITY = 0.01;
    const createdAt = new Date().toISOString();
    const upsert = db.prepare(`INSERT OR REPLACE INTO llm_cache (hash, result, created_at) VALUES (?, ?, ?)`);
    upsert.run(cacheKey, result, createdAt);
    const shouldPrune = Math.random() < PRUNE_PROBABILITY;
    if (!shouldPrune) {
        return;
    }
    db.exec(`DELETE FROM llm_cache WHERE hash NOT IN (SELECT hash FROM llm_cache ORDER BY created_at DESC LIMIT 1000)`);
}
/** Delete every row from `llm_cache`. */
export function clearCache(db) {
    const wipeSql = `DELETE FROM llm_cache`;
    db.exec(wipeSql);
}
- // =============================================================================
- // Cleanup and maintenance operations
- // =============================================================================
/**
 * Delete all cached LLM API responses.
 * @returns the number of rows removed, as reported by the statement.
 */
export function deleteLLMCache(db) {
    const { changes } = db.prepare(`DELETE FROM llm_cache`).run();
    return changes;
}
/**
 * Permanently remove document rows that are marked inactive (active = 0).
 * @returns the number of rows removed, as reported by the statement.
 */
export function deleteInactiveDocuments(db) {
    const { changes } = db.prepare(`DELETE FROM documents WHERE active = 0`).run();
    return changes;
}
/**
 * Delete content rows whose hash is not used by any active document.
 * @returns the number of content rows removed, as reported by the statement.
 */
export function cleanupOrphanedContent(db) {
    const sql = `
        DELETE FROM content
        WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
    `;
    const { changes } = db.prepare(sql).run();
    return changes;
}
/**
 * Remove embedding rows (`content_vectors` plus their `vectors_vec` entries)
 * whose hash is not used by any active document.
 *
 * Returns 0 without touching anything when sqlite-vec is unavailable or the
 * `vectors_vec` table cannot be queried.
 *
 * @returns the number of orphaned `content_vectors` rows counted before deletion.
 */
export function cleanupOrphanedVectors(db) {
    // sqlite-vec may not be loaded (e.g. Bun's bun:sqlite lacks loadExtension).
    // The vectors_vec virtual table can appear in sqlite_master from a prior
    // session, but querying it without the vec0 module loaded will crash (#380).
    if (!isSqliteVecAvailable()) {
        return 0;
    }
    // Probe the virtual table: when the schema entry exists but vec0 is not
    // loaded, touching it throws "no such module: vec0". Treat that as
    // "nothing to clean" so cleanup degrades like the other vector features.
    const vecTableUsable = (() => {
        try {
            db.prepare(`SELECT 1 FROM vectors_vec LIMIT 0`).get();
            return true;
        }
        catch {
            return false;
        }
    })();
    if (!vecTableUsable) {
        return 0;
    }
    // Count first: the deletes below do not report how many chunks went away.
    const { c: orphanCount } = db.prepare(`
        SELECT COUNT(*) as c FROM content_vectors cv
        WHERE NOT EXISTS (
            SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
        )
    `).get();
    if (orphanCount === 0) {
        return 0;
    }
    // vectors_vec must go first: its keys are derived from the content_vectors
    // rows that the second statement removes.
    db.exec(`
        DELETE FROM vectors_vec WHERE hash_seq IN (
            SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
            WHERE NOT EXISTS (
                SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
            )
        )
    `);
    db.exec(`
        DELETE FROM content_vectors WHERE hash NOT IN (
            SELECT hash FROM documents WHERE active = 1
        )
    `);
    return orphanCount;
}
/**
 * Issue a `VACUUM` statement on the database to reclaim unused space.
 */
export function vacuumDatabase(db) {
    const vacuumSql = `VACUUM`;
    db.exec(vacuumSql);
}
- // =============================================================================
- // Document helpers
- // =============================================================================
/**
 * Compute the hex SHA-256 digest of `content`.
 * Declared async, so callers receive a Promise resolving to the digest.
 */
export async function hashContent(content) {
    return createHash("sha256").update(content).digest("hex");
}
/**
 * Per-extension title extractors. Each takes the document text and returns a
 * title string, or null when the document has no usable heading.
 */
const titleExtractors = {
    // Markdown: first `#` or `##` heading. A generic "Notes" heading is
    // skipped in favour of the next `##` heading when there is one.
    '.md': (content) => {
        const heading = content.match(/^##?\s+(.+)$/m);
        if (!heading) {
            return null;
        }
        const title = (heading[1] ?? "").trim();
        if (title === "📝 Notes" || title === "Notes") {
            // Search only AFTER the matched heading. Searching the whole
            // document would re-match a `## Notes` heading itself and return
            // the very title we are trying to skip.
            const afterHeading = content.slice(heading.index + heading[0].length);
            const next = afterHeading.match(/^##\s+(.+)$/m);
            if (next?.[1]) {
                return next[1].trim();
            }
        }
        return title;
    },
    // Org-mode: `#+TITLE:` property first, otherwise the first `*` heading.
    '.org': (content) => {
        const titleProp = content.match(/^#\+TITLE:\s*(.+)$/im);
        if (titleProp?.[1]) {
            return titleProp[1].trim();
        }
        const heading = content.match(/^\*+\s+(.+)$/m);
        if (heading?.[1]) {
            return heading[1].trim();
        }
        return null;
    },
};
/**
 * Pick a display title for a document.
 *
 * @param {string} content Document text.
 * @param {string} filename File name or `/`-separated path; its extension
 *   (case-insensitive) selects the extractor.
 * @returns {string} The extracted heading, or the file's base name without
 *   extension when no extractor applies or it finds nothing; `filename`
 *   itself if that base name is empty.
 */
export function extractTitle(content, filename) {
    // Work on the last path segment only, so dots in directory names
    // ("my.dir/README") are never mistaken for an extension.
    const basename = filename.split("/").pop() ?? "";
    const dot = basename.lastIndexOf(".");
    const ext = dot === -1 ? "" : basename.slice(dot).toLowerCase();
    const extractor = titleExtractors[ext];
    if (extractor) {
        const title = extractor(content);
        if (title) {
            return title;
        }
    }
    return basename.replace(/\.[^.]+$/, "") || filename;
}
- // =============================================================================
- // Document indexing operations
- // =============================================================================
/**
 * Store document text in the content-addressed `content` table.
 * The statement is INSERT OR IGNORE, so a hash that is already present is
 * left untouched.
 */
export function insertContent(db, hash, content, createdAt) {
    const stmt = db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`);
    stmt.run(hash, content, createdAt);
}
/**
 * Insert an active document row. When a row for the same (collection, path)
 * already exists, its title, hash and modified_at are overwritten and it is
 * re-activated; its created_at is not part of the update.
 */
export function insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt) {
    const upsert = db.prepare(`
        INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active)
        VALUES (?, ?, ?, ?, ?, ?, 1)
        ON CONFLICT(collection, path) DO UPDATE SET
            title = excluded.title,
            hash = excluded.hash,
            modified_at = excluded.modified_at,
            active = 1
    `);
    const params = [collectionName, path, title, hash, createdAt, modifiedAt];
    upsert.run(...params);
}
/**
 * Look up the active document at (collection, path).
 * @returns the row with `id`, `hash` and `title`, or null when none matches.
 */
export function findActiveDocument(db, collectionName, path) {
    const lookup = db.prepare(`
        SELECT id, hash, title FROM documents
        WHERE collection = ? AND path = ? AND active = 1
    `);
    const found = lookup.get(collectionName, path);
    if (found === undefined || found === null) {
        return null;
    }
    return found;
}
/**
 * Set a document's title and modified_at, addressing the row by its id.
 */
export function updateDocumentTitle(db, documentId, title, modifiedAt) {
    const stmt = db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`);
    stmt.run(title, modifiedAt, documentId);
}
/**
 * Point an existing document row (by id) at new content: sets title, hash
 * and modified_at. For files whose content changed at an unchanged path.
 */
export function updateDocument(db, documentId, title, hash, modifiedAt) {
    const stmt = db.prepare(`UPDATE documents SET title = ?, hash = ?, modified_at = ? WHERE id = ?`);
    stmt.run(title, hash, modifiedAt, documentId);
}
/**
 * Mark the active document at (collection, path) as inactive; the row is
 * kept, not deleted.
 */
export function deactivateDocument(db, collectionName, path) {
    const stmt = db.prepare(`UPDATE documents SET active = 0 WHERE collection = ? AND path = ? AND active = 1`);
    stmt.run(collectionName, path);
}
/**
 * List the `path` of every active document in a collection.
 */
export function getActiveDocumentPaths(db, collectionName) {
    const listing = db.prepare(`
        SELECT path FROM documents WHERE collection = ? AND active = 1
    `);
    const rows = listing.all(collectionName);
    return rows.map(({ path }) => path);
}
- export { formatQueryForEmbedding, formatDocForEmbedding };
/**
 * Synchronous chunker: break points come from the regex scanner only (no
 * AST). This is the backward-compatible API used by tests and legacy callers.
 */
export function chunkDocument(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS) {
    return chunkDocumentWithBreakPoints(content, scanBreakPoints(content), findCodeFences(content), maxChars, overlapChars, windowChars);
}
/**
 * Async, optionally AST-aware chunking. AST support is loaded lazily from
 * "./ast.js" and only when a `filepath` is given together with a strategy
 * that needs it.
 *
 * Strategies:
 * - "regex" (default) — char-based chunking with regex break points only.
 * - "auto" — regex break points merged with AST break points (soft hints).
 * - "function" — one chunk per AST function range; gaps between ranges are
 *   handled by chunkByFunctionRanges. When no ranges are found this behaves
 *   like "auto".
 *
 * Any other strategy value, or a missing `filepath`, behaves like "regex".
 */
export async function chunkDocumentAsync(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS, filepath, chunkStrategy = "regex") {
    const regexPoints = scanBreakPoints(content);
    const codeFences = findCodeFences(content);
    const wantsFunctions = chunkStrategy === "function";
    const wantsAst = Boolean(filepath) && (wantsFunctions || chunkStrategy === "auto");
    if (!wantsAst) {
        return chunkDocumentWithBreakPoints(content, regexPoints, codeFences, maxChars, overlapChars, windowChars);
    }
    const { getASTFunctionRanges, getASTBreakPoints } = await import("./ast.js");
    if (wantsFunctions) {
        const ranges = await getASTFunctionRanges(content, filepath);
        if (ranges.length > 0) {
            return chunkByFunctionRanges(content, ranges, regexPoints, codeFences, maxChars, overlapChars, windowChars);
        }
        // Zero ranges (markdown, unsupported language, parse failure):
        // continue with the "auto" path so AST break points still help.
    }
    const astPoints = await getASTBreakPoints(content, filepath);
    const breakPoints = astPoints.length > 0
        ? mergeBreakPoints(regexPoints, astPoints)
        : regexPoints;
    return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
}
/**
 * Emit one chunk per AST function range plus chunks for the text between
 * ranges (imports, top-level code). Any range or gap longer than `maxChars`
 * is split with the char-based algorithm so no oversized chunk is emitted.
 * Whitespace-only gaps are dropped.
 *
 * Preconditions: `ranges` is non-empty, sorted by `startIndex`, and the
 * ranges are non-overlapping (as produced by `getASTFunctionRanges`).
 *
 * @returns array of `{ text, pos }`, where `pos` is the chunk's offset in `content`.
 */
function chunkByFunctionRanges(content, ranges, regexPoints, codeFences, maxChars, overlapChars, windowChars) {
    const chunks = [];
    // Split content[from, to) with the char-based chunker. Break points and
    // code fences are restricted to the window and rebased to it so the
    // chunker sees a standalone slice; results are rebased back afterwards.
    const splitOversized = (text, from, to) => {
        const localPoints = [];
        for (const point of regexPoints) {
            if (point.pos >= from && point.pos < to) {
                localPoints.push({ ...point, pos: point.pos - from });
            }
        }
        const localFences = [];
        for (const fence of codeFences) {
            if (fence.end > from && fence.start < to) {
                localFences.push({
                    start: Math.max(0, fence.start - from),
                    end: Math.max(0, Math.min(to, fence.end) - from),
                });
            }
        }
        const pieces = chunkDocumentWithBreakPoints(text, localPoints, localFences, maxChars, overlapChars, windowChars);
        for (const piece of pieces) {
            chunks.push({ text: piece.text, pos: from + piece.pos });
        }
    };
    const emitGap = (from, to) => {
        if (from >= to) {
            return;
        }
        const text = content.slice(from, to);
        // Whitespace-only gaps carry no embeddable signal.
        if (!text.trim()) {
            return;
        }
        if (text.length <= maxChars) {
            chunks.push({ text, pos: from });
            return;
        }
        splitOversized(text, from, to);
    };
    let cursor = 0;
    for (const { startIndex, endIndex } of ranges) {
        // Leading / inter-range gap before this function.
        emitGap(cursor, startIndex);
        cursor = endIndex;
        const body = content.slice(startIndex, endIndex);
        if (body.length === 0) {
            continue;
        }
        if (body.length <= maxChars) {
            chunks.push({ text: body, pos: startIndex });
        }
        else {
            // Oversized function/class: split it, reusing the break points
            // that fall inside the range.
            splitOversized(body, startIndex, endIndex);
        }
    }
    // Trailing gap after the last range.
    emitGap(cursor, content.length);
    // Nothing emitted (only whitespace gaps / empty ranges): still return one
    // chunk so non-empty content always yields at least one.
    if (chunks.length === 0 && content.length > 0) {
        return [{ text: content, pos: 0 }];
    }
    return chunks;
}
/**
 * Chunk a document so that chunks respect a token budget, measured with a
 * real tokenizer rather than a character estimate.
 *
 * The document is first chunked in character space (3 chars/token estimate)
 * via chunkDocumentAsync, which also applies `filepath`/`chunkStrategy` for
 * AST-aware break points. Every chunk is then token-counted; a chunk over
 * `maxTokens` is re-split once with chunkDocument using the chunk's measured
 * chars-per-token ratio. The re-split pieces are counted but not split again.
 *
 * When `tokenizer` is supplied it is the only token counter used — neither
 * `getDefaultLlamaCpp()` nor `llm.tokenize(...)` is invoked. This lets
 * remote-only deployments (`QMD_EMBED_ENDPOINT=...`) chunk documents without
 * warming up node-llama-cpp (DoD #1 of i-1rqixh6m / i-qkarfffa).
 *
 * If `signal` is aborted, processing stops and the chunks collected so far
 * are returned.
 *
 * @returns array of `{ text, pos, tokens }`.
 */
export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKENS, overlapTokens = CHUNK_OVERLAP_TOKENS, windowTokens = CHUNK_WINDOW_TOKENS, filepath, chunkStrategy = "regex", signal, tokenizer) {
    // The local model is resolved inside the default counter, on first use,
    // so callers that pass `tokenizer` never touch LlamaCpp.
    let localLlm;
    const defaultCounter = async (text) => {
        if (!localLlm) {
            localLlm = getDefaultLlamaCpp();
        }
        const tokenIds = await localLlm.tokenize(text);
        return tokenIds.length;
    };
    const countTokens = tokenizer ?? defaultCounter;
    // Moderate chars/token estimate (prose ~4, code ~2, mixed ~3); chunks
    // that turn out too large are re-split below with the measured ratio.
    const CHARS_PER_TOKEN_ESTIMATE = 3;
    const maxChars = maxTokens * CHARS_PER_TOKEN_ESTIMATE;
    const overlapChars = overlapTokens * CHARS_PER_TOKEN_ESTIMATE;
    const windowChars = windowTokens * CHARS_PER_TOKEN_ESTIMATE;
    const firstPass = await chunkDocumentAsync(content, maxChars, overlapChars, windowChars, filepath, chunkStrategy);
    const sized = [];
    for (const { text, pos } of firstPass) {
        // Respect the abort signal to avoid runaway tokenization.
        if (signal?.aborted) {
            break;
        }
        const tokens = await countTokens(text);
        if (tokens <= maxTokens) {
            sized.push({ text, pos, tokens });
            continue;
        }
        // Still too large: derive a char limit from the real ratio, with a
        // 5% safety margin.
        const charsPerToken = text.length / tokens;
        const safeMaxChars = Math.floor(maxTokens * charsPerToken * 0.95);
        // NOTE(review): overlapChars/windowChars are already character counts
        // (tokens × 3); scaling them by chars-per-token again looks like a
        // unit mix-up — confirm the intended overlap before changing it.
        const pieces = chunkDocument(text, safeMaxChars, Math.floor(overlapChars * charsPerToken / 2), Math.floor(windowChars * charsPerToken / 2));
        for (const piece of pieces) {
            if (signal?.aborted) {
                break;
            }
            const pieceTokens = await countTokens(piece.text);
            sized.push({
                text: piece.text,
                pos: pos + piece.pos,
                tokens: pieceTokens,
            });
        }
    }
    return sized;
}
- // =============================================================================
- // Fuzzy matching
- // =============================================================================
/**
 * Levenshtein edit distance between `a` and `b` (insert, delete and
 * substitute each cost 1), compared element by element via indexing.
 *
 * Uses two rolling rows instead of a full (m+1)×(n+1) matrix: each cell only
 * depends on the previous row and the current one, so memory is O(n) while
 * the result is identical.
 */
function levenshtein(a, b) {
    const m = a.length;
    const n = b.length;
    if (m === 0) {
        return n;
    }
    if (n === 0) {
        return m;
    }
    // prev[j] = distance between a[0..i-1) and b[0..j); starts as row 0.
    let prev = Array.from({ length: n + 1 }, (_, j) => j);
    let curr = new Array(n + 1).fill(0);
    for (let i = 1; i <= m; i++) {
        curr[0] = i;
        for (let j = 1; j <= n; j++) {
            const cost = a[i - 1] === b[j - 1] ? 0 : 1;
            curr[j] = Math.min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost);
        }
        // The finished row becomes "previous"; the old one is reused.
        [prev, curr] = [curr, prev];
    }
    return prev[n];
}
/**
 * Reduce a user-supplied docid reference to its bare form: trim whitespace,
 * remove one pair of matching surrounding quotes (single or double), then
 * remove one leading `#`.
 * Handles: "#abc123", 'abc123', "abc123", #abc123, abc123
 */
export function normalizeDocid(docid) {
    let bare = docid.trim();
    const wrappedIn = (quote) => bare.startsWith(quote) && bare.endsWith(quote);
    if (wrappedIn('"') || wrappedIn("'")) {
        bare = bare.slice(1, -1);
    }
    return bare.startsWith('#') ? bare.slice(1) : bare;
}
/**
 * Decide whether `input` looks like a docid reference.
 * Accepts: #abc123, abc123, "#abc123", "abc123", '#abc123', 'abc123'
 * True when the normalized form consists solely of hex digits (either case)
 * and is at least six characters long.
 */
export function isDocid(input) {
    const HEX_OF_AT_LEAST_SIX = /^[a-f0-9]{6,}$/i;
    return HEX_OF_AT_LEAST_SIX.test(normalizeDocid(input));
}
/**
 * Find an active document whose content hash starts with the given docid.
 *
 * Accepts lenient input: #abc123, abc123, "#abc123", "abc123"
 *
 * @param db Database handle.
 * @param {string} docid Docid reference; normalized with normalizeDocid.
 * @returns {{filepath: string, hash: string} | null} The document's virtual
 *   path (`qmd://collection/path`) and full hash, or null when the input is
 *   not a hex prefix or nothing matches. When several documents share the
 *   prefix, one of them is returned (the query has no ORDER BY).
 */
export function findDocumentByDocid(db, docid) {
    const shortHash = normalizeDocid(docid);
    // The prefix is embedded in a LIKE pattern, where `%` and `_` are
    // wildcards. Only hex digits can ever prefix a hash, so reject anything
    // else (including the empty string) instead of letting input such as "%"
    // match an arbitrary document.
    if (!/^[a-f0-9]+$/i.test(shortHash)) {
        return null;
    }
    const doc = db.prepare(`
        SELECT 'qmd://' || d.collection || '/' || d.path as filepath, d.hash
        FROM documents d
        WHERE d.hash LIKE ? AND d.active = 1
        LIMIT 1
    `).get(`${shortHash}%`);
    // Normalize "no row" to the documented null, whatever the driver returns.
    return doc ?? null;
}
/**
 * Suggest active document paths that are close to `query` by edit distance.
 *
 * Every active path is compared case-insensitively with levenshtein; paths
 * within `maxDistance` are returned closest first, at most `limit` of them.
 */
export function findSimilarFiles(db, query, maxDistance = 3, limit = 5) {
    const rows = db.prepare(`
        SELECT d.path
        FROM documents d
        WHERE d.active = 1
    `).all();
    const needle = query.toLowerCase();
    const candidates = [];
    for (const { path } of rows) {
        const dist = levenshtein(path.toLowerCase(), needle);
        if (dist <= maxDistance) {
            candidates.push({ path, dist });
        }
    }
    candidates.sort((left, right) => left.dist - right.dist);
    return candidates.slice(0, limit).map((candidate) => candidate.path);
}
/**
 * List active documents matching a glob pattern.
 *
 * A document matches when the pattern (compiled with picomatch) accepts its
 * virtual path (`qmd://collection/path`), its relative path, or
 * `collection/path`.
 *
 * @returns array of `{ filepath, displayPath, bodyLength }`.
 */
export function matchFilesByGlob(db, pattern) {
    const rows = db.prepare(`
        SELECT
            'qmd://' || d.collection || '/' || d.path as virtual_path,
            LENGTH(content.doc) as body_length,
            d.path,
            d.collection
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE d.active = 1
    `).all();
    const isMatch = picomatch(pattern);
    const matches = [];
    for (const row of rows) {
        const accepted = isMatch(row.virtual_path)
            || isMatch(row.path)
            || isMatch(row.collection + '/' + row.path);
        if (!accepted) {
            continue;
        }
        matches.push({
            filepath: row.virtual_path, // Virtual path for precise lookup
            displayPath: row.path, // Relative path for display
            bodyLength: row.body_length,
        });
    }
    return matches;
}
- // =============================================================================
- // Context
- // =============================================================================
/**
 * Build the context string for a path inside a collection.
 *
 * The result is the global context (if any) followed by every context of the
 * collection whose prefix the path starts with, ordered from shortest prefix
 * (most general) to longest, joined by blank lines. For example, a context at
 * "/talks" applies to "/talks/2024/keynote.md".
 *
 * @param db Database handle passed to the store lookups
 * @param collectionName Collection name
 * @param path Relative path within the collection (leading "/" optional)
 * @returns Context string, or null when the collection is unknown or no
 *   context applies
 */
export function getContextForPath(db, collectionName, path) {
    const coll = getStoreCollection(db, collectionName);
    if (!coll) {
        return null;
    }
    const contexts = [];
    const globalCtx = getStoreGlobalContext(db);
    if (globalCtx) {
        contexts.push(globalCtx);
    }
    if (coll.context) {
        const withLeadingSlash = (value) => (value.startsWith("/") ? value : `/${value}`);
        const target = withLeadingSlash(path);
        // NOTE(review): this is a plain string-prefix test, so a prefix of
        // "/talk" also matches "/talks/..." — confirm whether a path-segment
        // boundary check is wanted.
        const applicable = Object.entries(coll.context)
            .map(([prefix, context]) => ({ prefix: withLeadingSlash(prefix), context }))
            .filter((entry) => target.startsWith(entry.prefix))
            .sort((a, b) => a.prefix.length - b.prefix.length);
        for (const entry of applicable) {
            contexts.push(entry.context);
        }
    }
    return contexts.length > 0 ? contexts.join('\n\n') : null;
}
/**
 * Build the context string for a file given as a virtual path
 * (`qmd://collection/path`) or as a filesystem path inside a collection.
 *
 * Returns null when `filepath` is empty, when no collection owns the path,
 * when the document is not an active row in `documents`, or when no context
 * applies. Otherwise the result is the global context followed by the
 * collection's matching prefix contexts, shortest prefix first, joined by
 * blank lines.
 */
export function getContextForFile(db, filepath) {
    if (!filepath) {
        return null;
    }
    const collections = getStoreCollections(db);
    let collectionName = null;
    let relativePath = null;
    const parsedVirtual = filepath.startsWith('qmd://') ? parseVirtualPath(filepath) : null;
    if (parsedVirtual) {
        collectionName = parsedVirtual.collectionName;
        relativePath = parsedVirtual.path;
    }
    else {
        // Filesystem path: the first collection whose root is the path itself
        // or a parent directory of it owns the file. Collections without a
        // path are skipped.
        const owner = collections.find((candidate) => candidate
            && candidate.path
            && (filepath.startsWith(candidate.path + '/') || filepath === candidate.path));
        if (!owner || !owner.name) {
            return null;
        }
        collectionName = owner.name;
        relativePath = filepath.startsWith(owner.path + '/')
            ? filepath.slice(owner.path.length + 1)
            : '';
    }
    const coll = getStoreCollection(db, collectionName);
    if (!coll) {
        return null;
    }
    // Context is only reported for documents that exist and are active.
    const doc = db.prepare(`
        SELECT d.path
        FROM documents d
        WHERE d.collection = ? AND d.path = ? AND d.active = 1
        LIMIT 1
    `).get(collectionName, relativePath);
    if (!doc) {
        return null;
    }
    const contexts = [];
    const globalCtx = getStoreGlobalContext(db);
    if (globalCtx) {
        contexts.push(globalCtx);
    }
    if (coll.context) {
        const withLeadingSlash = (value) => (value.startsWith("/") ? value : `/${value}`);
        const target = withLeadingSlash(relativePath);
        // NOTE(review): plain string-prefix test, so "/talk" also matches
        // "/talks/..." — confirm whether a path-segment boundary is wanted.
        const applicable = Object.entries(coll.context)
            .map(([prefix, context]) => ({ prefix: withLeadingSlash(prefix), context }))
            .filter((entry) => target.startsWith(entry.prefix))
            .sort((a, b) => a.prefix.length - b.prefix.length);
        for (const entry of applicable) {
            contexts.push(entry.context);
        }
    }
    return contexts.length > 0 ? contexts.join('\n\n') : null;
}
/**
 * Fetch a collection via getStoreCollection and expose it in the
 * `{ name, pwd, glob_pattern }` shape (`pwd` is the collection's path,
 * `glob_pattern` its pattern). Returns null for an unknown name.
 */
export function getCollectionByName(db, name) {
    const found = getStoreCollection(db, name);
    return found
        ? { name: found.name, pwd: found.path, glob_pattern: found.pattern }
        : null;
}
/**
 * List every stored collection together with statistics computed from the
 * `documents` table: total rows, active rows and the latest `modified_at`.
 * Counts fall back to 0 and `last_modified` to null when there are no rows;
 * `includeByDefault` is true unless the collection sets it to `false`.
 */
export function listCollections(db) {
    const summaries = [];
    for (const coll of getStoreCollections(db)) {
        const stats = db.prepare(`
            SELECT
                COUNT(d.id) as doc_count,
                SUM(CASE WHEN d.active = 1 THEN 1 ELSE 0 END) as active_count,
                MAX(d.modified_at) as last_modified
            FROM documents d
            WHERE d.collection = ?
        `).get(coll.name);
        summaries.push({
            name: coll.name,
            pwd: coll.path,
            glob_pattern: coll.pattern,
            doc_count: stats?.doc_count || 0,
            active_count: stats?.active_count || 0,
            last_modified: stats?.last_modified || null,
            includeByDefault: coll.includeByDefault !== false,
        });
    }
    return summaries;
}
/**
 * Remove a collection: delete all of its document rows, delete content rows
 * no longer referenced by any active document, then drop the collection
 * entry via deleteStoreCollection.
 *
 * @returns `{ deletedDocs, cleanedHashes }` — the row counts of the two deletes.
 */
export function removeCollection(db, collectionName) {
    const { changes: deletedDocs } = db.prepare(`DELETE FROM documents WHERE collection = ?`).run(collectionName);
    const { changes: cleanedHashes } = db.prepare(`
        DELETE FROM content
        WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
    `).run();
    deleteStoreCollection(db, collectionName);
    return { deletedDocs, cleanedHashes };
}
/**
 * Rename a collection: first re-label its rows in `documents`, then rename
 * the collection entry via renameStoreCollection.
 */
export function renameCollection(db, oldName, newName) {
    const relabel = db.prepare(`UPDATE documents SET collection = ? WHERE collection = ?`);
    relabel.run(newName, oldName);
    renameStoreCollection(db, oldName, newName);
}
- // =============================================================================
- // Context Management Operations
- // =============================================================================
/**
 * Insert or update the context for one path prefix of a collection that is
 * identified by its numeric id.
 *
 * NOTE(review): the id lookup reads the `collections` table while the other
 * helpers in this section use the store-collection accessors — confirm that
 * table is still populated.
 *
 * @param db - database handle.
 * @param collectionId - id to resolve to a collection name.
 * @param pathPrefix - path prefix the context applies to.
 * @param context - context text to store.
 * @throws {Error} when no collection row has the given id.
 */
export function insertContext(db, collectionId, pathPrefix, context) {
  const row = db.prepare(`SELECT name FROM collections WHERE id = ?`).get(collectionId);
  if (row) {
    updateStoreContext(db, row.name, pathPrefix, context);
    return;
  }
  throw new Error(`Collection with id ${collectionId} not found`);
}
/**
 * Delete the context stored for one collection / path-prefix pair.
 *
 * @returns 1 when removeStoreContext reports a removal, otherwise 0.
 */
export function deleteContext(db, collectionName, pathPrefix) {
  if (removeStoreContext(db, collectionName, pathPrefix)) {
    return 1;
  }
  return 0;
}
/**
 * Delete the global context and the root (empty-prefix) context of every
 * collection.
 *
 * @returns the number of contexts counted as deleted. Clearing the global
 *          context always counts as one, whether or not one was set; each
 *          collection whose root context was removed adds one more.
 */
export function deleteGlobalContexts(db) {
  setStoreGlobalContext(db, undefined);
  const clearedRoots = getStoreCollections(db)
    .filter(({ name }) => removeStoreContext(db, name, ''));
  return 1 + clearedRoots.length;
}
/**
 * List every stored context as `{ collection_name, path_prefix, context }`.
 *
 * Ordering: collection name (locale compare), then longest path prefix
 * first, then path prefix alphabetically.
 */
export function listPathContexts(db) {
  const byCollectionThenSpecificity = (left, right) => {
    if (left.collection_name !== right.collection_name) {
      return left.collection_name.localeCompare(right.collection_name);
    }
    // Longer prefixes are more specific and sort ahead of shorter ones.
    const lengthDelta = right.path_prefix.length - left.path_prefix.length;
    return lengthDelta !== 0
      ? lengthDelta
      : left.path_prefix.localeCompare(right.path_prefix);
  };

  const rows = [];
  for (const { collection, path, context } of getStoreContexts(db)) {
    rows.push({ collection_name: collection, path_prefix: path, context });
  }
  rows.sort(byCollectionThenSpecificity);
  return rows;
}
/**
 * Get every collection reported by getStoreCollections, reduced to its name.
 *
 * @returns an array of `{ name }` objects in the accessor's order.
 */
export function getAllCollections(db) {
  const names = [];
  for (const entry of getStoreCollections(db)) {
    names.push({ name: entry.name });
  }
  return names;
}
/**
 * Find the collections that have no context entries at all (not even a root
 * context).
 *
 * @returns `{ name, pwd, doc_count }` per such collection, sorted by name;
 *          `doc_count` counts active documents only.
 */
export function getCollectionsWithoutContext(db) {
  const lacksContext = ({ context }) => !context || Object.keys(context).length === 0;

  return getStoreCollections(db)
    .filter(lacksContext)
    .map(({ name, path }) => {
      const row = db.prepare(`
        SELECT COUNT(d.id) as doc_count
        FROM documents d
        WHERE d.collection = ? AND d.active = 1
      `).get(name);
      return { name, pwd: path, doc_count: row?.doc_count || 0 };
    })
    .sort((left, right) => left.name.localeCompare(right.name));
}
/**
 * Get the top-level directories of a collection that have no context.
 * Useful for suggesting where context might be needed.
 *
 * A directory counts as covered when the collection has a root context
 * (empty-string prefix) or a context whose prefix equals the directory name.
 * Documents that sit directly in the collection root contribute no directory.
 *
 * NOTE(review): prefixes are compared verbatim against the bare directory
 * name; if prefixes are stored with a leading slash (e.g. "/docs") they will
 * never match — confirm the stored key format.
 *
 * @param db - database handle exposing `prepare(sql).all(...)`.
 * @param collectionName - collection to inspect.
 * @returns sorted directory names without context; `[]` when the collection
 *          is unknown or has a root context.
 */
export function getTopLevelPathsWithoutContext(db, collectionName) {
  // Resolve the collection first so an unknown name costs no document query.
  const dbColl = getStoreCollection(db, collectionName);
  if (!dbColl) {
    return [];
  }
  const contextPrefixes = new Set(Object.keys(dbColl.context || {}));
  // A root context covers every directory, so nothing can be missing.
  if (contextPrefixes.has('')) {
    return [];
  }

  const paths = db.prepare(`
    SELECT DISTINCT path FROM documents
    WHERE collection = ? AND active = 1
  `).all(collectionName);

  // First path component of every document that lives inside a directory.
  const topLevelDirs = new Set();
  for (const { path } of paths) {
    const [first, ...rest] = path.split('/').filter(Boolean);
    if (rest.length > 0) {
      topLevelDirs.add(first);
    }
  }

  // A top-level directory holds no '/', so only an exact prefix can match it.
  return [...topLevelDirs].filter((dir) => !contextPrefixes.has(dir)).sort();
}
- // =============================================================================
- // FTS Search
- // =============================================================================
/**
 * Reduce a raw term to the characters that are kept inside an FTS5 string:
 * Unicode letters, Unicode digits, apostrophes and underscores. Everything
 * else is dropped and the result is lower-cased.
 */
export function sanitizeFTS5Term(term) {
  const disallowed = /[^\p{L}\p{N}'_]/gu;
  const stripped = term.replace(disallowed, '');
  return stripped.toLowerCase();
}
/**
 * Tell whether a token is a hyphenated compound (e.g. multi-agent, DEC-0054,
 * gpt-4): it must start with a letter or digit and contain at least one
 * hyphen that is directly followed by a letter or digit.
 */
function isHyphenatedToken(token) {
  const edge = "[\\p{L}\\p{N}]";
  const inner = "[\\p{L}\\p{N}'-]*";
  const compound = new RegExp(`^${edge}${inner}-${edge}${inner}$`, "u");
  return compound.test(token);
}
/**
 * Turn a hyphenated term into the body of an FTS5 phrase: split on hyphens,
 * sanitize every part with sanitizeFTS5Term, drop parts that end up empty
 * and join the rest with single spaces (multi-agent → `multi agent`).
 */
function sanitizeHyphenatedTerm(term) {
  const words = [];
  for (const piece of term.split('-')) {
    const cleaned = sanitizeFTS5Term(piece);
    if (cleaned) {
      words.push(cleaned);
    }
  }
  return words.join(' ');
}
/**
 * Translate lex query syntax into an FTS5 MATCH expression.
 *
 * Recognised input:
 * - Quoted phrases: "exact phrase" → "exact phrase" (no prefix match)
 * - Hyphenated tokens: multi-agent, DEC-0054, gpt-4 → phrase "multi agent"
 * - Plain terms: term → "term"* (prefix match)
 * - Negation: a leading `-` on a term, phrase or hyphenated token moves the
 *   clause into the NOT list.
 *
 * A `-` only negates at the start of a token; a hyphen between word
 * characters makes the token a hyphenated phrase instead.
 *
 * Positive clauses are joined with AND and every negative clause is appended
 * as `NOT <clause>`. FTS5 NOT is a binary operator, so a query without any
 * positive clause cannot be expressed.
 *
 * Examples:
 *   performance -sports   → "performance"* NOT "sports"*
 *   "machine learning"    → "machine learning"
 *   multi-agent memory    → "multi agent" AND "memory"*
 *   DEC-0054              → "dec 0054"
 *   -multi-agent          → null (negation only)
 *
 * @param query - raw lex query text.
 * @returns the FTS5 expression, or null when no positive clause remains.
 */
function buildFTS5Query(query) {
  const required = [];
  const excluded = [];
  const record = (clause, isNegated) => {
    (isNegated ? excluded : required).push(clause);
  };

  const text = query.trim();
  let pos = 0;
  while (pos < text.length) {
    // Move to the start of the next token.
    while (pos < text.length && /\s/.test(text[pos])) {
      pos++;
    }
    if (pos >= text.length) {
      break;
    }

    const isNegated = text[pos] === '-';
    if (isNegated) {
      pos++;
    }

    if (text[pos] === '"') {
      // Quoted phrase: runs to the closing quote, or to the end of the input
      // when the quote is never closed.
      const bodyStart = pos + 1;
      const closing = text.indexOf('"', bodyStart);
      const bodyEnd = closing === -1 ? text.length : closing;
      pos = bodyEnd + 1;
      const words = text.slice(bodyStart, bodyEnd).trim();
      if (words.length === 0) {
        continue;
      }
      const cleaned = words.split(/\s+/).map((w) => sanitizeFTS5Term(w)).filter((w) => w).join(' ');
      if (cleaned) {
        record(`"${cleaned}"`, isNegated);
      }
      continue;
    }

    // Bare token: runs until whitespace or the next quote.
    const termStart = pos;
    while (pos < text.length && !/[\s"]/.test(text[pos])) {
      pos++;
    }
    const term = text.slice(termStart, pos);
    if (isHyphenatedToken(term)) {
      // Phrase match, so the tokenizer's split on '-' still lines up.
      const cleaned = sanitizeHyphenatedTerm(term);
      if (cleaned) {
        record(`"${cleaned}"`, isNegated);
      }
    } else {
      const cleaned = sanitizeFTS5Term(term);
      if (cleaned) {
        record(`"${cleaned}"*`, isNegated);
      }
    }
  }

  // Covers both "nothing parsed" and "only negative clauses".
  if (required.length === 0) {
    return null;
  }
  return [required.join(' AND '), ...excluded].join(' NOT ');
}
/**
 * Validate that a vec/hyde query doesn't use lex-only syntax.
 *
 * Negation is detected ONLY when `-` sits at the start of the query or is
 * preceded by whitespace, and is directly followed by a letter, digit,
 * underscore or opening double quote. Hyphens inside words (e.g.
 * `auto-archived`, `pre-commit`, `state-of-the-art`) carry no negation
 * semantics in natural language and pass through unchanged.
 *
 * The check uses Unicode letter/digit classes, so negated non-ASCII terms
 * such as `-über` are detected as well; an ASCII-only `\w` would let them
 * slip through.
 *
 * @param query - vec/hyde query text.
 * @returns an error message when negation syntax is present, otherwise null.
 */
export function validateSemanticQuery(query) {
  // `-term` or `-"phrase"` only counts as negation at SOS or after whitespace.
  const negationAtBoundary = /(?:^|\s)-["\p{L}\p{N}_]/u;
  if (negationAtBoundary.test(query)) {
    return 'Negation (-term) is not supported in vec/hyde queries. Use lex for exclusions.';
  }
  return null;
}
/**
 * Validate the surface syntax of a lex query.
 *
 * @param query - lex query text.
 * @returns an error message when the query spans several lines or contains
 *          an odd number of double quotes, otherwise null.
 */
export function validateLexQuery(query) {
  const hasLineBreak = /[\r\n]/.test(query);
  if (hasLineBreak) {
    return 'Lex queries must be a single line. Remove newline characters or split into separate lex: lines.';
  }
  // Splitting on the quote yields one more piece than there are quotes.
  const quoteTotal = query.split('"').length - 1;
  return quoteTotal % 2 === 1
    ? 'Lex query has an unmatched double quote ("). Add the closing quote or remove it.'
    : null;
}
/**
 * Run a full-text (BM25) search over `documents_fts`.
 *
 * @param db - database handle exposing `prepare(sql).all(...)`.
 * @param query - lex query text; parsed by buildFTS5Query.
 * @param limit - maximum number of results (default 20).
 * @param collectionName - optional collection to restrict results to.
 * @returns result objects with `source: "fts"` and a score in [0, 1) where
 *          higher is better; `[]` when the query has no positive clause.
 */
export function searchFTS(db, query, limit = 20, collectionName) {
  const ftsQuery = buildFTS5Query(query);
  if (!ftsQuery) {
    return [];
  }
  // When filtering by collection, fetch extra candidates from the FTS index
  // since some will be filtered out. Without a collection filter we can
  // fetch exactly the requested limit.
  const ftsLimit = collectionName ? limit * 10 : limit;
  // Both LIMIT values are bound as parameters instead of being spliced into
  // the SQL text, so a non-numeric `limit` can never alter the statement.
  const params = [ftsQuery, ftsLimit];
  // Use a CTE to force FTS5 to run first, then filter by collection.
  // Without the CTE, SQLite's query planner combines FTS5 MATCH with the
  // collection filter in a single WHERE clause, which can cause it to
  // abandon the FTS5 index and fall back to a full scan — turning an 8ms
  // query into a 17-second query on large collections.
  let sql = `
    WITH fts_matches AS (
      SELECT rowid, bm25(documents_fts, 1.5, 4.0, 1.0) as bm25_score
      FROM documents_fts
      WHERE documents_fts MATCH ?
      ORDER BY bm25_score ASC
      LIMIT ?
    )
    SELECT
      'qmd://' || d.collection || '/' || d.path as filepath,
      d.collection || '/' || d.path as display_path,
      d.title,
      content.doc as body,
      d.hash,
      fm.bm25_score
    FROM fts_matches fm
    JOIN documents d ON d.id = fm.rowid
    JOIN content ON content.hash = d.hash
    WHERE d.active = 1
  `;
  if (collectionName) {
    sql += ` AND d.collection = ?`;
    params.push(String(collectionName));
  }
  // bm25 lower is better; sort ascending.
  sql += ` ORDER BY fm.bm25_score ASC LIMIT ?`;
  params.push(limit);
  const rows = db.prepare(sql).all(...params);
  return rows.map((row) => {
    // First path segment of the virtual path is the owning collection.
    const rowCollection = row.filepath.split('//')[1]?.split('/')[0] || "";
    // Convert bm25 (negative, lower is better) into a stable [0..1) score where higher is better.
    // |x| / (1 + |x|) maps: strong(-10)→0.91, medium(-2)→0.67, weak(-0.5)→0.33, none(0)→0.
    // Monotonic and query-independent — no per-query normalization needed.
    const magnitude = Math.abs(row.bm25_score);
    const score = magnitude / (1 + magnitude);
    return {
      filepath: row.filepath,
      displayPath: row.display_path,
      title: row.title,
      hash: row.hash,
      docid: getDocid(row.hash),
      collectionName: rowCollection,
      modifiedAt: "", // Not available in FTS query
      bodyLength: row.body.length,
      body: row.body,
      context: getContextForFile(db, row.filepath),
      score,
      source: "fts",
    };
  });
}
- // =============================================================================
- // Vector Search
- // =============================================================================
/**
 * Run a vector similarity search over `vectors_vec`.
 *
 * @param db - database handle.
 * @param query - query text, embedded unless `precomputedEmbedding` is given.
 * @param model - embedding model forwarded to getEmbedding.
 * @param limit - maximum number of documents to return (default 20).
 * @param collectionName - optional collection to restrict results to.
 * @param session - optional session forwarded to getEmbedding.
 * @param precomputedEmbedding - optional embedding that skips the embed call.
 * @param embedProvider - optional provider forwarded to getEmbedding.
 * @returns one result per file (its closest chunk), sorted by ascending
 *          distance, with `score = 1 - distance` and `source: "vec"`. Returns
 *          `[]` when the vector table is missing, no embedding could be
 *          produced, or nothing matched.
 */
export async function searchVec(db, query, model, limit = 20, collectionName, session, precomputedEmbedding, embedProvider) {
  const vecTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
  if (!vecTable) {
    return [];
  }
  const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session, undefined, embedProvider);
  if (!embedding) {
    return [];
  }

  // IMPORTANT: two separate queries on purpose. sqlite-vec virtual tables
  // hang indefinitely when combined with JOINs in the same query, so do NOT
  // merge these into a single statement.
  // See: https://github.com/tobi/qmd/pull/23
  // Step 1: nearest chunks straight from sqlite-vec (no JOINs allowed).
  // Over-fetch (3x) because several chunks can belong to the same file.
  const nearest = db.prepare(`
    SELECT hash_seq, distance
    FROM vectors_vec
    WHERE embedding MATCH ? AND k = ?
  `).all(new Float32Array(embedding), limit * 3);
  if (nearest.length === 0) {
    return [];
  }

  const chunkKeys = [];
  const distanceByKey = new Map();
  for (const { hash_seq, distance } of nearest) {
    chunkKeys.push(hash_seq);
    distanceByKey.set(hash_seq, distance);
  }

  // Step 2: resolve the chunk keys to active documents and their bodies.
  const keyPlaceholders = chunkKeys.map(() => '?').join(',');
  const collectionFilter = collectionName ? ` AND d.collection = ?` : ``;
  const docSql = `
    SELECT
      cv.hash || '_' || cv.seq as hash_seq,
      cv.hash,
      cv.pos,
      'qmd://' || d.collection || '/' || d.path as filepath,
      d.collection || '/' || d.path as display_path,
      d.title,
      content.doc as body
    FROM content_vectors cv
    JOIN documents d ON d.hash = cv.hash AND d.active = 1
    JOIN content ON content.hash = d.hash
    WHERE cv.hash || '_' || cv.seq IN (${keyPlaceholders})
  ${collectionFilter}`;
  const docParams = collectionName ? [...chunkKeys, collectionName] : [...chunkKeys];
  const docRows = db.prepare(docSql).all(...docParams);

  // Keep only the closest chunk per file; on a tie the first row wins.
  const bestByFile = new Map();
  for (const row of docRows) {
    const dist = distanceByKey.get(row.hash_seq) ?? 1;
    const current = bestByFile.get(row.filepath);
    if (current && !(dist < current.bestDist)) {
      continue;
    }
    bestByFile.set(row.filepath, { row, bestDist: dist });
  }

  const ranked = [...bestByFile.values()];
  ranked.sort((left, right) => left.bestDist - right.bestDist);
  return ranked.slice(0, limit).map(({ row, bestDist }) => {
    // First path segment of the virtual path is the owning collection.
    const rowCollection = row.filepath.split('//')[1]?.split('/')[0] || "";
    return {
      filepath: row.filepath,
      displayPath: row.display_path,
      title: row.title,
      hash: row.hash,
      docid: getDocid(row.hash),
      collectionName: rowCollection,
      modifiedAt: "", // Not available in vec query
      bodyLength: row.body.length,
      body: row.body,
      context: getContextForFile(db, row.filepath),
      score: 1 - bestDist, // Cosine similarity = 1 - cosine distance
      source: "vec",
      chunkPos: row.pos,
    };
  });
}
- // =============================================================================
- // Embeddings
- // =============================================================================
/**
 * Embed a piece of text, either through a supplied embedding provider or
 * through the session / local LLM.
 *
 * @param text - raw text to embed.
 * @param model - model id used on the non-provider path.
 * @param isQuery - true to apply the query prompt template, false for the
 *                  document template.
 * @param session - optional session; used for embedding when no provider is
 *                  given, and as the source of the abort signal for local
 *                  providers.
 * @param llmOverride - optional LLM used instead of the default one when
 *                      neither provider nor session is available.
 * @param embedProvider - optional provider that takes over the embedding.
 * @returns the embedding, or null when none was produced.
 */
async function getEmbedding(text, model, isQuery, session, llmOverride, embedProvider) {
  // The same prompt templates apply on both paths so query and index
  // embeddings stay comparable; only the model id differs.
  const applyTemplate = (modelId) => (isQuery
    ? formatQueryForEmbedding(text, modelId)
    : formatDocForEmbedding(text, undefined, modelId));

  if (embedProvider) {
    // A provider (HTTP / GPU worker / fallback chain) bypasses the local
    // node-llama-cpp path entirely and dictates the model id.
    const providerModel = embedProvider.getModelId();
    const prompt = applyTemplate(providerModel);
    const request = { model: providerModel };
    // Only forward an AbortSignal when the provider is local-backed;
    // remote providers manage their own timeouts and an LLM-session signal
    // would abort their HTTP request prematurely (i-08ovbvtb).
    const signal = embedProvider.kind === "local" ? session?.signal : undefined;
    if (signal) {
      request.signal = signal;
    }
    const reply = await embedProvider.embed(prompt, request);
    return reply?.embedding ?? null;
  }

  const prompt = applyTemplate(model);
  const backend = session || (llmOverride ?? getDefaultLlamaCpp());
  const reply = await backend.embed(prompt, { model, isQuery });
  return reply?.embedding || null;
}
/**
 * List the content hashes of active documents that still lack an embedding.
 *
 * A hash counts as embedded once `content_vectors` holds its chunk with
 * `seq = 0`. Each row carries the hash, the document body and the smallest
 * path among the documents sharing that hash (for display purposes).
 *
 * @returns rows shaped `{ hash, body, path }`.
 */
export function getHashesForEmbedding(db) {
  const pendingHashesSql = `
    SELECT d.hash, c.doc as body, MIN(d.path) as path
    FROM documents d
    JOIN content c ON d.hash = c.hash
    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
    WHERE d.active = 1 AND v.hash IS NULL
    GROUP BY d.hash
  `;
  const statement = db.prepare(pendingHashesSql);
  return statement.all();
}
/**
 * Clear all embeddings from the database (force re-index).
 * Empties `content_vectors` first, then drops the `vectors_vec` table if it
 * exists.
 */
export function clearAllEmbeddings(db) {
  const resetStatements = [
    `DELETE FROM content_vectors`,
    `DROP TABLE IF EXISTS vectors_vec`,
  ];
  for (const sql of resetStatements) {
    db.exec(sql);
  }
}
/**
 * Get the distinct model identifiers recorded in `content_vectors`.
 *
 * Intended for the embedding migration-safety guard: when a configured
 * provider's model id is absent from a non-empty list, embedding is refused
 * and the user is asked to rebuild with `qmd embed -f`.
 *
 * @returns the non-empty string model ids; `[]` for a table without any
 *          (fresh DB), in which case any provider is allowed.
 */
export function getDistinctEmbeddingModels(db) {
  const rows = db.prepare(`SELECT DISTINCT model FROM content_vectors WHERE model IS NOT NULL`).all();
  const models = [];
  for (const row of rows) {
    const candidate = row.model;
    // Guard against non-text values and empty strings stored in the column.
    if (typeof candidate === "string" && candidate.length > 0) {
      models.push(candidate);
    }
  }
  return models;
}
/**
 * Store one chunk embedding in both `content_vectors` (metadata) and
 * `vectors_vec` (the vector itself, keyed by "<hash>_<seq>").
 *
 * The metadata row is written first, before the vector statements are even
 * prepared. getHashesForEmbedding only consults `content_vectors`, so a hash
 * whose metadata row exists is not selected again after a crash between the
 * two writes.
 *
 * `vectors_vec` is updated with DELETE + INSERT because sqlite-vec's vec0
 * virtual tables silently ignore the OR REPLACE conflict clause.
 *
 * @param db - database handle exposing `prepare(sql).run(...)`.
 * @param hash - content hash of the document.
 * @param seq - chunk sequence number within the document.
 * @param pos - chunk position stored alongside the metadata.
 * @param embedding - vector value bound into `vectors_vec`.
 * @param model - model identifier recorded with the metadata.
 * @param embeddedAt - timestamp recorded with the metadata.
 */
export function insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt) {
  // Metadata first — see the ordering note above.
  db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`)
    .run(hash, seq, pos, model, embeddedAt);

  const vectorKey = `${hash}_${seq}`;
  const removeStale = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
  const addFresh = db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
  removeStale.run(vectorKey);
  addFresh.run(vectorKey, embedding);
}
- // =============================================================================
- // Query expansion
- // =============================================================================
/**
 * Expand a query into typed variants via the LLM, with result caching.
 *
 * @param query - original query text.
 * @param model - model id; part of the cache key only (the LLM call itself
 *                does not receive it).
 * @param db - database handle used for the result cache.
 * @param intent - optional intent; included in the cache key when truthy and
 *                 always forwarded to the LLM.
 * @param llmOverride - optional LLM used instead of the default one.
 * @returns `{ type, query }` entries, excluding any whose text equals the
 *          original query.
 */
export async function expandQuery(query, model = DEFAULT_QUERY_MODEL, db, intent, llmOverride) {
  const keyFields = { query, model };
  if (intent) {
    keyFields.intent = intent;
  }
  const cacheKey = getCacheKey("expandQuery", keyFields);

  const cachedJson = getCachedResult(db, cacheKey);
  if (cachedJson) {
    try {
      const entries = JSON.parse(cachedJson);
      if (entries.length > 0) {
        const first = entries[0];
        if (first.query) {
          return entries;
        }
        // Older cache entries stored { type, text }; convert them on read.
        if (first.text) {
          return entries.map((entry) => ({ type: entry.type, query: entry.text }));
        }
      }
    }
    catch {
      // Unparseable legacy cache value (newline-separated text) — fall
      // through and expand again.
    }
  }

  const llm = llmOverride ?? getDefaultLlamaCpp();
  // Note: LlamaCpp uses a hardcoded model, so `model` is not passed along.
  const variants = await llm.expandQuery(query, { intent });
  const expanded = [];
  for (const variant of variants) {
    // Skip variants that merely repeat the original query text.
    if (variant.text !== query) {
      expanded.push({ type: variant.type, query: variant.text });
    }
  }
  // Empty expansions are not cached, so the next call tries again.
  if (expanded.length > 0) {
    setCachedResult(db, cacheKey, JSON.stringify(expanded));
  }
  return expanded;
}
- // =============================================================================
- // Reranking
- // =============================================================================
/**
 * Score documents against a query with the reranker, caching scores per
 * chunk text.
 *
 * @param query - user query.
 * @param documents - `{ file, text }` entries; `text` is the chunk sent to
 *                    the reranker.
 * @param model - reranker model id (part of the cache key, forwarded to the
 *                LLM call).
 * @param db - database handle used for the score cache.
 * @param intent - optional intent, prepended to the query so the reranker
 *                 scores with domain context.
 * @param llmOverride - optional LLM used instead of the default one.
 * @returns `{ file, score }` per input document, highest score first;
 *          documents without a score get 0.
 */
export async function rerank(query, documents, model = DEFAULT_RERANK_MODEL, db, intent, llmOverride) {
  const rerankQuery = intent ? `${intent}\n\n${query}` : query;
  // The key is built from the chunk text rather than the file: the score
  // depends on which chunk was sent, not on where it came from.
  const keyForChunk = (chunk) => getCacheKey("rerank", { query: rerankQuery, model, chunk });

  const scoreByChunk = new Map();
  const pendingByChunk = new Map();
  for (const { file, text } of documents) {
    const currentKey = keyForChunk(text);
    // Entries written before the key change also included the file path and
    // the query without the intent prefix.
    const legacyKey = getCacheKey("rerank", { query, file, model, chunk: text });
    const hit = getCachedResult(db, currentKey) ?? getCachedResult(db, legacyKey);
    if (hit !== null) {
      scoreByChunk.set(text, parseFloat(hit));
    }
    else {
      pendingByChunk.set(text, { file, text });
    }
  }

  if (pendingByChunk.size > 0) {
    const llm = llmOverride ?? getDefaultLlamaCpp();
    const pending = Array.from(pendingByChunk.values());
    const { results } = await llm.rerank(rerankQuery, pending, { model });
    // NOTE(review): results are mapped back to chunks by file, so two pending
    // chunks from the same file would collapse onto one chunk — confirm that
    // callers pass at most one chunk per file.
    const chunkByFile = new Map();
    for (const { file, text } of pending) {
      chunkByFile.set(file, text);
    }
    for (const { file, score } of results) {
      const chunk = chunkByFile.get(file) || "";
      setCachedResult(db, keyForChunk(chunk), score.toString());
      scoreByChunk.set(chunk, score);
    }
  }

  const scored = documents.map(({ file, text }) => ({ file, score: scoreByChunk.get(text) || 0 }));
  scored.sort((left, right) => right.score - left.score);
  return scored;
}
- // =============================================================================
- // Reciprocal Rank Fusion
- // =============================================================================
/**
 * Fuse several ranked result lists with Reciprocal Rank Fusion.
 *
 * Every appearance of a file adds `weight / (k + rank)` (rank counted from
 * 1) to its score. A file whose best rank is first gets a 0.05 bonus; a best
 * rank of second or third gets 0.02.
 *
 * @param resultLists - ranked lists of results carrying a `file` key; empty
 *                      slots are skipped.
 * @param weights - per-list weights, defaulting to 1.0 where missing.
 * @param k - RRF smoothing constant (default 60).
 * @returns one result per file (the first occurrence seen), with `score`
 *          replaced by the fused score, sorted descending.
 */
export function reciprocalRankFusion(resultLists, weights = [], k = 60) {
  const fused = new Map();
  for (const [listIdx, list] of resultLists.entries()) {
    if (!list) {
      continue;
    }
    const weight = weights[listIdx] ?? 1.0;
    for (const [rank, result] of list.entries()) {
      if (!result) {
        continue;
      }
      // `rank` is 0-based here, hence the +1.
      const contribution = weight / (k + rank + 1);
      const entry = fused.get(result.file);
      if (!entry) {
        fused.set(result.file, { result, rrfScore: contribution, topRank: rank });
        continue;
      }
      entry.rrfScore += contribution;
      entry.topRank = Math.min(entry.topRank, rank);
    }
  }

  const ranked = [];
  for (const entry of fused.values()) {
    // Top-rank bonus
    if (entry.topRank === 0) {
      entry.rrfScore += 0.05;
    }
    else if (entry.topRank <= 2) {
      entry.rrfScore += 0.02;
    }
    ranked.push(entry);
  }
  ranked.sort((left, right) => right.rrfScore - left.rrfScore);
  return ranked.map(({ result, rrfScore }) => ({ ...result, score: rrfScore }));
}
/**
 * Build per-document RRF contribution traces for explain/debug output.
 *
 * @param resultLists - ranked lists of results carrying `file` and `score`.
 * @param weights - per-list weights, defaulting to 1.0 where missing.
 * @param listMeta - per-list `{ source, queryType, query }`; a missing entry
 *                   falls back to `{ "fts", "original", "" }`.
 * @param k - RRF smoothing constant (default 60).
 * @returns a Map from file to `{ contributions, baseScore, topRank,
 *          topRankBonus, totalScore }`; ranks are 1-based.
 */
export function buildRrfTrace(resultLists, weights = [], listMeta = [], k = 60) {
  const traces = new Map();
  for (const [listIdx, list] of resultLists.entries()) {
    if (!list) {
      continue;
    }
    const weight = weights[listIdx] ?? 1.0;
    const { source, queryType, query } = listMeta[listIdx] ?? {
      source: "fts",
      queryType: "original",
      query: "",
    };
    list.forEach((result, position) => {
      if (!result) {
        return;
      }
      const rank = position + 1; // 1-indexed rank for explain output
      const rrfContribution = weight / (k + rank);
      const detail = {
        listIndex: listIdx,
        source,
        queryType,
        query,
        rank,
        weight,
        backendScore: result.score,
        rrfContribution,
      };
      const trace = traces.get(result.file);
      if (!trace) {
        traces.set(result.file, {
          contributions: [detail],
          baseScore: rrfContribution,
          topRank: rank,
          topRankBonus: 0,
          totalScore: 0,
        });
        return;
      }
      trace.baseScore += rrfContribution;
      trace.topRank = Math.min(trace.topRank, rank);
      trace.contributions.push(detail);
    });
  }

  // Bonus tiers: best rank 1 → 0.05, best rank 2 or 3 → 0.02, otherwise none.
  for (const trace of traces.values()) {
    let bonus = 0;
    if (trace.topRank === 1) {
      bonus = 0.05;
    }
    else if (trace.topRank <= 3) {
      bonus = 0.02;
    }
    trace.topRankBonus = bonus;
    trace.totalScore = trace.baseScore + bonus;
  }
  return traces;
}
/**
 * Find a document by filename/path, docid (#hash), or with fuzzy matching.
 * Returns document metadata; the body is included only when
 * `options.includeBody` is set.
 *
 * Supports:
 * - Virtual paths: qmd://collection/path/to/file.md
 * - Absolute paths: /path/to/file.md
 * - Relative paths: path/to/file.md
 * - Home-relative paths: ~/path/to/file.md
 * - Short docid: #abc123 (first 6 chars of hash)
 *
 * A trailing `:<line>` suffix on the input is ignored for the lookup.
 *
 * @param db - database handle exposing `prepare(sql).get(...)`.
 * @param filename - the reference to resolve.
 * @param options - `{ includeBody?: boolean }`.
 * @returns the document record, or `{ error: "not_found", query,
 *          similarFiles }` when nothing matched.
 */
export function findDocument(db, filename, options = {}) {
  let filepath = filename;
  const colonMatch = filepath.match(/:(\d+)$/);
  if (colonMatch) {
    filepath = filepath.slice(0, -colonMatch[0].length);
  }
  // Check if this is a docid lookup (#abc123, abc123, "#abc123", "abc123", etc.)
  if (isDocid(filepath)) {
    const docidMatch = findDocumentByDocid(db, filepath);
    if (!docidMatch) {
      return { error: "not_found", query: filename, similarFiles: [] };
    }
    filepath = docidMatch.filepath;
  }
  if (filepath.startsWith('~/')) {
    filepath = homedir() + filepath.slice(1);
  }
  const bodyCol = options.includeBody ? `, content.doc as body` : ``;
  // Build computed columns
  // Note: absoluteFilepath is computed from YAML collections after query
  const selectCols = `
    'qmd://' || d.collection || '/' || d.path as virtual_path,
    d.collection || '/' || d.path as display_path,
    d.title,
    d.hash,
    d.collection,
    d.modified_at,
    LENGTH(content.doc) as body_length
    ${bodyCol}
  `;
  // Try to match by virtual path first
  let doc = db.prepare(`
    SELECT ${selectCols}
    FROM documents d
    JOIN content ON content.hash = d.hash
    WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
  `).get(filepath);
  // Try fuzzy (suffix) match by virtual path
  if (!doc) {
    // Escape LIKE metacharacters so `_` and `%` in a file name match
    // literally: without this, `my_file.md` would also match `my-file.md`.
    const suffixPattern = `%${filepath.replace(/[\\%_]/g, '\\$&')}`;
    doc = db.prepare(`
      SELECT ${selectCols}
      FROM documents d
      JOIN content ON content.hash = d.hash
      WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? ESCAPE '\\' AND d.active = 1
      LIMIT 1
    `).get(suffixPattern);
  }
  // Try to match by absolute path (requires looking up collection paths from DB)
  if (!doc && !filepath.startsWith('qmd://')) {
    const collections = getStoreCollections(db);
    for (const coll of collections) {
      let relativePath = null;
      // If filepath is absolute and starts with collection path, extract relative part
      if (filepath.startsWith(coll.path + '/')) {
        relativePath = filepath.slice(coll.path.length + 1);
      }
      // Otherwise treat filepath as relative to collection
      else if (!filepath.startsWith('/')) {
        relativePath = filepath;
      }
      if (relativePath) {
        doc = db.prepare(`
          SELECT ${selectCols}
          FROM documents d
          JOIN content ON content.hash = d.hash
          WHERE d.collection = ? AND d.path = ? AND d.active = 1
        `).get(coll.name, relativePath);
        if (doc) {
          break;
        }
      }
    }
  }
  if (!doc) {
    const similar = findSimilarFiles(db, filepath, 5, 5);
    return { error: "not_found", query: filename, similarFiles: similar };
  }
  // Get context using virtual path
  const virtualPath = doc.virtual_path || `qmd://${doc.collection}/${doc.display_path}`;
  const context = getContextForFile(db, virtualPath);
  return {
    filepath: virtualPath,
    displayPath: doc.display_path,
    title: doc.title,
    context,
    hash: doc.hash,
    docid: getDocid(doc.hash),
    collectionName: doc.collection,
    modifiedAt: doc.modified_at,
    bodyLength: doc.body_length,
    ...(options.includeBody && doc.body !== undefined && { body: doc.body }),
  };
}
/**
 * Load the body text of a document, optionally restricted to a line range.
 *
 * The document is resolved by `doc.filepath`: first as a virtual path
 * (`qmd://collection/path`), then as an absolute path underneath one of the
 * collection directories.
 *
 * @param db - database handle exposing `prepare(sql).get(...)`.
 * @param doc - object carrying a `filepath`.
 * @param fromLine - 1-based first line to return (defaults to the first).
 * @param maxLines - maximum number of lines to return (defaults to all).
 * @returns the body (or the requested slice of it), or null when the
 *          document cannot be resolved.
 */
export function getDocumentBody(db, doc, fromLine, maxLines) {
  const { filepath } = doc;

  let row = null;
  if (filepath.startsWith('qmd://')) {
    row = db.prepare(`
      SELECT content.doc as body
      FROM documents d
      JOIN content ON content.hash = d.hash
      WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
    `).get(filepath);
  }

  // Fall back to treating the path as absolute, inside a collection root.
  if (!row) {
    for (const coll of getStoreCollections(db)) {
      const root = coll.path + '/';
      if (!filepath.startsWith(root)) {
        continue;
      }
      row = db.prepare(`
        SELECT content.doc as body
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE d.collection = ? AND d.path = ? AND d.active = 1
      `).get(coll.name, filepath.slice(coll.path.length + 1));
      if (row) {
        break;
      }
    }
  }
  if (!row) {
    return null;
  }

  const wantsSlice = fromLine !== undefined || maxLines !== undefined;
  if (!wantsSlice) {
    return row.body;
  }
  const lines = row.body.split('\n');
  const first = (fromLine || 1) - 1;
  const last = maxLines === undefined ? lines.length : first + maxLines;
  return lines.slice(first, last).join('\n');
}
/**
 * Find multiple documents by glob pattern or comma-separated list.
 *
 * A pattern that contains a comma and none of `*`, `?`, `{` is treated as a
 * comma-separated list of names; each name is resolved by exact virtual path
 * first, then by suffix match. Any other pattern is handed to
 * matchFilesByGlob.
 *
 * Documents are returned without body unless `options.includeBody` is set.
 * Documents whose body exceeds `options.maxBytes` (default
 * DEFAULT_MULTI_GET_MAX_BYTES) are reported as skipped.
 *
 * @param db - database handle exposing `prepare(sql).get/all(...)`.
 * @param pattern - glob pattern or comma-separated list of names.
 * @param options - `{ includeBody?: boolean, maxBytes?: number }`.
 * @returns `{ docs, errors }`; `docs` entries are `{ doc, skipped }` (plus
 *          `skipReason` when skipped) and `errors` holds one message per
 *          unresolved name or unmatched pattern.
 */
export function findDocuments(db, pattern, options = {}) {
  const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?') && !pattern.includes('{');
  const errors = [];
  const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES;
  const bodyCol = options.includeBody ? `, content.doc as body` : ``;
  const selectCols = `
    'qmd://' || d.collection || '/' || d.path as virtual_path,
    d.collection || '/' || d.path as display_path,
    d.title,
    d.hash,
    d.collection,
    d.modified_at,
    LENGTH(content.doc) as body_length
    ${bodyCol}
  `;
  let fileRows;
  if (isCommaSeparated) {
    const names = pattern.split(',').map((s) => s.trim()).filter(Boolean);
    fileRows = [];
    for (const name of names) {
      let doc = db.prepare(`
        SELECT ${selectCols}
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
      `).get(name);
      if (!doc) {
        // Escape LIKE metacharacters so `_` and `%` in a file name match
        // literally: without this, `my_file.md` would also match `my-file.md`.
        const suffixPattern = `%${name.replace(/[\\%_]/g, '\\$&')}`;
        doc = db.prepare(`
          SELECT ${selectCols}
          FROM documents d
          JOIN content ON content.hash = d.hash
          WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? ESCAPE '\\' AND d.active = 1
          LIMIT 1
        `).get(suffixPattern);
      }
      if (doc) {
        fileRows.push(doc);
      }
      else {
        const similar = findSimilarFiles(db, name, 5, 3);
        let msg = `File not found: ${name}`;
        if (similar.length > 0) {
          msg += ` (did you mean: ${similar.join(', ')}?)`;
        }
        errors.push(msg);
      }
    }
  }
  else {
    // Glob pattern match
    const matched = matchFilesByGlob(db, pattern);
    if (matched.length === 0) {
      errors.push(`No files matched pattern: ${pattern}`);
      return { docs: [], errors };
    }
    const virtualPaths = matched.map((m) => m.filepath);
    const placeholders = virtualPaths.map(() => '?').join(',');
    fileRows = db.prepare(`
      SELECT ${selectCols}
      FROM documents d
      JOIN content ON content.hash = d.hash
      WHERE 'qmd://' || d.collection || '/' || d.path IN (${placeholders}) AND d.active = 1
    `).all(...virtualPaths);
  }
  const results = [];
  for (const row of fileRows) {
    const virtualPath = row.virtual_path || `qmd://${row.collection}/${row.display_path}`;
    if (row.body_length > maxBytes) {
      // Oversized documents are only reported, so no context lookup is needed.
      results.push({
        doc: { filepath: virtualPath, displayPath: row.display_path },
        skipped: true,
        skipReason: `File too large (${Math.round(row.body_length / 1024)}KB > ${Math.round(maxBytes / 1024)}KB)`,
      });
      continue;
    }
    // Get context using virtual path
    const context = getContextForFile(db, virtualPath);
    results.push({
      doc: {
        filepath: virtualPath,
        displayPath: row.display_path,
        // Fall back to the file name when the document has no title.
        title: row.title || row.display_path.split('/').pop() || row.display_path,
        context,
        hash: row.hash,
        docid: getDocid(row.hash),
        collectionName: row.collection,
        modifiedAt: row.modified_at,
        bodyLength: row.body_length,
        ...(options.includeBody && row.body !== undefined && { body: row.body }),
      },
      skipped: false,
    });
  }
  return { docs: results, errors };
}
- // =============================================================================
- // Status
- // =============================================================================
/**
 * Summarize the index: per-collection document counts plus global totals.
 *
 * The `documents` table is the source of truth for which collections exist;
 * `getStoreCollections` only contributes the optional `path` / `pattern`
 * metadata (both `null` when a collection has no stored config).
 *
 * @param db - database handle exposing `prepare(sql)` with `.get()` / `.all()`
 * @returns {{ totalDocuments: number, needsEmbedding: *, hasVectorIndex: boolean, collections: object[] }}
 *   `collections` is sorted by `lastUpdated`, most recent first
 */
export function getStatus(db) {
    // DB is source of truth for collections — config provides supplementary metadata
    const dbCollections = db.prepare(`
    SELECT
      collection as name,
      COUNT(*) as active_count,
      MAX(modified_at) as last_doc_update
    FROM documents
    WHERE active = 1
    GROUP BY collection
  `).all();
    // Build a lookup from store_collections for path/pattern metadata
    const storeCollections = getStoreCollections(db);
    const configLookup = new Map(storeCollections.map(c => [c.name, { path: c.path, pattern: c.pattern }]));
    // One shared fallback timestamp: collections without a recorded update
    // time all compare equal, so their relative order stays stable instead of
    // depending on the millisecond at which each row happened to be mapped.
    const fallbackTimestamp = new Date().toISOString();
    const collections = dbCollections.map(row => {
        const config = configLookup.get(row.name);
        return {
            name: row.name,
            path: config?.path ?? null,
            pattern: config?.pattern ?? null,
            documents: row.active_count,
            lastUpdated: row.last_doc_update || fallbackTimestamp,
        };
    });
    // Sort by last update time (most recent first). `lastUpdated` is always
    // populated above, so no missing-value handling is needed here.
    collections.sort((a, b) => new Date(b.lastUpdated).getTime() - new Date(a.lastUpdated).getTime());
    const totalDocs = db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get().c;
    const needsEmbedding = getHashesNeedingEmbedding(db);
    const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    return {
        totalDocuments: totalDocs,
        needsEmbedding,
        hasVectorIndex: hasVectors,
        collections,
    };
}
/** Score added per matching intent term when ranking snippet lines (a query term adds 1.0). */
export const INTENT_WEIGHT_SNIPPET = 0.3;
/** Score added per matching intent term when picking a document's best chunk (a query term adds 1.0). */
export const INTENT_WEIGHT_CHUNK = 0.5;
// Words dropped from intent strings before they are used for matching.
// Seeded from finetune/reward.py KEY_TERM_STOPWORDS, then extended with common
// 2-3 character function words. That extension is what allows the minimum
// term length to be "> 1", so short domain terms (API, SQL, LLM, CPU, CDN, …)
// are kept.
const INTENT_STOP_WORDS = new Set([
    // 2-char function words
    "am", "an", "as", "at", "be", "by", "do", "he", "if",
    "in", "is", "it", "me", "my", "no", "of", "on", "or", "so",
    "to", "up", "us", "we",
    // 3-char function words
    "all", "and", "any", "are", "but", "can", "did", "for", "get",
    "has", "her", "him", "his", "how", "its", "let", "may", "not",
    "our", "out", "the", "too", "was", "who", "why", "you",
    // 4+ char common words
    "also", "does", "find", "from", "have", "into", "more", "need",
    "show", "some", "tell", "that", "them", "this", "want", "what",
    "when", "will", "with", "your",
    // Search-context noise
    "about", "looking", "notes", "search", "where", "which",
]);
/**
 * Turn a free-text intent into lowercase match terms.
 *
 * The intent is lowercased and split on whitespace; each token then has any
 * leading/trailing characters that are not Unicode letters or numbers removed
 * (inner punctuation such as the hyphen in "page-load" is kept). Tokens of
 * one character or less and tokens listed in INTENT_STOP_WORDS are discarded.
 *
 * @param {string} intent
 * @returns {string[]} surviving terms, in their original order (duplicates kept)
 */
export function extractIntentTerms(intent) {
    const terms = [];
    for (const rawToken of intent.toLowerCase().split(/\s+/)) {
        const token = rawToken.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, "");
        if (token.length <= 1 || INTENT_STOP_WORDS.has(token)) {
            continue;
        }
        terms.push(token);
    }
    return terms;
}
/**
 * Pick the most relevant few lines of a document for a query.
 *
 * Every line of the search region is scored: +1.0 for each query term it
 * contains and +INTENT_WEIGHT_SNIPPET for each intent term. The snippet is
 * the best-scoring line with one line before it and up to two after it,
 * truncated to `maxLen` characters and prefixed with a diff-style header
 * `@@ -start,count @@ (N before, M after)`.
 *
 * When `chunkPos` is a positive offset, only the chunk at that offset
 * (`chunkLen` characters, or CHUNK_SIZE_CHARS when omitted) plus 100
 * characters of padding on each side is searched. If that window yields a
 * blank snippet, the whole document is searched instead.
 *
 * @param {string} body - full document text
 * @param {string} query - whitespace-separated query terms
 * @param {number} [maxLen=500] - maximum snippet text length (header excluded)
 * @param {number} [chunkPos] - character offset of the chunk to focus on
 * @param {number} [chunkLen] - length of that chunk in characters
 * @param {string} [intent] - optional intent text providing secondary terms
 * @returns {{ line: number, snippet: string, linesBefore: number, linesAfter: number, snippetLines: number }}
 *   `line` is the 1-indexed document line of the best match
 */
export function extractSnippet(body, query, maxLen = 500, chunkPos, chunkLen, intent) {
    const totalLines = body.split('\n').length;
    let searchBody = body;
    let lineOffset = 0;
    if (chunkPos && chunkPos > 0) {
        // Search within the chunk region, with some padding for context
        // Use provided chunkLen or fall back to max chunk size (covers variable-length chunks)
        const searchLen = chunkLen || CHUNK_SIZE_CHARS;
        const contextStart = Math.max(0, chunkPos - 100);
        const contextEnd = Math.min(body.length, chunkPos + searchLen + 100);
        searchBody = body.slice(contextStart, contextEnd);
        if (contextStart > 0) {
            // Number of newlines before the window = lines skipped
            lineOffset = body.slice(0, contextStart).split('\n').length - 1;
        }
    }
    const lines = searchBody.split('\n');
    const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 0);
    const intentTerms = intent ? extractIntentTerms(intent) : [];
    let bestLine = 0, bestScore = -1;
    for (let i = 0; i < lines.length; i++) {
        const lineLower = (lines[i] ?? "").toLowerCase();
        let score = 0;
        for (const term of queryTerms) {
            if (lineLower.includes(term))
                score += 1.0;
        }
        for (const term of intentTerms) {
            if (lineLower.includes(term))
                score += INTENT_WEIGHT_SNIPPET;
        }
        // Strict ">" keeps the earliest line on ties
        if (score > bestScore) {
            bestScore = score;
            bestLine = i;
        }
    }
    const start = Math.max(0, bestLine - 1);
    const end = Math.min(lines.length, bestLine + 3);
    const snippetLines = lines.slice(start, end);
    let snippetText = snippetLines.join('\n');
    // If we focused on a chunk window and it produced an empty/whitespace-only snippet,
    // fall back to a full-document snippet so we always show something useful.
    if (chunkPos && chunkPos > 0 && snippetText.trim().length === 0) {
        return extractSnippet(body, query, maxLen, undefined, undefined, intent);
    }
    if (snippetText.length > maxLen) {
        let cut = maxLen - 3;
        // Never cut between the two halves of a surrogate pair: a dangling
        // high surrogate is not valid text and shows up as a replacement
        // character once the snippet is encoded or printed.
        if (cut > 0) {
            const lastUnit = snippetText.charCodeAt(cut - 1);
            if (lastUnit >= 0xD800 && lastUnit <= 0xDBFF)
                cut -= 1;
        }
        snippetText = snippetText.substring(0, cut) + "...";
    }
    const absoluteStart = lineOffset + start + 1; // 1-indexed
    const snippetLineCount = snippetLines.length;
    const linesBefore = absoluteStart - 1;
    const linesAfter = totalLines - (absoluteStart + snippetLineCount - 1);
    // Format with diff-style header: @@ -start,count @@ (linesBefore before, linesAfter after)
    const header = `@@ -${absoluteStart},${snippetLineCount} @@ (${linesBefore} before, ${linesAfter} after)`;
    const snippet = `${header}\n${snippetText}`;
    return {
        line: lineOffset + bestLine + 1,
        snippet,
        linesBefore,
        linesAfter,
        snippetLines: snippetLineCount,
    };
}
- // =============================================================================
- // Shared helpers (used by both CLI and MCP)
- // =============================================================================
/**
 * Prefix every line of `text` with its line number.
 *
 * The text is split on "\n"; the line at index i is rendered as
 * "{startLine + i}: {content}" and the lines are joined back with "\n".
 *
 * @param {string} text - content to number
 * @param {number} [startLine=1] - number given to the first line
 * @returns {string} the numbered text
 */
export function addLineNumbers(text, startLine = 1) {
    const rows = text.split('\n');
    const numbered = new Array(rows.length);
    for (let idx = 0; idx < rows.length; idx++) {
        numbered[idx] = `${startLine + idx}: ${rows[idx]}`;
    }
    return numbered.join('\n');
}
/**
 * Hybrid search: BM25 + vector + query expansion + RRF + chunked reranking.
 *
 * Pipeline:
 * 1. BM25 probe → skip expansion if strong signal (never skipped when an
 *    intent is supplied)
 * 2. expandQuery() → typed query variants (lex/vec/hyde)
 * 3. Type-routed search: original→FTS+vector, lex→FTS, vec/hyde→vector
 * 4. RRF fusion (lists produced by the original query weighted 2x) → slice
 *    to candidateLimit
 * 5. chunkDocumentAsync() + keyword-best-chunk selection
 * 6. rerank on chunks (NOT full bodies — O(tokens) trap)
 * 7. Position-aware score blending (RRF rank × reranker score)
 * 8. Dedup by file, filter by minScore, slice to limit
 *
 * @param store - search store (`db`, `searchFTS`, `searchVec`, `expandQuery`,
 *   `rerank`, `getContextForFile`)
 * @param {string} query - the user's query
 * @param {object} [options] - `limit` (10), `minScore` (0), `candidateLimit`
 *   (RERANK_CANDIDATE_LIMIT), `collection`, `explain`, `intent`,
 *   `skipRerank`, `hooks`, `embedProvider`, `chunkStrategy`
 * @returns {Promise<object[]>} results sorted by score, one per file, each
 *   with `file`, `displayPath`, `title`, `body`, `bestChunk`, `bestChunkPos`,
 *   `score`, `context`, `docid` and, when `explain` is set, `explain`
 */
export async function hybridQuery(store, query, options) {
    const limit = options?.limit ?? 10;
    const minScore = options?.minScore ?? 0;
    const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
    const collection = options?.collection;
    const explain = options?.explain ?? false;
    const intent = options?.intent;
    const skipRerank = options?.skipRerank ?? false;
    const hooks = options?.hooks;
    const embedProvider = options?.embedProvider;
    const rankedLists = [];
    const rankedListMeta = []; // parallel to rankedLists
    const docidMap = new Map(); // filepath -> docid
    const hasVectors = !!store.db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    // Registers one non-empty backend result list for fusion and remembers
    // the docid of every file it mentions.
    const addRankedList = (results, meta) => {
        if (results.length === 0)
            return;
        for (const r of results)
            docidMap.set(r.filepath, r.docid);
        rankedLists.push(results.map(r => ({
            file: r.filepath, displayPath: r.displayPath,
            title: r.title, body: r.body || "", score: r.score,
        })));
        rankedListMeta.push(meta);
    };
    // Step 1: BM25 probe — strong signal skips expensive LLM expansion
    // When intent is provided, disable strong-signal bypass — the obvious BM25
    // match may not be what the caller wants (e.g. "performance" with intent
    // "web page load times" should NOT shortcut to a sports-performance doc).
    // Pass collection directly into FTS query (filter at SQL level, not post-hoc)
    const initialFts = store.searchFTS(query, 20, collection);
    const topScore = initialFts[0]?.score ?? 0;
    const secondScore = initialFts[1]?.score ?? 0;
    const hasStrongSignal = !intent && initialFts.length > 0
        && topScore >= STRONG_SIGNAL_MIN_SCORE
        && (topScore - secondScore) >= STRONG_SIGNAL_MIN_GAP;
    if (hasStrongSignal)
        hooks?.onStrongSignal?.(topScore);
    // Step 2: Expand query (or skip if strong signal)
    hooks?.onExpandStart?.();
    const expandStart = Date.now();
    const expanded = hasStrongSignal
        ? []
        : await store.expandQuery(query, undefined, intent);
    hooks?.onExpand?.(query, expanded, Date.now() - expandStart);
    // Seed with initial FTS results (avoid re-running original query FTS)
    addRankedList(initialFts, { source: "fts", queryType: "original", query });
    // Step 3: Route searches by query type
    //
    // Strategy: run all FTS queries immediately (they're sync/instant), then
    // batch-embed all vector queries in one embedBatch() call, then run
    // sqlite-vec lookups with pre-computed embeddings.
    // 3a: Run FTS for all lex expansions right away (no LLM needed)
    for (const q of expanded) {
        if (q.type === 'lex') {
            const ftsResults = store.searchFTS(q.query, 20, collection);
            addRankedList(ftsResults, { source: "fts", queryType: "lex", query: q.query });
        }
    }
    // 3b: Collect all texts that need vector search (original query + vec/hyde expansions)
    if (hasVectors) {
        const vecQueries = [
            { text: query, queryType: "original" },
        ];
        for (const q of expanded) {
            if (q.type === 'vec' || q.type === 'hyde') {
                vecQueries.push({ text: q.query, queryType: q.type });
            }
        }
        // Batch embed all vector queries in a single call.
        // When `embedProvider` is supplied (i-loazq6ze), route the encode through
        // it (HTTP / GPU worker / AutoFallback chain) instead of warming the
        // local llama-cpp model — this is the whole point of the GPU worker.
        const embedModelName = embedProvider
            ? embedProvider.getModelId()
            : getLlm(store).embedModelName;
        const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, embedModelName));
        hooks?.onEmbedStart?.(textsToEmbed.length);
        const embedStart = Date.now();
        const embeddings = embedProvider
            ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName })
            : await getLlm(store).embedBatch(textsToEmbed);
        hooks?.onEmbedDone?.(Date.now() - embedStart);
        // Run sqlite-vec lookups with pre-computed embeddings
        for (let i = 0; i < vecQueries.length; i++) {
            const embedding = embeddings[i]?.embedding;
            if (!embedding)
                continue;
            const vecResults = await store.searchVec(vecQueries[i].text, DEFAULT_EMBED_MODEL, 20, collection, undefined, embedding);
            addRankedList(vecResults, {
                source: "vec",
                queryType: vecQueries[i].queryType,
                query: vecQueries[i].text,
            });
        }
    }
    // Step 4: RRF fusion — the lists produced by the ORIGINAL query (its FTS
    // list and its vector list) get 2x weight. The weight is derived from the
    // list metadata rather than from list position: lex-expansion lists are
    // pushed before any vector list, and the original FTS list is absent when
    // the probe returned nothing, so position does not identify these lists.
    const weights = rankedListMeta.map(meta => meta.queryType === "original" ? 2.0 : 1.0);
    const fused = reciprocalRankFusion(rankedLists, weights);
    const rrfTraceByFile = explain ? buildRrfTrace(rankedLists, weights, rankedListMeta) : null;
    const candidates = fused.slice(0, candidateLimit);
    if (candidates.length === 0)
        return [];
    // Step 5: Chunk documents, pick best chunk per doc for reranking.
    // Reranking full bodies is O(tokens) — the critical perf lesson that motivated this refactor.
    const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
    const intentTerms = intent ? extractIntentTerms(intent) : [];
    const docChunkMap = new Map();
    const chunkStrategy = options?.chunkStrategy;
    for (const cand of candidates) {
        const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, chunkStrategy);
        if (chunks.length === 0)
            continue;
        // Pick chunk with most keyword overlap (fallback: first chunk)
        // Intent terms contribute at INTENT_WEIGHT_CHUNK (0.5) relative to query terms (1.0)
        let bestIdx = 0;
        let bestScore = -1;
        for (let i = 0; i < chunks.length; i++) {
            const chunkLower = chunks[i].text.toLowerCase();
            let score = queryTerms.reduce((acc, term) => acc + (chunkLower.includes(term) ? 1 : 0), 0);
            for (const term of intentTerms) {
                if (chunkLower.includes(term))
                    score += INTENT_WEIGHT_CHUNK;
            }
            if (score > bestScore) {
                bestScore = score;
                bestIdx = i;
            }
        }
        docChunkMap.set(cand.file, { chunks, bestIdx });
    }
    // Builds the optional per-result explain payload (undefined unless `explain`).
    const buildExplain = (file, rrfRank, rrfScore, rrfWeight, rerankScore, blendedScore) => {
        if (!explain)
            return undefined;
        const trace = rrfTraceByFile?.get(file);
        return {
            ftsScores: trace?.contributions.filter(c => c.source === "fts").map(c => c.backendScore) ?? [],
            vectorScores: trace?.contributions.filter(c => c.source === "vec").map(c => c.backendScore) ?? [],
            rrf: {
                rank: rrfRank,
                positionScore: rrfScore,
                weight: rrfWeight,
                baseScore: trace?.baseScore ?? 0,
                topRankBonus: trace?.topRankBonus ?? 0,
                totalScore: trace?.totalScore ?? 0,
                contributions: trace?.contributions ?? [],
            },
            rerankScore,
            blendedScore,
        };
    };
    // Keeps the first row per file, then applies minScore and limit.
    const finalize = (rows) => {
        const seenFiles = new Set();
        return rows
            .filter(r => {
            if (seenFiles.has(r.file))
                return false;
            seenFiles.add(r.file);
            return true;
        })
            .filter(r => r.score >= minScore)
            .slice(0, limit);
    };
    if (skipRerank) {
        // Skip LLM reranking — return candidates scored by RRF only
        return finalize(candidates.map((cand, i) => {
            const chunkInfo = docChunkMap.get(cand.file);
            const bestIdx = chunkInfo?.bestIdx ?? 0;
            const bestChunk = chunkInfo?.chunks[bestIdx]?.text || cand.body || "";
            const bestChunkPos = chunkInfo?.chunks[bestIdx]?.pos || 0;
            const rrfRank = i + 1;
            const rrfScore = 1 / rrfRank;
            const explainData = buildExplain(cand.file, rrfRank, rrfScore, 1.0, 0, rrfScore);
            return {
                file: cand.file,
                displayPath: cand.displayPath,
                title: cand.title,
                body: cand.body,
                bestChunk,
                bestChunkPos,
                score: rrfScore,
                context: store.getContextForFile(cand.file),
                docid: docidMap.get(cand.file) || "",
                ...(explainData ? { explain: explainData } : {}),
            };
        }));
    }
    // Step 6: Rerank chunks (NOT full bodies)
    const chunksToRerank = [];
    for (const cand of candidates) {
        const chunkInfo = docChunkMap.get(cand.file);
        if (chunkInfo) {
            chunksToRerank.push({ file: cand.file, text: chunkInfo.chunks[chunkInfo.bestIdx].text });
        }
    }
    hooks?.onRerankStart?.(chunksToRerank.length);
    const rerankStart = Date.now();
    const reranked = await store.rerank(query, chunksToRerank, undefined, intent);
    hooks?.onRerankDone?.(Date.now() - rerankStart);
    // Step 7: Blend RRF position score with reranker score
    // Position-aware weights: top retrieval results get more protection from reranker disagreement
    const candidateMap = new Map(candidates.map(c => [c.file, {
            displayPath: c.displayPath, title: c.title, body: c.body,
        }]));
    const rrfRankMap = new Map(candidates.map((c, i) => [c.file, i + 1]));
    const blended = reranked.map(r => {
        const rrfRank = rrfRankMap.get(r.file) || candidateLimit;
        let rrfWeight;
        if (rrfRank <= 3)
            rrfWeight = 0.75;
        else if (rrfRank <= 10)
            rrfWeight = 0.60;
        else
            rrfWeight = 0.40;
        const rrfScore = 1 / rrfRank;
        const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * r.score;
        const candidate = candidateMap.get(r.file);
        const chunkInfo = docChunkMap.get(r.file);
        const bestIdx = chunkInfo?.bestIdx ?? 0;
        const bestChunk = chunkInfo?.chunks[bestIdx]?.text || candidate?.body || "";
        const bestChunkPos = chunkInfo?.chunks[bestIdx]?.pos || 0;
        const explainData = buildExplain(r.file, rrfRank, rrfScore, rrfWeight, r.score, blendedScore);
        return {
            file: r.file,
            displayPath: candidate?.displayPath || "",
            title: candidate?.title || "",
            body: candidate?.body || "",
            bestChunk,
            bestChunkPos,
            score: blendedScore,
            context: store.getContextForFile(r.file),
            docid: docidMap.get(r.file) || "",
            ...(explainData ? { explain: explainData } : {}),
        };
    }).sort((a, b) => b.score - a.score);
    // Step 8: Dedup by file (safety net — prevents duplicate output)
    return finalize(blended);
}
/**
 * Vector-only semantic search with query expansion.
 *
 * Pipeline:
 * 1. expandQuery() → typed variants, filter to vec/hyde only (lex irrelevant here)
 * 2. searchVec() for original + vec/hyde variants (sequential — node-llama-cpp embed limitation)
 * 3. Dedup by filepath (keep max score)
 * 4. Sort by score descending, filter by minScore, slice to limit
 * 5. Attach context to the rows that are actually returned
 *
 * Returns an empty list when the `vectors_vec` table does not exist.
 *
 * @param store - search store (`db`, `expandQuery`, `searchVec`, `getContextForFile`)
 * @param {string} query - the user's query
 * @param {object} [options] - `limit` (10), `minScore` (0.3), `collection`,
 *   `intent`, `embedProvider`, `hooks` (`onExpand`)
 * @returns {Promise<object[]>} rows with `file`, `displayPath`, `title`,
 *   `body`, `score`, `context`, `docid`
 */
export async function vectorSearchQuery(store, query, options) {
    const limit = options?.limit ?? 10;
    const minScore = options?.minScore ?? 0.3;
    const collection = options?.collection;
    const intent = options?.intent;
    const embedProvider = options?.embedProvider;
    const hasVectors = !!store.db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (!hasVectors)
        return [];
    // Expand query — filter to vec/hyde only (lex queries target FTS, not vector)
    const expandStart = Date.now();
    const allExpanded = await store.expandQuery(query, undefined, intent);
    const vecExpanded = allExpanded.filter(q => q.type !== 'lex');
    options?.hooks?.onExpand?.(query, vecExpanded, Date.now() - expandStart);
    // Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs.
    // When `embedProvider` is supplied (i-loazq6ze), query encoding is routed
    // through it; the per-call signature `searchVec(...)` accepts the provider
    // as the trailing argument so existing tests / callers stay untouched.
    const queryTexts = [query, ...vecExpanded.map(q => q.query)];
    // filepath -> best-scoring raw hit seen so far
    const bestHitByFile = new Map();
    for (const q of queryTexts) {
        const vecResults = await store.searchVec(q, DEFAULT_EMBED_MODEL, limit, collection, undefined, undefined, embedProvider);
        for (const r of vecResults) {
            const existing = bestHitByFile.get(r.filepath);
            if (!existing || r.score > existing.score) {
                bestHitByFile.set(r.filepath, r);
            }
        }
    }
    // Context is resolved only for the rows that survive dedup, minScore and
    // limit, instead of once per raw hit of every query variant.
    return Array.from(bestHitByFile.values())
        .sort((a, b) => b.score - a.score)
        .filter(r => r.score >= minScore)
        .slice(0, limit)
        .map(r => ({
        file: r.filepath,
        displayPath: r.displayPath,
        title: r.title,
        body: r.body || "",
        score: r.score,
        context: store.getContextForFile(r.filepath),
        docid: r.docid,
    }));
}
/**
 * Structured search: execute pre-expanded queries without LLM query expansion.
 *
 * Designed for LLM callers (MCP/HTTP) that generate their own query expansions.
 * Skips the internal expandQuery() step — goes directly to:
 *
 * Pipeline:
 * 1. Route searches: lex→FTS, vec/hyde→vector (batch embed)
 * 2. RRF fusion across all result lists (the caller's most important search
 *    that produced results is weighted 2x)
 * 3. Chunk documents + keyword-best-chunk selection
 * 4. Rerank on chunks
 * 5. Position-aware score blending
 * 6. Dedup, filter, slice
 *
 * This is the recommended endpoint for capable LLMs — they can generate
 * better query variations than our small local model, especially for
 * domain-specific or nuanced queries.
 *
 * @param store - search store (`db`, `searchFTS`, `searchVec`, `rerank`,
 *   `getContextForFile`)
 * @param {Array<{ type: string, query: string, line?: number }>} searches -
 *   typed queries, ordered by importance
 * @param {object} [options] - `limit` (10), `minScore` (0), `candidateLimit`
 *   (RERANK_CANDIDATE_LIMIT), `collections`, `explain`, `intent`,
 *   `skipRerank`, `hooks`, `embedProvider`, `chunkStrategy`
 * @returns {Promise<object[]>} results sorted by score, one per file
 * @throws {Error} when a query contains a newline or fails
 *   `validateLexQuery` / `validateSemanticQuery`
 */
export async function structuredSearch(store, searches, options) {
    const limit = options?.limit ?? 10;
    const minScore = options?.minScore ?? 0;
    const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
    const explain = options?.explain ?? false;
    const intent = options?.intent;
    const skipRerank = options?.skipRerank ?? false;
    const hooks = options?.hooks;
    const embedProvider = options?.embedProvider;
    const collections = options?.collections;
    if (searches.length === 0)
        return [];
    // Validate queries before executing
    for (const search of searches) {
        const location = search.line ? `Line ${search.line}` : 'Structured search';
        if (/[\r\n]/.test(search.query)) {
            throw new Error(`${location} (${search.type}): queries must be single-line. Remove newline characters.`);
        }
        if (search.type === 'lex') {
            const error = validateLexQuery(search.query);
            if (error) {
                throw new Error(`${location} (lex): ${error}`);
            }
        }
        else if (search.type === 'vec' || search.type === 'hyde') {
            const error = validateSemanticQuery(search.query);
            if (error) {
                throw new Error(`${location} (${search.type}): ${error}`);
            }
        }
    }
    const rankedLists = [];
    const rankedListMeta = []; // parallel to rankedLists
    const rankedListSearchIdx = []; // parallel to rankedLists: index into `searches`
    const docidMap = new Map(); // filepath -> docid
    const hasVectors = !!store.db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    // Registers one non-empty backend result list for fusion, remembering
    // which caller search produced it and the docid of every file in it.
    const addRankedList = (results, meta, searchIdx) => {
        if (results.length === 0)
            return;
        for (const r of results)
            docidMap.set(r.filepath, r.docid);
        rankedLists.push(results.map(r => ({
            file: r.filepath, displayPath: r.displayPath,
            title: r.title, body: r.body || "", score: r.score,
        })));
        rankedListMeta.push(meta);
        rankedListSearchIdx.push(searchIdx);
    };
    // Helper to run search across collections (or all if undefined)
    const collectionList = collections ?? [undefined]; // undefined = all collections
    // Step 1: Run FTS for all lex searches (sync, instant)
    for (const [searchIdx, search] of searches.entries()) {
        if (search.type === 'lex') {
            for (const coll of collectionList) {
                const ftsResults = store.searchFTS(search.query, 20, coll);
                addRankedList(ftsResults, {
                    source: "fts",
                    queryType: "lex",
                    query: search.query,
                }, searchIdx);
            }
        }
    }
    // Step 2: Batch embed and run vector searches for vec/hyde
    if (hasVectors) {
        const vecSearches = [];
        for (const [searchIdx, search] of searches.entries()) {
            if (search.type === 'vec' || search.type === 'hyde')
                vecSearches.push({ search, searchIdx });
        }
        if (vecSearches.length > 0) {
            // Route batch encoding through the supplied EmbeddingProvider when
            // present (i-loazq6ze). Otherwise fall back to the local llama-cpp
            // singleton — preserves pre-patch behavior for callers that don't
            // configure a provider.
            const embedModelName = embedProvider
                ? embedProvider.getModelId()
                : getLlm(store).embedModelName;
            const textsToEmbed = vecSearches.map(v => formatQueryForEmbedding(v.search.query, embedModelName));
            hooks?.onEmbedStart?.(textsToEmbed.length);
            const embedStart = Date.now();
            const embeddings = embedProvider
                ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName })
                : await getLlm(store).embedBatch(textsToEmbed);
            hooks?.onEmbedDone?.(Date.now() - embedStart);
            for (let i = 0; i < vecSearches.length; i++) {
                const embedding = embeddings[i]?.embedding;
                if (!embedding)
                    continue;
                const { search, searchIdx } = vecSearches[i];
                for (const coll of collectionList) {
                    const vecResults = await store.searchVec(search.query, DEFAULT_EMBED_MODEL, 20, coll, undefined, embedding);
                    addRankedList(vecResults, {
                        source: "vec",
                        queryType: search.type,
                        query: search.query,
                    }, searchIdx);
                }
            }
        }
    }
    if (rankedLists.length === 0)
        return [];
    // Step 3: RRF fusion — the caller orders searches by importance, so the
    // earliest search that produced any results gets 2x weight. Lists are
    // pushed lex-first (and once per collection), so list position does not
    // reflect caller order; the originating search index is used instead.
    const leadSearchIdx = Math.min(...rankedListSearchIdx);
    const weights = rankedListSearchIdx.map(idx => idx === leadSearchIdx ? 2.0 : 1.0);
    const fused = reciprocalRankFusion(rankedLists, weights);
    const rrfTraceByFile = explain ? buildRrfTrace(rankedLists, weights, rankedListMeta) : null;
    const candidates = fused.slice(0, candidateLimit);
    if (candidates.length === 0)
        return [];
    hooks?.onExpand?.("", [], 0); // Signal no expansion (pre-expanded)
    // Step 4: Chunk documents, pick best chunk per doc for reranking
    // Use first lex query as the "query" for keyword matching, or first vec if no lex
    const primaryQuery = searches.find(s => s.type === 'lex')?.query
        || searches.find(s => s.type === 'vec')?.query
        || searches[0]?.query || "";
    const queryTerms = primaryQuery.toLowerCase().split(/\s+/).filter(t => t.length > 2);
    const intentTerms = intent ? extractIntentTerms(intent) : [];
    const docChunkMap = new Map();
    const ssChunkStrategy = options?.chunkStrategy;
    for (const cand of candidates) {
        const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, ssChunkStrategy);
        if (chunks.length === 0)
            continue;
        // Pick chunk with most keyword overlap
        // Intent terms contribute at INTENT_WEIGHT_CHUNK (0.5) relative to query terms (1.0)
        let bestIdx = 0;
        let bestScore = -1;
        for (let i = 0; i < chunks.length; i++) {
            const chunkLower = chunks[i].text.toLowerCase();
            let score = queryTerms.reduce((acc, term) => acc + (chunkLower.includes(term) ? 1 : 0), 0);
            for (const term of intentTerms) {
                if (chunkLower.includes(term))
                    score += INTENT_WEIGHT_CHUNK;
            }
            if (score > bestScore) {
                bestScore = score;
                bestIdx = i;
            }
        }
        docChunkMap.set(cand.file, { chunks, bestIdx });
    }
    // Builds the optional per-result explain payload (undefined unless `explain`).
    const buildExplain = (file, rrfRank, rrfScore, rrfWeight, rerankScore, blendedScore) => {
        if (!explain)
            return undefined;
        const trace = rrfTraceByFile?.get(file);
        return {
            ftsScores: trace?.contributions.filter(c => c.source === "fts").map(c => c.backendScore) ?? [],
            vectorScores: trace?.contributions.filter(c => c.source === "vec").map(c => c.backendScore) ?? [],
            rrf: {
                rank: rrfRank,
                positionScore: rrfScore,
                weight: rrfWeight,
                baseScore: trace?.baseScore ?? 0,
                topRankBonus: trace?.topRankBonus ?? 0,
                totalScore: trace?.totalScore ?? 0,
                contributions: trace?.contributions ?? [],
            },
            rerankScore,
            blendedScore,
        };
    };
    // Keeps the first row per file, then applies minScore and limit.
    const finalize = (rows) => {
        const seenFiles = new Set();
        return rows
            .filter(r => {
            if (seenFiles.has(r.file))
                return false;
            seenFiles.add(r.file);
            return true;
        })
            .filter(r => r.score >= minScore)
            .slice(0, limit);
    };
    if (skipRerank) {
        // Skip LLM reranking — return candidates scored by RRF only
        return finalize(candidates.map((cand, i) => {
            const chunkInfo = docChunkMap.get(cand.file);
            const bestIdx = chunkInfo?.bestIdx ?? 0;
            const bestChunk = chunkInfo?.chunks[bestIdx]?.text || cand.body || "";
            const bestChunkPos = chunkInfo?.chunks[bestIdx]?.pos || 0;
            const rrfRank = i + 1;
            const rrfScore = 1 / rrfRank;
            const explainData = buildExplain(cand.file, rrfRank, rrfScore, 1.0, 0, rrfScore);
            return {
                file: cand.file,
                displayPath: cand.displayPath,
                title: cand.title,
                body: cand.body,
                bestChunk,
                bestChunkPos,
                score: rrfScore,
                context: store.getContextForFile(cand.file),
                docid: docidMap.get(cand.file) || "",
                ...(explainData ? { explain: explainData } : {}),
            };
        }));
    }
    // Step 5: Rerank chunks
    const chunksToRerank = [];
    for (const cand of candidates) {
        const chunkInfo = docChunkMap.get(cand.file);
        if (chunkInfo) {
            chunksToRerank.push({ file: cand.file, text: chunkInfo.chunks[chunkInfo.bestIdx].text });
        }
    }
    hooks?.onRerankStart?.(chunksToRerank.length);
    const rerankStart2 = Date.now();
    const reranked = await store.rerank(primaryQuery, chunksToRerank, undefined, intent);
    hooks?.onRerankDone?.(Date.now() - rerankStart2);
    // Step 6: Blend RRF position score with reranker score
    const candidateMap = new Map(candidates.map(c => [c.file, {
            displayPath: c.displayPath, title: c.title, body: c.body,
        }]));
    const rrfRankMap = new Map(candidates.map((c, i) => [c.file, i + 1]));
    const blended = reranked.map(r => {
        const rrfRank = rrfRankMap.get(r.file) || candidateLimit;
        let rrfWeight;
        if (rrfRank <= 3)
            rrfWeight = 0.75;
        else if (rrfRank <= 10)
            rrfWeight = 0.60;
        else
            rrfWeight = 0.40;
        const rrfScore = 1 / rrfRank;
        const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * r.score;
        const candidate = candidateMap.get(r.file);
        const chunkInfo = docChunkMap.get(r.file);
        const bestIdx = chunkInfo?.bestIdx ?? 0;
        const bestChunk = chunkInfo?.chunks[bestIdx]?.text || candidate?.body || "";
        const bestChunkPos = chunkInfo?.chunks[bestIdx]?.pos || 0;
        const explainData = buildExplain(r.file, rrfRank, rrfScore, rrfWeight, r.score, blendedScore);
        return {
            file: r.file,
            displayPath: candidate?.displayPath || "",
            title: candidate?.title || "",
            body: candidate?.body || "",
            bestChunk,
            bestChunkPos,
            score: blendedScore,
            context: store.getContextForFile(r.file),
            docid: docidMap.get(r.file) || "",
            ...(explainData ? { explain: explainData } : {}),
        };
    }).sort((a, b) => b.score - a.score);
    // Step 7: Dedup by file
    return finalize(blended);
}
|