// suby / qmd
/**
 * QMD Store - Core data access and retrieval functions
 *
 * This module provides all database operations, search functions, and document
 * retrieval for QMD. It returns raw data structures that can be formatted by
 * CLI or MCP consumers.
 *
 * Usage:
 *   const store = createStore("/path/to/db.sqlite");
 *   // or use default path:
 *   const store = createStore();
 */
import { openDatabase, loadSqliteVec } from "./db.js";
import picomatch from "picomatch";
import { createHash } from "crypto";
import { readFileSync, realpathSync, statSync, mkdirSync } from "node:fs";
// Note: node:path resolve is not imported — we export our own cross-platform resolve()
import fastGlob from "fast-glob";
import { LlamaCpp, getDefaultLlamaCpp, formatQueryForEmbedding, formatDocForEmbedding, withLLMSessionForLlm, } from "./llm.js";
import { assertModelCompatible, } from "./embedding/provider.js";
// =============================================================================
// Configuration
// =============================================================================
// Home directory taken from $HOME; "/tmp" is used when HOME is unset or empty.
const HOME = process.env.HOME || "/tmp";
// Default model identifiers for embedding, reranking and query handling.
// NOTE(review): where these are consumed is not visible in this section.
export const DEFAULT_EMBED_MODEL = "embeddinggemma";
export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
// Default glob: every ".md" file at any directory depth.
export const DEFAULT_GLOB = "**/*.md";
export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB
// Embedding batch limits: document count and total bytes per batch.
export const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
export const DEFAULT_EMBED_MAX_BATCH_BYTES = 64 * 1024 * 1024; // 64MB
// Chunking: 900 tokens per chunk with 15% overlap
// Increased from 800 to accommodate smart chunking finding natural break points
export const CHUNK_SIZE_TOKENS = 900;
export const CHUNK_OVERLAP_TOKENS = Math.floor(CHUNK_SIZE_TOKENS * 0.15); // 135 tokens (15% overlap)
// Fallback char-based approximation for sync chunking (~4 chars per token)
export const CHUNK_SIZE_CHARS = CHUNK_SIZE_TOKENS * 4; // 3600 chars
export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4; // 540 chars
// Search window for finding optimal break points (in tokens, ~200 tokens).
// The char variant uses the same ~4 chars per token approximation as above.
export const CHUNK_WINDOW_TOKENS = 200;
export const CHUNK_WINDOW_CHARS = CHUNK_WINDOW_TOKENS * 4; // 800 chars
/**
 * Pick the LlamaCpp instance to use for a store.
 *
 * @param store - Store object; its `llm` property is used when it is neither
 *   null nor undefined.
 * @returns `store.llm` when set, otherwise the result of getDefaultLlamaCpp().
 */
function getLlm(store) {
    const ownLlm = store.llm;
    if (ownLlm === null || ownLlm === undefined) {
        // No per-store instance: fall back to the shared default.
        return getDefaultLlamaCpp();
    }
    return ownLlm;
}
/**
 * Patterns for detecting break points in markdown documents.
 * Each entry is a tuple of [global regex, score, type label].
 * Higher scores indicate better places to split.
 * Scores are spread wide so headings decisively beat lower-quality breaks.
 *
 * Every regex starts with "\n", so a match position is the index of the
 * newline that precedes the marker.
 *
 * scanBreakPoints() keeps the highest score per position and only replaces an
 * existing entry on a strictly higher score, so entry order only matters when
 * two patterns with equal scores match the same position (the earlier wins).
 */
export const BREAK_PATTERNS = [
    [/\n#{1}(?!#)/g, 100, 'h1'], // # but not ##
    [/\n#{2}(?!#)/g, 90, 'h2'], // ## but not ###
    [/\n#{3}(?!#)/g, 80, 'h3'], // ### but not ####
    [/\n#{4}(?!#)/g, 70, 'h4'], // #### but not #####
    [/\n#{5}(?!#)/g, 60, 'h5'], // ##### but not ######
    [/\n#{6}(?!#)/g, 50, 'h6'], // ######
    [/\n```/g, 80, 'codeblock'], // code block boundary (same as h3)
    [/\n(?:---|\*\*\*|___)\s*\n/g, 60, 'hr'], // horizontal rule
    [/\n\n+/g, 20, 'blank'], // paragraph boundary
    [/\n[-*]\s/g, 5, 'list'], // unordered list item
    [/\n\d+\.\s/g, 5, 'numlist'], // ordered list item
    [/\n/g, 1, 'newline'], // minimal break
];
/**
 * Collect every candidate break point in `text`.
 *
 * Each entry of BREAK_PATTERNS is matched against the whole text. When several
 * patterns hit the same position only the highest-scoring one is kept; on a
 * tie the pattern listed first wins.
 *
 * @param text - Document text to scan.
 * @returns Array of `{ pos, score, type }` sorted by ascending `pos`.
 */
export function scanBreakPoints(text) {
    const bestAt = new Map(); // position -> strongest break point found there
    for (const [regex, score, type] of BREAK_PATTERNS) {
        for (const { index: pos } of text.matchAll(regex)) {
            const prior = bestAt.get(pos);
            // An equal or stronger break is already recorded here: keep it.
            if (prior && score <= prior.score) {
                continue;
            }
            bestAt.set(pos, { pos, score, type });
        }
    }
    return [...bestAt.values()].sort((left, right) => left.pos - right.pos);
}
/**
 * Locate fenced code regions so that chunking can avoid cutting inside them.
 *
 * Fence markers are occurrences of a newline followed by three backticks.
 * Markers are paired in order: the 1st opens a region, the 2nd closes it, the
 * 3rd opens the next one, and so on. A marker left without a partner opens a
 * region that runs to the end of the text.
 *
 * @param text - Document text to scan.
 * @returns Array of `{ start, end }`; `start` is the index of the opening
 *   marker's newline, `end` is just past the closing marker (or text.length).
 */
export function findCodeFences(text) {
    const markers = [...text.matchAll(/\n```/g)];
    const regions = [];
    for (let i = 0; i < markers.length; i += 2) {
        const opener = markers[i];
        const closer = markers[i + 1];
        const end = closer === undefined
            ? text.length // unclosed fence
            : closer.index + closer[0].length;
        regions.push({ start: opener.index, end });
    }
    return regions;
}
/**
 * Tell whether `pos` lies strictly inside any of the given fence regions.
 * Positions equal to a region's `start` or `end` count as outside.
 *
 * @param pos - Character position to test.
 * @param fences - Regions of the form `{ start, end }`.
 * @returns true when some region satisfies start < pos < end.
 */
export function isInsideCodeFence(pos, fences) {
    for (const { start, end } of fences) {
        if (start < pos && pos < end) {
            return true;
        }
    }
    return false;
}
/**
 * Choose where to cut a chunk, preferring strong break points close to the
 * ideal position.
 *
 * Every break point in [targetCharPos - windowChars, targetCharPos] that is
 * not inside a code fence is weighted as
 *   score * (1 - (distance / windowChars)^2 * decayFactor)
 * The squared term keeps the penalty small near the target and steep near the
 * window edge (with decayFactor 0.7: 1.0 at the target, 0.825 halfway back,
 * 0.3 at the edge), so a heading far back can still beat a weak break nearby.
 * The first break point with the strictly highest weight wins.
 *
 * @param breakPoints - Break points sorted by position (see scanBreakPoints()).
 * @param targetCharPos - The ideal cut position (e.g. the maxChars boundary).
 * @param windowChars - How far back from the target to look.
 * @param decayFactor - Share of the score lost at the window edge.
 * @param codeFences - Code fence regions that must not be split.
 * @returns Position of the winning break point, or targetCharPos when no
 *   break point qualifies.
 */
export function findBestCutoff(breakPoints, targetCharPos, windowChars = CHUNK_WINDOW_CHARS, decayFactor = 0.7, codeFences = []) {
    const earliestPos = targetCharPos - windowChars;
    let winner = { pos: targetCharPos, weighted: -1 };
    for (const { pos, score } of breakPoints) {
        if (pos < earliestPos) {
            continue;
        }
        if (pos > targetCharPos) {
            break; // input is sorted: nothing further can qualify
        }
        if (isInsideCodeFence(pos, codeFences)) {
            continue;
        }
        const fractionBack = (targetCharPos - pos) / windowChars;
        const weighted = score * (1.0 - (fractionBack * fractionBack) * decayFactor);
        if (weighted > winner.weighted) {
            winner = { pos, weighted };
        }
    }
    return winner.pos;
}
/**
 * Combine two break point lists (e.g. regex-derived and AST-derived) into one.
 *
 * For each position only the break point with the highest score survives. On
 * equal scores the one seen first is kept, and `a` is processed before `b`.
 *
 * @param a - First list of `{ pos, score, ... }` break points.
 * @param b - Second list of break points.
 * @returns New array sorted by ascending `pos`.
 */
export function mergeBreakPoints(a, b) {
    const strongest = new Map();
    for (const list of [a, b]) {
        for (const candidate of list) {
            const current = strongest.get(candidate.pos);
            if (current && candidate.score <= current.score) {
                continue;
            }
            strongest.set(candidate.pos, candidate);
        }
    }
    return [...strongest.values()].sort((left, right) => left.pos - right.pos);
}
/**
 * Core chunk algorithm that operates on precomputed break points and code fences.
 * This is the shared implementation used by both regex-only and AST-aware chunking.
 *
 * Content that fits in `maxChars` is returned as a single chunk. Otherwise the
 * text is walked front to back: each chunk ideally ends `maxChars` after its
 * start, but the end is pulled back to the position chosen by findBestCutoff()
 * when that position lies after the chunk start. The next chunk starts
 * `overlapChars` before the previous end, unless that would not move forward,
 * in which case it starts exactly at the previous end.
 *
 * @param content - Full document text.
 * @param breakPoints - Sorted break points for `content`.
 * @param codeFences - Code fence regions passed through to findBestCutoff().
 * @param maxChars - Maximum chunk length in characters; must be > 0 whenever
 *   the content does not fit in a single chunk.
 * @param overlapChars - Characters shared between consecutive chunks.
 * @param windowChars - Look-back window for break point selection.
 * @returns Array of `{ text, pos }` where `pos` is the chunk's start offset.
 * @throws {RangeError} If the content needs splitting and `maxChars` is not a
 *   positive number (previously this looped forever emitting empty chunks).
 */
export function chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS) {
    if (content.length <= maxChars) {
        return [{ text: content, pos: 0 }];
    }
    // A zero, negative or NaN limit can never make progress through the text.
    if (!(maxChars > 0)) {
        throw new RangeError(`chunkDocumentWithBreakPoints: maxChars must be a positive number, got ${maxChars}`);
    }
    const chunks = [];
    let charPos = 0;
    while (charPos < content.length) {
        const targetEndPos = Math.min(charPos + maxChars, content.length);
        let endPos = targetEndPos;
        // Only look for a nicer cut when this is not the final chunk.
        if (targetEndPos < content.length) {
            const bestCutoff = findBestCutoff(breakPoints, targetEndPos, windowChars, 0.7, codeFences);
            if (bestCutoff > charPos && bestCutoff <= targetEndPos) {
                endPos = bestCutoff;
            }
        }
        // With maxChars > 0, endPos is always past charPos here.
        chunks.push({ text: content.slice(charPos, endPos), pos: charPos });
        if (endPos >= content.length) {
            break;
        }
        const overlappedStart = endPos - overlapChars;
        // Drop the overlap if it would restart at or before this chunk's start.
        if (overlappedStart <= charPos) {
            charPos = endPos;
        }
        else {
            charPos = overlappedStart;
        }
    }
    return chunks;
}
// Hybrid query: strong BM25 signal detection thresholds
// Skip expensive LLM expansion when top result is strong AND clearly separated from runner-up
// NOTE(review): the code that applies these thresholds is not visible in this section;
// presumably MIN_SCORE bounds the top score and MIN_GAP the lead over the runner-up — confirm.
export const STRONG_SIGNAL_MIN_SCORE = 0.85;
export const STRONG_SIGNAL_MIN_GAP = 0.15;
// Max candidates to pass to reranker — balances quality vs latency.
// 40 keeps rank 31-40 visible to the reranker (matters for recall on broad queries).
export const RERANK_CANDIDATE_LIMIT = 40;
// =============================================================================
// Path utilities
// =============================================================================
/**
 * Return the home directory held in the module-level HOME constant.
 * The value is fixed when the module loads; later changes to the
 * environment are not reflected.
 */
export function homedir() {
    return HOME;
}
/**
 * Check if a path is absolute.
 * Supports:
 * - Unix paths: /path/to/file
 * - Windows native: C:\path or C:/path (any drive letter A-Z)
 * - Git Bash: /c/path or /C/path
 *
 * Every path that starts with "/" is absolute no matter how it is later
 * interpreted (plain Unix path, Git Bash drive path, or WSL mount point), so
 * no drive-letter or WSL detection is needed here; resolve() is where a
 * Git Bash prefix gets mapped to a drive.
 *
 * @param path - Path to test; empty, null or undefined yields false.
 * @returns true when the path is absolute.
 */
export function isAbsolutePath(path) {
    if (!path) {
        return false;
    }
    // Unix, Git Bash and WSL style paths all start with a slash.
    if (path.startsWith('/')) {
        return true;
    }
    // Windows native path: a single letter followed by ":" (C:\ or C:/).
    return path.length >= 2 && /[a-zA-Z]/.test(path[0]) && path[1] === ':';
}
/**
 * Rewrite a path so that it only uses forward slashes.
 *
 * @param path - Path that may contain Windows backslashes.
 * @returns The same path with every "\" replaced by "/".
 */
export function normalizePathSeparators(path) {
    return path.replaceAll('\\', '/');
}
/**
 * Report whether the process appears to run under WSL (Windows Subsystem for
 * Linux), judged by a non-empty WSL_DISTRO_NAME or WSL_INTEROP environment
 * variable. On WSL, paths like /c/work/... are valid drvfs mount points, not
 * Git Bash drive paths.
 */
function isWSL() {
    const { WSL_DISTRO_NAME, WSL_INTEROP } = process.env;
    return Boolean(WSL_DISTRO_NAME || WSL_INTEROP);
}
/**
 * Get the relative path from a prefix.
 *
 * Both arguments are compared after converting backslashes to forward
 * slashes. A trailing slash on the prefix is not significant: "/a/b" and
 * "/a/b/" denote the same prefix.
 *
 * @param path - Path to make relative.
 * @param prefix - Directory prefix; an empty prefix is invalid.
 * @returns The part of `path` below `prefix`, an empty string if `path`
 *   equals `prefix`, or null if `path` is not under `prefix` (or the prefix
 *   is empty).
 */
export function getRelativePathFromPrefix(path, prefix) {
    // Empty prefix is invalid
    if (!prefix) {
        return null;
    }
    const normalizedPath = normalizePathSeparators(path);
    const normalizedPrefix = normalizePathSeparators(prefix);
    // Match on "prefix/" so that "/a/bc" is not treated as being under "/a/b".
    const prefixWithSlash = normalizedPrefix.endsWith('/')
        ? normalizedPrefix
        : `${normalizedPrefix}/`;
    const prefixWithoutSlash = prefixWithSlash.slice(0, -1);
    // Exact match, with or without the prefix's trailing slash. The root
    // prefix "/" has an empty slash-less form, which must not match "".
    if (normalizedPath === normalizedPrefix ||
        (prefixWithoutSlash !== '' && normalizedPath === prefixWithoutSlash)) {
        return '';
    }
    if (normalizedPath.startsWith(prefixWithSlash)) {
        return normalizedPath.slice(prefixWithSlash.length);
    }
    return null;
}
/**
 * Split a native Windows drive prefix ("C:") off a forward-slash path.
 * Returns `{ drive, rest }`, or null when the path has no such prefix.
 */
function splitNativeDrivePrefix(p) {
    if (p.length >= 2 && /[a-zA-Z]/.test(p[0]) && p[1] === ':') {
        return { drive: p.slice(0, 2), rest: p.slice(2) };
    }
    return null;
}
/**
 * Split the drive off an absolute forward-slash path. Recognises native
 * "C:/..." and, outside WSL, Git Bash "/c/..." (drives C-Z only, upper-cased).
 * Returns `{ drive: '', rest: p }` when the path carries no drive.
 */
function splitAbsoluteDrivePrefix(p) {
    const native = splitNativeDrivePrefix(p);
    if (native) {
        return native;
    }
    // Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter
    if (!isWSL() && p.startsWith('/') && p.length >= 3 && p[2] === '/') {
        const driveLetter = p[1];
        if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
            return { drive: driveLetter.toUpperCase() + ':', rest: p.slice(2) };
        }
    }
    return { drive: '', rest: p };
}
/**
 * Cross-platform path resolution that always produces forward slashes.
 *
 * Segments are processed left to right. A relative first segment is resolved
 * against $PWD (or process.cwd() when PWD is unset). Any later absolute
 * segment replaces everything accumulated so far, including the drive.
 * "." components are dropped and ".." removes the previous component
 * (never climbing above the root).
 *
 * @param paths - One or more path segments.
 * @returns Absolute path such as "/a/b" or "C:/a/b"; no trailing slash
 *   except for the bare root.
 * @throws {Error} If called without any segment.
 */
export function resolve(...paths) {
    if (paths.length === 0) {
        throw new Error("resolve: at least one path segment is required");
    }
    // Normalize all paths to use forward slashes
    const [firstPath, ...laterPaths] = paths.map(normalizePathSeparators);
    let result = '';
    let windowsDrive = '';
    if (isAbsolutePath(firstPath)) {
        ({ drive: windowsDrive, rest: result } = splitAbsoluteDrivePrefix(firstPath));
    }
    else {
        // Start with PWD or cwd, then append the first relative path.
        // Only a native drive prefix is extracted from the working directory.
        const pwd = normalizePathSeparators(process.env.PWD || process.cwd());
        const pwdDrive = splitNativeDrivePrefix(pwd);
        if (pwdDrive) {
            windowsDrive = pwdDrive.drive;
            result = pwdDrive.rest + '/' + firstPath;
        }
        else {
            result = pwd + '/' + firstPath;
        }
    }
    for (const p of laterPaths) {
        if (isAbsolutePath(p)) {
            // Absolute path replaces everything, drive included.
            ({ drive: windowsDrive, rest: result } = splitAbsoluteDrivePrefix(p));
        }
        else {
            // Relative path - append
            result = result + '/' + p;
        }
    }
    // Normalize . and .. components
    const normalized = [];
    for (const part of result.split('/').filter(Boolean)) {
        if (part === '..') {
            normalized.pop();
        }
        else if (part !== '.') {
            normalized.push(part);
        }
    }
    // Build final path, prefixed by the Windows drive if one was found.
    const finalPath = '/' + normalized.join('/');
    return windowsDrive ? windowsDrive + finalPath : finalPath;
}
// Flag to indicate production mode (set by qmd.ts at startup).
// Starts false; while false, getDefaultDbPath() refuses to pick the default
// database location unless INDEX_PATH is set.
let _productionMode = false;
/** Switch the module into production mode. */
export function enableProductionMode() {
    _productionMode = true;
}
/** Reset production mode flag — only for testing. */
export function _resetProductionModeForTesting() {
    _productionMode = false;
}
/**
 * Work out where the SQLite index lives.
 *
 * Order of precedence:
 *  1. The INDEX_PATH environment variable, returned as-is when non-empty.
 *  2. Outside production mode: an error, so tests cannot silently touch the
 *     global index.
 *  3. `<cache>/qmd/<indexName>.sqlite`, where `<cache>` is XDG_CACHE_HOME or
 *     `<home>/.cache`. The qmd directory is created on a best-effort basis;
 *     a failure to create it is ignored here.
 *
 * @param indexName - Base name of the index file, without extension.
 * @returns Path of the database file.
 * @throws {Error} When INDEX_PATH is unset and production mode is off.
 */
export function getDefaultDbPath(indexName = "index") {
    const override = process.env.INDEX_PATH;
    if (override) {
        return override;
    }
    if (!_productionMode) {
        throw new Error("Database path not set. Tests must set INDEX_PATH env var or use createStore() with explicit path. This prevents tests from accidentally writing to the global index.");
    }
    const cacheRoot = process.env.XDG_CACHE_HOME || resolve(homedir(), ".cache");
    const qmdCacheDir = resolve(cacheRoot, "qmd");
    try {
        mkdirSync(qmdCacheDir, { recursive: true });
    }
    catch {
        // Best effort: directory creation errors are deliberately ignored.
    }
    return resolve(qmdCacheDir, `${indexName}.sqlite`);
}
/**
 * Current working directory as the shell reports it: $PWD when it is set and
 * non-empty, otherwise process.cwd().
 */
export function getPwd() {
    const { PWD } = process.env;
    return PWD ? PWD : process.cwd();
}
/**
 * Canonicalise a path through the filesystem when possible.
 *
 * @param path - Path to canonicalise.
 * @returns The result of realpathSync(path); if that throws for any reason
 *   (for example the path does not exist), the result of resolve(path).
 */
export function getRealPath(path) {
    let canonical;
    try {
        canonical = realpathSync(path);
    }
    catch {
        canonical = resolve(path);
    }
    return canonical;
}
/**
 * Bring an explicitly virtual path into the canonical "qmd://..." form.
 *
 * Recognised inputs (after trimming surrounding whitespace):
 * - anything starting with "qmd:" — the slashes after the prefix are
 *   collapsed to exactly two (qmd:////collection/path.md -> qmd://collection/path.md)
 * - anything starting with "//" — the "qmd:" prefix is added
 *   (//collection/path.md -> qmd://collection/path.md)
 *
 * Everything else (filesystem paths, docids, bare collection/path.md) is
 * returned trimmed but otherwise untouched. A ":linenum" suffix is not
 * handled and should be parsed off before calling this.
 *
 * @param input - Raw path string.
 * @returns Canonical virtual path, or the trimmed input.
 */
export function normalizeVirtualPath(input) {
    const trimmed = input.trim();
    const hasScheme = trimmed.startsWith('qmd:');
    if (!hasScheme && !trimmed.startsWith('//')) {
        return trimmed;
    }
    // Drop the scheme if present, then any run of leading slashes.
    const remainder = hasScheme ? trimmed.slice(4) : trimmed;
    const withoutSlashes = remainder.replace(/^\/+/, '');
    return `qmd://${withoutSlashes}`;
}
/**
 * Split a virtual path such as `qmd://collection-name/path/to/file.md` into
 * its collection name and collection-relative path.
 *
 * The input is normalized first. A collection root (`qmd://name` or
 * `qmd://name/`) yields an empty `path`.
 *
 * @param virtualPath Virtual path in any form accepted by normalizeVirtualPath.
 * @returns `{ collectionName, path }`, or null when the input is not a virtual path with a collection.
 */
export function parseVirtualPath(virtualPath) {
    const canonical = normalizeVirtualPath(virtualPath);
    // Group 1: collection name (no slashes); group 2: everything after one optional slash.
    const parts = canonical.match(/^qmd:\/\/([^\/]+)\/?(.*)$/);
    const collectionName = parts?.[1];
    if (!collectionName) {
        return null;
    }
    const path = parts[2] ?? '';
    return { collectionName, path };
}
/**
 * Compose `qmd://<collectionName>/<path>` from a collection name and a
 * collection-relative path. No escaping or slash normalization is applied.
 *
 * @param collectionName Name of the collection.
 * @param path Path relative to the collection root ('' for the root itself).
 * @returns The virtual path string.
 */
export function buildVirtualPath(collectionName, path) {
    return "qmd://".concat(collectionName, "/", path);
}
/**
 * Tell whether a path is written in an explicit virtual form.
 *
 * After trimming, a path counts as virtual when it starts with `qmd:`
 * (any number of slashes may follow) or with `//` (scheme omitted).
 * A bare `collection/path.md` is NOT treated as virtual here; deciding that
 * requires checking the first component against known collection names.
 *
 * @param path Candidate path string.
 * @returns true for explicit virtual paths, false otherwise.
 */
export function isVirtualPath(path) {
    const candidate = path.trim();
    return candidate.startsWith('qmd:') || candidate.startsWith('//');
}
/**
 * Map a virtual path onto the filesystem.
 *
 * The collection is looked up by name via getCollectionByName and the
 * parsed relative path is resolved against that collection's `pwd`.
 *
 * @param db Database handle passed through to getCollectionByName.
 * @param virtualPath Virtual path to resolve.
 * @returns Absolute filesystem path, or null when the path cannot be parsed or the collection is unknown.
 */
export function resolveVirtualPath(db, virtualPath) {
    const target = parseVirtualPath(virtualPath);
    const collection = target ? getCollectionByName(db, target.collectionName) : null;
    if (!collection) {
        return null;
    }
    return resolve(collection.pwd, target.path);
}
/**
 * Translate an absolute filesystem path into its `qmd://` form.
 *
 * Each stored collection is tried in turn: the path must equal the
 * collection root or sit beneath it (`root + '/'` prefix), and an active
 * document row with the resulting relative path must exist.
 *
 * @param db Database handle exposing `prepare(sql).get(...)`.
 * @param absolutePath Absolute path of the file.
 * @returns The virtual path, or null when no collection holds an active document for it.
 */
export function toVirtualPath(db, absolutePath) {
    for (const coll of getStoreCollections(db)) {
        const rootPrefix = coll.path + '/';
        let relativePath;
        if (absolutePath.startsWith(rootPrefix)) {
            relativePath = absolutePath.slice(coll.path.length + 1);
        }
        else if (absolutePath === coll.path) {
            // The collection root itself maps to the empty relative path.
            relativePath = '';
        }
        else {
            continue;
        }
        // Only paths backed by an active document row are reported.
        const doc = db.prepare(`
        SELECT d.path
        FROM documents d
        WHERE d.collection = ? AND d.path = ? AND d.active = 1
        LIMIT 1
      `).get(coll.name, relativePath);
        if (doc) {
            return buildVirtualPath(coll.name, relativePath);
        }
    }
    return null;
}
// =============================================================================
// Database initialization
// =============================================================================
/**
 * Build the Error reported when sqlite-vec cannot be used: the
 * caller-supplied reason followed by fixed remediation advice.
 *
 * @param reason Short description of what went wrong (no trailing period).
 * @returns A new Error; the caller decides whether to throw it.
 */
function createSqliteVecUnavailableError(reason) {
    const sentences = [
        "sqlite-vec extension is unavailable. ",
        `${reason}. `,
        "Install Homebrew SQLite so the sqlite-vec extension can be loaded, ",
        "and set BREW_PREFIX if Homebrew is installed in a non-standard location.",
    ];
    return new Error(sentences.join(""));
}
/**
 * Extract a printable message from a caught value: `err.message` for Error
 * instances, `String(err)` for anything else.
 */
function getErrorMessage(err) {
    if (err instanceof Error) {
        return err.message;
    }
    return String(err);
}
/**
 * Probe the connection for a working sqlite-vec extension by selecting
 * `vec_version()` and requiring a non-empty string back.
 *
 * @param db Database handle exposing `prepare(sql).get()`.
 * @throws Error (built by createSqliteVecUnavailableError) when the query fails or returns no usable version.
 */
export function verifySqliteVecLoaded(db) {
    try {
        const probe = db.prepare(`SELECT vec_version() AS version`).get();
        const version = probe?.version;
        const looksValid = typeof version === "string" && version !== "";
        if (!looksValid) {
            throw new Error("vec_version() returned no version");
        }
    }
    catch (err) {
        // Both query failures and the "no version" case are reported the same way.
        throw createSqliteVecUnavailableError(`sqlite-vec probe failed (${getErrorMessage(err)})`);
    }
}
// Tri-state sqlite-vec status: null until initializeDatabase() has run, then
// true when the extension loaded and passed the vec_version() probe, false
// otherwise. Read by isSqliteVecAvailable() and ensureVecTableInternal().
let _sqliteVecAvailable = null;
/**
 * Concurrency-friendly pragma defaults applied by `initializeDatabase`.
 * Each entry is `{ pragma, defaultValue, envVar }` so operators can override
 * any one knob via env without code changes. Entries whose `defaultValue` is
 * a number accept only integer overrides; the rest accept a keyword.
 *
 * Defaults are tuned for the Oivo fleet shape — many concurrent MCP
 * processes (one per agent session) sharing a single ~10 GB index that
 * a 30-minute cron runs `qmd embed` against. See issue i-6sw24v09 for
 * the failure mode this prevents.
 */
const CONCURRENCY_PRAGMAS = [
    { pragma: "busy_timeout", defaultValue: 30000, envVar: "QMD_SQLITE_BUSY_TIMEOUT_MS" },
    { pragma: "synchronous", defaultValue: "NORMAL", envVar: "QMD_SQLITE_SYNCHRONOUS" },
    { pragma: "temp_store", defaultValue: "MEMORY", envVar: "QMD_SQLITE_TEMP_STORE" },
    { pragma: "cache_size", defaultValue: -65536, envVar: "QMD_SQLITE_CACHE_SIZE" }, // ~64 MiB
    { pragma: "mmap_size", defaultValue: 268435456, envVar: "QMD_SQLITE_MMAP_SIZE" }, // 256 MiB
    { pragma: "wal_autocheckpoint", defaultValue: 1000, envVar: "QMD_SQLITE_WAL_AUTOCHECKPOINT" },
];
/**
 * Apply concurrency pragmas with env-var override support. Exported for
 * unit tests; consumers should rely on `initializeDatabase` instead.
 *
 * Override handling:
 * - unset or empty env var: the default is used;
 * - numeric pragma: the override is parsed with `parseInt(…, 10)` (negatives
 *   allowed, e.g. for cache_size); an unparsable value falls back to the
 *   default with a warning;
 * - keyword pragma (synchronous, temp_store): the trimmed override must
 *   consist solely of letters, digits and underscores (NORMAL, FULL, MEMORY,
 *   0, 1, ...). Anything else falls back to the default with a warning. The
 *   value is spliced into SQL handed to `db.exec`, which would otherwise
 *   happily run extra statements smuggled in after a `;`.
 *
 * A pragma that throws is logged via `console.warn` and skipped; the
 * remaining pragmas are still applied.
 *
 * @param db Database handle exposing `exec(sql)`.
 */
export function applyConcurrencyPragmas(db) {
    const keywordPattern = /^[A-Za-z0-9_]+$/;
    for (const { pragma, defaultValue, envVar } of CONCURRENCY_PRAGMAS) {
        const override = process.env[envVar];
        let value = defaultValue;
        if (override !== undefined && override !== "") {
            if (typeof defaultValue === "number") {
                const parsed = parseInt(override, 10);
                if (Number.isFinite(parsed)) {
                    value = parsed;
                }
                else {
                    console.warn(`[qmd] Ignoring ${envVar}: expected an integer; using default ${defaultValue}`);
                }
            }
            else {
                const keyword = override.trim();
                if (keywordPattern.test(keyword)) {
                    value = keyword;
                }
                else {
                    console.warn(`[qmd] Ignoring ${envVar}: expected a single keyword; using default ${defaultValue}`);
                }
            }
        }
        try {
            db.exec(`PRAGMA ${pragma} = ${value}`);
        }
        catch (err) {
            // Don't blow up on pragma failure — log + carry on. SQLite without
            // mmap support, for example, simply ignores mmap_size silently on
            // some builds, but a strict build can throw.
            const msg = err instanceof Error ? err.message : String(err);
            console.warn(`[qmd] PRAGMA ${pragma} = ${value} failed: ${msg}`);
        }
    }
}
/**
 * Prepare a database connection for use by the store.
 *
 * In order: tries to load and verify sqlite-vec (recording the outcome in
 * `_sqliteVecAvailable`; failure is only warned about), enables WAL and
 * foreign keys, applies the concurrency pragmas, drops two legacy tables,
 * and creates the tables, indexes, FTS5 table and sync triggers if they do
 * not already exist.
 *
 * @param db Database handle exposing `exec(sql)` and `prepare(sql)`.
 */
function initializeDatabase(db) {
    try {
        loadSqliteVec(db);
        verifySqliteVecLoaded(db);
        _sqliteVecAvailable = true;
    }
    catch (err) {
        // sqlite-vec is optional — vector search won't work but FTS is fine
        _sqliteVecAvailable = false;
        console.warn(getErrorMessage(err));
    }
    db.exec("PRAGMA journal_mode = WAL");
    db.exec("PRAGMA foreign_keys = ON");
    // Concurrency tuning — prevents reader timeouts during long writer windows
    // such as `qmd embed` (often 6-30 minutes on the Oivo fleet) which would
    // otherwise saturate the default 5s busy_timeout from better-sqlite3 and
    // surface as MCP transport timeouts in concurrent `qmd_query`/`qmd_status`
    // calls. See issue i-6sw24v09 for the empirical trace.
    //
    // - busy_timeout (default 30000 ms): readers wait through writer-held
    //   checkpoints instead of failing fast with SQLITE_BUSY.
    // - synchronous=NORMAL: WAL-safe (still durable across crashes), avoids
    //   the FULL fsync per transaction that compounds embed runtime.
    // - temp_store=MEMORY: keep FTS5 + vec sort scratch in RAM, not /tmp.
    // - cache_size: ~64 MiB per-connection page cache. Negative kibibyte
    //   form is the canonical SQLite idiom (positive = pages, negative = KiB).
    // - mmap_size: 256 MiB memory-mapped reads for the 10 GB index — cheap
    //   on Linux (lazy paging), no effect on non-mmap'd syscall fallback.
    // - wal_autocheckpoint: keep WAL bounded. Default 1000 pages is fine
    //   but setting it explicitly prevents drift when callers tune globally.
    //
    // Each pragma is overridable via env so operators can tune without a
    // code change. Overrides for the numeric pragmas must parse as base-10
    // integers or the default is kept; synchronous and temp_store take
    // keyword overrides instead.
    applyConcurrencyPragmas(db);
    // Drop legacy tables that are now managed in YAML
    db.exec(`DROP TABLE IF EXISTS path_contexts`);
    db.exec(`DROP TABLE IF EXISTS collections`);
    // Content-addressable storage - the source of truth for document content
    db.exec(`
    CREATE TABLE IF NOT EXISTS content (
      hash TEXT PRIMARY KEY,
      doc TEXT NOT NULL,
      created_at TEXT NOT NULL
    )
  `);
    // Documents table - file system layer mapping virtual paths to content hashes
    // Collections are now managed in ~/.config/qmd/index.yml
    db.exec(`
    CREATE TABLE IF NOT EXISTS documents (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      collection TEXT NOT NULL,
      path TEXT NOT NULL,
      title TEXT NOT NULL,
      hash TEXT NOT NULL,
      created_at TEXT NOT NULL,
      modified_at TEXT NOT NULL,
      active INTEGER NOT NULL DEFAULT 1,
      FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
      UNIQUE(collection, path)
    )
  `);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents(collection, active)`);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(hash)`);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_path ON documents(path, active)`);
    // Cache table for LLM API calls
    db.exec(`
    CREATE TABLE IF NOT EXISTS llm_cache (
      hash TEXT PRIMARY KEY,
      result TEXT NOT NULL,
      created_at TEXT NOT NULL
    )
  `);
    // Content vectors. An existing content_vectors table without a `seq`
    // column is dropped together with vectors_vec, then recreated below.
    const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all();
    const hasSeqColumn = cvInfo.some(col => col.name === 'seq');
    if (cvInfo.length > 0 && !hasSeqColumn) {
        db.exec(`DROP TABLE IF EXISTS content_vectors`);
        db.exec(`DROP TABLE IF EXISTS vectors_vec`);
    }
    db.exec(`
    CREATE TABLE IF NOT EXISTS content_vectors (
      hash TEXT NOT NULL,
      seq INTEGER NOT NULL DEFAULT 0,
      pos INTEGER NOT NULL DEFAULT 0,
      model TEXT NOT NULL,
      embedded_at TEXT NOT NULL,
      PRIMARY KEY (hash, seq)
    )
  `);
    // Store collections — makes the DB self-contained (no external config needed)
    db.exec(`
    CREATE TABLE IF NOT EXISTS store_collections (
      name TEXT PRIMARY KEY,
      path TEXT NOT NULL,
      pattern TEXT NOT NULL DEFAULT '**/*.md',
      ignore_patterns TEXT,
      include_by_default INTEGER DEFAULT 1,
      update_command TEXT,
      context TEXT
    )
  `);
    // Store config — key-value metadata (e.g. config_hash for sync optimization)
    db.exec(`
    CREATE TABLE IF NOT EXISTS store_config (
      key TEXT PRIMARY KEY,
      value TEXT
    )
  `);
    // FTS - index filepath (collection/path), title, and content
    db.exec(`
    CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
      filepath, title, body,
      tokenize='porter unicode61'
    )
  `);
    // Triggers to keep FTS in sync
    // Insert: only rows inserted as active are mirrored into documents_fts.
    db.exec(`
    CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents
    WHEN new.active = 1
    BEGIN
      INSERT INTO documents_fts(rowid, filepath, title, body)
      SELECT
        new.id,
        new.collection || '/' || new.path,
        new.title,
        (SELECT doc FROM content WHERE hash = new.hash)
      WHERE new.active = 1;
    END
  `);
    // Delete: remove the matching FTS row.
    db.exec(`
    CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
      DELETE FROM documents_fts WHERE rowid = old.id;
    END
  `);
    // Update: drop the FTS row on deactivation, otherwise replace it.
    db.exec(`
    CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents
    BEGIN
      -- Delete from FTS if no longer active
      DELETE FROM documents_fts WHERE rowid = old.id AND new.active = 0;

      -- Update FTS if still/newly active
      INSERT OR REPLACE INTO documents_fts(rowid, filepath, title, body)
      SELECT
        new.id,
        new.collection || '/' || new.path,
        new.title,
        (SELECT doc FROM content WHERE hash = new.hash)
      WHERE new.active = 1;
    END
  `);
}
/**
 * Convert a `store_collections` row into a collection object.
 *
 * `name`, `path` and `pattern` are always present. The optional fields are
 * only added when the row carries them: `ignore` and `context` are
 * JSON-decoded, `update` is copied, and `includeByDefault` appears only as
 * `false` (when the column is exactly 0).
 *
 * @param row Row object with the store_collections columns.
 * @returns The collection object.
 */
function rowToNamedCollection(row) {
    const collection = {
        name: row.name,
        path: row.path,
        pattern: row.pattern,
    };
    if (row.ignore_patterns) {
        collection.ignore = JSON.parse(row.ignore_patterns);
    }
    if (row.include_by_default === 0) {
        collection.includeByDefault = false;
    }
    if (row.update_command) {
        collection.update = row.update_command;
    }
    if (row.context) {
        collection.context = JSON.parse(row.context);
    }
    return collection;
}
/**
 * Load every row of `store_collections` and convert each one with
 * rowToNamedCollection.
 *
 * @param db Database handle exposing `prepare(sql).all()`.
 * @returns Array of collection objects (empty when the table has no rows).
 */
export function getStoreCollections(db) {
    const collections = [];
    for (const row of db.prepare(`SELECT * FROM store_collections`).all()) {
        collections.push(rowToNamedCollection(row));
    }
    return collections;
}
/**
 * Look up a single collection by name in `store_collections`.
 *
 * @param db Database handle exposing `prepare(sql).get(...)`.
 * @param name Collection name.
 * @returns The converted collection, or null when no row matches.
 */
export function getStoreCollection(db, name) {
    const found = db.prepare(`SELECT * FROM store_collections WHERE name = ?`).get(name);
    return found == null ? null : rowToNamedCollection(found);
}
/**
 * Read the `global_context` entry from `store_config`.
 *
 * @param db Database handle exposing `prepare(sql).get()`.
 * @returns The stored value, or undefined when the row is missing or its value is empty/falsy.
 */
export function getStoreGlobalContext(db) {
    const entry = db.prepare(`SELECT value FROM store_config WHERE key = 'global_context'`).get();
    return entry?.value || undefined;
}
/**
 * Collect every context entry stored in the database as a flat list.
 *
 * The global context (if any) comes first as
 * `{ collection: "*", path: "/", context }`, followed by one entry per
 * key of each collection's JSON-encoded context map.
 *
 * @param db Database handle exposing `prepare(sql).get()/all()`.
 * @returns Array of `{ collection, path, context }` objects.
 */
export function getStoreContexts(db) {
    const globalCtx = getStoreGlobalContext(db);
    const globalEntries = globalCtx
        ? [{ collection: "*", path: "/", context: globalCtx }]
        : [];
    const rows = db.prepare(`SELECT name, context FROM store_collections WHERE context IS NOT NULL`).all();
    const collectionEntries = rows.flatMap((row) => Object.entries(JSON.parse(row.context))
        .map(([path, context]) => ({ collection: row.name, path, context })));
    return [...globalEntries, ...collectionEntries];
}
/**
 * Insert a collection into `store_collections`, or overwrite every column of
 * the existing row with the same name.
 *
 * Column encoding: `pattern` defaults to `'**\/*.md'` when falsy; `ignore`
 * and `context` are stored as JSON (NULL when absent); `includeByDefault`
 * is stored as 0 only when it is exactly `false`, otherwise 1; `update` is
 * stored as NULL when falsy.
 *
 * @param db Database handle exposing `prepare(sql).run(...)`.
 * @param name Collection name (primary key).
 * @param collection Collection definition.
 */
export function upsertStoreCollection(db, name, collection) {
    const statement = db.prepare(`
    INSERT INTO store_collections (name, path, pattern, ignore_patterns, include_by_default, update_command, context)
    VALUES (?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(name) DO UPDATE SET
      path = excluded.path,
      pattern = excluded.pattern,
      ignore_patterns = excluded.ignore_patterns,
      include_by_default = excluded.include_by_default,
      update_command = excluded.update_command,
      context = excluded.context
  `);
    const pattern = collection.pattern || '**/*.md';
    const ignorePatterns = collection.ignore ? JSON.stringify(collection.ignore) : null;
    const includeByDefault = collection.includeByDefault === false ? 0 : 1;
    const updateCommand = collection.update || null;
    const context = collection.context ? JSON.stringify(collection.context) : null;
    statement.run(name, collection.path, pattern, ignorePatterns, includeByDefault, updateCommand, context);
}
/**
 * Delete the `store_collections` row with the given name.
 *
 * @param db Database handle exposing `prepare(sql).run(...)`.
 * @param name Collection name.
 * @returns true when a row was deleted, false when none matched.
 */
export function deleteStoreCollection(db, name) {
    const { changes } = db.prepare(`DELETE FROM store_collections WHERE name = ?`).run(name);
    return changes > 0;
}
/**
 * Rename a collection row in `store_collections`.
 *
 * @param db Database handle exposing `prepare(sql).get()/run()`.
 * @param oldName Current collection name.
 * @param newName Desired collection name.
 * @returns true when a row was renamed, false when `oldName` did not exist.
 * @throws Error when a collection named `newName` already exists.
 */
export function renameStoreCollection(db, oldName, newName) {
    // Refuse to clobber an existing collection.
    const clash = db.prepare(`SELECT name FROM store_collections WHERE name = ?`).get(newName);
    if (clash != null) {
        throw new Error(`Collection '${newName}' already exists`);
    }
    const { changes } = db.prepare(`UPDATE store_collections SET name = ? WHERE name = ?`).run(newName, oldName);
    return changes > 0;
}
/**
 * Set (or replace) the context text for one path inside a collection's
 * JSON-encoded context map and write the map back.
 *
 * @param db Database handle exposing `prepare(sql).get()/run()`.
 * @param collectionName Collection whose context map is edited.
 * @param path Key within the context map.
 * @param text Context text to store under `path`.
 * @returns true after the row was updated, false when the collection does not exist.
 */
export function updateStoreContext(db, collectionName, path, text) {
    const stored = db.prepare(`SELECT context FROM store_collections WHERE name = ?`).get(collectionName);
    if (stored == null) {
        return false;
    }
    let contexts = {};
    if (stored.context) {
        contexts = JSON.parse(stored.context);
    }
    contexts[path] = text;
    const encoded = JSON.stringify(contexts);
    db.prepare(`UPDATE store_collections SET context = ? WHERE name = ?`).run(encoded, collectionName);
    return true;
}
/**
 * Remove the context entry for one path from a collection's JSON-encoded
 * context map. When the map becomes empty the column is set to NULL.
 *
 * Only keys actually stored in the map count. An own-property check is used
 * instead of `in`, which would also match members inherited from
 * `Object.prototype` (e.g. "toString", "constructor") and report a removal
 * that never happened.
 *
 * @param db Database handle exposing `prepare(sql).get()/run()`.
 * @param collectionName Collection whose context map is edited.
 * @param path Key to remove from the context map.
 * @returns true when an entry was removed; false when the collection, its context map, or the key does not exist.
 */
export function removeStoreContext(db, collectionName, path) {
    const row = db.prepare(`SELECT context FROM store_collections WHERE name = ?`).get(collectionName);
    if (row == null || !row.context) {
        return false;
    }
    const ctxMap = JSON.parse(row.context);
    if (!Object.prototype.hasOwnProperty.call(ctxMap, path)) {
        return false;
    }
    delete ctxMap[path];
    const newCtx = Object.keys(ctxMap).length > 0 ? JSON.stringify(ctxMap) : null;
    db.prepare(`UPDATE store_collections SET context = ? WHERE name = ?`).run(newCtx, collectionName);
    return true;
}
/**
 * Store or clear the `global_context` entry in `store_config`.
 *
 * @param db Database handle exposing `prepare(sql).run(...)`.
 * @param value New global context; `undefined` deletes the entry, any other value is upserted.
 */
export function setStoreGlobalContext(db, value) {
    if (value !== undefined) {
        db.prepare(`INSERT INTO store_config (key, value) VALUES ('global_context', ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value`).run(value);
        return;
    }
    db.prepare(`DELETE FROM store_config WHERE key = 'global_context'`).run();
}
/**
 * Mirror an external config object (YAML/inline) into the SQLite store.
 *
 * The config is hashed (SHA-256 of its JSON form) and compared with the
 * `config_hash` entry in `store_config`; when they match nothing is written.
 * Otherwise every configured collection is upserted, stored collections
 * missing from the config are deleted, the global context is set (or
 * cleared when the config has none), and the new hash is saved.
 *
 * @param db Database handle exposing `prepare(sql).get()/all()/run()`.
 * @param config Config object with a `collections` map and optional `global_context`.
 */
export function syncConfigToDb(db, config) {
    const hash = createHash('sha256').update(JSON.stringify(config)).digest('hex');
    const stored = db.prepare(`SELECT value FROM store_config WHERE key = 'config_hash'`).get();
    const unchanged = stored != null && stored.value === hash;
    if (unchanged) {
        return;
    }
    // External config wins: write every configured collection.
    const configured = Object.entries(config.collections);
    const configNames = new Set(configured.map(([name]) => name));
    for (const [name, coll] of configured) {
        upsertStoreCollection(db, name, coll);
    }
    // Remove stored collections the config no longer mentions.
    const staleNames = db.prepare(`SELECT name FROM store_collections`).all()
        .map((row) => row.name)
        .filter((name) => !configNames.has(name));
    for (const name of staleNames) {
        db.prepare(`DELETE FROM store_collections WHERE name = ?`).run(name);
    }
    // Passing undefined clears the stored global context.
    setStoreGlobalContext(db, config.global_context);
    // Remember the hash so an identical config is skipped next time.
    db.prepare(`INSERT INTO store_config (key, value) VALUES ('config_hash', ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value`).run(hash);
}
/**
 * Report whether sqlite-vec was loaded successfully.
 *
 * @returns true only when the availability flag is exactly `true`; false while it is still unset (null) or after a failed load.
 */
export function isSqliteVecAvailable() {
    const loaded = _sqliteVecAvailable === true;
    return loaded;
}
/**
 * Make sure the `vectors_vec` virtual table exists with the expected layout:
 * a `hash_seq` key, `float[dimensions]` embeddings and cosine distance.
 *
 * - Table missing: it is created.
 * - Table present and fully matching: nothing happens.
 * - Table present with a different dimension count: an error is thrown
 *   (the table is left alone).
 * - Table present otherwise (same or undetectable dimensions but missing
 *   `hash_seq` or the cosine metric): it is dropped and recreated.
 *
 * @param db Database handle exposing `prepare(sql).get()` and `exec(sql)`.
 * @param dimensions Embedding vector length.
 * @throws Error when sqlite-vec is unavailable or the stored dimensions differ.
 */
function ensureVecTableInternal(db, dimensions) {
    if (!_sqliteVecAvailable) {
        throw new Error("sqlite-vec is not available. Vector operations require a SQLite build with extension loading support.");
    }
    const existing = db.prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (existing) {
        const { sql } = existing;
        // The dimension count is read from the table's own CREATE statement.
        const dimMatch = sql.match(/float\[(\d+)\]/);
        const existingDims = dimMatch?.[1] ? parseInt(dimMatch[1], 10) : null;
        const layoutIsCurrent = sql.includes('hash_seq') && sql.includes('distance_metric=cosine');
        if (existingDims === dimensions && layoutIsCurrent) {
            return;
        }
        const dimsDiffer = existingDims !== null && existingDims !== dimensions;
        if (dimsDiffer) {
            throw new Error(`Embedding dimension mismatch: existing vectors are ${existingDims}d but the current model produces ${dimensions}d. ` +
                `Run 'qmd embed -f' to re-embed with the new model.`);
        }
        db.exec("DROP TABLE IF EXISTS vectors_vec");
    }
    db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}] distance_metric=cosine)`);
}
/**
 * Re-index a single collection by scanning the filesystem and updating the database.
 * Pure function — no console output, no db lifecycle management.
 *
 * For every matching file: unreadable and blank files are skipped; files
 * whose content hash is unchanged only get a title refresh when the title
 * differs; changed files get new content plus an updated document row; new
 * files are inserted. Active documents whose path was not seen in this scan
 * are deactivated, then orphaned content is cleaned up.
 *
 * `options.onProgress` is invoked exactly once per scanned file — including
 * unreadable and blank ones — so the final call always reports
 * `current === total`.
 *
 * @param store Store object; only `store.db` is used.
 * @param collectionPath Root directory of the collection.
 * @param globPattern Glob pattern (relative to `collectionPath`) selecting files.
 * @param collectionName Name under which documents are recorded.
 * @param options Optional `{ ignorePatterns, onProgress }`.
 * @returns Counters `{ indexed, updated, unchanged, removed, orphanedCleaned }`.
 */
export async function reindexCollection(store, collectionPath, globPattern, collectionName, options) {
    const db = store.db;
    const now = new Date().toISOString();
    const excludeDirs = ["node_modules", ".git", ".cache", "vendor", "dist", "build"];
    const allIgnore = [
        ...excludeDirs.map(d => `**/${d}/**`),
        ...(options?.ignorePatterns || []),
    ];
    const allFiles = await fastGlob(globPattern, {
        cwd: collectionPath,
        onlyFiles: true,
        followSymbolicLinks: false,
        dot: false,
        ignore: allIgnore,
    });
    // Filter hidden files/folders
    const files = allFiles.filter(file => !file.split("/").some(part => part.startsWith(".")));
    const total = files.length;
    let indexed = 0;
    let updated = 0;
    let unchanged = 0;
    let processed = 0;
    // Single exit point for per-file bookkeeping so no branch can forget to
    // report progress.
    const reportProgress = (file) => {
        processed++;
        options?.onProgress?.({ file, current: processed, total });
    };
    const seenPaths = new Set();
    for (const relativeFile of files) {
        const filepath = getRealPath(resolve(collectionPath, relativeFile));
        const path = handelize(relativeFile);
        // Recorded before reading, so a skipped (unreadable/blank) file does
        // not cause its existing document to be deactivated below.
        seenPaths.add(path);
        let content;
        try {
            content = readFileSync(filepath, "utf-8");
        }
        catch {
            reportProgress(relativeFile);
            continue;
        }
        if (!content.trim()) {
            reportProgress(relativeFile);
            continue;
        }
        const hash = await hashContent(content);
        const title = extractTitle(content, relativeFile);
        const existing = findActiveDocument(db, collectionName, path);
        if (existing) {
            if (existing.hash === hash) {
                if (existing.title !== title) {
                    updateDocumentTitle(db, existing.id, title, now);
                    updated++;
                }
                else {
                    unchanged++;
                }
            }
            else {
                insertContent(db, hash, content, now);
                const stat = statSync(filepath);
                updateDocument(db, existing.id, title, hash, stat ? new Date(stat.mtime).toISOString() : now);
                updated++;
            }
        }
        else {
            indexed++;
            insertContent(db, hash, content, now);
            const stat = statSync(filepath);
            insertDocument(db, collectionName, path, title, hash, stat ? new Date(stat.birthtime).toISOString() : now, stat ? new Date(stat.mtime).toISOString() : now);
        }
        reportProgress(relativeFile);
    }
    // Deactivate documents that no longer exist
    const allActive = getActiveDocumentPaths(db, collectionName);
    let removed = 0;
    for (const path of allActive) {
        if (!seenPaths.has(path)) {
            deactivateDocument(db, collectionName, path);
            removed++;
        }
    }
    const orphanedCleaned = cleanupOrphanedContent(db);
    return { indexed, updated, unchanged, removed, orphanedCleaned };
}
/**
 * Validate an optional numeric option.
 *
 * @param name Option name, used in the error message.
 * @param value Supplied value; `undefined` means "not supplied".
 * @param fallback Value returned when the option was not supplied.
 * @returns `fallback` for `undefined`, otherwise `value` itself.
 * @throws Error when `value` is anything other than an integer >= 1.
 */
function validatePositiveIntegerOption(name, value, fallback) {
    if (value === undefined) {
        return fallback;
    }
    const isPositiveInteger = Number.isInteger(value) && value >= 1;
    if (isPositiveInteger) {
        return value;
    }
    throw new Error(`${name} must be a positive integer`);
}
/**
 * Resolve the batching limits for an embedding run, applying the module
 * defaults for anything the caller left out and validating what was given.
 *
 * @param options Optional `{ maxDocsPerBatch, maxBatchBytes }`.
 * @returns `{ maxDocsPerBatch, maxBatchBytes }` with both fields populated.
 * @throws Error (from validatePositiveIntegerOption) on an invalid value.
 */
function resolveEmbedOptions(options) {
    const maxDocsPerBatch = validatePositiveIntegerOption("maxDocsPerBatch", options?.maxDocsPerBatch, DEFAULT_EMBED_MAX_DOCS_PER_BATCH);
    const maxBatchBytes = validatePositiveIntegerOption("maxBatchBytes", options?.maxBatchBytes, DEFAULT_EMBED_MAX_BATCH_BYTES);
    return { maxDocsPerBatch, maxBatchBytes };
}
/**
 * List the content hashes that still need embeddings: hashes referenced by
 * at least one active document and lacking a `content_vectors` row with
 * `seq = 0`. One row is returned per hash, ordered by its smallest path.
 *
 * Each row carries `hash`, `path`, `collection` and `bytes` (byte length of
 * the stored document). When the same content lives in several documents,
 * `MIN(d.path)` / `MIN(d.collection)` pick a deterministic representative;
 * the bytes are identical either way, and the chunkStrategy lookup resolves
 * through the chosen collection (Phase 2 design notes, i-bud0h8vu).
 * NOTE(review): the two MIN() aggregates are evaluated independently, so the
 * reported path and collection may come from different documents — confirm
 * callers do not rely on them forming a real pair.
 *
 * i-ofojj7dy — when `collection` is supplied, rows are filtered BEFORE the
 * GROUP BY so only hashes with a document in that collection are returned.
 * Embeddings are keyed by hash, so other collections sharing the content
 * still benefit from whatever gets generated.
 *
 * @param db Database handle exposing `prepare(sql).all(...)`.
 * @param collection Optional collection name to restrict the result to.
 * @returns Array of `{ hash, path, collection, bytes }` rows.
 */
function getPendingEmbeddingDocs(db, collection) {
    const restrictToCollection = collection !== undefined;
    if (!restrictToCollection) {
        const pendingEverywhere = db.prepare(`
    SELECT d.hash, MIN(d.path) as path, MIN(d.collection) as collection, length(CAST(c.doc AS BLOB)) as bytes
    FROM documents d
    JOIN content c ON d.hash = c.hash
    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
    WHERE d.active = 1 AND v.hash IS NULL
    GROUP BY d.hash
    ORDER BY MIN(d.path)
  `);
        return pendingEverywhere.all();
    }
    const pendingInCollection = db.prepare(`
      SELECT d.hash, MIN(d.path) as path, MIN(d.collection) as collection, length(CAST(c.doc AS BLOB)) as bytes
      FROM documents d
      JOIN content c ON d.hash = c.hash
      LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
      WHERE d.active = 1 AND v.hash IS NULL AND d.collection = ?
      GROUP BY d.hash
      ORDER BY MIN(d.path)
    `);
    return pendingInCollection.all(collection);
}
/**
 * Group documents into consecutive batches, preserving input order.
 *
 * A new batch is started before adding a document when the current batch
 * already holds `maxDocsPerBatch` documents, or when it is non-empty and
 * adding the document would push its byte total above `maxBatchBytes`.
 * A single document larger than `maxBatchBytes` therefore still gets a
 * batch of its own. Negative `bytes` values count as 0.
 *
 * @param docs Documents with a numeric `bytes` field.
 * @param maxDocsPerBatch Maximum number of documents per batch.
 * @param maxBatchBytes Byte budget per batch.
 * @returns Array of batches (arrays of the original document objects).
 */
function buildEmbeddingBatches(docs, maxDocsPerBatch, maxBatchBytes) {
    const batches = [];
    let pending = [];
    let pendingBytes = 0;
    const flush = () => {
        batches.push(pending);
        pending = [];
        pendingBytes = 0;
    };
    for (const doc of docs) {
        const size = Math.max(0, doc.bytes);
        const batchIsFull = pending.length >= maxDocsPerBatch;
        const overBudget = pending.length > 0 && (pendingBytes + size) > maxBatchBytes;
        if (batchIsFull || overBudget) {
            flush();
        }
        pending.push(doc);
        pendingBytes += size;
    }
    if (pending.length > 0) {
        flush();
    }
    return batches;
}
/**
 * Attach document bodies to a batch of pending-embedding rows.
 *
 * All bodies are fetched from `content` in one query keyed by hash; each
 * input entry is copied with an added `body` field, which is the empty
 * string when no content row was found (or its doc is null).
 *
 * @param db Database handle exposing `prepare(sql).all(...)`.
 * @param batch Rows carrying at least a `hash` field.
 * @returns New array, same order and length as `batch`, each entry extended with `body`.
 */
function getEmbeddingDocsForBatch(db, batch) {
    if (batch.length === 0) {
        return [];
    }
    const hashes = batch.map(doc => doc.hash);
    // One "?" per hash for the IN (...) list.
    const placeholders = hashes.map(() => "?").join(",");
    const rows = db.prepare(`
    SELECT hash, doc as body
    FROM content
    WHERE hash IN (${placeholders})
  `).all(...hashes);
    const bodyByHash = new Map();
    for (const row of rows) {
        bodyByHash.set(row.hash, row.body);
    }
    return batch.map((doc) => {
        const body = bodyByHash.get(doc.hash) ?? "";
        return { ...doc, body };
    });
}
/**
 * Invoke `body` with a session-like object exposing an AbortSignal
 * (`signal`) and a liveness flag (`isValid`).
 *
 * Without a `provider`, the call is delegated unchanged to
 * `withLLMSessionForLlm(getLlm(store), body, options)`, so `body` receives a
 * real session and can use `session.embed` / `session.embedBatch` for the
 * local path.
 *
 * With a `provider`, neither `getLlm` nor `withLLMSessionForLlm` is called,
 * so node-llama-cpp is not warmed up on remote-only deployments
 * (i-08ovbvtb, follow-up to i-qkarfffa). `body` instead gets a lightweight
 * stand-in backed by a fresh AbortController, which is aborted once `body`
 * settles (successfully or not). The stand-in's LLM-only methods
 * (embed/embedBatch/expandQuery/rerank) reject when called: with a provider
 * in play the embed path is expected to go through the provider, never
 * through the session.
 *
 * @param store Store passed to `getLlm` on the local path.
 * @param provider Remote embed provider; any truthy value selects the stand-in session.
 * @param body Async callback receiving the session.
 * @param options Forwarded to `withLLMSessionForLlm` on the local path only.
 * @returns Whatever `body` resolves to.
 */
async function withEmbedSession(store, provider, body, options) {
    if (!provider) {
        return withLLMSessionForLlm(getLlm(store), body, options);
    }
    const controller = new AbortController();
    // Factory for the methods that must never be reached on the provider path.
    const forbidden = (method) => async () => {
        throw new Error(`withEmbedSession: provider supplied — session.${method} must not be called`);
    };
    const standInSession = {
        get signal() { return controller.signal; },
        get isValid() { return !controller.signal.aborted; },
        embed: forbidden("embed"),
        embedBatch: forbidden("embedBatch"),
        expandQuery: forbidden("expandQuery"),
        rerank: forbidden("rerank"),
    };
    try {
        return await body(standInSession);
    }
    finally {
        // Invalidate the stand-in as soon as the body is done with it.
        controller.abort();
    }
}
/**
 * Generate vector embeddings for documents that need them.
 *
 * Does not manage the db lifecycle. Progress is reported through
 * `options.onProgress`; the only console output is a warning when the session
 * expires or the error rate gets too high, plus an optional retry trace on
 * stderr when `QMD_EMBED_DEBUG` is set.
 *
 * Embedding calls go through `options.embedProvider` when supplied; otherwise
 * they go through an LLM session obtained from `getLlm(store)`.
 *
 * @param store - Store object; `store.db` and `store.ensureVecTable` are used.
 * @param options - Optional settings read here: `model`, `force`,
 *   `embedProvider`, `collection`, `chunkStrategy`, `onProgress` (batch limits
 *   are resolved by `resolveEmbedOptions(options)`).
 * @returns `{ docsProcessed, chunksEmbedded, errors, durationMs }`.
 * @throws Error when the first chunk cannot be embedded even after one retry
 *   (the vector dimension is unknown in that case). The model compatibility
 *   check (`assertModelCompatible`) runs before any work and may also throw.
 */
export async function generateEmbeddings(store, options) {
    const db = store.db;
    const model = options?.model ?? DEFAULT_EMBED_MODEL;
    const now = new Date().toISOString();
    const { maxDocsPerBatch, maxBatchBytes } = resolveEmbedOptions(options);
    const encoder = new TextEncoder();
    // Migration safety: if an embedProvider is supplied, verify its model id
    // matches the existing content_vectors rows (unless we're about to clear
    // them via `force`). This must happen BEFORE we clear vectors so users
    // who pass `--force` aren't blocked.
    if (options?.embedProvider && !options.force) {
        const existing = getDistinctEmbeddingModels(db);
        assertModelCompatible(options.embedProvider.getModelId(), existing);
    }
    if (options?.force) {
        clearAllEmbeddings(db);
    }
    // i-ofojj7dy — optional collection filter restricts the pending-doc set.
    const docsToEmbed = getPendingEmbeddingDocs(db, options?.collection);
    if (docsToEmbed.length === 0) {
        return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 };
    }
    const totalBytes = docsToEmbed.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
    const totalDocs = docsToEmbed.length;
    const startTime = Date.now();
    // Per-collection chunkStrategy lookup (Phase 2 — i-bud0h8vu). YAML
    // `chunkStrategy` on a collection wins over `options.chunkStrategy`
    // (global CLI flag); falls back to the global option, then to
    // chunkDocumentByTokens' own "regex" default when neither is set.
    // Opt-in per collection — collections without the field are untouched.
    const collectionStrategies = new Map();
    try {
        const { listCollections: listYamlCollections } = await import("./collections.js");
        for (const c of listYamlCollections()) {
            if (c.chunkStrategy)
                collectionStrategies.set(c.name, c.chunkStrategy);
        }
    }
    catch {
        // If YAML config is missing/unreadable, fall back silently to the
        // global strategy — no collection overrides. Keeps SDK/inline
        // callers that never touch ~/.config/qmd working.
    }
    // Provider routing — when an EmbeddingProvider is supplied, embed calls go
    // through it (HTTP, GPU worker, etc.). Otherwise, use the LLM session path.
    // The outer session is still created for its abort signal (chunking uses
    // `session.signal` for cooperative cancellation).
    const provider = options?.embedProvider;
    const providerModel = provider?.getModelId() ?? model;
    // Resolve `embedModelUri` (used for formatting prefixes etc.) lazily —
    // when `provider` is set, take it from the provider; otherwise fall back
    // to the local LlamaCpp's embed model name. Accessing `getLlm(store)` is
    // deferred to the non-provider branch so remote-only deployments do not
    // construct a `LlamaCpp` instance just to read its embedModelName.
    const embedModelUri = provider
        ? provider.getModelId()
        : getLlm(store).embedModelName;
    // Run the embedding loop inside a session-scoped wrapper. When `provider`
    // is set, this short-circuits the local LLM warm-up entirely (i-08ovbvtb).
    const result = await withEmbedSession(store, provider, async (session) => {
        let chunksEmbedded = 0;
        let errors = 0;
        let bytesProcessed = 0;
        let totalChunks = 0;
        let vectorTableInitialized = false;
        // Inner batch size — number of chunks fed into each `embedMany` call.
        // Bumped 32 → 256 (i-fkpnar9i) so the openai provider's concurrent
        // dispatcher receives ≥ 4 sub-chunks of size 64 (worker MAX_BATCH) and
        // can saturate the worker's MAX_CONCURRENT_REQUESTS=4 semaphore.
        // Override per-deploy via `QMD_EMBED_INNER_BATCH_SIZE`.
        // Only a positive integer is accepted: zero, negative, or unparsable
        // values fall back to 256. A negative step would otherwise keep
        // `batchStart` from ever reaching `batchChunks.length` (infinite loop).
        const parsedBatchSize = Number.parseInt(process.env.QMD_EMBED_INNER_BATCH_SIZE ?? "256", 10);
        const BATCH_SIZE = parsedBatchSize > 0 ? parsedBatchSize : 256;
        const batches = buildEmbeddingBatches(docsToEmbed, maxDocsPerBatch, maxBatchBytes);
        // Embedding helpers — single point of provider/session selection.
        // Both return the same shape as ILLMSession.embed/embedBatch so the
        // rest of the loop is unchanged.
        const embedOne = async (text, modelArg) => {
            if (provider) {
                const sig = provider.kind === 'local' ? session.signal : undefined;
                const r = await provider.embed(text, { model: modelArg, signal: sig });
                return r ? { embedding: r.embedding, model: r.model } : null;
            }
            return session.embed(text, { model: modelArg });
        };
        const embedMany = async (texts, modelArg) => {
            if (provider) {
                const sig = provider.kind === 'local' ? session.signal : undefined;
                const r = await provider.embedBatch(texts, { model: modelArg, signal: sig });
                return r.map((x) => (x ? { embedding: x.embedding, model: x.model } : null));
            }
            return session.embedBatch(texts, { model: modelArg });
        };
        // JS-only token estimator for the provider path. Char-based with
        // avgCharsPerToken=3 — matches the heuristic the chunker already
        // uses for its initial char-space pass, so the safety re-split is a
        // near no-op while populating the `tokens` field with a stable
        // estimate. CRITICAL: avoids loading node-llama-cpp on remote-only
        // deployments (`QMD_EMBED_ENDPOINT=...`). i-1rqixh6m DoD #1.
        const chunkTokenizer = provider
            ? (text) => Math.ceil(text.length / 3)
            : undefined;
        for (const batchMeta of batches) {
            // Abort early if session has been invalidated
            if (!session.isValid) {
                console.warn(`⚠ Session expired — skipping remaining document batches`);
                break;
            }
            const batchDocs = getEmbeddingDocsForBatch(db, batchMeta);
            const batchChunks = [];
            const batchBytes = batchMeta.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
            for (const doc of batchDocs) {
                if (!doc.body.trim())
                    continue;
                const title = extractTitle(doc.body, doc.path);
                const perCollectionStrategy = collectionStrategies.get(doc.collection);
                const chunkStrategy = perCollectionStrategy ?? options?.chunkStrategy;
                const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, chunkStrategy, session.signal, chunkTokenizer);
                for (let seq = 0; seq < chunks.length; seq++) {
                    batchChunks.push({
                        hash: doc.hash,
                        title,
                        text: chunks[seq].text,
                        seq,
                        pos: chunks[seq].pos,
                        tokens: chunks[seq].tokens,
                        bytes: encoder.encode(chunks[seq].text).length,
                    });
                }
            }
            totalChunks += batchChunks.length;
            if (batchChunks.length === 0) {
                bytesProcessed += batchBytes;
                options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors });
                continue;
            }
            if (!vectorTableInitialized) {
                const firstChunk = batchChunks[0];
                const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title, embedModelUri);
                // Single retry on transient failure (issue i-vm1lxwry). The provider
                // swallows per-chunk errors per its contract — `getLastError?.()`
                // surfaces the actual cause (HTTP status / abort / parse error) so we
                // can include it in the thrown message instead of the cryptic
                // "Failed to get embedding dimensions from first chunk".
                let firstResult = await embedOne(firstText, providerModel);
                if (!firstResult && session.isValid) {
                    const firstErr = provider?.getLastError?.();
                    // Brief backoff before retry — embedding worker may be re-warming
                    // a model or the GPU host may be transiently busy. 250ms is short
                    // enough to be invisible on the happy path and long enough to
                    // clear most "thundering-herd" race conditions.
                    await new Promise((resolve) => setTimeout(resolve, 250));
                    if (process.env.QMD_EMBED_DEBUG) {
                        process.stderr.write(`qmd embed: first-chunk dimension probe failed, retrying once${firstErr ? ` (last error: ${firstErr})` : ""}\n`);
                    }
                    firstResult = await embedOne(firstText, providerModel);
                }
                if (!firstResult) {
                    const lastErr = provider?.getLastError?.();
                    const providerHint = provider ? `provider=${provider.kind}` : "provider=session";
                    const errSuffix = lastErr ? ` — underlying: ${lastErr}` : "";
                    const debugHint = process.env.QMD_EMBED_DEBUG
                        ? ""
                        : " (set QMD_EMBED_DEBUG=1 for per-chunk traces)";
                    throw new Error(`Failed to get embedding dimensions from first chunk after retry [${providerHint}]${errSuffix}${debugHint}`);
                }
                // The probe result is only used for its vector length.
                store.ensureVecTable(firstResult.embedding.length);
                vectorTableInitialized = true;
            }
            const totalBatchChunkBytes = batchChunks.reduce((sum, chunk) => sum + chunk.bytes, 0);
            let batchChunkBytesProcessed = 0;
            for (let batchStart = 0; batchStart < batchChunks.length; batchStart += BATCH_SIZE) {
                // Abort early if session has been invalidated (e.g. max duration exceeded)
                if (!session.isValid) {
                    const remaining = batchChunks.length - batchStart;
                    errors += remaining;
                    console.warn(`⚠ Session expired — skipping ${remaining} remaining chunks`);
                    break;
                }
                // Abort early if error rate is too high (>80% of processed chunks failed)
                const processed = chunksEmbedded + errors;
                if (processed >= BATCH_SIZE && errors > processed * 0.8) {
                    const remaining = batchChunks.length - batchStart;
                    // Log the ratio that tripped the threshold BEFORE the skipped
                    // chunks are folded into `errors`, so the message is not inflated
                    // past 100%.
                    console.warn(`⚠ Error rate too high (${errors}/${processed}) — aborting embedding`);
                    errors += remaining;
                    break;
                }
                const batchEnd = Math.min(batchStart + BATCH_SIZE, batchChunks.length);
                const chunkBatch = batchChunks.slice(batchStart, batchEnd);
                const texts = chunkBatch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title, embedModelUri));
                try {
                    const embeddings = await embedMany(texts, providerModel);
                    // Wrap the per-chunk inserts in a single SQLite transaction
                    // (i-fkpnar9i Phase 1 #3): avoids the WAL fsync per-row tax on
                    // large `BATCH_SIZE`. better-sqlite3's `db.transaction(fn)` opens
                    // BEGIN IMMEDIATE on entry and COMMITs on return; if any insert
                    // throws, the wrapper rolls back AND re-throws, falling through
                    // to the per-chunk fallback below — preserving the legacy
                    // "best-effort survive partial failures" semantics.
                    //
                    // We DELIBERATELY do not wrap the fallback's per-chunk loop —
                    // that path is per-chunk individual auto-commits so a single
                    // bad chunk doesn't drag down the rest. (Wrapping would be a
                    // step backward.)
                    const insertBatchTxn = db.transaction(() => {
                        let okCount = 0;
                        let errCount = 0;
                        for (let i = 0; i < chunkBatch.length; i++) {
                            const chunk = chunkBatch[i];
                            const embedding = embeddings[i];
                            if (embedding) {
                                insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), providerModel, now);
                                okCount++;
                            }
                            else {
                                errCount++;
                            }
                        }
                        return { okCount, errCount };
                    });
                    const { okCount, errCount } = insertBatchTxn();
                    chunksEmbedded += okCount;
                    errors += errCount;
                    batchChunkBytesProcessed += chunkBatch.reduce((sum, c) => sum + c.bytes, 0);
                }
                catch {
                    // Batch failed — try individual embeddings as fallback
                    // But skip if session is already invalid (avoids N doomed retries)
                    if (!session.isValid) {
                        errors += chunkBatch.length;
                        batchChunkBytesProcessed += chunkBatch.reduce((sum, c) => sum + c.bytes, 0);
                    }
                    else {
                        for (const chunk of chunkBatch) {
                            try {
                                const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
                                const result = await embedOne(text, providerModel);
                                if (result) {
                                    insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), providerModel, now);
                                    chunksEmbedded++;
                                }
                                else {
                                    errors++;
                                }
                            }
                            catch {
                                errors++;
                            }
                            batchChunkBytesProcessed += chunk.bytes;
                        }
                    }
                }
                // Report intra-batch progress as a share of the batch's document
                // bytes, scaled by how many chunk bytes have been handled so far.
                const proportionalBytes = totalBatchChunkBytes === 0
                    ? batchBytes
                    : Math.min(batchBytes, Math.round((batchChunkBytesProcessed / totalBatchChunkBytes) * batchBytes));
                options?.onProgress?.({
                    chunksEmbedded,
                    totalChunks,
                    bytesProcessed: bytesProcessed + proportionalBytes,
                    totalBytes,
                    errors,
                });
            }
            bytesProcessed += batchBytes;
            options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors });
        }
        return { chunksEmbedded, errors };
    }, { maxDuration: 30 * 60 * 1000, name: 'generateEmbeddings' });
    return {
        docsProcessed: totalDocs,
        chunksEmbedded: result.chunksEmbedded,
        errors: result.errors,
        durationMs: Date.now() - startTime,
    };
}
/**
 * Create a new store instance with the given database path.
 * If no path is provided, uses the default path returned by
 * `getDefaultDbPath()`.
 *
 * The database is opened and initialized immediately. Every method on the
 * returned object is a thin wrapper that forwards to the module-level
 * function of the same name with this store's `db` handle.
 *
 * @param dbPath - Path to the SQLite database file (falsy → default path)
 * @returns Store instance with all methods bound to the database
 */
export function createStore(dbPath) {
    const resolvedPath = dbPath || getDefaultDbPath();
    const db = openDatabase(resolvedPath);
    initializeDatabase(db);
    const store = {
        db,
        dbPath: resolvedPath,
        close: () => db.close(),
        ensureVecTable: (dimensions) => ensureVecTableInternal(db, dimensions),
        // Index health
        // `collection` is optional and forwarded so callers can use the same
        // per-collection filter the underlying function already supports;
        // omitting it keeps the previous unfiltered behavior.
        getHashesNeedingEmbedding: (collection) => getHashesNeedingEmbedding(db, collection),
        getIndexHealth: () => getIndexHealth(db),
        getStatus: () => getStatus(db),
        // Caching
        getCacheKey,
        getCachedResult: (cacheKey) => getCachedResult(db, cacheKey),
        setCachedResult: (cacheKey, result) => setCachedResult(db, cacheKey, result),
        clearCache: () => clearCache(db),
        // Cleanup and maintenance
        deleteLLMCache: () => deleteLLMCache(db),
        deleteInactiveDocuments: () => deleteInactiveDocuments(db),
        cleanupOrphanedContent: () => cleanupOrphanedContent(db),
        cleanupOrphanedVectors: () => cleanupOrphanedVectors(db),
        vacuumDatabase: () => vacuumDatabase(db),
        // Context
        getContextForFile: (filepath) => getContextForFile(db, filepath),
        getContextForPath: (collectionName, path) => getContextForPath(db, collectionName, path),
        getCollectionByName: (name) => getCollectionByName(db, name),
        getCollectionsWithoutContext: () => getCollectionsWithoutContext(db),
        getTopLevelPathsWithoutContext: (collectionName) => getTopLevelPathsWithoutContext(db, collectionName),
        // Virtual paths
        parseVirtualPath,
        buildVirtualPath,
        isVirtualPath,
        resolveVirtualPath: (virtualPath) => resolveVirtualPath(db, virtualPath),
        toVirtualPath: (absolutePath) => toVirtualPath(db, absolutePath),
        // Search
        searchFTS: (query, limit, collectionName) => searchFTS(db, query, limit, collectionName),
        searchVec: (query, model, limit, collectionName, session, precomputedEmbedding, embedProvider) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding, embedProvider),
        // Query expansion & reranking
        // `store.llm` is read at call time, so an LLM assigned to the store
        // after creation is picked up by these wrappers.
        expandQuery: (query, model, intent) => expandQuery(query, model, db, intent, store.llm),
        rerank: (query, documents, model, intent) => rerank(query, documents, model, db, intent, store.llm),
        // Document retrieval
        findDocument: (filename, options) => findDocument(db, filename, options),
        getDocumentBody: (doc, fromLine, maxLines) => getDocumentBody(db, doc, fromLine, maxLines),
        findDocuments: (pattern, options) => findDocuments(db, pattern, options),
        // Fuzzy matching and docid lookup
        findSimilarFiles: (query, maxDistance, limit) => findSimilarFiles(db, query, maxDistance, limit),
        matchFilesByGlob: (pattern) => matchFilesByGlob(db, pattern),
        findDocumentByDocid: (docid) => findDocumentByDocid(db, docid),
        // Document indexing operations
        insertContent: (hash, content, createdAt) => insertContent(db, hash, content, createdAt),
        insertDocument: (collectionName, path, title, hash, createdAt, modifiedAt) => insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt),
        findActiveDocument: (collectionName, path) => findActiveDocument(db, collectionName, path),
        updateDocumentTitle: (documentId, title, modifiedAt) => updateDocumentTitle(db, documentId, title, modifiedAt),
        updateDocument: (documentId, title, hash, modifiedAt) => updateDocument(db, documentId, title, hash, modifiedAt),
        deactivateDocument: (collectionName, path) => deactivateDocument(db, collectionName, path),
        getActiveDocumentPaths: (collectionName) => getActiveDocumentPaths(db, collectionName),
        // Vector/embedding operations
        getHashesForEmbedding: () => getHashesForEmbedding(db),
        clearAllEmbeddings: () => clearAllEmbeddings(db),
        insertEmbedding: (hash, seq, pos, embedding, model, embeddedAt) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt),
    };
    return store;
}
/**
 * Derive the short docid for a content hash: its leading 6 characters
 * (or the whole hash when it is shorter than that).
 */
export function getDocid(hash) {
    const DOCID_LENGTH = 6;
    return hash.slice(0, DOCID_LENGTH);
}
/** Replace emoji/symbol codepoints with their hex representation (e.g. 🐘 → 1f418) */
function emojiToHex(str) {
    const symbolRun = /(?:\p{So}\p{Mn}?|\p{Sk})+/gu;
    const isSymbol = (ch) => /\p{So}|\p{Sk}/u.test(ch);
    return str.replace(symbolRun, (run) => {
        // Walk the run by code point; combining marks are dropped, each
        // remaining symbol becomes its hex code point, dash-separated.
        const hexCodes = [];
        for (const ch of run) {
            if (isSymbol(ch)) {
                hexCodes.push(ch.codePointAt(0).toString(16));
            }
        }
        return hexCodes.join('-');
    });
}
/**
 * Handelize a filename to be more token-friendly.
 * - Convert triple underscore `___` to `/` (folder separator)
 * - Convert to lowercase
 * - Replace emoji/symbol characters with their hex code points
 * - Replace sequences of other chars (anything but letters, digits, `$`) with a single dash
 * - Remove leading/trailing dashes from path segments
 * - Preserve folder structure (a/b/c/d.md stays structured)
 * - Preserve file extension
 *
 * Throws when the path is empty/blank, when the filename (without extension)
 * contains no letter, digit, symbol or `$`, or when nothing is left after
 * cleaning.
 */
export function handelize(path) {
    if (!path || path.trim() === '') {
        throw new Error('handelize: path cannot be empty');
    }
    // Allow route-style "$" filenames while still rejecting paths with no usable content.
    // Emoji (\p{So}) counts as valid content — they get converted to hex codepoints below.
    const rawSegments = path.split('/').filter(Boolean);
    const rawFilename = rawSegments[rawSegments.length - 1] || '';
    const rawStem = rawFilename.replace(/\.[^.]+$/, '');
    if (!/[\p{L}\p{N}\p{So}\p{Sk}$]/u.test(rawStem)) {
        throw new Error(`handelize: path "${path}" has no valid filename content`);
    }
    // Keep letters, numbers, "$"; dash-separate the rest (including dots),
    // then trim dashes from both ends.
    const dashify = (text) => text
        .replace(/[^\p{L}\p{N}$]+/gu, '-')
        .replace(/^-+|-+$/g, '');
    // Triple underscore becomes a folder separator before splitting.
    const parts = path.replace(/___/g, '/').toLowerCase().split('/');
    const lastIndex = parts.length - 1;
    const cleaned = [];
    parts.forEach((part, idx) => {
        // Convert emoji to hex codepoints before cleaning
        const hexed = emojiToHex(part);
        let out;
        if (idx !== lastIndex) {
            // Directories are cleaned as a whole.
            out = dashify(hexed);
        }
        else {
            // The filename keeps its extension untouched.
            const extMatch = hexed.match(/(\.[a-z0-9]+)$/i);
            const ext = extMatch ? extMatch[1] : '';
            const stem = ext ? hexed.slice(0, -ext.length) : hexed;
            out = dashify(stem) + ext;
        }
        // Segments that cleaned down to nothing are dropped.
        if (out) {
            cleaned.push(out);
        }
    });
    const result = cleaned.join('/');
    if (!result) {
        throw new Error(`handelize: path "${path}" resulted in empty string after processing`);
    }
    return result;
}
// =============================================================================
// Index health
// =============================================================================
/**
 * Count distinct hashes of active documents that have no `content_vectors`
 * row with `seq = 0`.
 *
 * i-ofojj7dy — optional collection filter. When `collection` is anything
 * other than `undefined`, the count is restricted to documents in that
 * collection.
 */
export function getHashesNeedingEmbedding(db, collection) {
    const scoped = collection !== undefined;
    const statement = scoped
        ? db.prepare(`
      SELECT COUNT(DISTINCT d.hash) as count
      FROM documents d
      LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
      WHERE d.active = 1 AND v.hash IS NULL AND d.collection = ?
    `)
        : db.prepare(`
    SELECT COUNT(DISTINCT d.hash) as count
    FROM documents d
    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
    WHERE d.active = 1 AND v.hash IS NULL
  `);
    const row = scoped ? statement.get(collection) : statement.get();
    return row.count;
}
/**
 * Summarize index health: how many hashes still need embeddings, how many
 * active documents exist, and how many whole days have passed since the most
 * recent `modified_at` among active documents (`null` when there is none).
 */
export function getIndexHealth(db) {
    const MS_PER_DAY = 24 * 60 * 60 * 1000;
    const needsEmbedding = getHashesNeedingEmbedding(db);
    const { count: totalDocs } = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get();
    const newest = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get();
    const latest = newest?.latest;
    const daysStale = latest
        ? Math.floor((Date.now() - new Date(latest).getTime()) / MS_PER_DAY)
        : null;
    return { needsEmbedding, totalDocs, daysStale };
}
// =============================================================================
// Caching
// =============================================================================
/**
 * Build a cache key: the hex SHA-256 digest of `url` followed by the JSON
 * serialization of `body`.
 */
export function getCacheKey(url, body) {
    return createHash("sha256")
        .update(url)
        .update(JSON.stringify(body))
        .digest("hex");
}
/**
 * Look up a cached result in `llm_cache` by key.
 * Returns `null` when there is no row or the stored result is falsy.
 */
export function getCachedResult(db, cacheKey) {
    const lookup = db.prepare(`SELECT result FROM llm_cache WHERE hash = ?`);
    const hit = lookup.get(cacheKey);
    const cached = hit?.result;
    return cached ? cached : null;
}
/**
 * Store (or overwrite) a cached result under `cacheKey`, stamped with the
 * current time. On roughly 1% of calls the cache is also pruned down to the
 * 1000 most recently created entries.
 */
export function setCachedResult(db, cacheKey, result) {
    const createdAt = new Date().toISOString();
    const upsert = db.prepare(`INSERT OR REPLACE INTO llm_cache (hash, result, created_at) VALUES (?, ?, ?)`);
    upsert.run(cacheKey, result, createdAt);
    // Probabilistic pruning keeps the table bounded without paying the
    // cleanup cost on every write.
    const shouldPrune = Math.random() < 0.01;
    if (!shouldPrune) {
        return;
    }
    db.exec(`DELETE FROM llm_cache WHERE hash NOT IN (SELECT hash FROM llm_cache ORDER BY created_at DESC LIMIT 1000)`);
}
/**
 * Remove every row from `llm_cache`. Returns nothing.
 */
export function clearCache(db) {
    const wipeSql = `DELETE FROM llm_cache`;
    db.exec(wipeSql);
}
// =============================================================================
// Cleanup and maintenance operations
// =============================================================================
/**
 * Delete all cached LLM API responses from `llm_cache`.
 * Returns the number of rows the delete reported as changed.
 */
export function deleteLLMCache(db) {
    const purge = db.prepare(`DELETE FROM llm_cache`);
    const { changes } = purge.run();
    return changes;
}
/**
 * Delete document records flagged inactive (active = 0).
 * Returns the number of rows the delete reported as changed.
 */
export function deleteInactiveDocuments(db) {
    const purge = db.prepare(`DELETE FROM documents WHERE active = 0`);
    const { changes } = purge.run();
    return changes;
}
/**
 * Delete rows from `content` whose hash is not used by any active document.
 * Returns the number of rows the delete reported as changed.
 */
export function cleanupOrphanedContent(db) {
    const purge = db.prepare(`
    DELETE FROM content
    WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
  `);
    const { changes } = purge.run();
    return changes;
}
/**
 * Remove vector embeddings whose hash is not used by any active document.
 * Returns the number of orphaned `content_vectors` rows counted before the
 * deletes ran, or 0 when vector support is unavailable or nothing is orphaned.
 */
export function cleanupOrphanedVectors(db) {
    // sqlite-vec may not be loaded (e.g. Bun's bun:sqlite lacks loadExtension).
    // The vectors_vec virtual table can appear in sqlite_master from a prior
    // session, but querying it without the vec0 module loaded will crash (#380).
    if (!isSqliteVecAvailable()) {
        return 0;
    }
    // Probe the virtual table: the schema entry can exist even when sqlite-vec
    // itself is unavailable (e.g. a DB reopened without vec0 loaded), in which
    // case touching it throws "no such module: vec0". Degrade gracefully.
    try {
        db.prepare(`SELECT 1 FROM vectors_vec LIMIT 0`).get();
    }
    catch {
        return 0;
    }
    // Count the orphans up front; that count is what gets reported.
    const { c: orphanCount } = db.prepare(`
    SELECT COUNT(*) as c FROM content_vectors cv
    WHERE NOT EXISTS (
      SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
    )
  `).get();
    if (orphanCount === 0) {
        return 0;
    }
    // vectors_vec goes first: its subquery reads the content_vectors rows
    // that the second statement deletes.
    db.exec(`
    DELETE FROM vectors_vec WHERE hash_seq IN (
      SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
      WHERE NOT EXISTS (
        SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
      )
    )
  `);
    db.exec(`
    DELETE FROM content_vectors WHERE hash NOT IN (
      SELECT hash FROM documents WHERE active = 1
    )
  `);
    return orphanCount;
}
/**
 * Run SQLite `VACUUM` on the database, which rebuilds the database file to
 * reclaim unused space and eliminate fragmentation.
 */
export function vacuumDatabase(db) {
    const vacuumSql = `VACUUM`;
    db.exec(vacuumSql);
}
// =============================================================================
// Document helpers
// =============================================================================
/**
 * Compute the hex SHA-256 digest of `content`.
 * Declared async, so callers receive a Promise of the digest string.
 */
export async function hashContent(content) {
    return createHash("sha256").update(content).digest("hex");
}
// Per-extension title extractors. Each takes the document text and returns a
// trimmed title string, or null when the document has no usable heading.
const titleExtractors = {
    '.md': (content) => {
        // First level-1 or level-2 markdown heading.
        const match = content.match(/^##?\s+(.+)$/m);
        if (match) {
            const title = (match[1] ?? "").trim();
            // A generic "Notes" heading is not a useful title: prefer the
            // first level-2 heading in the document when there is one.
            if (title === "📝 Notes" || title === "Notes") {
                const nextMatch = content.match(/^##\s+(.+)$/m);
                if (nextMatch?.[1])
                    return nextMatch[1].trim();
            }
            return title;
        }
        return null;
    },
    '.org': (content) => {
        // An explicit #+TITLE: property wins over the first heading.
        const titleProp = content.match(/^#\+TITLE:\s*(.+)$/im);
        if (titleProp?.[1])
            return titleProp[1].trim();
        const heading = content.match(/^\*+\s+(.+)$/m);
        if (heading?.[1])
            return heading[1].trim();
        return null;
    },
};
/**
 * Pick a display title for a document.
 *
 * Uses the extractor registered for the file's extension (case-insensitive)
 * when it yields a title; otherwise falls back to the file's basename without
 * its extension, and finally to `filename` itself when that is empty.
 *
 * The extension and the fallback are both computed from the last path segment
 * only, so a dot in a directory name (e.g. "docs/v1.2/README") is not mistaken
 * for a file extension.
 *
 * @param content - Full document text
 * @param filename - File name or "/"-separated path
 * @returns The extracted or derived title
 */
export function extractTitle(content, filename) {
    const segments = filename.split("/").filter(Boolean);
    const basename = segments[segments.length - 1] ?? filename;
    const dotIndex = basename.lastIndexOf('.');
    // No dot → no extension (avoids slicing from index -1).
    const ext = dotIndex === -1 ? '' : basename.slice(dotIndex).toLowerCase();
    const extractor = titleExtractors[ext];
    if (extractor) {
        const title = extractor(content);
        if (title)
            return title;
    }
    return basename.replace(/\.[^.]+$/, "") || filename;
}
// =============================================================================
// Document indexing operations
// =============================================================================
/**
 * Store a document body in the content table, keyed by its hash
 * (content-addressable storage). INSERT OR IGNORE means an already-present
 * hash is skipped rather than overwritten.
 */
export function insertContent(db, hash, content, createdAt) {
    const insert = db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`);
    insert.run(hash, content, createdAt);
}
/**
 * Insert a document row as active. When a row with the same
 * (collection, path) already exists, its title, hash and modified_at are
 * overwritten and it is re-activated; created_at is left as it was.
 */
export function insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt) {
    const upsert = db.prepare(`
    INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active)
    VALUES (?, ?, ?, ?, ?, ?, 1)
    ON CONFLICT(collection, path) DO UPDATE SET
      title = excluded.title,
      hash = excluded.hash,
      modified_at = excluded.modified_at,
      active = 1
  `);
    upsert.run(collectionName, path, title, hash, createdAt, modifiedAt);
}
/**
 * Look up the active document at (collection, path).
 * Returns its `{ id, hash, title }` row, or `null` when none matches.
 */
export function findActiveDocument(db, collectionName, path) {
    const lookup = db.prepare(`
    SELECT id, hash, title FROM documents
    WHERE collection = ? AND path = ? AND active = 1
  `);
    const match = lookup.get(collectionName, path);
    if (match === undefined || match === null) {
        return null;
    }
    return match;
}
/**
 * Set a new title and modified_at timestamp on the document with the given id.
 */
export function updateDocumentTitle(db, documentId, title, modifiedAt) {
    const update = db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`);
    update.run(title, modifiedAt, documentId);
}
/**
 * Rewrite a document's title, hash and modified_at timestamp by id.
 * Used when the content changes but the file path stays the same.
 */
export function updateDocument(db, documentId, title, hash, modifiedAt) {
    const update = db.prepare(`UPDATE documents SET title = ?, hash = ?, modified_at = ? WHERE id = ?`);
    update.run(title, hash, modifiedAt, documentId);
}
/**
 * Mark the active document at (collection, path) as inactive.
 * The row is kept; only its `active` flag is cleared.
 */
export function deactivateDocument(db, collectionName, path) {
    const deactivate = db.prepare(`UPDATE documents SET active = 0 WHERE collection = ? AND path = ? AND active = 1`);
    deactivate.run(collectionName, path);
}
/**
 * List the `path` of every active document in a collection, in the order
 * the query returns them.
 */
export function getActiveDocumentPaths(db, collectionName) {
    const query = db.prepare(`
    SELECT path FROM documents WHERE collection = ? AND active = 1
  `);
    const paths = [];
    for (const row of query.all(collectionName)) {
        paths.push(row.path);
    }
    return paths;
}
export { formatQueryForEmbedding, formatDocForEmbedding };
/**
 * Synchronous chunker that relies only on regex-detected break points.
 * Scans `content` for break points and code fences, then hands both to the
 * shared chunking algorithm. Kept as the backward-compatible entry point for
 * tests and legacy callers.
 */
export function chunkDocument(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS) {
    return chunkDocumentWithBreakPoints(content, scanBreakPoints(content), findCodeFences(content), maxChars, overlapChars, windowChars);
}
/**
 * Async chunker that can use AST information on top of regex break points.
 *
 * Strategies:
 *   - "regex"    (default) — regex break points only.
 *   - "auto"     — regex break points merged with AST break points when the
 *                  AST pass returns any.
 *   - "function" — one chunk per AST function range via
 *                  `chunkByFunctionRanges`; when no ranges are found it
 *                  behaves exactly like "auto".
 *
 * The AST module is imported lazily and only when `filepath` is truthy and
 * the strategy is "auto" or "function"; every other combination takes the
 * regex-only path.
 */
export async function chunkDocumentAsync(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS, filepath, chunkStrategy = "regex") {
    const regexPoints = scanBreakPoints(content);
    const codeFences = findCodeFences(content);
    // Shared tail: run the char-based algorithm with the chosen break points.
    const finish = (breakPoints) => chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
    const wantsFunctionChunks = chunkStrategy === "function";
    const wantsAst = wantsFunctionChunks || chunkStrategy === "auto";
    if (!wantsAst || !filepath) {
        return finish(regexPoints);
    }
    const { getASTFunctionRanges, getASTBreakPoints } = await import("./ast.js");
    if (wantsFunctionChunks) {
        const ranges = await getASTFunctionRanges(content, filepath);
        if (ranges.length > 0) {
            return chunkByFunctionRanges(content, ranges, regexPoints, codeFences, maxChars, overlapChars, windowChars);
        }
        // No ranges: continue below so AST break points can still help.
    }
    const astPoints = await getASTBreakPoints(content, filepath);
    return finish(astPoints.length > 0 ? mergeBreakPoints(regexPoints, astPoints) : regexPoints);
}
/**
 * Build chunks from AST function ranges: each range becomes a chunk, and the
 * text between ranges (before the first, between neighbours, after the last)
 * becomes gap chunks. Any span longer than `maxChars` is split further with
 * `chunkDocumentWithBreakPoints`, using only the regex break points and code
 * fences that fall inside that span, rebased to the span's start.
 *
 * Whitespace-only gaps are dropped. If nothing at all was emitted for
 * non-empty content, the whole content is returned as a single chunk.
 *
 * Preconditions: `ranges` is non-empty, sorted by `startIndex`, and
 * non-overlapping (as produced by `getASTFunctionRanges`).
 */
function chunkByFunctionRanges(content, ranges, regexPoints, codeFences, maxChars, overlapChars, windowChars) {
    const chunks = [];
    // Emit content[from, to) as one chunk, or as several when it is oversized.
    // `isGap` enables the extra filtering that applies only to inter-range text.
    const emitSpan = (from, to, isGap) => {
        if (isGap && from >= to)
            return;
        const text = content.slice(from, to);
        if (text.length === 0)
            return;
        // Whitespace-only gaps carry no embeddable signal.
        if (isGap && !text.trim())
            return;
        if (text.length <= maxChars) {
            chunks.push({ text, pos: from });
            return;
        }
        // Oversized span: restrict break points and fences to the span and
        // rebase them so the char algorithm sees a standalone slice.
        const localPoints = [];
        for (const point of regexPoints) {
            if (point.pos >= from && point.pos < to)
                localPoints.push({ ...point, pos: point.pos - from });
        }
        const localFences = [];
        for (const fence of codeFences) {
            if (fence.end > from && fence.start < to) {
                localFences.push({
                    start: Math.max(0, fence.start - from),
                    end: Math.max(0, Math.min(to, fence.end) - from),
                });
            }
        }
        const pieces = chunkDocumentWithBreakPoints(text, localPoints, localFences, maxChars, overlapChars, windowChars);
        for (const piece of pieces)
            chunks.push({ text: piece.text, pos: from + piece.pos });
    };
    let cursor = 0;
    for (const { startIndex, endIndex } of ranges) {
        // Leading / inter-range gap (imports, top-level code), then the range itself.
        emitSpan(cursor, startIndex, true);
        emitSpan(startIndex, endIndex, false);
        cursor = endIndex;
    }
    // Whatever follows the last range.
    emitSpan(cursor, content.length, true);
    // Keep the invariant that non-empty content yields at least one chunk.
    if (chunks.length === 0 && content.length > 0) {
        return [{ text: content, pos: 0 }];
    }
    return chunks;
}
/**
 * Chunk a document so that chunks are measured in tokens rather than chars.
 *
 * A first pass chunks in character space via `chunkDocumentAsync`, assuming
 * 3 chars per token. Each resulting chunk is then token-counted; a chunk over
 * `maxTokens` is re-split once with `chunkDocument`, using the chunk's
 * measured chars-per-token ratio (with a 5% safety margin) as the char limit.
 * Re-split pieces are counted but not checked against `maxTokens` again.
 *
 * When `tokenizer` is supplied it is the only token counter used; the default
 * counter, which calls `getDefaultLlamaCpp()` and `llm.tokenize(...)`, is
 * never invoked. Per the original note this lets remote-only deployments
 * (`QMD_EMBED_ENDPOINT=...`) chunk without warming up node-llama-cpp
 * (DoD #1 of i-1rqixh6m / i-qkarfffa).
 *
 * `filepath` and `chunkStrategy` are forwarded to the first pass. If
 * `signal` is aborted, processing stops and the chunks collected so far are
 * returned.
 */
export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKENS, overlapTokens = CHUNK_OVERLAP_TOKENS, windowTokens = CHUNK_WINDOW_TOKENS, filepath, chunkStrategy = "regex", signal, tokenizer) {
    // The local model is resolved inside the default counter so that callers
    // passing `tokenizer` never touch it.
    let localLlm;
    const defaultCounter = async (text) => {
        if (!localLlm)
            localLlm = getDefaultLlamaCpp();
        const tokens = await localLlm.tokenize(text);
        return tokens.length;
    };
    const countTokens = tokenizer ?? defaultCounter;
    // Moderate chars/token estimate (prose ~4, code ~2, mixed ~3).
    const avgCharsPerToken = 3;
    const maxChars = maxTokens * avgCharsPerToken;
    const overlapChars = overlapTokens * avgCharsPerToken;
    const windowChars = windowTokens * avgCharsPerToken;
    const firstPass = await chunkDocumentAsync(content, maxChars, overlapChars, windowChars, filepath, chunkStrategy);
    const results = [];
    for (const { text, pos } of firstPass) {
        if (signal?.aborted)
            break;
        const tokens = await countTokens(text);
        if (tokens <= maxTokens) {
            results.push({ text, pos, tokens });
            continue;
        }
        // Too large: derive a char limit from this chunk's real token density.
        const actualCharsPerToken = text.length / tokens;
        const safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); // 5% safety margin
        const pieces = chunkDocument(text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2));
        for (const piece of pieces) {
            if (signal?.aborted)
                break;
            const pieceTokens = await countTokens(piece.text);
            results.push({ text: piece.text, pos: pos + piece.pos, tokens: pieceTokens });
        }
    }
    return results;
}
// =============================================================================
// Fuzzy matching
// =============================================================================
/**
 * Levenshtein edit distance between two strings (insert, delete and
 * substitute each cost 1), compared per UTF-16 code unit.
 *
 * Only two rows of the DP table are kept, so memory is proportional to
 * `b.length` instead of `a.length * b.length`.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Minimum number of single-unit edits turning `a` into `b`.
 */
function levenshtein(a, b) {
    const m = a.length;
    const n = b.length;
    if (m === 0)
        return n;
    if (n === 0)
        return m;
    // prev[j] = distance between a[0..i-1) and b[0..j); starts as row 0.
    let prev = Array.from({ length: n + 1 }, (_, j) => j);
    let curr = new Array(n + 1).fill(0);
    for (let i = 1; i <= m; i++) {
        curr[0] = i;
        for (let j = 1; j <= n; j++) {
            const cost = a[i - 1] === b[j - 1] ? 0 : 1;
            curr[j] = Math.min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost);
        }
        // Reuse the old row as scratch space for the next iteration.
        [prev, curr] = [curr, prev];
    }
    return prev[n];
}
/**
 * Reduce a user-supplied docid to its bare form.
 *
 * Steps, in order: trim whitespace, remove one pair of matching surrounding
 * quotes (single or double), then remove one leading `#`.
 * Examples: `"#abc123"`, `'abc123'`, `#abc123` and `abc123` all yield `abc123`.
 * The result is not validated as hex here.
 */
export function normalizeDocid(docid) {
    const trimmed = docid.trim();
    const isWrappedIn = (quote) => trimmed.startsWith(quote) && trimmed.endsWith(quote);
    const unquoted = isWrappedIn('"') || isWrappedIn("'") ? trimmed.slice(1, -1) : trimmed;
    return unquoted.startsWith('#') ? unquoted.slice(1) : unquoted;
}
/**
 * Decide whether `input` looks like a docid reference.
 * The input is first passed through `normalizeDocid` (quotes and a leading
 * `#` are tolerated); the result must be at least 6 characters long and
 * consist solely of hex digits, in either case.
 */
export function isDocid(input) {
    const candidate = normalizeDocid(input);
    if (candidate.length < 6)
        return false;
    return /^[a-f0-9]+$/i.test(candidate);
}
/**
 * Find an active document whose hash starts with the given docid.
 *
 * Accepts lenient input (`#abc123`, `abc123`, `"#abc123"`, `"abc123"`); it is
 * normalized with `normalizeDocid` first. If several documents share the
 * prefix, whichever row the query returns first is used.
 *
 * @param db Database instance
 * @param docid Docid or hash prefix as typed by the user
 * @returns `{ filepath, hash }` where `filepath` is the `qmd://collection/path`
 *          virtual path, or null when the input is empty or nothing matches.
 */
export function findDocumentByDocid(db, docid) {
    const shortHash = normalizeDocid(docid);
    if (shortHash.length < 1)
        return null;
    // The value is used as a LIKE prefix. Escape LIKE metacharacters so input
    // such as "%" or "ab_" is matched literally instead of acting as a wildcard.
    const escapedPrefix = shortHash.replace(/[\\%_]/g, '\\$&');
    const doc = db.prepare(`
    SELECT 'qmd://' || d.collection || '/' || d.path as filepath, d.hash
    FROM documents d
    WHERE d.hash LIKE ? ESCAPE '\\' AND d.active = 1
    LIMIT 1
  `).get(`${escapedPrefix}%`);
    // Normalise a missing row to null, as documented.
    return doc ?? null;
}
/**
 * Suggest active document paths that are close to `query`.
 *
 * Every active path is compared to the query case-insensitively with
 * `levenshtein`; paths within `maxDistance` are kept, ordered by ascending
 * distance, and at most `limit` of them are returned.
 */
export function findSimilarFiles(db, query, maxDistance = 3, limit = 5) {
    const rows = db.prepare(`
    SELECT d.path
    FROM documents d
    WHERE d.active = 1
  `).all();
    const needle = query.toLowerCase();
    const candidates = [];
    for (const { path } of rows) {
        const dist = levenshtein(path.toLowerCase(), needle);
        if (dist <= maxDistance) {
            candidates.push({ path, dist });
        }
    }
    candidates.sort((a, b) => a.dist - b.dist);
    return candidates.slice(0, limit).map(c => c.path);
}
/**
 * Return the active documents matching a glob pattern.
 *
 * Each document is tested in three spellings: its `qmd://collection/path`
 * virtual path, its collection-relative path, and `collection/path`. A match
 * on any one is enough.
 *
 * @returns Array of `{ filepath, displayPath, bodyLength }` where `filepath`
 *          is the virtual path and `displayPath` the relative path.
 */
export function matchFilesByGlob(db, pattern) {
    const rows = db.prepare(`
    SELECT
      'qmd://' || d.collection || '/' || d.path as virtual_path,
      LENGTH(content.doc) as body_length,
      d.path,
      d.collection
    FROM documents d
    JOIN content ON content.hash = d.hash
    WHERE d.active = 1
  `).all();
    const isMatch = picomatch(pattern);
    const matches = [];
    for (const row of rows) {
        const matched = isMatch(row.virtual_path)
            || isMatch(row.path)
            || isMatch(row.collection + '/' + row.path);
        if (!matched) {
            continue;
        }
        matches.push({
            filepath: row.virtual_path, // Virtual path for precise lookup
            displayPath: row.path, // Relative path for display
            bodyLength: row.body_length,
        });
    }
    return matches;
}
// =============================================================================
// Context
// =============================================================================
/**
 * Get context for a file path using hierarchical inheritance.
 * Contexts are collection-scoped and inherit from parent directories: a
 * context at "/talks" applies to "/talks/2024/keynote.md", but not to
 * "/talks-archive/x.md" — a prefix only matches on a path-segment boundary.
 *
 * The result is the global context (if any) followed by every matching path
 * context, ordered from most general (shortest prefix) to most specific,
 * joined with a blank line.
 *
 * @param db Database instance, used to load the collection and global context
 * @param collectionName Collection name
 * @param path Relative path within the collection (leading "/" optional)
 * @returns Context string, or null if the collection is unknown or no
 *          context applies
 */
export function getContextForPath(db, collectionName, path) {
    const coll = getStoreCollection(db, collectionName);
    if (!coll)
        return null;
    // Collect ALL matching contexts (global + all path prefixes)
    const contexts = [];
    const globalCtx = getStoreGlobalContext(db);
    if (globalCtx) {
        contexts.push(globalCtx);
    }
    if (coll.context) {
        const normalizedPath = path.startsWith("/") ? path : `/${path}`;
        const matchingContexts = [];
        for (const [prefix, context] of Object.entries(coll.context)) {
            const normalizedPrefix = prefix.startsWith("/") ? prefix : `/${prefix}`;
            if (!normalizedPath.startsWith(normalizedPrefix)) {
                continue;
            }
            // Require a segment boundary so "/talks" does not claim
            // "/talks-old/...". A prefix ending in "/" (including the root
            // "/") is already on a boundary.
            const onBoundary = normalizedPrefix.endsWith("/")
                || normalizedPath.length === normalizedPrefix.length
                || normalizedPath[normalizedPrefix.length] === "/";
            if (onBoundary) {
                matchingContexts.push({ prefix: normalizedPrefix, context });
            }
        }
        // Shortest prefix first: most general context leads.
        matchingContexts.sort((a, b) => a.prefix.length - b.prefix.length);
        for (const match of matchingContexts) {
            contexts.push(match.context);
        }
    }
    return contexts.length > 0 ? contexts.join('\n\n') : null;
}
/**
 * Get context for a file given either a virtual path (`qmd://collection/path`)
 * or an absolute filesystem path.
 *
 * The collection and relative path are resolved first: virtual paths are
 * parsed with `parseVirtualPath`; filesystem paths are matched against each
 * stored collection's root directory. Context is only returned for files
 * that exist as active documents in the database. The context itself is
 * assembled by `getContextForPath`, so both entry points apply the same
 * inheritance rules.
 *
 * @param db Database instance
 * @param filepath Virtual or absolute filesystem path
 * @returns Context string, or null if the path is empty, cannot be mapped to
 *          a collection, is not an active document, or has no context
 */
export function getContextForFile(db, filepath) {
    if (!filepath)
        return null;
    let collectionName = null;
    let relativePath = null;
    const parsedVirtual = filepath.startsWith('qmd://') ? parseVirtualPath(filepath) : null;
    if (parsedVirtual) {
        collectionName = parsedVirtual.collectionName;
        relativePath = parsedVirtual.path;
    }
    else {
        // Filesystem path: find the collection whose root contains it. The
        // collection list is only needed on this branch.
        for (const coll of getStoreCollections(db)) {
            // Skip collections with missing paths
            if (!coll || !coll.path)
                continue;
            const root = `${coll.path}/`;
            if (filepath.startsWith(root)) {
                collectionName = coll.name;
                relativePath = filepath.slice(root.length);
                break;
            }
            if (filepath === coll.path) {
                collectionName = coll.name;
                relativePath = '';
                break;
            }
        }
        if (!collectionName || relativePath === null)
            return null;
    }
    // Only files that are indexed and active get context.
    const doc = db.prepare(`
    SELECT d.path
    FROM documents d
    WHERE d.collection = ? AND d.path = ? AND d.active = 1
    LIMIT 1
  `).get(collectionName, relativePath);
    if (!doc)
        return null;
    // getContextForPath also returns null for an unknown collection.
    return getContextForPath(db, collectionName, relativePath);
}
/**
 * Fetch a stored collection by name and expose it in the legacy row shape:
 * `path` is returned as `pwd` and `pattern` as `glob_pattern`.
 * Returns null when no such collection is stored.
 */
export function getCollectionByName(db, name) {
    const stored = getStoreCollection(db, name);
    if (!stored) {
        return null;
    }
    const { name: storedName, path: pwd, pattern: glob_pattern } = stored;
    return { name: storedName, pwd, glob_pattern };
}
/**
 * List every stored collection together with document statistics.
 *
 * For each collection one aggregate query supplies the total number of
 * document rows, the number of active rows, and the latest modified_at.
 * Missing or falsy statistics fall back to 0 (counts) or null (timestamp).
 * `includeByDefault` is true unless the stored value is exactly `false`.
 */
export function listCollections(db) {
    const summaries = [];
    for (const coll of getStoreCollections(db)) {
        const stats = db.prepare(`
      SELECT
        COUNT(d.id) as doc_count,
        SUM(CASE WHEN d.active = 1 THEN 1 ELSE 0 END) as active_count,
        MAX(d.modified_at) as last_modified
      FROM documents d
      WHERE d.collection = ?
    `).get(coll.name);
        summaries.push({
            name: coll.name,
            pwd: coll.path,
            glob_pattern: coll.pattern,
            doc_count: stats?.doc_count || 0,
            active_count: stats?.active_count || 0,
            last_modified: stats?.last_modified || null,
            includeByDefault: coll.includeByDefault !== false,
        });
    }
    return summaries;
}
/**
 * Remove a collection and everything stored for it.
 *
 * In order: delete the collection's document rows, delete content rows whose
 * hash is no longer referenced by any active document, then drop the
 * collection entry via `deleteStoreCollection`.
 *
 * @returns `{ deletedDocs, cleanedHashes }` — the row counts reported by the
 *          two DELETE statements.
 */
export function removeCollection(db, collectionName) {
    const deleteDocs = db.prepare(`DELETE FROM documents WHERE collection = ?`);
    const { changes: deletedDocs } = deleteDocs.run(collectionName);
    // Content is shared by hash, so it can only be purged once unreferenced.
    const purgeContent = db.prepare(`
    DELETE FROM content
    WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
  `);
    const { changes: cleanedHashes } = purgeContent.run();
    deleteStoreCollection(db, collectionName);
    return { deletedDocs, cleanedHashes };
}
/**
 * Rename a collection.
 * First repoints every document row from `oldName` to `newName`, then
 * renames the collection entry itself via `renameStoreCollection`.
 */
export function renameCollection(db, oldName, newName) {
    const repoint = db.prepare(`UPDATE documents SET collection = ? WHERE collection = ?`);
    repoint.run(newName, oldName);
    renameStoreCollection(db, oldName, newName);
}
// =============================================================================
// Context Management Operations
// =============================================================================
/**
 * Insert or update the context attached to a path prefix of a collection,
 * where the collection is identified by numeric id.
 *
 * The id is resolved to a name through the `collections` table and the
 * context is then stored with `updateStoreContext`.
 *
 * @throws {Error} If no collection row has the given id.
 */
export function insertContext(db, collectionId, pathPrefix, context) {
    const lookup = db.prepare(`SELECT name FROM collections WHERE id = ?`);
    const row = lookup.get(collectionId);
    if (!row) {
        throw new Error(`Collection with id ${collectionId} not found`);
    }
    updateStoreContext(db, row.name, pathPrefix, context);
}
/**
 * Remove the context stored for one path prefix of a collection.
 * Returns 1 when `removeStoreContext` reports success, otherwise 0.
 */
export function deleteContext(db, collectionName, pathPrefix) {
    if (removeStoreContext(db, collectionName, pathPrefix)) {
        return 1;
    }
    return 0;
}
/**
 * Clear the global context and every collection's root context (the entry
 * stored under the empty path prefix).
 *
 * The return value is 1 for the global-context reset — counted
 * unconditionally, whether or not a global context was set — plus one for
 * each collection whose root context was actually removed.
 */
export function deleteGlobalContexts(db) {
    setStoreGlobalContext(db, undefined);
    // Start at 1: the reset above is always counted.
    let deletedCount = 1;
    for (const { name } of getStoreCollections(db)) {
        if (removeStoreContext(db, name, '')) {
            deletedCount += 1;
        }
    }
    return deletedCount;
}
/**
 * List every stored context as `{ collection_name, path_prefix, context }`.
 *
 * Ordering: by collection name; within a collection by path prefix length,
 * longest first; ties broken alphabetically by prefix.
 */
export function listPathContexts(db) {
    const byCollectionThenSpecificity = (a, b) => {
        if (a.collection_name !== b.collection_name) {
            return a.collection_name.localeCompare(b.collection_name);
        }
        const lengthGap = b.path_prefix.length - a.path_prefix.length;
        if (lengthGap !== 0) {
            return lengthGap;
        }
        return a.path_prefix.localeCompare(b.path_prefix);
    };
    const entries = [];
    for (const ctx of getStoreContexts(db)) {
        entries.push({
            collection_name: ctx.collection,
            path_prefix: ctx.path,
            context: ctx.context,
        });
    }
    return entries.sort(byCollectionThenSpecificity);
}
/**
 * Return every stored collection reduced to `{ name }`.
 */
export function getAllCollections(db) {
    const names = [];
    for (const { name } of getStoreCollections(db)) {
        names.push({ name });
    }
    return names;
}
/**
 * Report the collections that have no context entries at all (a missing or
 * empty `context` map — not even a root context).
 *
 * @returns Array of `{ name, pwd, doc_count }` sorted by name, where
 *          `doc_count` is the number of active documents in the collection.
 */
export function getCollectionsWithoutContext(db) {
    const lacksContext = (coll) => !coll.context || Object.keys(coll.context).length === 0;
    const report = getStoreCollections(db)
        .filter(lacksContext)
        .map((coll) => {
        const stats = db.prepare(`
        SELECT COUNT(d.id) as doc_count
        FROM documents d
        WHERE d.collection = ? AND d.active = 1
      `).get(coll.name);
        return {
            name: coll.name,
            pwd: coll.path,
            doc_count: stats?.doc_count || 0,
        };
    });
    return report.sort((a, b) => a.name.localeCompare(b.name));
}
/**
 * Get the top-level directories of a collection that have no context.
 * Useful for suggesting where context might be needed.
 *
 * A directory counts as covered when the collection has a root context or a
 * context keyed exactly by that directory. Context keys are compared after
 * stripping leading and trailing slashes, so "/talks", "talks/" and "talks"
 * all cover directory "talks", and both "" and "/" count as the root.
 * A context on a deeper path (e.g. "talks/2024") does not cover "talks".
 *
 * @param db Database instance
 * @param collectionName Collection name
 * @returns Sorted list of uncovered top-level directory names; empty when the
 *          collection is unknown. Files directly at the collection root are
 *          not directories and are ignored.
 */
export function getTopLevelPathsWithoutContext(db, collectionName) {
    const paths = db.prepare(`
    SELECT DISTINCT path FROM documents
    WHERE collection = ? AND active = 1
  `).all(collectionName);
    const dbColl = getStoreCollection(db, collectionName);
    if (!dbColl)
        return [];
    // Keys may be stored with or without surrounding slashes; normalise them
    // so they are comparable with bare directory names.
    const contextPrefixes = new Set();
    for (const prefix of Object.keys(dbColl.context ?? {})) {
        contextPrefixes.add(prefix.replace(/^\/+|\/+$/g, ''));
    }
    // A root context covers every directory.
    if (contextPrefixes.has(''))
        return [];
    // First path component of every document that lives in a subdirectory.
    const topLevelDirs = new Set();
    for (const { path } of paths) {
        const parts = path.split('/').filter(Boolean);
        if (parts.length > 1) {
            topLevelDirs.add(parts[0]);
        }
    }
    // A directory name contains no "/", so only an exact key can cover it.
    return [...topLevelDirs].filter(dir => !contextPrefixes.has(dir)).sort();
}
// =============================================================================
// FTS Search
// =============================================================================
/**
 * Strip a search term down to characters that are safe to embed in an FTS5
 * query: Unicode letters, Unicode digits, apostrophes and underscores.
 * Everything else is removed and the result is lower-cased.
 */
export function sanitizeFTS5Term(term) {
    const stripped = term.replace(/[^\p{L}\p{N}'_]/gu, '');
    return stripped.toLowerCase();
}
/**
 * Tell whether a token is a hyphenated compound such as `multi-agent`,
 * `DEC-0054` or `gpt-4`: it must start with a letter or digit and contain at
 * least one hyphen that is directly followed by a letter or digit. Besides
 * letters and digits, only apostrophes and hyphens may appear.
 */
function isHyphenatedToken(token) {
    const hyphenatedCompound = /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u;
    return hyphenatedCompound.test(token);
}
/**
 * Turn a hyphenated term into the body of an FTS5 phrase: split on hyphens,
 * sanitize each part with `sanitizeFTS5Term`, drop parts that end up empty,
 * and join the rest with single spaces (e.g. `multi-agent` → `multi agent`).
 * The caller is responsible for adding the surrounding quotes.
 */
function sanitizeHyphenatedTerm(term) {
    const words = [];
    for (const part of term.split('-')) {
        const clean = sanitizeFTS5Term(part);
        if (clean) {
            words.push(clean);
        }
    }
    return words.join(' ');
}
/**
 * Translate lex query syntax into an FTS5 MATCH expression.
 *
 * Recognised input:
 * - Quoted phrases: "exact phrase" → "exact phrase" (no prefix match). A
 *   missing closing quote extends the phrase to the end of the query.
 * - Negation: a leading `-` on a term or phrase moves it to a NOT clause.
 * - Hyphenated tokens (multi-agent, DEC-0054, gpt-4) → phrase of their parts.
 * - Plain terms: term → "term"* (prefix match).
 *
 * Positive clauses are joined with AND and each negative clause is appended
 * as `NOT clause`. FTS5 NOT is a binary operator, so a query with no positive
 * clause cannot be expressed and yields null.
 *
 * Hyphen disambiguation: a `-` at the start of a token is negation; a `-`
 * between word characters makes the token a hyphenated phrase. `-multi-agent`
 * is therefore the negated phrase "multi agent".
 *
 * Examples:
 *   performance -sports     → "performance"* NOT "sports"*
 *   "machine learning"      → "machine learning"
 *   multi-agent memory      → "multi agent" AND "memory"*
 *   DEC-0054                → "dec 0054"
 *   memory -multi-agent     → "memory"* NOT "multi agent"
 *   -multi-agent            → null (nothing positive to search for)
 *
 * @returns The FTS5 query string, or null when no positive clause remains.
 */
function buildFTS5Query(query) {
    const positive = [];
    const negative = [];
    const s = query.trim();
    const len = s.length;
    const addClause = (clause, negated) => {
        (negated ? negative : positive).push(clause);
    };
    let i = 0;
    while (i < len) {
        // Skip whitespace between tokens.
        while (i < len && /\s/.test(s[i]))
            i++;
        if (i >= len)
            break;
        // A leading "-" negates whatever token follows it.
        const negated = s[i] === '-';
        if (negated)
            i++;
        if (s[i] === '"') {
            // Quoted phrase: runs to the closing quote, or to the end of input.
            const closing = s.indexOf('"', i + 1);
            const end = closing === -1 ? len : closing;
            const phrase = s.slice(i + 1, end).trim();
            i = end + 1; // step past the closing quote
            if (phrase.length === 0)
                continue;
            const sanitized = phrase.split(/\s+/).map(t => sanitizeFTS5Term(t)).filter(t => t).join(' ');
            if (sanitized)
                addClause(`"${sanitized}"`, negated); // exact phrase, no prefix match
            continue;
        }
        // Plain term: runs until whitespace or a quote.
        const start = i;
        while (i < len && !/[\s"]/.test(s[i]))
            i++;
        const term = s.slice(start, i);
        if (isHyphenatedToken(term)) {
            // Split into a phrase so the FTS5 tokenizer can match the parts.
            const sanitized = sanitizeHyphenatedTerm(term);
            if (sanitized)
                addClause(`"${sanitized}"`, negated); // phrase match (no prefix)
        }
        else {
            const sanitized = sanitizeFTS5Term(term);
            if (sanitized)
                addClause(`"${sanitized}"*`, negated); // prefix match
        }
    }
    // Nothing usable, or only negative clauses (FTS5 NOT is binary).
    if (positive.length === 0)
        return null;
    return [positive.join(' AND '), ...negative].join(' NOT ');
}
/**
 * Reject vec/hyde queries that use lex-only negation syntax.
 *
 * A `-` counts as negation only when it sits at the start of the query or
 * right after whitespace and is immediately followed by a word character or
 * a double quote. Hyphens inside words (`auto-archived`, `pre-commit`,
 * `state-of-the-art`) are ordinary text and pass.
 *
 * @returns An error message when negation is detected, otherwise null.
 */
export function validateSemanticQuery(query) {
    const negatedTerm = /(?:^|\s)-\w/;
    const negatedPhrase = /(?:^|\s)-"/;
    const usesNegation = negatedTerm.test(query) || negatedPhrase.test(query);
    if (!usesNegation) {
        return null;
    }
    return 'Negation (-term) is not supported in vec/hyde queries. Use lex for exclusions.';
}
/**
 * Check the surface syntax of a lex query.
 *
 * @param {string} query - Raw lex query text.
 * @returns {string|null} An error message for a multi-line query or an odd
 *   number of double quotes; null when the query passes both checks.
 */
export function validateLexQuery(query) {
    const hasLineBreak = /[\r\n]/.test(query);
    if (hasLineBreak) {
        return 'Lex queries must be a single line. Remove newline characters or split into separate lex: lines.';
    }
    // An odd number of quotes means at least one phrase was never closed.
    const quotes = query.match(/"/g);
    const quoteCount = quotes === null ? 0 : quotes.length;
    return quoteCount % 2 === 1
        ? 'Lex query has an unmatched double quote ("). Add the closing quote or remove it.'
        : null;
}
/**
 * Run a BM25-ranked full-text search against the `documents_fts` index.
 *
 * The raw query is first translated by `buildFTS5Query`; when that yields no
 * usable FTS5 expression the function returns an empty list without touching
 * the database.
 *
 * @param {object} db - Database handle exposing `prepare(sql).all(...params)`.
 * @param {string} query - Raw lex query text.
 * @param {number} [limit=20] - Maximum number of results to return.
 * @param {string} [collectionName] - When set, only documents in this collection are returned.
 * @returns {Array<object>} Result objects tagged `source: "fts"`, with a score in [0, 1) where higher is better.
 */
export function searchFTS(db, query, limit = 20, collectionName) {
    const ftsQuery = buildFTS5Query(query);
    if (!ftsQuery)
        return [];
    // Use a CTE to force FTS5 to run first, then filter by collection.
    // Without the CTE, SQLite's query planner combines FTS5 MATCH with the
    // collection filter in a single WHERE clause, which can cause it to
    // abandon the FTS5 index and fall back to a full scan — turning an 8ms
    // query into a 17-second query on large collections.
    //
    // When filtering by collection, fetch extra candidates from the FTS index
    // since some will be filtered out. Without a collection filter we can
    // fetch exactly the requested limit.
    const ftsLimit = collectionName ? limit * 10 : limit;
    // Both limits are bound as parameters; nothing caller-supplied is spliced
    // into the SQL text.
    const params = [ftsQuery, ftsLimit];
    let sql = `
    WITH fts_matches AS (
      SELECT rowid, bm25(documents_fts, 1.5, 4.0, 1.0) as bm25_score
      FROM documents_fts
      WHERE documents_fts MATCH ?
      ORDER BY bm25_score ASC
      LIMIT ?
    )
    SELECT
      'qmd://' || d.collection || '/' || d.path as filepath,
      d.collection || '/' || d.path as display_path,
      d.title,
      content.doc as body,
      d.hash,
      fm.bm25_score
    FROM fts_matches fm
    JOIN documents d ON d.id = fm.rowid
    JOIN content ON content.hash = d.hash
    WHERE d.active = 1
  `;
    if (collectionName) {
        sql += ` AND d.collection = ?`;
        params.push(String(collectionName));
    }
    // bm25 lower is better; sort ascending.
    sql += ` ORDER BY fm.bm25_score ASC LIMIT ?`;
    params.push(limit);
    const rows = db.prepare(sql).all(...params);
    return rows.map(row => {
        // Collection is the first path segment after the `qmd://` scheme.
        const rowCollection = row.filepath.split('//')[1]?.split('/')[0] || "";
        // Convert bm25 (negative, lower is better) into a stable [0..1) score where higher is better.
        // FTS5 BM25 scores are negative (e.g., -10 is strong, -2 is weak).
        // |x| / (1 + |x|) maps: strong(-10)→0.91, medium(-2)→0.67, weak(-0.5)→0.33, none(0)→0.
        // Monotonic and query-independent — no per-query normalization needed.
        const magnitude = Math.abs(row.bm25_score);
        const score = magnitude / (1 + magnitude);
        return {
            filepath: row.filepath,
            displayPath: row.display_path,
            title: row.title,
            hash: row.hash,
            docid: getDocid(row.hash),
            collectionName: rowCollection,
            modifiedAt: "", // Not available in FTS query
            bodyLength: row.body.length,
            body: row.body,
            context: getContextForFile(db, row.filepath),
            score,
            source: "fts",
        };
    });
}
// =============================================================================
// Vector Search
// =============================================================================
/**
 * Nearest-neighbour search over the `vectors_vec` table.
 *
 * Returns an empty list when the vector table does not exist, when no
 * embedding can be obtained, or when the vector lookup finds nothing.
 * Results are de-duplicated per file (closest chunk wins), ordered by
 * ascending distance and truncated to `limit`.
 *
 * @param {object} db - Database handle exposing `prepare(sql).get/all`.
 * @param {string} query - Query text; embedded only when `precomputedEmbedding` is nullish.
 * @param {string} model - Embedding model identifier passed to `getEmbedding`.
 * @param {number} [limit=20] - Maximum number of results.
 * @param {string} [collectionName] - Restrict results to this collection.
 * @param {object} [session] - Forwarded to `getEmbedding`.
 * @param {ArrayLike<number>} [precomputedEmbedding] - Skips the embedding call when provided.
 * @param {object} [embedProvider] - Forwarded to `getEmbedding`.
 * @returns {Promise<Array<object>>} Result objects tagged `source: "vec"`.
 */
export async function searchVec(db, query, model, limit = 20, collectionName, session, precomputedEmbedding, embedProvider) {
    const vecTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (!vecTable) {
        return [];
    }
    const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session, undefined, embedProvider);
    if (!embedding) {
        return [];
    }
    // IMPORTANT: this stays a two-step lookup. sqlite-vec virtual tables hang
    // indefinitely when combined with JOINs in the same query, so do NOT merge
    // the two statements below into one.
    // See: https://github.com/tobi/qmd/pull/23
    // Step 1: nearest neighbours straight from sqlite-vec (no JOINs allowed).
    const neighbours = db.prepare(`
    SELECT hash_seq, distance
    FROM vectors_vec
    WHERE embedding MATCH ? AND k = ?
  `).all(new Float32Array(embedding), limit * 3);
    if (neighbours.length === 0) {
        return [];
    }
    // Step 2: resolve the matched chunks to their documents.
    const hashSeqs = [];
    const distanceByHashSeq = new Map();
    for (const neighbour of neighbours) {
        hashSeqs.push(neighbour.hash_seq);
        distanceByHashSeq.set(neighbour.hash_seq, neighbour.distance);
    }
    const placeholders = new Array(hashSeqs.length).fill('?').join(',');
    let docSql = `
    SELECT
      cv.hash || '_' || cv.seq as hash_seq,
      cv.hash,
      cv.pos,
      'qmd://' || d.collection || '/' || d.path as filepath,
      d.collection || '/' || d.path as display_path,
      d.title,
      content.doc as body
    FROM content_vectors cv
    JOIN documents d ON d.hash = cv.hash AND d.active = 1
    JOIN content ON content.hash = d.hash
    WHERE cv.hash || '_' || cv.seq IN (${placeholders})
  `;
    const params = hashSeqs.slice();
    if (collectionName) {
        docSql += ` AND d.collection = ?`;
        params.push(collectionName);
    }
    const docRows = db.prepare(docSql).all(...params);
    // Keep only the closest chunk for each file.
    const closestByFile = new Map();
    for (const row of docRows) {
        const distance = distanceByHashSeq.get(row.hash_seq) ?? 1;
        const current = closestByFile.get(row.filepath);
        if (!current || distance < current.bestDist) {
            closestByFile.set(row.filepath, { row, bestDist: distance });
        }
    }
    const ranked = [...closestByFile.values()]
        .sort((left, right) => left.bestDist - right.bestDist)
        .slice(0, limit);
    return ranked.map(({ row, bestDist }) => {
        // Collection is the first path segment after the `qmd://` scheme.
        const rowCollection = row.filepath.split('//')[1]?.split('/')[0] || "";
        return {
            filepath: row.filepath,
            displayPath: row.display_path,
            title: row.title,
            hash: row.hash,
            docid: getDocid(row.hash),
            collectionName: rowCollection,
            modifiedAt: "", // Not available in vec query
            bodyLength: row.body.length,
            body: row.body,
            context: getContextForFile(db, row.filepath),
            score: 1 - bestDist, // Cosine similarity = 1 - cosine distance
            source: "vec",
            chunkPos: row.pos,
        };
    });
}
// =============================================================================
// Embeddings
// =============================================================================
/**
 * Produce an embedding vector for `text`.
 *
 * With an `embedProvider`, encoding is routed through the provider using the
 * provider's own model id. Otherwise the text is embedded via `session` when
 * one is given, else via `llmOverride`, else via the default LlamaCpp instance.
 * In every path the text is first wrapped by the query or document formatter.
 *
 * @param {string} text - Raw text to embed.
 * @param {string} model - Model id used on the non-provider path.
 * @param {boolean} isQuery - Selects the query formatter instead of the document formatter.
 * @param {object} [session] - Optional session exposing `embed` (and possibly `signal`).
 * @param {object} [llmOverride] - Optional replacement for the default LlamaCpp instance.
 * @param {object} [embedProvider] - Optional provider exposing `getModelId`, `kind` and `embed`.
 * @returns {Promise<*>} The `embedding` field of the backend result, or null when absent.
 */
async function getEmbedding(text, model, isQuery, session, llmOverride, embedProvider) {
    if (!embedProvider) {
        // Local path: format with the appropriate prompt template, then embed
        // through the session if there is one, otherwise the LLM instance.
        const localPrompt = isQuery ? formatQueryForEmbedding(text, model) : formatDocForEmbedding(text, undefined, model);
        const embedder = session ? session : (llmOverride ?? getDefaultLlamaCpp());
        const localResult = await embedder.embed(localPrompt, { model, isQuery });
        return localResult?.embedding || null;
    }
    // Provider path (HTTP / GPU worker / fallback chain): local node-llama-cpp
    // is not touched. Query-formatting prefixes are still applied so embedding
    // parity with the index is preserved.
    const providerModel = embedProvider.getModelId();
    const providerPrompt = isQuery
        ? formatQueryForEmbedding(text, providerModel)
        : formatDocForEmbedding(text, undefined, providerModel);
    // Only forward an AbortSignal when the provider is local-backed;
    // remote providers manage their own timeouts and an LLM-session signal
    // would abort their HTTP request prematurely (i-08ovbvtb).
    const signal = embedProvider.kind === "local" ? session?.signal : undefined;
    const embedOptions = { model: providerModel };
    if (signal) {
        embedOptions.signal = signal;
    }
    const providerResult = await embedProvider.embed(providerPrompt, embedOptions);
    return providerResult?.embedding ?? null;
}
/**
 * List content hashes of active documents that have no embedding yet.
 *
 * A hash counts as embedded once `content_vectors` holds a row for it with
 * `seq = 0`. Each returned row carries the hash, the document body, and one
 * sample path (the smallest path among documents sharing the hash).
 *
 * @param {object} db - Database handle exposing `prepare(sql).all()`.
 * @returns {Array<{hash: string, body: string, path: string}>} Rows still needing embeddings.
 */
export function getHashesForEmbedding(db) {
    const pendingSql = `
    SELECT d.hash, c.doc as body, MIN(d.path) as path
    FROM documents d
    JOIN content c ON d.hash = c.hash
    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
    WHERE d.active = 1 AND v.hash IS NULL
    GROUP BY d.hash
  `;
    const pendingStmt = db.prepare(pendingSql);
    return pendingStmt.all();
}
/**
 * Wipe every stored embedding so the next run re-embeds from scratch.
 * Empties `content_vectors` first, then drops `vectors_vec` if it exists.
 *
 * @param {object} db - Database handle exposing `exec(sql)`.
 * @returns {void}
 */
export function clearAllEmbeddings(db) {
    const teardownStatements = [
        `DELETE FROM content_vectors`,
        `DROP TABLE IF EXISTS vectors_vec`,
    ];
    for (const statement of teardownStatements) {
        db.exec(statement);
    }
}
/**
 * Return the distinct model identifiers recorded in `content_vectors`.
 *
 * Intended for the embedding migration-safety guard: when a configured
 * provider's `getModelId()` is missing from this list (and the table is not
 * empty), embedding is refused and the user is asked to run `qmd embed -f`.
 *
 * Values that are not non-empty strings are dropped. An empty table yields
 * `[]`, in which case any provider is allowed.
 *
 * @param {object} db - Database handle exposing `prepare(sql).all()`.
 * @returns {string[]} Distinct, non-empty model identifiers.
 */
export function getDistinctEmbeddingModels(db) {
    const modelRows = db.prepare(`SELECT DISTINCT model FROM content_vectors WHERE model IS NOT NULL`).all();
    const models = [];
    for (const row of modelRows) {
        const candidate = row.model;
        if (typeof candidate === "string" && candidate.length > 0) {
            models.push(candidate);
        }
    }
    return models;
}
/**
 * Store one chunk embedding in both `content_vectors` and `vectors_vec`.
 * The `vectors_vec` key is `"<hash>_<seq>"`.
 *
 * Ordering matters: the `content_vectors` row is written first so that
 * getHashesForEmbedding (which only inspects `content_vectors`) will not pick
 * the hash again if the process dies between the two writes.
 *
 * `vectors_vec` is updated with DELETE followed by INSERT because sqlite-vec's
 * vec0 virtual tables silently ignore the OR REPLACE conflict clause.
 *
 * @param {object} db - Database handle exposing `prepare(sql).run(...params)`.
 * @param {string} hash - Content hash of the document.
 * @param {number} seq - Chunk sequence number within the document.
 * @param {number} pos - Stored in the `pos` column of `content_vectors`.
 * @param {*} embedding - Vector value written to `vectors_vec.embedding`.
 * @param {string} model - Model identifier recorded alongside the chunk.
 * @param {string} embeddedAt - Stored in the `embedded_at` column.
 * @returns {void}
 */
export function insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt) {
    const vecKey = `${hash}_${seq}`;
    // 1) Metadata row first — crash-safe ordering.
    db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`)
        .run(hash, seq, pos, model, embeddedAt);
    // 2) Vector row: prepare both statements, then delete the stale entry and insert.
    const [removeStaleVec, addVec] = [
        `DELETE FROM vectors_vec WHERE hash_seq = ?`,
        `INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`,
    ].map(sql => db.prepare(sql));
    removeStaleVec.run(vecKey);
    addVec.run(vecKey, embedding);
}
// =============================================================================
// Query expansion
// =============================================================================
/**
 * Expand a query into typed alternative queries, with caching.
 *
 * A cached entry is returned as-is when it is already in `{ type, query }`
 * form, or converted when it is in the older `{ type, text }` form. Cache
 * entries that cannot be parsed or recognised fall through to a fresh
 * expansion. Expansions equal to the original query text are dropped, and
 * only non-empty results are written back to the cache.
 *
 * @param {string} query - Original query text.
 * @param {string} [model=DEFAULT_QUERY_MODEL] - Used for the cache key only (see note in body).
 * @param {object} db - Database handle passed to the cache helpers.
 * @param {string} [intent] - Optional intent; part of the cache key and forwarded to the LLM.
 * @param {object} [llmOverride] - Optional replacement for the default LlamaCpp instance.
 * @returns {Promise<Array<{type: string, query: string}>>} Expanded queries.
 */
export async function expandQuery(query, model = DEFAULT_QUERY_MODEL, db, intent, llmOverride) {
    // Cache lookup — entries are stored as JSON so types survive.
    const cacheKey = getCacheKey("expandQuery", { query, model, ...(intent && { intent }) });
    const cachedJson = getCachedResult(db, cacheKey);
    if (cachedJson) {
        try {
            const entries = JSON.parse(cachedJson);
            if (entries.length > 0) {
                const head = entries[0];
                if (head.query) {
                    return entries;
                }
                if (head.text) {
                    // Migrate old cache format: { type, text } → { type, query }
                    return entries.map((entry) => ({ type: entry.type, query: entry.text }));
                }
            }
        }
        catch {
            // Old cache format (pre-typed, newline-separated text) — re-expand
        }
    }
    const llm = llmOverride ?? getDefaultLlamaCpp();
    // Note: LlamaCpp uses hardcoded model, model parameter is ignored
    const results = await llm.expandQuery(query, { intent });
    // Map Queryable[] → ExpandedQuery[] (same shape, decoupled from llm.ts internals),
    // skipping entries that merely repeat the original query text.
    const expanded = results.flatMap(r => (r.text !== query ? [{ type: r.type, query: r.text }] : []));
    if (expanded.length > 0) {
        setCachedResult(db, cacheKey, JSON.stringify(expanded));
    }
    return expanded;
}
// =============================================================================
// Reranking
// =============================================================================
/**
 * Score documents against a query with the reranker, using a per-chunk cache.
 *
 * Scores are cached by (rerank query, model, chunk text). The file path is not
 * part of the current cache key because the score depends on the chunk
 * content, not where it came from; a legacy key that includes the file is
 * still consulted on read. Chunks with identical text are sent to the
 * reranker once.
 *
 * @param {string} query - User query.
 * @param {Array<{file: string, text: string}>} documents - Chunks to score.
 * @param {string} [model=DEFAULT_RERANK_MODEL] - Reranker model identifier.
 * @param {object} db - Database handle passed to the cache helpers.
 * @param {string} [intent] - Optional intent, prepended to the rerank query.
 * @param {object} [llmOverride] - Optional replacement for the default LlamaCpp instance.
 * @returns {Promise<Array<{file: string, score: number}>>} One entry per input
 *   document, sorted by descending score; documents without a score get 0.
 */
export async function rerank(query, documents, model = DEFAULT_RERANK_MODEL, db, intent, llmOverride) {
    // Prepend intent to rerank query so the reranker scores with domain context
    const rerankQuery = intent ? `${intent}\n\n${query}` : query;
    const cachedResults = new Map();
    const uncachedDocsByChunk = new Map();
    // Check cache for each document.
    // Cache key includes chunk text — different queries can select different chunks
    // from the same file, and the reranker score depends on which chunk was sent.
    for (const doc of documents) {
        const cacheKey = getCacheKey("rerank", { query: rerankQuery, model, chunk: doc.text });
        const legacyCacheKey = getCacheKey("rerank", { query, file: doc.file, model, chunk: doc.text });
        const cached = getCachedResult(db, cacheKey) ?? getCachedResult(db, legacyCacheKey);
        if (cached !== null) {
            cachedResults.set(doc.text, parseFloat(cached));
        }
        else {
            uncachedDocsByChunk.set(doc.text, { file: doc.file, text: doc.text });
        }
    }
    // Rerank uncached documents using LlamaCpp
    if (uncachedDocsByChunk.size > 0) {
        const llm = llmOverride ?? getDefaultLlamaCpp();
        const uncachedDocs = [...uncachedDocsByChunk.values()];
        const rerankResult = await llm.rerank(rerankQuery, uncachedDocs, { model });
        // Cache results by chunk text so identical chunks across files are scored once.
        // A file → text map alone is ambiguous when two uncached chunks share a
        // file (the later chunk overwrites the earlier one), so the result's
        // position in the submitted array is preferred when it is available.
        // NOTE(review): assumes `result.index`, when present, is the 0-based
        // position in `uncachedDocs` — confirm against llm.rerank. If the field
        // is absent or inconsistent with `result.file`, the file lookup is used.
        const textByFile = new Map(uncachedDocs.map(d => [d.file, d.text]));
        for (const result of rerankResult.results) {
            const byIndex = Number.isInteger(result.index) ? uncachedDocs[result.index] : undefined;
            const chunk = byIndex !== undefined && byIndex.file === result.file
                ? byIndex.text
                : textByFile.get(result.file) || "";
            const cacheKey = getCacheKey("rerank", { query: rerankQuery, model, chunk });
            setCachedResult(db, cacheKey, result.score.toString());
            cachedResults.set(chunk, result.score);
        }
    }
    // Return all results sorted by score
    return documents
        .map(doc => ({ file: doc.file, score: cachedResults.get(doc.text) || 0 }))
        .sort((a, b) => b.score - a.score);
}
// =============================================================================
// Reciprocal Rank Fusion
// =============================================================================
/**
 * Merge several ranked result lists with weighted Reciprocal Rank Fusion.
 *
 * Each appearance of a file at 0-based position `rank` in list `i` adds
 * `weights[i] / (k + rank + 1)` to that file's score (weight defaults to 1).
 * A file whose best position is first in any list gets +0.05; one whose best
 * position is second or third gets +0.02. The first result object seen for a
 * file is the one returned, with `score` replaced by the fused score.
 *
 * @param {Array<Array<object>|undefined>} resultLists - Ranked lists; falsy lists and entries are skipped.
 * @param {number[]} [weights=[]] - Per-list weights, matched by index.
 * @param {number} [k=60] - RRF smoothing constant.
 * @returns {Array<object>} Fused results sorted by descending score.
 */
export function reciprocalRankFusion(resultLists, weights = [], k = 60) {
    const fusedByFile = new Map();
    for (const [listIdx, list] of resultLists.entries()) {
        if (!list) {
            continue;
        }
        const weight = weights[listIdx] ?? 1.0;
        list.forEach((result, rank) => {
            if (!result) {
                return;
            }
            const rrfContribution = weight / (k + rank + 1);
            const entry = fusedByFile.get(result.file);
            if (!entry) {
                fusedByFile.set(result.file, { result, rrfScore: rrfContribution, topRank: rank });
                return;
            }
            entry.rrfScore += rrfContribution;
            entry.topRank = Math.min(entry.topRank, rank);
        });
    }
    const fused = [...fusedByFile.values()];
    // Top-rank bonus
    for (const entry of fused) {
        if (entry.topRank === 0) {
            entry.rrfScore += 0.05;
        }
        else if (entry.topRank <= 2) {
            entry.rrfScore += 0.02;
        }
    }
    fused.sort((left, right) => right.rrfScore - left.rrfScore);
    return fused.map(({ result, rrfScore }) => ({ ...result, score: rrfScore }));
}
/**
 * Build per-document RRF contribution traces for explain/debug output.
 *
 * For every file, records each list it appeared in (1-indexed rank, weight,
 * backend score, contribution `weight / (k + rank)`), the summed base score,
 * the best rank, the top-rank bonus (0.05 for rank 1, 0.02 for ranks 2–3,
 * otherwise 0) and the total.
 *
 * @param {Array<Array<object>|undefined>} resultLists - Ranked lists; falsy lists and entries are skipped.
 * @param {number[]} [weights=[]] - Per-list weights, matched by index (default 1).
 * @param {Array<object|undefined>} [listMeta=[]] - Per-list `{ source, queryType, query }`;
 *   missing entries default to `{ source: "fts", queryType: "original", query: "" }`.
 * @param {number} [k=60] - RRF smoothing constant.
 * @returns {Map<string, object>} Trace per file, keyed by `result.file`.
 */
export function buildRrfTrace(resultLists, weights = [], listMeta = [], k = 60) {
    const traceByFile = new Map();
    for (const [listIdx, list] of resultLists.entries()) {
        if (!list) {
            continue;
        }
        const weight = weights[listIdx] ?? 1.0;
        const { source, queryType, query } = listMeta[listIdx] ?? {
            source: "fts",
            queryType: "original",
            query: "",
        };
        list.forEach((result, position) => {
            if (!result) {
                return;
            }
            const rank = position + 1; // 1-indexed rank for explain output
            const rrfContribution = weight / (k + rank);
            const detail = {
                listIndex: listIdx,
                source,
                queryType,
                query,
                rank,
                weight,
                backendScore: result.score,
                rrfContribution,
            };
            const trace = traceByFile.get(result.file);
            if (!trace) {
                traceByFile.set(result.file, {
                    contributions: [detail],
                    baseScore: rrfContribution,
                    topRank: rank,
                    topRankBonus: 0,
                    totalScore: 0,
                });
                return;
            }
            trace.baseScore += rrfContribution;
            trace.topRank = Math.min(trace.topRank, rank);
            trace.contributions.push(detail);
        });
    }
    // Second pass: the bonus depends on the best rank across all lists.
    for (const trace of traceByFile.values()) {
        const bonus = trace.topRank === 1 ? 0.05 : trace.topRank <= 3 ? 0.02 : 0;
        trace.topRankBonus = bonus;
        trace.totalScore = trace.baseScore + bonus;
    }
    return traceByFile;
}
/**
 * Find a document by filename/path, docid (#hash), or with fuzzy matching.
 * Returns document metadata without body by default.
 *
 * Supports:
 * - Virtual paths: qmd://collection/path/to/file.md
 * - Absolute paths: /path/to/file.md
 * - Relative paths: path/to/file.md
 * - Short docid: #abc123 (first 6 chars of hash)
 *
 * A trailing `:<digits>` suffix is stripped before lookup and a leading `~/`
 * is expanded to the home directory. Lookup order: exact virtual path, then
 * virtual-path suffix match, then per-collection absolute/relative path.
 *
 * @param {object} db - Database handle exposing `prepare(sql).get(...params)`.
 * @param {string} filename - Path, virtual path or docid to resolve.
 * @param {{includeBody?: boolean}} [options={}] - Set `includeBody` to also load the document body.
 * @returns {object} Document metadata, or `{ error: "not_found", query, similarFiles }`.
 */
export function findDocument(db, filename, options = {}) {
    let filepath = filename;
    // Drop a trailing ":<line>" suffix.
    const colonMatch = filepath.match(/:(\d+)$/);
    if (colonMatch) {
        filepath = filepath.slice(0, -colonMatch[0].length);
    }
    // Check if this is a docid lookup (#abc123, abc123, "#abc123", "abc123", etc.)
    if (isDocid(filepath)) {
        const docidMatch = findDocumentByDocid(db, filepath);
        if (docidMatch) {
            filepath = docidMatch.filepath;
        }
        else {
            return { error: "not_found", query: filename, similarFiles: [] };
        }
    }
    if (filepath.startsWith('~/')) {
        filepath = homedir() + filepath.slice(1);
    }
    const bodyCol = options.includeBody ? `, content.doc as body` : ``;
    // Build computed columns
    // Note: absoluteFilepath is computed from YAML collections after query
    const selectCols = `
    'qmd://' || d.collection || '/' || d.path as virtual_path,
    d.collection || '/' || d.path as display_path,
    d.title,
    d.hash,
    d.collection,
    d.modified_at,
    LENGTH(content.doc) as body_length
    ${bodyCol}
  `;
    // Try to match by virtual path first
    let doc = db.prepare(`
    SELECT ${selectCols}
    FROM documents d
    JOIN content ON content.hash = d.hash
    WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
  `).get(filepath);
    // Try fuzzy (suffix) match by virtual path
    if (!doc) {
        // Escape LIKE metacharacters so `_` and `%` in file names match
        // literally instead of acting as wildcards.
        const suffixPattern = `%${filepath.replace(/[\\%_]/g, '\\$&')}`;
        doc = db.prepare(`
      SELECT ${selectCols}
      FROM documents d
      JOIN content ON content.hash = d.hash
      WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? ESCAPE '\\' AND d.active = 1
      LIMIT 1
    `).get(suffixPattern);
    }
    // Try to match by absolute path (requires looking up collection paths from DB)
    if (!doc && !filepath.startsWith('qmd://')) {
        const collections = getStoreCollections(db);
        for (const coll of collections) {
            let relativePath = null;
            // If filepath is absolute and starts with collection path, extract relative part
            if (filepath.startsWith(coll.path + '/')) {
                relativePath = filepath.slice(coll.path.length + 1);
            }
            // Otherwise treat filepath as relative to collection
            else if (!filepath.startsWith('/')) {
                relativePath = filepath;
            }
            if (relativePath) {
                doc = db.prepare(`
          SELECT ${selectCols}
          FROM documents d
          JOIN content ON content.hash = d.hash
          WHERE d.collection = ? AND d.path = ? AND d.active = 1
        `).get(coll.name, relativePath);
                if (doc)
                    break;
            }
        }
    }
    if (!doc) {
        const similar = findSimilarFiles(db, filepath, 5, 5);
        return { error: "not_found", query: filename, similarFiles: similar };
    }
    // Get context using virtual path. `display_path` already starts with the
    // collection name, so the fallback must not prepend it again.
    const virtualPath = doc.virtual_path || `qmd://${doc.display_path}`;
    const context = getContextForFile(db, virtualPath);
    return {
        filepath: virtualPath,
        displayPath: doc.display_path,
        title: doc.title,
        context,
        hash: doc.hash,
        docid: getDocid(doc.hash),
        collectionName: doc.collection,
        modifiedAt: doc.modified_at,
        bodyLength: doc.body_length,
        ...(options.includeBody && doc.body !== undefined && { body: doc.body }),
    };
}
/**
 * Get the body content for a document, optionally sliced by line range.
 *
 * The document is resolved from `doc.filepath`: first as a virtual
 * `qmd://collection/path` path, then as an absolute path under one of the
 * stored collections.
 *
 * @param {object} db - Database handle exposing `prepare(sql).get(...params)`.
 * @param {{filepath: string}} doc - Document reference.
 * @param {number} [fromLine] - 1-based first line to return; values below 1 are treated as 1.
 * @param {number} [maxLines] - Maximum number of lines to return; negative values are treated as 0.
 * @returns {string|null} The (possibly sliced) body, or null when the document cannot be resolved.
 */
export function getDocumentBody(db, doc, fromLine, maxLines) {
    const filepath = doc.filepath;
    // Try to resolve document by filepath (absolute or virtual)
    let row = null;
    // Try virtual path first
    if (filepath.startsWith('qmd://')) {
        row = db.prepare(`
      SELECT content.doc as body
      FROM documents d
      JOIN content ON content.hash = d.hash
      WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
    `).get(filepath);
    }
    // Try absolute path by looking up in DB store_collections
    if (!row) {
        const collections = getStoreCollections(db);
        for (const coll of collections) {
            if (filepath.startsWith(coll.path + '/')) {
                const relativePath = filepath.slice(coll.path.length + 1);
                row = db.prepare(`
          SELECT content.doc as body
          FROM documents d
          JOIN content ON content.hash = d.hash
          WHERE d.collection = ? AND d.path = ? AND d.active = 1
        `).get(coll.name, relativePath);
                if (row)
                    break;
            }
        }
    }
    if (!row)
        return null;
    let body = row.body;
    if (fromLine !== undefined || maxLines !== undefined) {
        const lines = body.split('\n');
        // Clamp both bounds: a negative index passed to Array.prototype.slice
        // counts from the END of the array, which would silently return the
        // wrong part of the document.
        const start = Math.max(0, (fromLine || 1) - 1);
        const end = maxLines !== undefined ? start + Math.max(0, maxLines) : lines.length;
        body = lines.slice(start, end).join('\n');
    }
    return body;
}
/**
 * Find multiple documents by glob pattern or comma-separated list.
 * Returns documents without body by default (use getDocumentBody to load).
 *
 * A pattern is treated as a comma-separated list when it contains a comma and
 * none of `*`, `?`, `{`; otherwise it is matched as a glob. In list mode each
 * name is resolved by exact virtual path first, then by virtual-path suffix.
 * Documents whose body length exceeds `maxBytes` are reported as skipped.
 *
 * @param {object} db - Database handle exposing `prepare(sql).get/all`.
 * @param {string} pattern - Glob pattern or comma-separated names.
 * @param {{includeBody?: boolean, maxBytes?: number}} [options={}] - `maxBytes`
 *   defaults to DEFAULT_MULTI_GET_MAX_BYTES.
 * @returns {{docs: Array<object>, errors: string[]}} Matched (or skipped) documents and lookup errors.
 */
export function findDocuments(db, pattern, options = {}) {
    const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?') && !pattern.includes('{');
    const errors = [];
    const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES;
    const bodyCol = options.includeBody ? `, content.doc as body` : ``;
    const selectCols = `
    'qmd://' || d.collection || '/' || d.path as virtual_path,
    d.collection || '/' || d.path as display_path,
    d.title,
    d.hash,
    d.collection,
    d.modified_at,
    LENGTH(content.doc) as body_length
    ${bodyCol}
  `;
    let fileRows;
    if (isCommaSeparated) {
        const names = pattern.split(',').map(s => s.trim()).filter(Boolean);
        fileRows = [];
        for (const name of names) {
            let doc = db.prepare(`
        SELECT ${selectCols}
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
      `).get(name);
            if (!doc) {
                // Escape LIKE metacharacters so `_` and `%` in file names
                // match literally instead of acting as wildcards.
                const suffixPattern = `%${name.replace(/[\\%_]/g, '\\$&')}`;
                doc = db.prepare(`
          SELECT ${selectCols}
          FROM documents d
          JOIN content ON content.hash = d.hash
          WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? ESCAPE '\\' AND d.active = 1
          LIMIT 1
        `).get(suffixPattern);
            }
            if (doc) {
                fileRows.push(doc);
            }
            else {
                const similar = findSimilarFiles(db, name, 5, 3);
                let msg = `File not found: ${name}`;
                if (similar.length > 0) {
                    msg += ` (did you mean: ${similar.join(', ')}?)`;
                }
                errors.push(msg);
            }
        }
    }
    else {
        // Glob pattern match
        const matched = matchFilesByGlob(db, pattern);
        if (matched.length === 0) {
            errors.push(`No files matched pattern: ${pattern}`);
            return { docs: [], errors };
        }
        const virtualPaths = matched.map(m => m.filepath);
        const placeholders = virtualPaths.map(() => '?').join(',');
        fileRows = db.prepare(`
      SELECT ${selectCols}
      FROM documents d
      JOIN content ON content.hash = d.hash
      WHERE 'qmd://' || d.collection || '/' || d.path IN (${placeholders}) AND d.active = 1
    `).all(...virtualPaths);
    }
    const results = [];
    for (const row of fileRows) {
        // Get context using virtual path. `display_path` already starts with
        // the collection name, so the fallback must not prepend it again.
        const virtualPath = row.virtual_path || `qmd://${row.display_path}`;
        const context = getContextForFile(db, virtualPath);
        if (row.body_length > maxBytes) {
            results.push({
                doc: { filepath: virtualPath, displayPath: row.display_path },
                skipped: true,
                skipReason: `File too large (${Math.round(row.body_length / 1024)}KB > ${Math.round(maxBytes / 1024)}KB)`,
            });
            continue;
        }
        results.push({
            doc: {
                filepath: virtualPath,
                displayPath: row.display_path,
                // Fall back to the file name, then the whole display path, when no title is stored.
                title: row.title || row.display_path.split('/').pop() || row.display_path,
                context,
                hash: row.hash,
                docid: getDocid(row.hash),
                collectionName: row.collection,
                modifiedAt: row.modified_at,
                bodyLength: row.body_length,
                ...(options.includeBody && row.body !== undefined && { body: row.body }),
            },
            skipped: false,
        });
    }
    return { docs: results, errors };
}
// =============================================================================
// Status
// =============================================================================
/**
 * Summarise the index: total active documents, per-collection counts,
 * what `getHashesNeedingEmbedding` reports, and whether the `vectors_vec`
 * table exists.
 *
 * Collections are taken from the `documents` table (active rows only) and
 * enriched with path/pattern from the stored collection config when a
 * matching name exists. They are ordered most-recently-updated first.
 *
 * @param {object} db - Database handle exposing `prepare(sql).get/all`.
 * @returns {{totalDocuments: number, needsEmbedding: *, hasVectorIndex: boolean, collections: Array<object>}}
 */
export function getStatus(db) {
    // DB is source of truth for collections — config provides supplementary metadata
    const activeByCollection = db.prepare(`
    SELECT
      collection as name,
      COUNT(*) as active_count,
      MAX(modified_at) as last_doc_update
    FROM documents
    WHERE active = 1
    GROUP BY collection
  `).all();
    // Lookup from store_collections for path/pattern metadata
    const configByName = new Map();
    for (const stored of getStoreCollections(db)) {
        configByName.set(stored.name, { path: stored.path, pattern: stored.pattern });
    }
    const collections = activeByCollection.map(({ name, active_count, last_doc_update }) => {
        const config = configByName.get(name);
        return {
            name,
            path: config?.path ?? null,
            pattern: config?.pattern ?? null,
            documents: active_count,
            lastUpdated: last_doc_update || new Date().toISOString(),
        };
    });
    // Most recent update first; entries without a timestamp sink to the end.
    const byMostRecent = (a, b) => {
        if (!a.lastUpdated) {
            return 1;
        }
        if (!b.lastUpdated) {
            return -1;
        }
        return new Date(b.lastUpdated).getTime() - new Date(a.lastUpdated).getTime();
    };
    collections.sort(byMostRecent);
    const { c: totalDocuments } = db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get();
    const needsEmbedding = getHashesNeedingEmbedding(db);
    const vecTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    return {
        totalDocuments,
        needsEmbedding,
        hasVectorIndex: Boolean(vecTable),
        collections,
    };
}
/**
 * Weight for intent terms relative to query terms (1.0) in snippet scoring.
 * In extractSnippet a line earns 1.0 per matching query term and this much
 * per matching intent term.
 */
export const INTENT_WEIGHT_SNIPPET = 0.3;
/**
 * Weight for intent terms relative to query terms (1.0) in chunk selection.
 * NOTE(review): the chunk-selection code is not in this part of the file —
 * confirm how the weight is applied there.
 */
export const INTENT_WEIGHT_CHUNK = 0.5;
// Common stop words filtered from intent strings before tokenization.
// Seeded from finetune/reward.py KEY_TERM_STOPWORDS, extended with common
// 2-3 char function words so the length threshold can drop to >1 and let
// short domain terms (API, SQL, LLM, CPU, CDN, …) survive.
// Each group below is a space-separated word list.
const INTENT_STOP_WORDS = new Set([
    // 2-char function words
    "am an as at be by do he if in is it me my no of on or so to up us we",
    // 3-char function words
    "all and any are but can did for get has her him his how its let may not our out the too was who why you",
    // 4+ char common words
    "also does find from have into more need show some tell that them this want what when will with your",
    // Search-context noise
    "about looking notes search where which",
].flatMap(group => group.split(" ")));
/**
 * Pull the meaningful terms out of an intent string.
 *
 * The intent is lowercased and split on whitespace; each token has leading
 * and trailing non-letter/non-digit characters (Unicode-aware) stripped.
 * Tokens of one character or less and stop words are discarded, so short
 * domain terms like "api" survive.
 *
 * @param {string} intent - Free-form intent text.
 * @returns {string[]} Lowercase terms suitable for text matching, in input order.
 */
export function extractIntentTerms(intent) {
    const terms = [];
    for (const token of intent.toLowerCase().split(/\s+/)) {
        const term = token.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, "");
        if (term.length > 1 && !INTENT_STOP_WORDS.has(term)) {
            terms.push(term);
        }
    }
    return terms;
}
/**
 * Build a short, line-anchored excerpt of `body` around the line that best
 * matches the query.
 *
 * Every line of the search region is scored: +1.0 for each query term it
 * contains (case-insensitive substring match) and +INTENT_WEIGHT_SNIPPET for
 * each intent term. The first line with the highest score wins, and the
 * excerpt runs from one line before it through two lines after it.
 *
 * @param body      Full document text.
 * @param query     Whitespace-separated search terms.
 * @param maxLen    Maximum excerpt length in characters (header excluded);
 *                  longer text is cut and suffixed with "...".
 * @param chunkPos  Optional character offset of a chunk. When > 0, only the
 *                  chunk plus 100 characters of padding on each side is searched.
 * @param chunkLen  Optional chunk length; falls back to CHUNK_SIZE_CHARS.
 * @param intent    Optional intent text whose terms add a smaller bonus.
 * @returns Object with `line` (1-based line of the best match), `snippet`
 *          (diff-style header + excerpt), `linesBefore`, `linesAfter` and
 *          `snippetLines` (number of lines selected before any truncation).
 */
export function extractSnippet(body, query, maxLen = 500, chunkPos, chunkLen, intent) {
    const documentLineCount = body.split('\n').length;
    const windowed = Boolean(chunkPos && chunkPos > 0);
    let region = body;
    // 0-based document line index on which `region` starts.
    let regionFirstLine = 0;
    if (windowed) {
        // Restrict the search to the chunk, padded for context on both sides.
        const span = chunkLen || CHUNK_SIZE_CHARS;
        const regionStart = Math.max(0, chunkPos - 100);
        const regionEnd = Math.min(body.length, chunkPos + span + 100);
        region = body.slice(regionStart, regionEnd);
        if (regionStart > 0) {
            regionFirstLine = body.slice(0, regionStart).split('\n').length - 1;
        }
    }
    const regionLines = region.split('\n');
    const wantedTerms = query.toLowerCase().split(/\s+/).filter(term => term.length > 0);
    const bonusTerms = intent ? extractIntentTerms(intent) : [];
    let winner = 0;
    let winnerScore = -1;
    regionLines.forEach((rawLine, idx) => {
        const haystack = (rawLine ?? "").toLowerCase();
        let lineScore = wantedTerms.reduce((sum, term) => (haystack.includes(term) ? sum + 1.0 : sum), 0);
        for (const term of bonusTerms) {
            if (haystack.includes(term)) {
                lineScore += INTENT_WEIGHT_SNIPPET;
            }
        }
        // Strict comparison: on ties the earliest line is kept.
        if (lineScore > winnerScore) {
            winnerScore = lineScore;
            winner = idx;
        }
    });
    const firstIdx = Math.max(0, winner - 1);
    const excerptLines = regionLines.slice(firstIdx, Math.min(regionLines.length, winner + 3));
    const joined = excerptLines.join('\n');
    // A chunk window that yields only whitespace is useless to show; retry
    // against the whole document instead.
    if (windowed && joined.trim().length === 0) {
        return extractSnippet(body, query, maxLen, undefined, undefined, intent);
    }
    const excerpt = joined.length > maxLen
        ? `${joined.substring(0, maxLen - 3)}...`
        : joined;
    const firstLineNo = regionFirstLine + firstIdx + 1; // 1-based
    const excerptLineCount = excerptLines.length;
    const linesBefore = firstLineNo - 1;
    const linesAfter = documentLineCount - (firstLineNo + excerptLineCount - 1);
    // Diff-style header: @@ -start,count @@ (N before, M after)
    return {
        line: regionFirstLine + winner + 1,
        snippet: `@@ -${firstLineNo},${excerptLineCount} @@ (${linesBefore} before, ${linesAfter} after)\n${excerpt}`,
        linesBefore,
        linesAfter,
        snippetLines: excerptLineCount,
    };
}
// =============================================================================
// Shared helpers (used by both CLI and MCP)
// =============================================================================
/**
 * Prefix every line of `text` with its line number.
 *
 * @param text       Text to annotate; split on "\n".
 * @param startLine  Number given to the first line (default 1).
 * @returns The text with each line rewritten as "{lineNum}: {content}".
 */
export function addLineNumbers(text, startLine = 1) {
    const numbered = [];
    for (const [offset, content] of text.split('\n').entries()) {
        numbered.push(`${startLine + offset}: ${content}`);
    }
    return numbered.join('\n');
}
/**
 * Hybrid search: BM25 + vector + query expansion + RRF + chunked reranking.
 *
 * Pipeline:
 * 1. BM25 probe → skip expansion if strong signal
 * 2. expandQuery() → typed query variants (lex/vec/hyde)
 * 3. Type-routed search: original→FTS+vector, lex→FTS, vec/hyde→vector
 * 4. RRF fusion (lists from the original query weighted 2x) → slice to candidateLimit
 * 5. chunkDocumentAsync() + keyword-best-chunk selection
 * 6. rerank on chunks (NOT full bodies — O(tokens) trap)
 * 7. Position-aware score blending (weighted sum of 1/RRF-rank and reranker score)
 * 8. Dedup by file, filter by minScore, slice to limit
 *
 * @param store    Store object; this function uses its `db`, `searchFTS`,
 *                 `searchVec`, `expandQuery`, `rerank` and `getContextForFile`.
 * @param query    The user's search text.
 * @param options  Optional settings: `limit` (default 10), `minScore`
 *                 (default 0), `candidateLimit` (default RERANK_CANDIDATE_LIMIT),
 *                 `collection`, `explain`, `intent`, `skipRerank`, `hooks`,
 *                 `embedProvider`, `chunkStrategy`.
 * @returns Results in descending score order. Each has `file`, `displayPath`,
 *          `title`, `body`, `bestChunk`, `bestChunkPos`, `score`, `context`,
 *          `docid`, plus an `explain` breakdown when `options.explain` is set.
 */
export async function hybridQuery(store, query, options) {
    const limit = options?.limit ?? 10;
    const minScore = options?.minScore ?? 0;
    const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
    const collection = options?.collection;
    const explain = options?.explain ?? false;
    const intent = options?.intent;
    const skipRerank = options?.skipRerank ?? false;
    const hooks = options?.hooks;
    const embedProvider = options?.embedProvider;
    // rankedLists[i] and rankedListMeta[i] always describe the same result list;
    // every push below adds to both.
    const rankedLists = [];
    const rankedListMeta = [];
    const docidMap = new Map(); // filepath -> docid
    const hasVectors = !!store.db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    // Step 1: BM25 probe — strong signal skips expensive LLM expansion
    // When intent is provided, disable strong-signal bypass — the obvious BM25
    // match may not be what the caller wants (e.g. "performance" with intent
    // "web page load times" should NOT shortcut to a sports-performance doc).
    // Pass collection directly into FTS query (filter at SQL level, not post-hoc)
    const initialFts = store.searchFTS(query, 20, collection);
    const topScore = initialFts[0]?.score ?? 0;
    const secondScore = initialFts[1]?.score ?? 0;
    const hasStrongSignal = !intent && initialFts.length > 0
        && topScore >= STRONG_SIGNAL_MIN_SCORE
        && (topScore - secondScore) >= STRONG_SIGNAL_MIN_GAP;
    if (hasStrongSignal)
        hooks?.onStrongSignal?.(topScore);
    // Step 2: Expand query (or skip if strong signal)
    hooks?.onExpandStart?.();
    const expandStart = Date.now();
    const expanded = hasStrongSignal
        ? []
        : await store.expandQuery(query, undefined, intent);
    hooks?.onExpand?.(query, expanded, Date.now() - expandStart);
    // Seed with initial FTS results (avoid re-running original query FTS)
    if (initialFts.length > 0) {
        for (const r of initialFts)
            docidMap.set(r.filepath, r.docid);
        rankedLists.push(initialFts.map(r => ({
            file: r.filepath, displayPath: r.displayPath,
            title: r.title, body: r.body || "", score: r.score,
        })));
        rankedListMeta.push({ source: "fts", queryType: "original", query });
    }
    // Step 3: Route searches by query type
    //
    // Strategy: run all FTS queries immediately (they're sync/instant), then
    // batch-embed all vector queries in one embedBatch() call, then run
    // sqlite-vec lookups with pre-computed embeddings.
    // 3a: Run FTS for all lex expansions right away (no LLM needed)
    for (const q of expanded) {
        if (q.type === 'lex') {
            const ftsResults = store.searchFTS(q.query, 20, collection);
            if (ftsResults.length > 0) {
                for (const r of ftsResults)
                    docidMap.set(r.filepath, r.docid);
                rankedLists.push(ftsResults.map(r => ({
                    file: r.filepath, displayPath: r.displayPath,
                    title: r.title, body: r.body || "", score: r.score,
                })));
                rankedListMeta.push({ source: "fts", queryType: "lex", query: q.query });
            }
        }
    }
    // 3b: Collect all texts that need vector search (original query + vec/hyde expansions)
    if (hasVectors) {
        const vecQueries = [
            { text: query, queryType: "original" },
        ];
        for (const q of expanded) {
            if (q.type === 'vec' || q.type === 'hyde') {
                vecQueries.push({ text: q.query, queryType: q.type });
            }
        }
        // Batch embed all vector queries in a single call.
        // When `embedProvider` is supplied (i-loazq6ze), route the encode through
        // it (HTTP / GPU worker / AutoFallback chain) instead of warming the
        // local llama-cpp model — this is the whole point of the GPU worker.
        const embedModelName = embedProvider
            ? embedProvider.getModelId()
            : getLlm(store).embedModelName;
        const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, embedModelName));
        hooks?.onEmbedStart?.(textsToEmbed.length);
        const embedStart = Date.now();
        const embeddings = embedProvider
            ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName })
            : await getLlm(store).embedBatch(textsToEmbed);
        hooks?.onEmbedDone?.(Date.now() - embedStart);
        // Run sqlite-vec lookups with pre-computed embeddings
        for (let i = 0; i < vecQueries.length; i++) {
            const embedding = embeddings[i]?.embedding;
            // A query whose embedding is missing is skipped rather than failing the search.
            if (!embedding)
                continue;
            const vecResults = await store.searchVec(vecQueries[i].text, DEFAULT_EMBED_MODEL, 20, collection, undefined, embedding);
            if (vecResults.length > 0) {
                for (const r of vecResults)
                    docidMap.set(r.filepath, r.docid);
                rankedLists.push(vecResults.map(r => ({
                    file: r.filepath, displayPath: r.displayPath,
                    title: r.title, body: r.body || "", score: r.score,
                })));
                rankedListMeta.push({
                    source: "vec",
                    queryType: vecQueries[i].queryType,
                    query: vecQueries[i].text,
                });
            }
        }
    }
    // Step 4: RRF fusion — lists produced by the ORIGINAL query (its FTS list and
    // its vector list) get 2x weight; expansion lists get 1x.
    // The weight is derived from rankedListMeta rather than from list position:
    // lex-expansion lists are pushed before the original query's vector list, so
    // a positional "first two lists" rule would boost a lex expansion instead.
    const weights = rankedListMeta.map(meta => meta.queryType === "original" ? 2.0 : 1.0);
    const fused = reciprocalRankFusion(rankedLists, weights);
    const rrfTraceByFile = explain ? buildRrfTrace(rankedLists, weights, rankedListMeta) : null;
    const candidates = fused.slice(0, candidateLimit);
    if (candidates.length === 0)
        return [];
    // Step 5: Chunk documents, pick best chunk per doc for reranking.
    // Reranking full bodies is O(tokens) — the critical perf lesson that motivated this refactor.
    const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
    const intentTerms = intent ? extractIntentTerms(intent) : [];
    const docChunkMap = new Map();
    const chunkStrategy = options?.chunkStrategy;
    for (const cand of candidates) {
        const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, chunkStrategy);
        if (chunks.length === 0)
            continue;
        // Pick chunk with most keyword overlap (fallback: first chunk)
        // Intent terms contribute at INTENT_WEIGHT_CHUNK (0.5) relative to query terms (1.0)
        let bestIdx = 0;
        let bestScore = -1;
        for (let i = 0; i < chunks.length; i++) {
            const chunkLower = chunks[i].text.toLowerCase();
            let score = queryTerms.reduce((acc, term) => acc + (chunkLower.includes(term) ? 1 : 0), 0);
            for (const term of intentTerms) {
                if (chunkLower.includes(term))
                    score += INTENT_WEIGHT_CHUNK;
            }
            if (score > bestScore) {
                bestScore = score;
                bestIdx = i;
            }
        }
        docChunkMap.set(cand.file, { chunks, bestIdx });
    }
    if (skipRerank) {
        // Skip LLM reranking — return candidates scored by RRF only
        const seenFiles = new Set();
        return candidates
            .map((cand, i) => {
                const chunkInfo = docChunkMap.get(cand.file);
                const bestIdx = chunkInfo?.bestIdx ?? 0;
                const bestChunk = chunkInfo?.chunks[bestIdx]?.text || cand.body || "";
                const bestChunkPos = chunkInfo?.chunks[bestIdx]?.pos || 0;
                // Score is purely positional: 1 for the top candidate, 1/2 for the next, …
                const rrfRank = i + 1;
                const rrfScore = 1 / rrfRank;
                const trace = rrfTraceByFile?.get(cand.file);
                const explainData = explain ? {
                    ftsScores: trace?.contributions.filter(c => c.source === "fts").map(c => c.backendScore) ?? [],
                    vectorScores: trace?.contributions.filter(c => c.source === "vec").map(c => c.backendScore) ?? [],
                    rrf: {
                        rank: rrfRank,
                        positionScore: rrfScore,
                        weight: 1.0,
                        baseScore: trace?.baseScore ?? 0,
                        topRankBonus: trace?.topRankBonus ?? 0,
                        totalScore: trace?.totalScore ?? 0,
                        contributions: trace?.contributions ?? [],
                    },
                    rerankScore: 0,
                    blendedScore: rrfScore,
                } : undefined;
                return {
                    file: cand.file,
                    displayPath: cand.displayPath,
                    title: cand.title,
                    body: cand.body,
                    bestChunk,
                    bestChunkPos,
                    score: rrfScore,
                    context: store.getContextForFile(cand.file),
                    docid: docidMap.get(cand.file) || "",
                    ...(explainData ? { explain: explainData } : {}),
                };
            })
            .filter(r => {
                if (seenFiles.has(r.file))
                    return false;
                seenFiles.add(r.file);
                return true;
            })
            .filter(r => r.score >= minScore)
            .slice(0, limit);
    }
    // Step 6: Rerank chunks (NOT full bodies)
    // Candidates that produced no chunks are not sent to the reranker.
    const chunksToRerank = [];
    for (const cand of candidates) {
        const chunkInfo = docChunkMap.get(cand.file);
        if (chunkInfo) {
            chunksToRerank.push({ file: cand.file, text: chunkInfo.chunks[chunkInfo.bestIdx].text });
        }
    }
    hooks?.onRerankStart?.(chunksToRerank.length);
    const rerankStart = Date.now();
    const reranked = await store.rerank(query, chunksToRerank, undefined, intent);
    hooks?.onRerankDone?.(Date.now() - rerankStart);
    // Step 7: Blend RRF position score with reranker score
    // Position-aware weights: top retrieval results get more protection from reranker disagreement
    const candidateMap = new Map(candidates.map(c => [c.file, {
        displayPath: c.displayPath, title: c.title, body: c.body,
    }]));
    const rrfRankMap = new Map(candidates.map((c, i) => [c.file, i + 1]));
    const blended = reranked.map(r => {
        // A file the reranker returns that is not among the candidates is
        // treated as if it sat at rank `candidateLimit`.
        const rrfRank = rrfRankMap.get(r.file) || candidateLimit;
        let rrfWeight;
        if (rrfRank <= 3)
            rrfWeight = 0.75;
        else if (rrfRank <= 10)
            rrfWeight = 0.60;
        else
            rrfWeight = 0.40;
        const rrfScore = 1 / rrfRank;
        const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * r.score;
        const candidate = candidateMap.get(r.file);
        const chunkInfo = docChunkMap.get(r.file);
        const bestIdx = chunkInfo?.bestIdx ?? 0;
        const bestChunk = chunkInfo?.chunks[bestIdx]?.text || candidate?.body || "";
        const bestChunkPos = chunkInfo?.chunks[bestIdx]?.pos || 0;
        const trace = rrfTraceByFile?.get(r.file);
        const explainData = explain ? {
            ftsScores: trace?.contributions.filter(c => c.source === "fts").map(c => c.backendScore) ?? [],
            vectorScores: trace?.contributions.filter(c => c.source === "vec").map(c => c.backendScore) ?? [],
            rrf: {
                rank: rrfRank,
                positionScore: rrfScore,
                weight: rrfWeight,
                baseScore: trace?.baseScore ?? 0,
                topRankBonus: trace?.topRankBonus ?? 0,
                totalScore: trace?.totalScore ?? 0,
                contributions: trace?.contributions ?? [],
            },
            rerankScore: r.score,
            blendedScore,
        } : undefined;
        return {
            file: r.file,
            displayPath: candidate?.displayPath || "",
            title: candidate?.title || "",
            body: candidate?.body || "",
            bestChunk,
            bestChunkPos,
            score: blendedScore,
            context: store.getContextForFile(r.file),
            docid: docidMap.get(r.file) || "",
            ...(explainData ? { explain: explainData } : {}),
        };
    }).sort((a, b) => b.score - a.score);
    // Step 8: Dedup by file (safety net — prevents duplicate output)
    const seenFiles = new Set();
    return blended
        .filter(r => {
            if (seenFiles.has(r.file))
                return false;
            seenFiles.add(r.file);
            return true;
        })
        .filter(r => r.score >= minScore)
        .slice(0, limit);
}
/**
 * Semantic (vector-only) search with query expansion.
 *
 * Steps:
 * 1. Return [] immediately when the `vectors_vec` table does not exist.
 * 2. expandQuery() and drop the `lex` variants, which target FTS rather than vectors.
 * 3. searchVec() for the original query and every remaining variant, one at a
 *    time (concurrent embed() calls hang).
 * 4. Keep the highest-scoring hit per file, sort by descending score, drop
 *    scores below minScore, and cut to limit.
 *
 * @param store    Store object; this function uses its `db`, `expandQuery`,
 *                 `searchVec` and `getContextForFile`.
 * @param query    The user's search text.
 * @param options  Optional settings: `limit` (default 10), `minScore`
 *                 (default 0.3), `collection`, `intent`, `embedProvider`,
 *                 `hooks` (only `onExpand` is called here).
 * @returns Array of `{ file, displayPath, title, body, score, context, docid }`.
 */
export async function vectorSearchQuery(store, query, options) {
    const opts = options ?? {};
    const limit = opts.limit ?? 10;
    const minScore = opts.minScore ?? 0.3;
    const { collection, intent, embedProvider } = opts;
    const vectorTable = store.db
        .prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`)
        .get();
    if (!vectorTable) {
        return [];
    }
    // Only vec/hyde style variants are meaningful for a vector lookup.
    const expansionStartedAt = Date.now();
    const everyVariant = await store.expandQuery(query, undefined, intent);
    const vectorVariants = everyVariant.filter(variant => variant.type !== 'lex');
    opts.hooks?.onExpand?.(query, vectorVariants, Date.now() - expansionStartedAt);
    // The provider (i-loazq6ze), when given, is handed to searchVec as its
    // trailing argument so it performs the query encoding.
    const bestByFile = new Map();
    for (const text of [query, ...vectorVariants.map(variant => variant.query)]) {
        const hits = await store.searchVec(text, DEFAULT_EMBED_MODEL, limit, collection, undefined, undefined, embedProvider);
        for (const hit of hits) {
            const previous = bestByFile.get(hit.filepath);
            if (previous && !(hit.score > previous.score)) {
                continue;
            }
            bestByFile.set(hit.filepath, {
                file: hit.filepath,
                displayPath: hit.displayPath,
                title: hit.title,
                body: hit.body || "",
                score: hit.score,
                context: store.getContextForFile(hit.filepath),
                docid: hit.docid,
            });
        }
    }
    const ranked = [...bestByFile.values()];
    ranked.sort((a, b) => b.score - a.score);
    return ranked
        .filter(entry => entry.score >= minScore)
        .slice(0, limit);
}
/**
 * Structured search: execute pre-expanded queries without LLM query expansion.
 *
 * Designed for LLM callers (MCP/HTTP) that generate their own query expansions.
 * Skips the internal expandQuery() step — goes directly to:
 *
 * Pipeline:
 * 1. Route searches: lex→FTS, vec/hyde→vector (batch embed)
 * 2. RRF fusion across all result lists (lists from searches[0] weighted 2x)
 * 3. Chunk documents + keyword-best-chunk selection
 * 4. Rerank on chunks
 * 5. Position-aware score blending
 * 6. Dedup, filter, slice
 *
 * This is the recommended endpoint for capable LLMs — they can generate
 * better query variations than our small local model, especially for
 * domain-specific or nuanced queries.
 *
 * @param store     Store object; this function uses its `db`, `searchFTS`,
 *                  `searchVec`, `rerank` and `getContextForFile`.
 * @param searches  Array of `{ type, query, line? }` with type 'lex', 'vec'
 *                  or 'hyde', ordered by importance (first = most important).
 * @param options   Optional settings: `limit` (default 10), `minScore`
 *                  (default 0), `candidateLimit` (default RERANK_CANDIDATE_LIMIT),
 *                  `collections`, `explain`, `intent`, `skipRerank`, `hooks`,
 *                  `embedProvider`, `chunkStrategy`.
 * @returns Results in descending score order. Each has `file`, `displayPath`,
 *          `title`, `body`, `bestChunk`, `bestChunkPos`, `score`, `context`,
 *          `docid`, plus an `explain` breakdown when `options.explain` is set.
 * @throws {Error} When a query contains a newline, or when validateLexQuery /
 *         validateSemanticQuery reports a problem.
 */
export async function structuredSearch(store, searches, options) {
    const limit = options?.limit ?? 10;
    const minScore = options?.minScore ?? 0;
    const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
    const explain = options?.explain ?? false;
    const intent = options?.intent;
    const skipRerank = options?.skipRerank ?? false;
    const hooks = options?.hooks;
    const embedProvider = options?.embedProvider;
    const collections = options?.collections;
    if (searches.length === 0)
        return [];
    // Validate queries before executing
    for (const search of searches) {
        const location = search.line ? `Line ${search.line}` : 'Structured search';
        if (/[\r\n]/.test(search.query)) {
            throw new Error(`${location} (${search.type}): queries must be single-line. Remove newline characters.`);
        }
        if (search.type === 'lex') {
            const error = validateLexQuery(search.query);
            if (error) {
                throw new Error(`${location} (lex): ${error}`);
            }
        }
        else if (search.type === 'vec' || search.type === 'hyde') {
            const error = validateSemanticQuery(search.query);
            if (error) {
                throw new Error(`${location} (${search.type}): ${error}`);
            }
        }
    }
    // rankedLists[i], rankedListMeta[i] and rankedListOrigins[i] always describe
    // the same result list; every push below adds to all three.
    const rankedLists = [];
    const rankedListMeta = [];
    const rankedListOrigins = []; // the `searches` entry that produced each list
    const docidMap = new Map(); // filepath -> docid
    const hasVectors = !!store.db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    // Helper to run search across collections (or all if undefined)
    const collectionList = collections ?? [undefined]; // undefined = all collections
    // Step 1: Run FTS for all lex searches (sync, instant)
    for (const search of searches) {
        if (search.type === 'lex') {
            for (const coll of collectionList) {
                const ftsResults = store.searchFTS(search.query, 20, coll);
                if (ftsResults.length > 0) {
                    for (const r of ftsResults)
                        docidMap.set(r.filepath, r.docid);
                    rankedLists.push(ftsResults.map(r => ({
                        file: r.filepath, displayPath: r.displayPath,
                        title: r.title, body: r.body || "", score: r.score,
                    })));
                    rankedListMeta.push({
                        source: "fts",
                        queryType: "lex",
                        query: search.query,
                    });
                    rankedListOrigins.push(search);
                }
            }
        }
    }
    // Step 2: Batch embed and run vector searches for vec/hyde
    if (hasVectors) {
        const vecSearches = searches.filter((s) => s.type === 'vec' || s.type === 'hyde');
        if (vecSearches.length > 0) {
            // Route batch encoding through the supplied EmbeddingProvider when
            // present (i-loazq6ze). Otherwise fall back to the local llama-cpp
            // singleton — preserves pre-patch behavior for callers that don't
            // configure a provider.
            const embedModelName = embedProvider
                ? embedProvider.getModelId()
                : getLlm(store).embedModelName;
            const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, embedModelName));
            hooks?.onEmbedStart?.(textsToEmbed.length);
            const embedStart = Date.now();
            const embeddings = embedProvider
                ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName })
                : await getLlm(store).embedBatch(textsToEmbed);
            hooks?.onEmbedDone?.(Date.now() - embedStart);
            for (let i = 0; i < vecSearches.length; i++) {
                const embedding = embeddings[i]?.embedding;
                // A search whose embedding is missing is skipped rather than failing the call.
                if (!embedding)
                    continue;
                for (const coll of collectionList) {
                    const vecResults = await store.searchVec(vecSearches[i].query, DEFAULT_EMBED_MODEL, 20, coll, undefined, embedding);
                    if (vecResults.length > 0) {
                        for (const r of vecResults)
                            docidMap.set(r.filepath, r.docid);
                        rankedLists.push(vecResults.map(r => ({
                            file: r.filepath, displayPath: r.displayPath,
                            title: r.title, body: r.body || "", score: r.score,
                        })));
                        rankedListMeta.push({
                            source: "vec",
                            queryType: vecSearches[i].type,
                            query: vecSearches[i].query,
                        });
                        rankedListOrigins.push(vecSearches[i]);
                    }
                }
            }
        }
    }
    if (rankedLists.length === 0)
        return [];
    // Step 3: RRF fusion — every list produced by the caller's FIRST search gets
    // 2x weight (the caller is assumed to order searches by importance).
    // The weight follows the originating search, not list position: lex searches
    // always run before vec/hyde ones and each search yields one list per
    // collection, so rankedLists[0] is not necessarily from searches[0].
    const primarySearch = searches[0];
    const weights = rankedListOrigins.map(origin => origin === primarySearch ? 2.0 : 1.0);
    const fused = reciprocalRankFusion(rankedLists, weights);
    const rrfTraceByFile = explain ? buildRrfTrace(rankedLists, weights, rankedListMeta) : null;
    const candidates = fused.slice(0, candidateLimit);
    if (candidates.length === 0)
        return [];
    hooks?.onExpand?.("", [], 0); // Signal no expansion (pre-expanded)
    // Step 4: Chunk documents, pick best chunk per doc for reranking
    // Use first lex query as the "query" for keyword matching, or first vec if no lex
    const primaryQuery = searches.find(s => s.type === 'lex')?.query
        || searches.find(s => s.type === 'vec')?.query
        || searches[0]?.query || "";
    const queryTerms = primaryQuery.toLowerCase().split(/\s+/).filter(t => t.length > 2);
    const intentTerms = intent ? extractIntentTerms(intent) : [];
    const docChunkMap = new Map();
    const ssChunkStrategy = options?.chunkStrategy;
    for (const cand of candidates) {
        const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, ssChunkStrategy);
        if (chunks.length === 0)
            continue;
        // Pick chunk with most keyword overlap
        // Intent terms contribute at INTENT_WEIGHT_CHUNK (0.5) relative to query terms (1.0)
        let bestIdx = 0;
        let bestScore = -1;
        for (let i = 0; i < chunks.length; i++) {
            const chunkLower = chunks[i].text.toLowerCase();
            let score = queryTerms.reduce((acc, term) => acc + (chunkLower.includes(term) ? 1 : 0), 0);
            for (const term of intentTerms) {
                if (chunkLower.includes(term))
                    score += INTENT_WEIGHT_CHUNK;
            }
            if (score > bestScore) {
                bestScore = score;
                bestIdx = i;
            }
        }
        docChunkMap.set(cand.file, { chunks, bestIdx });
    }
    if (skipRerank) {
        // Skip LLM reranking — return candidates scored by RRF only
        const seenFiles = new Set();
        return candidates
            .map((cand, i) => {
                const chunkInfo = docChunkMap.get(cand.file);
                const bestIdx = chunkInfo?.bestIdx ?? 0;
                const bestChunk = chunkInfo?.chunks[bestIdx]?.text || cand.body || "";
                const bestChunkPos = chunkInfo?.chunks[bestIdx]?.pos || 0;
                // Score is purely positional: 1 for the top candidate, 1/2 for the next, …
                const rrfRank = i + 1;
                const rrfScore = 1 / rrfRank;
                const trace = rrfTraceByFile?.get(cand.file);
                const explainData = explain ? {
                    ftsScores: trace?.contributions.filter(c => c.source === "fts").map(c => c.backendScore) ?? [],
                    vectorScores: trace?.contributions.filter(c => c.source === "vec").map(c => c.backendScore) ?? [],
                    rrf: {
                        rank: rrfRank,
                        positionScore: rrfScore,
                        weight: 1.0,
                        baseScore: trace?.baseScore ?? 0,
                        topRankBonus: trace?.topRankBonus ?? 0,
                        totalScore: trace?.totalScore ?? 0,
                        contributions: trace?.contributions ?? [],
                    },
                    rerankScore: 0,
                    blendedScore: rrfScore,
                } : undefined;
                return {
                    file: cand.file,
                    displayPath: cand.displayPath,
                    title: cand.title,
                    body: cand.body,
                    bestChunk,
                    bestChunkPos,
                    score: rrfScore,
                    context: store.getContextForFile(cand.file),
                    docid: docidMap.get(cand.file) || "",
                    ...(explainData ? { explain: explainData } : {}),
                };
            })
            .filter(r => {
                if (seenFiles.has(r.file))
                    return false;
                seenFiles.add(r.file);
                return true;
            })
            .filter(r => r.score >= minScore)
            .slice(0, limit);
    }
    // Step 5: Rerank chunks
    // Candidates that produced no chunks are not sent to the reranker.
    const chunksToRerank = [];
    for (const cand of candidates) {
        const chunkInfo = docChunkMap.get(cand.file);
        if (chunkInfo) {
            chunksToRerank.push({ file: cand.file, text: chunkInfo.chunks[chunkInfo.bestIdx].text });
        }
    }
    hooks?.onRerankStart?.(chunksToRerank.length);
    const rerankStart2 = Date.now();
    const reranked = await store.rerank(primaryQuery, chunksToRerank, undefined, intent);
    hooks?.onRerankDone?.(Date.now() - rerankStart2);
    // Step 6: Blend RRF position score with reranker score
    // The higher a file ranked in retrieval, the more its position score counts
    // relative to the reranker score.
    const candidateMap = new Map(candidates.map(c => [c.file, {
        displayPath: c.displayPath, title: c.title, body: c.body,
    }]));
    const rrfRankMap = new Map(candidates.map((c, i) => [c.file, i + 1]));
    const blended = reranked.map(r => {
        // A file the reranker returns that is not among the candidates is
        // treated as if it sat at rank `candidateLimit`.
        const rrfRank = rrfRankMap.get(r.file) || candidateLimit;
        let rrfWeight;
        if (rrfRank <= 3)
            rrfWeight = 0.75;
        else if (rrfRank <= 10)
            rrfWeight = 0.60;
        else
            rrfWeight = 0.40;
        const rrfScore = 1 / rrfRank;
        const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * r.score;
        const candidate = candidateMap.get(r.file);
        const chunkInfo = docChunkMap.get(r.file);
        const bestIdx = chunkInfo?.bestIdx ?? 0;
        const bestChunk = chunkInfo?.chunks[bestIdx]?.text || candidate?.body || "";
        const bestChunkPos = chunkInfo?.chunks[bestIdx]?.pos || 0;
        const trace = rrfTraceByFile?.get(r.file);
        const explainData = explain ? {
            ftsScores: trace?.contributions.filter(c => c.source === "fts").map(c => c.backendScore) ?? [],
            vectorScores: trace?.contributions.filter(c => c.source === "vec").map(c => c.backendScore) ?? [],
            rrf: {
                rank: rrfRank,
                positionScore: rrfScore,
                weight: rrfWeight,
                baseScore: trace?.baseScore ?? 0,
                topRankBonus: trace?.topRankBonus ?? 0,
                totalScore: trace?.totalScore ?? 0,
                contributions: trace?.contributions ?? [],
            },
            rerankScore: r.score,
            blendedScore,
        } : undefined;
        return {
            file: r.file,
            displayPath: candidate?.displayPath || "",
            title: candidate?.title || "",
            body: candidate?.body || "",
            bestChunk,
            bestChunkPos,
            score: blendedScore,
            context: store.getContextForFile(r.file),
            docid: docidMap.get(r.file) || "",
            ...(explainData ? { explain: explainData } : {}),
        };
    }).sort((a, b) => b.score - a.score);
    // Step 7: Dedup by file
    const seenFiles = new Set();
    return blended
        .filter(r => {
            if (seenFiles.has(r.file))
                return false;
            seenFiles.add(r.file);
            return true;
        })
        .filter(r => r.score >= minScore)
        .slice(0, limit);
}