store.ts 182 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060
  1. /**
  2. * QMD Store - Core data access and retrieval functions
  3. *
  4. * This module provides all database operations, search functions, and document
  5. * retrieval for QMD. It returns raw data structures that can be formatted by
  6. * CLI or MCP consumers.
  7. *
  8. * Usage:
  9. * const store = createStore("/path/to/db.sqlite");
  10. * // or use default path:
  11. * const store = createStore();
  12. */
  13. import { openDatabase, loadSqliteVec } from "./db.js";
  14. import type { Database } from "./db.js";
  15. import picomatch from "picomatch";
  16. import { createHash } from "crypto";
  17. import { readFileSync, realpathSync, statSync, mkdirSync } from "node:fs";
  18. // Note: node:path resolve is not imported — we export our own cross-platform resolve()
  19. import fastGlob from "fast-glob";
  20. import {
  21. LlamaCpp,
  22. getDefaultLlamaCpp,
  23. formatQueryForEmbedding,
  24. formatDocForEmbedding,
  25. withLLMSessionForLlm,
  26. type LLMSessionOptions,
  27. type RerankDocument,
  28. type ILLMSession,
  29. } from "./llm.js";
  30. import type {
  31. NamedCollection,
  32. Collection,
  33. CollectionConfig,
  34. ContextMap,
  35. } from "./collections.js";
  36. import {
  37. type EmbeddingProvider,
  38. assertModelCompatible,
  39. } from "./embedding/provider.js";
  40. // =============================================================================
  41. // Configuration
  42. // =============================================================================
// Home directory value returned by homedir(). Read from the HOME environment
// variable once at module load; falls back to "/tmp" when HOME is unset or empty.
const HOME = process.env.HOME || "/tmp";
// Exported defaults: model names, document glob, and size/batch limits.
// NOTE(review): the code that consumes these defaults is outside this section.
export const DEFAULT_EMBED_MODEL = "embeddinggemma";
export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
export const DEFAULT_GLOB = "**/*.md";
export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB
export const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
export const DEFAULT_EMBED_MAX_BATCH_BYTES = 64 * 1024 * 1024; // 64MB
// Chunking: 900 tokens per chunk with 15% overlap
// Increased from 800 to accommodate smart chunking finding natural break points
export const CHUNK_SIZE_TOKENS = 900;
export const CHUNK_OVERLAP_TOKENS = Math.floor(CHUNK_SIZE_TOKENS * 0.15); // 135 tokens (15% overlap)
// Fallback char-based approximation for sync chunking (~4 chars per token).
// These are the default maxChars / overlapChars of chunkDocumentWithBreakPoints().
export const CHUNK_SIZE_CHARS = CHUNK_SIZE_TOKENS * 4; // 3600 chars
export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4; // 540 chars
// Search window for finding optimal break points (200 tokens, ~800 chars).
// CHUNK_WINDOW_CHARS is the default look-back distance of findBestCutoff().
export const CHUNK_WINDOW_TOKENS = 200;
export const CHUNK_WINDOW_CHARS = CHUNK_WINDOW_TOKENS * 4; // 800 chars
  61. /**
  62. * Get the LlamaCpp instance for a store — prefers the store's own instance,
  63. * falls back to the global singleton.
  64. */
  65. function getLlm(store: Store): LlamaCpp {
  66. return store.llm ?? getDefaultLlamaCpp();
  67. }
  68. // =============================================================================
  69. // Smart Chunking - Break Point Detection
  70. // =============================================================================
/**
 * A potential break point in the document with a base score indicating quality.
 *
 * Produced by scanBreakPoints() and consumed by findBestCutoff(), which weighs
 * the base score against the distance from the desired cut position.
 */
export interface BreakPoint {
  // Character offset into the text. scanBreakPoints() records the start of the
  // regex match, i.e. the newline that precedes the heading/fence/list marker.
  pos: number; // character position
  // Base score before any distance decay is applied.
  score: number; // base score (higher = better break point)
  // Label of the pattern that produced this break point.
  type: string; // for debugging: 'h1', 'h2', 'blank', etc.
}
/**
 * A region where a code fence exists (between ``` markers).
 * We should never split inside a code fence.
 *
 * isInsideCodeFence() treats both bounds as exclusive: a position equal to
 * `start` or to `end` is not considered inside the region.
 */
export interface CodeFenceRegion {
  // Offset where the opening fence match begins (as recorded by findCodeFences()).
  start: number; // position of opening ```
  // Offset just past the closing fence match (findCodeFences() stores
  // match index + match length), or the document length if the fence is unclosed.
  end: number; // position of closing ``` (or document end if unclosed)
}
/**
 * Patterns for detecting break points in markdown documents.
 * Higher scores indicate better places to split.
 * Scores are spread wide so headings decisively beat lower-quality breaks.
 *
 * Each entry is a tuple of [pattern, base score, type label]. Every pattern
 * starts with "\n", so a match position is the offset of the newline that
 * precedes the construct.
 *
 * Ordering note: scanBreakPoints() keeps the highest-scoring entry per position
 * and only replaces an existing entry on a strictly greater score, so list
 * order decides the winner only between patterns with equal scores.
 */
export const BREAK_PATTERNS: [RegExp, number, string][] = [
  [/\n#{1}(?!#)/g, 100, 'h1'], // # but not ##
  [/\n#{2}(?!#)/g, 90, 'h2'], // ## but not ###
  [/\n#{3}(?!#)/g, 80, 'h3'], // ### but not ####
  [/\n#{4}(?!#)/g, 70, 'h4'], // #### but not #####
  [/\n#{5}(?!#)/g, 60, 'h5'], // ##### but not ######
  [/\n#{6}(?!#)/g, 50, 'h6'], // ######
  [/\n```/g, 80, 'codeblock'], // code block boundary (same as h3)
  [/\n(?:---|\*\*\*|___)\s*\n/g, 60, 'hr'], // horizontal rule
  [/\n\n+/g, 20, 'blank'], // paragraph boundary
  [/\n[-*]\s/g, 5, 'list'], // unordered list item
  [/\n\d+\.\s/g, 5, 'numlist'], // ordered list item
  [/\n/g, 1, 'newline'], // minimal break
];
  107. /**
  108. * Scan text for all potential break points.
  109. * Returns sorted array of break points with higher-scoring patterns taking precedence
  110. * when multiple patterns match the same position.
  111. */
  112. export function scanBreakPoints(text: string): BreakPoint[] {
  113. const points: BreakPoint[] = [];
  114. const seen = new Map<number, BreakPoint>(); // pos -> best break point at that pos
  115. for (const [pattern, score, type] of BREAK_PATTERNS) {
  116. for (const match of text.matchAll(pattern)) {
  117. const pos = match.index!;
  118. const existing = seen.get(pos);
  119. // Keep higher score if position already seen
  120. if (!existing || score > existing.score) {
  121. const bp = { pos, score, type };
  122. seen.set(pos, bp);
  123. }
  124. }
  125. }
  126. // Convert to array and sort by position
  127. for (const bp of seen.values()) {
  128. points.push(bp);
  129. }
  130. return points.sort((a, b) => a.pos - b.pos);
  131. }
  132. /**
  133. * Find all code fence regions in the text.
  134. * Code fences are delimited by ``` and we should never split inside them.
  135. */
  136. export function findCodeFences(text: string): CodeFenceRegion[] {
  137. const regions: CodeFenceRegion[] = [];
  138. const fencePattern = /\n```/g;
  139. let inFence = false;
  140. let fenceStart = 0;
  141. for (const match of text.matchAll(fencePattern)) {
  142. if (!inFence) {
  143. fenceStart = match.index!;
  144. inFence = true;
  145. } else {
  146. regions.push({ start: fenceStart, end: match.index! + match[0].length });
  147. inFence = false;
  148. }
  149. }
  150. // Handle unclosed fence - extends to end of document
  151. if (inFence) {
  152. regions.push({ start: fenceStart, end: text.length });
  153. }
  154. return regions;
  155. }
  156. /**
  157. * Check if a position is inside a code fence region.
  158. */
  159. export function isInsideCodeFence(pos: number, fences: CodeFenceRegion[]): boolean {
  160. return fences.some(f => pos > f.start && pos < f.end);
  161. }
  162. /**
  163. * Find the best cut position using scored break points with distance decay.
  164. *
  165. * Uses squared distance for gentler early decay - headings far back still win
  166. * over low-quality breaks near the target.
  167. *
  168. * @param breakPoints - Pre-scanned break points from scanBreakPoints()
  169. * @param targetCharPos - The ideal cut position (e.g., maxChars boundary)
  170. * @param windowChars - How far back to search for break points (default ~200 tokens)
  171. * @param decayFactor - How much to penalize distance (0.7 = 30% score at window edge)
  172. * @param codeFences - Code fence regions to avoid splitting inside
  173. * @returns The best position to cut at
  174. */
  175. export function findBestCutoff(
  176. breakPoints: BreakPoint[],
  177. targetCharPos: number,
  178. windowChars: number = CHUNK_WINDOW_CHARS,
  179. decayFactor: number = 0.7,
  180. codeFences: CodeFenceRegion[] = []
  181. ): number {
  182. const windowStart = targetCharPos - windowChars;
  183. let bestScore = -1;
  184. let bestPos = targetCharPos;
  185. for (const bp of breakPoints) {
  186. if (bp.pos < windowStart) continue;
  187. if (bp.pos > targetCharPos) break; // sorted, so we can stop
  188. // Skip break points inside code fences
  189. if (isInsideCodeFence(bp.pos, codeFences)) continue;
  190. const distance = targetCharPos - bp.pos;
  191. // Squared distance decay: gentle early, steep late
  192. // At target: multiplier = 1.0
  193. // At 25% back: multiplier = 0.956
  194. // At 50% back: multiplier = 0.825
  195. // At 75% back: multiplier = 0.606
  196. // At window edge: multiplier = 0.3
  197. const normalizedDist = distance / windowChars;
  198. const multiplier = 1.0 - (normalizedDist * normalizedDist) * decayFactor;
  199. const finalScore = bp.score * multiplier;
  200. if (finalScore > bestScore) {
  201. bestScore = finalScore;
  202. bestPos = bp.pos;
  203. }
  204. }
  205. return bestPos;
  206. }
  207. // =============================================================================
  208. // Chunk Strategy
  209. // =============================================================================
// Names of the selectable chunking strategies.
// NOTE(review): the code that interprets these values is outside this section;
// presumably "regex" uses only regex break points and the others involve the
// AST-aware path mentioned near mergeBreakPoints() — confirm against the consumer.
export type ChunkStrategy = "auto" | "regex" | "function";
  211. /**
  212. * Merge two sets of break points (e.g. regex + AST), keeping the highest
  213. * score at each position. Result is sorted by position.
  214. */
  215. export function mergeBreakPoints(a: BreakPoint[], b: BreakPoint[]): BreakPoint[] {
  216. const seen = new Map<number, BreakPoint>();
  217. for (const bp of a) {
  218. const existing = seen.get(bp.pos);
  219. if (!existing || bp.score > existing.score) {
  220. seen.set(bp.pos, bp);
  221. }
  222. }
  223. for (const bp of b) {
  224. const existing = seen.get(bp.pos);
  225. if (!existing || bp.score > existing.score) {
  226. seen.set(bp.pos, bp);
  227. }
  228. }
  229. return Array.from(seen.values()).sort((a, b) => a.pos - b.pos);
  230. }
  231. /**
  232. * Core chunk algorithm that operates on precomputed break points and code fences.
  233. * This is the shared implementation used by both regex-only and AST-aware chunking.
  234. */
  235. export function chunkDocumentWithBreakPoints(
  236. content: string,
  237. breakPoints: BreakPoint[],
  238. codeFences: CodeFenceRegion[],
  239. maxChars: number = CHUNK_SIZE_CHARS,
  240. overlapChars: number = CHUNK_OVERLAP_CHARS,
  241. windowChars: number = CHUNK_WINDOW_CHARS
  242. ): { text: string; pos: number }[] {
  243. if (content.length <= maxChars) {
  244. return [{ text: content, pos: 0 }];
  245. }
  246. const chunks: { text: string; pos: number }[] = [];
  247. let charPos = 0;
  248. while (charPos < content.length) {
  249. const targetEndPos = Math.min(charPos + maxChars, content.length);
  250. let endPos = targetEndPos;
  251. if (endPos < content.length) {
  252. const bestCutoff = findBestCutoff(
  253. breakPoints,
  254. targetEndPos,
  255. windowChars,
  256. 0.7,
  257. codeFences
  258. );
  259. if (bestCutoff > charPos && bestCutoff <= targetEndPos) {
  260. endPos = bestCutoff;
  261. }
  262. }
  263. if (endPos <= charPos) {
  264. endPos = Math.min(charPos + maxChars, content.length);
  265. }
  266. chunks.push({ text: content.slice(charPos, endPos), pos: charPos });
  267. if (endPos >= content.length) {
  268. break;
  269. }
  270. charPos = endPos - overlapChars;
  271. const lastChunkPos = chunks.at(-1)!.pos;
  272. if (charPos <= lastChunkPos) {
  273. charPos = endPos;
  274. }
  275. }
  276. return chunks;
  277. }
// Hybrid query: strong BM25 signal detection thresholds
// Skip expensive LLM expansion when top result is strong AND clearly separated from runner-up
// NOTE(review): the comparison logic that applies these thresholds is outside
// this section; presumably MIN_SCORE bounds the top score and MIN_GAP the
// difference to the second result — confirm at the call site.
export const STRONG_SIGNAL_MIN_SCORE = 0.85;
export const STRONG_SIGNAL_MIN_GAP = 0.15;
// Max candidates to pass to reranker — balances quality vs latency.
// 40 keeps rank 31-40 visible to the reranker (matters for recall on broad queries).
export const RERANK_CANDIDATE_LIMIT = 40;
/**
 * A typed query expansion result. Decoupled from llm.ts internal Queryable —
 * same shape, but store.ts owns its own public API type.
 *
 * The `type` discriminator selects which search backend the variant goes to:
 * - lex: keyword variant → routes to FTS only
 * - vec: semantic variant → routes to vector only
 * - hyde: hypothetical document → routes to vector only
 */
export type ExpandedQuery = {
  /** Kind of expansion; see the routing list above. */
  type: 'lex' | 'vec' | 'hyde';
  /** The expanded query text for this variant. */
  query: string;
  /** Optional line number for error reporting (CLI parser) */
  line?: number;
};
  299. // =============================================================================
  300. // Path utilities
  301. // =============================================================================
/**
 * Return the home directory used by this module.
 *
 * This is the module-level HOME constant: the HOME environment variable as it
 * was when the module loaded, or "/tmp" if it was unset or empty. Later changes
 * to process.env.HOME are not reflected.
 */
export function homedir(): string {
  return HOME;
}
  305. /**
  306. * Check if a path is absolute.
  307. * Supports:
  308. * - Unix paths: /path/to/file
  309. * - Windows native: C:\path or C:/path
  310. * - Git Bash: /c/path or /C/path (C-Z drives, excluding A/B floppy drives)
  311. *
  312. * Note: /c without trailing slash is treated as Unix path (directory named "c"),
  313. * while /c/ or /c/path are treated as Git Bash paths (C: drive).
  314. */
  315. export function isAbsolutePath(path: string): boolean {
  316. if (!path) return false;
  317. // Unix absolute path
  318. if (path.startsWith('/')) {
  319. // Check if it's a Git Bash style path like /c/ or /c/Users (C-Z only, not A or B)
  320. // Requires path[2] === '/' to distinguish from Unix paths like /c or /cache
  321. // Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter
  322. if (!isWSL() && path.length >= 3 && path[2] === '/') {
  323. const driveLetter = path[1];
  324. if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
  325. return true;
  326. }
  327. }
  328. // Any other path starting with / is Unix absolute
  329. return true;
  330. }
  331. // Windows native path: C:\ or C:/ (any letter A-Z)
  332. if (path.length >= 2 && /[a-zA-Z]/.test(path[0]!) && path[1] === ':') {
  333. return true;
  334. }
  335. return false;
  336. }
  337. /**
  338. * Normalize path separators to forward slashes.
  339. * Converts Windows backslashes to forward slashes.
  340. */
  341. export function normalizePathSeparators(path: string): string {
  342. return path.replace(/\\/g, '/');
  343. }
  344. /**
  345. * Detect if running inside WSL (Windows Subsystem for Linux).
  346. * On WSL, paths like /c/work/... are valid drvfs mount points, not Git Bash paths.
  347. */
  348. function isWSL(): boolean {
  349. return !!(process.env.WSL_DISTRO_NAME || process.env.WSL_INTEROP);
  350. }
  351. /**
  352. * Get the relative path from a prefix.
  353. * Returns null if path is not under prefix.
  354. * Returns empty string if path equals prefix.
  355. */
  356. export function getRelativePathFromPrefix(path: string, prefix: string): string | null {
  357. // Empty prefix is invalid
  358. if (!prefix) {
  359. return null;
  360. }
  361. const normalizedPath = normalizePathSeparators(path);
  362. const normalizedPrefix = normalizePathSeparators(prefix);
  363. // Ensure prefix ends with / for proper matching
  364. const prefixWithSlash = !normalizedPrefix.endsWith('/')
  365. ? normalizedPrefix + '/'
  366. : normalizedPrefix;
  367. // Exact match
  368. if (normalizedPath === normalizedPrefix) {
  369. return '';
  370. }
  371. // Check if path starts with prefix
  372. if (normalizedPath.startsWith(prefixWithSlash)) {
  373. return normalizedPath.slice(prefixWithSlash.length);
  374. }
  375. return null;
  376. }
  377. export function resolve(...paths: string[]): string {
  378. if (paths.length === 0) {
  379. throw new Error("resolve: at least one path segment is required");
  380. }
  381. // Normalize all paths to use forward slashes
  382. const normalizedPaths = paths.map(normalizePathSeparators);
  383. let result = '';
  384. let windowsDrive = '';
  385. // Check if first path is absolute
  386. const firstPath = normalizedPaths[0]!;
  387. if (isAbsolutePath(firstPath)) {
  388. result = firstPath;
  389. // Extract Windows drive letter if present
  390. if (firstPath.length >= 2 && /[a-zA-Z]/.test(firstPath[0]!) && firstPath[1] === ':') {
  391. windowsDrive = firstPath.slice(0, 2);
  392. result = firstPath.slice(2);
  393. } else if (!isWSL() && firstPath.startsWith('/') && firstPath.length >= 3 && firstPath[2] === '/') {
  394. // Git Bash style: /c/ -> C: (C-Z drives only, not A or B)
  395. // Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter
  396. const driveLetter = firstPath[1];
  397. if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
  398. windowsDrive = driveLetter.toUpperCase() + ':';
  399. result = firstPath.slice(2);
  400. }
  401. }
  402. } else {
  403. // Start with PWD or cwd, then append the first relative path
  404. const pwd = normalizePathSeparators(process.env.PWD || process.cwd());
  405. // Extract Windows drive from PWD if present
  406. if (pwd.length >= 2 && /[a-zA-Z]/.test(pwd[0]!) && pwd[1] === ':') {
  407. windowsDrive = pwd.slice(0, 2);
  408. result = pwd.slice(2) + '/' + firstPath;
  409. } else {
  410. result = pwd + '/' + firstPath;
  411. }
  412. }
  413. // Process remaining paths
  414. for (let i = 1; i < normalizedPaths.length; i++) {
  415. const p = normalizedPaths[i]!;
  416. if (isAbsolutePath(p)) {
  417. // Absolute path replaces everything
  418. result = p;
  419. // Update Windows drive if present
  420. if (p.length >= 2 && /[a-zA-Z]/.test(p[0]!) && p[1] === ':') {
  421. windowsDrive = p.slice(0, 2);
  422. result = p.slice(2);
  423. } else if (!isWSL() && p.startsWith('/') && p.length >= 3 && p[2] === '/') {
  424. // Git Bash style (C-Z drives only, not A or B)
  425. // Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter
  426. const driveLetter = p[1];
  427. if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
  428. windowsDrive = driveLetter.toUpperCase() + ':';
  429. result = p.slice(2);
  430. } else {
  431. windowsDrive = '';
  432. }
  433. } else {
  434. windowsDrive = '';
  435. }
  436. } else {
  437. // Relative path - append
  438. result = result + '/' + p;
  439. }
  440. }
  441. // Normalize . and .. components
  442. const parts = result.split('/').filter(Boolean);
  443. const normalized: string[] = [];
  444. for (const part of parts) {
  445. if (part === '..') {
  446. normalized.pop();
  447. } else if (part !== '.') {
  448. normalized.push(part);
  449. }
  450. }
  451. // Build final path
  452. const finalPath = '/' + normalized.join('/');
  453. // Prepend Windows drive if present
  454. if (windowsDrive) {
  455. return windowsDrive + finalPath;
  456. }
  457. return finalPath;
  458. }
// Flag to indicate production mode (set by qmd.ts at startup).
// While it is false, getDefaultDbPath() throws unless INDEX_PATH is set,
// instead of falling back to the default cache location.
let _productionMode = false;
/** Switch the module into production mode (sets the flag to true). */
export function enableProductionMode(): void {
  _productionMode = true;
}
/** Reset production mode flag — only for testing. */
export function _resetProductionModeForTesting(): void {
  _productionMode = false;
}
  468. export function getDefaultDbPath(indexName: string = "index"): string {
  469. // Always allow override via INDEX_PATH (for testing)
  470. if (process.env.INDEX_PATH) {
  471. return process.env.INDEX_PATH;
  472. }
  473. // In non-production mode (tests), require explicit path
  474. if (!_productionMode) {
  475. throw new Error(
  476. "Database path not set. Tests must set INDEX_PATH env var or use createStore() with explicit path. " +
  477. "This prevents tests from accidentally writing to the global index."
  478. );
  479. }
  480. const cacheDir = process.env.XDG_CACHE_HOME || resolve(homedir(), ".cache");
  481. const qmdCacheDir = resolve(cacheDir, "qmd");
  482. try { mkdirSync(qmdCacheDir, { recursive: true }); } catch { }
  483. return resolve(qmdCacheDir, `${indexName}.sqlite`);
  484. }
  485. export function getPwd(): string {
  486. return process.env.PWD || process.cwd();
  487. }
  488. export function getRealPath(path: string): string {
  489. try {
  490. return realpathSync(path);
  491. } catch {
  492. return resolve(path);
  493. }
  494. }
  495. // =============================================================================
  496. // Virtual Path Utilities (qmd://)
  497. // =============================================================================
/** Components of a `qmd://collection/path` virtual path, as produced by parseVirtualPath. */
export type VirtualPath = {
  /** First segment after `qmd://`. */
  collectionName: string;
  path: string; // relative path within collection ('' for the collection root)
};
  502. /**
  503. * Normalize explicit virtual path formats to standard qmd:// format.
  504. * Only handles paths that are already explicitly virtual:
  505. * - qmd://collection/path.md (already normalized)
  506. * - qmd:////collection/path.md (extra slashes - normalize)
  507. * - //collection/path.md (missing qmd: prefix - add it)
  508. *
  509. * Does NOT handle:
  510. * - collection/path.md (bare paths - could be filesystem relative)
  511. * - :linenum suffix (should be parsed separately before calling this)
  512. */
  513. export function normalizeVirtualPath(input: string): string {
  514. let path = input.trim();
  515. // Handle qmd:// with extra slashes: qmd:////collection/path -> qmd://collection/path
  516. if (path.startsWith('qmd:')) {
  517. // Remove qmd: prefix and normalize slashes
  518. path = path.slice(4);
  519. // Remove leading slashes and re-add exactly two
  520. path = path.replace(/^\/+/, '');
  521. return `qmd://${path}`;
  522. }
  523. // Handle //collection/path (missing qmd: prefix)
  524. if (path.startsWith('//')) {
  525. path = path.replace(/^\/+/, '');
  526. return `qmd://${path}`;
  527. }
  528. // Return as-is for other cases (filesystem paths, docids, bare collection/path, etc.)
  529. return path;
  530. }
  531. /**
  532. * Parse a virtual path like "qmd://collection-name/path/to/file.md"
  533. * into its components.
  534. * Also supports collection root: "qmd://collection-name/" or "qmd://collection-name"
  535. */
  536. export function parseVirtualPath(virtualPath: string): VirtualPath | null {
  537. // Normalize the path first
  538. const normalized = normalizeVirtualPath(virtualPath);
  539. // Match: qmd://collection-name[/optional-path]
  540. // Allows: qmd://name, qmd://name/, qmd://name/path
  541. const match = normalized.match(/^qmd:\/\/([^\/]+)\/?(.*)$/);
  542. if (!match?.[1]) return null;
  543. return {
  544. collectionName: match[1],
  545. path: match[2] ?? '', // Empty string for collection root
  546. };
  547. }
  548. /**
  549. * Build a virtual path from collection name and relative path.
  550. */
  551. export function buildVirtualPath(collectionName: string, path: string): string {
  552. return `qmd://${collectionName}/${path}`;
  553. }
  554. /**
  555. * Check if a path is explicitly a virtual path.
  556. * Only recognizes explicit virtual path formats:
  557. * - qmd://collection/path.md
  558. * - //collection/path.md
  559. *
  560. * Does NOT consider bare collection/path.md as virtual - that should be
  561. * handled separately by checking if the first component is a collection name.
  562. */
  563. export function isVirtualPath(path: string): boolean {
  564. const trimmed = path.trim();
  565. // Explicit qmd:// prefix (with any number of slashes)
  566. if (trimmed.startsWith('qmd:')) return true;
  567. // //collection/path format (missing qmd: prefix)
  568. if (trimmed.startsWith('//')) return true;
  569. return false;
  570. }
  571. /**
  572. * Resolve a virtual path to absolute filesystem path.
  573. */
  574. export function resolveVirtualPath(db: Database, virtualPath: string): string | null {
  575. const parsed = parseVirtualPath(virtualPath);
  576. if (!parsed) return null;
  577. const coll = getCollectionByName(db, parsed.collectionName);
  578. if (!coll) return null;
  579. return resolve(coll.pwd, parsed.path);
  580. }
  581. /**
  582. * Convert an absolute filesystem path to a virtual path.
  583. * Returns null if the file is not in any indexed collection.
  584. */
  585. export function toVirtualPath(db: Database, absolutePath: string): string | null {
  586. // Get all collections from DB
  587. const collections = getStoreCollections(db);
  588. // Find which collection this absolute path belongs to
  589. for (const coll of collections) {
  590. if (absolutePath.startsWith(coll.path + '/') || absolutePath === coll.path) {
  591. // Extract relative path
  592. const relativePath = absolutePath.startsWith(coll.path + '/')
  593. ? absolutePath.slice(coll.path.length + 1)
  594. : '';
  595. // Verify this document exists in the database
  596. const doc = db.prepare(`
  597. SELECT d.path
  598. FROM documents d
  599. WHERE d.collection = ? AND d.path = ? AND d.active = 1
  600. LIMIT 1
  601. `).get(coll.name, relativePath) as { path: string } | null;
  602. if (doc) {
  603. return buildVirtualPath(coll.name, relativePath);
  604. }
  605. }
  606. }
  607. return null;
  608. }
  609. // =============================================================================
  610. // Database initialization
  611. // =============================================================================
  612. function createSqliteVecUnavailableError(reason: string): Error {
  613. return new Error(
  614. "sqlite-vec extension is unavailable. " +
  615. `${reason}. ` +
  616. "Install Homebrew SQLite so the sqlite-vec extension can be loaded, " +
  617. "and set BREW_PREFIX if Homebrew is installed in a non-standard location."
  618. );
  619. }
  620. function getErrorMessage(err: unknown): string {
  621. return err instanceof Error ? err.message : String(err);
  622. }
  623. export function verifySqliteVecLoaded(db: Database): void {
  624. try {
  625. const row = db.prepare(`SELECT vec_version() AS version`).get() as { version?: string } | null;
  626. if (!row?.version || typeof row.version !== "string") {
  627. throw new Error("vec_version() returned no version");
  628. }
  629. } catch (err) {
  630. const message = getErrorMessage(err);
  631. throw createSqliteVecUnavailableError(`sqlite-vec probe failed (${message})`);
  632. }
  633. }
// Tri-state sqlite-vec availability: null until initializeDatabase() has run,
// then true or false depending on whether loading and probing succeeded.
let _sqliteVecAvailable: boolean | null = null;
  635. /**
  636. * Concurrency-friendly pragma defaults applied by `initializeDatabase`.
  637. * Each entry is `{ pragma, default, envVar }` so operators can override
  638. * any one knob via env without code changes.
  639. *
  640. * Defaults are tuned for the Oivo fleet shape — many concurrent MCP
  641. * processes (one per agent session) sharing a single ~10 GB index that
  642. * a 30-minute cron runs `qmd embed` against. See issue i-6sw24v09 for
  643. * the failure mode this prevents.
  644. */
  645. const CONCURRENCY_PRAGMAS: Array<{ pragma: string; defaultValue: string | number; envVar: string }> = [
  646. { pragma: "busy_timeout", defaultValue: 30000, envVar: "QMD_SQLITE_BUSY_TIMEOUT_MS" },
  647. { pragma: "synchronous", defaultValue: "NORMAL", envVar: "QMD_SQLITE_SYNCHRONOUS" },
  648. { pragma: "temp_store", defaultValue: "MEMORY", envVar: "QMD_SQLITE_TEMP_STORE" },
  649. { pragma: "cache_size", defaultValue: -65536, envVar: "QMD_SQLITE_CACHE_SIZE" }, // ~64 MiB
  650. { pragma: "mmap_size", defaultValue: 268435456, envVar: "QMD_SQLITE_MMAP_SIZE" }, // 256 MiB
  651. { pragma: "wal_autocheckpoint", defaultValue: 1000, envVar: "QMD_SQLITE_WAL_AUTOCHECKPOINT" },
  652. ];
  653. /**
  654. * Apply concurrency pragmas with env-var override support. Exported for
  655. * unit tests; consumers should rely on `initializeDatabase` instead.
  656. */
  657. export function applyConcurrencyPragmas(db: Database): void {
  658. for (const { pragma, defaultValue, envVar } of CONCURRENCY_PRAGMAS) {
  659. const override = process.env[envVar];
  660. let value: string | number = defaultValue;
  661. if (override !== undefined && override !== "") {
  662. // Numeric overrides parse as base-10 integers (also accepts negatives
  663. // for cache_size). Non-numeric overrides pass through as identifiers
  664. // (e.g. NORMAL, FULL, MEMORY) — SQLite validates them.
  665. const numericPragmas = new Set(["busy_timeout", "cache_size", "mmap_size", "wal_autocheckpoint"]);
  666. if (numericPragmas.has(pragma)) {
  667. const parsed = parseInt(override, 10);
  668. if (Number.isFinite(parsed)) value = parsed;
  669. } else {
  670. value = override;
  671. }
  672. }
  673. try {
  674. db.exec(`PRAGMA ${pragma} = ${value}`);
  675. } catch (err) {
  676. // Don't blow up on pragma failure — log + carry on. SQLite without
  677. // mmap support, for example, simply ignores mmap_size silently on
  678. // some builds, but a strict build can throw.
  679. const msg = err instanceof Error ? err.message : String(err);
  680. console.warn(`[qmd] PRAGMA ${pragma} = ${value} failed: ${msg}`);
  681. }
  682. }
  683. }
/**
 * Prepare an opened database connection for use by the store.
 *
 * In order: tries to load and verify sqlite-vec (recording the outcome in
 * `_sqliteVecAvailable`), enables WAL and foreign keys, applies the
 * concurrency pragmas, drops legacy tables, then creates — if missing — the
 * content, documents, llm_cache, content_vectors, store_collections and
 * store_config tables, the documents indexes, the FTS5 table and the
 * triggers that keep FTS in sync with `documents`.
 *
 * A sqlite-vec failure is logged with `console.warn` and does not throw.
 */
function initializeDatabase(db: Database): void {
  try {
    loadSqliteVec(db);
    verifySqliteVecLoaded(db);
    _sqliteVecAvailable = true;
  } catch (err) {
    // sqlite-vec is optional — vector search won't work but FTS is fine
    _sqliteVecAvailable = false;
    console.warn(getErrorMessage(err));
  }
  db.exec("PRAGMA journal_mode = WAL");
  db.exec("PRAGMA foreign_keys = ON");
  // Concurrency tuning — prevents reader timeouts during long writer windows
  // such as `qmd embed` (often 6-30 minutes on the Oivo fleet) which would
  // otherwise saturate the default 5s busy_timeout from better-sqlite3 and
  // surface as MCP transport timeouts in concurrent `qmd_query`/`qmd_status`
  // calls. See issue i-6sw24v09 for the empirical trace.
  //
  // - busy_timeout (default 30000 ms): readers wait through writer-held
  //   checkpoints instead of failing fast with SQLITE_BUSY.
  // - synchronous=NORMAL: WAL-safe (still durable across crashes), avoids
  //   the FULL fsync per transaction that compounds embed runtime.
  // - temp_store=MEMORY: keep FTS5 + vec sort scratch in RAM, not /tmp.
  // - cache_size: ~64 MiB per-connection page cache. Negative kibibyte
  //   form is the canonical SQLite idiom (positive = pages, negative = KiB).
  // - mmap_size: 256 MiB memory-mapped reads for the 10 GB index — cheap
  //   on Linux (lazy paging), no effect on non-mmap'd syscall fallback.
  // - wal_autocheckpoint: keep WAL bounded. Default 1000 pages is fine
  //   but setting it explicitly prevents drift when callers tune globally.
  //
  // Each pragma is overridable via env so operators can tune without a
  // code change. busy_timeout, cache_size, mmap_size and wal_autocheckpoint
  // overrides must parse as base-10 integers (otherwise the default is
  // kept); synchronous and temp_store take keyword values such as NORMAL
  // or MEMORY. See applyConcurrencyPragmas.
  applyConcurrencyPragmas(db);
  // Drop legacy tables that are now managed in YAML
  db.exec(`DROP TABLE IF EXISTS path_contexts`);
  db.exec(`DROP TABLE IF EXISTS collections`);
  // Content-addressable storage - the source of truth for document content
  db.exec(`
    CREATE TABLE IF NOT EXISTS content (
      hash TEXT PRIMARY KEY,
      doc TEXT NOT NULL,
      created_at TEXT NOT NULL
    )
  `);
  // Documents table - file system layer mapping virtual paths to content hashes
  // Collections are now managed in ~/.config/qmd/index.yml
  db.exec(`
    CREATE TABLE IF NOT EXISTS documents (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      collection TEXT NOT NULL,
      path TEXT NOT NULL,
      title TEXT NOT NULL,
      hash TEXT NOT NULL,
      created_at TEXT NOT NULL,
      modified_at TEXT NOT NULL,
      active INTEGER NOT NULL DEFAULT 1,
      FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
      UNIQUE(collection, path)
    )
  `);
  db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents(collection, active)`);
  db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(hash)`);
  db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_path ON documents(path, active)`);
  // Cache table for LLM API calls
  db.exec(`
    CREATE TABLE IF NOT EXISTS llm_cache (
      hash TEXT PRIMARY KEY,
      result TEXT NOT NULL,
      created_at TEXT NOT NULL
    )
  `);
  // Content vectors.
  // Migration: an existing content_vectors table without the `seq` column is
  // dropped together with vectors_vec. content_vectors is recreated right
  // below; vectors_vec is not recreated in this function.
  const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
  const hasSeqColumn = cvInfo.some(col => col.name === 'seq');
  // cvInfo is empty when the table does not exist yet, so nothing is dropped then.
  if (cvInfo.length > 0 && !hasSeqColumn) {
    db.exec(`DROP TABLE IF EXISTS content_vectors`);
    db.exec(`DROP TABLE IF EXISTS vectors_vec`);
  }
  db.exec(`
    CREATE TABLE IF NOT EXISTS content_vectors (
      hash TEXT NOT NULL,
      seq INTEGER NOT NULL DEFAULT 0,
      pos INTEGER NOT NULL DEFAULT 0,
      model TEXT NOT NULL,
      embedded_at TEXT NOT NULL,
      PRIMARY KEY (hash, seq)
    )
  `);
  // Store collections — makes the DB self-contained (no external config needed)
  db.exec(`
    CREATE TABLE IF NOT EXISTS store_collections (
      name TEXT PRIMARY KEY,
      path TEXT NOT NULL,
      pattern TEXT NOT NULL DEFAULT '**/*.md',
      ignore_patterns TEXT,
      include_by_default INTEGER DEFAULT 1,
      update_command TEXT,
      context TEXT
    )
  `);
  // Store config — key-value metadata (e.g. config_hash for sync optimization)
  db.exec(`
    CREATE TABLE IF NOT EXISTS store_config (
      key TEXT PRIMARY KEY,
      value TEXT
    )
  `);
  // FTS - index filepath (collection/path), title, and content
  db.exec(`
    CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
      filepath, title, body,
      tokenize='porter unicode61'
    )
  `);
  // Triggers to keep FTS in sync.
  // Insert: only active documents are added to the FTS table.
  db.exec(`
    CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents
    WHEN new.active = 1
    BEGIN
      INSERT INTO documents_fts(rowid, filepath, title, body)
      SELECT
        new.id,
        new.collection || '/' || new.path,
        new.title,
        (SELECT doc FROM content WHERE hash = new.hash)
      WHERE new.active = 1;
    END
  `);
  // Delete: remove the matching FTS row.
  db.exec(`
    CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
      DELETE FROM documents_fts WHERE rowid = old.id;
    END
  `);
  // Update: drop the FTS row when the document became inactive, otherwise
  // insert-or-replace it with the current values.
  db.exec(`
    CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents
    BEGIN
      -- Delete from FTS if no longer active
      DELETE FROM documents_fts WHERE rowid = old.id AND new.active = 0;
      -- Update FTS if still/newly active
      INSERT OR REPLACE INTO documents_fts(rowid, filepath, title, body)
      SELECT
        new.id,
        new.collection || '/' || new.path,
        new.title,
        (SELECT doc FROM content WHERE hash = new.hash)
      WHERE new.active = 1;
    END
  `);
}
  833. // =============================================================================
  834. // Store Collections — DB accessor functions
  835. // =============================================================================
/** Raw row shape of the `store_collections` table, as read with `SELECT *`. */
type StoreCollectionRow = {
  name: string;
  path: string;
  pattern: string;
  /** JSON-encoded string array, or null when no ignore patterns are stored. */
  ignore_patterns: string | null;
  /** 0 maps to `includeByDefault: false`; any other value leaves the field unset. */
  include_by_default: number;
  update_command: string | null;
  /** JSON-encoded ContextMap, or null when the collection has no contexts. */
  context: string | null;
};
  845. function rowToNamedCollection(row: StoreCollectionRow): NamedCollection {
  846. return {
  847. name: row.name,
  848. path: row.path,
  849. pattern: row.pattern,
  850. ...(row.ignore_patterns ? { ignore: JSON.parse(row.ignore_patterns) as string[] } : {}),
  851. ...(row.include_by_default === 0 ? { includeByDefault: false } : {}),
  852. ...(row.update_command ? { update: row.update_command } : {}),
  853. ...(row.context ? { context: JSON.parse(row.context) as ContextMap } : {}),
  854. };
  855. }
  856. export function getStoreCollections(db: Database): NamedCollection[] {
  857. const rows = db.prepare(`SELECT * FROM store_collections`).all() as StoreCollectionRow[];
  858. return rows.map(rowToNamedCollection);
  859. }
  860. export function getStoreCollection(db: Database, name: string): NamedCollection | null {
  861. const row = db.prepare(`SELECT * FROM store_collections WHERE name = ?`).get(name) as StoreCollectionRow | null | undefined;
  862. if (row == null) return null;
  863. return rowToNamedCollection(row);
  864. }
  865. export function getStoreGlobalContext(db: Database): string | undefined {
  866. const row = db.prepare(`SELECT value FROM store_config WHERE key = 'global_context'`).get() as { value: string } | null | undefined;
  867. if (row == null) return undefined;
  868. return row.value || undefined;
  869. }
  870. export function getStoreContexts(db: Database): Array<{ collection: string; path: string; context: string }> {
  871. const results: Array<{ collection: string; path: string; context: string }> = [];
  872. // Global context
  873. const globalCtx = getStoreGlobalContext(db);
  874. if (globalCtx) {
  875. results.push({ collection: "*", path: "/", context: globalCtx });
  876. }
  877. // Collection contexts
  878. const rows = db.prepare(`SELECT name, context FROM store_collections WHERE context IS NOT NULL`).all() as { name: string; context: string }[];
  879. for (const row of rows) {
  880. const ctxMap = JSON.parse(row.context) as ContextMap;
  881. for (const [path, context] of Object.entries(ctxMap)) {
  882. results.push({ collection: row.name, path, context });
  883. }
  884. }
  885. return results;
  886. }
  887. export function upsertStoreCollection(db: Database, name: string, collection: Omit<Collection, 'pattern'> & { pattern?: string }): void {
  888. db.prepare(`
  889. INSERT INTO store_collections (name, path, pattern, ignore_patterns, include_by_default, update_command, context)
  890. VALUES (?, ?, ?, ?, ?, ?, ?)
  891. ON CONFLICT(name) DO UPDATE SET
  892. path = excluded.path,
  893. pattern = excluded.pattern,
  894. ignore_patterns = excluded.ignore_patterns,
  895. include_by_default = excluded.include_by_default,
  896. update_command = excluded.update_command,
  897. context = excluded.context
  898. `).run(
  899. name,
  900. collection.path,
  901. collection.pattern || '**/*.md',
  902. collection.ignore ? JSON.stringify(collection.ignore) : null,
  903. collection.includeByDefault === false ? 0 : 1,
  904. collection.update || null,
  905. collection.context ? JSON.stringify(collection.context) : null,
  906. );
  907. }
  908. export function deleteStoreCollection(db: Database, name: string): boolean {
  909. const result = db.prepare(`DELETE FROM store_collections WHERE name = ?`).run(name);
  910. return result.changes > 0;
  911. }
  912. export function renameStoreCollection(db: Database, oldName: string, newName: string): boolean {
  913. // Check target doesn't exist
  914. const existing = db.prepare(`SELECT name FROM store_collections WHERE name = ?`).get(newName) as { name: string } | null | undefined;
  915. if (existing != null) {
  916. throw new Error(`Collection '${newName}' already exists`);
  917. }
  918. const result = db.prepare(`UPDATE store_collections SET name = ? WHERE name = ?`).run(newName, oldName);
  919. return result.changes > 0;
  920. }
  921. export function updateStoreContext(db: Database, collectionName: string, path: string, text: string): boolean {
  922. const row = db.prepare(`SELECT context FROM store_collections WHERE name = ?`).get(collectionName) as { context: string | null } | null | undefined;
  923. if (row == null) return false;
  924. const ctxMap: ContextMap = row.context ? JSON.parse(row.context) : {};
  925. ctxMap[path] = text;
  926. db.prepare(`UPDATE store_collections SET context = ? WHERE name = ?`).run(JSON.stringify(ctxMap), collectionName);
  927. return true;
  928. }
  929. export function removeStoreContext(db: Database, collectionName: string, path: string): boolean {
  930. const row = db.prepare(`SELECT context FROM store_collections WHERE name = ?`).get(collectionName) as { context: string | null } | null | undefined;
  931. if (row == null) return false;
  932. if (!row.context) return false;
  933. const ctxMap: ContextMap = JSON.parse(row.context);
  934. if (!(path in ctxMap)) return false;
  935. delete ctxMap[path];
  936. const newCtx = Object.keys(ctxMap).length > 0 ? JSON.stringify(ctxMap) : null;
  937. db.prepare(`UPDATE store_collections SET context = ? WHERE name = ?`).run(newCtx, collectionName);
  938. return true;
  939. }
  940. export function setStoreGlobalContext(db: Database, value: string | undefined): void {
  941. if (value === undefined) {
  942. db.prepare(`DELETE FROM store_config WHERE key = 'global_context'`).run();
  943. } else {
  944. db.prepare(`INSERT INTO store_config (key, value) VALUES ('global_context', ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value`).run(value);
  945. }
  946. }
  947. /**
  948. * Sync external config (YAML/inline) into SQLite store_collections.
  949. * External config always wins. Skips sync if config hash hasn't changed.
  950. */
  951. export function syncConfigToDb(db: Database, config: CollectionConfig): void {
  952. // Check config hash — skip sync if unchanged
  953. const configJson = JSON.stringify(config);
  954. const hash = createHash('sha256').update(configJson).digest('hex');
  955. const existingHash = db.prepare(`SELECT value FROM store_config WHERE key = 'config_hash'`).get() as { value: string } | null | undefined;
  956. if (existingHash != null && existingHash.value === hash) {
  957. return; // Config unchanged, skip sync
  958. }
  959. // Sync collections
  960. const configNames = new Set(Object.keys(config.collections));
  961. for (const [name, coll] of Object.entries(config.collections)) {
  962. upsertStoreCollection(db, name, coll);
  963. }
  964. // Delete collections not in config
  965. const dbCollections = db.prepare(`SELECT name FROM store_collections`).all() as { name: string }[];
  966. for (const row of dbCollections) {
  967. if (!configNames.has(row.name)) {
  968. db.prepare(`DELETE FROM store_collections WHERE name = ?`).run(row.name);
  969. }
  970. }
  971. // Sync global context
  972. if (config.global_context !== undefined) {
  973. setStoreGlobalContext(db, config.global_context);
  974. } else {
  975. setStoreGlobalContext(db, undefined);
  976. }
  977. // Save config hash
  978. db.prepare(`INSERT INTO store_config (key, value) VALUES ('config_hash', ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value`).run(hash);
  979. }
/**
 * Report whether sqlite-vec was loaded successfully.
 * Returns true only when the module-level flag is exactly `true`; the
 * initial `null` state and a failed load both report false.
 */
export function isSqliteVecAvailable(): boolean {
  return _sqliteVecAvailable === true;
}
  983. function ensureVecTableInternal(db: Database, dimensions: number): void {
  984. if (!_sqliteVecAvailable) {
  985. throw new Error("sqlite-vec is not available. Vector operations require a SQLite build with extension loading support.");
  986. }
  987. const tableInfo = db.prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get() as { sql: string } | null;
  988. if (tableInfo) {
  989. const match = tableInfo.sql.match(/float\[(\d+)\]/);
  990. const hasHashSeq = tableInfo.sql.includes('hash_seq');
  991. const hasCosine = tableInfo.sql.includes('distance_metric=cosine');
  992. const existingDims = match?.[1] ? parseInt(match[1], 10) : null;
  993. if (existingDims === dimensions && hasHashSeq && hasCosine) return;
  994. if (existingDims !== null && existingDims !== dimensions) {
  995. throw new Error(
  996. `Embedding dimension mismatch: existing vectors are ${existingDims}d but the current model produces ${dimensions}d. ` +
  997. `Run 'qmd embed -f' to re-embed with the new model.`
  998. );
  999. }
  1000. db.exec("DROP TABLE IF EXISTS vectors_vec");
  1001. }
  1002. db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}] distance_metric=cosine)`);
  1003. }
  1004. // =============================================================================
  1005. // Store Factory
  1006. // =============================================================================
  /**
   * A database handle bundled with the operations that act on it.
   *
   * Instances are built by `createStore`, which binds each function-valued
   * member to the store's own `db`.
   */
  export type Store = {
    /** Underlying database handle. */
    db: Database;
    /** Path the database was opened from. */
    dbPath: string;
    /** Optional LlamaCpp instance for this store (overrides the global singleton) */
    llm?: LlamaCpp;
    /** Close the underlying database. */
    close: () => void;
    /** Ensure the vector table exists for embeddings of the given length. */
    ensureVecTable: (dimensions: number) => void;

    // ---- Index health ----
    getHashesNeedingEmbedding: () => number;
    getIndexHealth: () => IndexHealthInfo;
    getStatus: () => IndexStatus;

    // ---- Caching ----
    getCacheKey: typeof getCacheKey;
    getCachedResult: (cacheKey: string) => string | null;
    setCachedResult: (cacheKey: string, result: string) => void;
    clearCache: () => void;

    // ---- Cleanup and maintenance ----
    deleteLLMCache: () => number;
    deleteInactiveDocuments: () => number;
    cleanupOrphanedContent: () => number;
    cleanupOrphanedVectors: () => number;
    vacuumDatabase: () => void;

    // ---- Context ----
    getContextForFile: (filepath: string) => string | null;
    getContextForPath: (collectionName: string, path: string) => string | null;
    getCollectionByName: (name: string) => { name: string; pwd: string; glob_pattern: string } | null;
    getCollectionsWithoutContext: () => Array<{ name: string; pwd: string; doc_count: number }>;
    getTopLevelPathsWithoutContext: (collectionName: string) => Array<string>;

    // ---- Virtual paths ----
    parseVirtualPath: typeof parseVirtualPath;
    buildVirtualPath: typeof buildVirtualPath;
    isVirtualPath: typeof isVirtualPath;
    resolveVirtualPath: (virtualPath: string) => string | null;
    toVirtualPath: (absolutePath: string) => string | null;

    // ---- Search ----
    searchFTS: (query: string, limit?: number, collectionName?: string) => Array<SearchResult>;
    searchVec: (
      query: string,
      model: string,
      limit?: number,
      collectionName?: string,
      session?: ILLMSession,
      precomputedEmbedding?: number[],
      embedProvider?: EmbeddingProvider,
    ) => Promise<Array<SearchResult>>;

    // ---- Query expansion & reranking ----
    expandQuery: (query: string, model?: string, intent?: string) => Promise<Array<ExpandedQuery>>;
    rerank: (
      query: string,
      documents: Array<{ file: string; text: string }>,
      model?: string,
      intent?: string,
    ) => Promise<Array<{ file: string; score: number }>>;

    // ---- Document retrieval ----
    findDocument: (filename: string, options?: { includeBody?: boolean }) => DocumentResult | DocumentNotFound;
    getDocumentBody: (
      doc: DocumentResult | { filepath: string },
      fromLine?: number,
      maxLines?: number,
    ) => string | null;
    findDocuments: (
      pattern: string,
      options?: { includeBody?: boolean; maxBytes?: number },
    ) => { docs: Array<MultiGetResult>; errors: Array<string> };

    // ---- Fuzzy matching and docid lookup ----
    findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => Array<string>;
    matchFilesByGlob: (pattern: string) => Array<{ filepath: string; displayPath: string; bodyLength: number }>;
    findDocumentByDocid: (docid: string) => { filepath: string; hash: string } | null;

    // ---- Document indexing operations ----
    insertContent: (hash: string, content: string, createdAt: string) => void;
    insertDocument: (
      collectionName: string,
      path: string,
      title: string,
      hash: string,
      createdAt: string,
      modifiedAt: string,
    ) => void;
    findActiveDocument: (collectionName: string, path: string) => { id: number; hash: string; title: string } | null;
    updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => void;
    updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => void;
    deactivateDocument: (collectionName: string, path: string) => void;
    getActiveDocumentPaths: (collectionName: string) => Array<string>;

    // ---- Vector/embedding operations ----
    getHashesForEmbedding: () => Array<{ hash: string; body: string; path: string }>;
    clearAllEmbeddings: () => void;
    insertEmbedding: (
      hash: string,
      seq: number,
      pos: number,
      embedding: Float32Array,
      model: string,
      embeddedAt: string,
    ) => void;
  };
  1068. // =============================================================================
  1069. // Reindex & Embed — pure-logic functions for SDK and CLI
  1070. // =============================================================================
  /**
   * Progress tick handed to the `onProgress` callback of `reindexCollection`.
   */
  export type ReindexProgress = {
    /** Path of the file just handled, relative to the collection root. */
    file: string;
    /** Number of files handled so far, including this one. */
    current: number;
    /** Total number of files selected for this reindex run. */
    total: number;
  };
  /**
   * Counters returned by `reindexCollection` describing what a run changed.
   */
  export type ReindexResult = {
    /** Documents newly inserted into the index. */
    indexed: number;
    /** Existing documents whose content or title was updated. */
    updated: number;
    /** Existing documents whose hash and title were both unchanged. */
    unchanged: number;
    /** Previously active documents deactivated because their path was not seen. */
    removed: number;
    /** Count reported by the orphaned-content cleanup that ends the run. */
    orphanedCleaned: number;
  };
  /**
   * Re-index a single collection by scanning the filesystem and updating the database.
   * Pure function — no console output, no db lifecycle management.
   *
   * Files are discovered with `fastGlob` under `collectionPath`; a fixed set of
   * build/vendor directories, any caller-supplied `ignorePatterns`, and every
   * path containing a dot-prefixed segment are skipped. Each remaining file is
   * hashed and compared with the active document stored for the same
   * (collection, handelized path):
   *   - no active document        → content + document inserted (`indexed`)
   *   - same hash, new title      → title updated (`updated`)
   *   - same hash, same title     → nothing written (`unchanged`)
   *   - different hash            → content inserted, document repointed (`updated`)
   * Unreadable and empty/whitespace-only files are skipped, but their path still
   * counts as "seen", so an existing document for them is not deactivated.
   * Afterwards every active document whose path was not seen is deactivated
   * and orphaned content is cleaned up.
   *
   * @param store - Store whose `db` is updated.
   * @param collectionPath - Root directory of the collection on disk.
   * @param globPattern - Glob, relative to `collectionPath`, selecting files to index.
   * @param collectionName - Name under which documents are stored.
   * @param options.ignorePatterns - Extra glob patterns to exclude.
   * @param options.onProgress - Called exactly once per selected file, whatever its outcome.
   * @returns Counters describing what changed.
   */
  export async function reindexCollection(
    store: Store,
    collectionPath: string,
    globPattern: string,
    collectionName: string,
    options?: {
      ignorePatterns?: string[];
      onProgress?: (info: ReindexProgress) => void;
    }
  ): Promise<ReindexResult> {
    const db = store.db;
    const now = new Date().toISOString();
    const excludeDirs = ["node_modules", ".git", ".cache", "vendor", "dist", "build"];
    const allIgnore = [
      ...excludeDirs.map(d => `**/${d}/**`),
      ...(options?.ignorePatterns || []),
    ];
    const allFiles: string[] = await fastGlob(globPattern, {
      cwd: collectionPath,
      onlyFiles: true,
      followSymbolicLinks: false,
      dot: false,
      ignore: allIgnore,
    });
    // Filter hidden files/folders (any path segment starting with ".")
    const files = allFiles.filter(file => !file.split("/").some(part => part.startsWith(".")));

    const total = files.length;
    let indexed = 0, updated = 0, unchanged = 0, processed = 0;
    const seenPaths = new Set<string>();

    // One progress tick per file, regardless of outcome, so `current` always
    // reaches `total` (previously empty files were counted but never reported).
    const reportProgress = (file: string): void => {
      processed++;
      options?.onProgress?.({ file, current: processed, total });
    };

    // `statSync` throws when the file disappears between the read and the stat.
    // Fall back to null so timestamps default to `now` instead of aborting the
    // whole run after the content row has already been written.
    const statOrNull = (target: string) => {
      try {
        return statSync(target);
      } catch {
        return null;
      }
    };

    for (const relativeFile of files) {
      const filepath = getRealPath(resolve(collectionPath, relativeFile));
      const path = handelize(relativeFile);
      seenPaths.add(path);

      let content: string;
      try {
        content = readFileSync(filepath, "utf-8");
      } catch {
        // Unreadable file: skip it, but keep its path "seen" so it is not deactivated.
        reportProgress(relativeFile);
        continue;
      }
      if (!content.trim()) {
        // Empty / whitespace-only file: nothing to index.
        reportProgress(relativeFile);
        continue;
      }

      const hash = await hashContent(content);
      const title = extractTitle(content, relativeFile);
      const existing = findActiveDocument(db, collectionName, path);

      if (existing) {
        if (existing.hash === hash) {
          if (existing.title !== title) {
            updateDocumentTitle(db, existing.id, title, now);
            updated++;
          } else {
            unchanged++;
          }
        } else {
          insertContent(db, hash, content, now);
          const stat = statOrNull(filepath);
          updateDocument(db, existing.id, title, hash,
            stat ? new Date(stat.mtime).toISOString() : now);
          updated++;
        }
      } else {
        indexed++;
        insertContent(db, hash, content, now);
        const stat = statOrNull(filepath);
        insertDocument(db, collectionName, path, title, hash,
          stat ? new Date(stat.birthtime).toISOString() : now,
          stat ? new Date(stat.mtime).toISOString() : now);
      }

      reportProgress(relativeFile);
    }

    // Deactivate documents that no longer exist
    const allActive = getActiveDocumentPaths(db, collectionName);
    let removed = 0;
    for (const path of allActive) {
      if (!seenPaths.has(path)) {
        deactivateDocument(db, collectionName, path);
        removed++;
      }
    }

    const orphanedCleaned = cleanupOrphanedContent(db);
    return { indexed, updated, unchanged, removed, orphanedCleaned };
  }
  /**
   * Progress snapshot passed to `EmbedOptions.onProgress` while embeddings are generated.
   */
  export type EmbedProgress = {
    /** Chunks whose embedding has been stored so far. */
    chunksEmbedded: number;
    /** Chunks produced so far; grows as further document batches are chunked. */
    totalChunks: number;
    /** Document bytes handled so far (estimated proportionally inside a batch). */
    bytesProcessed: number;
    /** Combined byte size of all documents selected for embedding. */
    totalBytes: number;
    /** Chunks that failed or were skipped so far. */
    errors: number;
  };
  /**
   * Summary returned by `generateEmbeddings`.
   */
  export type EmbedResult = {
    /** Number of documents that were selected for embedding. */
    docsProcessed: number;
    /** Number of chunk embeddings stored. */
    chunksEmbedded: number;
    /** Number of chunks that failed or were skipped. */
    errors: number;
    /** Wall-clock duration of the run in milliseconds (0 when nothing was pending). */
    durationMs: number;
  };
  /**
   * Options accepted by `generateEmbeddings`.
   */
  export type EmbedOptions = {
    /** Clear all existing embeddings before embedding starts. */
    force?: boolean;
    /** Model name used when no `embedProvider` is given; defaults to `DEFAULT_EMBED_MODEL`. */
    model?: string;
    /** Upper bound on documents per batch; must be a positive integer when given. */
    maxDocsPerBatch?: number;
    /** Upper bound on document bytes per batch; must be a positive integer when given. */
    maxBatchBytes?: number;
    /** Chunking strategy used for collections that do not configure their own. */
    chunkStrategy?: ChunkStrategy;
    /** Receives progress snapshots while embedding runs. */
    onProgress?: (info: EmbedProgress) => void;
    /**
     * Optional embedding provider. When supplied, embeddings are routed through
     * this provider (HTTP, GPU worker, etc.) instead of the local llama.cpp
     * session path. The provider's `getModelId()` is verified against existing
     * `content_vectors.model` rows; mismatch throws unless `force` is set.
     *
     * When omitted, embeddings come from the store's `LlamaCpp` (or the global
     * singleton), exactly as they did before providers were introduced.
     */
    embedProvider?: EmbeddingProvider;
  };
  /**
   * Metadata for one piece of content that still needs embeddings (body not loaded yet).
   */
  type PendingEmbeddingDoc = {
    /** Content hash identifying the document body. */
    hash: string;
    /** A document path that refers to this content. */
    path: string;
    /** Size of the stored body in bytes. */
    bytes: number;
    /** A collection that contains this content. */
    collection: string;
  };
  /**
   * A pending document together with its loaded body text.
   */
  type EmbeddingDoc = PendingEmbeddingDoc & {
    /** Full document text to be chunked and embedded. */
    body: string;
  };
  /**
   * One chunk of a document, queued for embedding.
   */
  type ChunkItem = {
    /** Content hash of the document the chunk belongs to. */
    hash: string;
    /** Title of the owning document. */
    title: string;
    /** Chunk text. */
    text: string;
    /** Zero-based index of the chunk within its document. */
    seq: number;
    /** Position of the chunk as reported by the chunker. */
    pos: number;
    /** Token count of the chunk as reported by the chunker. */
    tokens: number;
    /** UTF-8 encoded length of `text`. */
    bytes: number;
  };
  /**
   * Resolve a numeric option that must be a positive integer when supplied.
   *
   * @param name - Option name, used in the error message.
   * @param value - Caller-supplied value, or `undefined` when the option was omitted.
   * @param fallback - Returned as-is (without validation) when `value` is `undefined`.
   * @returns `value` when it is an integer >= 1, otherwise `fallback` for `undefined`.
   * @throws Error when `value` is defined but not an integer >= 1.
   */
  function validatePositiveIntegerOption(name: string, value: number | undefined, fallback: number): number {
    if (value === undefined) {
      return fallback;
    }
    const isPositiveInteger = Number.isInteger(value) && value >= 1;
    if (isPositiveInteger) {
      return value;
    }
    throw new Error(`${name} must be a positive integer`);
  }
  /**
   * Resolve the batching limits for an embedding run.
   *
   * Each limit falls back to its module-level default when omitted and is
   * rejected when it is not a positive integer.
   *
   * @param options - Caller-supplied embed options, if any.
   * @returns The effective `maxDocsPerBatch` and `maxBatchBytes`.
   * @throws Error when a supplied limit is not a positive integer.
   */
  function resolveEmbedOptions(options?: EmbedOptions): Required<Pick<EmbedOptions, "maxDocsPerBatch" | "maxBatchBytes">> {
    const maxDocsPerBatch = validatePositiveIntegerOption(
      "maxDocsPerBatch",
      options?.maxDocsPerBatch,
      DEFAULT_EMBED_MAX_DOCS_PER_BATCH,
    );
    const maxBatchBytes = validatePositiveIntegerOption(
      "maxBatchBytes",
      options?.maxBatchBytes,
      DEFAULT_EMBED_MAX_BATCH_BYTES,
    );
    return { maxDocsPerBatch, maxBatchBytes };
  }
  /**
   * List content that is referenced by at least one active document but has no
   * embedding yet (no `content_vectors` row with `seq = 0`).
   *
   * One row is returned per content hash, ordered by its smallest path.
   *
   * @param db - Open database handle.
   * @returns Hash, representative path and collection, and body size in bytes.
   */
  function getPendingEmbeddingDocs(db: Database): PendingEmbeddingDoc[] {
    // `MIN(d.collection)` deterministically picks one collection per hash when
    // the same content is indexed in multiple collections (the smallest name
    // wins). Note that `MIN(d.path)` is computed independently, so the path and
    // the collection may come from different document rows. The identical bytes
    // produce identical chunks regardless of which collection wins; the
    // chunkStrategy lookup still resolves via that collection's YAML config.
    // See Phase 2 design notes (i-bud0h8vu).
    const pendingQuery = db.prepare(`
      SELECT d.hash, MIN(d.path) as path, MIN(d.collection) as collection, length(CAST(c.doc AS BLOB)) as bytes
      FROM documents d
      JOIN content c ON d.hash = c.hash
      LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
      WHERE d.active = 1 AND v.hash IS NULL
      GROUP BY d.hash
      ORDER BY MIN(d.path)
    `);
    const rows = pendingQuery.all();
    return rows as PendingEmbeddingDoc[];
  }
  /**
   * Split pending documents into consecutive batches bounded by count and size.
   *
   * Input order is preserved. A new batch is started when the open batch
   * already holds `maxDocsPerBatch` documents, or when adding the next document
   * would push a non-empty batch above `maxBatchBytes`. A single document larger
   * than `maxBatchBytes` therefore still gets a batch of its own. Negative byte
   * sizes are counted as 0.
   *
   * @param docs - Documents to group.
   * @param maxDocsPerBatch - Maximum number of documents per batch.
   * @param maxBatchBytes - Maximum combined byte size per batch.
   * @returns The batches, in order; empty when `docs` is empty.
   */
  function buildEmbeddingBatches(
    docs: PendingEmbeddingDoc[],
    maxDocsPerBatch: number,
    maxBatchBytes: number,
  ): PendingEmbeddingDoc[][] {
    const groups: PendingEmbeddingDoc[][] = [];
    let open: PendingEmbeddingDoc[] = [];
    let openBytes = 0;

    // Close the batch being filled and start a fresh one.
    const flush = (): void => {
      groups.push(open);
      open = [];
      openBytes = 0;
    };

    docs.forEach((entry) => {
      const size = Math.max(0, entry.bytes);
      const docLimitReached = open.length >= maxDocsPerBatch;
      const byteLimitExceeded = open.length > 0 && openBytes + size > maxBatchBytes;
      if (docLimitReached || byteLimitExceeded) {
        flush();
      }
      open.push(entry);
      openBytes += size;
    });

    if (open.length > 0) {
      flush();
    }
    return groups;
  }
  /**
   * Load the body text for every document in a batch.
   *
   * Bodies are fetched from the `content` table in a single query keyed by
   * hash. The result keeps the order of `batch`; a document whose hash has no
   * content row gets an empty body.
   *
   * @param db - Open database handle.
   * @param batch - Documents whose bodies should be loaded.
   * @returns The batch entries, each extended with `body`.
   */
  function getEmbeddingDocsForBatch(db: Database, batch: PendingEmbeddingDoc[]): EmbeddingDoc[] {
    if (batch.length === 0) {
      return [];
    }

    const hashes = batch.map(doc => doc.hash);
    // One positional placeholder per hash; the values themselves are bound, not interpolated.
    const placeholders = hashes.map(() => "?").join(",");
    const rows = db.prepare(`
      SELECT hash, doc as body
      FROM content
      WHERE hash IN (${placeholders})
    `).all(...hashes) as { hash: string; body: string }[];

    const bodyByHash = new Map<string, string>();
    for (const row of rows) {
      bodyByHash.set(row.hash, row.body);
    }

    return batch.map((doc) => {
      const body = bodyByHash.get(doc.hash) ?? "";
      return { ...doc, body };
    });
  }
  /**
   * Run `body` with a session-shaped argument that supplies an AbortSignal and
   * an `isValid` flag.
   *
   * Without a `provider`, a real LLM session is created via
   * `withLLMSessionForLlm(getLlm(store), ...)`, so `body` can use
   * `session.embed` / `session.embedBatch` for the local path.
   *
   * With a `provider`, the session is a lightweight AbortController-backed stub:
   * `getLlm(store)` is never called and `withLLMSessionForLlm` is bypassed
   * entirely, so node-llama-cpp is not warmed up on remote-only deployments
   * (i-08ovbvtb, follow-up to i-qkarfffa). The stub's LLM-only methods
   * (embed/embedBatch/expandQuery/rerank) reject if called — they MUST NOT be
   * reached when a provider is set, because embedding is supposed to be routed
   * through the provider. The stub's signal is aborted once `body` settles.
   * `options` is only used on the real-session path.
   *
   * @param store - Store used to resolve the local LLM when no provider is given.
   * @param provider - Optional embedding provider; selects the stub session when set.
   * @param body - Work to run with the session.
   * @param options - Session options forwarded to `withLLMSessionForLlm`.
   * @returns Whatever `body` resolves to.
   */
  async function withEmbedSession<T>(
    store: Store,
    provider: EmbeddingProvider | undefined,
    body: (session: ILLMSession) => Promise<T>,
    options?: LLMSessionOptions,
  ): Promise<T> {
    // Local path: delegate to a real LLM session.
    if (!provider) {
      return withLLMSessionForLlm(getLlm(store), body, options);
    }

    const controller = new AbortController();

    // Build a method that always rejects, naming the session method that was misused.
    const forbidden = (method: string) => async (): Promise<never> => {
      throw new Error(`withEmbedSession: provider supplied — session.${method} must not be called`);
    };

    const stubSession: ILLMSession = {
      get signal() { return controller.signal; },
      get isValid() { return !controller.signal.aborted; },
      embed: forbidden("embed"),
      embedBatch: forbidden("embedBatch"),
      expandQuery: forbidden("expandQuery"),
      rerank: forbidden("rerank"),
    };

    try {
      return await body(stubSession);
    } finally {
      // Invalidate the stub so nothing keeps using it after the body has finished.
      controller.abort();
    }
  }
  /**
   * Generate vector embeddings for documents that need them.
   *
   * No db lifecycle management. Progress is reported through
   * `options.onProgress`; the only console output is a warning when the session
   * expires or the error rate forces an abort. Uses `options.embedProvider` when
   * supplied, otherwise the store's LlamaCpp instance (or the global singleton).
   *
   * Steps:
   *   1. With a provider and without `force`, check the provider's model id
   *      against the models already stored.
   *   2. With `force`, clear all existing embeddings.
   *   3. Collect pending documents and split them into batches bounded by
   *      `maxDocsPerBatch` / `maxBatchBytes`.
   *   4. Per batch: load bodies, chunk them, embed the chunks in groups of 32
   *      and store each resulting vector. A failed group is retried chunk by chunk.
   *   5. The first chunk is embedded once up front (with a single retry) to learn
   *      the vector dimension needed to create the vector table.
   *
   * The run stops early when the session becomes invalid, or when more than 80%
   * of the chunks handled so far have failed. In both cases the chunks left in
   * the current batch are counted as errors and no further batches are started.
   *
   * @param store - Store to read documents from and write embeddings to.
   * @param options - See `EmbedOptions`.
   * @returns Counts of documents selected, chunks embedded and errors, plus duration.
   * @throws Error when a batching option is not a positive integer, or when the
   *         dimension probe on the first chunk fails even after the retry.
   */
  export async function generateEmbeddings(
    store: Store,
    options?: EmbedOptions
  ): Promise<EmbedResult> {
    const db = store.db;
    const model = options?.model ?? DEFAULT_EMBED_MODEL;
    const now = new Date().toISOString();
    const { maxDocsPerBatch, maxBatchBytes } = resolveEmbedOptions(options);
    const encoder = new TextEncoder();

    // Migration safety: if an embedProvider is supplied, verify its model id
    // matches the existing content_vectors rows (unless we're about to clear
    // them via `force`). This must happen BEFORE we clear vectors so users
    // who pass `--force` aren't blocked.
    if (options?.embedProvider && !options.force) {
      const existing = getDistinctEmbeddingModels(db);
      assertModelCompatible(options.embedProvider.getModelId(), existing);
    }

    if (options?.force) {
      clearAllEmbeddings(db);
    }

    const docsToEmbed = getPendingEmbeddingDocs(db);
    if (docsToEmbed.length === 0) {
      return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 };
    }

    const totalBytes = docsToEmbed.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
    const totalDocs = docsToEmbed.length;
    const startTime = Date.now();

    // Per-collection chunkStrategy lookup (Phase 2 — i-bud0h8vu). YAML
    // `chunkStrategy` on a collection wins over `options.chunkStrategy`
    // (global CLI flag); falls back to the global option, then to
    // chunkDocumentByTokens' own "regex" default when neither is set.
    // Opt-in per collection — collections without the field are untouched.
    const collectionStrategies = new Map<string, ChunkStrategy>();
    try {
      const { listCollections: listYamlCollections } = await import("./collections.js");
      for (const c of listYamlCollections()) {
        if (c.chunkStrategy) collectionStrategies.set(c.name, c.chunkStrategy);
      }
    } catch {
      // If YAML config is missing/unreadable, fall back silently to the
      // global strategy — no collection overrides. Keeps SDK/inline
      // callers that never touch ~/.config/qmd working.
    }

    // Provider routing — when an EmbeddingProvider is supplied, embed calls go
    // through it (HTTP, GPU worker, etc.). Otherwise, use the LLM session path.
    // The outer session is still created for its abort signal (chunking uses
    // `session.signal` for cooperative cancellation).
    const provider = options?.embedProvider;
    const providerModel = provider?.getModelId() ?? model;

    // Resolve `embedModelUri` (used for formatting prefixes etc.) lazily —
    // when `provider` is set, take it from the provider; otherwise fall back
    // to the local LlamaCpp's embed model name. Accessing `getLlm(store)` is
    // deferred to the non-provider branch so remote-only deployments do not
    // construct a `LlamaCpp` instance just to read its embedModelName.
    const embedModelUri = provider
      ? provider.getModelId()
      : getLlm(store).embedModelName;

    // Run the embedding loop inside a session-scoped wrapper. When `provider`
    // is set, this short-circuits the local LLM warm-up entirely (i-08ovbvtb).
    const result = await withEmbedSession(store, provider, async (session) => {
      let chunksEmbedded = 0;
      let errors = 0;
      let bytesProcessed = 0;
      let totalChunks = 0;
      let vectorTableInitialized = false;
      // Set once the error-rate threshold trips; stops the outer batch loop too.
      let abortedOnErrorRate = false;
      const BATCH_SIZE = 32;
      const batches = buildEmbeddingBatches(docsToEmbed, maxDocsPerBatch, maxBatchBytes);

      // Embedding helpers — single point of provider/session selection.
      // Both return the same shape as ILLMSession.embed/embedBatch so the
      // rest of the loop is unchanged.
      const embedOne = async (
        text: string,
        modelArg: string,
      ): Promise<{ embedding: number[]; model: string } | null> => {
        if (provider) {
          // The session signal is only forwarded to providers of kind 'local'.
          const sig = provider.kind === 'local' ? session.signal : undefined;
          const r = await provider.embed(text, { model: modelArg, signal: sig });
          return r ? { embedding: r.embedding, model: r.model } : null;
        }
        return session.embed(text, { model: modelArg });
      };

      const embedMany = async (
        texts: string[],
        modelArg: string,
      ): Promise<({ embedding: number[]; model: string } | null)[]> => {
        if (provider) {
          const sig = provider.kind === 'local' ? session.signal : undefined;
          const r = await provider.embedBatch(texts, { model: modelArg, signal: sig });
          return r.map((x) => (x ? { embedding: x.embedding, model: x.model } : null));
        }
        return session.embedBatch(texts, { model: modelArg });
      };

      // JS-only token estimator for the provider path. Char-based with
      // avgCharsPerToken=3 — matches the heuristic the chunker already
      // uses for its initial char-space pass, so the safety re-split is a
      // near no-op while populating the `tokens` field with a stable
      // estimate. CRITICAL: avoids loading node-llama-cpp on remote-only
      // deployments (`QMD_EMBED_ENDPOINT=...`). i-1rqixh6m DoD #1.
      const chunkTokenizer: TokenCounter | undefined = provider
        ? (text: string) => Math.ceil(text.length / 3)
        : undefined;

      for (const batchMeta of batches) {
        // Abort early if session has been invalidated
        if (!session.isValid) {
          console.warn(`⚠ Session expired — skipping remaining document batches`);
          break;
        }

        const batchDocs = getEmbeddingDocsForBatch(db, batchMeta);
        const batchChunks: ChunkItem[] = [];
        const batchBytes = batchMeta.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);

        for (const doc of batchDocs) {
          if (!doc.body.trim()) continue;
          const title = extractTitle(doc.body, doc.path);
          const perCollectionStrategy = collectionStrategies.get(doc.collection);
          const chunkStrategy = perCollectionStrategy ?? options?.chunkStrategy;
          const chunks = await chunkDocumentByTokens(
            doc.body,
            undefined, undefined, undefined,
            doc.path,
            chunkStrategy,
            session.signal,
            chunkTokenizer,
          );
          for (let seq = 0; seq < chunks.length; seq++) {
            batchChunks.push({
              hash: doc.hash,
              title,
              text: chunks[seq]!.text,
              seq,
              pos: chunks[seq]!.pos,
              tokens: chunks[seq]!.tokens,
              bytes: encoder.encode(chunks[seq]!.text).length,
            });
          }
        }

        totalChunks += batchChunks.length;

        if (batchChunks.length === 0) {
          // Nothing to embed in this batch (all bodies empty) — still advance progress.
          bytesProcessed += batchBytes;
          options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors });
          continue;
        }

        if (!vectorTableInitialized) {
          const firstChunk = batchChunks[0]!;
          const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title, embedModelUri);
          // Single retry on transient failure (issue i-vm1lxwry). The provider
          // swallows per-chunk errors per its contract — `getLastError?.()`
          // surfaces the actual cause (HTTP status / abort / parse error) so we
          // can include it in the thrown message instead of the cryptic
          // "Failed to get embedding dimensions from first chunk".
          let firstResult = await embedOne(firstText, providerModel);
          if (!firstResult && session.isValid) {
            const firstErr = provider?.getLastError?.();
            // Brief backoff before retry — embedding worker may be re-warming
            // a model or the GPU host may be transiently busy. 250ms is short
            // enough to be invisible on the happy path and long enough to
            // clear most "thundering-herd" race conditions.
            await new Promise((resolve) => setTimeout(resolve, 250));
            if (process.env.QMD_EMBED_DEBUG) {
              process.stderr.write(
                `qmd embed: first-chunk dimension probe failed, retrying once${firstErr ? ` (last error: ${firstErr})` : ""}\n`,
              );
            }
            firstResult = await embedOne(firstText, providerModel);
          }
          if (!firstResult) {
            const lastErr = provider?.getLastError?.();
            const providerHint = provider ? `provider=${provider.kind}` : "provider=session";
            const errSuffix = lastErr ? ` — underlying: ${lastErr}` : "";
            const debugHint = process.env.QMD_EMBED_DEBUG
              ? ""
              : " (set QMD_EMBED_DEBUG=1 for per-chunk traces)";
            throw new Error(
              `Failed to get embedding dimensions from first chunk after retry [${providerHint}]${errSuffix}${debugHint}`,
            );
          }
          store.ensureVecTable(firstResult.embedding.length);
          vectorTableInitialized = true;
        }

        const totalBatchChunkBytes = batchChunks.reduce((sum, chunk) => sum + chunk.bytes, 0);
        let batchChunkBytesProcessed = 0;

        for (let batchStart = 0; batchStart < batchChunks.length; batchStart += BATCH_SIZE) {
          // Abort early if session has been invalidated (e.g. max duration exceeded)
          if (!session.isValid) {
            const remaining = batchChunks.length - batchStart;
            errors += remaining;
            console.warn(`⚠ Session expired — skipping ${remaining} remaining chunks`);
            break;
          }

          // Abort early if error rate is too high (>80% of processed chunks failed)
          const processed = chunksEmbedded + errors;
          if (processed >= BATCH_SIZE && errors > processed * 0.8) {
            const remaining = batchChunks.length - batchStart;
            // Log the counts that tripped the threshold BEFORE folding in the
            // skipped chunks, otherwise the ratio can exceed 100%.
            console.warn(`⚠ Error rate too high (${errors}/${processed}) — aborting embedding`);
            errors += remaining;
            abortedOnErrorRate = true;
            break;
          }

          const batchEnd = Math.min(batchStart + BATCH_SIZE, batchChunks.length);
          const chunkBatch = batchChunks.slice(batchStart, batchEnd);
          const texts = chunkBatch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title, embedModelUri));

          try {
            const embeddings = await embedMany(texts, providerModel);
            for (let i = 0; i < chunkBatch.length; i++) {
              const chunk = chunkBatch[i]!;
              const embedding = embeddings[i];
              if (embedding) {
                insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), providerModel, now);
                chunksEmbedded++;
              } else {
                errors++;
              }
              batchChunkBytesProcessed += chunk.bytes;
            }
          } catch {
            // Batch failed — try individual embeddings as fallback
            // But skip if session is already invalid (avoids N doomed retries)
            if (!session.isValid) {
              errors += chunkBatch.length;
              batchChunkBytesProcessed += chunkBatch.reduce((sum, c) => sum + c.bytes, 0);
            } else {
              for (const chunk of chunkBatch) {
                try {
                  const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
                  const single = await embedOne(text, providerModel);
                  if (single) {
                    insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(single.embedding), providerModel, now);
                    chunksEmbedded++;
                  } else {
                    errors++;
                  }
                } catch {
                  errors++;
                }
                batchChunkBytesProcessed += chunk.bytes;
              }
            }
          }

          // Within a batch, progress in document bytes is estimated from the
          // share of chunk bytes handled so far, capped at the batch size.
          const proportionalBytes = totalBatchChunkBytes === 0
            ? batchBytes
            : Math.min(batchBytes, Math.round((batchChunkBytesProcessed / totalBatchChunkBytes) * batchBytes));
          options?.onProgress?.({
            chunksEmbedded,
            totalChunks,
            bytesProcessed: bytesProcessed + proportionalBytes,
            totalBytes,
            errors,
          });
        }

        bytesProcessed += batchBytes;
        options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors });

        // The error-rate check is cumulative and cannot recover once tripped, so
        // loading and chunking further batches would only waste work and repeat
        // the warning. Stop here.
        if (abortedOnErrorRate) break;
      }

      return { chunksEmbedded, errors };
    }, { maxDuration: 30 * 60 * 1000, name: 'generateEmbeddings' });

    return {
      docsProcessed: totalDocs,
      chunksEmbedded: result.chunksEmbedded,
      errors: result.errors,
      durationMs: Date.now() - startTime,
    };
  }
  /**
   * Create a new store instance with the given database path.
   * If no path is provided, the path returned by `getDefaultDbPath()` is used.
   *
   * The database is opened and its schema initialized before the store is
   * returned. If initialization fails, the freshly opened handle is closed
   * again before the error is rethrown, so a failed call does not leak an open
   * database.
   *
   * @param dbPath - Path to the SQLite database file
   * @returns Store instance with all methods bound to the database
   * @throws Whatever `openDatabase` or `initializeDatabase` throws.
   */
  export function createStore(dbPath?: string): Store {
    const resolvedPath = dbPath || getDefaultDbPath();
    const db = openDatabase(resolvedPath);
    try {
      initializeDatabase(db);
    } catch (err) {
      // Do not leak the handle when schema setup fails.
      try {
        db.close();
      } catch {
        // Ignore a secondary failure; the initialization error is the one to surface.
      }
      throw err;
    }

    const store: Store = {
      db,
      dbPath: resolvedPath,
      close: () => db.close(),
      ensureVecTable: (dimensions: number) => ensureVecTableInternal(db, dimensions),

      // Index health
      getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db),
      getIndexHealth: () => getIndexHealth(db),
      getStatus: () => getStatus(db),

      // Caching
      getCacheKey,
      getCachedResult: (cacheKey: string) => getCachedResult(db, cacheKey),
      setCachedResult: (cacheKey: string, result: string) => setCachedResult(db, cacheKey, result),
      clearCache: () => clearCache(db),

      // Cleanup and maintenance
      deleteLLMCache: () => deleteLLMCache(db),
      deleteInactiveDocuments: () => deleteInactiveDocuments(db),
      cleanupOrphanedContent: () => cleanupOrphanedContent(db),
      cleanupOrphanedVectors: () => cleanupOrphanedVectors(db),
      vacuumDatabase: () => vacuumDatabase(db),

      // Context
      getContextForFile: (filepath: string) => getContextForFile(db, filepath),
      getContextForPath: (collectionName: string, path: string) => getContextForPath(db, collectionName, path),
      getCollectionByName: (name: string) => getCollectionByName(db, name),
      getCollectionsWithoutContext: () => getCollectionsWithoutContext(db),
      getTopLevelPathsWithoutContext: (collectionName: string) => getTopLevelPathsWithoutContext(db, collectionName),

      // Virtual paths
      parseVirtualPath,
      buildVirtualPath,
      isVirtualPath,
      resolveVirtualPath: (virtualPath: string) => resolveVirtualPath(db, virtualPath),
      toVirtualPath: (absolutePath: string) => toVirtualPath(db, absolutePath),

      // Search
      searchFTS: (query: string, limit?: number, collectionName?: string) => searchFTS(db, query, limit, collectionName),
      searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding, embedProvider),

      // Query expansion & reranking — `store.llm` is read at call time, so an
      // LlamaCpp instance assigned after creation is picked up.
      expandQuery: (query: string, model?: string, intent?: string) => expandQuery(query, model, db, intent, store.llm),
      rerank: (query: string, documents: { file: string; text: string }[], model?: string, intent?: string) => rerank(query, documents, model, db, intent, store.llm),

      // Document retrieval
      findDocument: (filename: string, options?: { includeBody?: boolean }) => findDocument(db, filename, options),
      getDocumentBody: (doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number) => getDocumentBody(db, doc, fromLine, maxLines),
      findDocuments: (pattern: string, options?: { includeBody?: boolean; maxBytes?: number }) => findDocuments(db, pattern, options),

      // Fuzzy matching and docid lookup
      findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => findSimilarFiles(db, query, maxDistance, limit),
      matchFilesByGlob: (pattern: string) => matchFilesByGlob(db, pattern),
      findDocumentByDocid: (docid: string) => findDocumentByDocid(db, docid),

      // Document indexing operations
      insertContent: (hash: string, content: string, createdAt: string) => insertContent(db, hash, content, createdAt),
      insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt),
      findActiveDocument: (collectionName: string, path: string) => findActiveDocument(db, collectionName, path),
      updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => updateDocumentTitle(db, documentId, title, modifiedAt),
      updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => updateDocument(db, documentId, title, hash, modifiedAt),
      deactivateDocument: (collectionName: string, path: string) => deactivateDocument(db, collectionName, path),
      getActiveDocumentPaths: (collectionName: string) => getActiveDocumentPaths(db, collectionName),

      // Vector/embedding operations
      getHashesForEmbedding: () => getHashesForEmbedding(db),
      clearAllEmbeddings: () => clearAllEmbeddings(db),
      insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt),
    };
    return store;
  }
  1676. // =============================================================================
  1677. // Core Document Type
  1678. // =============================================================================
  /**
   * A document together with all of its metadata.
   * `body` is optional; load it separately with getDocumentBody() when needed.
   */
  export type DocumentResult = {
    /** Full filesystem path */
    filepath: string;
    /** Short display path (e.g., "docs/readme.md") */
    displayPath: string;
    /** Document title (from first heading or filename) */
    title: string;
    /** Folder context description if configured */
    context: string | null;
    /** Content hash for caching/change detection */
    hash: string;
    /** Short docid (first 6 chars of hash) for quick reference */
    docid: string;
    /** Parent collection name */
    collectionName: string;
    /** Last modification timestamp */
    modifiedAt: string;
    /** Body length in bytes (useful before loading) */
    bodyLength: number;
    /** Document body (optional, load with getDocumentBody) */
    body?: string;
  };
  /**
   * Derive the short docid from a full content hash.
   *
   * @param hash Full content hash.
   * @returns The first 6 characters of `hash` (the whole string when it is shorter).
   */
  export function getDocid(hash: string): string {
    const DOCID_LENGTH = 6;
    return hash.substring(0, DOCID_LENGTH);
  }
  /** Replace emoji/symbol codepoints with their hex representation (e.g. 🐘 → 1f418) */
  function emojiToHex(str: string): string {
    const symbolRun = /(?:\p{So}\p{Mn}?|\p{Sk})+/gu;
    const symbolChar = /\p{So}|\p{Sk}/u;
    return str.replace(symbolRun, (run) => {
      // Walk the run by code point; combining marks are dropped, each symbol
      // becomes its hex code point, and the results are dash-separated.
      const hexCodes: string[] = [];
      for (const ch of run) {
        if (symbolChar.test(ch)) hexCodes.push(ch.codePointAt(0)!.toString(16));
      }
      return hexCodes.join('-');
    });
  }

  /**
   * Handelize a path to be more token-friendly.
   * - Convert triple underscore `___` to `/` (folder separator)
   * - Convert to lowercase
   * - Convert emoji/symbol characters to their hex code points
   * - Replace every run of characters other than letters, numbers and `$` with a single dash
   * - Remove leading/trailing dashes from each path segment and drop empty segments
   * - Preserve folder structure (a/b/c/d.md stays structured)
   * - Preserve the file extension of the last segment
   *
   * @param path Path or filename to convert.
   * @returns The handelized path.
   * @throws Error when `path` is empty/blank, when the filename (without extension)
   *   contains no letter, number, symbol or `$`, or when nothing is left after cleaning.
   */
  export function handelize(path: string): string {
    if (!path || path.trim() === '') {
      throw new Error('handelize: path cannot be empty');
    }

    // Allow route-style "$" filenames while still rejecting paths with no usable content.
    // Emoji (\p{So}) counts as valid content — they get converted to hex codepoints below.
    const rawSegments = path.split('/').filter(Boolean);
    const rawFilename = rawSegments[rawSegments.length - 1] || '';
    const rawStem = rawFilename.replace(/\.[^.]+$/, '');
    if (!/[\p{L}\p{N}\p{So}\p{Sk}$]/u.test(rawStem)) {
      throw new Error(`handelize: path "${path}" has no valid filename content`);
    }

    // Keep letters, numbers, "$"; dash-separate everything else (including dots),
    // then strip the dashes left at either end.
    const dashify = (text: string): string =>
      text.replace(/[^\p{L}\p{N}$]+/gu, '-').replace(/^-+|-+$/g, '');

    // Triple underscore becomes a folder separator before splitting.
    const parts = path.replace(/___/g, '/').toLowerCase().split('/');
    const lastIndex = parts.length - 1;
    const cleanedParts: string[] = [];

    parts.forEach((part, index) => {
      // Convert emoji to hex codepoints before cleaning
      const converted = emojiToHex(part);
      let cleaned: string;
      if (index === lastIndex) {
        // For the filename (last segment), preserve the extension
        const ext = /(\.[a-z0-9]+)$/i.exec(converted)?.[1] ?? '';
        const stem = ext ? converted.slice(0, -ext.length) : converted;
        cleaned = dashify(stem) + ext;
      } else {
        // Directories are simply cleaned
        cleaned = dashify(converted);
      }
      if (cleaned) cleanedParts.push(cleaned);
    });

    const result = cleanedParts.join('/');
    if (!result) {
      throw new Error(`handelize: path "${path}" resulted in empty string after processing`);
    }
    return result;
  }
  /**
   * A DocumentResult augmented with the relevance score and the search
   * backend that produced it.
   */
  export type SearchResult = DocumentResult & {
    /** Relevance score (0-1) */
    score: number;
    /** Search source (full-text or vector) */
    source: "fts" | "vec";
    /** Character position of matching chunk (for vector search) */
    chunkPos?: number;
  };
  /**
   * Simplified ranked result used internally for RRF fusion.
   */
  export type RankedResult = {
    file: string;
    displayPath: string;
    title: string;
    body: string;
    score: number;
  };
  /**
   * Trace of one ranked list's contribution to a document's RRF score.
   */
  export type RRFContributionTrace = {
    listIndex: number;
    source: "fts" | "vec";
    queryType: "original" | "lex" | "vec" | "hyde";
    query: string;
    /** 1-indexed rank within list */
    rank: number;
    weight: number;
    /** Backend-normalized score before fusion */
    backendScore: number;
    /** weight / (k + rank) */
    rrfContribution: number;
  };
  /**
   * Breakdown of a document's fused RRF score.
   */
  export type RRFScoreTrace = {
    contributions: RRFContributionTrace[];
    /** Sum of reciprocal-rank contributions */
    baseScore: number;
    /** Best (lowest) rank seen across lists */
    topRank: number;
    /** +0.05 for rank 1, +0.02 for rank 2-3 */
    topRankBonus: number;
    /** baseScore + topRankBonus */
    totalScore: number;
  };
  /**
   * Explanation payload for a hybrid query result: the raw backend scores,
   * the RRF fusion details, and the rerank/blended scores.
   */
  export type HybridQueryExplain = {
    ftsScores: number[];
    vectorScores: number[];
    rrf: {
      /** Rank after RRF fusion (1-indexed) */
      rank: number;
      /** 1 / rank used in position-aware blending */
      positionScore: number;
      /** Position-aware RRF weight (0.75 / 0.60 / 0.40) */
      weight: number;
      baseScore: number;
      topRankBonus: number;
      totalScore: number;
      contributions: RRFContributionTrace[];
    };
    rerankScore: number;
    blendedScore: number;
  };
  /**
   * Error-shaped result returned when a document lookup finds nothing.
   * Carries the original query and a list of similar file names.
   */
  export type DocumentNotFound = {
    error: "not_found";
    query: string;
    similarFiles: string[];
  };
  /**
   * One entry of a multi-get operation. Discriminated on `skipped`:
   * a loaded entry carries the full document, a skipped entry carries only
   * the path fields plus the reason it was skipped.
   */
  export type MultiGetResult =
    | {
        doc: DocumentResult;
        skipped: false;
      }
    | {
        doc: Pick<DocumentResult, "filepath" | "displayPath">;
        skipped: true;
        skipReason: string;
      };
  /**
   * Summary of one collection as reported in the index status.
   */
  export type CollectionInfo = {
    name: string;
    path: string | null;
    pattern: string | null;
    documents: number;
    lastUpdated: string;
  };
  /**
   * Overall index status: document totals, pending embeddings, vector index
   * presence, and per-collection summaries.
   */
  export type IndexStatus = {
    totalDocuments: number;
    needsEmbedding: number;
    hasVectorIndex: boolean;
    collections: CollectionInfo[];
  };
  1844. // =============================================================================
  1845. // Index health
  1846. // =============================================================================
  /**
   * Count the distinct content hashes of active documents that have no
   * embedding yet. A hash counts as embedded once a `content_vectors` row
   * with `seq = 0` exists for it.
   *
   * @param db Database handle.
   * @returns Number of distinct hashes still needing an embedding.
   */
  export function getHashesNeedingEmbedding(db: Database): number {
    const statement = db.prepare(`
      SELECT COUNT(DISTINCT d.hash) as count
      FROM documents d
      LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
      WHERE d.active = 1 AND v.hash IS NULL
    `);
    const { count } = statement.get() as { count: number };
    return count;
  }
  /**
   * Health snapshot of the index as produced by getIndexHealth().
   */
  export type IndexHealthInfo = {
    /** Distinct content hashes of active documents without an embedding */
    needsEmbedding: number;
    /** Number of active documents */
    totalDocs: number;
    /** Whole days since the newest active document's modified_at; null when there is none */
    daysStale: number | null;
  };
  /**
   * Collect index health figures: pending embeddings, active document count,
   * and how many whole days have passed since the most recent `modified_at`
   * among active documents.
   *
   * @param db Database handle.
   * @returns The health snapshot; `daysStale` is null when no active document
   *   has a `modified_at` value.
   */
  export function getIndexHealth(db: Database): IndexHealthInfo {
    const MS_PER_DAY = 24 * 60 * 60 * 1000;

    const needsEmbedding = getHashesNeedingEmbedding(db);

    const countRow = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number };
    const totalDocs = countRow.count;

    const latestRow = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };

    const daysStale: number | null = latestRow?.latest
      ? Math.floor((Date.now() - new Date(latestRow.latest).getTime()) / MS_PER_DAY)
      : null;

    return { needsEmbedding, totalDocs, daysStale };
  }
  1872. // =============================================================================
  1873. // Caching
  1874. // =============================================================================
  /**
   * Build the cache key for an LLM request.
   *
   * @param url Request URL.
   * @param body Request body; serialized with JSON.stringify.
   * @returns Hex SHA-256 digest of the URL followed by the serialized body.
   */
  export function getCacheKey(url: string, body: object): string {
    const serializedBody = JSON.stringify(body);
    return createHash("sha256").update(url).update(serializedBody).digest("hex");
  }
  /**
   * Look up a cached LLM result by cache key.
   *
   * @param db Database handle.
   * @param cacheKey Key as produced by getCacheKey().
   * @returns The cached result, or null when there is no row or the stored
   *   result is empty.
   */
  export function getCachedResult(db: Database, cacheKey: string): string | null {
    const lookup = db.prepare(`SELECT result FROM llm_cache WHERE hash = ?`);
    const row = lookup.get(cacheKey) as { result: string } | null;
    if (!row || !row.result) {
      return null;
    }
    return row.result;
  }
  /**
   * Store (or replace) a cached LLM result under `cacheKey`, stamped with the
   * current time.
   *
   * Roughly one call in a hundred also prunes the table so that only the
   * 1000 most recently created entries remain.
   *
   * @param db Database handle.
   * @param cacheKey Key as produced by getCacheKey().
   * @param result Result text to cache.
   */
  export function setCachedResult(db: Database, cacheKey: string, result: string): void {
    const createdAt = new Date().toISOString();
    const upsert = db.prepare(`INSERT OR REPLACE INTO llm_cache (hash, result, created_at) VALUES (?, ?, ?)`);
    upsert.run(cacheKey, result, createdAt);

    // Probabilistic pruning keeps the common write path cheap.
    const shouldPrune = Math.random() < 0.01;
    if (!shouldPrune) return;
    db.exec(`DELETE FROM llm_cache WHERE hash NOT IN (SELECT hash FROM llm_cache ORDER BY created_at DESC LIMIT 1000)`);
  }
  /**
   * Remove every row from the `llm_cache` table.
   *
   * @param db Database handle.
   */
  export function clearCache(db: Database): void {
    const deleteAllSql = `DELETE FROM llm_cache`;
    db.exec(deleteAllSql);
  }
  1895. // =============================================================================
  1896. // Cleanup and maintenance operations
  1897. // =============================================================================
  /**
   * Delete all cached LLM API responses.
   *
   * @param db Database handle.
   * @returns The number of cached responses deleted.
   */
  export function deleteLLMCache(db: Database): number {
    const { changes } = db.prepare(`DELETE FROM llm_cache`).run();
    return changes;
  }
  /**
   * Delete document records that are marked inactive (active = 0).
   *
   * @param db Database handle.
   * @returns The number of inactive documents deleted.
   */
  export function deleteInactiveDocuments(db: Database): number {
    const { changes } = db.prepare(`DELETE FROM documents WHERE active = 0`).run();
    return changes;
  }
  /**
   * Delete content rows whose hash is not referenced by any active document.
   *
   * @param db Database handle.
   * @returns The number of orphaned content hashes deleted.
   */
  export function cleanupOrphanedContent(db: Database): number {
    const deleteOrphans = db.prepare(`
      DELETE FROM content
      WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
    `);
    const { changes } = deleteOrphans.run();
    return changes;
  }
  /**
   * Remove vector embeddings whose content hash is not referenced by any
   * active document, from both `vectors_vec` and `content_vectors`.
   *
   * Degrades to a no-op (returning 0) when sqlite-vec is unavailable or the
   * `vectors_vec` table cannot be queried.
   *
   * @param db Database handle.
   * @returns The number of orphaned `content_vectors` rows counted before deletion.
   */
  export function cleanupOrphanedVectors(db: Database): number {
    // sqlite-vec may not be loaded (e.g. Bun's bun:sqlite lacks loadExtension).
    // The vectors_vec virtual table can appear in sqlite_master from a prior
    // session, but querying it without the vec0 module loaded will crash (#380).
    if (!isSqliteVecAvailable()) return 0;

    // Probe the virtual table: when the schema entry exists but vec0 is not
    // loaded, touching it throws "no such module: vec0"; treat that as
    // "nothing to clean" like the rest of the vector features.
    try {
      db.prepare(`SELECT 1 FROM vectors_vec LIMIT 0`).get();
    } catch {
      return 0;
    }

    // Count orphans up front; the count is what gets reported to the caller.
    const { c: orphanCount } = db.prepare(`
      SELECT COUNT(*) as c FROM content_vectors cv
      WHERE NOT EXISTS (
        SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
      )
    `).get() as { c: number };
    if (orphanCount === 0) return 0;

    // vectors_vec must go first: its keys are derived from content_vectors rows.
    db.exec(`
      DELETE FROM vectors_vec WHERE hash_seq IN (
        SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
        WHERE NOT EXISTS (
          SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
        )
      )
    `);

    // Then drop the content_vectors rows themselves.
    db.exec(`
      DELETE FROM content_vectors WHERE hash NOT IN (
        SELECT hash FROM documents WHERE active = 1
      )
    `);

    return orphanCount;
  }
  /**
   * Run VACUUM on the database to reclaim unused space.
   * The operation rebuilds the database file, eliminating fragmentation.
   *
   * @param db Database handle.
   */
  export function vacuumDatabase(db: Database): void {
    const vacuumSql = `VACUUM`;
    db.exec(vacuumSql);
  }
  1979. // =============================================================================
  1980. // Document helpers
  1981. // =============================================================================
  /**
   * Compute the content hash of a document.
   *
   * @param content Document text.
   * @returns Promise resolving to the hex SHA-256 digest of `content`.
   */
  export async function hashContent(content: string): Promise<string> {
    return createHash("sha256").update(content).digest("hex");
  }
  /**
   * Per-extension title extractors, keyed by lowercase extension including
   * the dot. Each returns the extracted title, or null when the content has
   * no usable heading.
   *
   * The heading patterns use `[^\S\r\n]` (whitespace other than CR/LF)
   * between the marker and the text so a match can never span a line break:
   * a bare `#` line must not adopt the following line as its title.
   */
  const titleExtractors: Record<string, (content: string) => string | null> = {
    '.md': (content) => {
      // First level-1 or level-2 heading in the document.
      const match = /^##?[^\S\r\n]+(.+)$/m.exec(content);
      if (!match) return null;
      const title = (match[1] ?? "").trim();
      if (title === "📝 Notes" || title === "Notes") {
        // Generic "Notes" heading: prefer the next level-2 heading. Search
        // only AFTER the matched heading so a "## Notes" line is not found again.
        const rest = content.slice(match.index + match[0].length);
        const nextMatch = /^##[^\S\r\n]+(.+)$/m.exec(rest);
        if (nextMatch?.[1]) return nextMatch[1].trim();
      }
      return title;
    },
    '.org': (content) => {
      // An explicit #+TITLE property wins over the first headline.
      const titleProp = /^#\+TITLE:[^\S\r\n]*(.+)$/im.exec(content);
      if (titleProp?.[1]) return titleProp[1].trim();
      const heading = /^\*+[^\S\r\n]+(.+)$/m.exec(content);
      if (heading?.[1]) return heading[1].trim();
      return null;
    },
  };

  /**
   * Determine a document's title.
   *
   * Uses the extractor registered for the file's extension (matched
   * case-insensitively); when there is none, or it yields nothing, falls back
   * to the file's base name without its extension.
   *
   * @param content Document text.
   * @param filename File name or `/`-separated path.
   * @returns The extracted title, the extension-less base name, or `filename`
   *   itself when the base name is empty.
   */
  export function extractTitle(content: string, filename: string): string {
    // Work on the base name so dots in directory names are never mistaken
    // for the start of the extension.
    const basename = filename.split("/").pop() || filename;
    const dotIndex = basename.lastIndexOf('.');
    const ext = dotIndex >= 0 ? basename.slice(dotIndex).toLowerCase() : "";

    const extractor = ext ? titleExtractors[ext] : undefined;
    if (extractor) {
      const title = extractor(content);
      if (title) return title;
    }

    return basename.replace(/\.[^.]+$/, "") || filename;
  }
  2017. // =============================================================================
  2018. // Document indexing operations
  2019. // =============================================================================
  /**
   * Store document content in the content table (content-addressable storage).
   * Uses INSERT OR IGNORE, so a hash that is already present is left untouched.
   *
   * @param db Database handle.
   * @param hash Content hash (the row key).
   * @param content Document text.
   * @param createdAt Creation timestamp to record.
   */
  export function insertContent(db: Database, hash: string, content: string, createdAt: string): void {
    const insert = db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`);
    insert.run(hash, content, createdAt);
  }
  /**
   * Insert a document into the documents table as active.
   *
   * When a row for the same (collection, path) already exists, it is updated
   * in place instead: title, hash and modified_at are overwritten and the row
   * is re-activated. Its created_at is left as it was.
   *
   * @param db Database handle.
   * @param collectionName Collection the document belongs to.
   * @param path Document path within the collection.
   * @param title Document title.
   * @param hash Content hash.
   * @param createdAt Creation timestamp (used only for a new row).
   * @param modifiedAt Modification timestamp.
   */
  export function insertDocument(
    db: Database,
    collectionName: string,
    path: string,
    title: string,
    hash: string,
    createdAt: string,
    modifiedAt: string
  ): void {
    const upsert = db.prepare(`
      INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active)
      VALUES (?, ?, ?, ?, ?, ?, 1)
      ON CONFLICT(collection, path) DO UPDATE SET
      title = excluded.title,
      hash = excluded.hash,
      modified_at = excluded.modified_at,
      active = 1
    `);
    upsert.run(collectionName, path, title, hash, createdAt, modifiedAt);
  }
  /**
   * Look up the active document stored at `path` in a collection.
   *
   * @param db Database handle.
   * @param collectionName Collection to search.
   * @param path Document path within the collection.
   * @returns The document's id, hash and title, or null when no active
   *   document matches.
   */
  export function findActiveDocument(
    db: Database,
    collectionName: string,
    path: string
  ): { id: number; hash: string; title: string } | null {
    const lookup = db.prepare(`
      SELECT id, hash, title FROM documents
      WHERE collection = ? AND path = ? AND active = 1
    `);
    const row = lookup.get(collectionName, path) as { id: number; hash: string; title: string } | undefined;
    if (row === undefined || row === null) {
      return null;
    }
    return row;
  }
  /**
   * Set a document's title and modified_at timestamp.
   *
   * @param db Database handle.
   * @param documentId Row id of the document to update.
   * @param title New title.
   * @param modifiedAt New modification timestamp.
   */
  export function updateDocumentTitle(
    db: Database,
    documentId: number,
    title: string,
    modifiedAt: string
  ): void {
    const update = db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`);
    update.run(title, modifiedAt, documentId);
  }
  /**
   * Set an existing document's title, hash and modified_at timestamp.
   * Used when the content changes but the file path stays the same.
   *
   * @param db Database handle.
   * @param documentId Row id of the document to update.
   * @param title New title.
   * @param hash New content hash.
   * @param modifiedAt New modification timestamp.
   */
  export function updateDocument(
    db: Database,
    documentId: number,
    title: string,
    hash: string,
    modifiedAt: string
  ): void {
    const update = db.prepare(`UPDATE documents SET title = ?, hash = ?, modified_at = ? WHERE id = ?`);
    update.run(title, hash, modifiedAt, documentId);
  }
  /**
   * Mark the active document at `path` in a collection as inactive.
   * The row is kept; only its `active` flag is cleared.
   *
   * @param db Database handle.
   * @param collectionName Collection the document belongs to.
   * @param path Document path within the collection.
   */
  export function deactivateDocument(db: Database, collectionName: string, path: string): void {
    const deactivate = db.prepare(`UPDATE documents SET active = 0 WHERE collection = ? AND path = ? AND active = 1`);
    deactivate.run(collectionName, path);
  }
  /**
   * List the paths of all active documents in a collection.
   *
   * @param db Database handle.
   * @param collectionName Collection to list.
   * @returns The paths, in the order the query returns them.
   */
  export function getActiveDocumentPaths(db: Database, collectionName: string): string[] {
    const query = db.prepare(`
      SELECT path FROM documents WHERE collection = ? AND active = 1
    `);
    const rows = query.all(collectionName) as { path: string }[];
    const paths: string[] = [];
    for (const { path } of rows) {
      paths.push(path);
    }
    return paths;
  }
  2106. export { formatQueryForEmbedding, formatDocForEmbedding };
  /**
   * Synchronous chunker that relies on regex break point detection only.
   * This is the backward-compatible API used by tests and legacy callers.
   *
   * @param content Document text.
   * @param maxChars Maximum chunk size in characters.
   * @param overlapChars Overlap between consecutive chunks, in characters.
   * @param windowChars Break point search window, in characters.
   * @returns Chunks with their text and start position in `content`.
   */
  export function chunkDocument(
    content: string,
    maxChars: number = CHUNK_SIZE_CHARS,
    overlapChars: number = CHUNK_OVERLAP_CHARS,
    windowChars: number = CHUNK_WINDOW_CHARS
  ): { text: string; pos: number }[] {
    const regexPoints = scanBreakPoints(content);
    const fences = findCodeFences(content);
    return chunkDocumentWithBreakPoints(
      content,
      regexPoints,
      fences,
      maxChars,
      overlapChars,
      windowChars,
    );
  }
  /**
   * Async, optionally AST-aware chunking. Regex break points are always
   * computed; depending on the strategy they are combined with AST
   * information from `./ast.js` before delegating to the shared chunk
   * algorithm.
   *
   * Strategies:
   * - "regex" (default) — char-based chunking with regex break points only.
   * - "auto"            — regex break points merged with AST break points.
   * - "function"        — one chunk per AST function range; gaps between
   *                       ranges are char-chunked. When zero ranges are
   *                       detected it behaves like "auto".
   *
   * AST strategies only take effect when `filepath` is provided; without it
   * every strategy behaves like "regex".
   *
   * @param content Document text.
   * @param maxChars Maximum chunk size in characters.
   * @param overlapChars Overlap between consecutive chunks, in characters.
   * @param windowChars Break point search window, in characters.
   * @param filepath Path passed to the AST helpers.
   * @param chunkStrategy Chunking strategy (see above).
   * @returns Chunks with their text and start position in `content`.
   */
  export async function chunkDocumentAsync(
    content: string,
    maxChars: number = CHUNK_SIZE_CHARS,
    overlapChars: number = CHUNK_OVERLAP_CHARS,
    windowChars: number = CHUNK_WINDOW_CHARS,
    filepath?: string,
    chunkStrategy: ChunkStrategy = "regex",
  ): Promise<{ text: string; pos: number }[]> {
    const regexPoints = scanBreakPoints(content);
    const codeFences = findCodeFences(content);

    // Plain regex chunking: no filepath to hand to the AST helpers, or a
    // strategy that does not ask for AST help.
    const wantsAst = chunkStrategy === "function" || chunkStrategy === "auto";
    if (!filepath || !wantsAst) {
      return chunkDocumentWithBreakPoints(content, regexPoints, codeFences, maxChars, overlapChars, windowChars);
    }

    const { getASTFunctionRanges, getASTBreakPoints } = await import("./ast.js");

    if (chunkStrategy === "function") {
      const ranges = await getASTFunctionRanges(content, filepath);
      if (ranges.length > 0) {
        return chunkByFunctionRanges(
          content,
          ranges,
          regexPoints,
          codeFences,
          maxChars,
          overlapChars,
          windowChars,
        );
      }
      // Zero ranges — continue with "auto" behavior so break points still help.
    }

    const astPoints = await getASTBreakPoints(content, filepath);
    const breakPoints = astPoints.length > 0 ? mergeBreakPoints(regexPoints, astPoints) : regexPoints;
    return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
  }
  /**
   * Produce one chunk per AST function range, plus char-chunks for the gaps
   * between ranges (imports, top-level code). Any range or gap longer than
   * `maxChars` is split further with the char-based algorithm, so a single
   * oversized chunk is never emitted.
   *
   * Preconditions: `ranges` is non-empty, sorted by `startIndex`, and the
   * ranges are non-overlapping (as produced by `getASTFunctionRanges`).
   *
   * @returns Chunks with positions relative to `content`. Non-empty content
   *   always yields at least one chunk.
   */
  function chunkByFunctionRanges(
    content: string,
    ranges: import("./ast.js").FunctionRange[],
    regexPoints: BreakPoint[],
    codeFences: CodeFenceRegion[],
    maxChars: number,
    overlapChars: number,
    windowChars: number,
  ): { text: string; pos: number }[] {
    const chunks: { text: string; pos: number }[] = [];

    // Split an oversized slice [from, to) with the char-based algorithm.
    // Break points and code fences are restricted to the slice and rebased
    // so chunkDocumentWithBreakPoints sees a standalone string; resulting
    // positions are shifted back into `content` coordinates.
    const splitOversized = (text: string, from: number, to: number): void => {
      const localPoints = regexPoints
        .filter(p => p.pos >= from && p.pos < to)
        .map(p => ({ ...p, pos: p.pos - from }));
      const localFences = codeFences
        .filter(f => f.end > from && f.start < to)
        .map(f => ({
          start: Math.max(0, f.start - from),
          end: Math.max(0, Math.min(to, f.end) - from),
        }));
      const pieces = chunkDocumentWithBreakPoints(text, localPoints, localFences, maxChars, overlapChars, windowChars);
      for (const piece of pieces) {
        chunks.push({ text: piece.text, pos: from + piece.pos });
      }
    };

    // Emit the text between two ranges (or before the first / after the last).
    const emitGap = (from: number, to: number): void => {
      if (from >= to) return;
      const gap = content.slice(from, to);
      // Whitespace-only gaps are dropped — they carry no embeddable signal.
      if (!gap.trim()) return;
      if (gap.length <= maxChars) {
        chunks.push({ text: gap, pos: from });
      } else {
        splitOversized(gap, from, to);
      }
    };

    let cursor = 0;
    for (const range of ranges) {
      // Leading / inter-range gap (imports, top-level code).
      emitGap(cursor, range.startIndex);

      const body = content.slice(range.startIndex, range.endIndex);
      if (body.length === 0) {
        // Empty range: nothing to emit, just advance.
      } else if (body.length <= maxChars) {
        chunks.push({ text: body, pos: range.startIndex });
      } else {
        // Oversized function/class — split so we stay under the embed budget.
        splitOversized(body, range.startIndex, range.endIndex);
      }
      cursor = range.endIndex;
    }

    // Trailing gap after the last range.
    emitGap(cursor, content.length);

    // Edge case: nothing was emitted (e.g. only whitespace-only gaps).
    // Preserve the invariant that non-empty content yields at least one chunk.
    if (chunks.length === 0 && content.length > 0) {
      return [{ text: content, pos: 0 }];
    }
    return chunks;
  }
  /**
   * Function that counts the tokens in `text`, synchronously or
   * asynchronously. `chunkDocumentByTokens` uses it for the safety re-split
   * of chunks that exceed `maxTokens`.
   *
   * Default (no tokenizer passed to `chunkDocumentByTokens`):
   * `getDefaultLlamaCpp()` is resolved lazily and `llm.tokenize` is used —
   * accurate but expensive, since it loads the local GGUF embed model and
   * initialises llama.cpp (~22s on cold cache).
   *
   * Provider-mode callers (HTTP embed providers such as the GPU worker on
   * the `models` LXC) MUST pass a JS-only approximator so the local model is
   * never loaded. A char-based estimate like `Math.ceil(text.length / 3)` is
   * a reasonable default: it matches the `avgCharsPerToken=3` heuristic of
   * the initial char-space chunk step, so the safety re-split stays a near
   * no-op while the `tokens` field is populated with a stable estimate.
   */
  export type TokenCounter = (text: string) => number | Promise<number>;
  /**
   * Chunk a document so that chunks respect a token budget.
   *
   * The document is first chunked in character space using an estimate of
   * 3 characters per token (via chunkDocumentAsync, so `filepath` and
   * `chunkStrategy` enable AST-aware break points for supported files).
   * Each chunk is then token-counted; a chunk above `maxTokens` is re-split
   * with chunkDocument using the chunk's measured chars-per-token ratio and
   * a 5% safety margin.
   *
   * When `tokenizer` is supplied it replaces the local `llm.tokenize(...)`
   * call — neither `getDefaultLlamaCpp()` nor `llm.tokenize(...)` is invoked.
   * This lets remote-only deployments (`QMD_EMBED_ENDPOINT=...`) chunk
   * documents without warming up node-llama-cpp.
   *
   * @param content Document text.
   * @param maxTokens Token budget per chunk.
   * @param overlapTokens Overlap between chunks, in tokens.
   * @param windowTokens Break point search window, in tokens.
   * @param filepath Path forwarded to chunkDocumentAsync.
   * @param chunkStrategy Strategy forwarded to chunkDocumentAsync.
   * @param signal When aborted, processing stops and the chunks produced so
   *   far are returned.
   * @param tokenizer Optional token counter used instead of the local model.
   * @returns Chunks with text, start position in `content`, and token count.
   */
  export async function chunkDocumentByTokens(
    content: string,
    maxTokens: number = CHUNK_SIZE_TOKENS,
    overlapTokens: number = CHUNK_OVERLAP_TOKENS,
    windowTokens: number = CHUNK_WINDOW_TOKENS,
    filepath?: string,
    chunkStrategy: ChunkStrategy = "regex",
    signal?: AbortSignal,
    tokenizer?: TokenCounter,
  ): Promise<{ text: string; pos: number; tokens: number }[]> {
    // The local LlamaCpp instance is resolved only inside the default counter,
    // and only on its first call — callers that supply `tokenizer` never touch it.
    let llm: ReturnType<typeof getDefaultLlamaCpp> | undefined;
    const localCounter: TokenCounter = async (text: string) => {
      if (!llm) llm = getDefaultLlamaCpp();
      return (await llm.tokenize(text)).length;
    };
    const countTokens: TokenCounter = tokenizer ?? localCounter;

    // Moderate chars/token estimate (prose ~4, code ~2, mixed ~3).
    // Chunks that still exceed the limit are re-split with the actual ratio.
    const avgCharsPerToken = 3;
    const maxChars = maxTokens * avgCharsPerToken;
    const overlapChars = overlapTokens * avgCharsPerToken;
    const windowChars = windowTokens * avgCharsPerToken;

    // First pass in character space (AST-aware when filepath/strategy allow).
    const firstPass = await chunkDocumentAsync(content, maxChars, overlapChars, windowChars, filepath, chunkStrategy);

    const sized: { text: string; pos: number; tokens: number }[] = [];
    for (const { text, pos } of firstPass) {
      // Respect abort signal to avoid runaway tokenization
      if (signal?.aborted) break;

      const tokens = await countTokens(text);
      if (tokens <= maxTokens) {
        sized.push({ text, pos, tokens });
        continue;
      }

      // Still too large: derive a better char limit from the measured ratio.
      const actualCharsPerToken = text.length / tokens;
      const safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); // 5% safety margin
      const pieces = chunkDocument(
        text,
        safeMaxChars,
        Math.floor(overlapChars * actualCharsPerToken / 2),
        Math.floor(windowChars * actualCharsPerToken / 2),
      );
      for (const piece of pieces) {
        if (signal?.aborted) break;
        const pieceTokens = await countTokens(piece.text);
        sized.push({
          text: piece.text,
          pos: pos + piece.pos,
          tokens: pieceTokens,
        });
      }
    }
    return sized;
  }
  2343. // =============================================================================
  2344. // Fuzzy matching
  2345. // =============================================================================
  /**
   * Levenshtein edit distance between two strings, compared by UTF-16 code
   * unit with unit cost for insertion, deletion and substitution.
   *
   * Only two rows of the DP table are kept, so memory is O(|b|) rather than
   * O(|a|·|b|).
   *
   * @param a First string.
   * @param b Second string.
   * @returns Minimum number of single-unit edits turning `a` into `b`.
   */
  function levenshtein(a: string, b: string): number {
    const m = a.length, n = b.length;
    if (m === 0) return n;
    if (n === 0) return m;

    // prev = distances for a[0..i-1); curr = distances for a[0..i).
    let prev: number[] = Array.from({ length: n + 1 }, (_, j) => j);
    let curr: number[] = new Array<number>(n + 1).fill(0);

    for (let i = 1; i <= m; i++) {
      curr[0] = i;
      for (let j = 1; j <= n; j++) {
        const cost = a[i - 1] === b[j - 1] ? 0 : 1;
        curr[j] = Math.min(
          prev[j]! + 1,        // deletion
          curr[j - 1]! + 1,    // insertion
          prev[j - 1]! + cost  // substitution / match
        );
      }
      // Reuse the old row as scratch space for the next iteration.
      [prev, curr] = [curr, prev];
    }
    return prev[n]!;
  }
  /**
   * Normalize a docid input: trim it, strip one pair of matching surrounding
   * quotes (single or double), then strip one leading `#`.
   * Handles: "#abc123", 'abc123', "abc123", #abc123, abc123
   *
   * @param docid Raw docid input.
   * @returns The bare docid string.
   */
  export function normalizeDocid(docid: string): string {
    const trimmed = docid.trim();

    // A quote pair only counts when the same quote character opens and closes.
    const isQuoted = ['"', "'"].some(q => trimmed.startsWith(q) && trimmed.endsWith(q));
    const unquoted = isQuoted ? trimmed.slice(1, -1) : trimmed;

    return unquoted.startsWith('#') ? unquoted.slice(1) : unquoted;
  }
  /**
   * Decide whether an input string could be a docid reference.
   *
   * The input is first passed through normalizeDocid, so quoted and
   * `#`-prefixed forms (#abc123, "abc123", '#abc123', ...) are accepted.
   * The normalized value must consist solely of hex digits (either case)
   * and be at least 6 characters long.
   */
  export function isDocid(input: string): boolean {
    const candidate = normalizeDocid(input);
    // Too short to be a docid
    if (candidate.length < 6) return false;
    // Hex digits only
    return /^[a-f0-9]+$/i.test(candidate);
  }
  /**
   * Find an active document whose content hash starts with the given docid.
   *
   * Accepts lenient input (#abc123, abc123, "#abc123", "abc123"); the value is
   * normalized with normalizeDocid before the lookup.
   *
   * The normalized docid is used as a `LIKE 'prefix%'` pattern, so it must be
   * made of hex digits only. Anything else is rejected up front: otherwise
   * `%` or `_` in the input would act as SQL LIKE wildcards and match
   * unrelated documents.
   *
   * If several documents share the prefix, whichever row the database returns
   * first is used (the query has no ORDER BY).
   *
   * @param db Database instance
   * @param docid Docid as typed by the user
   * @returns `{ filepath, hash }` where filepath is the `qmd://collection/path`
   *          virtual path, or null when nothing matches or the input is invalid
   */
  export function findDocumentByDocid(db: Database, docid: string): { filepath: string; hash: string } | null {
    const shortHash = normalizeDocid(docid);
    if (shortHash.length < 1) return null;
    // Hash prefixes are hex; refuse anything that could smuggle LIKE wildcards.
    if (!/^[a-f0-9]+$/i.test(shortHash)) return null;

    // Look up documents where hash starts with the short hash
    const doc = db.prepare(`
      SELECT 'qmd://' || d.collection || '/' || d.path as filepath, d.hash
      FROM documents d
      WHERE d.hash LIKE ? AND d.active = 1
      LIMIT 1
    `).get(`${shortHash}%`) as { filepath: string; hash: string } | null | undefined;

    // Some SQLite drivers report "no row" as undefined; honor the declared null.
    return doc ?? null;
  }
  /**
   * Suggest active document paths that are close to `query` by edit distance.
   *
   * Comparison is case-insensitive. Candidates whose length differs from the
   * query by more than `maxDistance` are skipped without running the DP,
   * because the edit distance can never be smaller than the length difference.
   *
   * @param db Database instance
   * @param query Path (or fragment) the user typed
   * @param maxDistance Largest edit distance still considered similar
   * @param limit Maximum number of suggestions to return
   * @returns Matching paths, closest first (ties keep database order)
   */
  export function findSimilarFiles(db: Database, query: string, maxDistance: number = 3, limit: number = 5): string[] {
    const allFiles = db.prepare(`
      SELECT d.path
      FROM documents d
      WHERE d.active = 1
    `).all() as { path: string }[];

    const queryLower = query.toLowerCase();
    const scored: { path: string; dist: number }[] = [];

    for (const { path } of allFiles) {
      const candidate = path.toLowerCase();
      // Cheap lower bound: distance >= |length difference|
      if (Math.abs(candidate.length - queryLower.length) > maxDistance) continue;
      const dist = levenshtein(candidate, queryLower);
      if (dist <= maxDistance) scored.push({ path, dist });
    }

    return scored
      .sort((a, b) => a.dist - b.dist)
      .slice(0, limit)
      .map(f => f.path);
  }
  /**
   * List active documents whose path matches a glob pattern.
   *
   * Each document is tested in three spellings: the virtual path
   * (`qmd://collection/path`), the collection-relative path, and
   * `collection/path`. A hit on any of them counts as a match.
   *
   * @returns For every match: the virtual path (`filepath`), the relative
   *          path (`displayPath`) and the length of the stored body
   */
  export function matchFilesByGlob(db: Database, pattern: string): { filepath: string; displayPath: string; bodyLength: number }[] {
    const rows = db.prepare(`
      SELECT
        'qmd://' || d.collection || '/' || d.path as virtual_path,
        LENGTH(content.doc) as body_length,
        d.path,
        d.collection
      FROM documents d
      JOIN content ON content.hash = d.hash
      WHERE d.active = 1
    `).all() as { virtual_path: string; body_length: number; path: string; collection: string }[];

    const globMatches = picomatch(pattern);
    const matched: { filepath: string; displayPath: string; bodyLength: number }[] = [];

    for (const row of rows) {
      const spellings = [row.virtual_path, row.path, row.collection + '/' + row.path];
      if (!spellings.some(candidate => globMatches(candidate))) continue;
      matched.push({
        filepath: row.virtual_path, // precise lookup key
        displayPath: row.path,      // shown to the user
        bodyLength: row.body_length,
      });
    }
    return matched;
  }
  2446. // =============================================================================
  2447. // Context
  2448. // =============================================================================
  /**
   * Get context for a file path using hierarchical inheritance.
   * Contexts are collection-scoped and inherit from parent directories.
   * For example, context at "/talks" applies to "/talks/2024/keynote.md".
   *
   * Prefixes are matched on whole path segments: a context at "/talks" does
   * NOT apply to "/talks-archive/x.md". Leading slashes are optional and
   * trailing slashes are ignored, so "talks", "/talks" and "/talks/" are
   * equivalent; "" and "/" both denote the collection root.
   *
   * The result is the global context (if any) followed by every matching
   * path context, ordered from most general to most specific and joined
   * with a blank line.
   *
   * @param db Database instance used to read the collection and global context
   * @param collectionName Collection name
   * @param path Relative path within the collection
   * @returns Context string or null if no context is defined
   */
  export function getContextForPath(db: Database, collectionName: string, path: string): string | null {
    const coll = getStoreCollection(db, collectionName);
    if (!coll) return null;

    // Collect ALL matching contexts (global + all path prefixes)
    const contexts: string[] = [];

    // Add global context if present
    const globalCtx = getStoreGlobalContext(db);
    if (globalCtx) {
      contexts.push(globalCtx);
    }

    // Add all matching path contexts (from most general to most specific)
    if (coll.context) {
      const normalizedPath = path.startsWith("/") ? path : `/${path}`;
      const matchingContexts: { prefix: string; context: string }[] = [];

      for (const [prefix, context] of Object.entries(coll.context)) {
        // Canonical prefix: leading slash, no trailing slash; root becomes "".
        const withLeadingSlash = prefix.startsWith("/") ? prefix : `/${prefix}`;
        const normalizedPrefix = withLeadingSlash.replace(/\/+$/, "");

        // Match only at a segment boundary so "/talk" does not capture "/talks".
        const applies =
          normalizedPrefix === "" ||
          normalizedPath === normalizedPrefix ||
          normalizedPath.startsWith(`${normalizedPrefix}/`);

        if (applies) {
          matchingContexts.push({ prefix: normalizedPrefix, context });
        }
      }

      // Sort by prefix length (shortest/most general first)
      matchingContexts.sort((a, b) => a.prefix.length - b.prefix.length);
      for (const match of matchingContexts) {
        contexts.push(match.context);
      }
    }

    // Join all contexts with double newline
    return contexts.length > 0 ? contexts.join('\n\n') : null;
  }
  /**
   * Get context for a file path (virtual or filesystem).
   *
   * A `qmd://collection/path` input is parsed directly. Any other input is
   * treated as a filesystem path and resolved against the stored collections:
   * the first collection whose root equals the path or is a parent directory
   * of it wins.
   *
   * Context is only returned for documents that exist and are active in the
   * database. The actual context merging (global context + inherited path
   * contexts) is delegated to getContextForPath so both entry points always
   * agree.
   *
   * @param db Database instance
   * @param filepath Virtual (`qmd://...`) or filesystem path
   * @returns Merged context string, or null when the path cannot be resolved,
   *          the document is unknown/inactive, or no context applies
   */
  export function getContextForFile(db: Database, filepath: string): string | null {
    // Handle undefined or null filepath
    if (!filepath) return null;

    let collectionName: string | null = null;
    let relativePath: string | null = null;

    // Parse virtual path format: qmd://collection/path
    const parsedVirtual = filepath.startsWith('qmd://') ? parseVirtualPath(filepath) : null;
    if (parsedVirtual) {
      collectionName = parsedVirtual.collectionName;
      relativePath = parsedVirtual.path;
    } else {
      // Filesystem path: find which collection this path belongs to.
      // Collections are only loaded here; virtual paths never need the list.
      for (const coll of getStoreCollections(db)) {
        // Skip collections with missing paths
        if (!coll || !coll.path) continue;

        const insideCollection = filepath.startsWith(coll.path + '/');
        if (insideCollection || filepath === coll.path) {
          collectionName = coll.name;
          // Path relative to the collection root ('' for the root itself)
          relativePath = insideCollection ? filepath.slice(coll.path.length + 1) : '';
          break;
        }
      }
      if (!collectionName || relativePath === null) return null;
    }

    // Verify this document exists in the database
    const doc = db.prepare(`
      SELECT d.path
      FROM documents d
      WHERE d.collection = ? AND d.path = ? AND d.active = 1
      LIMIT 1
    `).get(collectionName, relativePath) as { path: string } | null;
    if (!doc) return null;

    // Shared resolution: returns null for unknown collections or no context.
    return getContextForPath(db, collectionName, relativePath);
  }
  /**
   * Look up a stored collection by name and return it in the legacy
   * `{ name, pwd, glob_pattern }` shape (`pwd` is the collection's path,
   * `glob_pattern` its pattern). Returns null for unknown names.
   */
  export function getCollectionByName(db: Database, name: string): { name: string; pwd: string; glob_pattern: string } | null {
    const found = getStoreCollection(db, name);
    return found
      ? { name: found.name, pwd: found.path, glob_pattern: found.pattern }
      : null;
  }
  /**
   * List all stored collections together with document statistics.
   *
   * For each collection the documents table is queried for the total number
   * of documents, the number of active documents and the most recent
   * `modified_at` value. Missing statistics fall back to 0 / null.
   * `includeByDefault` is true unless the collection explicitly sets it to
   * false.
   *
   * The statistics statement is prepared once and reused for every collection.
   */
  export function listCollections(db: Database): { name: string; pwd: string; glob_pattern: string; doc_count: number; active_count: number; last_modified: string | null; includeByDefault: boolean }[] {
    const collections = getStoreCollections(db);
    if (collections.length === 0) return [];

    // Prepared once: the SQL is identical for every collection.
    const statsStmt = db.prepare(`
      SELECT
        COUNT(d.id) as doc_count,
        SUM(CASE WHEN d.active = 1 THEN 1 ELSE 0 END) as active_count,
        MAX(d.modified_at) as last_modified
      FROM documents d
      WHERE d.collection = ?
    `);

    return collections.map(coll => {
      const stats = statsStmt.get(coll.name) as { doc_count: number; active_count: number; last_modified: string | null } | null;
      return {
        name: coll.name,
        pwd: coll.path,
        glob_pattern: coll.pattern,
        doc_count: stats?.doc_count || 0,
        // SUM() yields NULL when the collection has no rows
        active_count: stats?.active_count || 0,
        last_modified: stats?.last_modified || null,
        includeByDefault: coll.includeByDefault !== false,
      };
    });
  }
  /**
   * Remove a collection and clean up after it.
   *
   * Performs three steps in order:
   *  1. delete every document row belonging to the collection,
   *  2. delete content rows whose hash is no longer referenced by any
   *     active document,
   *  3. remove the collection entry itself via deleteStoreCollection.
   *
   * @returns Number of document rows and content rows that were deleted
   */
  export function removeCollection(db: Database, collectionName: string): { deletedDocs: number; cleanedHashes: number } {
    // Step 1: documents of this collection
    const { changes: deletedDocs } = db
      .prepare(`DELETE FROM documents WHERE collection = ?`)
      .run(collectionName);

    // Step 2: content no longer referenced by an active document
    const { changes: cleanedHashes } = db.prepare(`
      DELETE FROM content
      WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
    `).run();

    // Step 3: the collection record
    deleteStoreCollection(db, collectionName);

    return { deletedDocs, cleanedHashes };
  }
  /**
   * Rename a collection.
   *
   * First re-labels every document row from `oldName` to `newName`, then
   * renames the collection entry via renameStoreCollection.
   */
  export function renameCollection(db: Database, oldName: string, newName: string): void {
    // Point existing documents at the new name
    const relabelDocs = db.prepare(`UPDATE documents SET collection = ? WHERE collection = ?`);
    relabelDocs.run(newName, oldName);

    // Rename the collection record itself
    renameStoreCollection(db, oldName, newName);
  }
  2631. // =============================================================================
  2632. // Context Management Operations
  2633. // =============================================================================
  /**
   * Insert or update the context attached to a path prefix of a collection.
   *
   * The collection is identified by numeric id; its name is resolved from the
   * `collections` table and the context is then written with
   * updateStoreContext.
   *
   * @throws Error when no collection with the given id exists
   */
  export function insertContext(db: Database, collectionId: number, pathPrefix: string, context: string): void {
    // Resolve id -> name
    const row = db
      .prepare(`SELECT name FROM collections WHERE id = ?`)
      .get(collectionId) as { name: string } | null;

    if (row) {
      updateStoreContext(db, row.name, pathPrefix, context);
      return;
    }
    throw new Error(`Collection with id ${collectionId} not found`);
  }
  /**
   * Delete the context stored for one collection / path-prefix pair.
   *
   * @returns 1 when removeStoreContext reports success, otherwise 0
   */
  export function deleteContext(db: Database, collectionName: string, pathPrefix: string): number {
    const removed = removeStoreContext(db, collectionName, pathPrefix);
    if (removed) return 1;
    return 0;
  }
  /**
   * Delete the global context and every collection's root context
   * (the context stored under the empty path prefix).
   *
   * The global context is only counted when one was actually set before the
   * call, so the return value reflects what was really removed.
   *
   * @returns The number of contexts deleted (0 when there was nothing to delete)
   */
  export function deleteGlobalContexts(db: Database): number {
    let deletedCount = 0;

    // Count the global context only if it existed, then clear it regardless.
    if (getStoreGlobalContext(db)) {
      deletedCount++;
    }
    setStoreGlobalContext(db, undefined);

    // Remove root context (empty string) from all collections
    for (const coll of getStoreCollections(db)) {
      if (removeStoreContext(db, coll.name, '')) {
        deletedCount++;
      }
    }
    return deletedCount;
  }
  /**
   * List every stored path context as flat rows.
   *
   * Ordering: collection name (locale compare), then longer path prefixes
   * before shorter ones, then path prefix alphabetically.
   */
  export function listPathContexts(db: Database): { collection_name: string; path_prefix: string; context: string }[] {
    type Row = { collection_name: string; path_prefix: string; context: string };

    const rows: Row[] = [];
    for (const entry of getStoreContexts(db)) {
      rows.push({
        collection_name: entry.collection,
        path_prefix: entry.path,
        context: entry.context,
      });
    }

    const compareRows = (left: Row, right: Row): number => {
      // 1) collection name
      if (left.collection_name !== right.collection_name) {
        return left.collection_name.localeCompare(right.collection_name);
      }
      // 2) longest prefix first
      const lengthDelta = right.path_prefix.length - left.path_prefix.length;
      if (lengthDelta !== 0) return lengthDelta;
      // 3) alphabetical
      return left.path_prefix.localeCompare(right.path_prefix);
    };

    return rows.sort(compareRows);
  }
  /**
   * Return the names of all collections known to the store, one
   * `{ name }` object per collection, in the order getStoreCollections
   * yields them.
   */
  export function getAllCollections(db: Database): { name: string }[] {
    return getStoreCollections(db).map(({ name }) => ({ name }));
  }
  /**
   * Report collections that have no context entries at all (a missing or
   * empty context map), along with their active document count.
   *
   * @returns Entries sorted by collection name
   */
  export function getCollectionsWithoutContext(db: Database): { name: string; pwd: string; doc_count: number }[] {
    return getStoreCollections(db)
      // Keep only collections with no context map or an empty one
      .filter(coll => !coll.context || Object.keys(coll.context).length === 0)
      .map(coll => {
        // Count active documents for this collection
        const stats = db.prepare(`
          SELECT COUNT(d.id) as doc_count
          FROM documents d
          WHERE d.collection = ? AND d.active = 1
        `).get(coll.name) as { doc_count: number } | null;

        return {
          name: coll.name,
          pwd: coll.path,
          doc_count: stats?.doc_count || 0,
        };
      })
      .sort((left, right) => left.name.localeCompare(right.name));
  }
  /**
   * Get top-level directories in a collection that don't have context.
   * Useful for suggesting where context might be needed.
   *
   * A directory counts as covered when the collection has a root context
   * ("" or "/") or a context whose prefix names that directory. Prefixes are
   * compared with leading and trailing slashes removed, so "talks", "/talks"
   * and "talks/" all cover the directory `talks` — the same leniency
   * getContextForPath applies when resolving context.
   *
   * Only directories are reported: documents that sit directly in the
   * collection root contribute nothing.
   *
   * @param db Database instance
   * @param collectionName Collection to inspect
   * @returns Sorted list of uncovered top-level directory names
   *          (empty when the collection is unknown)
   */
  export function getTopLevelPathsWithoutContext(db: Database, collectionName: string): string[] {
    // Get existing contexts for this collection from DB
    const dbColl = getStoreCollection(db, collectionName);
    if (!dbColl) return [];

    // Normalize prefixes to bare form ("/talks/" -> "talks", "/" -> "")
    const contextPrefixes = new Set<string>();
    for (const prefix of Object.keys(dbColl.context ?? {})) {
      contextPrefixes.add(prefix.replace(/^\/+|\/+$/g, ''));
    }

    // A root context applies to every path in the collection.
    if (contextPrefixes.has('')) return [];

    // Get all paths in the collection from database
    const paths = db.prepare(`
      SELECT DISTINCT path FROM documents
      WHERE collection = ? AND active = 1
    `).all(collectionName) as { path: string }[];

    // Extract top-level directories (first path component)
    const topLevelDirs = new Set<string>();
    for (const { path } of paths) {
      const parts = path.split('/').filter(Boolean);
      // A single component is a file in the root, not a directory
      if (parts.length > 1) {
        const dir = parts[0];
        if (dir) topLevelDirs.add(dir);
      }
    }

    // Keep directories that no context prefix names directly
    return [...topLevelDirs].filter(dir => !contextPrefixes.has(dir)).sort();
  }
  2777. // =============================================================================
  2778. // FTS Search
  2779. // =============================================================================
  /**
   * Clean a single search term for use inside an FTS5 query string.
   * Every character that is not a Unicode letter, a Unicode digit, an
   * apostrophe or an underscore is removed, and the rest is lower-cased.
   */
  export function sanitizeFTS5Term(term: string): string {
    const disallowedChars = /[^\p{L}\p{N}'_]/gu;
    const kept = term.replace(disallowedChars, '');
    return kept.toLowerCase();
  }
  /**
   * Tell whether a token is a hyphenated compound such as multi-agent,
   * DEC-0054 or gpt-4.
   *
   * The token must start with a letter or digit and contain at least one
   * hyphen that is directly followed by a letter or digit; the remaining
   * characters may be letters, digits, apostrophes or hyphens.
   */
  function isHyphenatedToken(token: string): boolean {
    const hyphenatedCompound = /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u;
    return hyphenatedCompound.test(token);
  }
  /**
   * Turn a hyphenated term into the body of an FTS5 phrase.
   *
   * The term is split on hyphens, each piece is cleaned with
   * sanitizeFTS5Term, pieces that end up empty are dropped, and the rest is
   * joined with single spaces (e.g. multi-agent becomes `multi agent`).
   * The caller is responsible for adding the surrounding quotes.
   */
  function sanitizeHyphenatedTerm(term: string): string {
    const cleanedParts: string[] = [];
    for (const piece of term.split('-')) {
      const cleaned = sanitizeFTS5Term(piece);
      if (cleaned) cleanedParts.push(cleaned);
    }
    return cleanedParts.join(' ');
  }
  /**
   * Translate the lex query mini-language into an FTS5 MATCH expression.
   *
   * Recognized forms:
   * - Quoted phrase: "exact phrase" → "exact phrase" (no prefix matching)
   * - Hyphenated token: multi-agent, DEC-0054, gpt-4 → phrase of its parts
   * - Plain term: term → "term"* (prefix match)
   * - Negation: a leading `-` on a term or phrase moves it to the NOT list
   *
   * A `-` is only read as negation when it starts a token; a hyphen between
   * word characters (multi-agent) belongs to the token. `-multi-agent`
   * therefore negates the phrase "multi agent".
   *
   * Positive clauses are joined with AND and each negative clause is appended
   * with NOT. Because FTS5 NOT is a binary operator, a query without any
   * positive clause cannot be expressed and yields null.
   *
   * Examples:
   *   performance -sports   → "performance"* NOT "sports"*
   *   "machine learning"    → "machine learning"
   *   multi-agent memory    → "multi agent" AND "memory"*
   *   DEC-0054              → "dec 0054"
   *   -sports               → null (nothing to subtract from)
   *
   * @returns The FTS5 query string, or null when no positive clause remains
   */
  function buildFTS5Query(query: string): string | null {
    const input = query.trim();
    const include: string[] = [];
    const exclude: string[] = [];
    let pos = 0;

    while (pos < input.length) {
      // Move to the start of the next token
      while (pos < input.length && /\s/.test(input[pos]!)) pos++;
      if (pos >= input.length) break;

      // A leading '-' routes the upcoming clause to the exclusion list
      let bucket = include;
      if (input[pos] === '-') {
        bucket = exclude;
        pos++;
      }

      let clause = '';
      if (input[pos] === '"') {
        // Quoted phrase: read up to the closing quote (or end of input)
        const open = pos + 1;
        pos = open;
        while (pos < input.length && input[pos] !== '"') pos++;
        const phrase = input.slice(open, pos).trim();
        pos++; // step over the closing quote
        if (phrase.length > 0) {
          const words = phrase.split(/\s+/).map(t => sanitizeFTS5Term(t)).filter(t => t).join(' ');
          if (words) clause = `"${words}"`; // exact phrase, no prefix match
        }
      } else {
        // Bare token: runs until whitespace or a quote
        const begin = pos;
        while (pos < input.length && !/[\s"]/.test(input[pos]!)) pos++;
        const token = input.slice(begin, pos);

        if (isHyphenatedToken(token)) {
          // Split into a phrase so the tokenizer's word split is matched
          const words = sanitizeHyphenatedTerm(token);
          if (words) clause = `"${words}"`; // phrase match, no prefix
        } else {
          const word = sanitizeFTS5Term(token);
          if (word) clause = `"${word}"*`; // prefix match
        }
      }

      if (clause) bucket.push(clause);
    }

    // Without a positive clause there is nothing NOT could apply to
    if (include.length === 0) return null;

    // "<a> AND <b> NOT <x> NOT <y>"
    return [include.join(' AND '), ...exclude].join(' NOT ');
  }
  /**
   * Reject vec/hyde queries that use lex-only negation syntax.
   *
   * A `-` counts as negation only when it sits at the very start of the query
   * or right after whitespace and is immediately followed by a word character
   * or a double quote. Hyphens inside words (`auto-archived`, `pre-commit`,
   * `state-of-the-art`) are ordinary text and pass.
   *
   * @returns An error message when negation is detected, otherwise null
   */
  export function validateSemanticQuery(query: string): string | null {
    const negatesTerm = /(?:^|\s)-\w/.test(query);
    const negatesPhrase = /(?:^|\s)-"/.test(query);

    if (!negatesTerm && !negatesPhrase) return null;
    return 'Negation (-term) is not supported in vec/hyde queries. Use lex for exclusions.';
  }
  /**
   * Check a lex query for structural problems before it is parsed.
   *
   * Two conditions are rejected, in this order: the query contains a line
   * break, or it contains an odd number of double quotes.
   *
   * @returns An error message describing the first problem, otherwise null
   */
  export function validateLexQuery(query: string): string | null {
    const hasLineBreak = /[\r\n]/.test(query);
    if (hasLineBreak) {
      return 'Lex queries must be a single line. Remove newline characters or split into separate lex: lines.';
    }

    const quotes = query.match(/"/g) ?? [];
    const unbalanced = quotes.length % 2 === 1;
    return unbalanced
      ? 'Lex query has an unmatched double quote ("). Add the closing quote or remove it.'
      : null;
  }
  /**
   * Full-text search over indexed documents using FTS5 / BM25.
   *
   * The user query is translated by buildFTS5Query; when it yields no usable
   * query (empty or negation-only) the result is an empty list.
   *
   * Both LIMIT clauses are bound parameters; no caller-supplied value is
   * interpolated into the SQL text.
   *
   * @param db Database instance
   * @param query Lex query as typed by the user
   * @param limit Maximum number of results to return
   * @param collectionName Optional collection to restrict results to
   * @returns Results ordered best-first, each with a score in [0, 1)
   */
  export function searchFTS(db: Database, query: string, limit: number = 20, collectionName?: string): SearchResult[] {
    const ftsQuery = buildFTS5Query(query);
    if (!ftsQuery) return [];

    // When filtering by collection, fetch extra candidates from the FTS index
    // since some will be filtered out. Without a collection filter we can
    // fetch exactly the requested limit.
    const ftsLimit = collectionName ? limit * 10 : limit;
    const params: (string | number)[] = [ftsQuery, ftsLimit];

    // Use a CTE to force FTS5 to run first, then filter by collection.
    // Without the CTE, SQLite's query planner combines FTS5 MATCH with the
    // collection filter in a single WHERE clause, which can cause it to
    // abandon the FTS5 index and fall back to a full scan — turning an 8ms
    // query into a 17-second query on large collections.
    let sql = `
      WITH fts_matches AS (
        SELECT rowid, bm25(documents_fts, 1.5, 4.0, 1.0) as bm25_score
        FROM documents_fts
        WHERE documents_fts MATCH ?
        ORDER BY bm25_score ASC
        LIMIT ?
      )
      SELECT
        'qmd://' || d.collection || '/' || d.path as filepath,
        d.collection || '/' || d.path as display_path,
        d.title,
        content.doc as body,
        d.hash,
        fm.bm25_score
      FROM fts_matches fm
      JOIN documents d ON d.id = fm.rowid
      JOIN content ON content.hash = d.hash
      WHERE d.active = 1
    `;
    if (collectionName) {
      sql += ` AND d.collection = ?`;
      params.push(String(collectionName));
    }
    // bm25 lower is better; sort ascending.
    sql += ` ORDER BY fm.bm25_score ASC LIMIT ?`;
    params.push(limit);

    const rows = db.prepare(sql).all(...params) as { filepath: string; display_path: string; title: string; body: string; hash: string; bm25_score: number }[];

    return rows.map(row => {
      // Collection is the first segment after "qmd://" in the virtual path.
      const rowCollection = row.filepath.split('//')[1]?.split('/')[0] || "";
      // Convert bm25 (negative, lower is better) into a stable [0..1) score where higher is better.
      // |x| / (1 + |x|) maps: strong(-10)→0.91, medium(-2)→0.67, weak(-0.5)→0.33, none(0)→0.
      // Monotonic and query-independent — no per-query normalization needed.
      const magnitude = Math.abs(row.bm25_score);
      const score = magnitude / (1 + magnitude);
      return {
        filepath: row.filepath,
        displayPath: row.display_path,
        title: row.title,
        hash: row.hash,
        docid: getDocid(row.hash),
        collectionName: rowCollection,
        modifiedAt: "", // Not available in FTS query
        bodyLength: row.body.length,
        body: row.body,
        context: getContextForFile(db, row.filepath),
        score,
        source: "fts" as const,
      };
    });
  }
  2983. // =============================================================================
  2984. // Vector Search
  2985. // =============================================================================
  /**
   * Vector similarity search over embedded document chunks.
   *
   * Returns an empty list when the `vectors_vec` table does not exist, when no
   * embedding can be produced for the query, or when nothing matches.
   * A precomputed embedding, when supplied, is used instead of embedding the
   * query text. The vector index is asked for `limit * 3` nearest chunks;
   * chunks are then resolved to active documents (optionally restricted to
   * one collection), reduced to the closest chunk per file, and the best
   * `limit` files are returned with `score = 1 - distance`.
   */
  export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider): Promise<SearchResult[]> {
    type DocRow = {
      hash_seq: string; hash: string; pos: number; filepath: string;
      display_path: string; title: string; body: string;
    };

    const vecTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (!vecTable) return [];

    const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session, undefined, embedProvider);
    if (!embedding) return [];

    // IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables
    // hang indefinitely when combined with JOINs in the same query. Do NOT try to
    // "optimize" this by combining into a single query with JOINs - it will break.
    // See: https://github.com/tobi/qmd/pull/23

    // Step 1: nearest chunks straight from sqlite-vec (no JOINs allowed)
    const neighbours = db.prepare(`
      SELECT hash_seq, distance
      FROM vectors_vec
      WHERE embedding MATCH ? AND k = ?
    `).all(new Float32Array(embedding), limit * 3) as { hash_seq: string; distance: number }[];
    if (neighbours.length === 0) return [];

    // Step 2: resolve chunk keys to documents
    const chunkKeys: string[] = [];
    const distanceByChunk = new Map<string, number>();
    for (const neighbour of neighbours) {
      chunkKeys.push(neighbour.hash_seq);
      distanceByChunk.set(neighbour.hash_seq, neighbour.distance);
    }

    const placeholders = chunkKeys.map(() => '?').join(',');
    let docSql = `
      SELECT
        cv.hash || '_' || cv.seq as hash_seq,
        cv.hash,
        cv.pos,
        'qmd://' || d.collection || '/' || d.path as filepath,
        d.collection || '/' || d.path as display_path,
        d.title,
        content.doc as body
      FROM content_vectors cv
      JOIN documents d ON d.hash = cv.hash AND d.active = 1
      JOIN content ON content.hash = d.hash
      WHERE cv.hash || '_' || cv.seq IN (${placeholders})
    `;
    const params: string[] = [...chunkKeys];
    if (collectionName) {
      docSql += ` AND d.collection = ?`;
      params.push(collectionName);
    }
    const docRows = db.prepare(docSql).all(...params) as DocRow[];

    // Keep only the closest chunk for each file
    const bestPerFile = new Map<string, { row: DocRow; bestDist: number }>();
    for (const row of docRows) {
      const distance = distanceByChunk.get(row.hash_seq) ?? 1;
      const current = bestPerFile.get(row.filepath);
      if (current && !(distance < current.bestDist)) continue;
      bestPerFile.set(row.filepath, { row, bestDist: distance });
    }

    const ranked = Array.from(bestPerFile.values());
    ranked.sort((left, right) => left.bestDist - right.bestDist);

    return ranked.slice(0, limit).map(({ row, bestDist }) => {
      // Collection is the first segment after "qmd://" in the virtual path
      const rowCollection = row.filepath.split('//')[1]?.split('/')[0] || "";
      return {
        filepath: row.filepath,
        displayPath: row.display_path,
        title: row.title,
        hash: row.hash,
        docid: getDocid(row.hash),
        collectionName: rowCollection,
        modifiedAt: "", // Not available in vec query
        bodyLength: row.body.length,
        body: row.body,
        context: getContextForFile(db, row.filepath),
        score: 1 - bestDist, // Cosine similarity = 1 - cosine distance
        source: "vec" as const,
        chunkPos: row.pos,
      };
    });
  }
  3061. // =============================================================================
  3062. // Embeddings
  3063. // =============================================================================
  /**
   * Produce an embedding vector for a piece of text.
   *
   * Routing:
   * - With an `embedProvider`, the text is formatted for the provider's own
   *   model id and embedded through the provider; the local LLM is not used.
   * - Otherwise the text is formatted for `model` and embedded through the
   *   session when one is given, else through `llmOverride`, else through
   *   the default LlamaCpp instance.
   *
   * `isQuery` selects the query or the document prompt template.
   *
   * @returns The embedding, or null when the backend returns none
   */
  async function getEmbedding(text: string, model: string, isQuery: boolean, session?: ILLMSession, llmOverride?: LlamaCpp, embedProvider?: EmbeddingProvider): Promise<number[] | null> {
    if (embedProvider) {
      // Format against the provider's model so query/index prefixes stay in parity.
      const providerModel = embedProvider.getModelId();
      let providerInput: string;
      if (isQuery) {
        providerInput = formatQueryForEmbedding(text, providerModel);
      } else {
        providerInput = formatDocForEmbedding(text, undefined, providerModel);
      }

      // Only forward an AbortSignal when the provider is local-backed;
      // remote providers manage their own timeouts and an LLM-session signal
      // would abort their HTTP request prematurely (i-08ovbvtb).
      const sig = embedProvider.kind === "local" ? session?.signal : undefined;
      const providerResult = await embedProvider.embed(
        providerInput,
        sig ? { model: providerModel, signal: sig } : { model: providerModel },
      );
      return providerResult?.embedding ?? null;
    }

    // Local path: apply the prompt template for the requested model
    const localInput = isQuery
      ? formatQueryForEmbedding(text, model)
      : formatDocForEmbedding(text, undefined, model);

    let localResult;
    if (session) {
      localResult = await session.embed(localInput, { model, isQuery });
    } else {
      const llm = llmOverride ?? getDefaultLlamaCpp();
      localResult = await llm.embed(localInput, { model, isQuery });
    }
    return localResult?.embedding || null;
  }
  3089. /**
  3090. * Get all unique content hashes that need embeddings (from active documents).
  3091. * Returns hash, document body, and a sample path for display purposes.
  3092. */
  3093. export function getHashesForEmbedding(db: Database): { hash: string; body: string; path: string }[] {
  3094. return db.prepare(`
  3095. SELECT d.hash, c.doc as body, MIN(d.path) as path
  3096. FROM documents d
  3097. JOIN content c ON d.hash = c.hash
  3098. LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
  3099. WHERE d.active = 1 AND v.hash IS NULL
  3100. GROUP BY d.hash
  3101. `).all() as { hash: string; body: string; path: string }[];
  3102. }
  3103. /**
  3104. * Clear all embeddings from the database (force re-index).
  3105. * Deletes all rows from content_vectors and drops the vectors_vec table.
  3106. */
  3107. export function clearAllEmbeddings(db: Database): void {
  3108. db.exec(`DELETE FROM content_vectors`);
  3109. db.exec(`DROP TABLE IF EXISTS vectors_vec`);
  3110. }
  3111. /**
  3112. * Get the distinct set of model identifiers present in `content_vectors`.
  3113. *
  3114. * Used by the embedding migration-safety guard: if a configured provider's
  3115. * `getModelId()` does not appear in this list (and the table is non-empty),
  3116. * we refuse to embed and ask the user to run `qmd embed -f` to rebuild.
  3117. *
  3118. * Returns `[]` when the table is empty (fresh DB) — in which case any
  3119. * provider is allowed.
  3120. */
  3121. export function getDistinctEmbeddingModels(db: Database): string[] {
  3122. const rows = db.prepare(
  3123. `SELECT DISTINCT model FROM content_vectors WHERE model IS NOT NULL`,
  3124. ).all() as { model: string }[];
  3125. return rows.map((r) => r.model).filter((m) => typeof m === "string" && m.length > 0);
  3126. }
  3127. /**
  3128. * Insert a single embedding into both content_vectors and vectors_vec tables.
  3129. * The hash_seq key is formatted as "hash_seq" for the vectors_vec table.
  3130. *
  3131. * content_vectors is inserted first so that getHashesForEmbedding (which checks
  3132. * only content_vectors) won't re-select the hash on a crash between the two inserts.
  3133. *
  3134. * vectors_vec uses DELETE + INSERT instead of INSERT OR REPLACE because sqlite-vec's
  3135. * vec0 virtual tables silently ignore the OR REPLACE conflict clause.
  3136. */
  3137. export function insertEmbedding(
  3138. db: Database,
  3139. hash: string,
  3140. seq: number,
  3141. pos: number,
  3142. embedding: Float32Array,
  3143. model: string,
  3144. embeddedAt: string
  3145. ): void {
  3146. const hashSeq = `${hash}_${seq}`;
  3147. // Insert content_vectors first — crash-safe ordering (see getHashesForEmbedding)
  3148. const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`);
  3149. insertContentVectorStmt.run(hash, seq, pos, model, embeddedAt);
  3150. // vec0 virtual tables don't support OR REPLACE — use DELETE + INSERT
  3151. const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
  3152. const insertVecStmt = db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
  3153. deleteVecStmt.run(hashSeq);
  3154. insertVecStmt.run(hashSeq, embedding);
  3155. }
  3156. // =============================================================================
  3157. // Query expansion
  3158. // =============================================================================
  3159. export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise<ExpandedQuery[]> {
  3160. // Check cache first — stored as JSON preserving types
  3161. const cacheKey = getCacheKey("expandQuery", { query, model, ...(intent && { intent }) });
  3162. const cached = getCachedResult(db, cacheKey);
  3163. if (cached) {
  3164. try {
  3165. const parsed = JSON.parse(cached) as any[];
  3166. // Migrate old cache format: { type, text } → { type, query }
  3167. if (parsed.length > 0 && parsed[0].query) {
  3168. return parsed as ExpandedQuery[];
  3169. } else if (parsed.length > 0 && parsed[0].text) {
  3170. return parsed.map((r: any) => ({ type: r.type, query: r.text }));
  3171. }
  3172. } catch {
  3173. // Old cache format (pre-typed, newline-separated text) — re-expand
  3174. }
  3175. }
  3176. const llm = llmOverride ?? getDefaultLlamaCpp();
  3177. // Note: LlamaCpp uses hardcoded model, model parameter is ignored
  3178. const results = await llm.expandQuery(query, { intent });
  3179. // Map Queryable[] → ExpandedQuery[] (same shape, decoupled from llm.ts internals).
  3180. // Filter out entries that duplicate the original query text.
  3181. const expanded: ExpandedQuery[] = results
  3182. .filter(r => r.text !== query)
  3183. .map(r => ({ type: r.type, query: r.text }));
  3184. if (expanded.length > 0) {
  3185. setCachedResult(db, cacheKey, JSON.stringify(expanded));
  3186. }
  3187. return expanded;
  3188. }
  3189. // =============================================================================
  3190. // Reranking
  3191. // =============================================================================
  3192. export async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise<{ file: string; score: number }[]> {
  3193. // Prepend intent to rerank query so the reranker scores with domain context
  3194. const rerankQuery = intent ? `${intent}\n\n${query}` : query;
  3195. const cachedResults: Map<string, number> = new Map();
  3196. const uncachedDocsByChunk: Map<string, RerankDocument> = new Map();
  3197. // Check cache for each document
  3198. // Cache key includes chunk text — different queries can select different chunks
  3199. // from the same file, and the reranker score depends on which chunk was sent.
  3200. // File path is excluded from the new cache key because the reranker score
  3201. // depends on the chunk content, not where it came from.
  3202. for (const doc of documents) {
  3203. const cacheKey = getCacheKey("rerank", { query: rerankQuery, model, chunk: doc.text });
  3204. const legacyCacheKey = getCacheKey("rerank", { query, file: doc.file, model, chunk: doc.text });
  3205. const cached = getCachedResult(db, cacheKey) ?? getCachedResult(db, legacyCacheKey);
  3206. if (cached !== null) {
  3207. cachedResults.set(doc.text, parseFloat(cached));
  3208. } else {
  3209. uncachedDocsByChunk.set(doc.text, { file: doc.file, text: doc.text });
  3210. }
  3211. }
  3212. // Rerank uncached documents using LlamaCpp
  3213. if (uncachedDocsByChunk.size > 0) {
  3214. const llm = llmOverride ?? getDefaultLlamaCpp();
  3215. const uncachedDocs = [...uncachedDocsByChunk.values()];
  3216. const rerankResult = await llm.rerank(rerankQuery, uncachedDocs, { model });
  3217. // Cache results by chunk text so identical chunks across files are scored once.
  3218. const textByFile = new Map(uncachedDocs.map(d => [d.file, d.text]));
  3219. for (const result of rerankResult.results) {
  3220. const chunk = textByFile.get(result.file) || "";
  3221. const cacheKey = getCacheKey("rerank", { query: rerankQuery, model, chunk });
  3222. setCachedResult(db, cacheKey, result.score.toString());
  3223. cachedResults.set(chunk, result.score);
  3224. }
  3225. }
  3226. // Return all results sorted by score
  3227. return documents
  3228. .map(doc => ({ file: doc.file, score: cachedResults.get(doc.text) || 0 }))
  3229. .sort((a, b) => b.score - a.score);
  3230. }
  3231. // =============================================================================
  3232. // Reciprocal Rank Fusion
  3233. // =============================================================================
  3234. export function reciprocalRankFusion(
  3235. resultLists: RankedResult[][],
  3236. weights: number[] = [],
  3237. k: number = 60
  3238. ): RankedResult[] {
  3239. const scores = new Map<string, { result: RankedResult; rrfScore: number; topRank: number }>();
  3240. for (let listIdx = 0; listIdx < resultLists.length; listIdx++) {
  3241. const list = resultLists[listIdx];
  3242. if (!list) continue;
  3243. const weight = weights[listIdx] ?? 1.0;
  3244. for (let rank = 0; rank < list.length; rank++) {
  3245. const result = list[rank];
  3246. if (!result) continue;
  3247. const rrfContribution = weight / (k + rank + 1);
  3248. const existing = scores.get(result.file);
  3249. if (existing) {
  3250. existing.rrfScore += rrfContribution;
  3251. existing.topRank = Math.min(existing.topRank, rank);
  3252. } else {
  3253. scores.set(result.file, {
  3254. result,
  3255. rrfScore: rrfContribution,
  3256. topRank: rank,
  3257. });
  3258. }
  3259. }
  3260. }
  3261. // Top-rank bonus
  3262. for (const entry of scores.values()) {
  3263. if (entry.topRank === 0) {
  3264. entry.rrfScore += 0.05;
  3265. } else if (entry.topRank <= 2) {
  3266. entry.rrfScore += 0.02;
  3267. }
  3268. }
  3269. return Array.from(scores.values())
  3270. .sort((a, b) => b.rrfScore - a.rrfScore)
  3271. .map(e => ({ ...e.result, score: e.rrfScore }));
  3272. }
  3273. /**
  3274. * Build per-document RRF contribution traces for explain/debug output.
  3275. */
  3276. export function buildRrfTrace(
  3277. resultLists: RankedResult[][],
  3278. weights: number[] = [],
  3279. listMeta: RankedListMeta[] = [],
  3280. k: number = 60
  3281. ): Map<string, RRFScoreTrace> {
  3282. const traces = new Map<string, RRFScoreTrace>();
  3283. for (let listIdx = 0; listIdx < resultLists.length; listIdx++) {
  3284. const list = resultLists[listIdx];
  3285. if (!list) continue;
  3286. const weight = weights[listIdx] ?? 1.0;
  3287. const meta = listMeta[listIdx] ?? {
  3288. source: "fts",
  3289. queryType: "original",
  3290. query: "",
  3291. } as const;
  3292. for (let rank0 = 0; rank0 < list.length; rank0++) {
  3293. const result = list[rank0];
  3294. if (!result) continue;
  3295. const rank = rank0 + 1; // 1-indexed rank for explain output
  3296. const contribution = weight / (k + rank);
  3297. const existing = traces.get(result.file);
  3298. const detail: RRFContributionTrace = {
  3299. listIndex: listIdx,
  3300. source: meta.source,
  3301. queryType: meta.queryType,
  3302. query: meta.query,
  3303. rank,
  3304. weight,
  3305. backendScore: result.score,
  3306. rrfContribution: contribution,
  3307. };
  3308. if (existing) {
  3309. existing.baseScore += contribution;
  3310. existing.topRank = Math.min(existing.topRank, rank);
  3311. existing.contributions.push(detail);
  3312. } else {
  3313. traces.set(result.file, {
  3314. contributions: [detail],
  3315. baseScore: contribution,
  3316. topRank: rank,
  3317. topRankBonus: 0,
  3318. totalScore: 0,
  3319. });
  3320. }
  3321. }
  3322. }
  3323. for (const trace of traces.values()) {
  3324. let bonus = 0;
  3325. if (trace.topRank === 1) bonus = 0.05;
  3326. else if (trace.topRank <= 3) bonus = 0.02;
  3327. trace.topRankBonus = bonus;
  3328. trace.totalScore = trace.baseScore + bonus;
  3329. }
  3330. return traces;
  3331. }
  3332. // =============================================================================
  3333. // Document retrieval
  3334. // =============================================================================
  3335. type DbDocRow = {
  3336. virtual_path: string;
  3337. display_path: string;
  3338. title: string;
  3339. hash: string;
  3340. collection: string;
  3341. path: string;
  3342. modified_at: string;
  3343. body_length: number;
  3344. body?: string;
  3345. };
  3346. /**
  3347. * Find a document by filename/path, docid (#hash), or with fuzzy matching.
  3348. * Returns document metadata without body by default.
  3349. *
  3350. * Supports:
  3351. * - Virtual paths: qmd://collection/path/to/file.md
  3352. * - Absolute paths: /path/to/file.md
  3353. * - Relative paths: path/to/file.md
  3354. * - Short docid: #abc123 (first 6 chars of hash)
  3355. */
  3356. export function findDocument(db: Database, filename: string, options: { includeBody?: boolean } = {}): DocumentResult | DocumentNotFound {
  3357. let filepath = filename;
  3358. const colonMatch = filepath.match(/:(\d+)$/);
  3359. if (colonMatch) {
  3360. filepath = filepath.slice(0, -colonMatch[0].length);
  3361. }
  3362. // Check if this is a docid lookup (#abc123, abc123, "#abc123", "abc123", etc.)
  3363. if (isDocid(filepath)) {
  3364. const docidMatch = findDocumentByDocid(db, filepath);
  3365. if (docidMatch) {
  3366. filepath = docidMatch.filepath;
  3367. } else {
  3368. return { error: "not_found", query: filename, similarFiles: [] };
  3369. }
  3370. }
  3371. if (filepath.startsWith('~/')) {
  3372. filepath = homedir() + filepath.slice(1);
  3373. }
  3374. const bodyCol = options.includeBody ? `, content.doc as body` : ``;
  3375. // Build computed columns
  3376. // Note: absoluteFilepath is computed from YAML collections after query
  3377. const selectCols = `
  3378. 'qmd://' || d.collection || '/' || d.path as virtual_path,
  3379. d.collection || '/' || d.path as display_path,
  3380. d.title,
  3381. d.hash,
  3382. d.collection,
  3383. d.modified_at,
  3384. LENGTH(content.doc) as body_length
  3385. ${bodyCol}
  3386. `;
  3387. // Try to match by virtual path first
  3388. let doc = db.prepare(`
  3389. SELECT ${selectCols}
  3390. FROM documents d
  3391. JOIN content ON content.hash = d.hash
  3392. WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
  3393. `).get(filepath) as DbDocRow | null;
  3394. // Try fuzzy match by virtual path
  3395. if (!doc) {
  3396. doc = db.prepare(`
  3397. SELECT ${selectCols}
  3398. FROM documents d
  3399. JOIN content ON content.hash = d.hash
  3400. WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? AND d.active = 1
  3401. LIMIT 1
  3402. `).get(`%${filepath}`) as DbDocRow | null;
  3403. }
  3404. // Try to match by absolute path (requires looking up collection paths from DB)
  3405. if (!doc && !filepath.startsWith('qmd://')) {
  3406. const collections = getStoreCollections(db);
  3407. for (const coll of collections) {
  3408. let relativePath: string | null = null;
  3409. // If filepath is absolute and starts with collection path, extract relative part
  3410. if (filepath.startsWith(coll.path + '/')) {
  3411. relativePath = filepath.slice(coll.path.length + 1);
  3412. }
  3413. // Otherwise treat filepath as relative to collection
  3414. else if (!filepath.startsWith('/')) {
  3415. relativePath = filepath;
  3416. }
  3417. if (relativePath) {
  3418. doc = db.prepare(`
  3419. SELECT ${selectCols}
  3420. FROM documents d
  3421. JOIN content ON content.hash = d.hash
  3422. WHERE d.collection = ? AND d.path = ? AND d.active = 1
  3423. `).get(coll.name, relativePath) as DbDocRow | null;
  3424. if (doc) break;
  3425. }
  3426. }
  3427. }
  3428. if (!doc) {
  3429. const similar = findSimilarFiles(db, filepath, 5, 5);
  3430. return { error: "not_found", query: filename, similarFiles: similar };
  3431. }
  3432. // Get context using virtual path
  3433. const virtualPath = doc.virtual_path || `qmd://${doc.collection}/${doc.display_path}`;
  3434. const context = getContextForFile(db, virtualPath);
  3435. return {
  3436. filepath: virtualPath,
  3437. displayPath: doc.display_path,
  3438. title: doc.title,
  3439. context,
  3440. hash: doc.hash,
  3441. docid: getDocid(doc.hash),
  3442. collectionName: doc.collection,
  3443. modifiedAt: doc.modified_at,
  3444. bodyLength: doc.body_length,
  3445. ...(options.includeBody && doc.body !== undefined && { body: doc.body }),
  3446. };
  3447. }
  3448. /**
  3449. * Get the body content for a document
  3450. * Optionally slice by line range
  3451. */
  3452. export function getDocumentBody(db: Database, doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number): string | null {
  3453. const filepath = doc.filepath;
  3454. // Try to resolve document by filepath (absolute or virtual)
  3455. let row: { body: string } | null = null;
  3456. // Try virtual path first
  3457. if (filepath.startsWith('qmd://')) {
  3458. row = db.prepare(`
  3459. SELECT content.doc as body
  3460. FROM documents d
  3461. JOIN content ON content.hash = d.hash
  3462. WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
  3463. `).get(filepath) as { body: string } | null;
  3464. }
  3465. // Try absolute path by looking up in DB store_collections
  3466. if (!row) {
  3467. const collections = getStoreCollections(db);
  3468. for (const coll of collections) {
  3469. if (filepath.startsWith(coll.path + '/')) {
  3470. const relativePath = filepath.slice(coll.path.length + 1);
  3471. row = db.prepare(`
  3472. SELECT content.doc as body
  3473. FROM documents d
  3474. JOIN content ON content.hash = d.hash
  3475. WHERE d.collection = ? AND d.path = ? AND d.active = 1
  3476. `).get(coll.name, relativePath) as { body: string } | null;
  3477. if (row) break;
  3478. }
  3479. }
  3480. }
  3481. if (!row) return null;
  3482. let body = row.body;
  3483. if (fromLine !== undefined || maxLines !== undefined) {
  3484. const lines = body.split('\n');
  3485. const start = (fromLine || 1) - 1;
  3486. const end = maxLines !== undefined ? start + maxLines : lines.length;
  3487. body = lines.slice(start, end).join('\n');
  3488. }
  3489. return body;
  3490. }
  3491. /**
  3492. * Find multiple documents by glob pattern or comma-separated list
  3493. * Returns documents without body by default (use getDocumentBody to load)
  3494. */
  3495. export function findDocuments(
  3496. db: Database,
  3497. pattern: string,
  3498. options: { includeBody?: boolean; maxBytes?: number } = {}
  3499. ): { docs: MultiGetResult[]; errors: string[] } {
  3500. const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?') && !pattern.includes('{');
  3501. const errors: string[] = [];
  3502. const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES;
  3503. const bodyCol = options.includeBody ? `, content.doc as body` : ``;
  3504. const selectCols = `
  3505. 'qmd://' || d.collection || '/' || d.path as virtual_path,
  3506. d.collection || '/' || d.path as display_path,
  3507. d.title,
  3508. d.hash,
  3509. d.collection,
  3510. d.modified_at,
  3511. LENGTH(content.doc) as body_length
  3512. ${bodyCol}
  3513. `;
  3514. let fileRows: DbDocRow[];
  3515. if (isCommaSeparated) {
  3516. const names = pattern.split(',').map(s => s.trim()).filter(Boolean);
  3517. fileRows = [];
  3518. for (const name of names) {
  3519. let doc = db.prepare(`
  3520. SELECT ${selectCols}
  3521. FROM documents d
  3522. JOIN content ON content.hash = d.hash
  3523. WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
  3524. `).get(name) as DbDocRow | null;
  3525. if (!doc) {
  3526. doc = db.prepare(`
  3527. SELECT ${selectCols}
  3528. FROM documents d
  3529. JOIN content ON content.hash = d.hash
  3530. WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? AND d.active = 1
  3531. LIMIT 1
  3532. `).get(`%${name}`) as DbDocRow | null;
  3533. }
  3534. if (doc) {
  3535. fileRows.push(doc);
  3536. } else {
  3537. const similar = findSimilarFiles(db, name, 5, 3);
  3538. let msg = `File not found: ${name}`;
  3539. if (similar.length > 0) {
  3540. msg += ` (did you mean: ${similar.join(', ')}?)`;
  3541. }
  3542. errors.push(msg);
  3543. }
  3544. }
  3545. } else {
  3546. // Glob pattern match
  3547. const matched = matchFilesByGlob(db, pattern);
  3548. if (matched.length === 0) {
  3549. errors.push(`No files matched pattern: ${pattern}`);
  3550. return { docs: [], errors };
  3551. }
  3552. const virtualPaths = matched.map(m => m.filepath);
  3553. const placeholders = virtualPaths.map(() => '?').join(',');
  3554. fileRows = db.prepare(`
  3555. SELECT ${selectCols}
  3556. FROM documents d
  3557. JOIN content ON content.hash = d.hash
  3558. WHERE 'qmd://' || d.collection || '/' || d.path IN (${placeholders}) AND d.active = 1
  3559. `).all(...virtualPaths) as DbDocRow[];
  3560. }
  3561. const results: MultiGetResult[] = [];
  3562. for (const row of fileRows) {
  3563. // Get context using virtual path
  3564. const virtualPath = row.virtual_path || `qmd://${row.collection}/${row.display_path}`;
  3565. const context = getContextForFile(db, virtualPath);
  3566. if (row.body_length > maxBytes) {
  3567. results.push({
  3568. doc: { filepath: virtualPath, displayPath: row.display_path },
  3569. skipped: true,
  3570. skipReason: `File too large (${Math.round(row.body_length / 1024)}KB > ${Math.round(maxBytes / 1024)}KB)`,
  3571. });
  3572. continue;
  3573. }
  3574. results.push({
  3575. doc: {
  3576. filepath: virtualPath,
  3577. displayPath: row.display_path,
  3578. title: row.title || row.display_path.split('/').pop() || row.display_path,
  3579. context,
  3580. hash: row.hash,
  3581. docid: getDocid(row.hash),
  3582. collectionName: row.collection,
  3583. modifiedAt: row.modified_at,
  3584. bodyLength: row.body_length,
  3585. ...(options.includeBody && row.body !== undefined && { body: row.body }),
  3586. },
  3587. skipped: false,
  3588. });
  3589. }
  3590. return { docs: results, errors };
  3591. }
  3592. // =============================================================================
  3593. // Status
  3594. // =============================================================================
  3595. export function getStatus(db: Database): IndexStatus {
  3596. // DB is source of truth for collections — config provides supplementary metadata
  3597. const dbCollections = db.prepare(`
  3598. SELECT
  3599. collection as name,
  3600. COUNT(*) as active_count,
  3601. MAX(modified_at) as last_doc_update
  3602. FROM documents
  3603. WHERE active = 1
  3604. GROUP BY collection
  3605. `).all() as { name: string; active_count: number; last_doc_update: string | null }[];
  3606. // Build a lookup from store_collections for path/pattern metadata
  3607. const storeCollections = getStoreCollections(db);
  3608. const configLookup = new Map(storeCollections.map(c => [c.name, { path: c.path, pattern: c.pattern }]));
  3609. const collections: CollectionInfo[] = dbCollections.map(row => {
  3610. const config = configLookup.get(row.name);
  3611. return {
  3612. name: row.name,
  3613. path: config?.path ?? null,
  3614. pattern: config?.pattern ?? null,
  3615. documents: row.active_count,
  3616. lastUpdated: row.last_doc_update || new Date().toISOString(),
  3617. };
  3618. });
  3619. // Sort by last update time (most recent first)
  3620. collections.sort((a, b) => {
  3621. if (!a.lastUpdated) return 1;
  3622. if (!b.lastUpdated) return -1;
  3623. return new Date(b.lastUpdated).getTime() - new Date(a.lastUpdated).getTime();
  3624. });
  3625. const totalDocs = (db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get() as { c: number }).c;
  3626. const needsEmbedding = getHashesNeedingEmbedding(db);
  3627. const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
  3628. return {
  3629. totalDocuments: totalDocs,
  3630. needsEmbedding,
  3631. hasVectorIndex: hasVectors,
  3632. collections,
  3633. };
  3634. }
  3635. // =============================================================================
  3636. // Snippet extraction
  3637. // =============================================================================
  3638. export type SnippetResult = {
  3639. line: number; // 1-indexed line number of best match
  3640. snippet: string; // The snippet text with diff-style header
  3641. linesBefore: number; // Lines in document before snippet
  3642. linesAfter: number; // Lines in document after snippet
  3643. snippetLines: number; // Number of lines in snippet
  3644. };
  3645. /** Weight for intent terms relative to query terms (1.0) in snippet scoring */
  3646. export const INTENT_WEIGHT_SNIPPET = 0.3;
  3647. /** Weight for intent terms relative to query terms (1.0) in chunk selection */
  3648. export const INTENT_WEIGHT_CHUNK = 0.5;
  3649. // Common stop words filtered from intent strings before tokenization.
  3650. // Seeded from finetune/reward.py KEY_TERM_STOPWORDS, extended with common
  3651. // 2-3 char function words so the length threshold can drop to >1 and let
  3652. // short domain terms (API, SQL, LLM, CPU, CDN, …) survive.
  3653. const INTENT_STOP_WORDS = new Set([
  3654. // 2-char function words
  3655. "am", "an", "as", "at", "be", "by", "do", "he", "if",
  3656. "in", "is", "it", "me", "my", "no", "of", "on", "or", "so",
  3657. "to", "up", "us", "we",
  3658. // 3-char function words
  3659. "all", "and", "any", "are", "but", "can", "did", "for", "get",
  3660. "has", "her", "him", "his", "how", "its", "let", "may", "not",
  3661. "our", "out", "the", "too", "was", "who", "why", "you",
  3662. // 4+ char common words
  3663. "also", "does", "find", "from", "have", "into", "more", "need",
  3664. "show", "some", "tell", "that", "them", "this", "want", "what",
  3665. "when", "will", "with", "your",
  3666. // Search-context noise
  3667. "about", "looking", "notes", "search", "where", "which",
  3668. ]);
  3669. /**
  3670. * Extract meaningful terms from an intent string, filtering stop words and punctuation.
  3671. * Uses Unicode-aware punctuation stripping so domain terms like "API" survive.
  3672. * Returns lowercase terms suitable for text matching.
  3673. */
  3674. export function extractIntentTerms(intent: string): string[] {
  3675. return intent.toLowerCase().split(/\s+/)
  3676. .map(t => t.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, ""))
  3677. .filter(t => t.length > 1 && !INTENT_STOP_WORDS.has(t));
  3678. }
  3679. export function extractSnippet(body: string, query: string, maxLen = 500, chunkPos?: number, chunkLen?: number, intent?: string): SnippetResult {
  3680. const totalLines = body.split('\n').length;
  3681. let searchBody = body;
  3682. let lineOffset = 0;
  3683. if (chunkPos && chunkPos > 0) {
  3684. // Search within the chunk region, with some padding for context
  3685. // Use provided chunkLen or fall back to max chunk size (covers variable-length chunks)
  3686. const searchLen = chunkLen || CHUNK_SIZE_CHARS;
  3687. const contextStart = Math.max(0, chunkPos - 100);
  3688. const contextEnd = Math.min(body.length, chunkPos + searchLen + 100);
  3689. searchBody = body.slice(contextStart, contextEnd);
  3690. if (contextStart > 0) {
  3691. lineOffset = body.slice(0, contextStart).split('\n').length - 1;
  3692. }
  3693. }
  3694. const lines = searchBody.split('\n');
  3695. const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 0);
  3696. const intentTerms = intent ? extractIntentTerms(intent) : [];
  3697. let bestLine = 0, bestScore = -1;
  3698. for (let i = 0; i < lines.length; i++) {
  3699. const lineLower = (lines[i] ?? "").toLowerCase();
  3700. let score = 0;
  3701. for (const term of queryTerms) {
  3702. if (lineLower.includes(term)) score += 1.0;
  3703. }
  3704. for (const term of intentTerms) {
  3705. if (lineLower.includes(term)) score += INTENT_WEIGHT_SNIPPET;
  3706. }
  3707. if (score > bestScore) {
  3708. bestScore = score;
  3709. bestLine = i;
  3710. }
  3711. }
  3712. const start = Math.max(0, bestLine - 1);
  3713. const end = Math.min(lines.length, bestLine + 3);
  3714. const snippetLines = lines.slice(start, end);
  3715. let snippetText = snippetLines.join('\n');
  3716. // If we focused on a chunk window and it produced an empty/whitespace-only snippet,
  3717. // fall back to a full-document snippet so we always show something useful.
  3718. if (chunkPos && chunkPos > 0 && snippetText.trim().length === 0) {
  3719. return extractSnippet(body, query, maxLen, undefined, undefined, intent);
  3720. }
  3721. if (snippetText.length > maxLen) snippetText = snippetText.substring(0, maxLen - 3) + "...";
  3722. const absoluteStart = lineOffset + start + 1; // 1-indexed
  3723. const snippetLineCount = snippetLines.length;
  3724. const linesBefore = absoluteStart - 1;
  3725. const linesAfter = totalLines - (absoluteStart + snippetLineCount - 1);
  3726. // Format with diff-style header: @@ -start,count @@ (linesBefore before, linesAfter after)
  3727. const header = `@@ -${absoluteStart},${snippetLineCount} @@ (${linesBefore} before, ${linesAfter} after)`;
  3728. const snippet = `${header}\n${snippetText}`;
  3729. return {
  3730. line: lineOffset + bestLine + 1,
  3731. snippet,
  3732. linesBefore,
  3733. linesAfter,
  3734. snippetLines: snippetLineCount,
  3735. };
  3736. }
  3737. // =============================================================================
  3738. // Shared helpers (used by both CLI and MCP)
  3739. // =============================================================================
  3740. /**
  3741. * Add line numbers to text content.
  3742. * Each line becomes: "{lineNum}: {content}"
  3743. */
  3744. export function addLineNumbers(text: string, startLine: number = 1): string {
  3745. const lines = text.split('\n');
  3746. return lines.map((line, i) => `${startLine + i}: ${line}`).join('\n');
  3747. }
  3748. // =============================================================================
  3749. // Shared search orchestration
  3750. //
  3751. // hybridQuery() and vectorSearchQuery() are standalone functions (not Store
  3752. // methods) because they are orchestration over primitives — same rationale as
  3753. // reciprocalRankFusion(). They take a Store as first argument so both CLI
  3754. // and MCP can share the identical pipeline.
  3755. // =============================================================================
  3756. /**
  3757. * Optional progress hooks for search orchestration.
  3758. * CLI wires these to stderr for user feedback; MCP leaves them unset.
  3759. */
  3760. export interface SearchHooks {
  3761. /** BM25 probe found strong signal — expansion will be skipped */
  3762. onStrongSignal?: (topScore: number) => void;
  3763. /** Query expansion starting */
  3764. onExpandStart?: () => void;
  3765. /** Query expansion complete. Empty array = strong signal skip. elapsedMs = time taken. */
  3766. onExpand?: (original: string, expanded: ExpandedQuery[], elapsedMs: number) => void;
  3767. /** Embedding starting (vec/hyde queries) */
  3768. onEmbedStart?: (count: number) => void;
  3769. /** Embedding complete */
  3770. onEmbedDone?: (elapsedMs: number) => void;
  3771. /** Reranking is about to start */
  3772. onRerankStart?: (chunkCount: number) => void;
  3773. /** Reranking finished */
  3774. onRerankDone?: (elapsedMs: number) => void;
  3775. }
  /** Tuning knobs accepted by `hybridQuery()`. Every field is optional. */
  export interface HybridQueryOptions {
    /** Restrict the search to a single collection. */
    collection?: string;

    /** Maximum number of results to return (default 10). */
    limit?: number;

    /** Drop results whose final score is below this value (default 0). */
    minScore?: number;

    /** How many fused candidates are kept for chunking/reranking (default RERANK_CANDIDATE_LIMIT). */
    candidateLimit?: number;

    /** Attach backend/RRF/rerank score traces to each result. */
    explain?: boolean;

    /** Domain intent hint used for disambiguation. */
    intent?: string;

    /** Skip LLM reranking and score by RRF position only. */
    skipRerank?: boolean;

    /** Chunking strategy forwarded to the document chunker. */
    chunkStrategy?: ChunkStrategy;

    /** Progress callbacks. */
    hooks?: SearchHooks;

    /**
     * Optional embedding provider for query-side encoding (i-loazq6ze).
     * When supplied, the original-query vector AND any vec/hyde expansion
     * variants are encoded through this provider (HTTP, GPU worker,
     * AutoFallback chain) instead of `getLlm(store).embedBatch(...)`.
     * Omit it to keep the pre-patch behavior (local LlamaCpp).
     */
    embedProvider?: EmbeddingProvider;
  }
  /** One document returned by `hybridQuery()` / `structuredSearch()`. */
  export interface HybridQueryResult {
    /** Internal filepath (qmd://collection/path). */
    file: string;

    /** Path suitable for display to the user. */
    displayPath: string;

    /** Document title. */
    title: string;

    /** Full document body (for snippet extraction). */
    body: string;

    /** Text of the best-matching chunk. */
    bestChunk: string;

    /** Character offset of the best chunk within `body`. */
    bestChunkPos: number;

    /** Blended score, full precision. */
    score: number;

    /** User-set context, or null when none applies. */
    context: string | null;

    /** Content hash prefix (6 chars). */
    docid: string;

    /** Score traces; only present when `explain` was requested. */
    explain?: HybridQueryExplain;
  }
  /**
   * Provenance of one ranked list fed into RRF: which backend produced it,
   * which kind of query it answered, and the query text itself.
   */
  export type RankedListMeta = {
    /** Backend that produced the list. */
    source: "fts" | "vec";

    /** Role of the query: the caller's original text or a typed expansion. */
    queryType: "original" | "lex" | "vec" | "hyde";

    /** The query text that was searched. */
    query: string;
  };
  /**
   * Hybrid search: BM25 + vector + query expansion + RRF + chunked reranking.
   *
   * Pipeline:
   * 1. BM25 probe → skip expansion if strong signal (never skipped when `intent` is set)
   * 2. expandQuery() → typed query variants (lex/vec/hyde)
   * 3. Type-routed search: original→FTS probe (reused) + vector, lex→FTS, vec/hyde→vector
   * 4. RRF fusion (lists for the ORIGINAL query weighted 2x) → slice to candidateLimit
   * 5. chunkDocumentAsync() + keyword-best-chunk selection
   * 6. rerank on chunks (NOT full bodies — O(tokens) trap)
   * 7. Position-aware score blending (RRF rank × reranker score)
   * 8. Dedup by file, filter by minScore, slice to limit
   *
   * @param store   Store supplying FTS/vector search, expansion, reranking and context lookup.
   * @param query   The caller's query text.
   * @param options Optional tuning; see HybridQueryOptions for defaults.
   * @returns Up to `limit` results, highest score first.
   */
  export async function hybridQuery(
    store: Store,
    query: string,
    options?: HybridQueryOptions
  ): Promise<HybridQueryResult[]> {
    const limit = options?.limit ?? 10;
    const minScore = options?.minScore ?? 0;
    const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
    const collection = options?.collection;
    const explain = options?.explain ?? false;
    const intent = options?.intent;
    const skipRerank = options?.skipRerank ?? false;
    const hooks = options?.hooks;
    const embedProvider = options?.embedProvider;
    const chunkStrategy = options?.chunkStrategy;

    // rankedLists and rankedListMeta are parallel arrays: entry i of one
    // describes entry i of the other.
    const rankedLists: RankedResult[][] = [];
    const rankedListMeta: RankedListMeta[] = [];
    const docidMap = new Map<string, string>(); // filepath -> docid

    // Records one backend result list (skipping empty ones) with its provenance.
    const addRankedList = (
      hits: {
        filepath: string;
        docid: string;
        displayPath: string;
        title: string;
        body?: string | null;
        score: number;
      }[],
      meta: RankedListMeta
    ): void => {
      if (hits.length === 0) return;
      const list: RankedResult[] = [];
      for (const hit of hits) {
        docidMap.set(hit.filepath, hit.docid);
        list.push({
          file: hit.filepath,
          displayPath: hit.displayPath,
          title: hit.title,
          body: hit.body || "",
          score: hit.score,
        });
      }
      rankedLists.push(list);
      rankedListMeta.push(meta);
    };

    const hasVectors = !!store.db.prepare(
      `SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
    ).get();

    // Step 1: BM25 probe — a strong signal skips the expensive LLM expansion.
    // When intent is provided the bypass is disabled: the obvious BM25 match
    // may not be what the caller wants (e.g. "performance" with intent
    // "web page load times" should NOT shortcut to a sports-performance doc).
    // The collection filter is passed into the FTS query rather than applied post-hoc.
    const probeHits = store.searchFTS(query, 20, collection);
    const topScore = probeHits[0]?.score ?? 0;
    const runnerUpScore = probeHits[1]?.score ?? 0;
    const hasStrongSignal =
      !intent &&
      probeHits.length > 0 &&
      topScore >= STRONG_SIGNAL_MIN_SCORE &&
      topScore - runnerUpScore >= STRONG_SIGNAL_MIN_GAP;
    if (hasStrongSignal) hooks?.onStrongSignal?.(topScore);

    // Step 2: Expand the query (or skip on strong signal — onExpand then gets []).
    hooks?.onExpandStart?.();
    const expandStart = Date.now();
    const expanded = hasStrongSignal
      ? []
      : await store.expandQuery(query, undefined, intent);
    hooks?.onExpand?.(query, expanded, Date.now() - expandStart);

    // Seed with the probe results so the original query's FTS is not re-run.
    addRankedList(probeHits, { source: "fts", queryType: "original", query });

    // Step 3a: FTS for every lex expansion (no LLM needed).
    for (const variant of expanded) {
      if (variant.type !== 'lex') continue;
      addRankedList(
        store.searchFTS(variant.query, 20, collection),
        { source: "fts", queryType: "lex", query: variant.query }
      );
    }

    // Step 3b: vector search for the original query plus vec/hyde expansions.
    if (hasVectors) {
      const vecQueries: { text: string; queryType: "original" | "vec" | "hyde" }[] = [
        { text: query, queryType: "original" },
      ];
      for (const variant of expanded) {
        if (variant.type === 'vec' || variant.type === 'hyde') {
          vecQueries.push({ text: variant.query, queryType: variant.type });
        }
      }

      // Embed everything in one batch. When `embedProvider` is supplied
      // (i-loazq6ze) the encode goes through it (HTTP / GPU worker /
      // AutoFallback chain) instead of the local llama-cpp model.
      const embedModelName = embedProvider
        ? embedProvider.getModelId()
        : getLlm(store).embedModelName;
      const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, embedModelName));
      hooks?.onEmbedStart?.(textsToEmbed.length);
      const embedStart = Date.now();
      const embeddings = embedProvider
        ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName })
        : await getLlm(store).embedBatch(textsToEmbed);
      hooks?.onEmbedDone?.(Date.now() - embedStart);

      // Vector lookups reuse the pre-computed embeddings; a query whose
      // embedding is missing is silently skipped.
      for (let i = 0; i < vecQueries.length; i++) {
        const vecQuery = vecQueries[i]!;
        const embedding = embeddings[i]?.embedding;
        if (!embedding) continue;
        const vecHits = await store.searchVec(
          vecQuery.text, DEFAULT_EMBED_MODEL, 20, collection,
          undefined, embedding
        );
        addRankedList(vecHits, {
          source: "vec",
          queryType: vecQuery.queryType,
          query: vecQuery.text,
        });
      }
    }

    // Step 4: RRF fusion — lists produced by the ORIGINAL query (FTS probe and
    // original-query vector search) get 2x weight. The weight is keyed on list
    // provenance, not on list position: lex-expansion lists are pushed before
    // the original-query vector list, so a positional "first two" rule would
    // boost a lex expansion instead of the original vector list.
    const weights = rankedListMeta.map(meta => (meta.queryType === "original" ? 2.0 : 1.0));
    const fused = reciprocalRankFusion(rankedLists, weights);
    const rrfTraceByFile = explain ? buildRrfTrace(rankedLists, weights, rankedListMeta) : null;
    const candidates = fused.slice(0, candidateLimit);
    if (candidates.length === 0) return [];

    // Step 5: Chunk each candidate and pick its best chunk for reranking.
    // Reranking full bodies is O(tokens) — chunks keep the reranker input small.
    const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
    const intentTerms = intent ? extractIntentTerms(intent) : [];

    // Keyword overlap: 1 per matching query term, INTENT_WEIGHT_CHUNK per matching intent term.
    const keywordOverlap = (text: string): number => {
      const haystack = text.toLowerCase();
      let overlap = 0;
      for (const term of queryTerms) {
        if (haystack.includes(term)) overlap += 1;
      }
      for (const term of intentTerms) {
        if (haystack.includes(term)) overlap += INTENT_WEIGHT_CHUNK;
      }
      return overlap;
    };

    const docChunkMap = new Map<string, { chunks: { text: string; pos: number }[]; bestIdx: number }>();
    for (const cand of candidates) {
      const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, chunkStrategy);
      if (chunks.length === 0) continue;
      // Strict ">" keeps the earliest chunk on ties (fallback: first chunk).
      let bestIdx = 0;
      let bestScore = -1;
      for (let i = 0; i < chunks.length; i++) {
        const overlap = keywordOverlap(chunks[i]!.text);
        if (overlap > bestScore) { bestScore = overlap; bestIdx = i; }
      }
      docChunkMap.set(cand.file, { chunks, bestIdx });
    }

    // Best chunk text/offset for a file, falling back to the body when no chunk exists.
    const bestChunkFor = (file: string, fallbackBody: string | undefined) => {
      const info = docChunkMap.get(file);
      const picked = info?.chunks[info.bestIdx];
      return {
        bestChunk: picked?.text || fallbackBody || "",
        bestChunkPos: picked?.pos || 0,
      };
    };

    // Score trace for one result; undefined unless `explain` was requested.
    const explainFor = (
      file: string,
      rank: number,
      positionScore: number,
      weight: number,
      rerankScore: number,
      blendedScore: number
    ): HybridQueryExplain | undefined => {
      if (!explain) return undefined;
      const trace = rrfTraceByFile?.get(file);
      return {
        ftsScores: trace?.contributions.filter(c => c.source === "fts").map(c => c.backendScore) ?? [],
        vectorScores: trace?.contributions.filter(c => c.source === "vec").map(c => c.backendScore) ?? [],
        rrf: {
          rank,
          positionScore,
          weight,
          baseScore: trace?.baseScore ?? 0,
          topRankBonus: trace?.topRankBonus ?? 0,
          totalScore: trace?.totalScore ?? 0,
          contributions: trace?.contributions ?? [],
        },
        rerankScore,
        blendedScore,
      };
    };

    // Step 8 (shared by both exits): dedup by file, apply minScore, cap at limit.
    const finalize = (rows: HybridQueryResult[]): HybridQueryResult[] => {
      const seenFiles = new Set<string>();
      const unique = rows.filter(row => {
        if (seenFiles.has(row.file)) return false;
        seenFiles.add(row.file);
        return true;
      });
      return unique.filter(row => row.score >= minScore).slice(0, limit);
    };

    if (skipRerank) {
      // No LLM reranking: score each candidate purely by its RRF position (1 / rank).
      const rrfOnly = candidates.map((cand, idx): HybridQueryResult => {
        const rrfRank = idx + 1;
        const rrfScore = 1 / rrfRank;
        const explainData = explainFor(cand.file, rrfRank, rrfScore, 1.0, 0, rrfScore);
        return {
          file: cand.file,
          displayPath: cand.displayPath,
          title: cand.title,
          body: cand.body,
          ...bestChunkFor(cand.file, cand.body),
          score: rrfScore,
          context: store.getContextForFile(cand.file),
          docid: docidMap.get(cand.file) || "",
          ...(explainData ? { explain: explainData } : {}),
        };
      });
      return finalize(rrfOnly);
    }

    // Step 6: Rerank the best chunk of each candidate (NOT full bodies).
    // Candidates that produced no chunks are not sent to the reranker.
    const chunksToRerank: { file: string; text: string }[] = [];
    for (const cand of candidates) {
      const info = docChunkMap.get(cand.file);
      if (info) {
        chunksToRerank.push({ file: cand.file, text: info.chunks[info.bestIdx]!.text });
      }
    }
    hooks?.onRerankStart?.(chunksToRerank.length);
    const rerankStart = Date.now();
    const reranked = await store.rerank(query, chunksToRerank, undefined, intent);
    hooks?.onRerankDone?.(Date.now() - rerankStart);

    // Step 7: Blend the RRF position score with the reranker score.
    // Position-aware weights: top retrieval results get more protection from
    // reranker disagreement.
    const candidateByFile = new Map<string, (typeof candidates)[number]>();
    const rrfRankByFile = new Map<string, number>();
    candidates.forEach((cand, idx) => {
      candidateByFile.set(cand.file, cand);
      rrfRankByFile.set(cand.file, idx + 1);
    });

    const blended = reranked.map((r): HybridQueryResult => {
      const rrfRank = rrfRankByFile.get(r.file) || candidateLimit;
      const rrfWeight = rrfRank <= 3 ? 0.75 : rrfRank <= 10 ? 0.60 : 0.40;
      const rrfScore = 1 / rrfRank;
      const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * r.score;
      const candidate = candidateByFile.get(r.file);
      const explainData = explainFor(r.file, rrfRank, rrfScore, rrfWeight, r.score, blendedScore);
      return {
        file: r.file,
        displayPath: candidate?.displayPath || "",
        title: candidate?.title || "",
        body: candidate?.body || "",
        ...bestChunkFor(r.file, candidate?.body),
        score: blendedScore,
        context: store.getContextForFile(r.file),
        docid: docidMap.get(r.file) || "",
        ...(explainData ? { explain: explainData } : {}),
      };
    });
    blended.sort((a, b) => b.score - a.score);

    // Step 8: Dedup by file (safety net against duplicate output), filter, slice.
    return finalize(blended);
  }
  /** Tuning knobs accepted by `vectorSearchQuery()`. Every field is optional. */
  export interface VectorSearchOptions {
    /** Restrict the search to a single collection. */
    collection?: string;

    /** Maximum number of results to return (default 10). */
    limit?: number;

    /** Drop results scoring below this value (default 0.3). */
    minScore?: number;

    /** Domain intent hint used for disambiguation. */
    intent?: string;

    /** Only the expansion-complete callback is honored here. */
    hooks?: Pick<SearchHooks, 'onExpand'>;

    /**
     * Optional embedding provider for query-side encoding (i-loazq6ze).
     * When supplied, query vectors are encoded via the provider (HTTP /
     * GPU worker / fallback chain) instead of the local llama-cpp model.
     */
    embedProvider?: EmbeddingProvider;
  }
  /** One document returned by `vectorSearchQuery()`. */
  export interface VectorSearchResult {
    /** Filepath of the matched document. */
    file: string;

    /** Path suitable for display to the user. */
    displayPath: string;

    /** Document title. */
    title: string;

    /** Document body; empty string when the backend returned none. */
    body: string;

    /** Best vector-search score seen for this file across all query variants. */
    score: number;

    /** Context looked up for the file, or null. */
    context: string | null;

    /** Document id reported by the vector backend. */
    docid: string;
  }
  /**
   * Semantic (vector-only) search with query expansion.
   *
   * Steps:
   * 1. Return [] immediately when the `vectors_vec` table does not exist.
   * 2. expandQuery() → keep every non-lex variant (lex targets FTS, not vectors).
   * 3. searchVec() for the original query and each kept variant, one at a time
   *    (sequential — concurrent embed() hangs).
   * 4. Merge by filepath, keeping the highest score per file.
   * 5. Sort by score descending, filter by minScore, slice to limit.
   *
   * @param store   Store supplying expansion, vector search and context lookup.
   * @param query   The caller's query text.
   * @param options Optional tuning; see VectorSearchOptions for defaults.
   * @returns Up to `limit` results, highest score first.
   */
  export async function vectorSearchQuery(
    store: Store,
    query: string,
    options?: VectorSearchOptions
  ): Promise<VectorSearchResult[]> {
    const limit = options?.limit ?? 10;
    const minScore = options?.minScore ?? 0.3;
    const collection = options?.collection;
    const intent = options?.intent;
    const embedProvider = options?.embedProvider;

    // Without a vector table there is nothing to search.
    const vectorTable = store.db.prepare(
      `SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
    ).get();
    if (!vectorTable) return [];

    // Expand, then discard lex variants.
    const expandStart = Date.now();
    const everyVariant = await store.expandQuery(query, undefined, intent);
    const semanticVariants = everyVariant.filter(variant => variant.type !== 'lex');
    options?.hooks?.onExpand?.(query, semanticVariants, Date.now() - expandStart);

    // Original query first, then each semantic variant. When `embedProvider`
    // is supplied (i-loazq6ze) it is handed to searchVec() as the trailing
    // argument so query encoding is routed through it.
    const bestByFile = new Map<string, VectorSearchResult>();
    for (const text of [query, ...semanticVariants.map(variant => variant.query)]) {
      const hits = await store.searchVec(
        text, DEFAULT_EMBED_MODEL, limit, collection,
        undefined, undefined, embedProvider,
      );
      for (const hit of hits) {
        const prior = bestByFile.get(hit.filepath);
        // Keep the existing entry unless this hit scores strictly higher.
        if (prior && !(hit.score > prior.score)) continue;
        bestByFile.set(hit.filepath, {
          file: hit.filepath,
          displayPath: hit.displayPath,
          title: hit.title,
          body: hit.body || "",
          score: hit.score,
          context: store.getContextForFile(hit.filepath),
          docid: hit.docid,
        });
      }
    }

    const ranked = Array.from(bestByFile.values());
    ranked.sort((a, b) => b.score - a.score);
    const aboveThreshold = ranked.filter(row => row.score >= minScore);
    return aboveThreshold.slice(0, limit);
  }
  4167. // =============================================================================
  4168. // Structured search — pre-expanded queries from LLM
  4169. // =============================================================================
  /**
   * Tuning knobs accepted by `structuredSearch()`. Every field is optional.
   *
   * The sub-searches themselves are passed separately as `ExpandedQuery[]`;
   * this interface only describes how they are executed and ranked.
   */
  export interface StructuredSearchOptions {
    /** Filter to specific collections (OR match). */
    collections?: string[];

    /** Maximum number of results to return (default 10). */
    limit?: number;

    /** Drop results whose final score is below this value (default 0). */
    minScore?: number;

    /** How many fused candidates are kept for chunking/reranking (default RERANK_CANDIDATE_LIMIT). */
    candidateLimit?: number;

    /** Attach backend/RRF/rerank score traces to each result. */
    explain?: boolean;

    /** Domain intent hint for disambiguation — steers reranking and chunk selection. */
    intent?: string;

    /** Skip LLM reranking, use only RRF scores. */
    skipRerank?: boolean;

    /** Chunking strategy forwarded to the document chunker. */
    chunkStrategy?: ChunkStrategy;

    /** Progress callbacks. */
    hooks?: SearchHooks;

    /**
     * Optional embedding provider for query-side encoding (i-loazq6ze).
     * When supplied, vec/hyde sub-queries are batch-encoded via the provider
     * (HTTP / GPU worker / fallback chain) instead of `getLlm(store).embedBatch`.
     */
    embedProvider?: EmbeddingProvider;
  }
  /**
   * Structured search: execute pre-expanded queries without LLM query expansion.
   *
   * Designed for LLM callers (MCP/HTTP) that generate their own query expansions.
   * Skips the internal expandQuery() step — goes directly to:
   *
   * Pipeline:
   * 1. Route searches: lex→FTS, vec/hyde→vector (batch embed)
   * 2. RRF fusion across all result lists (lists from `searches[0]` weighted 2x)
   * 3. Chunk documents + keyword-best-chunk selection
   * 4. Rerank on chunks
   * 5. Position-aware score blending
   * 6. Dedup, filter, slice
   *
   * This is the recommended endpoint for capable LLMs — they can generate
   * better query variations than our small local model, especially for
   * domain-specific or nuanced queries.
   *
   * @param store    Store supplying FTS/vector search, reranking and context lookup.
   * @param searches Pre-expanded sub-searches, ordered by importance (first = most important).
   * @param options  Optional tuning; see StructuredSearchOptions for defaults.
   *                 An empty `collections` array is treated as "all collections".
   * @returns Up to `limit` results, highest score first.
   * @throws Error when a query contains a newline or fails lex/semantic validation.
   */
  export async function structuredSearch(
    store: Store,
    searches: ExpandedQuery[],
    options?: StructuredSearchOptions
  ): Promise<HybridQueryResult[]> {
    const limit = options?.limit ?? 10;
    const minScore = options?.minScore ?? 0;
    const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
    const explain = options?.explain ?? false;
    const intent = options?.intent;
    const skipRerank = options?.skipRerank ?? false;
    const hooks = options?.hooks;
    const embedProvider = options?.embedProvider;
    const collections = options?.collections;
    const chunkStrategy = options?.chunkStrategy;

    if (searches.length === 0) return [];

    // Validate every query before running anything.
    for (const search of searches) {
      const location = search.line ? `Line ${search.line}` : 'Structured search';
      if (/[\r\n]/.test(search.query)) {
        throw new Error(`${location} (${search.type}): queries must be single-line. Remove newline characters.`);
      }
      if (search.type === 'lex') {
        const error = validateLexQuery(search.query);
        if (error) throw new Error(`${location} (lex): ${error}`);
      } else if (search.type === 'vec' || search.type === 'hyde') {
        const error = validateSemanticQuery(search.query);
        if (error) throw new Error(`${location} (${search.type}): ${error}`);
      }
    }

    // rankedLists, rankedListMeta and listOrigins are parallel arrays;
    // listOrigins[i] is the index in `searches` of the query that produced list i.
    const rankedLists: RankedResult[][] = [];
    const rankedListMeta: RankedListMeta[] = [];
    const listOrigins: number[] = [];
    const docidMap = new Map<string, string>(); // filepath -> docid

    // Records one backend result list (skipping empty ones) with its provenance.
    const addRankedList = (
      hits: {
        filepath: string;
        docid: string;
        displayPath: string;
        title: string;
        body?: string | null;
        score: number;
      }[],
      meta: RankedListMeta,
      origin: number
    ): void => {
      if (hits.length === 0) return;
      const list: RankedResult[] = [];
      for (const hit of hits) {
        docidMap.set(hit.filepath, hit.docid);
        list.push({
          file: hit.filepath,
          displayPath: hit.displayPath,
          title: hit.title,
          body: hit.body || "",
          score: hit.score,
        });
      }
      rankedLists.push(list);
      rankedListMeta.push(meta);
      listOrigins.push(origin);
    };

    const hasVectors = !!store.db.prepare(
      `SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
    ).get();

    // One pass per collection filter; a single `undefined` entry means "all
    // collections". An empty `collections` array is also treated as "all" —
    // otherwise no search would run and the caller would silently get [].
    const collectionList: (string | undefined)[] =
      collections && collections.length > 0 ? collections : [undefined];

    // Step 1: FTS for every lex search (synchronous).
    searches.forEach((search, origin) => {
      if (search.type !== 'lex') return;
      for (const coll of collectionList) {
        addRankedList(
          store.searchFTS(search.query, 20, coll),
          { source: "fts", queryType: "lex", query: search.query },
          origin
        );
      }
    });

    // Step 2: Batch-embed, then vector-search, every vec/hyde search.
    if (hasVectors) {
      const vecSearches: { query: string; type: 'vec' | 'hyde'; origin: number }[] = [];
      searches.forEach((search, origin) => {
        if (search.type === 'vec' || search.type === 'hyde') {
          vecSearches.push({ query: search.query, type: search.type, origin });
        }
      });

      if (vecSearches.length > 0) {
        // Route batch encoding through the supplied EmbeddingProvider when
        // present (i-loazq6ze); otherwise use the local llama-cpp singleton,
        // preserving behavior for callers that configure no provider.
        const embedModelName = embedProvider
          ? embedProvider.getModelId()
          : getLlm(store).embedModelName;
        const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, embedModelName));
        hooks?.onEmbedStart?.(textsToEmbed.length);
        const embedStart = Date.now();
        const embeddings = embedProvider
          ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName })
          : await getLlm(store).embedBatch(textsToEmbed);
        hooks?.onEmbedDone?.(Date.now() - embedStart);

        // A search whose embedding is missing is silently skipped.
        for (let i = 0; i < vecSearches.length; i++) {
          const vecSearch = vecSearches[i]!;
          const embedding = embeddings[i]?.embedding;
          if (!embedding) continue;
          for (const coll of collectionList) {
            const vecHits = await store.searchVec(
              vecSearch.query, DEFAULT_EMBED_MODEL, 20, coll,
              undefined, embedding
            );
            addRankedList(
              vecHits,
              { source: "vec", queryType: vecSearch.type, query: vecSearch.query },
              vecSearch.origin
            );
          }
        }
      }
    }

    if (rankedLists.length === 0) return [];

    // Step 3: RRF fusion — every list produced by the caller's FIRST search
    // gets 2x weight (the caller is assumed to order searches by importance).
    // Weighting by origin rather than by list position matters because lex
    // lists are always pushed before vec/hyde lists, and because one search
    // yields one list per collection.
    const weights = listOrigins.map(origin => (origin === 0 ? 2.0 : 1.0));
    const fused = reciprocalRankFusion(rankedLists, weights);
    const rrfTraceByFile = explain ? buildRrfTrace(rankedLists, weights, rankedListMeta) : null;
    const candidates = fused.slice(0, candidateLimit);
    if (candidates.length === 0) return [];

    hooks?.onExpand?.("", [], 0); // Signal no expansion (pre-expanded)

    // Step 4: Chunk each candidate and pick its best chunk for reranking.
    // The first lex query serves as the keyword-matching "query"; failing that
    // the first vec query, failing that the first search of any type.
    const primaryQuery = searches.find(s => s.type === 'lex')?.query
      || searches.find(s => s.type === 'vec')?.query
      || searches[0]?.query || "";
    const queryTerms = primaryQuery.toLowerCase().split(/\s+/).filter(t => t.length > 2);
    const intentTerms = intent ? extractIntentTerms(intent) : [];

    // Keyword overlap: 1 per matching query term, INTENT_WEIGHT_CHUNK per matching intent term.
    const keywordOverlap = (text: string): number => {
      const haystack = text.toLowerCase();
      let overlap = 0;
      for (const term of queryTerms) {
        if (haystack.includes(term)) overlap += 1;
      }
      for (const term of intentTerms) {
        if (haystack.includes(term)) overlap += INTENT_WEIGHT_CHUNK;
      }
      return overlap;
    };

    const docChunkMap = new Map<string, { chunks: { text: string; pos: number }[]; bestIdx: number }>();
    for (const cand of candidates) {
      const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, chunkStrategy);
      if (chunks.length === 0) continue;
      // Strict ">" keeps the earliest chunk on ties.
      let bestIdx = 0;
      let bestScore = -1;
      for (let i = 0; i < chunks.length; i++) {
        const overlap = keywordOverlap(chunks[i]!.text);
        if (overlap > bestScore) { bestScore = overlap; bestIdx = i; }
      }
      docChunkMap.set(cand.file, { chunks, bestIdx });
    }

    // Best chunk text/offset for a file, falling back to the body when no chunk exists.
    const bestChunkFor = (file: string, fallbackBody: string | undefined) => {
      const info = docChunkMap.get(file);
      const picked = info?.chunks[info.bestIdx];
      return {
        bestChunk: picked?.text || fallbackBody || "",
        bestChunkPos: picked?.pos || 0,
      };
    };

    // Score trace for one result; undefined unless `explain` was requested.
    const explainFor = (
      file: string,
      rank: number,
      positionScore: number,
      weight: number,
      rerankScore: number,
      blendedScore: number
    ): HybridQueryExplain | undefined => {
      if (!explain) return undefined;
      const trace = rrfTraceByFile?.get(file);
      return {
        ftsScores: trace?.contributions.filter(c => c.source === "fts").map(c => c.backendScore) ?? [],
        vectorScores: trace?.contributions.filter(c => c.source === "vec").map(c => c.backendScore) ?? [],
        rrf: {
          rank,
          positionScore,
          weight,
          baseScore: trace?.baseScore ?? 0,
          topRankBonus: trace?.topRankBonus ?? 0,
          totalScore: trace?.totalScore ?? 0,
          contributions: trace?.contributions ?? [],
        },
        rerankScore,
        blendedScore,
      };
    };

    // Final stage shared by both exits: dedup by file, apply minScore, cap at limit.
    const finalize = (rows: HybridQueryResult[]): HybridQueryResult[] => {
      const seenFiles = new Set<string>();
      const unique = rows.filter(row => {
        if (seenFiles.has(row.file)) return false;
        seenFiles.add(row.file);
        return true;
      });
      return unique.filter(row => row.score >= minScore).slice(0, limit);
    };

    if (skipRerank) {
      // No LLM reranking: score each candidate purely by its RRF position (1 / rank).
      const rrfOnly = candidates.map((cand, idx): HybridQueryResult => {
        const rrfRank = idx + 1;
        const rrfScore = 1 / rrfRank;
        const explainData = explainFor(cand.file, rrfRank, rrfScore, 1.0, 0, rrfScore);
        return {
          file: cand.file,
          displayPath: cand.displayPath,
          title: cand.title,
          body: cand.body,
          ...bestChunkFor(cand.file, cand.body),
          score: rrfScore,
          context: store.getContextForFile(cand.file),
          docid: docidMap.get(cand.file) || "",
          ...(explainData ? { explain: explainData } : {}),
        };
      });
      return finalize(rrfOnly);
    }

    // Step 5: Rerank the best chunk of each candidate.
    // Candidates that produced no chunks are not sent to the reranker.
    const chunksToRerank: { file: string; text: string }[] = [];
    for (const cand of candidates) {
      const info = docChunkMap.get(cand.file);
      if (info) {
        chunksToRerank.push({ file: cand.file, text: info.chunks[info.bestIdx]!.text });
      }
    }
    hooks?.onRerankStart?.(chunksToRerank.length);
    const rerankStart = Date.now();
    const reranked = await store.rerank(primaryQuery, chunksToRerank, undefined, intent);
    hooks?.onRerankDone?.(Date.now() - rerankStart);

    // Step 6: Blend the RRF position score with the reranker score; higher
    // RRF ranks lean more heavily on the retrieval position.
    const candidateByFile = new Map<string, (typeof candidates)[number]>();
    const rrfRankByFile = new Map<string, number>();
    candidates.forEach((cand, idx) => {
      candidateByFile.set(cand.file, cand);
      rrfRankByFile.set(cand.file, idx + 1);
    });

    const blended = reranked.map((r): HybridQueryResult => {
      const rrfRank = rrfRankByFile.get(r.file) || candidateLimit;
      const rrfWeight = rrfRank <= 3 ? 0.75 : rrfRank <= 10 ? 0.60 : 0.40;
      const rrfScore = 1 / rrfRank;
      const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * r.score;
      const candidate = candidateByFile.get(r.file);
      const explainData = explainFor(r.file, rrfRank, rrfScore, rrfWeight, r.score, blendedScore);
      return {
        file: r.file,
        displayPath: candidate?.displayPath || "",
        title: candidate?.title || "",
        body: candidate?.body || "",
        ...bestChunkFor(r.file, candidate?.body),
        score: blendedScore,
        context: store.getContextForFile(r.file),
        docid: docidMap.get(r.file) || "",
        ...(explainData ? { explain: explainData } : {}),
      };
    });
    blended.sort((a, b) => b.score - a.score);

    // Step 7: Dedup by file, filter, slice.
    return finalize(blended);
  }