/**
 * llm.ts - LLM abstraction layer for QMD
 *
 * Provides a clean interface for LLM operations with an Ollama implementation.
 * All raw fetch calls to LLM APIs should go through this module.
 */

// =============================================================================
// Types
// =============================================================================

/**
 * A single generated token paired with its log probability.
 */
export type TokenLogProb = {
  /** Token text as reported by the backend. */
  token: string;
  /** Log probability of the token; the reranker converts it to a probability with `Math.exp`. */
  logprob: number;
};

/**
 * Result of an embedding request.
 */
export type EmbeddingResult = {
  /** The embedding vector returned for the input text. */
  embedding: number[];
  /** Name of the model the request was sent to. */
  model: string;
};

/**
 * Result of a text generation request, with optional per-token logprobs.
 */
export type GenerateResult = {
  /** Generated text; the Ollama implementation uses "" when the backend returns none. */
  text: string;
  /** Name of the model the request was sent to. */
  model: string;
  /** Per-token log probabilities, present only when the backend returned them. */
  logprobs?: TokenLogProb[];
  /** Completion flag from the backend; the Ollama implementation defaults it to true when absent. */
  done: boolean;
};

/**
 * Relevance judgment for a single document produced by the reranker.
 */
export type RerankDocumentResult = {
  /** The `file` value of the document that was judged. */
  file: string;
  /** True only when the model's answer starts with "yes". */
  relevant: boolean;
  /** `Math.exp(logprob)` of the first generated token (0 when generation failed). */
  confidence: number;
  /**
   * Ranking score in [0, 1]: `0.5 + 0.5 * confidence` for "yes",
   * `0.5 * (1 - confidence)` for "no", 0.3 for any other answer,
   * and 0 when generation failed.
   */
  score: number;
  /** First token reported in the logprobs, or the lower-cased trimmed answer text when none were returned. */
  rawToken: string;
  /** Log probability of the first generated token; 0 when unavailable. */
  logprob: number;
};

/**
 * Result of reranking a batch of documents.
 */
export type RerankResult = {
  /** Per-document judgments; the Ollama implementation orders them by descending score. */
  results: RerankDocumentResult[];
  /** Name of the rerank model that was used. */
  model: string;
};

/**
 * Availability information for a model.
 */
export type ModelInfo = {
  /** The model name that was looked up. */
  name: string;
  /** False when the lookup request failed or the backend reported an error. */
  exists: boolean;
  /** Taken from the backend's `size` field when present. */
  size?: number;
  /** Taken from the backend's `modified_at` field when present. */
  modifiedAt?: string;
};

/**
 * Options for an embedding request.
 */
export type EmbedOptions = {
  /** Model to embed with; the Ollama implementation falls back to its default embed model when this is empty. */
  model: string;
  /** When true the text is formatted as a search query, otherwise as a document. */
  isQuery?: boolean;
  /** Document title included in the document format; ignored for queries. */
  title?: string;
};

/**
 * Options for a text generation request.
 */
export type GenerateOptions = {
  /** Model to generate with; the Ollama implementation falls back to its default generate model when this is empty. */
  model: string;
  /** Maximum number of tokens to generate (the Ollama implementation defaults to 150). */
  maxTokens?: number;
  /** Sampling temperature (the Ollama implementation defaults to 0). */
  temperature?: number;
  /** Ask the backend to return per-token log probabilities. */
  logprobs?: boolean;
  /** Forwarded as Ollama's `raw` flag; the reranker sets it when sending a hand-built chat-template prompt. */
  raw?: boolean;
  /** Stop sequences forwarded to the backend. */
  stop?: string[];
};

/**
 * Options for reranking.
 */
export type RerankOptions = {
  /** Model to rerank with; the Ollama implementation falls back to its default rerank model when this is empty. */
  model: string;
  /** Number of documents judged concurrently per batch (the Ollama implementation defaults to 5). */
  batchSize?: number;
};

/**
 * A document submitted for reranking.
 */
export type RerankDocument = {
  /** Identifier (path) of the document; echoed back in the result. */
  file: string;
  /** Document body; the Ollama reranker only sends the first 4000 characters. */
  text: string;
  /** Optional title; the Ollama reranker otherwise derives one from the file name without its `.md` suffix. */
  title?: string;
};

// =============================================================================
// LLM Interface
// =============================================================================

/**
 * Abstract LLM interface - implement this for different backends.
 *
 * The low-level methods (`embed`, `generate`, `modelExists`, `pullModel`)
 * map onto single backend calls; the high-level methods build on them.
 */
export interface LLM {
  /**
   * Get embeddings for text.
   *
   * @param text - Text to embed.
   * @param options - Model selection and query/document formatting hints.
   * @returns The embedding, or `null` when none could be produced.
   */
  embed(text: string, options: EmbedOptions): Promise<EmbeddingResult | null>;

  /**
   * Generate a text completion.
   *
   * @param prompt - Prompt text sent to the model.
   * @param options - Model selection and sampling options.
   * @returns The generation result, or `null` when the request failed.
   */
  generate(prompt: string, options: GenerateOptions): Promise<GenerateResult | null>;

  /**
   * Check if a model exists.
   *
   * @param model - Name of the model to look up.
   * @returns Model info whose `exists` flag tells whether the model is available.
   */
  modelExists(model: string): Promise<ModelInfo>;

  /**
   * Pull a model (download if not available).
   *
   * @param model - Name of the model to pull.
   * @param onProgress - Optional callback receiving a progress number
   *   (the Ollama implementation only reports 100 on completion).
   * @returns `true` when the pull succeeded.
   */
  pullModel(model: string, onProgress?: (progress: number) => void): Promise<boolean>;

  // ==========================================================================
  // High-level abstractions
  // ==========================================================================

  /**
   * Expand a search query into multiple variations.
   *
   * @param query - The original search query.
   * @param model - Model used to generate the variations.
   * @param numVariations - How many alternatives to request.
   * @returns A list of queries; the Ollama implementation always puts the
   *   original query first.
   */
  expandQuery(query: string, model: string, numVariations?: number): Promise<string[]>;

  /**
   * Rerank documents by relevance to a query.
   * Returns list of documents with relevance scores and boolean judgments.
   *
   * @param query - The search query.
   * @param documents - Candidate documents to judge.
   * @param options - Rerank model and batching options.
   */
  rerank(query: string, documents: RerankDocument[], options: RerankOptions): Promise<RerankResult>;

  /**
   * Quick relevance check - returns just the per-document judgments with
   * logprobs, without wrapping them in a {@link RerankResult}.
   *
   * @param query - The search query.
   * @param documents - Candidate documents to judge.
   * @param options - Rerank model and batching options.
   * @returns One result per input document (the Ollama implementation keeps input order).
   */
  rerankerLogprobsCheck(query: string, documents: RerankDocument[], options: RerankOptions): Promise<RerankDocumentResult[]>;
}

// =============================================================================
// Ollama Implementation
// =============================================================================

/**
 * Construction options for the Ollama client. Every field is optional;
 * omitted or empty values fall back to the defaults declared below.
 */
export type OllamaConfig = {
  /** Server base URL; falls back to the `OLLAMA_URL` environment variable, then to the default URL. */
  baseUrl?: string;
  /** Model used by `embed` when the call does not name one. */
  defaultEmbedModel?: string;
  /** Model used by `generate` and `expandQuery` when the call does not name one. */
  defaultGenerateModel?: string;
  /** Model used by the rerank methods when the call does not name one. */
  defaultRerankModel?: string;
};

// Fallback values used when neither the config nor the environment provides one.
const DEFAULT_OLLAMA_URL = "http://localhost:11434";
const DEFAULT_EMBED_MODEL = "embeddinggemma";
const DEFAULT_GENERATE_MODEL = "qwen3:0.6b";
const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";

/**
 * Wrap a search query in the prompt template used when embedding queries.
 *
 * @param query - Raw query text; inserted verbatim after the prefix.
 * @returns The query prefixed with the search-task instruction.
 */
export function formatQueryForEmbedding(query: string): string {
  const taskPrefix = "task: search result | query: ";
  return taskPrefix.concat(query);
}

/**
 * Wrap document text in the prompt template used when embedding documents.
 *
 * @param text - Document body; inserted verbatim.
 * @param title - Optional document title; a missing or empty title is
 *   rendered as the literal word "none".
 * @returns The formatted `title: … | text: …` string.
 */
export function formatDocForEmbedding(text: string, title?: string): string {
  const heading = title ? title : "none";
  return "title: ".concat(heading, " | text: ", text);
}

/**
 * Ollama-backed implementation of the LLM interface.
 *
 * Talks to an Ollama server over HTTP. The core methods do not throw on
 * network or HTTP errors; they report failure through their return value
 * (`null`, `false`, or `exists: false`).
 */
export class Ollama implements LLM {
  private baseUrl: string;
  private defaultEmbedModel: string;
  private defaultGenerateModel: string;
  private defaultRerankModel: string;

  /**
   * @param config - Optional overrides. The base URL falls back to the
   *   `OLLAMA_URL` environment variable and then to the built-in default;
   *   model names fall back to the built-in defaults.
   */
  constructor(config: OllamaConfig = {}) {
    const configuredUrl = config.baseUrl || process.env.OLLAMA_URL || DEFAULT_OLLAMA_URL;
    // Every endpoint path below starts with "/", so a trailing slash on the
    // base URL would otherwise produce "//api/..." request paths.
    this.baseUrl = configuredUrl.replace(/\/+$/, "");
    this.defaultEmbedModel = config.defaultEmbedModel || DEFAULT_EMBED_MODEL;
    this.defaultGenerateModel = config.defaultGenerateModel || DEFAULT_GENERATE_MODEL;
    this.defaultRerankModel = config.defaultRerankModel || DEFAULT_RERANK_MODEL;
  }

  /**
   * Get the base URL for this Ollama instance (without trailing slashes).
   */
  getBaseUrl(): string {
    return this.baseUrl;
  }

  // ==========================================================================
  // Core API methods
  // ==========================================================================

  /**
   * Embed text via `POST /api/embed`.
   *
   * The text is wrapped in the query or document template first, depending
   * on `options.isQuery`.
   *
   * @returns The first embedding in the response, or `null` when the request
   *   fails, the server answers with a non-2xx status, or no embedding is returned.
   */
  async embed(text: string, options: EmbedOptions): Promise<EmbeddingResult | null> {
    const model = options.model || this.defaultEmbedModel;
    const formatted = options.isQuery
      ? formatQueryForEmbedding(text)
      : formatDocForEmbedding(text, options.title);

    try {
      const response = await fetch(`${this.baseUrl}/api/embed`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ model, input: formatted }),
      });

      if (!response.ok) {
        return null;
      }

      const data = await response.json() as { embeddings?: number[][] };
      const embedding = data.embeddings?.[0];
      if (!embedding) {
        return null;
      }

      return { embedding, model };
    } catch {
      return null;
    }
  }

  /**
   * Generate a completion via `POST /api/generate` (non-streaming).
   *
   * Defaults: 150 tokens, temperature 0. `logprobs`, `raw` and `stop` are
   * only sent when set.
   *
   * @returns The generation result, or `null` when the request fails or the
   *   server answers with a non-2xx status.
   */
  async generate(prompt: string, options: GenerateOptions): Promise<GenerateResult | null> {
    const model = options.model || this.defaultGenerateModel;

    const modelOptions: Record<string, unknown> = {
      num_predict: options.maxTokens ?? 150,
      temperature: options.temperature ?? 0,
    };
    if (options.stop) {
      modelOptions.stop = options.stop;
    }

    const requestBody: Record<string, unknown> = {
      model,
      prompt,
      stream: false,
      options: modelOptions,
    };
    if (options.logprobs) {
      requestBody.logprobs = true;
    }
    if (options.raw) {
      requestBody.raw = true;
    }

    try {
      const response = await fetch(`${this.baseUrl}/api/generate`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(requestBody),
      });

      if (!response.ok) {
        return null;
      }

      const data = await response.json() as {
        response?: string;
        done?: boolean;
        logprobs?: unknown;
      };

      return {
        text: data.response || "",
        model,
        logprobs: Ollama.parseLogprobs(data.logprobs),
        done: data.done ?? true,
      };
    } catch {
      return null;
    }
  }

  /**
   * Normalize the `logprobs` field of a generate response.
   *
   * Two shapes are accepted: parallel arrays
   * (`{ tokens: [...], token_logprobs: [...] }`) and an array of
   * `{ token, logprob }` entries. Parsing stops at the first malformed entry
   * so that index `i` of the result always corresponds to the i-th generated
   * token.
   *
   * @returns The parsed entries, or `undefined` when the field is absent or
   *   has an unrecognized shape.
   */
  private static parseLogprobs(raw: unknown): TokenLogProb[] | undefined {
    if (!raw || typeof raw !== "object") {
      return undefined;
    }

    const parsed: TokenLogProb[] = [];

    if (Array.isArray(raw)) {
      for (const entry of raw as Array<{ token?: unknown; logprob?: unknown } | null>) {
        if (!entry || typeof entry.token !== "string" || typeof entry.logprob !== "number") {
          break;
        }
        parsed.push({ token: entry.token, logprob: entry.logprob });
      }
      return parsed;
    }

    const { tokens, token_logprobs: values } = raw as {
      tokens?: unknown;
      token_logprobs?: unknown;
    };
    if (!Array.isArray(tokens) || !Array.isArray(values)) {
      return undefined;
    }

    const pairCount = Math.min(tokens.length, values.length);
    for (let i = 0; i < pairCount; i++) {
      const token: unknown = tokens[i];
      const logprob: unknown = values[i];
      if (typeof token !== "string" || typeof logprob !== "number") {
        break;
      }
      parsed.push({ token, logprob });
    }
    return parsed;
  }

  /**
   * Look a model up via `POST /api/show`.
   *
   * @returns Model info; `exists` is false when the request fails or the
   *   server answers with a non-2xx status.
   */
  async modelExists(model: string): Promise<ModelInfo> {
    try {
      const response = await fetch(`${this.baseUrl}/api/show`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ name: model }),
      });

      if (!response.ok) {
        return { name: model, exists: false };
      }

      const data = await response.json() as {
        size?: number;
        modified_at?: string;
      };

      return {
        name: model,
        exists: true,
        size: data.size,
        modifiedAt: data.modified_at,
      };
    } catch {
      return { name: model, exists: false };
    }
  }

  /**
   * Pull a model via `POST /api/pull` (non-streaming).
   *
   * @param onProgress - Called once with 100 after the pull completes; no
   *   intermediate progress is reported because the request is not streamed.
   * @returns `true` on success, `false` when the request fails or the server
   *   answers with a non-2xx status.
   */
  async pullModel(model: string, onProgress?: (progress: number) => void): Promise<boolean> {
    try {
      const response = await fetch(`${this.baseUrl}/api/pull`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ name: model, stream: false }),
      });

      if (!response.ok) {
        return false;
      }

      // For non-streaming, we just wait for completion
      await response.json();
      onProgress?.(100);
      return true;
    } catch {
      return false;
    }
  }

  // ==========================================================================
  // High-level abstractions
  // ==========================================================================

  /**
   * Ask the model for alternative phrasings of a search query.
   *
   * @param query - The original query; always returned as the first element.
   * @param model - Model to use; defaults to the instance's generate model.
   * @param numVariations - Maximum number of alternatives to return.
   * @returns `[query, ...variations]`. Only `[query]` when generation fails
   *   or yields nothing usable. Variations that repeat the query or each
   *   other (ignoring case) are dropped.
   */
  async expandQuery(query: string, model?: string, numVariations: number = 2): Promise<string[]> {
    const useModel = model || this.defaultGenerateModel;

    const prompt = `You are a search query expander. Given a search query, generate ${numVariations} alternative queries that would help find relevant documents.

Rules:
- Use synonyms and related terminology (e.g., "craft" → "craftsmanship", "quality", "excellence")
- Rephrase to capture different angles (e.g., "engineering culture" → "technical excellence", "developer practices")
- Keep proper nouns and named concepts exactly as written (e.g., "Build a Business", "Stripe", "Shopify")
- Each variation should be 3-8 words, natural search terms
- Do NOT just append words like "search" or "find" or "documents"

Query: "${query}"

Output exactly ${numVariations} variations, one per line, no numbering or bullets:`;

    const result = await this.generate(prompt, {
      model: useModel,
      maxTokens: 150,
      temperature: 0,
    });

    if (!result) {
      return [query];
    }

    const cleanText = result.text
      // Drop complete reasoning blocks.
      .replace(/<think>[\s\S]*?<\/think>/g, "")
      // An unterminated <think> means the token budget ran out mid-reasoning;
      // everything after it is reasoning, not an answer.
      .replace(/<think>[\s\S]*$/, "")
      .trim();

    const candidates = cleanText
      .split("\n")
      .map((l) => l.trim())
      .filter((l) => l.length > 2 && l.length < 100 && !l.startsWith("<"));

    // Skip variations that merely repeat the query or an earlier variation.
    const seen = new Set<string>([query.trim().toLowerCase()]);
    const variations: string[] = [];
    for (const candidate of candidates) {
      if (variations.length >= numVariations) {
        break;
      }
      const key = candidate.toLowerCase();
      if (seen.has(key)) {
        continue;
      }
      seen.add(key);
      variations.push(candidate);
    }

    return [query, ...variations];
  }

  /**
   * Judge every document against the query and return the judgments ordered
   * by descending score.
   */
  async rerank(
    query: string,
    documents: RerankDocument[],
    options: RerankOptions
  ): Promise<RerankResult> {
    const results = await this.rerankerLogprobsCheck(query, documents, options);

    return {
      // Sort a copy so the array returned by rerankerLogprobsCheck is left untouched.
      results: [...results].sort((a, b) => b.score - a.score),
      model: options.model || this.defaultRerankModel,
    };
  }

  /**
   * Judge every document against the query, in input order.
   *
   * Documents are processed in batches; the documents of one batch are
   * judged concurrently and batches run one after another.
   *
   * @param options - `batchSize` defaults to 5. Values below 1 (including
   *   negatives, which previously caused an endless loop) fall back to 5;
   *   fractional values are rounded down.
   */
  async rerankerLogprobsCheck(
    query: string,
    documents: RerankDocument[],
    options: RerankOptions
  ): Promise<RerankDocumentResult[]> {
    const model = options.model || this.defaultRerankModel;
    const requestedBatchSize = options.batchSize || 5;
    // The loop below only terminates when the step is at least 1.
    const batchSize = requestedBatchSize >= 1 ? Math.floor(requestedBatchSize) : 5;

    const results: RerankDocumentResult[] = [];

    for (let i = 0; i < documents.length; i += batchSize) {
      const batch = documents.slice(i, i + batchSize);
      const batchResults = await Promise.all(
        batch.map((doc) => this.rerankSingle(query, doc, model))
      );
      results.push(...batchResults);
    }

    return results;
  }

  /**
   * Rerank a single document - internal helper.
   *
   * Builds a raw chat-template prompt, asks for exactly one token with
   * logprobs, and converts the answer into a judgment. When generation
   * fails the document gets an all-zero, not-relevant result.
   */
  private async rerankSingle(
    query: string,
    doc: RerankDocument,
    model: string
  ): Promise<RerankDocumentResult> {
    const systemPrompt = `Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".`;

    const instruct = `Given a search query, determine if the following document is relevant to the query. Consider both direct matches and related concepts.`;

    // Title fallback: file base name without a ".md" suffix, then the full path.
    const docTitle = doc.title || doc.file.split("/").pop()?.replace(/\.md$/, "") || doc.file;
    // Only the first 4000 characters of the document are sent.
    const docPreview = doc.text.length > 4000 ? doc.text.substring(0, 4000) + "..." : doc.text;

    // Qwen3-reranker prompt format with empty think tags
    const prompt = `<|im_start|>system
${systemPrompt}<|im_end|>
<|im_start|>user
<Instruct>: ${instruct}
<Query>: ${query}
<Document Title>: ${docTitle}
<Document>: ${docPreview}<|im_end|>
<|im_start|>assistant
<think>

</think>

`;

    const result = await this.generate(prompt, {
      model,
      maxTokens: 1,
      temperature: 0,
      logprobs: true,
      raw: true,
    });

    if (!result) {
      return {
        file: doc.file,
        relevant: false,
        confidence: 0,
        score: 0,
        rawToken: "",
        logprob: 0,
      };
    }

    return this.parseRerankResponse(doc.file, result);
  }

  /**
   * Parse rerank response into structured result.
   *
   * The first token's logprob is turned into a confidence with `Math.exp`;
   * without logprobs the logprob is taken as 0, i.e. confidence 1.
   */
  private parseRerankResponse(file: string, result: GenerateResult): RerankDocumentResult {
    const token = result.text.toLowerCase().trim();
    const firstLogprob = result.logprobs?.[0];
    const logprob = firstLogprob?.logprob ?? 0;
    const confidence = Math.exp(logprob);

    let relevant: boolean;
    let score: number;

    if (token.startsWith("yes")) {
      relevant = true;
      // Score: 0.5 base + up to 0.5 from confidence
      score = 0.5 + 0.5 * confidence;
    } else if (token.startsWith("no")) {
      relevant = false;
      // Score: up to 0.5 based on uncertainty (1 - confidence)
      score = 0.5 * (1 - confidence);
    } else {
      // Unknown token - neutral score
      relevant = false;
      score = 0.3;
    }

    return {
      file,
      relevant,
      confidence,
      score,
      rawToken: firstLogprob?.token ?? token,
      logprob,
    };
  }
}

// =============================================================================
// Singleton for default Ollama instance
// =============================================================================

// Lazily created shared instance; null until first requested or after a reset.
let defaultOllama: Ollama | null = null;

/**
 * Return the shared Ollama instance, constructing it with default settings
 * on first use.
 */
export function getDefaultOllama(): Ollama {
  if (defaultOllama) {
    return defaultOllama;
  }
  const created = new Ollama();
  defaultOllama = created;
  return created;
}

/**
 * Replace the shared Ollama instance (useful for testing). Passing `null`
 * clears it, so the next `getDefaultOllama()` call builds a fresh one.
 */
export function setDefaultOllama(ollama: Ollama | null): void {
  defaultOllama = ollama;
}