4 달 전 · 67e2aab18c
--- a/finetune/README.md
+++ b/finetune/README.md
@@ -297,3 +297,42 @@ deterministic, and suitable as an RL signal. See `SCORING.md` for the full rubri
 
				 |-------|--------------|-----------------|
			
 
				 | SFT | 92.0% | 30/30 |
			
 
				 | GRPO | 91.7% | 30/30 |
			
 
				+
			
 
				+## Alternative Base Models
			
 
				+
			
 
				+### LiquidAI LFM2 (Experimental)
			
 
				+
			
 
				+[LFM2](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models) 
			
 
				+is a hybrid architecture from Liquid AI optimized for on-device inference. It uses
			
 
				+a novel combination of convolutions and attention; Liquid AI reports up to 2x faster decode
			
 
				+and prefill speed on CPU compared to Qwen3.
			
 
				+
			
 
				+**Why LFM2 for query expansion:**
			
 
				+- **Faster inference**: Lower latency for real-time search applications
			
 
				+- **Memory efficient**: Smaller memory footprint than equivalent transformers
			
 
				+- **Edge-optimized**: Can run on mobile devices and embedded systems
			
 
				+- **Good at agentic tasks**: Liquid AI recommends LFM2 for agentic tasks, RAG, and data extraction
			
 
				+
			
 
				+**Training with LFM2:**
			
 
				+
			
 
				+```bash
			
 
				+# SFT with LFM2-1.2B base model
			
 
				+uv run train.py sft --config configs/sft_lfm2.yaml
			
 
				+
			
 
				+# Evaluate the trained model
			
 
				+uv run eval.py --model outputs/sft-lfm2
			
 
				+
			
 
				+# Convert to GGUF for deployment
			
 
				+uv run convert_gguf.py --base LiquidAI/LFM2-1.2B \
			
 
				+                       --sft outputs/sft-lfm2 \
			
 
				+                       --output tobil/qmd-query-expansion-lfm2-gguf
			
 
				+```
			
 
				+
			
 
				+**Key differences from Qwen3:**
			
 
				+- Different LoRA target modules: `q_proj, k_proj, v_proj, out_proj, in_proj, w1, w2, w3`
			
 
				+- Recommended generation parameters: `temp=0.3, min_p=0.15, repetition_penalty=1.05`
			
 
				+- Requires `transformers >= 4.55.0` for LFM2 architecture support
			
 
				+
			
 
				+**Pre-trained GGUF models:**
			
 
				+- Base: `hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf` (~731 MB)
			
 
				+- Instruct (LFM2.5 release): `hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf` (~731 MB)
			
--- a/finetune/configs/sft_lfm2.yaml
+++ b/finetune/configs/sft_lfm2.yaml
@@ -0,0 +1,60 @@
 
				+# SFT Training Config for QMD Query Expansion with LiquidAI LFM2
			
 
				+# Target: LFM2-1.2B with LoRA (hybrid architecture: convolutions + attention)
			
 
				+#
			
 
				+# LFM2 is optimized for on-device inference with fast decode/prefill.
			
 
				+# Recommended for: agentic tasks, data extraction, RAG, creative writing.
			
 
				+#
			
 
				+# Usage: uv run train.py sft --config configs/sft_lfm2.yaml
			
 
				+#
			
 
				+# Requirements:
			
 
				+#   - transformers >= 4.55.0 (LFM2 architecture support)
			
 
				+#   - May need: pip install -U transformers
			
 
				+
			
 
				+model:
			
 
				+  base: "LiquidAI/LFM2-1.2B"
			
 
				+  output: "outputs/sft-lfm2"  # Local training output (push to HF manually after eval)
			
 
				+
			
 
				+dataset:
			
 
				+  # Local: run `uv run dataset/prepare_data.py` first, then use "data/train/"
			
 
				+  # HuggingFace: use "tobil/qmd-query-expansion-train" (already prepared)
			
 
				+  name: "data/train/"
			
 
				+  text_field: "text"
			
 
				+  split: "train"
			
 
				+  eval_split: 0.1
			
 
				+
			
 
				+training:
			
 
				+  epochs: 5
			
 
				+  batch_size: 4
			
 
				+  gradient_accumulation_steps: 4
			
 
				+  learning_rate: 2e-4
			
 
				+  max_length: 512
			
 
				+  warmup_ratio: 0.03
			
 
				+  lr_scheduler: "cosine"
			
 
				+
			
 
				+lora:
			
 
				+  rank: 16
			
 
				+  alpha: 32
			
 
				+  dropout: 0.0
			
 
				+  # LFM2 uses a different architecture than standard transformers:
			
 
				+  # - Attention layers: q_proj, k_proj, v_proj, out_proj
			
 
				+  # - Short-convolution blocks: in_proj (their out_proj shares its name with attention's)
			
 
				+  # - FFN/MLP projections: w1, w2, w3 (SwiGLU feed-forward)
			
 
				+  target_modules:
			
 
				+    - "q_proj"
			
 
				+    - "k_proj"
			
 
				+    - "v_proj"
			
 
				+    - "out_proj"
			
 
				+    - "in_proj"
			
 
				+    - "w1"
			
 
				+    - "w2"
			
 
				+    - "w3"
			
 
				+
			
 
				+tracking:
			
 
				+  project: "qmd-query-expansion"
			
 
				+  run_name: "sft-lfm2-1.2B"
			
 
				+
			
 
				+# LFM2-specific generation settings (recommended by LiquidAI)
			
 
				+generation:
			
 
				+  temperature: 0.3
			
 
				+  min_p: 0.15
			
 
				+  repetition_penalty: 1.05
			
--- a/finetune/jobs/sft_lfm2.py
+++ b/finetune/jobs/sft_lfm2.py
@@ -0,0 +1,106 @@
 
				+# /// script
			
 
				+# requires-python = ">=3.10"
			
 
				+# dependencies = [
			
 
				+#     "trl>=0.12.0",
			
 
				+#     "peft>=0.7.0",
			
 
				+#     "transformers>=4.55.0",
			
 
				+#     "accelerate>=0.24.0",
			
 
				+#     "huggingface_hub>=0.20.0",
			
 
				+#     "datasets",
			
 
				+#     "bitsandbytes",
			
 
				+#     "torch",
			
 
				+# ]
			
 
				+# ///
			
 
				+"""
			
 
				+SFT training for QMD query expansion with LiquidAI LFM2-1.2B.
			
 
				+
			
 
				+LFM2 is a hybrid architecture optimized for edge/on-device inference.
			
 
				+Uses different LoRA target modules than standard transformers.
			
 
				+
			
 
				+Self-contained script for HuggingFace Jobs:
			
 
				+    hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 2h jobs/sft_lfm2.py
			
 
				+"""
			
 
				+
			
 
				+import os
			
 
				+from huggingface_hub import login
			
 
				+
			
 
				+# --- Config (mirrors configs/sft_lfm2.yaml, but loads the dataset from the Hub and pushes the model to the Hub) ---
			
 
				+BASE_MODEL = "LiquidAI/LFM2-1.2B"
			
 
				+OUTPUT_MODEL = "tobil/qmd-query-expansion-lfm2-sft"
			
 
				+DATASET = "tobil/qmd-query-expansion-train"
			
 
				+
			
 
				+hf_token = os.environ.get("HF_TOKEN")
			
 
				+if hf_token:
			
 
				+    login(token=hf_token)
			
 
				+
			
 
				+from datasets import load_dataset
			
 
				+from peft import LoraConfig
			
 
				+from transformers import AutoTokenizer
			
 
				+from trl import SFTTrainer, SFTConfig
			
 
				+
			
 
				+# Load and split dataset
			
 
				+print(f"Loading dataset: {DATASET}...")
			
 
				+dataset = load_dataset(DATASET, split="train")
			
 
				+print(f"Dataset loaded: {len(dataset)} examples")
			
 
				+
			
 
				+split = dataset.train_test_split(test_size=0.1, seed=42)
			
 
				+train_dataset = split["train"]
			
 
				+eval_dataset = split["test"]
			
 
				+print(f"  Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")
			
 
				+
			
 
				+# SFT config
			
 
				+config = SFTConfig(
			
 
				+    output_dir="qmd-query-expansion-lfm2-sft",
			
 
				+    push_to_hub=True,
			
 
				+    hub_model_id=OUTPUT_MODEL,
			
 
				+    hub_strategy="every_save",
			
 
				+
			
 
				+    num_train_epochs=5,
			
 
				+    per_device_train_batch_size=4,
			
 
				+    gradient_accumulation_steps=4,
			
 
				+    learning_rate=2e-4,
			
 
				+    max_length=512,
			
 
				+
			
 
				+    logging_steps=10,
			
 
				+    save_strategy="steps",
			
 
				+    save_steps=200,
			
 
				+    save_total_limit=2,
			
 
				+    eval_strategy="steps",
			
 
				+    eval_steps=200,
			
 
				+
			
 
				+    warmup_ratio=0.03,
			
 
				+    lr_scheduler_type="cosine",
			
 
				+    bf16=True,
			
 
				+
			
 
				+    report_to="none",
			
 
				+)
			
 
				+
			
 
				+# LoRA config for LFM2 architecture
			
 
				+# LFM2 uses different layer names than standard transformers:
			
 
				+# - Attention: q_proj, k_proj, v_proj, out_proj
			
 
				+# - Short-convolution blocks: in_proj (their out_proj shares its name with attention's)
			
 
				+# - FFN/MLP projections (SwiGLU feed-forward): w1, w2, w3
			
 
				+peft_config = LoraConfig(
			
 
				+    r=16,
			
 
				+    lora_alpha=32,
			
 
				+    lora_dropout=0.0,
			
 
				+    bias="none",
			
 
				+    task_type="CAUSAL_LM",
			
 
				+    target_modules=["q_proj", "k_proj", "v_proj", "out_proj", "in_proj", "w1", "w2", "w3"],
			
 
				+)
			
 
				+
			
 
				+print("Initializing SFT trainer...")
			
 
				+trainer = SFTTrainer(
			
 
				+    model=BASE_MODEL,
			
 
				+    train_dataset=train_dataset,
			
 
				+    eval_dataset=eval_dataset,
			
 
				+    args=config,
			
 
				+    peft_config=peft_config,
			
 
				+)
			
 
				+
			
 
				+print("Starting SFT training (LFM2-1.2B)...")
			
 
				+trainer.train()
			
 
				+
			
 
				+print("Pushing to Hub...")
			
 
				+trainer.push_to_hub()
			
 
				+print(f"Done! Model: https://huggingface.co/{OUTPUT_MODEL}")
			
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -179,6 +179,12 @@ const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-re
 
				 // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
			
 
				 const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
			
 
				 
			
 
				+// Alternative generation models for query expansion:
			
 
				+// LiquidAI LFM2 - hybrid architecture optimized for edge/on-device inference
			
 
				+// These are stock GGUF builds for inference; to fine-tune, train LiquidAI/LFM2-1.2B with finetune/configs/sft_lfm2.yaml
			
 
				+export const LFM2_GENERATE_MODEL = "hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf";
			
 
				+export const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";
			
 
				+
			
 
				 export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
			
 
				 export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
			
 
				 export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;