4 месяцев назад · 67e2aab18c
--- a/finetune/README.md
+++ b/finetune/README.md
@@ -297,3 +297,42 @@ deterministic, and suitable as an RL signal. See `SCORING.md` for the full rubri
 
															 |-------|--------------|-----------------|
														
 
															 | SFT | 92.0% | 30/30 |
														
 
															 | GRPO | 91.7% | 30/30 |
														
 
															+
														
 
															+## Alternative Base Models
														
 
															+
														
 
															+### LiquidAI LFM2 (Experimental)
														
 
															+
														
 
															+[LFM2](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models) 
														
 
															+is a hybrid architecture from Liquid AI optimized for on-device inference. It uses
														
 
															+a novel combination of convolutions and attention that achieves 2x faster decode
														
 
															+and prefill on CPU than Qwen3, according to Liquid AI's published benchmarks.
														
 
															+
														
 
															+**Why LFM2 for query expansion:**
														
 
															+- **Faster inference**: Lower latency for real-time search applications
														
 
															+- **Memory efficient**: Smaller memory footprint than equivalent transformers
														
 
															+- **Edge-optimized**: Can run on mobile devices and embedded systems
														
 
															+- **Good at agentic tasks**: Liquid AI recommends LFM2 for RAG and data extraction
														
 
															+
														
 
															+**Training with LFM2:**
														
 
															+
														
 
															+```bash
														
 
															+# SFT with LFM2-1.2B base model
														
 
															+uv run train.py sft --config configs/sft_lfm2.yaml
														
 
															+
														
 
															+# Evaluate the trained model
														
 
															+uv run eval.py --model outputs/sft-lfm2
														
 
															+
														
 
															+# Convert to GGUF for deployment
														
 
															+uv run convert_gguf.py --base LiquidAI/LFM2-1.2B \
														
 
															+                       --sft outputs/sft-lfm2 \
														
 
															+                       --output tobil/qmd-query-expansion-lfm2-gguf
														
 
															+```
														
 
															+
														
 
															+**Key differences from Qwen3:**
														
 
															+- Different LoRA target modules: `q_proj, k_proj, v_proj, out_proj, in_proj, w1, w2, w3`
														
 
															+- Recommended generation parameters: `temp=0.3, min_p=0.15, repetition_penalty=1.05`
														
 
															+- Requires `transformers >= 4.55.0` for LFM2 architecture support
														
 
															+
														
 
															+**Pre-trained GGUF models (upstream Liquid AI releases, not fine-tuned for query expansion):**
														
 
															+- Base: `hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf` (~731 MB)
														
 
															+- Instruct: `hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf` (~731 MB)
														
--- a/finetune/configs/sft_lfm2.yaml
+++ b/finetune/configs/sft_lfm2.yaml
@@ -0,0 +1,60 @@
 
															+# SFT Training Config for QMD Query Expansion with LiquidAI LFM2
														
 
															+# Target: LFM2-1.2B with LoRA (hybrid architecture: convolutions + attention)
														
 
															+#
														
 
															+# LFM2 is optimized for on-device inference with fast decode/prefill.
														
 
															+# Recommended for: agentic tasks, data extraction, RAG, creative writing.
														
 
															+#
														
 
															+# Usage: uv run train.py sft --config configs/sft_lfm2.yaml
														
 
															+#
														
 
															+# Requirements:
														
 
															+#   - transformers >= 4.55.0 (LFM2 architecture support)
														
 
															+#   - May need: pip install -U transformers
														
 
															+
														
 
															+model:
														
 
															+  base: "LiquidAI/LFM2-1.2B"
														
 
															+  output: "outputs/sft-lfm2"  # Local training output (push to HF manually after eval)
														
 
															+
														
 
															+dataset:
														
 
															+  # Local: run `uv run dataset/prepare_data.py` first, then use "data/train/"
														
 
															+  # HuggingFace: use "tobil/qmd-query-expansion-train" (already prepared)
														
 
															+  name: "data/train/"
														
 
															+  text_field: "text"
														
 
															+  split: "train"
														
 
															+  eval_split: 0.1
														
 
															+
														
 
															+training:
														
 
															+  epochs: 5
														
 
															+  batch_size: 4
														
 
															+  gradient_accumulation_steps: 4
														
 
															+  learning_rate: 2e-4
														
 
															+  max_length: 512
														
 
															+  warmup_ratio: 0.03
														
 
															+  lr_scheduler: "cosine"
														
 
															+
														
 
															+lora:
														
 
															+  rank: 16
														
 
															+  alpha: 32
														
 
															+  dropout: 0.0
														
 
															+  # LFM2 uses different architecture than standard transformers:
														
 
															+  # - Attention layers: q_proj, k_proj, v_proj, out_proj
														
 
															+  # - Input projection: in_proj  
														
 
															+  # - FFN/MLP gates: w1, w2, w3 (SwiGLU activation)
														
 
															+  target_modules:
														
 
															+    - "q_proj"
														
 
															+    - "k_proj"
														
 
															+    - "v_proj"
														
 
															+    - "out_proj"
														
 
															+    - "in_proj"
														
 
															+    - "w1"
														
 
															+    - "w2"
														
 
															+    - "w3"
														
 
															+
														
 
															+tracking:
														
 
															+  project: "qmd-query-expansion"
														
 
															+  run_name: "sft-lfm2-1.2B"
														
 
															+
														
 
															+# LFM2-specific generation settings (recommended by LiquidAI)
														
 
															+generation:
														
 
															+  temperature: 0.3
														
 
															+  min_p: 0.15
														
 
															+  repetition_penalty: 1.05
														
--- a/finetune/jobs/sft_lfm2.py
+++ b/finetune/jobs/sft_lfm2.py
@@ -0,0 +1,106 @@
 
															+# /// script
														
 
															+# requires-python = ">=3.10"
														
 
															+# dependencies = [
														
 
															+#     "trl>=0.12.0",
														
 
															+#     "peft>=0.7.0",
														
 
															+#     "transformers>=4.55.0",
														
 
															+#     "accelerate>=0.24.0",
														
 
															+#     "huggingface_hub>=0.20.0",
														
 
															+#     "datasets",
														
 
															+#     "bitsandbytes",
														
 
															+#     "torch",
														
 
															+# ]
														
 
															+# ///
														
 
															+"""
														
 
															+SFT training for QMD query expansion with LiquidAI LFM2-1.2B.
														
 
															+
														
 
															+LFM2 is a hybrid architecture optimized for edge/on-device inference.
														
 
															+Uses different LoRA target modules than standard transformers.
														
 
															+
														
 
															+Self-contained script for HuggingFace Jobs:
														
 
															+    hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 2h jobs/sft_lfm2.py
														
 
															+"""
														
 
															+
														
 
															+import os
														
 
															+from huggingface_hub import login
														
 
															+
														
 
															+# --- Config (mirrors configs/sft_lfm2.yaml; dataset and output point at the Hub instead of local paths) ---
														
 
															+BASE_MODEL = "LiquidAI/LFM2-1.2B"
														
 
															+OUTPUT_MODEL = "tobil/qmd-query-expansion-lfm2-sft"
														
 
															+DATASET = "tobil/qmd-query-expansion-train"
														
 
															+
														
 
															+hf_token = os.environ.get("HF_TOKEN")
														
 
															+if hf_token:
														
 
															+    login(token=hf_token)
														
 
															+
														
 
															+from datasets import load_dataset
														
 
															+from peft import LoraConfig
														
 
															+from transformers import AutoTokenizer
														
 
															+from trl import SFTTrainer, SFTConfig
														
 
															+
														
 
															+# Load and split dataset
														
 
															+print(f"Loading dataset: {DATASET}...")
														
 
															+dataset = load_dataset(DATASET, split="train")
														
 
															+print(f"Dataset loaded: {len(dataset)} examples")
														
 
															+
														
 
															+split = dataset.train_test_split(test_size=0.1, seed=42)
														
 
															+train_dataset = split["train"]
														
 
															+eval_dataset = split["test"]
														
 
															+print(f"  Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")
														
 
															+
														
 
															+# SFT config
														
 
															+config = SFTConfig(
														
 
															+    output_dir="qmd-query-expansion-lfm2-sft",
														
 
															+    push_to_hub=True,
														
 
															+    hub_model_id=OUTPUT_MODEL,
														
 
															+    hub_strategy="every_save",
														
 
															+
														
 
															+    num_train_epochs=5,
														
 
															+    per_device_train_batch_size=4,
														
 
															+    gradient_accumulation_steps=4,
														
 
															+    learning_rate=2e-4,
														
 
															+    max_length=512,
														
 
															+
														
 
															+    logging_steps=10,
														
 
															+    save_strategy="steps",
														
 
															+    save_steps=200,
														
 
															+    save_total_limit=2,
														
 
															+    eval_strategy="steps",
														
 
															+    eval_steps=200,
														
 
															+
														
 
															+    warmup_ratio=0.03,
														
 
															+    lr_scheduler_type="cosine",
														
 
															+    bf16=True,
														
 
															+
														
 
															+    report_to="none",
														
 
															+)
														
 
															+
														
 
															+# LoRA config for LFM2 architecture
														
 
															+# LFM2 uses different layer names than standard transformers:
														
 
															+# - Attention: q_proj, k_proj, v_proj, out_proj
														
 
															+# - Input projection: in_proj
														
 
															+# - FFN/MLP gates (SwiGLU): w1, w2, w3
														
 
															+peft_config = LoraConfig(
														
 
															+    r=16,
														
 
															+    lora_alpha=32,
														
 
															+    lora_dropout=0.0,
														
 
															+    bias="none",
														
 
															+    task_type="CAUSAL_LM",
														
 
															+    target_modules=["q_proj", "k_proj", "v_proj", "out_proj", "in_proj", "w1", "w2", "w3"],
														
 
															+)
														
 
															+
														
 
															+print("Initializing SFT trainer...")
														
 
															+trainer = SFTTrainer(
														
 
															+    model=BASE_MODEL,
														
 
															+    train_dataset=train_dataset,
														
 
															+    eval_dataset=eval_dataset,
														
 
															+    args=config,
														
 
															+    peft_config=peft_config,
														
 
															+)
														
 
															+
														
 
															+print("Starting SFT training (LFM2-1.2B)...")
														
 
															+trainer.train()
														
 
															+
														
 
															+print("Pushing to Hub...")
														
 
															+trainer.push_to_hub()
														
 
															+print(f"Done! Model: https://huggingface.co/{OUTPUT_MODEL}")
														
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -179,6 +179,12 @@ const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-re
 
															 // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
														
 
															 const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
														
 
															+// Alternative generation models for query expansion:
														
 
															+// LiquidAI LFM2 - hybrid architecture optimized for edge/on-device inference
														
 
															+// These are upstream GGUF builds; fine-tuning (finetune/configs/sft_lfm2.yaml) starts from the "LiquidAI/LFM2-1.2B" checkpoint instead.
														
 
															+export const LFM2_GENERATE_MODEL = "hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf";
														
 
															+export const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";
														
 
															+
														
 
															 export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
														
 
															 export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
														
 
															 export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;