4 сар өмнө · 189916d6fb
--- a/finetune/CLAUDE.md
+++ b/finetune/CLAUDE.md
@@ -38,11 +38,11 @@ The schema is enforced by `dataset/schema.py:TrainingExample` (Pydantic model).
 
				 
			
 
				 | Repository | Purpose |
			
 
				 |------------|---------|
			
 
				-| `tobil/qmd-query-expansion-1.7B` | Final merged model (SFT + GRPO) |
			
 
				+| `tobil/qmd-query-expansion-1.7B` | Final merged model (SFT baseline) |
			
 
				 | `tobil/qmd-query-expansion-1.7B-gguf` | GGUF quantized versions for deployment |
			
 
				 | `tobil/qmd-query-expansion-1.7B-sft` | SFT adapter checkpoint (intermediate) |
			
 
				-| `tobil/qmd-query-expansion-1.7B-grpo` | GRPO adapter checkpoint (intermediate) |
			
 
				 | `tobil/qmd-query-expansion-train` | Prepared training dataset |
			
 
				+| `tobil/qmd-query-expansion-1.7B-grpo` | Experimental GRPO adapter (optional) |
			
 
				 
			
 
				 **Rules:**
			
 
				 - No versioned repos (`-v1`, `-v2`, `-v4`, etc.) - update in place
			
@@ -80,14 +80,14 @@ uv run train.py sft --config configs/sft.yaml
 
				 hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 2h jobs/sft.py
			
 
				 ```
			
 
				 
			
 
				-### Stage 2: GRPO
			
 
				+### Stage 2: (Experimental) GRPO
			
 
				 
			
 
				 ```bash
			
 
				-# Local (requires CUDA)
			
 
				-uv run train.py grpo --config configs/grpo.yaml
			
 
				+# Local (optional; experimental)
			
 
				+uv run train.py grpo --config experiments/grpo/grpo.yaml
			
 
				 
			
 
				-# Cloud (HuggingFace Jobs)
			
 
				-hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 4h jobs/grpo.py
			
 
				+# Experimental script
			
 
				+HF_TOKEN=${HF_TOKEN} uv run experiments/grpo/grpo.py
			
 
				 ```
			
 
				 
			
 
				 ### HuggingFace Jobs
			
@@ -102,9 +102,9 @@ hf jobs cancel <job-id>       # Cancel a job
 
				 ### Evaluation
			
 
				 
			
 
				 ```bash
			
 
				-uv run eval.py --model ./outputs/grpo
			
 
				-uv run eval.py --model tobil/qmd-query-expansion-1.7B
			
 
				-uv run eval.py --model ./outputs/grpo -o eval_results.json
			
 
				+uv run eval.py ./outputs/sft
			
 
				+uv run eval.py tobil/qmd-query-expansion-1.7B
			
 
				+uv run eval.py ./outputs/sft -o eval_results.json
			
 
				 ```
			
 
				 
			
 
				 ## Quality Scoring
			
@@ -126,6 +126,9 @@ experiments/
 
				 ├── lfm2/          # LiquidAI LFM2-1.2B (hybrid architecture, faster inference)
			
 
				 │   ├── sft_lfm2.yaml
			
 
				 │   └── sft_lfm2.py
			
 
				+├── grpo/          # Experimental GRPO recipe and config
			
 
				+│   ├── grpo.py
			
 
				+│   └── grpo.yaml
			
 
				 └── gepa/          # DSPy-based prompt optimization (GEPA)
			
 
				     ├── dspy_gepa.py
			
 
				     └── ...
			
@@ -138,7 +141,7 @@ These are not part of the main training pipeline.
 
				 ```
			
 
				 finetune/
			
 
				 ├── reward.py          # Scoring function (single source of truth)
			
 
				-├── train.py           # Unified SFT + GRPO training
			
 
				+├── train.py           # SFT training entrypoint
			
 
				 ├── eval.py            # Generate and score expansions
			
 
				 ├── convert_gguf.py    # GGUF conversion
			
 
				 ├── SCORING.md         # Detailed scoring rubric
			
@@ -147,8 +150,8 @@ finetune/
 
				 ├── data/              # All training JSONL files (strict schema)
			
 
				 ├── dataset/           # Schema + data tools (Pydantic-based)
			
 
				 ├── jobs/              # Self-contained HuggingFace Jobs scripts
			
 
				-├── configs/           # Training configs (sft.yaml, grpo.yaml)
			
 
				+├── configs/           # Training configs (sft.yaml)
			
 
				 ├── evals/             # Test queries
			
 
				-├── experiments/       # Experimental configs (LFM2, GEPA)
			
 
				+├── experiments/       # Experimental configs (LFM2, GEPA, GRPO)
			
 
				 └── outputs/           # Local training outputs (gitignored)
			
 
				 ```
			
--- a/finetune/Justfile
+++ b/finetune/Justfile
@@ -26,6 +26,9 @@ train-local:
 
				     HF_TOKEN=${HF_TOKEN} uv run torchrun --standalone --nproc_per_node auto \
			
 
				       train.py sft --config configs/sft_local.yaml |& tee /tmp/qmd-sft-train.log
			
 
				 
			
 
				-grpo-local:
			
 
				-    CUDA_VISIBLE_DEVICES=1,2,3 HF_TOKEN=${HF_TOKEN} uv run torchrun --standalone --nproc_per_node 3 \
			
 
				-      train.py grpo --config configs/grpo.yaml |& tee /tmp/qmd-grpo-train.log
			
 
				+# Experimental GRPO training is in finetune/experiments/grpo and not part of
			
 
				+# the default pipeline.
			
 
				+#
			
 
				+# grpo-local:
			
 
				+#   HF_TOKEN=${HF_TOKEN} uv run train.py grpo --config experiments/grpo/grpo.yaml |& tee /tmp/qmd-grpo-train.log
			
 
				+
			
--- a/finetune/README.md
+++ b/finetune/README.md
@@ -40,22 +40,23 @@ These feed into QMD's three search backends:
 
				 # 1. SFT: teach the model the output format (~45 min on A10G, ~$1.50)
			
 
				 hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 2h jobs/sft.py
			
 
				 
			
 
				-# 2. GRPO: RL refinement on top of SFT (~20 min on A10G, ~$0.50)
			
 
				-hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 4h jobs/grpo.py
			
 
				+# 2. Evaluate against test queries (needs local GPU or use eval job)
			
 
				+uv run eval.py tobil/qmd-query-expansion-1.7B
			
 
				 
			
 
				-# 3. Evaluate against test queries (needs local GPU or use eval job)
			
 
				-uv run eval.py --model tobil/qmd-query-expansion-1.7B-grpo \
			
 
				-               --sft-model tobil/qmd-query-expansion-1.7B-sft
			
 
				-
			
 
				-# 4. Convert to GGUF for local deployment (Ollama, llama.cpp)
			
 
				+# 3. Convert to GGUF for local deployment (Ollama, llama.cpp)
			
 
				 uv run convert_gguf.py --size 1.7B
			
 
				+
			
 
				+# NOTE: GRPO is currently experimental and has moved to finetune/experiments/grpo.
			
 
				+# To run it manually, use: uv run python experiments/grpo/grpo.py
			
 
				 ```
			
 
				 
			
 
				 ### Local training (if you have a GPU)
			
 
				 
			
 
				 ```bash
			
 
				 uv run train.py sft  --config configs/sft.yaml
			
 
				-uv run train.py grpo --config configs/grpo.yaml
			
 
				+
			
 
				+# Experimental GRPO
			
 
				+uv run train.py grpo --config experiments/grpo/grpo.yaml
			
 
				 ```
			
 
				 
			
 
				 ### Monitoring HF Jobs
			
@@ -85,19 +86,19 @@ direct `lex:/vec:/hyde:` output without `<think>` blocks.
 
				 ```
			
 
				 finetune/
			
 
				 ├── reward.py          # Scoring/reward function (single source of truth)
			
 
				-├── train.py           # Unified SFT + GRPO training (two subcommands)
			
 
				+├── train.py           # SFT training entrypoint
			
 
				 ├── eval.py            # Generate expansions and score them
			
 
				 ├── convert_gguf.py    # GGUF conversion for Ollama/llama.cpp
			
 
				 ├── jobs/
			
 
				 │   ├── sft.py         # Self-contained SFT for HuggingFace Jobs
			
 
				-│   ├── grpo.py        # Self-contained GRPO for HuggingFace Jobs
			
 
				 │   ├── eval.py        # Self-contained eval for HuggingFace Jobs
			
 
				 │   └── eval_common.py # Shared eval utilities
			
 
				 ├── configs/
			
 
				-│   ├── sft.yaml       # SFT hyperparameters for Qwen3-1.7B
			
 
				-│   └── grpo.yaml      # GRPO hyperparameters for Qwen3-1.7B
			
 
				+│   └── sft.yaml       # SFT hyperparameters for Qwen3-1.7B
			
 
				 ├── evals/
			
 
				 │   └── queries.txt    # 31 test queries across 8 categories
			
 
				+├── experiments/
			
 
				+│   └── grpo/          # Experimental GRPO configuration and script (optional)
			
 
				 ├── data/              # Training JSONL files (all concatenated for training)
			
 
				 ├── dataset/
			
 
				 │   ├── prepare_data.py     # Format for Qwen3 chat template, dedup, split
			
@@ -130,29 +131,14 @@ uv run train.py sft --config configs/sft.yaml
 
				 uv run train.py sft --config configs/sft.yaml --dry-run  # preview config
			
 
				 ```
			
 
				 
			
 
				-### Stage 2: GRPO (Group Relative Policy Optimization)
			
 
				+### Stage 2: (Experimental) GRPO
			
 
				 
			
 
				-Reinforcement learning on top of the merged SFT weights. The model generates
			
 
				-multiple expansions per query, they are scored by the reward function, and the
			
 
				-model is updated to prefer higher-scoring outputs.
			
 
				-
			
 
				-| Parameter | Value |
			
 
				-|-----------|-------|
			
 
				-| Base | Merged SFT checkpoint |
			
 
				-| Method | LoRA (rank 4, alpha 8) — smaller for RL stability |
			
 
				-| Target modules | q_proj, v_proj only |
			
 
				-| Reward | `reward.py` (rule-based, 5 dimensions) |
			
 
				-| KL beta | 0.04 — prevents drift from SFT checkpoint |
			
 
				-| Generations per prompt | 4 |
			
 
				-| Max steps | 200 |
			
 
				-| Learning rate | 5e-7 |
			
 
				-
			
 
				-**Important:** `beta > 0` is critical. With `beta=0` the model experiences
			
 
				-catastrophic drift and scores drop to 0%.
			
 
				+GRPO is currently treated as experimental and kept under `experiments/grpo/`.
			
 
				+It is not part of the default production path for this repository.
			
 
				 
			
 
				 ```bash
			
 
				-uv run train.py grpo --config configs/grpo.yaml
			
 
				-uv run train.py grpo --config configs/grpo.yaml --dry-run  # test reward function
			
 
				+# Optional experimental GRPO run
			
 
				+uv run train.py grpo --config experiments/grpo/grpo.yaml
			
 
				 ```
			
 
				 
			
 
				 ## Evaluation
			
@@ -160,24 +146,26 @@ uv run train.py grpo --config configs/grpo.yaml --dry-run  # test reward functio
 
				 `eval.py` generates expansions from a model and scores them against test queries:
			
 
				 
			
 
				 ```bash
			
 
				-# Evaluate an SFT model
			
 
				+# Evaluate an SFT model
			
 
				 uv run eval.py --model tobil/qmd-query-expansion-1.7B-sft
			
 
				 
			
 
				-# Evaluate a GRPO model (needs SFT adapter merged first)
			
 
				-uv run eval.py --model tobil/qmd-query-expansion-1.7B-grpo \
			
 
				-               --sft-model tobil/qmd-query-expansion-1.7B-sft
			
 
				+# Evaluate an SFT output dir
			
 
				+uv run eval.py outputs/sft
			
 
				 
			
 
				 # Verbose output with deduction details
			
 
				-uv run eval.py --model tobil/qmd-query-expansion-1.7B-sft -v
			
 
				+uv run eval.py tobil/qmd-query-expansion-1.7B -v
			
 
				+
			
 
				+# Optional: evaluate GRPO experimental output (if run)
			
 
				+uv run eval.py outputs/grpo
			
 
				 
			
 
				 # Save detailed scores to JSON
			
 
				-uv run eval.py --model tobil/qmd-query-expansion-1.7B-sft -o scores.json
			
 
				+uv run eval.py tobil/qmd-query-expansion-1.7B -o scores.json
			
 
				 ```
			
 
				 
			
 
				 ## Reward Function
			
 
				 
			
 
				-`reward.py` is the single source of truth for scoring. It is used both as the
			
 
				-GRPO reward signal during training and for evaluation.
			
 
				+`reward.py` is the single source of truth for scoring. It is used for evaluation
			
 
				+and (optionally) as the GRPO reward signal in the experimental path.
			
 
				 
			
 
				 Five scoring dimensions (max 120 without hyde, 140 with):
			
 
				 
			
@@ -201,8 +189,8 @@ uv run reward.py
 
				 
			
 
				 ## GGUF Conversion
			
 
				 
			
 
				-Merges base + SFT + GRPO adapters into a single model and produces
			
 
				-quantized GGUF files for deployment:
			
 
				+Merges base + SFT and (optionally) GRPO adapters into a single model, then
			
 
				+produces quantized GGUF files for deployment:
			
 
				 
			
 
				 ```bash
			
 
				 # Use preset for 1.7B
			
@@ -240,15 +228,14 @@ just validate
 
				 
			
 
				 ## Architecture Notes
			
 
				 
			
 
				-The two-stage training approach (SFT -> GRPO) is standard for structured-output models:
			
 
				+The production training approach is currently **SFT-only**:
			
 
				 
			
 
				 1. **SFT** establishes format compliance and basic query understanding. It uses
			
 
				    a large LoRA (rank 16, all projection layers) because it needs to learn a
			
 
				    new output format from scratch.
			
 
				 
			
 
				-2. **GRPO** refines quality within the learned format. It uses a small LoRA
			
 
				-   (rank 4, q/v only) and KL regularization to make incremental improvements
			
 
				-   without losing what SFT taught.
			
 
				+2. **GRPO** exists as an optional experimental path under `experiments/grpo/`
			
 
				+   and is not in the production training pipeline.
			
 
				 
			
 
				 The reward function is entirely rule-based (no LLM judge) which makes it fast,
			
 
				 deterministic, and suitable as an RL signal. See `SCORING.md` for the full rubric.
			
@@ -266,20 +253,12 @@ deterministic, and suitable as an RL signal. See `SCORING.md` for the full rubri
 
				 | Epochs | 5 |
			
 
				 | Hardware | A10G (24 GB VRAM) |
			
 
				 
			
 
				-### GRPO
			
 
				-
			
 
				-| Metric | Value |
			
 
				-|--------|-------|
			
 
				-| Mean reward | 0.757 |
			
 
				-| Final loss | 0.0005 |
			
 
				-| KL divergence | 0.00048 |
			
 
				-| Mean completion length | ~58 tokens |
			
 
				-| Training time | ~19 min (200 steps) |
			
 
				-| Hardware | A10G (24 GB VRAM) |
			
 
				-
			
 
				 ### Evaluation Scores
			
 
				 
			
 
				 | Model | Average Score | Excellent (30) |
			
 
				 |-------|--------------|-----------------|
			
 
				 | SFT | 92.0% | 30/30 |
			
 
				-| GRPO | 91.7% | 30/30 |
			
 
				+
			
 
				+> GRPO scores are not tracked in this branch; see `experiments/grpo/` for historical
			
 
				+> experimental results.
			
 
				+
			
--- a/finetune/experiments/grpo/README.md
+++ b/finetune/experiments/grpo/README.md
@@ -0,0 +1,26 @@
 
				+# GRPO (Experimental)
			
 
				+
			
 
				+This folder contains the **experimental** GRPO training path for query expansion.
			
 
				+It is not part of the default production pipeline.
			
 
				+
			
 
				+## Files
			
 
				+
			
 
				+- `grpo.yaml` – experimental GRPO hyperparameters
			
 
				+- `grpo.py` – standalone GRPO training script
			
 
				+
			
 
				+## Run
			
 
				+
			
 
				+```bash
			
 
				+# Recommended default: run from repo root
			
 
				+cd /path/to/qmd  # replace with the path to your local checkout
			
 
				+uv run finetune/experiments/grpo/grpo.py
			
 
				+
			
 
				+# Or use the unified entrypoint (deprecated in main pipeline), also from the repo root:
			
 
				+uv run finetune/train.py grpo --config finetune/experiments/grpo/grpo.yaml
			
 
				+```
			
 
				+
			
 
				+## Notes
			
 
				+
			
 
				+- Current mainline focuses on SFT-only quality and benchmarks.
			
 
				+- Keep this workflow isolated unless you are explicitly experimenting with
			
 
				+  reinforcement-learning refinement.
			
--- a/finetune/experiments/grpo/grpo.py
+++ b/finetune/experiments/grpo/grpo.py
@@ -14,8 +14,10 @@
 
				 """
			
 
				 GRPO training for QMD query expansion (Qwen3-1.7B).
			
 
				 
			
 
				-Runs on top of merged SFT weights. Self-contained for HuggingFace Jobs:
			
 
				-    hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 4h jobs/grpo.py
			
 
				+Experimental recipe run on top of merged SFT weights. Self-contained runner:
			
 
				+    uv run experiments/grpo/grpo.py
			
 
				+
			
 
				+(If using HF Jobs, run this script as the job entrypoint.)
			
 
				 """
			
 
				 
			
 
				 import os
			
@@ -42,7 +44,7 @@ if not os.path.exists(_eval_common_path):
 
				 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
			
 
				 from eval_common import QMDRewardFunction, run_eval
			
 
				 
			
 
				-# --- Config (inlined from configs/grpo.yaml) ---
			
 
				+# --- Config (inlined from experiments/grpo/grpo.yaml) ---
			
 
				 BASE_MODEL = "Qwen/Qwen3-1.7B"
			
 
				 SFT_MODEL = "tobil/qmd-query-expansion-1.7B-sft"
			
 
				 OUTPUT_MODEL = "tobil/qmd-query-expansion-1.7B-grpo"
			
--- a/finetune/experiments/grpo/grpo.yaml
+++ b/finetune/experiments/grpo/grpo.yaml
@@ -1,7 +1,7 @@
 
				 # GRPO Training Config for QMD Query Expansion
			
 
				 # Target: Qwen3-1.7B, trained on top of merged SFT weights
			
 
				 #
			
 
				-# Usage: uv run train.py grpo --config configs/grpo.yaml
			
 
				+# Usage: uv run train.py grpo --config experiments/grpo/grpo.yaml
			
 
				 #
			
 
				 # The reward function (reward.py) scores expansions on format compliance,
			
 
				 # diversity, hyde quality, content quality, and named entity preservation.
			
--- a/finetune/train.py
+++ b/finetune/train.py
@@ -18,14 +18,14 @@
 
				 """
			
 
				 Unified training script for QMD query expansion models.
			
 
				 
			
 
				-Supports two stages:
			
 
				+Primary pipeline is SFT-only:
			
 
				   sft  - Supervised fine-tuning on labeled examples
			
 
				-  grpo - Group Relative Policy Optimization (RL) on top of merged SFT weights
			
 
				+
			
 
				+GRPO was moved to `experiments/grpo/` and is not part of the main training
			
 
				+pipeline by default.
			
 
				 
			
 
				 Usage:
			
 
				     uv run train.py sft  --config configs/sft.yaml
			
 
				-    uv run train.py grpo --config configs/grpo.yaml
			
 
				-    uv run train.py grpo --config configs/grpo.yaml --dry-run
			
 
				 """
			
 
				 
			
 
				 import argparse
			
@@ -412,6 +412,15 @@ def cmd_sft(args):
 
				 
			
 
				 def cmd_grpo(args):
			
 
				     """Run GRPO reinforcement learning on top of merged SFT weights."""
			
 
				+    print(
			
 
				+        "GRPO is not part of the main training pipeline and has been moved to `experiments/grpo/`."
			
 
				+    )
			
 
				+    print("To run experimental GRPO, use:")
			
 
				+    print("  cd finetune && uv run python experiments/grpo/grpo.py")
			
 
				+    print("Or, if you have local config wiring ready:")
			
 
				+    print("  uv run train.py grpo --config experiments/grpo/grpo.yaml")
			
 
				+    return
			
 
				+
			
 
				     import torch
			
 
				     import torch.distributed as dist
			
 
				     import os
			
@@ -645,8 +654,6 @@ def main():
 
				         epilog="""
			
 
				 Examples:
			
 
				   uv run train.py sft  --config configs/sft.yaml
			
 
				-  uv run train.py grpo --config configs/grpo.yaml
			
 
				-  uv run train.py grpo --config configs/grpo.yaml --dry-run
			
 
				         """,
			
 
				     )
			
 
				     sub = parser.add_subparsers(dest="stage", required=True)
			
@@ -657,7 +664,10 @@ Examples:
 
				         "--dry-run", action="store_true", help="Print config and exit"
			
 
				     )
			
 
				 
			
 
				-    grpo_parser = sub.add_parser("grpo", help="GRPO reinforcement learning")
			
 
				+    grpo_parser = sub.add_parser(
			
 
				+        "grpo",
			
 
				+        help="Experimental: GRPO reinforcement learning (moved to experiments/grpo/)",
			
 
				+    )
			
 
				     grpo_parser.add_argument("--config", required=True, help="Path to GRPO config YAML")
			
 
				     grpo_parser.add_argument(
			
 
				         "--dry-run", action="store_true", help="Print config, test reward, and exit"