пре 5 месеци · 38073799c0
--- a/finetune/.gitignore
+++ b/finetune/.gitignore
@@ -1,13 +1,13 @@
 
															-# Model checkpoints (stored on HuggingFace Hub)
														
 
															-qmd-query-expansion-*/
														
 
															+# Training outputs (run eval before pushing to HuggingFace)
														
 
															+outputs/
														
 
															+
														
 
															+# Model checkpoints
														
 
															 *.pt
														
 
															 *.safetensors
														
 
															 # Processed data files (regenerated by prepare_data.py)
														
 
															 data/train/
														
 
															-data/train_v2/train.jsonl
														
 
															-data/train_v2/train_chat.jsonl
														
 
															-data/train_v2/val.jsonl
														
 
															+data/train_v2/
														
 
															 data/qmd_expansion_cleaned.jsonl
														
 
															 data/quality_report.txt
														
--- a/finetune/CLAUDE.md
+++ b/finetune/CLAUDE.md
@@ -70,6 +70,21 @@ Always use **Qwen3-1.7B** as the base model unless explicitly stated otherwise.
 
															 Training can run **locally** (requires CUDA GPU) or via **HuggingFace Jobs** (cloud GPU, no local hardware needed).
														
 
															+### Stage 0: Prepare Data
														
 
															+
														
 
															+Raw data in `data/*.jsonl` must be converted to Qwen3 chat format before training:
														
 
															+
														
 
															+```bash
														
 
															+# Process all JSONL files in data/
														
 
															+uv run dataset/prepare_data.py
														
 
															+# Creates: data/train/train.jsonl, data/train/val.jsonl
														
 
															+
														
 
															+# Or process a specific file
														
 
															+uv run dataset/prepare_data.py --input data/qmd_expansion_v2.jsonl
														
 
															+```
														
 
															+
														
 
															+This applies the Qwen3 chat template, deduplicates, and splits into train/val sets.
														
 
															+
														
 
															 ### Stage 1: SFT
														
 
															 ```bash
														
--- a/finetune/configs/grpo.yaml
+++ b/finetune/configs/grpo.yaml
@@ -13,7 +13,9 @@ model:
 
															   output: "outputs/grpo"  # Local training output (push to HF manually after eval)
														
 
															 dataset:
														
 
															-  name: "tobil/qmd-query-expansion-train-v2"
														
 
															+  # Local: run `uv run dataset/prepare_data.py` first, then use "data/train/"
														
 
															+  # HuggingFace: use "tobil/qmd-query-expansion-train" (already prepared)
														
 
															+  name: "data/train/"
														
 
															   prompt_field: "messages"
														
 
															   max_samples: 1000
														
--- a/finetune/configs/sft.yaml
+++ b/finetune/configs/sft.yaml
@@ -8,7 +8,9 @@ model:
 
															   output: "outputs/sft"  # Local training output (push to HF manually after eval)
														
 
															 dataset:
														
 
															-  name: "tobil/qmd-query-expansion-train-v2"
														
 
															+  # Local: run `uv run dataset/prepare_data.py` first, then use "data/train/"
														
 
															+  # HuggingFace: use "tobil/qmd-query-expansion-train" (already prepared)
														
 
															+  name: "data/train/"
														
 
															   text_field: "text"
														
 
															   split: "train"
														
 
															   eval_split: 0.1
														
--- a/finetune/configs/sft_v4.yaml
+++ b/finetune/configs/sft_v4.yaml
@@ -1,38 +0,0 @@
 
															-# SFT Training Config - v4 with /only: support
														
 
															-# Usage: accelerate launch --config_file configs/accelerate_multi_gpu.yaml train.py sft --config configs/sft_v4.yaml
														
 
															-
														
 
															-model:
														
 
															-  base: "Qwen/Qwen3-1.7B"
														
 
															-  output: "qmd-sft-v4"
														
 
															-
														
 
															-dataset:
														
 
															-  name: "data/train_v4"
														
 
															-  text_field: "text"
														
 
															-  split: "train"
														
 
															-  eval_split: 0.1
														
 
															-
														
 
															-training:
														
 
															-  epochs: 3
														
 
															-  batch_size: 2
														
 
															-  gradient_accumulation_steps: 4
														
 
															-  learning_rate: 0.0002
														
 
															-  max_length: 512
														
 
															-  warmup_ratio: 0.03
														
 
															-  lr_scheduler: "cosine"
														
 
															-
														
 
															-lora:
														
 
															-  rank: 16
														
 
															-  alpha: 32
														
 
															-  dropout: 0.0
														
 
															-  target_modules:
														
 
															-    - "q_proj"
														
 
															-    - "k_proj"
														
 
															-    - "v_proj"
														
 
															-    - "o_proj"
														
 
															-    - "gate_proj"
														
 
															-    - "up_proj"
														
 
															-    - "down_proj"
														
 
															-
														
 
															-tracking:
														
 
															-  project: "qmd-query-expansion"
														
 
															-  run_name: "sft-1.7B-v4-only-modes"
														
--- a/finetune/data/train/dataset_info.json
+++ b/finetune/data/train/dataset_info.json
@@ -1,12 +0,0 @@
 
															-{
														
 
															-  "dataset_name": "qmd-query-expansion",
														
 
															-  "train_samples": 1891,
														
 
															-  "val_samples": 211,
														
 
															-  "short_query_pct": 33.6,
														
 
															-  "columns": [
														
 
															-    "prompt",
														
 
															-    "completion",
														
 
															-    "text",
														
 
															-    "messages"
														
 
															-  ]
														
 
															-}
														
--- a/finetune/data/train_v2/dataset_info.json
+++ b/finetune/data/train_v2/dataset_info.json
@@ -1,12 +0,0 @@
 
															-{
														
 
															-  "dataset_name": "qmd-query-expansion",
														
 
															-  "train_samples": 1145,
														
 
															-  "val_samples": 128,
														
 
															-  "short_query_pct": 29.3,
														
 
															-  "columns": [
														
 
															-    "prompt",
														
 
															-    "completion",
														
 
															-    "text",
														
 
															-    "messages"
														
 
															-  ]
														
 
															-}
														
--- a/finetune/dataset/prepare_data.py
+++ b/finetune/dataset/prepare_data.py
@@ -157,50 +157,65 @@ def format_for_training(input_text: str, output_text: str) -> dict:
 
															 def main():
														
 
															     parser = argparse.ArgumentParser(description="Prepare data for training")
														
 
															-    parser.add_argument("--input", type=str, default="data/qmd_expansion.jsonl", help="Input JSONL file")
														
 
															+    parser.add_argument("--input", type=str, default="data/*.jsonl", help="Input JSONL file(s) - supports glob patterns")
														
 
															     parser.add_argument("--output", type=str, default="data/train", help="Output directory")
														
 
															     parser.add_argument("--split", type=float, default=0.1, help="Validation split ratio")
														
 
															     parser.add_argument("--add-short", type=int, default=3, help="Variations per short query to add")
														
 
															     args = parser.parse_args()
														
 
															-    input_path = Path(args.input)
														
 
															     output_dir = Path(args.output)
														
 
															     output_dir.mkdir(parents=True, exist_ok=True)
														
 
															-    if not input_path.exists():
														
 
															-        print(f"Error: Input file not found: {input_path}")
														
 
															-        exit(1)
														
 
															-
														
 
															-    # Load and clean existing examples
														
 
															+    # Support glob patterns for input
														
 
															+    import glob
														
 
															+    if "*" in args.input:
														
 
															+        input_files = sorted(glob.glob(args.input))
														
 
															+        if not input_files:
														
 
															+            print(f"Error: No files found matching: {args.input}")
														
 
															+            exit(1)
														
 
															+        print(f"Found {len(input_files)} input files: {[Path(f).name for f in input_files]}")
														
 
															+    else:
														
 
															+        input_path = Path(args.input)
														
 
															+        if not input_path.exists():
														
 
															+            print(f"Error: Input file not found: {input_path}")
														
 
															+            exit(1)
														
 
															+        input_files = [str(input_path)]
														
 
															+
														
 
															+    # Load and clean existing examples from all input files
														
 
															     examples = []
														
 
															     seen_queries = set()
														
 
															     long_hyde_count = 0
														
 
															-    with open(input_path) as f:
														
 
															-        for line in f:
														
 
															-            if line.strip():
														
 
															-                ex = json.loads(line)
														
 
															-
														
 
															-                # Clean output (truncate hyde, remove invalid lines)
														
 
															-                original_output = ex["output"]
														
 
															-                ex["output"] = clean_output(ex["output"])
														
 
															-
														
 
															-                # Track hyde truncation
														
 
															-                if "hyde:" in original_output:
														
 
															-                    for orig_line in original_output.split("\n"):
														
 
															-                        if orig_line.strip().startswith("hyde:"):
														
 
															-                            if len(orig_line) > 160:
														
 
															-                                long_hyde_count += 1
														
 
															-
														
 
															-                # Validate cleaned output
														
 
															-                has_lex = "lex:" in ex["output"]
														
 
															-                has_vec = "vec:" in ex["output"]
														
 
															-
														
 
															-                if has_lex and has_vec:
														
 
															-                    examples.append(ex)
														
 
															-                    seen_queries.add(ex["input"].lower())
														
 
															-
														
 
															-    print(f"Loaded and cleaned {len(examples)} examples")
														
 
															+    for input_file in input_files:
														
 
															+        file_count = 0
														
 
															+        with open(input_file) as f:
														
 
															+            for line in f:
														
 
															+                if line.strip():
														
 
															+                    ex = json.loads(line)
														
 
															+
														
 
															+                    # Clean output (truncate hyde, remove invalid lines)
														
 
															+                    original_output = ex["output"]
														
 
															+                    ex["output"] = clean_output(ex["output"])
														
 
															+
														
 
															+                    # Track hyde truncation
														
 
															+                    if "hyde:" in original_output:
														
 
															+                        for orig_line in original_output.split("\n"):
														
 
															+                            if orig_line.strip().startswith("hyde:"):
														
 
															+                                if len(orig_line) > 160:
														
 
															+                                    long_hyde_count += 1
														
 
															+
														
 
															+                    # Validate cleaned output
														
 
															+                    has_lex = "lex:" in ex["output"]
														
 
															+                    has_vec = "vec:" in ex["output"]
														
 
															+
														
 
															+                    # Skip duplicates
														
 
															+                    if has_lex and has_vec and ex["input"].lower() not in seen_queries:
														
 
															+                        examples.append(ex)
														
 
															+                        seen_queries.add(ex["input"].lower())
														
 
															+                        file_count += 1
														
 
															+        print(f"  {Path(input_file).name}: {file_count} examples")
														
 
															+
														
 
															+    print(f"Loaded and cleaned {len(examples)} examples total")
														
 
															     print(f"Truncated {long_hyde_count} long hyde sections")
														
 
															     # Count existing short queries
														
--- a/finetune/prepare_v4_dataset.py
+++ b/finetune/prepare_v4_dataset.py
@@ -1,107 +0,0 @@
 
															-# /// script
														
 
															-# requires-python = ">=3.10"
														
 
															-# dependencies = []
														
 
															-# ///
														
 
															-"""Prepare v4 dataset: high-quality expansions + /only: variants."""
														
 
															-
														
 
															-import json
														
 
															-import random
														
 
															-from pathlib import Path
														
 
															-
														
 
															-def to_chat_format(query: str, output: str) -> dict:
														
 
															-    """Convert input/output to chat format with /no_think."""
														
 
															-    # For /only: queries, keep the suffix in the prompt
														
 
															-    prompt = f"/no_think Expand this search query: {query}"
														
 
															-    
														
 
															-    text = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n{output}<|im_end|>\n"
														
 
															-    
														
 
															-    messages = [
														
 
															-        {"role": "user", "content": prompt},
														
 
															-        {"role": "assistant", "content": output}
														
 
															-    ]
														
 
															-    
														
 
															-    return {"text": text, "messages": messages}
														
 
															-
														
 
															-
														
 
															-def load_jsonl(path: Path) -> list[dict]:
														
 
															-    """Load JSONL file."""
														
 
															-    data = []
														
 
															-    with open(path) as f:
														
 
															-        for line in f:
														
 
															-            line = line.strip()
														
 
															-            if line:
														
 
															-                data.append(json.loads(line))
														
 
															-    return data
														
 
															-
														
 
															-
														
 
															-def main():
														
 
															-    data_dir = Path("data")
														
 
															-    
														
 
															-    # High-quality sources
														
 
															-    sources = [
														
 
															-        ("qmd_expansion_v2.jsonl", "v2"),
														
 
															-        ("qmd_expansion_handcrafted.jsonl", "handcrafted"),
														
 
															-        ("qmd_only_variants.jsonl", "only"),
														
 
															-    ]
														
 
															-    
														
 
															-    all_examples = []
														
 
															-    stats = {}
														
 
															-    
														
 
															-    for filename, label in sources:
														
 
															-        path = data_dir / filename
														
 
															-        if not path.exists():
														
 
															-            print(f"  Skipping {filename} (not found)")
														
 
															-            continue
														
 
															-        
														
 
															-        raw = load_jsonl(path)
														
 
															-        converted = []
														
 
															-        
														
 
															-        for item in raw:
														
 
															-            query = item.get("input", "")
														
 
															-            output = item.get("output", "")
														
 
															-            if query and output:
														
 
															-                converted.append(to_chat_format(query, output))
														
 
															-        
														
 
															-        all_examples.extend(converted)
														
 
															-        stats[label] = len(converted)
														
 
															-        print(f"  {label}: {len(converted)} examples")
														
 
															-    
														
 
															-    # Shuffle
														
 
															-    random.seed(42)
														
 
															-    random.shuffle(all_examples)
														
 
															-    
														
 
															-    # Split 90/10
														
 
															-    split_idx = int(len(all_examples) * 0.9)
														
 
															-    train = all_examples[:split_idx]
														
 
															-    val = all_examples[split_idx:]
														
 
															-    
														
 
															-    # Write output
														
 
															-    out_dir = data_dir / "train_v4"
														
 
															-    out_dir.mkdir(exist_ok=True)
														
 
															-    
														
 
															-    with open(out_dir / "train.jsonl", "w") as f:
														
 
															-        for ex in train:
														
 
															-            f.write(json.dumps(ex) + "\n")
														
 
															-    
														
 
															-    with open(out_dir / "val.jsonl", "w") as f:
														
 
															-        for ex in val:
														
 
															-            f.write(json.dumps(ex) + "\n")
														
 
															-    
														
 
															-    # Dataset info
														
 
															-    info = {
														
 
															-        "dataset_name": "qmd-query-expansion-v4",
														
 
															-        "train_samples": len(train),
														
 
															-        "val_samples": len(val),
														
 
															-        "sources": stats,
														
 
															-    }
														
 
															-    with open(out_dir / "dataset_info.json", "w") as f:
														
 
															-        json.dump(info, f, indent=2)
														
 
															-    
														
 
															-    print(f"\n✓ Dataset prepared in {out_dir}/")
														
 
															-    print(f"  Train: {len(train)}")
														
 
															-    print(f"  Val: {len(val)}")
														
 
															-    print(f"  Total: {len(all_examples)}")
														
 
															-
														
 
															-
														
 
															-if __name__ == "__main__":
														
 
															-    main()
														
--- a/finetune/train.py
+++ b/finetune/train.py
@@ -50,15 +50,25 @@ def cmd_sft(args):
 
															     dataset_name = cfg["dataset"]["name"]
														
 
															     print(f"Loading dataset: {dataset_name}...")
														
 
															-    # Support local JSONL files
														
 
															+    # Support local JSONL files and glob patterns
														
 
															     if dataset_name.startswith("data/") or dataset_name.endswith(".jsonl"):
														
 
															         from pathlib import Path
														
 
															-        data_path = Path(dataset_name)
														
 
															-        if data_path.is_dir():
														
 
															-            train_file = data_path / "train.jsonl"
														
 
															-            dataset = load_dataset("json", data_files=str(train_file), split="train")
														
 
															+        import glob
														
 
															+
														
 
															+        # Handle glob patterns like "data/*.jsonl"
														
 
															+        if "*" in dataset_name:
														
 
															+            jsonl_files = sorted(glob.glob(dataset_name))
														
 
															+            if not jsonl_files:
														
 
															+                raise ValueError(f"No files found matching: {dataset_name}")
														
 
															+            print(f"  Found {len(jsonl_files)} JSONL files: {[Path(f).name for f in jsonl_files]}")
														
 
															+            dataset = load_dataset("json", data_files=jsonl_files, split="train")
														
 
															         else:
														
 
															-            dataset = load_dataset("json", data_files=dataset_name, split="train")
														
 
															+            data_path = Path(dataset_name)
														
 
															+            if data_path.is_dir():
														
 
															+                train_file = data_path / "train.jsonl"
														
 
															+                dataset = load_dataset("json", data_files=str(train_file), split="train")
														
 
															+            else:
														
 
															+                dataset = load_dataset("json", data_files=dataset_name, split="train")
														
 
															     else:
														
 
															         dataset = load_dataset(dataset_name, split=cfg["dataset"]["split"])
														
 
															     print(f"Dataset loaded: {len(dataset)} examples")