# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "transformers>=4.45.0",
#     "peft>=0.7.0",
#     "torch",
#     "huggingface_hub>=0.20.0",
#     "accelerate",
#     "sentencepiece>=0.1.99",
#     "protobuf>=3.20.0",
#     "numpy",
#     "gguf",
# ]
# ///
"""
Merge SFT + GRPO adapters and convert to GGUF with multiple quantizations.

Uploads each quantization to HuggingFace Hub as it's produced, so partial
results are available even if the job times out.

    hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 2h jobs/quantize.py
    hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 2h jobs/quantize.py -- --size 4B
"""

import argparse
import os
import subprocess
import sys

import torch
from huggingface_hub import HfApi, login
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub repositories for each supported model size: the base checkpoint, the
# two adapters merged into it (SFT first, then GRPO), and the repository that
# receives the GGUF files. The repository names differ only by the size tag.
PRESETS = {
    size: {
        "base": f"Qwen/Qwen3-{size}",
        "sft": f"tobil/qmd-query-expansion-{size}-sft",
        "grpo": f"tobil/qmd-query-expansion-{size}-grpo",
        "output": f"tobil/qmd-query-expansion-{size}-gguf",
    }
    for size in ("1.7B", "4B")
}

# (quantization type passed to llama-quantize, human-readable description),
# processed in this order.
QUANT_TYPES = [
    ("Q4_K_M", "4-bit (recommended for most use)"),
    ("Q5_K_M", "5-bit (balanced quality/size)"),
    ("Q8_0", "8-bit (highest quality)"),
]


def run_cmd(cmd: list[str], description: str) -> bool:
    """Run an external command quietly and report whether it succeeded.

    Prints ``description`` as a progress line, then runs ``cmd`` with its
    stdout/stderr captured. Failures are reported on stdout and signalled via
    the return value; a non-zero exit status or a missing executable never
    raises.

    Args:
        cmd: Program and arguments, passed to ``subprocess.run`` as a list.
        description: Short label printed before the command runs.

    Returns:
        True if the command exited with status 0, False if it exited non-zero
        or the executable could not be found.
    """
    print(f"  {description}...")
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"  FAILED: {' '.join(cmd)}")
        if e.stderr:
            # Show the tail: tracebacks and compiler logs put the actual
            # error last, after any amount of warnings or progress output.
            print(f"  {e.stderr[-500:]}")
        return False
    except FileNotFoundError:
        print(f"  Command not found: {cmd[0]}")
        return False
    return True


def _build_readme(
    base_model: str,
    sft_model: str,
    grpo_model: str,
    output_repo: str,
    model_name: str,
    ollama_name: str,
) -> str:
    """Render the README.md (model card) text for the GGUF repository."""
    return f"""---
base_model: {base_model}
tags: [gguf, llama.cpp, quantized, query-expansion, qmd]
---
# {model_name} (GGUF)

GGUF quantizations of the QMD Query Expansion model for use with
[Ollama](https://ollama.com), [llama.cpp](https://github.com/ggerganov/llama.cpp),
or [LM Studio](https://lmstudio.ai).

## Available Quantizations

| File | Quant | Description |
|------|-------|-------------|
| `{model_name}-q4_k_m.gguf` | Q4_K_M | 4-bit — smallest, recommended for most use |
| `{model_name}-q5_k_m.gguf` | Q5_K_M | 5-bit — balanced quality/size |
| `{model_name}-q8_0.gguf` | Q8_0 | 8-bit — highest quality |
| `{model_name}-f16.gguf` | FP16 | Full precision (large) |

## Details

- **Base:** {base_model}
- **SFT:** {sft_model}
- **GRPO:** {grpo_model}
- **Task:** Query expansion for hybrid search (lex/vec/hyde format)
- **Eval score:** 90.7% average (29/30 Excellent)

## Quick Start with Ollama

```bash
huggingface-cli download {output_repo} \\
    {model_name}-q4_k_m.gguf --local-dir .

echo 'FROM ./{model_name}-q4_k_m.gguf' > Modelfile
ollama create {ollama_name} -f Modelfile
ollama run {ollama_name}
```

## Prompt Format

```
<|im_start|>user
/no_think Expand this search query: your query here<|im_end|>
<|im_start|>assistant
```

The model produces structured output:
```
lex: keyword expansion for BM25 search
lex: another keyword variant
vec: natural language expansion for vector search
vec: another semantic expansion
hyde: A hypothetical document passage that might match this query.
```
"""


def main() -> None:
    """Merge the SFT and GRPO adapters into the base model and publish GGUF files.

    Pipeline, for the preset selected with ``--size``:

    1. Install build tools (best effort).
    2. Load the base model, merge the SFT adapter, then the GRPO adapter.
    3. Save the merged model and convert it to an FP16 GGUF with llama.cpp.
    4. Upload the FP16 file, build ``llama-quantize``, then quantize and
       upload each entry of ``QUANT_TYPES`` one at a time.
    5. Upload a README describing the repository.

    Exits with status 1 if llama.cpp cannot be cloned, the FP16 conversion
    fails, the quantize tool cannot be built, or any quantization fails.
    Files uploaded before the failure stay in the repository.
    """
    parser = argparse.ArgumentParser(description="Convert QMD model to GGUF")
    parser.add_argument("--size", default="1.7B", choices=list(PRESETS), help="Model size preset")
    args = parser.parse_args()

    preset = PRESETS[args.size]
    base_model = preset["base"]
    sft_model = preset["sft"]
    grpo_model = preset["grpo"]
    output_repo = preset["output"]
    # File names are derived from the output repo name minus its "-gguf" suffix.
    model_name = output_repo.split("/")[-1].removesuffix("-gguf")

    print(f"QMD GGUF Conversion: {model_name}")
    print("=" * 60)

    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        login(token=hf_token)

    api = HfApi()
    api.create_repo(repo_id=output_repo, repo_type="model", exist_ok=True)

    # Step 1: Install build tools. Best effort: failures are ignored here and
    # surface later if cmake/git turn out to be missing.
    print("\nStep 1: Installing build dependencies...")
    subprocess.run(["apt-get", "update", "-qq"], capture_output=True)
    subprocess.run(["apt-get", "install", "-y", "-qq", "build-essential", "cmake", "git"], capture_output=True)

    # Step 2: Load the base model
    print(f"\nStep 2: Loading base model {base_model}...")
    model = AutoModelForCausalLM.from_pretrained(
        base_model, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True,
    )

    # Step 3: Merge the SFT adapter into the base weights
    print(f"Step 3: Merging SFT adapter {sft_model}...")
    model = PeftModel.from_pretrained(model, sft_model)
    model = model.merge_and_unload()

    # Step 4: Merge the GRPO adapter on top of the SFT-merged weights
    print(f"Step 4: Merging GRPO adapter {grpo_model}...")
    model = PeftModel.from_pretrained(model, grpo_model)
    model = model.merge_and_unload()

    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

    # Step 5: Save merged model
    merged_dir = "/tmp/merged_model"
    print(f"\nStep 5: Saving merged model to {merged_dir}...")
    model.save_pretrained(merged_dir, safe_serialization=True)
    tokenizer.save_pretrained(merged_dir)
    # The in-memory model is no longer needed once it is on disk.
    del model
    torch.cuda.empty_cache()

    # Step 6: Setup llama.cpp
    llama_dir = "/tmp/llama.cpp"
    print("\nStep 6: Setting up llama.cpp...")
    if not os.path.exists(llama_dir) and not run_cmd(
        ["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", llama_dir],
        "Cloning llama.cpp",
    ):
        # Nothing downstream can work without the llama.cpp checkout.
        sys.exit(1)
    # Best effort: the script header already declares the conversion deps.
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "-r", f"{llama_dir}/requirements.txt"],
                   capture_output=True)

    # Step 7: Convert to FP16 GGUF
    gguf_dir = "/tmp/gguf_output"
    os.makedirs(gguf_dir, exist_ok=True)
    fp16_name = f"{model_name}-f16.gguf"
    fp16_file = f"{gguf_dir}/{fp16_name}"

    print("\nStep 7: Converting to FP16 GGUF...")
    if not run_cmd([sys.executable, f"{llama_dir}/convert_hf_to_gguf.py",
                    merged_dir, "--outfile", fp16_file, "--outtype", "f16"],
                   "Converting to FP16"):
        sys.exit(1)

    size_mb = os.path.getsize(fp16_file) / (1024 * 1024)
    print(f"  FP16: {size_mb:.1f} MB")

    # Upload FP16 immediately so it is available even if a later step fails
    print(f"  Uploading FP16 to {output_repo}...")
    api.upload_file(path_or_fileobj=fp16_file,
                    path_in_repo=fp16_name, repo_id=output_repo)
    print(f"  Uploaded: {fp16_name}")

    # Step 8: Build quantize tool
    print("\nStep 8: Building quantize tool...")
    build_dir = f"{llama_dir}/build"
    os.makedirs(build_dir, exist_ok=True)
    built = run_cmd(
        ["cmake", "-B", build_dir, "-S", llama_dir, "-DGGML_CUDA=OFF"],
        "CMake configure",
    ) and run_cmd(
        ["cmake", "--build", build_dir, "--target", "llama-quantize", "-j", "4"],
        "Building llama-quantize",
    )
    if not built:
        # Without the binary every quantization below would fail; stop here
        # instead of uploading a README that advertises files that do not exist.
        sys.exit(1)
    quantize_bin = f"{build_dir}/bin/llama-quantize"

    # Step 9: Quantize and upload each one immediately
    print("\nStep 9: Quantizing and uploading...")
    failed_quants = []
    for quant_type, desc in QUANT_TYPES:
        quant_name = f"{model_name}-{quant_type.lower()}.gguf"
        qfile = f"{gguf_dir}/{quant_name}"
        if not run_cmd([quantize_bin, fp16_file, qfile, quant_type], f"{quant_type} ({desc})"):
            # Keep going so the remaining quantizations are still produced.
            failed_quants.append(quant_type)
            continue

        qsize = os.path.getsize(qfile) / (1024 * 1024)
        print(f"  {quant_type}: {qsize:.1f} MB")

        print(f"  Uploading {quant_type} to {output_repo}...")
        api.upload_file(path_or_fileobj=qfile,
                        path_in_repo=quant_name, repo_id=output_repo)
        print(f"  Uploaded: {quant_name}")

        # Remove to save disk
        os.remove(qfile)

    # Upload README
    ollama_name = "qmd-expand" if args.size == "1.7B" else f"qmd-expand-{args.size.lower()}"
    readme = _build_readme(base_model, sft_model, grpo_model, output_repo, model_name, ollama_name)
    api.upload_file(path_or_fileobj=readme.encode(),
                    path_in_repo="README.md", repo_id=output_repo)

    print(f"\nDone! Repository: https://huggingface.co/{output_repo}")
    print("\nTo use with Ollama:")
    print(f"  huggingface-cli download {output_repo} {model_name}-q4_k_m.gguf --local-dir .")
    print(f"  echo 'FROM ./{model_name}-q4_k_m.gguf' > Modelfile")
    print(f"  ollama create {ollama_name} -f Modelfile")

    if failed_quants:
        # Make the job status reflect that some advertised files are missing.
        print(f"\nWARNING: quantization failed for: {', '.join(failed_quants)}")
        sys.exit(1)


# Run the conversion pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()