#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "transformers>=4.36.0",
#     "peft>=0.7.0",
#     "torch>=2.0.0",
#     "accelerate>=0.24.0",
#     "huggingface_hub>=0.20.0",
#     "sentencepiece>=0.1.99",
#     "protobuf>=3.20.0",
#     "numpy",
#     "gguf",
# ]
# ///
"""
Convert QMD query expansion model to GGUF format.

Loads the base model, merges SFT and GRPO adapters, then converts to GGUF
with multiple quantizations for use with Ollama/llama.cpp/LM Studio.

Usage:
    uv run convert_gguf.py --size 1.7B
    uv run convert_gguf.py --size 4B --skip-quantize
    uv run convert_gguf.py --base Qwen/Qwen3-1.7B \
        --sft tobil/qmd-query-expansion-1.7B-sft \
        --grpo tobil/qmd-query-expansion-1.7B-grpo \
        --output tobil/qmd-query-expansion-1.7B-gguf
"""

import argparse
import os
import subprocess
import sys

import torch
from huggingface_hub import HfApi, login
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Preset configurations for each model size
PRESETS = {
    "1.7B": {
        "base": "Qwen/Qwen3-1.7B",
        "sft": "tobil/qmd-query-expansion-1.7B-sft",
        "grpo": "tobil/qmd-query-expansion-1.7B-grpo",
        "output": "tobil/qmd-query-expansion-1.7B-gguf",
        "ollama_name": "qmd-expand",
    },
    "4B": {
        "base": "Qwen/Qwen3-4B",
        "sft": "tobil/qmd-query-expansion-4B-sft",
        "grpo": "tobil/qmd-query-expansion-4B-grpo",
        "output": "tobil/qmd-query-expansion-4B-gguf",
        "ollama_name": "qmd-expand-4b",
    },
}


def run_cmd(cmd: list[str], description: str) -> bool:
    """Run an external command and report whether it succeeded.

    The command is executed directly (argument list, no shell) with its
    output captured. On failure a short diagnostic is printed instead of
    raising, so callers decide whether the failure is fatal.

    Args:
        cmd: Program and arguments, e.g. ``["git", "clone", url]``.
        description: Human-readable label printed before the command runs.

    Returns:
        True if the command exited with status 0; False if it exited with a
        non-zero status or the executable could not be found.
    """
    print(f" {description}...")
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f" FAILED: {' '.join(cmd)}")
        if e.stderr:
            # Only the head of stderr is shown to keep the log readable.
            print(f" {e.stderr[:500]}")
        return False
    except FileNotFoundError:
        print(f" Command not found: {cmd[0]}")
        return False


def main() -> None:
    """Merge the adapters into the base model, convert to GGUF, and upload.

    Model identifiers come from a ``--size`` preset, optionally overridden by
    ``--base``/``--sft``/``--grpo``/``--output``; without ``--size`` all four
    must be given. Intermediate artifacts are written under ``/tmp``.

    Exits with status 1 if llama.cpp cannot be cloned or the FP16 conversion
    fails. Quantization is best-effort: if ``llama-quantize`` is unavailable
    the FP16 file is still produced (and uploaded unless ``--no-upload``).
    """
    parser = argparse.ArgumentParser(description="Convert QMD model to GGUF")
    parser.add_argument("--size", choices=PRESETS.keys(), help="Use preset config for model size")
    parser.add_argument("--base", help="Base model (overrides preset)")
    parser.add_argument("--sft", help="SFT adapter (overrides preset)")
    parser.add_argument("--grpo", help="GRPO adapter (overrides preset)")
    parser.add_argument("--output", help="Output HF repo (overrides preset)")
    parser.add_argument("--skip-quantize", action="store_true", help="Only produce FP16 GGUF")
    parser.add_argument("--no-upload", action="store_true", help="Don't upload to HF Hub")
    args = parser.parse_args()

    # Resolve config: explicit flags win over the preset values.
    if args.size:
        preset = PRESETS[args.size]
        base_model = args.base or preset["base"]
        sft_model = args.sft or preset["sft"]
        grpo_model = args.grpo or preset["grpo"]
        output_repo = args.output or preset["output"]
    elif args.base and args.sft and args.grpo and args.output:
        base_model = args.base
        sft_model = args.sft
        grpo_model = args.grpo
        output_repo = args.output
    else:
        # parser.error() prints usage and exits, so nothing below runs.
        parser.error("Either --size or all of --base/--sft/--grpo/--output are required")

    # File names are derived from the repo name without its "-gguf" marker.
    model_name = output_repo.split("/")[-1].replace("-gguf", "")
    print(f"QMD GGUF Conversion: {model_name}")
    print("=" * 60)

    # Install build tools (for Colab/cloud environments).
    # Best-effort: exit codes are not checked, so a failure here is ignored.
    print("\nInstalling build dependencies...")
    subprocess.run(["apt-get", "update", "-qq"], capture_output=True)
    subprocess.run(
        ["apt-get", "install", "-y", "-qq", "build-essential", "cmake", "git"],
        capture_output=True,
    )

    # Login (only when a token is provided via the environment)
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        print("Logging in to HuggingFace...")
        login(token=hf_token)

    # Steps 1-3: Load the base model and merge both adapters in order
    print(f"\nStep 1: Loading base model {base_model}...")
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )

    print(f"Step 2: Merging SFT adapter {sft_model}...")
    model = PeftModel.from_pretrained(model, sft_model)
    model = model.merge_and_unload()

    print(f"Step 3: Merging GRPO adapter {grpo_model}...")
    model = PeftModel.from_pretrained(model, grpo_model)
    model = model.merge_and_unload()

    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

    # Step 4: Save merged model
    merged_dir = "/tmp/merged_model"
    print(f"\nStep 4: Saving merged model to {merged_dir}...")
    model.save_pretrained(merged_dir, safe_serialization=True)
    tokenizer.save_pretrained(merged_dir)

    # Step 5: Setup llama.cpp (an existing checkout is reused)
    llama_dir = "/tmp/llama.cpp"
    print("\nStep 5: Setting up llama.cpp...")
    if not os.path.exists(llama_dir):
        cloned = run_cmd(
            ["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", llama_dir],
            "Cloning llama.cpp",
        )
        if not cloned:
            # Without the checkout the conversion script cannot exist, so
            # stop here rather than failing later with a less clear error.
            sys.exit(1)
    # Best-effort: the exit code of this pip install is not checked.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", "-r", f"{llama_dir}/requirements.txt"],
        capture_output=True,
    )

    # Step 6: Convert to FP16 GGUF
    gguf_dir = "/tmp/gguf_output"
    os.makedirs(gguf_dir, exist_ok=True)
    gguf_file = f"{gguf_dir}/{model_name}-f16.gguf"

    print("\nStep 6: Converting to FP16 GGUF...")
    converted = run_cmd(
        [
            sys.executable,
            f"{llama_dir}/convert_hf_to_gguf.py",
            merged_dir,
            "--outfile",
            gguf_file,
            "--outtype",
            "f16",
        ],
        "Converting",
    )
    if not converted:
        sys.exit(1)

    size_mb = os.path.getsize(gguf_file) / (1024 * 1024)
    print(f" FP16: {size_mb:.1f} MB")

    # Steps 7-8: Build llama-quantize and quantize (best-effort)
    quantized_files = []
    if not args.skip_quantize:
        print("\nStep 7: Building quantize tool...")
        build_dir = f"{llama_dir}/build"
        os.makedirs(build_dir, exist_ok=True)
        run_cmd(
            ["cmake", "-B", build_dir, "-S", llama_dir, "-DGGML_CUDA=OFF"],
            "CMake configure",
        )
        run_cmd(
            ["cmake", "--build", build_dir, "--target", "llama-quantize", "-j", "4"],
            "Building llama-quantize",
        )

        quantize_bin = f"{build_dir}/bin/llama-quantize"
        if not os.path.isfile(quantize_bin):
            # Report the missing tool once instead of once per quant type;
            # the FP16 file is still available for upload.
            print(f" Skipping quantization: {quantize_bin} not found")
        else:
            print("\nStep 8: Quantizing...")
            for quant_type, desc in [("Q4_K_M", "4-bit"), ("Q5_K_M", "5-bit"), ("Q8_0", "8-bit")]:
                qfile = f"{gguf_dir}/{model_name}-{quant_type.lower()}.gguf"
                if run_cmd([quantize_bin, gguf_file, qfile, quant_type], f"{quant_type} ({desc})"):
                    qsize = os.path.getsize(qfile) / (1024 * 1024)
                    print(f" {quant_type}: {qsize:.1f} MB")
                    quantized_files.append((qfile, quant_type))

    # Step 9: Upload
    if args.no_upload:
        # Nothing was pushed, so point at the local files instead of the Hub.
        print(f"\nDone! GGUF files are in {gguf_dir}")
        return

    print(f"\nStep 9: Uploading to {output_repo}...")
    api = HfApi()
    api.create_repo(repo_id=output_repo, repo_type="model", exist_ok=True)

    api.upload_file(
        path_or_fileobj=gguf_file,
        path_in_repo=f"{model_name}-f16.gguf",
        repo_id=output_repo,
    )

    # Only quantizations that were actually produced are uploaded.
    for qfile, qtype in quantized_files:
        api.upload_file(
            path_or_fileobj=qfile,
            path_in_repo=f"{model_name}-{qtype.lower()}.gguf",
            repo_id=output_repo,
        )

    # Upload README (model card); kept at column 0 so no indentation leaks
    # into the rendered Markdown.
    readme = f"""---
base_model: {base_model}
tags: [gguf, llama.cpp, quantized, query-expansion, qmd]
---

# {model_name} (GGUF)

GGUF conversion of the QMD Query Expansion model.

## Details

- **Base:** {base_model}
- **SFT:** {sft_model}
- **GRPO:** {grpo_model}
- **Task:** Query expansion (lex/vec/hyde format)

## Prompt Format

```
<|im_start|>user
/no_think Expand this search query: your query here<|im_end|>
<|im_start|>assistant
```
"""
    api.upload_file(
        path_or_fileobj=readme.encode(),
        path_in_repo="README.md",
        repo_id=output_repo,
    )

    print(f"\nDone! Repository: https://huggingface.co/{output_repo}")


if __name__ == "__main__":
    main()