# quantize.py
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "transformers>=4.45.0",
#     "peft>=0.7.0",
#     "torch",
#     "huggingface_hub>=0.20.0",
#     "accelerate",
#     "sentencepiece>=0.1.99",
#     "protobuf>=3.20.0",
#     "numpy",
#     "gguf",
# ]
# ///
"""
Merge SFT + GRPO adapters and convert to GGUF with multiple quantizations.

Uploads each quantization to HuggingFace Hub as it's produced, so partial
results are available even if the job times out.

Usage:
    hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 2h jobs/quantize.py
    hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 2h jobs/quantize.py -- --size 4B
"""
  22. import argparse
  23. import os
  24. import subprocess
  25. import sys
  26. import torch
  27. from huggingface_hub import HfApi, login
  28. from peft import PeftModel
  29. from transformers import AutoModelForCausalLM, AutoTokenizer
  30. PRESETS = {
  31. "1.7B": {
  32. "base": "Qwen/Qwen3-1.7B",
  33. "sft": "tobil/qmd-query-expansion-1.7B-sft",
  34. "grpo": "tobil/qmd-query-expansion-1.7B-grpo",
  35. "output": "tobil/qmd-query-expansion-1.7B-gguf",
  36. },
  37. "4B": {
  38. "base": "Qwen/Qwen3-4B",
  39. "sft": "tobil/qmd-query-expansion-4B-sft",
  40. "grpo": "tobil/qmd-query-expansion-4B-grpo",
  41. "output": "tobil/qmd-query-expansion-4B-gguf",
  42. },
  43. }
  44. QUANT_TYPES = [
  45. ("Q4_K_M", "4-bit (recommended for most use)"),
  46. ("Q5_K_M", "5-bit (balanced quality/size)"),
  47. ("Q8_0", "8-bit (highest quality)"),
  48. ]
  49. def run_cmd(cmd, description):
  50. print(f" {description}...")
  51. try:
  52. result = subprocess.run(cmd, check=True, capture_output=True, text=True)
  53. return True
  54. except subprocess.CalledProcessError as e:
  55. print(f" FAILED: {' '.join(cmd)}")
  56. if e.stderr:
  57. print(f" {e.stderr[:500]}")
  58. return False
  59. except FileNotFoundError:
  60. print(f" Command not found: {cmd[0]}")
  61. return False
  62. def main():
  63. parser = argparse.ArgumentParser(description="Convert QMD model to GGUF")
  64. parser.add_argument("--size", default="1.7B", choices=PRESETS.keys(), help="Model size preset")
  65. args = parser.parse_args()
  66. preset = PRESETS[args.size]
  67. base_model = preset["base"]
  68. sft_model = preset["sft"]
  69. grpo_model = preset["grpo"]
  70. output_repo = preset["output"]
  71. model_name = output_repo.split("/")[-1].replace("-gguf", "")
  72. print(f"QMD GGUF Conversion: {model_name}")
  73. print("=" * 60)
  74. hf_token = os.environ.get("HF_TOKEN")
  75. if hf_token:
  76. login(token=hf_token)
  77. api = HfApi()
  78. api.create_repo(repo_id=output_repo, repo_type="model", exist_ok=True)
  79. # Step 1: Install build tools
  80. print("\nStep 1: Installing build dependencies...")
  81. subprocess.run(["apt-get", "update", "-qq"], capture_output=True)
  82. subprocess.run(["apt-get", "install", "-y", "-qq", "build-essential", "cmake", "git"], capture_output=True)
  83. # Step 2: Load and merge
  84. print(f"\nStep 2: Loading base model {base_model}...")
  85. model = AutoModelForCausalLM.from_pretrained(
  86. base_model, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True,
  87. )
  88. print(f"Step 3: Merging SFT adapter {sft_model}...")
  89. model = PeftModel.from_pretrained(model, sft_model)
  90. model = model.merge_and_unload()
  91. print(f"Step 4: Merging GRPO adapter {grpo_model}...")
  92. model = PeftModel.from_pretrained(model, grpo_model)
  93. model = model.merge_and_unload()
  94. tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
  95. # Step 3: Save merged model
  96. merged_dir = "/tmp/merged_model"
  97. print(f"\nStep 5: Saving merged model to {merged_dir}...")
  98. model.save_pretrained(merged_dir, safe_serialization=True)
  99. tokenizer.save_pretrained(merged_dir)
  100. del model
  101. torch.cuda.empty_cache()
  102. # Step 4: Setup llama.cpp
  103. print("\nStep 6: Setting up llama.cpp...")
  104. if not os.path.exists("/tmp/llama.cpp"):
  105. run_cmd(["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"],
  106. "Cloning llama.cpp")
  107. subprocess.run([sys.executable, "-m", "pip", "install", "-q", "-r", "/tmp/llama.cpp/requirements.txt"],
  108. capture_output=True)
  109. # Step 5: Convert to FP16 GGUF
  110. gguf_dir = "/tmp/gguf_output"
  111. os.makedirs(gguf_dir, exist_ok=True)
  112. fp16_file = f"{gguf_dir}/{model_name}-f16.gguf"
  113. print(f"\nStep 7: Converting to FP16 GGUF...")
  114. if not run_cmd([sys.executable, "/tmp/llama.cpp/convert_hf_to_gguf.py",
  115. merged_dir, "--outfile", fp16_file, "--outtype", "f16"],
  116. "Converting to FP16"):
  117. sys.exit(1)
  118. size_mb = os.path.getsize(fp16_file) / (1024 * 1024)
  119. print(f" FP16: {size_mb:.1f} MB")
  120. # Upload FP16 immediately
  121. print(f" Uploading FP16 to {output_repo}...")
  122. api.upload_file(path_or_fileobj=fp16_file,
  123. path_in_repo=f"{model_name}-f16.gguf", repo_id=output_repo)
  124. print(f" Uploaded: {model_name}-f16.gguf")
  125. # Step 6: Build quantize tool
  126. print("\nStep 8: Building quantize tool...")
  127. os.makedirs("/tmp/llama.cpp/build", exist_ok=True)
  128. run_cmd(["cmake", "-B", "/tmp/llama.cpp/build", "-S", "/tmp/llama.cpp", "-DGGML_CUDA=OFF"],
  129. "CMake configure")
  130. run_cmd(["cmake", "--build", "/tmp/llama.cpp/build", "--target", "llama-quantize", "-j", "4"],
  131. "Building llama-quantize")
  132. quantize_bin = "/tmp/llama.cpp/build/bin/llama-quantize"
  133. # Step 7: Quantize and upload each one immediately
  134. print("\nStep 9: Quantizing and uploading...")
  135. for quant_type, desc in QUANT_TYPES:
  136. qfile = f"{gguf_dir}/{model_name}-{quant_type.lower()}.gguf"
  137. if run_cmd([quantize_bin, fp16_file, qfile, quant_type], f"{quant_type} ({desc})"):
  138. qsize = os.path.getsize(qfile) / (1024 * 1024)
  139. print(f" {quant_type}: {qsize:.1f} MB")
  140. print(f" Uploading {quant_type} to {output_repo}...")
  141. api.upload_file(path_or_fileobj=qfile,
  142. path_in_repo=f"{model_name}-{quant_type.lower()}.gguf", repo_id=output_repo)
  143. print(f" Uploaded: {model_name}-{quant_type.lower()}.gguf")
  144. # Remove to save disk
  145. os.remove(qfile)
  146. # Step 8: Upload README
  147. ollama_name = "qmd-expand" if args.size == "1.7B" else f"qmd-expand-{args.size.lower()}"
  148. readme = f"""---
  149. base_model: {base_model}
  150. tags: [gguf, llama.cpp, quantized, query-expansion, qmd]
  151. ---
  152. # {model_name} (GGUF)
  153. GGUF quantizations of the QMD Query Expansion model for use with
  154. [Ollama](https://ollama.com), [llama.cpp](https://github.com/ggerganov/llama.cpp),
  155. or [LM Studio](https://lmstudio.ai).
  156. ## Available Quantizations
  157. | File | Quant | Description |
  158. |------|-------|-------------|
  159. | `{model_name}-q4_k_m.gguf` | Q4_K_M | 4-bit — smallest, recommended for most use |
  160. | `{model_name}-q5_k_m.gguf` | Q5_K_M | 5-bit — balanced quality/size |
  161. | `{model_name}-q8_0.gguf` | Q8_0 | 8-bit — highest quality |
  162. | `{model_name}-f16.gguf` | FP16 | Full precision (large) |
  163. ## Details
  164. - **Base:** {base_model}
  165. - **SFT:** {sft_model}
  166. - **GRPO:** {grpo_model}
  167. - **Task:** Query expansion for hybrid search (lex/vec/hyde format)
  168. - **Eval score:** 90.7% average (29/30 Excellent)
  169. ## Quick Start with Ollama
  170. ```bash
  171. huggingface-cli download {output_repo} \\
  172. {model_name}-q4_k_m.gguf --local-dir .
  173. echo 'FROM ./{model_name}-q4_k_m.gguf' > Modelfile
  174. ollama create {ollama_name} -f Modelfile
  175. ollama run {ollama_name}
  176. ```
  177. ## Prompt Format
  178. ```
  179. <|im_start|>user
  180. /no_think Expand this search query: your query here<|im_end|>
  181. <|im_start|>assistant
  182. ```
  183. The model produces structured output:
  184. ```
  185. lex: keyword expansion for BM25 search
  186. lex: another keyword variant
  187. vec: natural language expansion for vector search
  188. vec: another semantic expansion
  189. hyde: A hypothetical document passage that might match this query.
  190. ```
  191. """
  192. api.upload_file(path_or_fileobj=readme.encode(),
  193. path_in_repo="README.md", repo_id=output_repo)
  194. print(f"\nDone! Repository: https://huggingface.co/{output_repo}")
  195. print(f"\nTo use with Ollama:")
  196. print(f" huggingface-cli download {output_repo} {model_name}-q4_k_m.gguf --local-dir .")
  197. print(f" echo 'FROM ./{model_name}-q4_k_m.gguf' > Modelfile")
  198. print(f" ollama create {ollama_name} -f Modelfile")
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()