convert_1.7B_gguf.py 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282
  1. #!/usr/bin/env python3
  2. # /// script
  3. # requires-python = ">=3.10"
  4. # dependencies = [
  5. # "transformers>=4.36.0",
  6. # "peft>=0.7.0",
  7. # "torch>=2.0.0",
  8. # "accelerate>=0.24.0",
  9. # "huggingface_hub>=0.20.0",
  10. # "sentencepiece>=0.1.99",
  11. # "protobuf>=3.20.0",
  12. # "numpy",
  13. # "gguf",
  14. # ]
  15. # ///
  16. """
  17. GGUF Conversion for QMD Query Expansion 1.7B Model
  18. Loads base model, applies SFT adapter, then GRPO adapter, merges all,
  19. and converts to GGUF format for use with Ollama/llama.cpp/LM Studio.
  20. """
  21. import os
  22. import sys
  23. import subprocess
  24. import torch
  25. from transformers import AutoModelForCausalLM, AutoTokenizer
  26. from peft import PeftModel
  27. from huggingface_hub import HfApi, login
# Configuration
# Hub id of the pretrained checkpoint; also the source of the tokenizer.
BASE_MODEL = "Qwen/Qwen3-1.7B"
# PEFT adapter applied and merged first, directly on top of BASE_MODEL.
SFT_MODEL = "tobil/qmd-query-expansion-1.7B-sft"
# PEFT adapter applied and merged second, on top of the SFT-merged weights.
GRPO_MODEL = "tobil/qmd-query-expansion-1.7B-grpo"
# Hub repository that receives the GGUF files and the generated README.
OUTPUT_REPO = "tobil/qmd-query-expansion-1.7B-gguf"
  33. def run_command(cmd, description):
  34. """Run a command with error handling."""
  35. print(f" {description}...")
  36. try:
  37. result = subprocess.run(cmd, check=True, capture_output=True, text=True)
  38. return True
  39. except subprocess.CalledProcessError as e:
  40. print(f" ❌ Command failed: {' '.join(cmd)}")
  41. if e.stderr:
  42. print(f" STDERR: {e.stderr[:500]}")
  43. return False
  44. except FileNotFoundError:
  45. print(f" ❌ Command not found: {cmd[0]}")
  46. return False
  47. print("🔄 QMD Query Expansion 1.7B GGUF Conversion")
  48. print("=" * 60)
  49. # Install build tools
  50. print("\n📦 Installing build dependencies...")
  51. subprocess.run(["apt-get", "update", "-qq"], capture_output=True)
  52. subprocess.run(["apt-get", "install", "-y", "-qq", "build-essential", "cmake", "git"], capture_output=True)
  53. print(" ✅ Build tools ready")
  54. # Login to HuggingFace
  55. hf_token = os.environ.get("HF_TOKEN")
  56. if hf_token:
  57. print("\n🔐 Logging in to HuggingFace...")
  58. login(token=hf_token)
  59. print(" ✅ Logged in")
  60. # Step 1: Load base model
  61. print(f"\n🔧 Step 1: Loading base model {BASE_MODEL}...")
  62. base_model = AutoModelForCausalLM.from_pretrained(
  63. BASE_MODEL,
  64. torch_dtype=torch.bfloat16,
  65. device_map="auto",
  66. trust_remote_code=True,
  67. )
  68. print(" ✅ Base model loaded")
  69. # Step 2: Load and merge SFT adapter
  70. print(f"\n🔧 Step 2: Loading SFT adapter {SFT_MODEL}...")
  71. model = PeftModel.from_pretrained(base_model, SFT_MODEL)
  72. print(" Merging SFT adapter...")
  73. model = model.merge_and_unload()
  74. print(" ✅ SFT merged")
  75. # Step 3: Load and merge GRPO adapter
  76. print(f"\n🔧 Step 3: Loading GRPO adapter {GRPO_MODEL}...")
  77. model = PeftModel.from_pretrained(model, GRPO_MODEL)
  78. print(" Merging GRPO adapter...")
  79. merged_model = model.merge_and_unload()
  80. print(" ✅ GRPO merged - final model ready")
  81. # Load tokenizer
  82. print("\n📝 Loading tokenizer...")
  83. tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
  84. print(" ✅ Tokenizer loaded")
  85. # Step 4: Save merged model
  86. print("\n💾 Step 4: Saving merged model to disk...")
  87. merged_dir = "/tmp/merged_model"
  88. merged_model.save_pretrained(merged_dir, safe_serialization=True)
  89. tokenizer.save_pretrained(merged_dir)
  90. print(f" ✅ Saved to {merged_dir}")
  91. # Step 5: Setup llama.cpp
  92. print("\n📥 Step 5: Setting up llama.cpp...")
  93. if not os.path.exists("/tmp/llama.cpp"):
  94. run_command(
  95. ["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"],
  96. "Cloning llama.cpp"
  97. )
  98. # Install Python deps
  99. subprocess.run([sys.executable, "-m", "pip", "install", "-q", "-r", "/tmp/llama.cpp/requirements.txt"], capture_output=True)
  100. subprocess.run([sys.executable, "-m", "pip", "install", "-q", "sentencepiece", "protobuf"], capture_output=True)
  101. print(" ✅ llama.cpp ready")
  102. # Step 6: Convert to GGUF (FP16)
  103. print("\n🔄 Step 6: Converting to GGUF format (FP16)...")
  104. gguf_output_dir = "/tmp/gguf_output"
  105. os.makedirs(gguf_output_dir, exist_ok=True)
  106. model_name = "qmd-query-expansion-1.7B"
  107. gguf_file = f"{gguf_output_dir}/{model_name}-f16.gguf"
  108. convert_script = "/tmp/llama.cpp/convert_hf_to_gguf.py"
  109. if not run_command(
  110. [sys.executable, convert_script, merged_dir, "--outfile", gguf_file, "--outtype", "f16"],
  111. "Converting to FP16 GGUF"
  112. ):
  113. print(" ❌ Conversion failed!")
  114. sys.exit(1)
  115. size_mb = os.path.getsize(gguf_file) / (1024 * 1024)
  116. print(f" ✅ FP16 GGUF created: {size_mb:.1f} MB")
  117. # Step 7: Build quantize tool
  118. print("\n⚙️ Step 7: Building quantize tool...")
  119. os.makedirs("/tmp/llama.cpp/build", exist_ok=True)
  120. run_command(
  121. ["cmake", "-B", "/tmp/llama.cpp/build", "-S", "/tmp/llama.cpp", "-DGGML_CUDA=OFF"],
  122. "Configuring with CMake"
  123. )
  124. run_command(
  125. ["cmake", "--build", "/tmp/llama.cpp/build", "--target", "llama-quantize", "-j", "4"],
  126. "Building llama-quantize"
  127. )
  128. quantize_bin = "/tmp/llama.cpp/build/bin/llama-quantize"
  129. print(" ✅ Quantize tool built")
  130. # Step 8: Create quantized versions
  131. print("\n⚙️ Step 8: Creating quantized versions...")
  132. quant_formats = [
  133. ("Q4_K_M", "4-bit medium (recommended)"),
  134. ("Q5_K_M", "5-bit medium"),
  135. ("Q8_0", "8-bit"),
  136. ]
  137. quantized_files = []
  138. for quant_type, description in quant_formats:
  139. print(f" Creating {quant_type} ({description})...")
  140. quant_file = f"{gguf_output_dir}/{model_name}-{quant_type.lower()}.gguf"
  141. if run_command([quantize_bin, gguf_file, quant_file, quant_type], f"Quantizing to {quant_type}"):
  142. size_mb = os.path.getsize(quant_file) / (1024 * 1024)
  143. print(f" ✅ {quant_type}: {size_mb:.1f} MB")
  144. quantized_files.append((quant_file, quant_type))
  145. else:
  146. print(f" ⚠️ Skipping {quant_type}")
  147. # Step 9: Upload to Hub
  148. print("\n☁️ Step 9: Uploading to Hugging Face Hub...")
  149. api = HfApi()
  150. print(f" Creating repository: {OUTPUT_REPO}")
  151. api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)
  152. # Upload F16
  153. print(" Uploading FP16...")
  154. api.upload_file(
  155. path_or_fileobj=gguf_file,
  156. path_in_repo=f"{model_name}-f16.gguf",
  157. repo_id=OUTPUT_REPO,
  158. )
  159. print(" ✅ FP16 uploaded")
  160. # Upload quantized versions
  161. for quant_file, quant_type in quantized_files:
  162. print(f" Uploading {quant_type}...")
  163. api.upload_file(
  164. path_or_fileobj=quant_file,
  165. path_in_repo=f"{model_name}-{quant_type.lower()}.gguf",
  166. repo_id=OUTPUT_REPO,
  167. )
  168. print(f" ✅ {quant_type} uploaded")
  169. # Create README
  170. print("\n📝 Creating README...")
  171. readme_content = f"""---
  172. base_model: {BASE_MODEL}
  173. tags:
  174. - gguf
  175. - llama.cpp
  176. - quantized
  177. - query-expansion
  178. - qmd
  179. ---
  180. # QMD Query Expansion 1.7B (GGUF)
  181. GGUF conversion of the QMD Query Expansion model for use with Ollama, llama.cpp, and LM Studio.
  182. ## Model Details
  183. - **Base Model:** {BASE_MODEL}
  184. - **SFT Adapter:** {SFT_MODEL}
  185. - **GRPO Adapter:** {GRPO_MODEL}
  186. - **Task:** Query expansion for hybrid search (lex/vec/hyde format)
  187. ## Available Quantizations
  188. | File | Quant | Description |
  189. |------|-------|-------------|
  190. | {model_name}-f16.gguf | F16 | Full precision |
  191. | {model_name}-q8_0.gguf | Q8_0 | 8-bit |
  192. | {model_name}-q5_k_m.gguf | Q5_K_M | 5-bit medium |
  193. | {model_name}-q4_k_m.gguf | Q4_K_M | 4-bit medium (recommended) |
  194. ## Usage
  195. ### With Ollama
  196. ```bash
  197. # Download
  198. huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf --local-dir .
  199. # Create Modelfile
  200. echo 'FROM ./{model_name}-q4_k_m.gguf' > Modelfile
  201. # Create and run
  202. ollama create qmd-expand -f Modelfile
  203. ollama run qmd-expand
  204. ```
  205. ### Prompt Format
  206. Use Qwen3 chat format with `/no_think`:
  207. ```
  208. <|im_start|>user
  209. /no_think Expand this search query: your query here<|im_end|>
  210. <|im_start|>assistant
  211. ```
  212. ### Expected Output
  213. ```
  214. lex: keyword variation 1
  215. lex: keyword variation 2
  216. vec: natural language reformulation
  217. hyde: Hypothetical document passage answering the query.
  218. ```
  219. ## License
  220. Apache 2.0 (inherited from Qwen3)
  221. """
  222. api.upload_file(
  223. path_or_fileobj=readme_content.encode(),
  224. path_in_repo="README.md",
  225. repo_id=OUTPUT_REPO,
  226. )
  227. print(" ✅ README uploaded")
  228. print("\n" + "=" * 60)
  229. print("✅ GGUF Conversion Complete!")
  230. print(f"📦 Repository: https://huggingface.co/{OUTPUT_REPO}")
  231. print("=" * 60)