export_gguf.py 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. #!/usr/bin/env python3
  2. # /// script
  3. # requires-python = ">=3.10"
  4. # dependencies = [
  5. # "unsloth",
  6. # "transformers>=4.45.0",
  7. # "torch",
  8. # ]
  9. # ///
"""
Export a finetuned model to GGUF format for use with node-llama-cpp.

Usage:
    python export_gguf.py --model models/qmd-expansion --quantization Q8_0
    python export_gguf.py --model models/qmd-expansion --quantization Q4_K_M
"""
  16. import argparse
  17. from pathlib import Path
  18. def main():
  19. parser = argparse.ArgumentParser(description="Export model to GGUF")
  20. parser.add_argument("--model", type=str, required=True, help="Path to finetuned model")
  21. parser.add_argument("--output", type=str, help="Output GGUF file path")
  22. parser.add_argument("--quantization", type=str, default="Q8_0",
  23. choices=["Q4_K_M", "Q5_K_M", "Q6_K", "Q8_0", "F16"],
  24. help="Quantization method")
  25. parser.add_argument("--push-to-hub", type=str, help="Push GGUF to HuggingFace Hub repo")
  26. args = parser.parse_args()
  27. from unsloth import FastLanguageModel
  28. model_path = Path(args.model)
  29. if not model_path.exists():
  30. print(f"Error: Model not found at {model_path}")
  31. exit(1)
  32. # Default output path
  33. if args.output:
  34. output_path = args.output
  35. else:
  36. output_path = str(model_path / f"qmd-expansion-{args.quantization}.gguf")
  37. print(f"Loading model from {model_path}")
  38. # Load the finetuned model
  39. model, tokenizer = FastLanguageModel.from_pretrained(
  40. model_name=str(model_path),
  41. max_seq_length=512,
  42. dtype=None,
  43. load_in_4bit=True,
  44. )
  45. print(f"Exporting to GGUF with {args.quantization} quantization...")
  46. # Export to GGUF
  47. model.save_pretrained_gguf(
  48. output_path.replace(".gguf", ""), # Unsloth adds .gguf
  49. tokenizer,
  50. quantization_method=args.quantization.lower().replace("_", "-"),
  51. )
  52. print(f"Exported to {output_path}")
  53. # Push to hub if requested
  54. if args.push_to_hub:
  55. print(f"Pushing GGUF to HuggingFace Hub: {args.push_to_hub}")
  56. model.push_to_hub_gguf(
  57. args.push_to_hub,
  58. tokenizer,
  59. quantization_method=args.quantization.lower().replace("_", "-"),
  60. )
  61. print("Export complete!")
  62. print(f"\nTo use in QMD, update src/llm.ts:")
  63. print(f' const DEFAULT_GENERATE_MODEL = "{output_path}";')
  64. if __name__ == "__main__":
  65. main()