suby
/
qmd


			
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "unsloth",
#     "transformers>=4.45.0",
#     "torch",
# ]
# ///
"""
Export finetuned model to GGUF format for use with node-llama-cpp.

Usage:
    python export_gguf.py --model models/qmd-expansion --quantization Q8_0
    python export_gguf.py --model models/qmd-expansion --quantization Q4_K_M
"""

import argparse
from pathlib import Path


def main():
    parser = argparse.ArgumentParser(description="Export model to GGUF")
    parser.add_argument("--model", type=str, required=True, help="Path to finetuned model")
    parser.add_argument("--output", type=str, help="Output GGUF file path")
    parser.add_argument("--quantization", type=str, default="Q8_0",
                        choices=["Q4_K_M", "Q5_K_M", "Q6_K", "Q8_0", "F16"],
                        help="Quantization method")
    parser.add_argument("--push-to-hub", type=str, help="Push GGUF to HuggingFace Hub repo")
    args = parser.parse_args()

    from unsloth import FastLanguageModel

    model_path = Path(args.model)
    if not model_path.exists():
        print(f"Error: Model not found at {model_path}")
        exit(1)

    # Default output path
    if args.output:
        output_path = args.output
    else:
        output_path = str(model_path / f"qmd-expansion-{args.quantization}.gguf")

    print(f"Loading model from {model_path}")

    # Load the finetuned model
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=str(model_path),
        max_seq_length=512,
        dtype=None,
        load_in_4bit=True,
    )

    print(f"Exporting to GGUF with {args.quantization} quantization...")

    # Export to GGUF
    model.save_pretrained_gguf(
        output_path.replace(".gguf", ""),  # Unsloth adds .gguf
        tokenizer,
        quantization_method=args.quantization.lower().replace("_", "-"),
    )

    print(f"Exported to {output_path}")

    # Push to hub if requested
    if args.push_to_hub:
        print(f"Pushing GGUF to HuggingFace Hub: {args.push_to_hub}")
        model.push_to_hub_gguf(
            args.push_to_hub,
            tokenizer,
            quantization_method=args.quantization.lower().replace("_", "-"),
        )

    print("Export complete!")
    print(f"\nTo use in QMD, update src/llm.ts:")
    print(f'  const DEFAULT_GENERATE_MODEL = "{output_path}";')


# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()