#!/usr/bin/env python3 # /// script # requires-python = ">=3.10" # dependencies = ["pydantic>=2.0"] # /// """Validate JSONL files against the strict QMD training schema.""" from __future__ import annotations import argparse import json import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent)) from dataset.schema import TrainingExample def validate_file(path: Path) -> tuple[int, int]: """Return (total_lines, error_count).""" total = 0 errors = 0 with path.open("r", encoding="utf-8") as f: for line_num, line in enumerate(f, 1): line = line.strip() if not line: continue total += 1 try: obj = json.loads(line) except json.JSONDecodeError as e: print(f"{path}:{line_num}: invalid JSON ({e})") errors += 1 continue try: TrainingExample.model_validate(obj) except Exception as e: print(f"{path}:{line_num}: {e}") errors += 1 return total, errors def main() -> int: parser = argparse.ArgumentParser(description="Validate QMD JSONL schema") parser.add_argument( "paths", nargs="*", default=["finetune/data/*.jsonl"], help="JSONL files or glob patterns (default: finetune/data/*.jsonl)", ) args = parser.parse_args() repo_root = Path(__file__).parent.parent.parent files: list[Path] = [] for pattern in args.paths: if "*" in pattern: files.extend(repo_root.glob(pattern)) else: files.append(repo_root / pattern) files = [p for p in files if p.exists()] if not files: print("No files found to validate.") return 1 total_lines = 0 total_errors = 0 for path in sorted(files): lines, errors = validate_file(path) total_lines += lines total_errors += errors status = "OK" if errors == 0 else f"{errors} error(s)" print(f"{path}: {lines} lines, {status}") if total_errors: print( f"\nValidation failed: {total_errors} error(s) across {total_lines} lines" ) return 1 print(f"\nValidation passed: {total_lines} lines checked") return 0 if __name__ == "__main__": raise SystemExit(main())