validate_schema.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. #!/usr/bin/env python3
  2. """Validate JSONL files against the QMD training schema."""
  3. from __future__ import annotations
  4. import argparse
  5. import json
  6. import sys
  7. from pathlib import Path
  8. sys.path.insert(0, str(Path(__file__).parent.parent))
  9. from dataset.schema import VALID_OUTPUT_TYPES
  def _record_problems(obj: object) -> list[str]:
      """Return the schema problems found in one decoded JSONL record.

      An empty list means the record is valid. Checking stops at the first
      structural problem (non-object record, bad ``query``, non-list
      ``output``); once ``output`` is known to be a list, every item is
      inspected and each item may contribute up to two problems.
      """
      # json.loads() can yield a list, str, number, bool or None as well as a
      # dict; only a dict supports the .get() lookups below.
      if not isinstance(obj, dict):
          return ["record must be a JSON object"]

      query = obj.get("query")
      if not isinstance(query, str) or not query.strip():
          return ["missing/invalid query"]

      output = obj.get("output")
      if not isinstance(output, list):
          return ["output must be a list"]

      problems: list[str] = []
      for idx, item in enumerate(output):
          if not isinstance(item, list) or len(item) != 2:
              problems.append(f"output[{idx}] must be [type, text]")
              continue
          kind, text = item
          try:
              known_kind = kind in VALID_OUTPUT_TYPES
          except TypeError:
              # An unhashable kind (e.g. a nested list) cannot be looked up
              # when VALID_OUTPUT_TYPES is a hash-based container; it is
              # simply not a valid type.
              known_kind = False
          if not known_kind:
              problems.append(f"invalid output type '{kind}'")
          if not isinstance(text, str) or not text.strip():
              problems.append("empty output text")
      return problems


  def validate_file(path: Path) -> tuple[int, int]:
      """Validate every non-blank line of a JSONL file.

      Each problem is printed to stdout as ``<path>:<line_num>: <message>``.

      Args:
          path: JSONL file to read; it is decoded as UTF-8.

      Returns:
          ``(total_lines, error_count)``. ``total_lines`` counts non-blank
          lines; ``error_count`` counts reported problems, so a single line
          can contribute more than one.
      """
      total = 0
      errors = 0
      with path.open("r", encoding="utf-8") as f:
          for line_num, line in enumerate(f, 1):
              line = line.strip()
              if not line:
                  # Blank lines are neither counted nor validated.
                  continue
              total += 1
              try:
                  obj = json.loads(line)
              except json.JSONDecodeError as e:
                  print(f"{path}:{line_num}: invalid JSON ({e})")
                  errors += 1
                  continue
              for problem in _record_problems(obj):
                  print(f"{path}:{line_num}: {problem}")
                  errors += 1
      return total, errors
  def main() -> int:
      """Command-line entry point: validate the requested JSONL files.

      Positional arguments are file paths or glob patterns (an argument is
      treated as a glob when it contains ``*``). Both are resolved against
      the directory three levels above this script. With no arguments the
      pattern ``finetune/data/*.jsonl`` is used.

      Candidates that are not regular files (missing paths, directories) are
      skipped with a warning on stderr instead of crashing the run, and a
      file matched by several arguments is validated only once.

      Returns:
          0 when every checked line is valid; 1 when any error was reported
          or when no file was found to validate.
      """
      parser = argparse.ArgumentParser(description="Validate QMD JSONL schema")
      parser.add_argument(
          "paths",
          nargs="*",
          default=["finetune/data/*.jsonl"],
          help="JSONL files or glob patterns (default: finetune/data/*.jsonl)",
      )
      args = parser.parse_args()

      repo_root = Path(__file__).parent.parent.parent
      candidates: list[Path] = []
      for pattern in args.paths:
          if "*" in pattern:
              candidates.extend(repo_root.glob(pattern))
          else:
              candidates.append(repo_root / pattern)

      # Keep only regular files: a directory would make validate_file() fail
      # when opening it, and a missing path should be visible to the user
      # rather than silently ignored. The set removes duplicates so
      # overlapping arguments do not double-count lines and errors.
      files: set[Path] = set()
      for candidate in candidates:
          if candidate.is_file():
              files.add(candidate)
          else:
              print(f"Skipping {candidate}: not an existing file", file=sys.stderr)

      if not files:
          print("No files found to validate.")
          return 1

      total_lines = 0
      total_errors = 0
      for path in sorted(files):
          lines, errors = validate_file(path)
          total_lines += lines
          total_errors += errors
          status = "OK" if errors == 0 else f"{errors} error(s)"
          print(f"{path}: {lines} lines, {status}")

      if total_errors:
          print(
              f"\nValidation failed: {total_errors} error(s) across {total_lines} lines"
          )
          return 1
      print(f"\nValidation passed: {total_lines} lines checked")
      return 0
  # Run as a script: hand main()'s return value to the interpreter as the
  # process exit status.
  if __name__ == "__main__":
      sys.exit(main())