schema.py 2.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
  1. #!/usr/bin/env python3
  2. """Schema helpers for QMD training JSONL data."""
  3. from __future__ import annotations
  4. from typing import Iterable
# Output kinds recognised by this module. output_items_to_text() and
# normalize_output_items() skip any pair whose kind is not in this set.
VALID_OUTPUT_TYPES = {"hyde", "lex", "vec"}
  6. def parse_output_text(text: str) -> list[list[str]]:
  7. """Parse prefixed output text into list pairs.
  8. Returns: [["hyde", "..."], ["lex", "..."], ...]
  9. """
  10. items: list[list[str]] = []
  11. for raw_line in text.strip().split("\n"):
  12. line = raw_line.strip()
  13. if not line:
  14. continue
  15. if line.startswith("lex:"):
  16. items.append(["lex", line[4:].strip()])
  17. elif line.startswith("vec:"):
  18. items.append(["vec", line[4:].strip()])
  19. elif line.startswith("hyde:"):
  20. items.append(["hyde", line[5:].strip()])
  21. return items
  22. def output_items_to_text(items: Iterable[Iterable[str]]) -> str:
  23. """Render output list pairs to prefixed text lines."""
  24. lines = []
  25. for item in items:
  26. if not item:
  27. continue
  28. try:
  29. kind, text = item[0], item[1]
  30. except Exception:
  31. continue
  32. if kind not in VALID_OUTPUT_TYPES:
  33. continue
  34. if text is None:
  35. continue
  36. text = str(text).strip()
  37. if not text:
  38. continue
  39. lines.append(f"{kind}: {text}")
  40. return "\n".join(lines)
  41. def normalize_output_items(items: Iterable[Iterable[str]]) -> list[list[str]]:
  42. """Normalize output list pairs (filter invalid, trim whitespace)."""
  43. normalized: list[list[str]] = []
  44. for item in items:
  45. if not item:
  46. continue
  47. try:
  48. kind, text = item[0], item[1]
  49. except Exception:
  50. continue
  51. if kind not in VALID_OUTPUT_TYPES:
  52. continue
  53. if text is None:
  54. continue
  55. text = str(text).strip()
  56. if not text:
  57. continue
  58. normalized.append([kind, text])
  59. return normalized
  60. def has_type(items: Iterable[Iterable[str]], kind: str) -> bool:
  61. return any(item and item[0] == kind for item in items)