schema.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. #!/usr/bin/env python3
  2. """Schema helpers for QMD training JSONL data."""
  3. from __future__ import annotations
  4. from typing import Iterable
  # Item kinds accepted by output_items_to_text and normalize_output_items;
  # items whose kind is not in this set are dropped by those functions.
  VALID_OUTPUT_TYPES = {"hyde", "lex", "vec"}
  def parse_output_text(text: str) -> list[list[str]]:
      """Parse prefixed output text into ``[kind, text]`` list pairs.

      Every non-blank line that starts with ``lex:``, ``vec:`` or ``hyde:``
      yields one pair, in input order. The payload after the prefix is
      whitespace-stripped and may be empty. Lines without one of those
      prefixes are ignored.

      Args:
          text: Newline-separated output text.

      Returns:
          A list such as ``[["hyde", "..."], ["lex", "..."], ...]``.
      """
      known_kinds = ("lex", "vec", "hyde")
      items: list[list[str]] = []
      for raw_line in text.strip().split("\n"):
          # The kind is whatever precedes the first colon, so no per-prefix
          # slice offsets are needed. Blank lines yield an empty separator
          # and fall through.
          kind, colon, payload = raw_line.strip().partition(":")
          if colon and kind in known_kinds:
              items.append([kind, payload.strip()])
      return items
  def reorder_hyde_first(items: list[list[str]]) -> list[list[str]]:
      """Return the items grouped as hyde first, then lex, then vec.

      Relative order within each kind is preserved. Empty items and items
      whose first element is not one of those three kinds are dropped.
      ``items`` is consumed in a single pass, so a one-shot iterator is
      handled the same way as a list.
      """
      order = ("hyde", "lex", "vec")
      buckets: dict[str, list[list[str]]] = {kind: [] for kind in order}
      for item in items:
          if not item:
              continue
          kind = item[0]
          # The isinstance guard keeps an unhashable first element from
          # raising inside the dict lookup; such items are simply not matched.
          if isinstance(kind, str) and kind in buckets:
              buckets[kind].append(item)
      return [item for kind in order for item in buckets[kind]]
  def output_items_to_text(
      items: Iterable[Iterable[str]], hyde_first: bool = True
  ) -> str:
      """Render output list pairs to prefixed text lines.

      Args:
          items: Iterable of [type, text] pairs
          hyde_first: If True, reorder to put hyde first (default)

      Returns:
          One ``"<type>: <text>"`` line per valid item, joined with newlines;
          an empty string when no valid item remains.
      """
      # Filtering, trimming and ordering are done by normalize_output_items so
      # the two entry points cannot drift apart.
      normalized = normalize_output_items(items, hyde_first=hyde_first)
      return "\n".join(f"{kind}: {text}" for kind, text in normalized)
  def normalize_output_items(
      items: Iterable[Iterable[str]], hyde_first: bool = True
  ) -> list[list[str]]:
      """Normalize output list pairs (filter invalid, trim whitespace, reorder).

      Args:
          items: Iterable of [type, text] pairs
          hyde_first: If True, reorder to put hyde first (default)

      Returns:
          A new list of ``[type, text]`` pairs. An item is skipped when it is
          empty, cannot be indexed at positions 0 and 1, has a type that is
          not in VALID_OUTPUT_TYPES, or has text that is None or blank after
          stripping.
      """
      normalized: list[list[str]] = []
      for item in items:
          if not item:
              continue
          try:
              kind, text = item[0], item[1]
          except (TypeError, LookupError):
              # Not indexable, or fewer than two elements: malformed, skip it.
              continue
          # A non-str type (e.g. a list) may be unhashable, which would make
          # the set membership test raise instead of skipping the item.
          if not isinstance(kind, str) or kind not in VALID_OUTPUT_TYPES:
              continue
          if text is None:
              continue
          text = str(text).strip()
          if not text:
              continue
          normalized.append([kind, text])
      # Apply hyde-first ordering if requested
      if hyde_first:
          normalized = reorder_hyde_first(normalized)
      return normalized
  def has_type(items: Iterable[Iterable[str]], kind: str) -> bool:
      """Return True if any item's first element equals ``kind``.

      Empty items and items that cannot be indexed at position 0 are
      skipped rather than raising.

      Args:
          items: Iterable of [type, text] pairs
          kind: The type to look for.
      """
      for item in items:
          if not item:
              continue
          try:
              head = item[0]
          except (TypeError, LookupError):
              # Malformed item (not indexable / no element 0): ignore it.
              continue
          if head == kind:
              return True
      return False