| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273 |
- #!/usr/bin/env python3
- """Schema helpers for QMD training JSONL data."""
- from __future__ import annotations
- from typing import Iterable
# Kinds accepted as the first element of an output pair. The rendering and
# normalizing helpers in this module drop any item whose kind is not in this set.
VALID_OUTPUT_TYPES = {"hyde", "lex", "vec"}
def parse_output_text(text: str) -> list[list[str]]:
    """Split prefixed output text into ``[kind, text]`` pairs.

    The input is stripped and split on newlines. Every line is stripped in
    turn; blank lines and lines that do not start with ``lex:``, ``vec:`` or
    ``hyde:`` are ignored. The remainder after the prefix is stripped as
    well and is kept even when it ends up empty.

    Returns:
        The pairs in input order, e.g. ``[["hyde", "..."], ["lex", "..."]]``.
    """
    known_kinds = ("lex", "vec", "hyde")
    pairs: list[list[str]] = []
    for chunk in text.strip().split("\n"):
        kind, colon, rest = chunk.strip().partition(":")
        # A line qualifies only when the text before its first colon is
        # exactly one of the known kinds (same as a startswith("<kind>:") test).
        if colon and kind in known_kinds:
            pairs.append([kind, rest.strip()])
    return pairs
def output_items_to_text(items: Iterable[Iterable[str]]) -> str:
    """Render output list pairs to prefixed text lines.

    Each usable item contributes one ``"<kind>: <text>"`` line; the lines are
    joined with newlines. Malformed items are skipped rather than raising.

    Args:
        items: Iterable of indexable pairs whose first element is the kind
            and whose second element is the text.

    Returns:
        The rendered lines joined by ``"\\n"``, or ``""`` when no item is
        usable.

    An item is skipped when it is falsy, cannot be indexed at positions 0
    and 1, has a kind that is not a string in ``VALID_OUTPUT_TYPES``, or has
    text that is ``None`` or blank after ``str(...).strip()``.
    """
    lines: list[str] = []
    for item in items:
        if not item:
            continue
        try:
            kind, text = item[0], item[1]
        except (TypeError, LookupError):
            # Not subscriptable, too short, or a mapping without keys 0/1.
            continue
        # Check the type first: an unhashable kind (e.g. a list or dict from
        # malformed data) would make the set membership test raise TypeError.
        if not isinstance(kind, str) or kind not in VALID_OUTPUT_TYPES:
            continue
        if text is None:
            continue
        text = str(text).strip()
        if text:
            lines.append(f"{kind}: {text}")
    return "\n".join(lines)
def normalize_output_items(items: Iterable[Iterable[str]]) -> list[list[str]]:
    """Normalize output list pairs (filter invalid, trim whitespace).

    Args:
        items: Iterable of indexable pairs whose first element is the kind
            and whose second element is the text.

    Returns:
        A new list of ``[kind, text]`` pairs in input order, where ``text``
        has been converted with ``str`` and stripped.

    An item is dropped when it is falsy, cannot be indexed at positions 0
    and 1, has a kind that is not a string in ``VALID_OUTPUT_TYPES``, or has
    text that is ``None`` or blank after stripping. Malformed items never
    raise.
    """
    normalized: list[list[str]] = []
    for item in items:
        if not item:
            continue
        try:
            kind, text = item[0], item[1]
        except (TypeError, LookupError):
            # Not subscriptable, too short, or a mapping without keys 0/1.
            continue
        # Check the type first: an unhashable kind (e.g. a list or dict from
        # malformed data) would make the set membership test raise TypeError.
        if not isinstance(kind, str) or kind not in VALID_OUTPUT_TYPES:
            continue
        if text is None:
            continue
        text = str(text).strip()
        if text:
            normalized.append([kind, text])
    return normalized
def has_type(items: Iterable[Iterable[str]], kind: str) -> bool:
    """Report whether any non-empty item has *kind* as its first element."""
    for entry in items:
        # Falsy entries (empty pairs, None) are skipped before indexing.
        if entry and entry[0] == kind:
            return True
    return False
|