#!/usr/bin/env python3
# suby/qmd
"""Schema helpers for QMD training JSONL data."""

from __future__ import annotations

from typing import Iterable

# Output kinds accepted by this module; items whose kind is not in this set
# are dropped when rendering or normalizing output items.
VALID_OUTPUT_TYPES = {"hyde", "lex", "vec"}


def parse_output_text(text: str) -> list[list[str]]:
    """Split prefixed output text into ``[kind, text]`` pairs.

    Each non-blank line that begins with ``lex:``, ``vec:`` or ``hyde:``
    yields one pair whose text is the remainder of the line, stripped.
    Lines with any other prefix are ignored.

    Returns: [["hyde", "..."], ["lex", "..."], ...]
    """
    known_kinds = ("lex", "vec", "hyde")
    parsed: list[list[str]] = []
    for candidate in map(str.strip, text.strip().split("\n")):
        # The prefix is everything before the first colon; a line without
        # a colon (including a blank one) leaves ``colon`` empty.
        kind, colon, payload = candidate.partition(":")
        if colon and kind in known_kinds:
            parsed.append([kind, payload.strip()])
    return parsed


def output_items_to_text(items: Iterable[Iterable[str]]) -> str:
    """Render output list pairs to prefixed text lines.

    Filtering and trimming are delegated to ``normalize_output_items`` so
    that rendering and normalization always agree on which items are valid.

    Args:
        items: Candidate ``[kind, text]`` pairs.

    Returns:
        One ``"<kind>: <text>"`` line per surviving item, joined with
        newlines; an empty string when no item survives.
    """
    return "\n".join(
        f"{kind}: {text}" for kind, text in normalize_output_items(items)
    )


def normalize_output_items(items: Iterable[Iterable[str]]) -> list[list[str]]:
    """Normalize output list pairs (filter invalid, trim whitespace).

    Each item is expected to be an indexable ``[kind, text]`` pair. An item
    is dropped when it is empty, cannot be indexed at positions 0 and 1, has
    a kind that is not a string in ``VALID_OUTPUT_TYPES``, or has text that
    is ``None`` or blank after stripping.

    Args:
        items: Candidate ``[kind, text]`` pairs.

    Returns:
        A new list of ``[kind, text]`` lists, in input order, with ``text``
        converted via ``str()`` and stripped.
    """
    normalized: list[list[str]] = []
    for item in items:
        if not item:
            continue
        try:
            kind, text = item[0], item[1]
        except (IndexError, KeyError, TypeError):
            # Too short, a mapping without 0/1 keys, or not subscriptable.
            continue
        # Check the type first: testing an unhashable kind (e.g. a list)
        # for set membership would raise TypeError instead of filtering.
        if not isinstance(kind, str) or kind not in VALID_OUTPUT_TYPES:
            continue
        if text is None:
            continue
        text = str(text).strip()
        if not text:
            continue
        normalized.append([kind, text])
    return normalized


def has_type(items: Iterable[Iterable[str]], kind: str) -> bool:
    """Return True when a non-empty item has ``kind`` as its first element."""
    for entry in items:
        if not entry:
            # Empty or None entries carry no kind to compare.
            continue
        if entry[0] == kind:
            return True
    return False