| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667 |
- #!/usr/bin/env python3
- """
- Convert QMD expansion JSONL to structured format with type/query objects.
- Also applies hyde-first ordering.
- """
- import json
- from pathlib import Path
- def reorder_hyde_first(output_items):
- """Reorder output items to put hyde first, then lex, then vec."""
- hyde_items = [item for item in output_items if item[0] == "hyde"]
- lex_items = [item for item in output_items if item[0] == "lex"]
- vec_items = [item for item in output_items if item[0] == "vec"]
- return hyde_items + lex_items + vec_items
- def convert_to_structured(entry):
- """Convert flat output format to structured searches array."""
- query = entry["query"]
- output_items = entry.get("output", [])
-
- # Apply hyde-first ordering
- output_items = reorder_hyde_first(output_items)
-
- # Convert to structured format
- searches = []
- for item_type, content in output_items:
- searches.append({
- "type": item_type,
- "query": content
- })
-
- return {
- "query": query,
- "searches": searches
- }
- def main():
- script_dir = Path(__file__).parent
- input_file = script_dir / "qmd_expansion_v3.jsonl"
- output_file = script_dir / "qmd_expansion_v3_structured.jsonl"
-
- print(f"Converting {input_file} to structured format...")
-
- count = 0
- with open(input_file, 'r', encoding='utf-8') as f_in, \
- open(output_file, 'w', encoding='utf-8') as f_out:
- for line in f_in:
- if line.strip():
- entry = json.loads(line)
- structured = convert_to_structured(entry)
- f_out.write(json.dumps(structured, ensure_ascii=False) + '\n')
- count += 1
-
- print(f"Converted {count} entries to {output_file}")
-
- # Show sample
- print("\nSample entry:")
- with open(output_file, 'r') as f:
- sample = json.loads(f.readline())
- print(json.dumps(sample, indent=2))
- if __name__ == "__main__":
- main()
|