convert_to_structured.py 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
  1. #!/usr/bin/env python3
  2. """
  3. Convert QMD expansion JSONL to structured format with type/query objects.
  4. Also applies hyde-first ordering.
  5. """
  6. import json
  7. from pathlib import Path
  8. def reorder_hyde_first(output_items):
  9. """Reorder output items to put hyde first, then lex, then vec."""
  10. hyde_items = [item for item in output_items if item[0] == "hyde"]
  11. lex_items = [item for item in output_items if item[0] == "lex"]
  12. vec_items = [item for item in output_items if item[0] == "vec"]
  13. return hyde_items + lex_items + vec_items
  14. def convert_to_structured(entry):
  15. """Convert flat output format to structured searches array."""
  16. query = entry["query"]
  17. output_items = entry.get("output", [])
  18. # Apply hyde-first ordering
  19. output_items = reorder_hyde_first(output_items)
  20. # Convert to structured format
  21. searches = []
  22. for item_type, content in output_items:
  23. searches.append({
  24. "type": item_type,
  25. "query": content
  26. })
  27. return {
  28. "query": query,
  29. "searches": searches
  30. }
  31. def main():
  32. script_dir = Path(__file__).parent
  33. input_file = script_dir / "qmd_expansion_v3.jsonl"
  34. output_file = script_dir / "qmd_expansion_v3_structured.jsonl"
  35. print(f"Converting {input_file} to structured format...")
  36. count = 0
  37. with open(input_file, 'r', encoding='utf-8') as f_in, \
  38. open(output_file, 'w', encoding='utf-8') as f_out:
  39. for line in f_in:
  40. if line.strip():
  41. entry = json.loads(line)
  42. structured = convert_to_structured(entry)
  43. f_out.write(json.dumps(structured, ensure_ascii=False) + '\n')
  44. count += 1
  45. print(f"Converted {count} entries to {output_file}")
  46. # Show sample
  47. print("\nSample entry:")
  48. with open(output_file, 'r') as f:
  49. sample = json.loads(f.readline())
  50. print(json.dumps(sample, indent=2))
# Run the conversion only when this file is executed as a script; importing
# the module must not trigger any file I/O.
if __name__ == "__main__":
    main()