verify_data.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. #!/usr/bin/env python3
  2. """
  3. Verify the quality and correctness of the converted ChatML data.
  4. """
  5. import json
  6. import re
  7. from pathlib import Path
  8. def verify_chatml_format(text):
  9. """Verify that the text follows proper ChatML format."""
  10. issues = []
  11. # Check start token
  12. if not text.startswith("<|startoftext|>"):
  13. issues.append("Missing <|startoftext|> at beginning")
  14. # Check user section
  15. user_pattern = r"<\|im_start\|>user\n.*?<\|im_end\|>"
  16. if not re.search(user_pattern, text, re.DOTALL):
  17. issues.append("Missing or malformed user section")
  18. # Check assistant section
  19. assistant_pattern = r"<\|im_start\|>assistant\n.*?<\|im_end\|>"
  20. if not re.search(assistant_pattern, text, re.DOTALL):
  21. issues.append("Missing or malformed assistant section")
  22. # Check for proper query format
  23. if "Expand this search query:" not in text:
  24. issues.append("Missing 'Expand this search query:' prompt")
  25. # Check for required output types
  26. assistant_content = re.search(r"<\|im_start\|>assistant\n(.*?)<\|im_end\|>", text, re.DOTALL)
  27. if assistant_content:
  28. content = assistant_content.group(1)
  29. has_lex = "lex:" in content
  30. has_vec = "vec:" in content
  31. has_hyde = "hyde:" in content
  32. if not has_lex:
  33. issues.append("Missing lex: entries")
  34. if not has_vec:
  35. issues.append("Missing vec: entries")
  36. if not has_hyde:
  37. issues.append("Missing hyde: entries")
  38. return issues
  39. def analyze_file(filepath):
  40. """Analyze a JSONL file for quality and issues."""
  41. print(f"\nAnalyzing {filepath}...")
  42. total_entries = 0
  43. total_issues = 0
  44. issue_counts = {}
  45. query_lengths = []
  46. assistant_lengths = []
  47. with open(filepath, 'r', encoding='utf-8') as f:
  48. for line_num, line in enumerate(f, 1):
  49. try:
  50. entry = json.loads(line.strip())
  51. total_entries += 1
  52. text = entry["text"]
  53. issues = verify_chatml_format(text)
  54. if issues:
  55. total_issues += 1
  56. for issue in issues:
  57. issue_counts[issue] = issue_counts.get(issue, 0) + 1
  58. # Extract query and assistant response for length analysis
  59. user_match = re.search(r"Expand this search query: (.*?)<\|im_end\|>", text, re.DOTALL)
  60. assistant_match = re.search(r"<\|im_start\|>assistant\n(.*?)<\|im_end\|>", text, re.DOTALL)
  61. if user_match:
  62. query_lengths.append(len(user_match.group(1).strip()))
  63. if assistant_match:
  64. assistant_lengths.append(len(assistant_match.group(1)))
  65. except json.JSONDecodeError as e:
  66. print(f"JSON decode error on line {line_num}: {e}")
  67. except Exception as e:
  68. print(f"Error processing line {line_num}: {e}")
  69. print(f"Total entries: {total_entries}")
  70. print(f"Entries with issues: {total_issues}")
  71. print(f"Success rate: {((total_entries - total_issues) / total_entries * 100):.1f}%")
  72. if issue_counts:
  73. print("\nIssue breakdown:")
  74. for issue, count in sorted(issue_counts.items()):
  75. print(f" {issue}: {count}")
  76. if query_lengths:
  77. print(f"\nQuery length stats:")
  78. print(f" Min: {min(query_lengths)} chars")
  79. print(f" Max: {max(query_lengths)} chars")
  80. print(f" Avg: {sum(query_lengths) / len(query_lengths):.1f} chars")
  81. if assistant_lengths:
  82. print(f"\nAssistant response length stats:")
  83. print(f" Min: {min(assistant_lengths)} chars")
  84. print(f" Max: {max(assistant_lengths)} chars")
  85. print(f" Avg: {sum(assistant_lengths) / len(assistant_lengths):.1f} chars")
  86. def main():
  87. data_dir = Path("~/src/github.com/tobi/qmd/finetune/data/train-lfm2").expanduser()
  88. # Analyze both train and validation sets
  89. analyze_file(data_dir / "train.jsonl")
  90. analyze_file(data_dir / "val.jsonl")
  91. print("\n" + "="*50)
  92. print("DATA PREPARATION VERIFICATION COMPLETE")
  93. print("="*50)
  94. if __name__ == "__main__":
  95. main()