| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119 |
- #!/usr/bin/env python3
- """
- Verify the quality and correctness of the converted ChatML data.
- """
- import json
- import re
- from pathlib import Path
- def verify_chatml_format(text):
- """Verify that the text follows proper ChatML format."""
- issues = []
-
- # Check start token
- if not text.startswith("<|startoftext|>"):
- issues.append("Missing <|startoftext|> at beginning")
-
- # Check user section
- user_pattern = r"<\|im_start\|>user\n.*?<\|im_end\|>"
- if not re.search(user_pattern, text, re.DOTALL):
- issues.append("Missing or malformed user section")
-
- # Check assistant section
- assistant_pattern = r"<\|im_start\|>assistant\n.*?<\|im_end\|>"
- if not re.search(assistant_pattern, text, re.DOTALL):
- issues.append("Missing or malformed assistant section")
-
- # Check for proper query format
- if "Expand this search query:" not in text:
- issues.append("Missing 'Expand this search query:' prompt")
-
- # Check for required output types
- assistant_content = re.search(r"<\|im_start\|>assistant\n(.*?)<\|im_end\|>", text, re.DOTALL)
- if assistant_content:
- content = assistant_content.group(1)
- has_lex = "lex:" in content
- has_vec = "vec:" in content
- has_hyde = "hyde:" in content
-
- if not has_lex:
- issues.append("Missing lex: entries")
- if not has_vec:
- issues.append("Missing vec: entries")
- if not has_hyde:
- issues.append("Missing hyde: entries")
-
- return issues
- def analyze_file(filepath):
- """Analyze a JSONL file for quality and issues."""
- print(f"\nAnalyzing {filepath}...")
-
- total_entries = 0
- total_issues = 0
- issue_counts = {}
- query_lengths = []
- assistant_lengths = []
-
- with open(filepath, 'r', encoding='utf-8') as f:
- for line_num, line in enumerate(f, 1):
- try:
- entry = json.loads(line.strip())
- total_entries += 1
-
- text = entry["text"]
- issues = verify_chatml_format(text)
-
- if issues:
- total_issues += 1
- for issue in issues:
- issue_counts[issue] = issue_counts.get(issue, 0) + 1
-
- # Extract query and assistant response for length analysis
- user_match = re.search(r"Expand this search query: (.*?)<\|im_end\|>", text, re.DOTALL)
- assistant_match = re.search(r"<\|im_start\|>assistant\n(.*?)<\|im_end\|>", text, re.DOTALL)
-
- if user_match:
- query_lengths.append(len(user_match.group(1).strip()))
- if assistant_match:
- assistant_lengths.append(len(assistant_match.group(1)))
-
- except json.JSONDecodeError as e:
- print(f"JSON decode error on line {line_num}: {e}")
- except Exception as e:
- print(f"Error processing line {line_num}: {e}")
-
- print(f"Total entries: {total_entries}")
- print(f"Entries with issues: {total_issues}")
- print(f"Success rate: {((total_entries - total_issues) / total_entries * 100):.1f}%")
-
- if issue_counts:
- print("\nIssue breakdown:")
- for issue, count in sorted(issue_counts.items()):
- print(f" {issue}: {count}")
-
- if query_lengths:
- print(f"\nQuery length stats:")
- print(f" Min: {min(query_lengths)} chars")
- print(f" Max: {max(query_lengths)} chars")
- print(f" Avg: {sum(query_lengths) / len(query_lengths):.1f} chars")
-
- if assistant_lengths:
- print(f"\nAssistant response length stats:")
- print(f" Min: {min(assistant_lengths)} chars")
- print(f" Max: {max(assistant_lengths)} chars")
- print(f" Avg: {sum(assistant_lengths) / len(assistant_lengths):.1f} chars")
- def main():
- data_dir = Path("~/src/github.com/tobi/qmd/finetune/data/train-lfm2").expanduser()
-
- # Analyze both train and validation sets
- analyze_file(data_dir / "train.jsonl")
- analyze_file(data_dir / "val.jsonl")
-
- print("\n" + "="*50)
- print("DATA PREPARATION VERIFICATION COMPLETE")
- print("="*50)
- if __name__ == "__main__":
- main()
|