prepare_data_lfm2.py 2.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. #!/usr/bin/env python3
  2. """Prepare QMD query expansion data for LFM2.5-1.2B-Instruct training.
  3. LFM2.5 uses ChatML format:
  4. <|startoftext|><|im_start|>user
  5. Expand this search query: {query}<|im_end|>
  6. <|im_start|>assistant
  7. {output}<|im_end|>
  8. No /no_think needed (that's Qwen3-specific).
  9. """
  10. import json
  11. import os
  12. import random
  13. import sys
  14. from pathlib import Path
  15. sys.path.insert(0, str(Path(__file__).parent.parent))
  16. from dataset.schema import normalize_output_items, output_items_to_text
  17. from transformers import AutoTokenizer
  18. def format_for_training(query_text: str, output_items: list[list[str]], tokenizer) -> dict:
  19. """Format a single example for SFT training using LFM2.5 chat format."""
  20. output_text = output_items_to_text(output_items)
  21. messages = [
  22. {"role": "user", "content": f"Expand this search query: {query_text}"},
  23. {"role": "assistant", "content": output_text},
  24. ]
  25. text = tokenizer.apply_chat_template(
  26. messages, tokenize=False, add_generation_prompt=False
  27. )
  28. return {"text": text}
  29. def main():
  30. input_path = Path("data/qmd_expansion_v2.jsonl")
  31. output_dir = Path("data/train-lfm2")
  32. output_dir.mkdir(parents=True, exist_ok=True)
  33. print("Loading LFM2.5 tokenizer...")
  34. tokenizer = AutoTokenizer.from_pretrained(
  35. "LiquidAI/LFM2.5-1.2B-Instruct", trust_remote_code=True
  36. )
  37. examples = []
  38. with open(input_path) as f:
  39. for line in f:
  40. row = json.loads(line)
  41. items = normalize_output_items(row["output"])
  42. example = format_for_training(row["query"], items, tokenizer)
  43. examples.append(example)
  44. # Shuffle and split
  45. random.seed(42)
  46. random.shuffle(examples)
  47. split_idx = int(len(examples) * 0.9)
  48. train = examples[:split_idx]
  49. val = examples[split_idx:]
  50. # Write as JSONL
  51. train_path = output_dir / "train.jsonl"
  52. val_path = output_dir / "val.jsonl"
  53. with open(train_path, "w") as f:
  54. for ex in train:
  55. f.write(json.dumps(ex) + "\n")
  56. with open(val_path, "w") as f:
  57. for ex in val:
  58. f.write(json.dumps(ex) + "\n")
  59. print(f"Written {len(train)} train, {len(val)} val examples to {output_dir}")
  60. print(f"\nSample formatted text:")
  61. print(train[0]["text"][:500])
  62. if __name__ == "__main__":
  63. main()