| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107 |
- # /// script
- # requires-python = ">=3.10"
- # dependencies = []
- # ///
- """Prepare v4 dataset: high-quality expansions + /only: variants."""
- import json
- import random
- from pathlib import Path
def to_chat_format(query: str, output: str) -> dict:
    """Build one training record from a query and its expected expansion.

    The record carries the same exchange in two shapes: ``text``, a single
    string with ``<|im_start|>``/``<|im_end|>`` turn markers and an empty
    ``<think>`` section before the answer, and ``messages``, a two-entry
    list of role/content dicts (user, then assistant).
    """
    # The query is embedded verbatim, so any suffix it carries (for example
    # an "/only:" suffix) stays in the prompt.
    user_turn = f"/no_think Expand this search query: {query}"

    return {
        "text": f"<|im_start|>user\n{user_turn}<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n{output}<|im_end|>\n",
        "messages": [
            {"role": "user", "content": user_turn},
            {"role": "assistant", "content": output},
        ],
    }
def load_jsonl(path: Path) -> list[dict]:
    """Load a JSONL file and return its parsed records in file order.

    Lines that are empty or contain only whitespace are skipped.

    Args:
        path: Location of the JSONL file.

    Returns:
        One parsed JSON value per non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with ``path:line_number`` so the offending
            record can be found.
        OSError: If the file cannot be opened.
    """
    records = []
    # Decode as UTF-8 explicitly: the platform default encoding is
    # locale-dependent and can mis-decode or reject non-ASCII text.
    with open(path, encoding="utf-8") as f:
        for lineno, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Re-raise the same exception type so existing handlers keep
                # working, but say which file and line failed.
                raise json.JSONDecodeError(
                    f"{path}:{lineno}: {err.msg}", err.doc, err.pos
                ) from err
    return records
def _write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write *rows* to *path* as UTF-8 text, one JSON object per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(row) + "\n" for row in rows)


def main():
    """Build the v4 train/val dataset from the source files under ``data/``.

    Each source file that exists is loaded with ``load_jsonl``; records with
    both a non-empty ``input`` and a non-empty ``output`` are converted with
    ``to_chat_format``. Missing source files are reported and skipped. The
    combined examples are shuffled with a fixed seed, split 90/10, and
    written to ``data/train_v4/`` as ``train.jsonl``, ``val.jsonl`` and
    ``dataset_info.json``. Progress and final counts are printed to stdout.
    """
    data_dir = Path("data")

    # High-quality sources: (file name, label used in the stats report)
    sources = [
        ("qmd_expansion_v2.jsonl", "v2"),
        ("qmd_expansion_handcrafted.jsonl", "handcrafted"),
        ("qmd_only_variants.jsonl", "only"),
    ]

    all_examples = []
    stats = {}

    for filename, label in sources:
        path = data_dir / filename
        if not path.exists():
            print(f" Skipping {filename} (not found)")
            continue

        converted = []
        for item in load_jsonl(path):
            query = item.get("input", "")
            output = item.get("output", "")
            # Records missing either side cannot form a training pair.
            if query and output:
                converted.append(to_chat_format(query, output))

        all_examples.extend(converted)
        stats[label] = len(converted)
        print(f" {label}: {len(converted)} examples")

    # Shuffle with a private, fixed-seed generator: same order as seeding the
    # module-level generator with 42, without touching global RNG state.
    random.Random(42).shuffle(all_examples)

    # Split 90/10
    split_idx = int(len(all_examples) * 0.9)
    train = all_examples[:split_idx]
    val = all_examples[split_idx:]

    # Write output. parents=True so this also works when data/ itself does
    # not exist yet (in which case every source above was skipped).
    out_dir = data_dir / "train_v4"
    out_dir.mkdir(parents=True, exist_ok=True)

    _write_jsonl(out_dir / "train.jsonl", train)
    _write_jsonl(out_dir / "val.jsonl", val)

    # Dataset info
    info = {
        "dataset_name": "qmd-query-expansion-v4",
        "train_samples": len(train),
        "val_samples": len(val),
        "sources": stats,
    }
    with open(out_dir / "dataset_info.json", "w", encoding="utf-8") as f:
        json.dump(info, f, indent=2)

    print(f"\n✓ Dataset prepared in {out_dir}/")
    print(f" Train: {len(train)}")
    print(f" Val: {len(val)}")
    print(f" Total: {len(all_examples)}")


if __name__ == "__main__":
    main()
|