generate_balanced.py 19 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823
  1. #!/usr/bin/env python3
  2. """
  3. Generate BALANCED QMD training examples - moderate tech share, more life diversity.
  4. Categories (percentages mirror CATEGORY_WEIGHTS below):
  5. - Health & Wellness (12%)
  6. - Personal Finance & Business (12%)
  7. - Home, Garden & DIY (10%)
  8. - Food & Cooking (10%)
  9. - Travel & Geography (10%)
  10. - Hobbies & Crafts (10%)
  11. - Education & Learning (8%)
  12. - Arts & Culture (8%)
  13. - Lifestyle & Relationships (5%)
  14. - Technology (15%)
  15. """
  16. import json
  17. import random
  18. from pathlib import Path
  19. from datetime import datetime
  20. from dataset.schema import normalize_output_items, parse_output_text
  21. # Category weights - balanced with reasonable tech representation
  22. CATEGORY_WEIGHTS = {
  23. "health_wellness": 0.12,
  24. "finance_business": 0.12,
  25. "home_garden": 0.10,
  26. "food_cooking": 0.10,
  27. "travel_geography": 0.10,
  28. "hobbies_crafts": 0.10,
  29. "education_learning": 0.08,
  30. "arts_culture": 0.08,
  31. "lifestyle_relationships": 0.05,
  32. "technology": 0.15, # 15% - reasonable for QMD use case
  33. }
  34. # === HEALTH & WELLNESS ===
  35. HEALTH_TOPICS = [
  36. "meditation techniques",
  37. "sleep improvement",
  38. "stress management",
  39. "anxiety relief",
  40. "healthy eating",
  41. "meal planning",
  42. "weight loss",
  43. "muscle building",
  44. "yoga poses",
  45. "home workout",
  46. "running form",
  47. "swimming technique",
  48. "vitamin supplements",
  49. "hydration tips",
  50. "posture correction",
  51. "stretching routine",
  52. "mental health",
  53. "therapy types",
  54. "mindfulness practice",
  55. "breathing exercises",
  56. "first aid basics",
  57. "CPR technique",
  58. "common cold remedies",
  59. "allergy management",
  60. "chronic pain",
  61. "physical therapy",
  62. "massage techniques",
  63. "acupuncture",
  64. "eye health",
  65. "dental care",
  66. "skin care routine",
  67. "hair health",
  68. ]
  69. HEALTH_ACTIVITIES = [
  70. "improve sleep",
  71. "reduce stress",
  72. "manage anxiety",
  73. "build muscle",
  74. "lose weight",
  75. "eat healthier",
  76. "start meditating",
  77. "practice yoga",
  78. "run faster",
  79. "swim better",
  80. "lift weights",
  81. "stretch properly",
  82. "boost immunity",
  83. "increase energy",
  84. "reduce inflammation",
  85. "detox body",
  86. ]
  87. # === FINANCE & BUSINESS ===
  88. FINANCE_TOPICS = [
  89. "budget planning",
  90. "emergency fund",
  91. "debt payoff",
  92. "credit score",
  93. "investment basics",
  94. "stock market",
  95. "retirement planning",
  96. "401k",
  97. "IRA",
  98. "tax deductions",
  99. "filing taxes",
  100. "side hustle",
  101. "freelance income",
  102. "mortgage rates",
  103. "home buying",
  104. "rent vs buy",
  105. "real estate investing",
  106. "credit cards",
  107. "rewards programs",
  108. "travel hacking",
  109. "points maximization",
  110. "small business",
  111. "LLC setup",
  112. "business plan",
  113. "marketing strategy",
  114. "negotiation skills",
  115. "salary raise",
  116. "job interview",
  117. "career change",
  118. "passive income",
  119. "dividend stocks",
  120. "index funds",
  121. "ETF investing",
  122. "financial independence",
  123. "FIRE movement",
  124. "frugal living",
  125. "minimalism",
  126. ]
  127. BUSINESS_ACTIVITIES = [
  128. "create budget",
  129. "save money",
  130. "pay off debt",
  131. "improve credit",
  132. "start investing",
  133. "buy stocks",
  134. "plan retirement",
  135. "reduce taxes",
  136. "start business",
  137. "write business plan",
  138. "market product",
  139. "hire employees",
  140. "negotiate salary",
  141. "prepare interview",
  142. "switch careers",
  143. "build network",
  144. "negotiate price",
  145. "close deal",
  146. "manage team",
  147. "increase sales",
  148. ]
  149. # === HOME & GARDEN ===
  150. HOME_TOPICS = [
  151. "organize closet",
  152. "declutter home",
  153. "minimalist living",
  154. "storage solutions",
  155. "clean efficiently",
  156. "deep cleaning",
  157. "laundry tips",
  158. "stain removal",
  159. "home repair",
  160. "fix leaky faucet",
  161. "unclog drain",
  162. "patch drywall",
  163. "paint walls",
  164. "choose paint color",
  165. "interior design",
  166. "furniture arrangement",
  167. "small space",
  168. "apartment living",
  169. "studio setup",
  170. "home office",
  171. "garden planning",
  172. "vegetable garden",
  173. "herb growing",
  174. "indoor plants",
  175. "composting",
  176. "raised beds",
  177. "pest control",
  178. "organic gardening",
  179. "flower arranging",
  180. "landscaping",
  181. "lawn care",
  182. "pruning trees",
  183. "tool maintenance",
  184. "basic carpentry",
  185. "electrical basics",
  186. "plumbing 101",
  187. ]
  188. # === FOOD & COOKING ===
  189. FOOD_TOPICS = [
  190. "meal prep",
  191. "batch cooking",
  192. "knife skills",
  193. "cooking techniques",
  194. "baking bread",
  195. "sourdough starter",
  196. "cake decorating",
  197. "pastry making",
  198. "meal planning",
  199. "grocery shopping",
  200. "food storage",
  201. "meal ideas",
  202. "healthy recipes",
  203. "quick dinners",
  204. "breakfast ideas",
  205. "lunch prep",
  206. "dietary restrictions",
  207. "gluten free",
  208. "vegan cooking",
  209. "keto meals",
  210. "international cuisine",
  211. "Italian recipes",
  212. "Asian cooking",
  213. "Mexican food",
  214. "spice combinations",
  215. "flavor pairing",
  216. "wine pairing",
  217. "coffee brewing",
  218. "fermentation",
  219. "pickling vegetables",
  220. "making cheese",
  221. "home brewing",
  222. "grilling",
  223. "BBQ techniques",
  224. "smoking meat",
  225. "outdoor cooking",
  226. ]
  227. # === TRAVEL & GEOGRAPHY ===
  228. TRAVEL_TOPICS = [
  229. "budget travel",
  230. "solo travel",
  231. "backpacking",
  232. "road trip planning",
  233. "travel insurance",
  234. "passport renewal",
  235. "visa requirements",
  236. "travel documents",
  237. "packing light",
  238. "carry on essentials",
  239. "travel gear",
  240. "luggage selection",
  241. "travel photography",
  242. "scenic routes",
  243. "hidden gems",
  244. "local experiences",
  245. "cultural etiquette",
  246. "language basics",
  247. "travel phrases",
  248. "translation apps",
  249. "countries visited",
  250. "bucket list destinations",
  251. "seven wonders",
  252. "UNESCO sites",
  253. "capital cities",
  254. "world geography",
  255. "climate zones",
  256. "time zones",
  257. "currency exchange",
  258. "travel banking",
  259. "avoid fees",
  260. "travel rewards",
  261. "jet lag",
  262. "adjust timezone",
  263. "stay healthy",
  264. "travel safety",
  265. ]
  266. # === HOBBIES & CRAFTS ===
  267. HOBBY_TOPICS = [
  268. "photography basics",
  269. "camera settings",
  270. "photo editing",
  271. "composition rules",
  272. "drawing techniques",
  273. "sketching",
  274. "watercolor",
  275. "acrylic painting",
  276. "knitting",
  277. "crochet",
  278. "sewing",
  279. "embroidery",
  280. "cross stitch",
  281. "woodworking",
  282. "wood carving",
  283. "furniture making",
  284. "wood finishing",
  285. "pottery",
  286. "ceramics",
  287. "clay sculpting",
  288. "wheel throwing",
  289. "jewelry making",
  290. "beading",
  291. "wire wrapping",
  292. "metal smithing",
  293. "playing guitar",
  294. "piano lessons",
  295. "music theory",
  296. "songwriting",
  297. "bird watching",
  298. "stargazing",
  299. "astronomy",
  300. "telescope selection",
  301. "board games",
  302. "chess strategy",
  303. "puzzle solving",
  304. "escape rooms",
  305. "fishing",
  306. "fly tying",
  307. "bait selection",
  308. "fishing spots",
  309. "hiking",
  310. "trail finding",
  311. "orienteering",
  312. "survival skills",
  313. "camping",
  314. "tent setup",
  315. "campfire cooking",
  316. "outdoor gear",
  317. "calligraphy",
  318. "hand lettering",
  319. "font design",
  320. "typography",
  321. "DIY projects",
  322. "upcycling",
  323. "furniture restoration",
  324. "home decor",
  325. ]
  326. # === EDUCATION & LEARNING ===
  327. EDUCATION_TOPICS = [
  328. "study techniques",
  329. "memory techniques",
  330. "speed reading",
  331. "note taking",
  332. "learning languages",
  333. "Spanish basics",
  334. "French phrases",
  335. "Mandarin tones",
  336. "online courses",
  337. "MOOC platforms",
  338. "certification prep",
  339. "exam strategy",
  340. "math fundamentals",
  341. "calculus basics",
  342. "statistics",
  343. "probability",
  344. "physics concepts",
  345. "chemistry basics",
  346. "biology",
  347. "earth science",
  348. "history timeline",
  349. "ancient civilizations",
  350. "world wars",
  351. "modern history",
  352. "literature classics",
  353. "book recommendations",
  354. "reading list",
  355. "literary analysis",
  356. "public speaking",
  357. "presentation skills",
  358. "storytelling",
  359. "communication",
  360. "critical thinking",
  361. "logical reasoning",
  362. "problem solving",
  363. "decision making",
  364. "time management",
  365. "productivity systems",
  366. "focus techniques",
  367. "deep work",
  368. "learning styles",
  369. "visual learner",
  370. "auditory learner",
  371. "kinesthetic",
  372. ]
  373. # === ARTS & CULTURE ===
  374. ARTS_TOPICS = [
  375. "art history",
  376. "renaissance art",
  377. "impressionism",
  378. "modern art",
  379. "museum visits",
  380. "gallery etiquette",
  381. "art appreciation",
  382. "criticism",
  383. "classical music",
  384. "orchestra instruments",
  385. "opera",
  386. "ballet",
  387. "jazz history",
  388. "blues music",
  389. "rock music",
  390. "classical composers",
  391. "film analysis",
  392. "cinema history",
  393. "directors",
  394. "film genres",
  395. "theater",
  396. "plays",
  397. "Shakespeare",
  398. "acting techniques",
  399. "dance styles",
  400. "ballroom dancing",
  401. "salsa",
  402. "contemporary dance",
  403. "architecture styles",
  404. "Gothic",
  405. "Baroque",
  406. "Modern",
  407. "Art Deco",
  408. "fashion history",
  409. "style guide",
  410. "color theory",
  411. "design principles",
  412. "poetry forms",
  413. "haiku",
  414. "sonnet",
  415. "free verse",
  416. "spoken word",
  417. "cultural festivals",
  418. "holiday traditions",
  419. "world celebrations",
  420. "customs",
  421. ]
  422. # === LIFESTYLE & RELATIONSHIPS ===
  423. LIFESTYLE_TOPICS = [
  424. "work life balance",
  425. "morning routine",
  426. "evening routine",
  427. "habit formation",
  428. "digital detox",
  429. "screen time",
  430. "social media",
  431. "information diet",
  432. "minimalism",
  433. "simple living",
  434. "intentional living",
  435. "slow living",
  436. "dating advice",
  437. "relationship communication",
  438. "conflict resolution",
  439. "trust building",
  440. "parenting tips",
  441. "child development",
  442. "teenager advice",
  443. "empty nest",
  444. "friendship maintenance",
  445. "social skills",
  446. "networking",
  447. "introvert tips",
  448. "family dynamics",
  449. "sibling relationships",
  450. "in-laws",
  451. "family gatherings",
  452. "pet care",
  453. "dog training",
  454. "cat behavior",
  455. "pet health",
  456. "event planning",
  457. "dinner party",
  458. "birthday celebration",
  459. "wedding planning",
  460. "etiquette rules",
  461. "dining manners",
  462. "thank you notes",
  463. "gift giving",
  464. "personal style",
  465. "wardrobe basics",
  466. "capsule wardrobe",
  467. "seasonal fashion",
  468. ]
  469. # === TECHNOLOGY (minimal 5%) ===
  470. TECH_TOPICS = [
  471. "email etiquette",
  472. "video calls",
  473. "password manager",
  474. "backup data",
  475. "smartphone tips",
  476. "app recommendations",
  477. "digital organization",
  478. "file management",
  479. "basic troubleshooting",
  480. "wifi setup",
  481. "printer issues",
  482. "software updates",
  483. "online privacy",
  484. "social media privacy",
  485. "two factor auth",
  486. "secure browsing",
  487. ]
  488. # === SHORT KEYWORD QUERIES (for all categories) ===
  489. SHORT_KEYWORDS = [
  490. # Health
  491. "meditate",
  492. "hydrate",
  493. "stretch",
  494. "exercise",
  495. "sleep",
  496. "vitamins",
  497. "protein",
  498. # Finance
  499. "budget",
  500. "save",
  501. "invest",
  502. "taxes",
  503. "debt",
  504. "credit",
  505. "retirement",
  506. # Home
  507. "clean",
  508. "organize",
  509. "repair",
  510. "paint",
  511. "garden",
  512. "compost",
  513. "prune",
  514. # Food
  515. "cook",
  516. "bake",
  517. "recipe",
  518. "meal",
  519. "diet",
  520. "nutrition",
  521. "spices",
  522. # Travel
  523. "travel",
  524. "pack",
  525. "passport",
  526. "visa",
  527. "hotel",
  528. "flight",
  529. "itinerary",
  530. # Hobbies
  531. "photo",
  532. "draw",
  533. "paint",
  534. "knit",
  535. "woodwork",
  536. "guitar",
  537. "hike",
  538. "camp",
  539. # Education
  540. "study",
  541. "learn",
  542. "read",
  543. "course",
  544. "exam",
  545. "degree",
  546. "certificate",
  547. # Arts
  548. "art",
  549. "music",
  550. "film",
  551. "dance",
  552. "theater",
  553. "museum",
  554. "gallery",
  555. # Life
  556. "habit",
  557. "routine",
  558. "minimal",
  559. "organize",
  560. "relationship",
  561. "dating",
  562. "parent",
  563. # Tech (minimal)
  564. "email",
  565. "wifi",
  566. "backup",
  567. "password",
  568. "update",
  569. ]
  570. def generate_query(category: str) -> str:
  571. """Generate a query for a specific category."""
  572. templates = {
  573. "health_wellness": [
  574. "how to {activity}",
  575. "best {topic}",
  576. "{topic} guide",
  577. "{topic} for beginners",
  578. "improve {topic}",
  579. ],
  580. "finance_business": [
  581. "how to {activity}",
  582. "{topic} basics",
  583. "{topic} strategy",
  584. "start {topic}",
  585. "{topic} tips",
  586. ],
  587. "home_garden": [
  588. "how to {topic}",
  589. "DIY {topic}",
  590. "{topic} ideas",
  591. "best {topic}",
  592. "{topic} tutorial",
  593. ],
  594. "food_cooking": [
  595. "how to {topic}",
  596. "{topic} recipe",
  597. "best {topic}",
  598. "{topic} techniques",
  599. "learn {topic}",
  600. ],
  601. "travel_geography": [
  602. "{topic} guide",
  603. "how to {topic}",
  604. "best {topic}",
  605. "{topic} tips",
  606. "plan {topic}",
  607. ],
  608. "hobbies_crafts": [
  609. "learn {topic}",
  610. "{topic} basics",
  611. "how to {topic}",
  612. "{topic} for beginners",
  613. "best {topic}",
  614. ],
  615. "education_learning": [
  616. "learn {topic}",
  617. "{topic} guide",
  618. "improve {topic}",
  619. "{topic} techniques",
  620. "study {topic}",
  621. ],
  622. "arts_culture": [
  623. "understand {topic}",
  624. "{topic} guide",
  625. "appreciate {topic}",
  626. "learn {topic}",
  627. "{topic} history",
  628. ],
  629. "lifestyle_relationships": [
  630. "how to {topic}",
  631. "improve {topic}",
  632. "{topic} advice",
  633. "{topic} tips",
  634. "best {topic}",
  635. ],
  636. "technology": [
  637. "how to {topic}",
  638. "{topic} setup",
  639. "fix {topic}",
  640. "{topic} tips",
  641. ],
  642. }
  643. word_lists = {
  644. "health_wellness": (HEALTH_ACTIVITIES, HEALTH_TOPICS),
  645. "finance_business": (BUSINESS_ACTIVITIES, FINANCE_TOPICS),
  646. "home_garden": (HOME_TOPICS, HOME_TOPICS),
  647. "food_cooking": (FOOD_TOPICS, FOOD_TOPICS),
  648. "travel_geography": (TRAVEL_TOPICS, TRAVEL_TOPICS),
  649. "hobbies_crafts": (HOBBY_TOPICS, HOBBY_TOPICS),
  650. "education_learning": (EDUCATION_TOPICS, EDUCATION_TOPICS),
  651. "arts_culture": (ARTS_TOPICS, ARTS_TOPICS),
  652. "lifestyle_relationships": (LIFESTYLE_TOPICS, LIFESTYLE_TOPICS),
  653. "technology": (TECH_TOPICS, TECH_TOPICS),
  654. }
  655. template = random.choice(templates[category])
  656. activities, topics = word_lists[category]
  657. if "{activity}" in template:
  658. return template.format(activity=random.choice(activities))
  659. else:
  660. return template.format(topic=random.choice(topics))
  661. def generate_short_query() -> str:
  662. """Generate a short 1-2 word query."""
  663. return random.choice(SHORT_KEYWORDS)
  664. def generate_expansion(query: str, category: str) -> str:
  665. """Generate a realistic expansion for a query."""
  666. # Generate contextually appropriate lex/vec/hyde based on category
  667. domain_hints = {
  668. "health_wellness": "health medical wellness fitness nutrition exercise",
  669. "finance_business": "finance money investing business career salary budget",
  670. "home_garden": "home repair DIY garden organization cleaning maintenance",
  671. "food_cooking": "food cooking recipe culinary kitchen meal nutrition diet",
  672. "travel_geography": "travel trip vacation destination geography tourism explore",
  673. "hobbies_crafts": "hobby craft creative DIY leisure recreation skill learn",
  674. "education_learning": "education learn study course academic knowledge skill",
  675. "arts_culture": "art culture creative music film theater literature history",
  676. "lifestyle_relationships": "life lifestyle relationship social personal development habits",
  677. "technology": "tech digital computer software internet online tool app",
  678. }
  679. domain = domain_hints.get(category, "general")
  680. lex_variations = [
  681. f"{query} guide",
  682. f"{query} tips",
  683. f"{query} how to",
  684. f"{query} tutorial",
  685. f"{query} advice",
  686. ]
  687. vec_variations = [
  688. f"how to {query} effectively",
  689. f"best way to {query}",
  690. f"complete guide to {query}",
  691. f"learn {query} step by step",
  692. f"tips for {query} success",
  693. ]
  694. # Select variations
  695. selected_lex = random.sample(lex_variations, min(3, len(lex_variations)))
  696. selected_vec = random.sample(vec_variations, min(2, len(vec_variations)))
  697. # Generate hyde passage
  698. hyde_templates = [
  699. f"This comprehensive guide to {query} covers all the essential information you need to get started. Follow the steps carefully for best results.",
  700. f"Learning {query} requires practice and patience. This resource provides detailed instructions, examples, and tips to help you master the basics quickly.",
  701. f"Whether you're a beginner or looking to improve, this guide to {query} offers practical advice, common pitfalls to avoid, and proven strategies for success.",
  702. ]
  703. hyde = random.choice(hyde_templates)
  704. output_lines = []
  705. for lex in selected_lex:
  706. output_lines.append(f"lex: {lex}")
  707. for vec in selected_vec:
  708. output_lines.append(f"vec: {vec}")
  709. output_lines.append(f"hyde: {hyde}")
  710. return "\n".join(output_lines)
  711. def main():
  712. """Generate balanced diverse training examples."""
  713. output_file = Path("data/qmd_expansion_balanced.jsonl")
  714. # Generate 500 examples with balanced distribution
  715. target_count = 500
  716. print(f"Generating {target_count} balanced training examples...")
  717. print(f"Tech focus reduced to {CATEGORY_WEIGHTS['technology']:.0%}")
  718. print()
  719. # Show distribution
  720. for cat, weight in CATEGORY_WEIGHTS.items():
  721. count = int(target_count * weight)
  722. bar = "█" * int(weight * 40)
  723. print(f" {cat:25} {count:3} ({weight:4.0%}) {bar}")
  724. print()
  725. examples = []
  726. category_counts = {cat: 0 for cat in CATEGORY_WEIGHTS.keys()}
  727. for i in range(target_count):
  728. # Select category based on weights
  729. categories = list(CATEGORY_WEIGHTS.keys())
  730. weights = list(CATEGORY_WEIGHTS.values())
  731. category = random.choices(categories, weights=weights, k=1)[0]
  732. # 20% of queries should be short (1-2 words)
  733. if random.random() < 0.20:
  734. query = generate_short_query()
  735. short_flag = True
  736. else:
  737. query = generate_query(category)
  738. short_flag = False
  739. expansion = generate_expansion(query, category)
  740. output_items = normalize_output_items(parse_output_text(expansion))
  741. examples.append(
  742. {
  743. "query": query,
  744. "output": output_items,
  745. "category": category,
  746. "is_short": short_flag,
  747. }
  748. )
  749. category_counts[category] += 1
  750. if (i + 1) % 100 == 0:
  751. print(f" Generated {i + 1}/{target_count} examples...")
  752. # Write to file
  753. output_file.parent.mkdir(parents=True, exist_ok=True)
  754. with open(output_file, "w") as f:
  755. for ex in examples:
  756. f.write(json.dumps(ex) + "\n")
  757. print(f"\n✅ Saved {len(examples)} balanced examples to {output_file}")
  758. print("\nActual distribution:")
  759. for cat, count in sorted(category_counts.items(), key=lambda x: -x[1]):
  760. pct = count / len(examples)
  761. bar = "█" * int(pct * 40)
  762. print(f" {cat:25} {count:3} ({pct:4.1%}) {bar}")
  763. short_count = sum(1 for ex in examples if ex["is_short"])
  764. print(
  765. f"\n {'Short queries (1-2 words)':25} {short_count:3} ({short_count / len(examples):4.1%})"
  766. )
  767. print("\n📋 Usage:")
  768. print(f" cat {output_file} >> data/qmd_expansion_v2.jsonl")
  769. print(" uv run dataset/prepare_data.py")
  770. if __name__ == "__main__":
  771. main()