#!/usr/bin/env python3
"""
Generate diverse QMD training examples for underrepresented categories.

This script creates additional training examples focused on:
- Trivia, Geography, Philosophy, History (as requested)
- Temporal/Recency queries (important for evals)
- Named entity queries (critical for entity preservation scoring)
"""
import json
import random
from datetime import datetime, timedelta
from pathlib import Path

from dataset.schema import normalize_output_items, parse_output_text

  14. # Additional diverse query categories
  15. TRIVIA_QUERIES = [
  16. "world capitals quiz",
  17. "trivia facts about space",
  18. "did you know history",
  19. "random science facts",
  20. "famous inventions timeline",
  21. "world records list",
  22. "fun geography facts",
  23. "historical trivia questions",
  24. "animal trivia facts",
  25. "sports trivia records",
  26. ]
  27. GEOGRAPHY_QUERIES = [
  28. "largest countries by area",
  29. "rivers that cross multiple countries",
  30. "highest mountain peaks",
  31. "desert climate zones",
  32. "island nations list",
  33. "capital cities europe",
  34. "population by continent",
  35. "time zones map",
  36. "latitude longitude coordinates",
  37. "borders between countries",
  38. "ocean currents patterns",
  39. "tectonic plate boundaries",
  40. "climate zones earth",
  41. ]
  42. PHILOSOPHY_QUERIES = [
  43. "stoicism daily practice",
  44. "existentialism meaning life",
  45. "utilitarianism ethics explained",
  46. "kant categorical imperative",
  47. "free will determinism debate",
  48. "nietzsche will to power",
  49. "socrates method questioning",
  50. "plato theory forms",
  51. "aristotle virtue ethics",
  52. "descartes cogito ergo sum",
  53. "logic propositional calculus",
  54. "epistemology knowledge theory",
  55. "metaphysics existence reality",
  56. ]
  57. HISTORY_QUERIES = [
  58. "ancient civilizations timeline",
  59. "roman empire fall reasons",
  60. "medieval period events",
  61. "renaissance art movement",
  62. "industrial revolution inventions",
  63. "world war i causes",
  64. "cold war key events",
  65. "french revolution timeline",
  66. "american civil war battles",
  67. "egyptian pharaohs dynasty",
  68. "bronze age collapse",
  69. "byzantine empire history",
  70. "vietnam war timeline",
  71. ]
  72. SCIENCE_QUERIES = [
  73. "quantum mechanics basics",
  74. "theory of relativity explained",
  75. "dna structure discovery",
  76. "photosynthesis process steps",
  77. "black holes physics",
  78. "plate tectonics theory",
  79. "evolution natural selection",
  80. "periodic table elements",
  81. "cell biology fundamentals",
  82. "climate change evidence",
  83. ]
  84. ARTS_CULTURE_QUERIES = [
  85. "impressionist painters list",
  86. "shakespeare plays summary",
  87. "classical music composers",
  88. "modern art movements",
  89. "film noir characteristics",
  90. "jazz history origins",
  91. "renaissance sculpture techniques",
  92. "photography composition rules",
  93. "poetry forms haiku",
  94. "baroque art characteristics",
  95. "street art graffiti history",
  96. ]
  97. HEALTH_MEDICINE_QUERIES = [
  98. "symptoms of vitamin deficiency",
  99. "how vaccines work immune system",
  100. "blood pressure normal range",
  101. "sleep hygiene tips",
  102. "intermittent fasting benefits",
  103. "anxiety coping strategies",
  104. "stretching exercises back pain",
  105. "heart disease prevention",
  106. "diabetes type 2 management",
  107. "meditation mental health",
  108. "nutrition macros explained",
  109. "first aid basics",
  110. ]
  111. BUSINESS_FINANCE_QUERIES = [
  112. "compound interest calculator",
  113. "stock market basics beginners",
  114. "startup funding stages",
  115. "tax deductions small business",
  116. "budgeting methods 50 30 20",
  117. "cryptocurrency explained simply",
  118. "inflation effects on savings",
  119. "retirement planning strategies",
  120. "passive income ideas",
  121. "venture capital vs angel investors",
  122. "balance sheet basics",
  123. "supply chain management",
  124. ]
  125. SPORTS_QUERIES = [
  126. "marathon training schedule",
  127. "weightlifting proper form",
  128. "swimming stroke techniques",
  129. "tennis serve mechanics",
  130. "basketball dribbling drills",
  131. "soccer formations tactics",
  132. "golf swing fundamentals",
  133. "yoga poses beginners",
  134. "running injury prevention",
  135. "cycling gear ratios",
  136. "rock climbing grades",
  137. "surfing wave types",
  138. ]
  139. TRAVEL_QUERIES = [
  140. "best time visit japan",
  141. "travel packing checklist",
  142. "budget backpacking europe",
  143. "visa requirements usa",
  144. "jet lag remedies",
  145. "road trip planning tips",
  146. "solo travel safety",
  147. "airport security rules",
  148. "travel insurance coverage",
  149. "language apps learning",
  150. "hostel vs hotel comparison",
  151. "travel photography tips",
  152. ]
  153. FOOD_COOKING_QUERIES = [
  154. "bread baking techniques",
  155. "knife skills basics",
  156. "fermentation at home",
  157. "meal prep weekly",
  158. "spice combinations guide",
  159. "pasta making fresh",
  160. "coffee brewing methods",
  161. "wine pairing basics",
  162. "vegetarian protein sources",
  163. "food storage guidelines",
  164. "sourdough starter maintenance",
  165. "grilling temperature chart",
  166. ]
  167. PSYCHOLOGY_QUERIES = [
  168. "cognitive biases list",
  169. "attachment theory styles",
  170. "maslow hierarchy needs",
  171. "growth mindset vs fixed",
  172. "emotional intelligence components",
  173. "memory techniques mnemonics",
  174. "habit formation science",
  175. "stress response fight flight",
  176. "personality types myers briggs",
  177. "motivation intrinsic extrinsic",
  178. "decision making psychology",
  179. "procrastination causes solutions",
  180. ]
  181. ENVIRONMENT_NATURE_QUERIES = [
  182. "renewable energy types",
  183. "carbon footprint reduction",
  184. "composting basics home",
  185. "endangered species list",
  186. "recycling symbols meaning",
  187. "ocean plastic pollution",
  188. "deforestation effects",
  189. "sustainable living tips",
  190. "wildlife conservation efforts",
  191. "solar panel installation",
  192. "water conservation methods",
  193. "biodiversity importance",
  194. ]
  195. MATH_QUERIES = [
  196. "calculus derivatives explained",
  197. "probability basics statistics",
  198. "linear algebra matrices",
  199. "geometry proofs theorems",
  200. "logarithms rules properties",
  201. "trigonometry identities",
  202. "set theory basics",
  203. "prime numbers properties",
  204. "fractions decimals conversion",
  205. "algebra equations solving",
  206. "graph theory fundamentals",
  207. "combinatorics permutations",
  208. ]
  209. LANGUAGE_QUERIES = [
  210. "spanish verb conjugation",
  211. "japanese hiragana katakana",
  212. "french pronunciation rules",
  213. "german cases grammar",
  214. "mandarin tones guide",
  215. "latin phrases common",
  216. "arabic alphabet basics",
  217. "english idioms meanings",
  218. "sign language basics",
  219. "etymology word origins",
  220. "grammar punctuation rules",
  221. "writing style guides",
  222. ]
  223. DIY_CRAFTS_QUERIES = [
  224. "woodworking joints types",
  225. "knitting patterns beginners",
  226. "home repair basics",
  227. "sewing machine threading",
  228. "painting techniques acrylic",
  229. "pottery wheel basics",
  230. "electronics soldering guide",
  231. "gardening soil preparation",
  232. "candle making supplies",
  233. "leather crafting tools",
  234. "origami folding instructions",
  235. "furniture restoration tips",
  236. ]
  237. # Temporal/Recency queries (matches evals/queries.txt requirements)
  238. TEMPORAL_TEMPLATES = [
  239. "latest {topic} updates",
  240. "recent {topic} changes {year}",
  241. "what changed in {topic} {year}",
  242. "{topic} changelog {year}",
  243. "{topic} new features {year}",
  244. "{topic} latest version release",
  245. "{topic} recent news {month}",
  246. ]
  247. TEMPORAL_TOPICS = [
  248. "Shopify",
  249. "React",
  250. "Kubernetes",
  251. "Docker",
  252. "TypeScript",
  253. "Python",
  254. "AWS",
  255. "GitHub",
  256. "Next.js",
  257. "Vue",
  258. "AI",
  259. "machine learning",
  260. "climate tech",
  261. "space exploration",
  262. ]
  263. # Named entity queries (critical for entity preservation testing)
  264. NAMED_ENTITY_QUERIES = [
  265. "who is TDS motorsports",
  266. "React hooks tutorial",
  267. "Docker container networking",
  268. "Kubernetes pod deployment",
  269. "AWS Lambda functions setup",
  270. "Stripe payment integration",
  271. "GitHub Actions workflow",
  272. "Vercel deployment guide",
  273. "Supabase auth configuration",
  274. "Twilio SMS API",
  275. "Datadog monitoring setup",
  276. "Sentry error tracking",
  277. "Terraform AWS provider",
  278. "Ansible playbook examples",
  279. ]
  280. # Generate temporal queries with recent dates
  281. def generate_temporal_queries():
  282. queries = []
  283. current_year = datetime.now().year
  284. months = [
  285. "January",
  286. "February",
  287. "March",
  288. "April",
  289. "May",
  290. "June",
  291. "July",
  292. "August",
  293. "September",
  294. "October",
  295. "November",
  296. "December",
  297. ]
  298. for template in TEMPORAL_TEMPLATES:
  299. for topic in TEMPORAL_TOPICS:
  300. if "{year}" in template:
  301. # Use current year and previous year
  302. for year in [current_year, current_year - 1]:
  303. queries.append(template.format(topic=topic, year=year))
  304. elif "{month}" in template:
  305. # Use recent months
  306. for month in months[-3:]: # Last 3 months
  307. queries.append(template.format(topic=topic, month=month))
  308. else:
  309. queries.append(template.format(topic=topic))
  310. return list(set(queries)) # Remove duplicates
  311. def generate_expansion(query: str) -> str:
  312. """Generate a realistic expansion for a query."""
  313. # This is a template-based generator - in production, use Claude API
  314. lex_variations = [
  315. f"{query} guide",
  316. f"{query} documentation",
  317. f"{query} tutorial",
  318. f"{query} examples",
  319. f"{query} best practices",
  320. ]
  321. vec_variations = [
  322. f"how to {query}",
  323. f"guide for {query}",
  324. f"learn about {query}",
  325. f"understanding {query}",
  326. f"complete {query} reference",
  327. ]
  328. # Select 2-3 lex and 2 vec variations
  329. selected_lex = random.sample(lex_variations, min(3, len(lex_variations)))
  330. selected_vec = random.sample(vec_variations, min(2, len(vec_variations)))
  331. # Generate hyde passage
  332. hyde = f"This comprehensive guide covers everything you need to know about {query}. It includes practical examples, best practices, and troubleshooting tips for beginners and advanced users alike."
  333. output_lines = []
  334. for lex in selected_lex:
  335. output_lines.append(f"lex: {lex}")
  336. for vec in selected_vec:
  337. output_lines.append(f"vec: {vec}")
  338. output_lines.append(f"hyde: {hyde}")
  339. return "\n".join(output_lines)
  340. def main():
  341. """Generate diverse examples and append to training data."""
  342. output_file = Path("data/qmd_expansion_diverse_addon.jsonl")
  343. all_queries = (
  344. TRIVIA_QUERIES
  345. + GEOGRAPHY_QUERIES
  346. + PHILOSOPHY_QUERIES
  347. + HISTORY_QUERIES
  348. + SCIENCE_QUERIES
  349. + ARTS_CULTURE_QUERIES
  350. + HEALTH_MEDICINE_QUERIES
  351. + BUSINESS_FINANCE_QUERIES
  352. + SPORTS_QUERIES
  353. + TRAVEL_QUERIES
  354. + FOOD_COOKING_QUERIES
  355. + PSYCHOLOGY_QUERIES
  356. + ENVIRONMENT_NATURE_QUERIES
  357. + MATH_QUERIES
  358. + LANGUAGE_QUERIES
  359. + DIY_CRAFTS_QUERIES
  360. + generate_temporal_queries()
  361. + NAMED_ENTITY_QUERIES
  362. )
  363. print(f"Generating {len(all_queries)} diverse training examples...")
  364. print(f" - Trivia: {len(TRIVIA_QUERIES)}")
  365. print(f" - Geography: {len(GEOGRAPHY_QUERIES)}")
  366. print(f" - Philosophy: {len(PHILOSOPHY_QUERIES)}")
  367. print(f" - History: {len(HISTORY_QUERIES)}")
  368. print(f" - Science: {len(SCIENCE_QUERIES)}")
  369. print(f" - Arts/Culture: {len(ARTS_CULTURE_QUERIES)}")
  370. print(f" - Health/Medicine: {len(HEALTH_MEDICINE_QUERIES)}")
  371. print(f" - Business/Finance: {len(BUSINESS_FINANCE_QUERIES)}")
  372. print(f" - Sports: {len(SPORTS_QUERIES)}")
  373. print(f" - Travel: {len(TRAVEL_QUERIES)}")
  374. print(f" - Food/Cooking: {len(FOOD_COOKING_QUERIES)}")
  375. print(f" - Psychology: {len(PSYCHOLOGY_QUERIES)}")
  376. print(f" - Environment: {len(ENVIRONMENT_NATURE_QUERIES)}")
  377. print(f" - Math: {len(MATH_QUERIES)}")
  378. print(f" - Language: {len(LANGUAGE_QUERIES)}")
  379. print(f" - DIY/Crafts: {len(DIY_CRAFTS_QUERIES)}")
  380. print(f" - Temporal: {len(generate_temporal_queries())}")
  381. print(f" - Named Entities: {len(NAMED_ENTITY_QUERIES)}")
  382. examples = []
  383. for query in all_queries:
  384. expansion = generate_expansion(query)
  385. output_items = normalize_output_items(parse_output_text(expansion))
  386. examples.append(
  387. {"query": query, "output": output_items, "category": "diverse_addon"}
  388. )
  389. # Write to file
  390. output_file.parent.mkdir(parents=True, exist_ok=True)
  391. with open(output_file, "w") as f:
  392. for ex in examples:
  393. f.write(json.dumps(ex) + "\n")
  394. print(f"\nSaved {len(examples)} diverse examples to {output_file}")
  395. print("\nTo use these examples:")
  396. print(f" cat {output_file} >> data/qmd_expansion_v2.jsonl")
  397. print(" uv run dataset/prepare_data.py --add-short 2")
  398. if __name__ == "__main__":
  399. main()