| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441 |
- #!/usr/bin/env python3
- """
- Generate diverse QMD training examples for underrepresented categories.
- This script creates additional training examples focused on:
- - Trivia, Geography, Philosophy, History (as requested)
- - Temporal/Recency queries (important for evals)
- - Named entity queries (critical for entity preservation scoring)
- """
- import json
- import random
- from pathlib import Path
- from datetime import datetime, timedelta
- from dataset.schema import normalize_output_items, parse_output_text
# ---------------------------------------------------------------------------
# Static seed queries, grouped by subject area.
#
# Each list holds short, keyword-style search queries. They are kept as plain
# lists (not tuples) because main() concatenates them with ``+``.
# ---------------------------------------------------------------------------

# General-knowledge / quiz-style queries.
TRIVIA_QUERIES = [
    "world capitals quiz",
    "trivia facts about space",
    "did you know history",
    "random science facts",
    "famous inventions timeline",
    "world records list",
    "fun geography facts",
    "historical trivia questions",
    "animal trivia facts",
    "sports trivia records",
]

# Physical and political geography.
GEOGRAPHY_QUERIES = [
    "largest countries by area",
    "rivers that cross multiple countries",
    "highest mountain peaks",
    "desert climate zones",
    "island nations list",
    "capital cities europe",
    "population by continent",
    "time zones map",
    "latitude longitude coordinates",
    "borders between countries",
    "ocean currents patterns",
    "tectonic plate boundaries",
    "climate zones earth",
]

# Schools of thought, thinkers, and core concepts.
PHILOSOPHY_QUERIES = [
    "stoicism daily practice",
    "existentialism meaning life",
    "utilitarianism ethics explained",
    "kant categorical imperative",
    "free will determinism debate",
    "nietzsche will to power",
    "socrates method questioning",
    "plato theory forms",
    "aristotle virtue ethics",
    "descartes cogito ergo sum",
    "logic propositional calculus",
    "epistemology knowledge theory",
    "metaphysics existence reality",
]

# Historical periods, events, and timelines.
HISTORY_QUERIES = [
    "ancient civilizations timeline",
    "roman empire fall reasons",
    "medieval period events",
    "renaissance art movement",
    "industrial revolution inventions",
    "world war i causes",
    "cold war key events",
    "french revolution timeline",
    "american civil war battles",
    "egyptian pharaohs dynasty",
    "bronze age collapse",
    "byzantine empire history",
    "vietnam war timeline",
]

# Natural-science fundamentals.
SCIENCE_QUERIES = [
    "quantum mechanics basics",
    "theory of relativity explained",
    "dna structure discovery",
    "photosynthesis process steps",
    "black holes physics",
    "plate tectonics theory",
    "evolution natural selection",
    "periodic table elements",
    "cell biology fundamentals",
    "climate change evidence",
]

# Visual arts, literature, music, and film.
ARTS_CULTURE_QUERIES = [
    "impressionist painters list",
    "shakespeare plays summary",
    "classical music composers",
    "modern art movements",
    "film noir characteristics",
    "jazz history origins",
    "renaissance sculpture techniques",
    "photography composition rules",
    "poetry forms haiku",
    "baroque art characteristics",
    "street art graffiti history",
]

# Health, wellness, and basic medicine.
HEALTH_MEDICINE_QUERIES = [
    "symptoms of vitamin deficiency",
    "how vaccines work immune system",
    "blood pressure normal range",
    "sleep hygiene tips",
    "intermittent fasting benefits",
    "anxiety coping strategies",
    "stretching exercises back pain",
    "heart disease prevention",
    "diabetes type 2 management",
    "meditation mental health",
    "nutrition macros explained",
    "first aid basics",
]

# Personal finance, investing, and business operations.
BUSINESS_FINANCE_QUERIES = [
    "compound interest calculator",
    "stock market basics beginners",
    "startup funding stages",
    "tax deductions small business",
    "budgeting methods 50 30 20",
    "cryptocurrency explained simply",
    "inflation effects on savings",
    "retirement planning strategies",
    "passive income ideas",
    "venture capital vs angel investors",
    "balance sheet basics",
    "supply chain management",
]

# Sports technique and training.
SPORTS_QUERIES = [
    "marathon training schedule",
    "weightlifting proper form",
    "swimming stroke techniques",
    "tennis serve mechanics",
    "basketball dribbling drills",
    "soccer formations tactics",
    "golf swing fundamentals",
    "yoga poses beginners",
    "running injury prevention",
    "cycling gear ratios",
    "rock climbing grades",
    "surfing wave types",
]

# Trip planning and travel logistics.
TRAVEL_QUERIES = [
    "best time visit japan",
    "travel packing checklist",
    "budget backpacking europe",
    "visa requirements usa",
    "jet lag remedies",
    "road trip planning tips",
    "solo travel safety",
    "airport security rules",
    "travel insurance coverage",
    "language apps learning",
    "hostel vs hotel comparison",
    "travel photography tips",
]

# Cooking techniques and kitchen know-how.
FOOD_COOKING_QUERIES = [
    "bread baking techniques",
    "knife skills basics",
    "fermentation at home",
    "meal prep weekly",
    "spice combinations guide",
    "pasta making fresh",
    "coffee brewing methods",
    "wine pairing basics",
    "vegetarian protein sources",
    "food storage guidelines",
    "sourdough starter maintenance",
    "grilling temperature chart",
]

# Psychology concepts and self-improvement.
PSYCHOLOGY_QUERIES = [
    "cognitive biases list",
    "attachment theory styles",
    "maslow hierarchy needs",
    "growth mindset vs fixed",
    "emotional intelligence components",
    "memory techniques mnemonics",
    "habit formation science",
    "stress response fight flight",
    "personality types myers briggs",
    "motivation intrinsic extrinsic",
    "decision making psychology",
    "procrastination causes solutions",
]

# Environment, sustainability, and nature.
ENVIRONMENT_NATURE_QUERIES = [
    "renewable energy types",
    "carbon footprint reduction",
    "composting basics home",
    "endangered species list",
    "recycling symbols meaning",
    "ocean plastic pollution",
    "deforestation effects",
    "sustainable living tips",
    "wildlife conservation efforts",
    "solar panel installation",
    "water conservation methods",
    "biodiversity importance",
]

# Mathematics topics.
MATH_QUERIES = [
    "calculus derivatives explained",
    "probability basics statistics",
    "linear algebra matrices",
    "geometry proofs theorems",
    "logarithms rules properties",
    "trigonometry identities",
    "set theory basics",
    "prime numbers properties",
    "fractions decimals conversion",
    "algebra equations solving",
    "graph theory fundamentals",
    "combinatorics permutations",
]

# Natural languages, grammar, and writing.
LANGUAGE_QUERIES = [
    "spanish verb conjugation",
    "japanese hiragana katakana",
    "french pronunciation rules",
    "german cases grammar",
    "mandarin tones guide",
    "latin phrases common",
    "arabic alphabet basics",
    "english idioms meanings",
    "sign language basics",
    "etymology word origins",
    "grammar punctuation rules",
    "writing style guides",
]

# Hands-on crafts and home projects.
DIY_CRAFTS_QUERIES = [
    "woodworking joints types",
    "knitting patterns beginners",
    "home repair basics",
    "sewing machine threading",
    "painting techniques acrylic",
    "pottery wheel basics",
    "electronics soldering guide",
    "gardening soil preparation",
    "candle making supplies",
    "leather crafting tools",
    "origami folding instructions",
    "furniture restoration tips",
]
# Temporal/recency query building blocks (matches evals/queries.txt
# requirements).
#
# Templates are ``str.format`` patterns. Every template contains ``{topic}``;
# some additionally contain ``{year}`` or ``{month}``, which
# generate_temporal_queries() fills in.
TEMPORAL_TEMPLATES = [
    "latest {topic} updates",
    "recent {topic} changes {year}",
    "what changed in {topic} {year}",
    "{topic} changelog {year}",
    "{topic} new features {year}",
    "{topic} latest version release",
    "{topic} recent news {month}",
]

# Values substituted for ``{topic}``: mostly product/technology names, plus a
# few broader subject areas. Capitalisation is part of the data.
TEMPORAL_TOPICS = [
    "Shopify",
    "React",
    "Kubernetes",
    "Docker",
    "TypeScript",
    "Python",
    "AWS",
    "GitHub",
    "Next.js",
    "Vue",
    "AI",
    "machine learning",
    "climate tech",
    "space exploration",
]
# Queries that contain a proper noun (company, product, or tool name).
# Critical for entity-preservation testing: the exact spelling and casing of
# each name is part of the data and must not be normalised.
NAMED_ENTITY_QUERIES = [
    "who is TDS motorsports",
    "React hooks tutorial",
    "Docker container networking",
    "Kubernetes pod deployment",
    "AWS Lambda functions setup",
    "Stripe payment integration",
    "GitHub Actions workflow",
    "Vercel deployment guide",
    "Supabase auth configuration",
    "Twilio SMS API",
    "Datadog monitoring setup",
    "Sentry error tracking",
    "Terraform AWS provider",
    "Ansible playbook examples",
]
- # Generate temporal queries with recent dates
def generate_temporal_queries(*, now=None, templates=None, topics=None):
    """Build temporal/recency queries from every template/topic combination.

    Each template is rendered for each topic:

    - a template containing ``{year}`` is rendered twice, for the reference
      year and the year before it;
    - a template containing ``{month}`` is rendered three times, for the
      three most recent month names (reference month included), oldest
      first;
    - any other template is rendered once.

    Args:
        now: ``datetime`` used as "today". Defaults to ``datetime.now()``;
            pass a fixed value for reproducible output.
        templates: ``str.format`` patterns using ``{topic}`` and optionally
            ``{year}`` or ``{month}``. Defaults to ``TEMPORAL_TEMPLATES``.
        topics: Values substituted for ``{topic}``. Defaults to
            ``TEMPORAL_TOPICS``.

    Returns:
        A list of unique query strings in generation order (template-major,
        then topic).
    """
    if now is None:
        now = datetime.now()
    if templates is None:
        templates = TEMPORAL_TEMPLATES
    if topics is None:
        topics = TEMPORAL_TOPICS

    # English month names are spelled out (rather than taken from
    # calendar.month_name) so the output does not depend on the locale.
    month_names = [
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
    ]
    # Walk back from the reference month; the modulo wraps across the year
    # boundary (e.g. February -> December, January, February).
    recent_months = [
        month_names[(now.month - 1 - back) % 12] for back in (2, 1, 0)
    ]
    years = [now.year, now.year - 1]

    queries = []
    for template in templates:
        for topic in topics:
            if "{year}" in template:
                queries.extend(
                    template.format(topic=topic, year=year) for year in years
                )
            elif "{month}" in template:
                queries.extend(
                    template.format(topic=topic, month=month)
                    for month in recent_months
                )
            else:
                queries.append(template.format(topic=topic))

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is identical from run to run. A set would reorder the strings
    # depending on the interpreter's hash seed.
    return list(dict.fromkeys(queries))
def generate_expansion(query: str) -> str:
    """Produce a template-based expansion block for *query*.

    The result is a newline-joined string of six lines: three ``lex:`` lines
    and two ``vec:`` lines, each picked at random (without repetition) from
    five fixed phrasings, followed by one ``hyde:`` line holding a generic
    passage about the query.

    This is a placeholder generator; in production, use the Claude API
    instead of these fixed templates.
    """
    # Keyword-style variants: the query followed by a fixed suffix.
    lex_pool = [
        f"{query} {suffix}"
        for suffix in (
            "guide",
            "documentation",
            "tutorial",
            "examples",
            "best practices",
        )
    ]
    # Natural-language variants.
    vec_pool = [
        f"how to {query}",
        f"guide for {query}",
        f"learn about {query}",
        f"understanding {query}",
        f"complete {query} reference",
    ]

    # Draw order matters for seeded reproducibility: lex first, then vec.
    lex_picks = random.sample(lex_pool, min(3, len(lex_pool)))
    vec_picks = random.sample(vec_pool, min(2, len(vec_pool)))

    hyde_passage = (
        f"This comprehensive guide covers everything you need to know about {query}. "
        "It includes practical examples, best practices, and troubleshooting tips "
        "for beginners and advanced users alike."
    )

    rendered = [f"lex: {text}" for text in lex_picks]
    rendered += [f"vec: {text}" for text in vec_picks]
    rendered.append(f"hyde: {hyde_passage}")
    return "\n".join(rendered)
def main():
    """Generate one template-based example per query and write them as JSONL.

    Collects the static category lists plus the generated temporal queries,
    builds an example for each query with ``generate_expansion``, and writes
    the examples to ``data/qmd_expansion_diverse_addon.jsonl``, one JSON
    object per line. The add-on file is overwritten on every run; merging it
    into the main training set is a separate manual step whose commands are
    printed at the end.
    """
    output_file = Path("data/qmd_expansion_diverse_addon.jsonl")

    # One table drives both the combined query list and the per-category
    # summary, so the two cannot drift apart. The temporal queries are
    # generated exactly once here rather than once for the data and again
    # for the count.
    categories = [
        ("Trivia", TRIVIA_QUERIES),
        ("Geography", GEOGRAPHY_QUERIES),
        ("Philosophy", PHILOSOPHY_QUERIES),
        ("History", HISTORY_QUERIES),
        ("Science", SCIENCE_QUERIES),
        ("Arts/Culture", ARTS_CULTURE_QUERIES),
        ("Health/Medicine", HEALTH_MEDICINE_QUERIES),
        ("Business/Finance", BUSINESS_FINANCE_QUERIES),
        ("Sports", SPORTS_QUERIES),
        ("Travel", TRAVEL_QUERIES),
        ("Food/Cooking", FOOD_COOKING_QUERIES),
        ("Psychology", PSYCHOLOGY_QUERIES),
        ("Environment", ENVIRONMENT_NATURE_QUERIES),
        ("Math", MATH_QUERIES),
        ("Language", LANGUAGE_QUERIES),
        ("DIY/Crafts", DIY_CRAFTS_QUERIES),
        ("Temporal", generate_temporal_queries()),
        ("Named Entities", NAMED_ENTITY_QUERIES),
    ]
    all_queries = [query for _, queries in categories for query in queries]

    print(f"Generating {len(all_queries)} diverse training examples...")
    for label, queries in categories:
        print(f" - {label}: {len(queries)}")

    examples = []
    for query in all_queries:
        expansion = generate_expansion(query)
        # parse_output_text / normalize_output_items come from dataset.schema;
        # presumably they turn the "lex:/vec:/hyde:" text into the structured
        # item format used by the training data -- confirm against that module.
        output_items = normalize_output_items(parse_output_text(expansion))
        examples.append(
            {"query": query, "output": output_items, "category": "diverse_addon"}
        )

    # Write to file (overwrites any previous add-on file).
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(ex) + "\n" for ex in examples)

    print(f"\nSaved {len(examples)} diverse examples to {output_file}")
    print("\nTo use these examples:")
    print(f" cat {output_file} >> data/qmd_expansion_v2.jsonl")
    print(" uv run dataset/prepare_data.py --add-short 2")


if __name__ == "__main__":
    main()
|