#!/usr/bin/env python3 """Generate synthetic training data for QMD query expansion using Claude API.""" import argparse import json import os import random from pathlib import Path try: import anthropic except ImportError: print("Install anthropic: pip install anthropic") exit(1) # Sample query templates for diverse training data - organized by category QUERY_TEMPLATES = [ # === Technical documentation (35% of queries) === "how to {action} {technology}", "{technology} {concept} example", "configure {technology} for {use_case}", "{error_type} error in {technology}", "best practices for {concept}", "{technology} vs {technology2}", "{action} {technology} {use_case}", "setup {technology} {use_case}", "{technology} tutorial for beginners", "{technology} documentation", "{technology} {error_type} troubleshooting", "{concept} in {technology}", "migrate from {technology} to {technology2}", "{action} {concept} {technology}", # === Personal notes / journals (15% of queries) === "meeting notes {topic}", "ideas for {project}", "{date} journal entry", "thoughts on {topic}", "{project} {topic} notes", "{topic} meeting {date}", "reflect on {topic}", "brainstorm {project}", # === Research / learning (20% of queries) === "what is {concept}", "difference between {thing1} and {thing2}", "{topic} tutorial", "learn {skill}", "understand {concept}", "explain {concept}", "{topic} fundamentals", "intro to {skill}", "{thing1} or {thing2}", "when to use {concept}", # === Short / keyword queries (15% of queries) === "{keyword}", "{keyword} {modifier}", "{keyword} {action}", "{keyword} {use_case}", "{technology} {keyword}", "{concept} {keyword}", # === Temporal / recency queries (10% of queries) === "latest {topic}", "recent {concept} changes", "new {technology} features", "{topic} update {date}", "what changed in {technology}", "{technology} changelog {date}", "{topic} news {date}", # === Named entities / specific topics (5% of queries) === "{named_entity} {topic}", "{person} {concept}", "{organization} {use_case}", "{product} {action}", ] # Category weights for balanced sampling TEMPLATE_CATEGORIES = { "technical": list(range(0, 14)), # 0-13 "personal": list(range(14, 22)), # 14-21 "research": list(range(22, 31)), # 22-30 "short": list(range(31, 36)), # 31-35 "temporal": list(range(36, 42)), # 36-41 "entities": list(range(42, 46)), # 42-45 } ACTIONS = [ "install", "configure", "setup", "debug", "deploy", "test", "optimize", "migrate", "build", "run", "lint", "format", "backup", "restore", "update", "rollback", "monitor", "scale", "secure", "integrate", "automate", "refactor", "initialize", ] TECHNOLOGIES = [ # Languages "python", "typescript", "javascript", "rust", "golang", "java", "kotlin", "swift", "ruby", "php", "cpp", "c", "elixir", "scala", "clojure", "dart", # Frameworks/Frontend "react", "vue", "angular", "svelte", "solid", "htmx", "alpine", "nextjs", "nuxt", # Backend "django", "flask", "fastapi", "express", "rails", "spring", "laravel", # Infrastructure "docker", "kubernetes", "terraform", "ansible", "jenkins", "github-actions", # Databases "postgres", "mysql", "mongodb", "redis", "elasticsearch", "sqlite", "dynamodb", "cassandra", "cockroachdb", "supabase", "firebase", # Tools "git", "nginx", "apache", "linux", "aws", "gcp", "azure", "vercel", "netlify", # Data/ML "pandas", "numpy", "tensorflow", "pytorch", "scikit-learn", "jupyter", "spark", "kafka", "airflow", "dbt", ] TECHNOLOGIES_2 = [ "docker", "kubernetes", "postgres", "mysql", "redis", "mongodb", "aws", "gcp", "react", "vue", "angular", "python", "javascript", "typescript", "github-actions", "gitlab-ci", "jenkins", "terraform", "ansible", ] CONCEPTS = [ "authentication", "caching", "logging", "testing", "deployment", "API", "database", "security", "monitoring", "performance", "scalability", "reliability", "observability", "microservices", "serverless", "virtualization", "containerization", "orchestration", "CI/CD", "version control", "dependency injection", "event sourcing", "CQRS", "load balancing", "rate limiting", "circuit breaker", "retry logic", "idempotency", ] USE_CASES = [ "production", "development", "CI/CD", "local", "cloud", "staging", "testing", "microservices", "serverless", "hybrid", "multi-tenant", "high-availability", "real-time", "batch processing", "stream processing", "data pipeline", ] ERROR_TYPES = [ "connection", "timeout", "permission", "memory", "syntax", "runtime", "configuration", "dependency", "network", "authentication", "authorization", "validation", "concurrency", "deadlock", "resource", "quota", ] TOPICS = [ "productivity", "workflow", "architecture", "design", "performance", "security", "scalability", "reliability", "observability", "maintainability", "testing", "documentation", "refactoring", "debugging", "optimization", "best practices", "patterns", "anti-patterns", "trade-offs", "decision making", ] KEYWORDS = [ "auth", "config", "setup", "api", "cache", "log", "test", "debug", "env", "vars", "secrets", "tokens", "headers", "params", "query", "body", "route", "middleware", "handler", "controller", "model", "view", "template", "migration", "seed", "fixture", "mock", "stub", "spy", "fake", "build", "bundle", "compile", "transpile", "minify", "optimize", "deploy", "release", "rollback", "promote", "freeze", "thaw", "pull", "push", "commit", "merge", "rebase", "cherry-pick", "stash", "up", "down", "scale", "restart", "reload", "refresh", "flush", "cron", "queue", "job", "worker", "scheduler", "trigger", "webhook", "alert", "metric", "trace", "span", "event", "incident", "oncall", ] MODIFIERS = [ "best", "fast", "simple", "advanced", "secure", "quick", "easy", "proper", "correct", "safe", "efficient", "reliable", "robust", "latest", "recent", "new", "old", "legacy", "modern", "local", "remote", "global", "shared", "private", "public", ] NAMED_ENTITIES = [ "React", "Vue", "Angular", "Docker", "Kubernetes", "AWS", "GCP", "GitHub", "GitLab", "Vercel", "Netlify", "Supabase", "Firebase", "Stripe", "Twilio", "SendGrid", "Datadog", "PagerDuty", "Sentry", "Terraform", "Ansible", "Jenkins", "CircleCI", "TravisCI", ] PERSONS = [ "Kent Beck", "Martin Fowler", "Robert Martin", "Dave Thomas", "Guido van Rossum", "Brendan Eich", "Ryan Dahl", "Anders Hejlsberg", "Linus Torvalds", "DHH", "Yukihiro Matsumoto", "Rich Hickey", ] ORGANIZATIONS = [ "Google", "Microsoft", "Amazon", "Meta", "Apple", "Netflix", "Spotify", "Stripe", "Shopify", "Airbnb", "Uber", "Lyft", "Slack", "Discord", ] PRODUCTS = [ "VS Code", "IntelliJ", "PyCharm", "WebStorm", "DataGrip", "Postman", "Insomnia", "TablePlus", "Docker Desktop", "Lens", "Figma", "Sketch", "Notion", "Linear", "Jira", "Trello", ] SYSTEM_PROMPT = """You are a search query optimization expert for a markdown document search system called QMD. Your task is to transform user queries into retrieval-optimized outputs with THREE distinct types: 1. **lex** lines: Keyword variations optimized for BM25 full-text search - Short, keyword-focused - Good for exact term matching - 1-3 lines 2. **vec** lines: Semantic reformulations for vector/embedding search - Complete phrases or questions - Capture semantic meaning - 1-3 lines 3. **hyde** line: A hypothetical document passage (HyDE technique) - A realistic passage that would answer the query - Contains domain-specific terminology - Written as if it's FROM a document, not ABOUT the query - MAX 1 line Output format (STRICT - follow exactly): ``` hyde: A passage that would appear in a document answering this query. lex: keyword1 lex: keyword2 vec: semantic query reformulation ``` Rules: - Each line must start with "lex:", "vec:", or "hyde:" - No blank lines - No repetition between lines - hyde should be a realistic document excerpt, not a question - Stay focused on the original query intent""" USER_PROMPT_TEMPLATE = """Generate query expansion outputs for this search query: Query: {query} Respond with ONLY the lex/vec/hyde lines, nothing else.""" # Category weights - BALANCED approach # Tech at 15% (reasonable for QMD's technical document use case) CATEGORY_WEIGHTS = { "technical": 0.15, # 15% - Technical documentation "personal": 0.10, # 10% - Personal notes, journals "research": 0.10, # 10% - Research and learning "short": 0.15, # 15% - Short keyword queries "temporal": 0.10, # 10% - Temporal/recency queries (2025/2026) "entities": 0.05, # 5% - Named entity queries "health": 0.10, # 10% - Health & wellness "finance": 0.10, # 10% - Finance & business "lifestyle": 0.10, # 10% - Home, food, hobbies, travel "education": 0.05, # 5% - Education & arts } def generate_random_query() -> str: """Generate a random query from templates with category-weighted sampling.""" # Select category based on weights categories = list(CATEGORY_WEIGHTS.keys()) weights = list(CATEGORY_WEIGHTS.values()) selected_category = random.choices(categories, weights=weights, k=1)[0] # Select template from that category template_idx = random.choice(TEMPLATE_CATEGORIES[selected_category]) template = QUERY_TEMPLATES[template_idx] # Build replacements based on template type replacements = { "{action}": random.choice(ACTIONS), "{technology}": random.choice(TECHNOLOGIES), "{technology2}": random.choice(TECHNOLOGIES_2), "{concept}": random.choice(CONCEPTS), "{use_case}": random.choice(USE_CASES), "{error_type}": random.choice(ERROR_TYPES), "{topic}": random.choice(TOPICS), "{project}": random.choice( ["website", "app", "CLI tool", "API", "library", "service", "platform"] ), "{date}": random.choice( # Emphasize 2025/2026 for recency queries (current era) [ "2026", "2026", "2025", "2025", "January 2026", "February 2026", "March 2026", "last month", "this week", "yesterday", "today", "recently", "latest", ] ), "{thing1}": random.choice(CONCEPTS[:10]), "{thing2}": random.choice(CONCEPTS[10:] if len(CONCEPTS) > 10 else CONCEPTS), "{skill}": random.choice(TECHNOLOGIES), "{keyword}": random.choice(KEYWORDS), "{modifier}": random.choice(MODIFIERS), "{named_entity}": random.choice(NAMED_ENTITIES), "{person}": random.choice(PERSONS), "{organization}": random.choice(ORGANIZATIONS), "{product}": random.choice(PRODUCTS), } query = template for key, value in replacements.items(): query = query.replace(key, value) return query def generate_expansion(client: anthropic.Anthropic, query: str) -> str | None: """Generate expansion using Claude API.""" try: response = client.messages.create( model="claude-sonnet-4-20250514", max_tokens=300, system=SYSTEM_PROMPT, messages=[ {"role": "user", "content": USER_PROMPT_TEMPLATE.format(query=query)} ], ) return response.content[0].text.strip() except Exception as e: print(f"Error generating expansion for '{query}': {e}") return None def validate_output(output: str) -> bool: """Validate that output follows the expected format.""" lines = output.strip().split("\n") if not lines: return False has_lex = False has_vec = False for line in lines: line = line.strip() if not line: continue if line.startswith("lex:"): has_lex = True elif line.startswith("vec:"): has_vec = True elif line.startswith("hyde:"): pass else: return False # Invalid line type return has_lex and has_vec def main(): parser = argparse.ArgumentParser( description="Generate QMD query expansion training data" ) parser.add_argument( "--count", type=int, default=100, help="Number of examples to generate" ) parser.add_argument( "--output", type=str, default="data/qmd_expansion.jsonl", help="Output file path", ) parser.add_argument( "--queries", type=str, help="Optional file with custom queries (one per line)" ) args = parser.parse_args() api_key = os.environ.get("ANTHROPIC_API_KEY") if not api_key: print("Error: ANTHROPIC_API_KEY environment variable not set") exit(1) client = anthropic.Anthropic(api_key=api_key) output_path = Path(args.output) output_path.parent.mkdir(parents=True, exist_ok=True) # Load custom queries if provided custom_queries = [] if args.queries and Path(args.queries).exists(): custom_queries = Path(args.queries).read_text().strip().split("\n") print(f"Loaded {len(custom_queries)} custom queries") examples = [] seen_queries = set() print(f"Generating {args.count} examples...") i = 0 while len(examples) < args.count: # Use custom query or generate random one if custom_queries and i < len(custom_queries): query = custom_queries[i].strip() else: query = generate_random_query() i += 1 # Skip duplicates if query in seen_queries: continue seen_queries.add(query) # Generate expansion output = generate_expansion(client, query) if output and validate_output(output): examples.append({"input": query, "output": output}) print(f"[{len(examples)}/{args.count}] {query[:50]}...") else: print(f" Skipped invalid output for: {query[:50]}...") # Write output with open(output_path, "w") as f: for example in examples: f.write(json.dumps(example) + "\n") print(f"\nGenerated {len(examples)} examples to {output_path}") if __name__ == "__main__": main()