| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711 |
- #!/usr/bin/env python3
- """Generate synthetic training data for QMD query expansion using Claude API."""
- import argparse
- import json
- import os
- import random
- from pathlib import Path
- try:
- import anthropic
- except ImportError:
- print("Install anthropic: pip install anthropic")
- exit(1)
# Sample query templates for diverse training data, grouped by the category
# that owns them.  Keeping every template next to its category means the index
# table below is derived rather than hand-maintained, so adding or removing a
# template can never shift another category's indices.  How often a category
# is sampled is decided by CATEGORY_WEIGHTS, not by how many templates it has.
_TEMPLATES_BY_CATEGORY = {
    # === Technical documentation ===
    "technical": [
        "how to {action} {technology}",
        "{technology} {concept} example",
        "configure {technology} for {use_case}",
        "{error_type} error in {technology}",
        "best practices for {concept}",
        "{technology} vs {technology2}",
        "{action} {technology} {use_case}",
        "setup {technology} {use_case}",
        "{technology} tutorial for beginners",
        "{technology} documentation",
        "{technology} {error_type} troubleshooting",
        "{concept} in {technology}",
        "migrate from {technology} to {technology2}",
        "{action} {concept} {technology}",
    ],
    # === Personal notes / journals ===
    "personal": [
        "meeting notes {topic}",
        "ideas for {project}",
        "{date} journal entry",
        "thoughts on {topic}",
        "{project} {topic} notes",
        "{topic} meeting {date}",
        "reflect on {topic}",
        "brainstorm {project}",
    ],
    # === Research / learning ===
    "research": [
        "what is {concept}",
        "difference between {thing1} and {thing2}",
        "{topic} tutorial",
        "learn {skill}",
        "understand {concept}",
        "explain {concept}",
        "{topic} fundamentals",
        "intro to {skill}",
        "{thing1} or {thing2}",
        "when to use {concept}",
    ],
    # === Short / keyword queries ===
    "short": [
        "{keyword}",
        "{keyword} {modifier}",
        "{keyword} {action}",
        "{keyword} {use_case}",
        "{technology} {keyword}",
        "{concept} {keyword}",
    ],
    # === Temporal / recency queries ===
    "temporal": [
        "latest {topic}",
        "recent {concept} changes",
        "new {technology} features",
        "{topic} update {date}",
        "what changed in {technology}",
        "{technology} changelog {date}",
        "{topic} news {date}",
    ],
    # === Named entities / specific topics ===
    "entities": [
        "{named_entity} {topic}",
        "{person} {concept}",
        "{organization} {use_case}",
        "{product} {action}",
    ],
}

# Flat list of every template, in category order (technical first, entities
# last); the order of templates inside each category is preserved.
QUERY_TEMPLATES: list[str] = []

# Category name -> indices into QUERY_TEMPLATES that belong to that category.
TEMPLATE_CATEGORIES: dict[str, list[int]] = {}

for _category, _templates in _TEMPLATES_BY_CATEGORY.items():
    _first = len(QUERY_TEMPLATES)
    QUERY_TEMPLATES.extend(_templates)
    TEMPLATE_CATEGORIES[_category] = list(range(_first, len(QUERY_TEMPLATES)))
# Verbs substituted for the {action} placeholder in the query templates.
ACTIONS = [
    "install", "configure", "setup", "debug", "deploy", "test",
    "optimize", "migrate", "build", "run", "lint", "format",
    "backup", "restore", "update", "rollback", "monitor", "scale",
    "secure", "integrate", "automate", "refactor", "initialize",
]
# Technology names substituted for the {technology} and {skill} placeholders.
TECHNOLOGIES = [
    # Languages
    "python", "typescript", "javascript", "rust", "golang", "java", "kotlin",
    "swift", "ruby", "php", "cpp", "c", "elixir", "scala", "clojure", "dart",
    # Frameworks/Frontend
    "react", "vue", "angular", "svelte", "solid", "htmx", "alpine", "nextjs",
    "nuxt",
    # Backend
    "django", "flask", "fastapi", "express", "rails", "spring", "laravel",
    # Infrastructure
    "docker", "kubernetes", "terraform", "ansible", "jenkins", "github-actions",
    # Databases
    "postgres", "mysql", "mongodb", "redis", "elasticsearch", "sqlite",
    "dynamodb", "cassandra", "cockroachdb", "supabase", "firebase",
    # Tools
    "git", "nginx", "apache", "linux", "aws", "gcp", "azure", "vercel",
    "netlify",
    # Data/ML
    "pandas", "numpy", "tensorflow", "pytorch", "scikit-learn", "jupyter",
    "spark", "kafka", "airflow", "dbt",
]
# Second technology for comparison/migration templates ({technology2}).
TECHNOLOGIES_2 = [
    "docker", "kubernetes", "postgres", "mysql", "redis", "mongodb",
    "aws", "gcp", "react", "vue", "angular", "python", "javascript",
    "typescript", "github-actions", "gitlab-ci", "jenkins", "terraform",
    "ansible",
]
# Engineering concepts substituted for the {concept} placeholder.
# Order matters: generate_random_query draws {thing1} from the first ten
# entries and {thing2} from the remainder.
CONCEPTS = [
    # first ten entries
    "authentication", "caching", "logging", "testing", "deployment",
    "API", "database", "security", "monitoring", "performance",
    # remaining entries
    "scalability", "reliability", "observability", "microservices",
    "serverless", "virtualization", "containerization", "orchestration",
    "CI/CD", "version control", "dependency injection", "event sourcing",
    "CQRS", "load balancing", "rate limiting", "circuit breaker",
    "retry logic", "idempotency",
]
# Deployment contexts / workloads substituted for the {use_case} placeholder.
USE_CASES = [
    "production", "development", "CI/CD", "local", "cloud", "staging",
    "testing", "microservices", "serverless", "hybrid", "multi-tenant",
    "high-availability", "real-time", "batch processing",
    "stream processing", "data pipeline",
]
# Failure categories substituted for the {error_type} placeholder.
ERROR_TYPES = [
    "connection", "timeout", "permission", "memory",
    "syntax", "runtime", "configuration", "dependency",
    "network", "authentication", "authorization", "validation",
    "concurrency", "deadlock", "resource", "quota",
]
# Subject areas substituted for the {topic} placeholder.
TOPICS = [
    "productivity", "workflow", "architecture", "design", "performance",
    "security", "scalability", "reliability", "observability",
    "maintainability", "testing", "documentation", "refactoring",
    "debugging", "optimization", "best practices", "patterns",
    "anti-patterns", "trade-offs", "decision making",
]
# Terse single-word terms substituted for the {keyword} placeholder.
KEYWORDS = [
    "auth", "config", "setup", "api", "cache", "log", "test", "debug",
    "env", "vars", "secrets", "tokens", "headers", "params", "query", "body",
    "route", "middleware", "handler", "controller", "model", "view",
    "template", "migration", "seed", "fixture", "mock", "stub", "spy", "fake",
    "build", "bundle", "compile", "transpile", "minify", "optimize",
    "deploy", "release", "rollback", "promote", "freeze", "thaw",
    "pull", "push", "commit", "merge", "rebase", "cherry-pick", "stash",
    "up", "down", "scale", "restart", "reload", "refresh", "flush",
    "cron", "queue", "job", "worker", "scheduler", "trigger", "webhook",
    "alert", "metric", "trace", "span", "event", "incident", "oncall",
]
# Qualifiers substituted for the {modifier} placeholder.
MODIFIERS = [
    "best", "fast", "simple", "advanced", "secure", "quick", "easy",
    "proper", "correct", "safe", "efficient", "reliable", "robust",
    "latest", "recent", "new", "old", "legacy", "modern",
    "local", "remote", "global", "shared", "private", "public",
]
# Properly-cased names substituted for the {named_entity} placeholder.
NAMED_ENTITIES = [
    "React", "Vue", "Angular", "Docker", "Kubernetes", "AWS", "GCP",
    "GitHub", "GitLab", "Vercel", "Netlify", "Supabase", "Firebase",
    "Stripe", "Twilio", "SendGrid", "Datadog", "PagerDuty", "Sentry",
    "Terraform", "Ansible", "Jenkins", "CircleCI", "TravisCI",
]
# People's names substituted for the {person} placeholder.
PERSONS = [
    "Kent Beck", "Martin Fowler", "Robert Martin", "Dave Thomas",
    "Guido van Rossum", "Brendan Eich", "Ryan Dahl", "Anders Hejlsberg",
    "Linus Torvalds", "DHH", "Yukihiro Matsumoto", "Rich Hickey",
]
# Company names substituted for the {organization} placeholder.
ORGANIZATIONS = [
    "Google", "Microsoft", "Amazon", "Meta", "Apple", "Netflix", "Spotify",
    "Stripe", "Shopify", "Airbnb", "Uber", "Lyft", "Slack", "Discord",
]
# Tool/product names substituted for the {product} placeholder.
PRODUCTS = [
    "VS Code", "IntelliJ", "PyCharm", "WebStorm", "DataGrip", "Postman",
    "Insomnia", "TablePlus", "Docker Desktop", "Lens", "Figma", "Sketch",
    "Notion", "Linear", "Jira", "Trello",
]
# System prompt sent with every expansion request.  It defines the
# lex:/vec:/hyde: line format whose prefixes validate_output checks.
# Kept as one entry per prompt line so individual rules are easy to diff.
SYSTEM_PROMPT = "\n".join(
    (
        "You are a search query optimization expert for a markdown document search system called QMD.",
        "Your task is to transform user queries into retrieval-optimized outputs with THREE distinct types:",
        "1. **lex** lines: Keyword variations optimized for BM25 full-text search",
        "- Short, keyword-focused",
        "- Good for exact term matching",
        "- 1-3 lines",
        "2. **vec** lines: Semantic reformulations for vector/embedding search",
        "- Complete phrases or questions",
        "- Capture semantic meaning",
        "- 1-3 lines",
        "3. **hyde** line: A hypothetical document passage (HyDE technique)",
        "- A realistic passage that would answer the query",
        "- Contains domain-specific terminology",
        "- Written as if it's FROM a document, not ABOUT the query",
        "- MAX 1 line",
        "Output format (STRICT - follow exactly):",
        "```",
        "lex: keyword1",
        "lex: keyword2",
        "vec: semantic query reformulation",
        "hyde: A passage that would appear in a document answering this query.",
        "```",
        "Rules:",
        '- Each line must start with "lex:", "vec:", or "hyde:"',
        "- No blank lines",
        "- No repetition between lines",
        "- hyde should be a realistic document excerpt, not a question",
        "- Stay focused on the original query intent",
    )
)
# Per-request user message; the single {query} field is filled via str.format.
USER_PROMPT_TEMPLATE = (
    "Generate query expansion outputs for this search query:\n"
    "Query: {query}\n"
    "Respond with ONLY the lex/vec/hyde lines, nothing else."
)
# Relative sampling weight per query category (balanced mix; values sum to 1.0).
# Technical queries sit at 15%, which is reasonable for QMD's technical
# document use case.
# NOTE(review): "health", "finance", "lifestyle" and "education" have no entry
# in TEMPLATE_CATEGORIES in this file, so they can only yield queries once
# templates for them exist -- confirm whether those templates live elsewhere.
CATEGORY_WEIGHTS = dict(
    technical=0.15,  # technical documentation
    personal=0.10,   # personal notes, journals
    research=0.10,   # research and learning
    short=0.15,      # short keyword queries
    temporal=0.10,   # temporal/recency queries (2025/2026)
    entities=0.05,   # named entity queries
    health=0.10,     # health & wellness
    finance=0.10,    # finance & business
    lifestyle=0.10,  # home, food, hobbies, travel
    education=0.05,  # education & arts
)
def generate_random_query() -> str:
    """Build one random search query from a category-weighted template.

    A category is drawn according to CATEGORY_WEIGHTS, a template is picked
    uniformly from that category's entries in TEMPLATE_CATEGORIES, and every
    ``{placeholder}`` in the template is replaced with a random value from the
    matching vocabulary list.

    Returns:
        The filled-in query string.

    Raises:
        ValueError: if no weighted category has any template to draw from.
    """
    # Only categories that actually own templates can be sampled.  A weight
    # for a category that is missing from TEMPLATE_CATEGORIES (or has an empty
    # index list) would otherwise fail the lookup below with KeyError or
    # IndexError.  random.choices normalises the remaining weights itself.
    candidates = [
        (name, weight)
        for name, weight in CATEGORY_WEIGHTS.items()
        if TEMPLATE_CATEGORIES.get(name)
    ]
    if not candidates:
        raise ValueError(
            "no category in CATEGORY_WEIGHTS has templates in TEMPLATE_CATEGORIES"
        )
    names, weights = zip(*candidates)
    category = random.choices(names, weights=weights, k=1)[0]
    template = QUERY_TEMPLATES[random.choice(TEMPLATE_CATEGORIES[category])]

    # Pick the comparison technology so it differs from the primary one;
    # otherwise templates such as "{technology} vs {technology2}" can produce
    # "docker vs docker".  Fall back to the full list if filtering empties it.
    technology = random.choice(TECHNOLOGIES)
    other_technologies = [t for t in TECHNOLOGIES_2 if t != technology]
    technology2 = random.choice(other_technologies or TECHNOLOGIES_2)

    # {thing1} comes from the first ten concepts and {thing2} from the rest.
    # When there are ten or fewer concepts the pools overlap, so the value
    # already chosen for {thing1} is excluded where possible.
    thing1 = random.choice(CONCEPTS[:10])
    thing2_pool = [c for c in (CONCEPTS[10:] or CONCEPTS) if c != thing1]
    thing2 = random.choice(thing2_pool or CONCEPTS)

    projects = ("website", "app", "CLI tool", "API", "library", "service", "platform")
    # 2025/2026 appear twice to emphasise the current era in recency queries.
    dates = (
        "2026",
        "2026",
        "2025",
        "2025",
        "January 2026",
        "February 2026",
        "March 2026",
        "last month",
        "this week",
        "yesterday",
        "today",
        "recently",
        "latest",
    )

    values = {
        "action": random.choice(ACTIONS),
        "technology": technology,
        "technology2": technology2,
        "concept": random.choice(CONCEPTS),
        "use_case": random.choice(USE_CASES),
        "error_type": random.choice(ERROR_TYPES),
        "topic": random.choice(TOPICS),
        "project": random.choice(projects),
        "date": random.choice(dates),
        "thing1": thing1,
        "thing2": thing2,
        "skill": random.choice(TECHNOLOGIES),
        "keyword": random.choice(KEYWORDS),
        "modifier": random.choice(MODIFIERS),
        "named_entity": random.choice(NAMED_ENTITIES),
        "person": random.choice(PERSONS),
        "organization": random.choice(ORGANIZATIONS),
        "product": random.choice(PRODUCTS),
    }

    # Plain substring replacement (rather than str.format) leaves any unknown
    # placeholder or stray brace in a template untouched instead of raising.
    query = template
    for name, value in values.items():
        query = query.replace("{" + name + "}", value)
    return query
def generate_expansion(
    client: anthropic.Anthropic,
    query: str,
    *,
    model: str = "claude-sonnet-4-20250514",
    max_tokens: int = 300,
) -> str | None:
    """Ask Claude for the lex/vec/hyde expansion of *query*.

    Args:
        client: Anthropic API client used to send the request.
        query: The search query to expand.
        model: Model identifier passed to the Messages API.
        max_tokens: Upper bound on generated tokens for the reply.

    Returns:
        The stripped reply text, or ``None`` when the request fails or the
        reply contains no text.  Failures are reported on stdout and never
        propagate, so one bad request cannot abort a long generation run.
    """
    try:
        response = client.messages.create(
            model=model,
            max_tokens=max_tokens,
            system=SYSTEM_PROMPT,
            messages=[
                {"role": "user", "content": USER_PROMPT_TEMPLATE.format(query=query)}
            ],
        )
        # Join every text block instead of assuming the reply is exactly one
        # text block at index 0; non-text blocks are skipped.
        text = "".join(
            block.text
            for block in response.content
            if getattr(block, "type", None) == "text"
        ).strip()
    except Exception as e:  # deliberate best-effort boundary around the API call
        print(f"Error generating expansion for '{query}': {e}")
        return None
    # Normalise an empty reply to None so callers have one "no result" value.
    return text or None
def validate_output(output: str) -> bool:
    """Check that *output* follows the lex/vec/hyde line format.

    Every non-blank line must start with ``lex:``, ``vec:`` or ``hyde:`` and
    carry non-empty content after the prefix.  At least one ``lex:`` and one
    ``vec:`` line are required; ``hyde:`` is optional but, as the system
    prompt demands, may appear at most once.  Blank lines are tolerated.

    Args:
        output: Raw model reply to validate.

    Returns:
        ``True`` when the reply is usable as a training example.
    """
    has_lex = False
    has_vec = False
    hyde_lines = 0

    for raw_line in output.split("\n"):
        line = raw_line.strip()  # also drops a trailing "\r" from CRLF replies
        if not line:
            continue

        prefix, separator, payload = line.partition(":")
        # A bare "lex:" with nothing after it is useless as training data.
        if not separator or not payload.strip():
            return False

        if prefix == "lex":
            has_lex = True
        elif prefix == "vec":
            has_vec = True
        elif prefix == "hyde":
            hyde_lines += 1
            if hyde_lines > 1:
                return False
        else:
            return False  # Invalid line type

    return has_lex and has_vec
# Upper bound on attempts (API failures, rejected replies, duplicate queries)
# spent per requested example before the run gives up.
_MAX_ATTEMPTS_PER_EXAMPLE = 10


def _load_custom_queries(path: str | None) -> list[str]:
    """Return the non-blank, stripped lines of the queries file at *path*."""
    if not path:
        return []
    source = Path(path)
    if not source.is_file():
        print(f"Warning: queries file not found, ignoring: {source}")
        return []
    with source.open(encoding="utf-8") as handle:
        return [line.strip() for line in handle if line.strip()]


def main():
    """Command-line entry point: generate expansion examples into a JSONL file.

    Custom queries (``--queries``) are used first, then random ones, until
    ``--count`` valid examples exist or the attempt budget is exhausted.  Each
    example is written and flushed as soon as it is produced, so an
    interrupted run keeps everything generated so far.

    Raises:
        SystemExit: with status 1 when ANTHROPIC_API_KEY is not set.
    """
    parser = argparse.ArgumentParser(
        description="Generate QMD query expansion training data"
    )
    parser.add_argument(
        "--count", type=int, default=100, help="Number of examples to generate"
    )
    parser.add_argument(
        "--output",
        type=str,
        default="data/qmd_expansion.jsonl",
        help="Output file path",
    )
    parser.add_argument(
        "--queries", type=str, help="Optional file with custom queries (one per line)"
    )
    args = parser.parse_args()

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: ANTHROPIC_API_KEY environment variable not set")
        # SystemExit rather than the exit() helper, which only exists when
        # the site module has been loaded.
        raise SystemExit(1)
    client = anthropic.Anthropic(api_key=api_key)

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Load custom queries if provided
    custom_queries = _load_custom_queries(args.queries)
    if custom_queries:
        print(f"Loaded {len(custom_queries)} custom queries")
    pending_custom = iter(custom_queries)

    # Without a budget the loop would spin forever when every request fails
    # (bad key, outage) or the random query space is exhausted.
    max_attempts = len(custom_queries) + max(args.count, 0) * _MAX_ATTEMPTS_PER_EXAMPLE
    attempts = 0
    generated = 0
    seen_queries = set()

    print(f"Generating {args.count} examples...")
    with open(output_path, "w", encoding="utf-8") as f:
        while generated < args.count:
            if attempts >= max_attempts:
                print(
                    f"Stopping after {attempts} attempts with only "
                    f"{generated}/{args.count} examples"
                )
                break
            attempts += 1

            # Use the next custom query, or generate a random one once the
            # custom list is exhausted
            query = next(pending_custom, None)
            if query is None:
                query = generate_random_query()

            # Skip duplicates
            if query in seen_queries:
                continue
            seen_queries.add(query)

            # Generate expansion
            output = generate_expansion(client, query)
            if output and validate_output(output):
                f.write(json.dumps({"input": query, "output": output}) + "\n")
                f.flush()  # keep partial results if the run is interrupted
                generated += 1
                print(f"[{generated}/{args.count}] {query[:50]}...")
            else:
                print(f" Skipped invalid output for: {query[:50]}...")

    print(f"\nGenerated {generated} examples to {output_path}")


if __name__ == "__main__":
    main()
|