generate_data.py 15 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711
  1. #!/usr/bin/env python3
  2. """Generate synthetic training data for QMD query expansion using Claude API."""
  3. import argparse
  4. import json
  5. import os
  6. import random
  7. from pathlib import Path
  8. try:
  9. import anthropic
  10. except ImportError:
  11. print("Install anthropic: pip install anthropic")
  12. exit(1)
  13. # Sample query templates for diverse training data - organized by category
  14. QUERY_TEMPLATES = [
  15. # === Technical documentation (35% of queries) ===
  16. "how to {action} {technology}",
  17. "{technology} {concept} example",
  18. "configure {technology} for {use_case}",
  19. "{error_type} error in {technology}",
  20. "best practices for {concept}",
  21. "{technology} vs {technology2}",
  22. "{action} {technology} {use_case}",
  23. "setup {technology} {use_case}",
  24. "{technology} tutorial for beginners",
  25. "{technology} documentation",
  26. "{technology} {error_type} troubleshooting",
  27. "{concept} in {technology}",
  28. "migrate from {technology} to {technology2}",
  29. "{action} {concept} {technology}",
  30. # === Personal notes / journals (15% of queries) ===
  31. "meeting notes {topic}",
  32. "ideas for {project}",
  33. "{date} journal entry",
  34. "thoughts on {topic}",
  35. "{project} {topic} notes",
  36. "{topic} meeting {date}",
  37. "reflect on {topic}",
  38. "brainstorm {project}",
  39. # === Research / learning (20% of queries) ===
  40. "what is {concept}",
  41. "difference between {thing1} and {thing2}",
  42. "{topic} tutorial",
  43. "learn {skill}",
  44. "understand {concept}",
  45. "explain {concept}",
  46. "{topic} fundamentals",
  47. "intro to {skill}",
  48. "{thing1} or {thing2}",
  49. "when to use {concept}",
  50. # === Short / keyword queries (15% of queries) ===
  51. "{keyword}",
  52. "{keyword} {modifier}",
  53. "{keyword} {action}",
  54. "{keyword} {use_case}",
  55. "{technology} {keyword}",
  56. "{concept} {keyword}",
  57. # === Temporal / recency queries (10% of queries) ===
  58. "latest {topic}",
  59. "recent {concept} changes",
  60. "new {technology} features",
  61. "{topic} update {date}",
  62. "what changed in {technology}",
  63. "{technology} changelog {date}",
  64. "{topic} news {date}",
  65. # === Named entities / specific topics (5% of queries) ===
  66. "{named_entity} {topic}",
  67. "{person} {concept}",
  68. "{organization} {use_case}",
  69. "{product} {action}",
  70. ]
  71. # Category weights for balanced sampling
  72. TEMPLATE_CATEGORIES = {
  73. "technical": list(range(0, 14)), # 0-13
  74. "personal": list(range(14, 22)), # 14-21
  75. "research": list(range(22, 31)), # 22-30
  76. "short": list(range(31, 36)), # 31-35
  77. "temporal": list(range(36, 42)), # 36-41
  78. "entities": list(range(42, 46)), # 42-45
  79. }
  80. ACTIONS = [
  81. "install",
  82. "configure",
  83. "setup",
  84. "debug",
  85. "deploy",
  86. "test",
  87. "optimize",
  88. "migrate",
  89. "build",
  90. "run",
  91. "lint",
  92. "format",
  93. "backup",
  94. "restore",
  95. "update",
  96. "rollback",
  97. "monitor",
  98. "scale",
  99. "secure",
  100. "integrate",
  101. "automate",
  102. "refactor",
  103. "initialize",
  104. ]
  105. TECHNOLOGIES = [
  106. # Languages
  107. "python",
  108. "typescript",
  109. "javascript",
  110. "rust",
  111. "golang",
  112. "java",
  113. "kotlin",
  114. "swift",
  115. "ruby",
  116. "php",
  117. "cpp",
  118. "c",
  119. "elixir",
  120. "scala",
  121. "clojure",
  122. "dart",
  123. # Frameworks/Frontend
  124. "react",
  125. "vue",
  126. "angular",
  127. "svelte",
  128. "solid",
  129. "htmx",
  130. "alpine",
  131. "nextjs",
  132. "nuxt",
  133. # Backend
  134. "django",
  135. "flask",
  136. "fastapi",
  137. "express",
  138. "rails",
  139. "spring",
  140. "laravel",
  141. # Infrastructure
  142. "docker",
  143. "kubernetes",
  144. "terraform",
  145. "ansible",
  146. "jenkins",
  147. "github-actions",
  148. # Databases
  149. "postgres",
  150. "mysql",
  151. "mongodb",
  152. "redis",
  153. "elasticsearch",
  154. "sqlite",
  155. "dynamodb",
  156. "cassandra",
  157. "cockroachdb",
  158. "supabase",
  159. "firebase",
  160. # Tools
  161. "git",
  162. "nginx",
  163. "apache",
  164. "linux",
  165. "aws",
  166. "gcp",
  167. "azure",
  168. "vercel",
  169. "netlify",
  170. # Data/ML
  171. "pandas",
  172. "numpy",
  173. "tensorflow",
  174. "pytorch",
  175. "scikit-learn",
  176. "jupyter",
  177. "spark",
  178. "kafka",
  179. "airflow",
  180. "dbt",
  181. ]
  182. TECHNOLOGIES_2 = [
  183. "docker",
  184. "kubernetes",
  185. "postgres",
  186. "mysql",
  187. "redis",
  188. "mongodb",
  189. "aws",
  190. "gcp",
  191. "react",
  192. "vue",
  193. "angular",
  194. "python",
  195. "javascript",
  196. "typescript",
  197. "github-actions",
  198. "gitlab-ci",
  199. "jenkins",
  200. "terraform",
  201. "ansible",
  202. ]
  203. CONCEPTS = [
  204. "authentication",
  205. "caching",
  206. "logging",
  207. "testing",
  208. "deployment",
  209. "API",
  210. "database",
  211. "security",
  212. "monitoring",
  213. "performance",
  214. "scalability",
  215. "reliability",
  216. "observability",
  217. "microservices",
  218. "serverless",
  219. "virtualization",
  220. "containerization",
  221. "orchestration",
  222. "CI/CD",
  223. "version control",
  224. "dependency injection",
  225. "event sourcing",
  226. "CQRS",
  227. "load balancing",
  228. "rate limiting",
  229. "circuit breaker",
  230. "retry logic",
  231. "idempotency",
  232. ]
  233. USE_CASES = [
  234. "production",
  235. "development",
  236. "CI/CD",
  237. "local",
  238. "cloud",
  239. "staging",
  240. "testing",
  241. "microservices",
  242. "serverless",
  243. "hybrid",
  244. "multi-tenant",
  245. "high-availability",
  246. "real-time",
  247. "batch processing",
  248. "stream processing",
  249. "data pipeline",
  250. ]
  251. ERROR_TYPES = [
  252. "connection",
  253. "timeout",
  254. "permission",
  255. "memory",
  256. "syntax",
  257. "runtime",
  258. "configuration",
  259. "dependency",
  260. "network",
  261. "authentication",
  262. "authorization",
  263. "validation",
  264. "concurrency",
  265. "deadlock",
  266. "resource",
  267. "quota",
  268. ]
  269. TOPICS = [
  270. "productivity",
  271. "workflow",
  272. "architecture",
  273. "design",
  274. "performance",
  275. "security",
  276. "scalability",
  277. "reliability",
  278. "observability",
  279. "maintainability",
  280. "testing",
  281. "documentation",
  282. "refactoring",
  283. "debugging",
  284. "optimization",
  285. "best practices",
  286. "patterns",
  287. "anti-patterns",
  288. "trade-offs",
  289. "decision making",
  290. ]
  291. KEYWORDS = [
  292. "auth",
  293. "config",
  294. "setup",
  295. "api",
  296. "cache",
  297. "log",
  298. "test",
  299. "debug",
  300. "env",
  301. "vars",
  302. "secrets",
  303. "tokens",
  304. "headers",
  305. "params",
  306. "query",
  307. "body",
  308. "route",
  309. "middleware",
  310. "handler",
  311. "controller",
  312. "model",
  313. "view",
  314. "template",
  315. "migration",
  316. "seed",
  317. "fixture",
  318. "mock",
  319. "stub",
  320. "spy",
  321. "fake",
  322. "build",
  323. "bundle",
  324. "compile",
  325. "transpile",
  326. "minify",
  327. "optimize",
  328. "deploy",
  329. "release",
  330. "rollback",
  331. "promote",
  332. "freeze",
  333. "thaw",
  334. "pull",
  335. "push",
  336. "commit",
  337. "merge",
  338. "rebase",
  339. "cherry-pick",
  340. "stash",
  341. "up",
  342. "down",
  343. "scale",
  344. "restart",
  345. "reload",
  346. "refresh",
  347. "flush",
  348. "cron",
  349. "queue",
  350. "job",
  351. "worker",
  352. "scheduler",
  353. "trigger",
  354. "webhook",
  355. "alert",
  356. "metric",
  357. "trace",
  358. "span",
  359. "event",
  360. "incident",
  361. "oncall",
  362. ]
  363. MODIFIERS = [
  364. "best",
  365. "fast",
  366. "simple",
  367. "advanced",
  368. "secure",
  369. "quick",
  370. "easy",
  371. "proper",
  372. "correct",
  373. "safe",
  374. "efficient",
  375. "reliable",
  376. "robust",
  377. "latest",
  378. "recent",
  379. "new",
  380. "old",
  381. "legacy",
  382. "modern",
  383. "local",
  384. "remote",
  385. "global",
  386. "shared",
  387. "private",
  388. "public",
  389. ]
  390. NAMED_ENTITIES = [
  391. "React",
  392. "Vue",
  393. "Angular",
  394. "Docker",
  395. "Kubernetes",
  396. "AWS",
  397. "GCP",
  398. "GitHub",
  399. "GitLab",
  400. "Vercel",
  401. "Netlify",
  402. "Supabase",
  403. "Firebase",
  404. "Stripe",
  405. "Twilio",
  406. "SendGrid",
  407. "Datadog",
  408. "PagerDuty",
  409. "Sentry",
  410. "Terraform",
  411. "Ansible",
  412. "Jenkins",
  413. "CircleCI",
  414. "TravisCI",
  415. ]
  416. PERSONS = [
  417. "Kent Beck",
  418. "Martin Fowler",
  419. "Robert Martin",
  420. "Dave Thomas",
  421. "Guido van Rossum",
  422. "Brendan Eich",
  423. "Ryan Dahl",
  424. "Anders Hejlsberg",
  425. "Linus Torvalds",
  426. "DHH",
  427. "Yukihiro Matsumoto",
  428. "Rich Hickey",
  429. ]
  430. ORGANIZATIONS = [
  431. "Google",
  432. "Microsoft",
  433. "Amazon",
  434. "Meta",
  435. "Apple",
  436. "Netflix",
  437. "Spotify",
  438. "Stripe",
  439. "Shopify",
  440. "Airbnb",
  441. "Uber",
  442. "Lyft",
  443. "Slack",
  444. "Discord",
  445. ]
  446. PRODUCTS = [
  447. "VS Code",
  448. "IntelliJ",
  449. "PyCharm",
  450. "WebStorm",
  451. "DataGrip",
  452. "Postman",
  453. "Insomnia",
  454. "TablePlus",
  455. "Docker Desktop",
  456. "Lens",
  457. "Figma",
  458. "Sketch",
  459. "Notion",
  460. "Linear",
  461. "Jira",
  462. "Trello",
  463. ]
# System prompt passed as `system=` on every request made by
# generate_expansion. It defines the line-oriented output contract
# ("lex:", "vec:", "hyde:" prefixes) that validate_output checks, so any
# change to the prefixes here must be mirrored in that validator.
# NOTE(review): the prompt text is runtime data; edit wording deliberately.
SYSTEM_PROMPT = """You are a search query optimization expert for a markdown document search system called QMD.
Your task is to transform user queries into retrieval-optimized outputs with THREE distinct types:
1. **lex** lines: Keyword variations optimized for BM25 full-text search
- Short, keyword-focused
- Good for exact term matching
- 1-3 lines
2. **vec** lines: Semantic reformulations for vector/embedding search
- Complete phrases or questions
- Capture semantic meaning
- 1-3 lines
3. **hyde** line: A hypothetical document passage (HyDE technique)
- A realistic passage that would answer the query
- Contains domain-specific terminology
- Written as if it's FROM a document, not ABOUT the query
- MAX 1 line
Output format (STRICT - follow exactly):
```
lex: keyword1
lex: keyword2
vec: semantic query reformulation
hyde: A passage that would appear in a document answering this query.
```
Rules:
- Each line must start with "lex:", "vec:", or "hyde:"
- No blank lines
- No repetition between lines
- hyde should be a realistic document excerpt, not a question
- Stay focused on the original query intent"""
# User-message template for a single expansion request. generate_expansion
# fills it with str.format(query=...), so `{query}` is the only placeholder;
# any literal brace added to this text would have to be doubled ("{{", "}}").
USER_PROMPT_TEMPLATE = """Generate query expansion outputs for this search query:
Query: {query}
Respond with ONLY the lex/vec/hyde lines, nothing else."""
  495. # Category weights - BALANCED approach
  496. # Tech at 15% (reasonable for QMD's technical document use case)
  497. CATEGORY_WEIGHTS = {
  498. "technical": 0.15, # 15% - Technical documentation
  499. "personal": 0.10, # 10% - Personal notes, journals
  500. "research": 0.10, # 10% - Research and learning
  501. "short": 0.15, # 15% - Short keyword queries
  502. "temporal": 0.10, # 10% - Temporal/recency queries (2025/2026)
  503. "entities": 0.05, # 5% - Named entity queries
  504. "health": 0.10, # 10% - Health & wellness
  505. "finance": 0.10, # 10% - Finance & business
  506. "lifestyle": 0.10, # 10% - Home, food, hobbies, travel
  507. "education": 0.05, # 5% - Education & arts
  508. }
  509. def generate_random_query() -> str:
  510. """Generate a random query from templates with category-weighted sampling."""
  511. # Select category based on weights
  512. categories = list(CATEGORY_WEIGHTS.keys())
  513. weights = list(CATEGORY_WEIGHTS.values())
  514. selected_category = random.choices(categories, weights=weights, k=1)[0]
  515. # Select template from that category
  516. template_idx = random.choice(TEMPLATE_CATEGORIES[selected_category])
  517. template = QUERY_TEMPLATES[template_idx]
  518. # Build replacements based on template type
  519. replacements = {
  520. "{action}": random.choice(ACTIONS),
  521. "{technology}": random.choice(TECHNOLOGIES),
  522. "{technology2}": random.choice(TECHNOLOGIES_2),
  523. "{concept}": random.choice(CONCEPTS),
  524. "{use_case}": random.choice(USE_CASES),
  525. "{error_type}": random.choice(ERROR_TYPES),
  526. "{topic}": random.choice(TOPICS),
  527. "{project}": random.choice(
  528. ["website", "app", "CLI tool", "API", "library", "service", "platform"]
  529. ),
  530. "{date}": random.choice(
  531. # Emphasize 2025/2026 for recency queries (current era)
  532. [
  533. "2026",
  534. "2026",
  535. "2025",
  536. "2025",
  537. "January 2026",
  538. "February 2026",
  539. "March 2026",
  540. "last month",
  541. "this week",
  542. "yesterday",
  543. "today",
  544. "recently",
  545. "latest",
  546. ]
  547. ),
  548. "{thing1}": random.choice(CONCEPTS[:10]),
  549. "{thing2}": random.choice(CONCEPTS[10:] if len(CONCEPTS) > 10 else CONCEPTS),
  550. "{skill}": random.choice(TECHNOLOGIES),
  551. "{keyword}": random.choice(KEYWORDS),
  552. "{modifier}": random.choice(MODIFIERS),
  553. "{named_entity}": random.choice(NAMED_ENTITIES),
  554. "{person}": random.choice(PERSONS),
  555. "{organization}": random.choice(ORGANIZATIONS),
  556. "{product}": random.choice(PRODUCTS),
  557. }
  558. query = template
  559. for key, value in replacements.items():
  560. query = query.replace(key, value)
  561. return query
  562. def generate_expansion(client: anthropic.Anthropic, query: str) -> str | None:
  563. """Generate expansion using Claude API."""
  564. try:
  565. response = client.messages.create(
  566. model="claude-sonnet-4-20250514",
  567. max_tokens=300,
  568. system=SYSTEM_PROMPT,
  569. messages=[
  570. {"role": "user", "content": USER_PROMPT_TEMPLATE.format(query=query)}
  571. ],
  572. )
  573. return response.content[0].text.strip()
  574. except Exception as e:
  575. print(f"Error generating expansion for '{query}': {e}")
  576. return None
  577. def validate_output(output: str) -> bool:
  578. """Validate that output follows the expected format."""
  579. lines = output.strip().split("\n")
  580. if not lines:
  581. return False
  582. has_lex = False
  583. has_vec = False
  584. for line in lines:
  585. line = line.strip()
  586. if not line:
  587. continue
  588. if line.startswith("lex:"):
  589. has_lex = True
  590. elif line.startswith("vec:"):
  591. has_vec = True
  592. elif line.startswith("hyde:"):
  593. pass
  594. else:
  595. return False # Invalid line type
  596. return has_lex and has_vec
  597. def main():
  598. parser = argparse.ArgumentParser(
  599. description="Generate QMD query expansion training data"
  600. )
  601. parser.add_argument(
  602. "--count", type=int, default=100, help="Number of examples to generate"
  603. )
  604. parser.add_argument(
  605. "--output",
  606. type=str,
  607. default="data/qmd_expansion.jsonl",
  608. help="Output file path",
  609. )
  610. parser.add_argument(
  611. "--queries", type=str, help="Optional file with custom queries (one per line)"
  612. )
  613. args = parser.parse_args()
  614. api_key = os.environ.get("ANTHROPIC_API_KEY")
  615. if not api_key:
  616. print("Error: ANTHROPIC_API_KEY environment variable not set")
  617. exit(1)
  618. client = anthropic.Anthropic(api_key=api_key)
  619. output_path = Path(args.output)
  620. output_path.parent.mkdir(parents=True, exist_ok=True)
  621. # Load custom queries if provided
  622. custom_queries = []
  623. if args.queries and Path(args.queries).exists():
  624. custom_queries = Path(args.queries).read_text().strip().split("\n")
  625. print(f"Loaded {len(custom_queries)} custom queries")
  626. examples = []
  627. seen_queries = set()
  628. print(f"Generating {args.count} examples...")
  629. i = 0
  630. while len(examples) < args.count:
  631. # Use custom query or generate random one
  632. if custom_queries and i < len(custom_queries):
  633. query = custom_queries[i].strip()
  634. else:
  635. query = generate_random_query()
  636. i += 1
  637. # Skip duplicates
  638. if query in seen_queries:
  639. continue
  640. seen_queries.add(query)
  641. # Generate expansion
  642. output = generate_expansion(client, query)
  643. if output and validate_output(output):
  644. examples.append({"input": query, "output": output})
  645. print(f"[{len(examples)}/{args.count}] {query[:50]}...")
  646. else:
  647. print(f" Skipped invalid output for: {query[:50]}...")
  648. # Write output
  649. with open(output_path, "w") as f:
  650. for example in examples:
  651. f.write(json.dumps(example) + "\n")
  652. print(f"\nGenerated {len(examples)} examples to {output_path}")
  653. if __name__ == "__main__":
  654. main()