suby
/
qmd


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
							#!/usr/bin/env python3
"""Generate synthetic training data for QMD query expansion using Claude API."""

import argparse
import json
import os
import random
from pathlib import Path

try:
    import anthropic
except ImportError:
    print("Install anthropic: pip install anthropic")
    exit(1)

# Sample query templates for diverse training data
QUERY_TEMPLATES = [
    # Technical documentation
    "how to {action} {technology}",
    "{technology} {concept} example",
    "configure {technology} for {use_case}",
    "{error_type} error in {technology}",
    "best practices for {concept}",

    # Personal notes / journals
    "meeting notes {topic}",
    "ideas for {project}",
    "{date} journal entry",
    "thoughts on {topic}",

    # Research / learning
    "what is {concept}",
    "difference between {thing1} and {thing2}",
    "{topic} tutorial",
    "learn {skill}",

    # Short queries
    "{keyword}",
    "{keyword} {modifier}",
]

ACTIONS = ["install", "configure", "setup", "debug", "deploy", "test", "optimize", "migrate"]
TECHNOLOGIES = ["python", "typescript", "react", "docker", "kubernetes", "postgres", "redis", "nginx", "git", "linux"]
CONCEPTS = ["authentication", "caching", "logging", "testing", "deployment", "API", "database", "security"]
USE_CASES = ["production", "development", "CI/CD", "local", "cloud"]
ERROR_TYPES = ["connection", "timeout", "permission", "memory", "syntax"]
TOPICS = ["productivity", "workflow", "architecture", "design", "performance"]
KEYWORDS = ["auth", "config", "setup", "api", "data", "cache", "log", "test"]
MODIFIERS = ["best", "fast", "simple", "advanced", "secure"]

SYSTEM_PROMPT = """You are a search query optimization expert for a markdown document search system called QMD.

Your task is to transform user queries into retrieval-optimized outputs with THREE distinct types:

1. **lex** lines: Keyword variations optimized for BM25 full-text search
   - Short, keyword-focused
   - Good for exact term matching
   - 1-3 lines

2. **vec** lines: Semantic reformulations for vector/embedding search
   - Complete phrases or questions
   - Capture semantic meaning
   - 1-3 lines

3. **hyde** line: A hypothetical document passage (HyDE technique)
   - A realistic passage that would answer the query
   - Contains domain-specific terminology
   - Written as if it's FROM a document, not ABOUT the query
   - MAX 1 line

Output format (STRICT - follow exactly):
```
hyde: A passage that would appear in a document answering this query.
lex: keyword1
lex: keyword2
vec: semantic query reformulation
```

Rules:
- Each line must start with "lex:", "vec:", or "hyde:"
- No blank lines
- No repetition between lines
- hyde should be a realistic document excerpt, not a question
- Stay focused on the original query intent"""

USER_PROMPT_TEMPLATE = """Generate query expansion outputs for this search query:

Query: {query}

Respond with ONLY the lex/vec/hyde lines, nothing else."""


def generate_random_query() -> str:
    """Generate a random query from templates."""
    template = random.choice(QUERY_TEMPLATES)

    replacements = {
        "{action}": random.choice(ACTIONS),
        "{technology}": random.choice(TECHNOLOGIES),
        "{concept}": random.choice(CONCEPTS),
        "{use_case}": random.choice(USE_CASES),
        "{error_type}": random.choice(ERROR_TYPES),
        "{topic}": random.choice(TOPICS),
        "{project}": random.choice(["website", "app", "CLI tool", "API", "library"]),
        "{date}": random.choice(["2024-01", "2024-06", "yesterday", "today"]),
        "{thing1}": random.choice(CONCEPTS[:4]),
        "{thing2}": random.choice(CONCEPTS[4:]),
        "{skill}": random.choice(TECHNOLOGIES),
        "{keyword}": random.choice(KEYWORDS),
        "{modifier}": random.choice(MODIFIERS),
    }

    query = template
    for key, value in replacements.items():
        query = query.replace(key, value)

    return query


def generate_expansion(client: anthropic.Anthropic, query: str) -> str | None:
    """Generate expansion using Claude API."""
    try:
        response = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=300,
            system=SYSTEM_PROMPT,
            messages=[
                {"role": "user", "content": USER_PROMPT_TEMPLATE.format(query=query)}
            ]
        )
        return response.content[0].text.strip()
    except Exception as e:
        print(f"Error generating expansion for '{query}': {e}")
        return None


def validate_output(output: str) -> bool:
    """Validate that output follows the expected format."""
    lines = output.strip().split("\n")
    if not lines:
        return False

    has_lex = False
    has_vec = False

    for line in lines:
        line = line.strip()
        if not line:
            continue
        if line.startswith("lex:"):
            has_lex = True
        elif line.startswith("vec:"):
            has_vec = True
        elif line.startswith("hyde:"):
            pass
        else:
            return False  # Invalid line type

    return has_lex and has_vec


def main():
    parser = argparse.ArgumentParser(description="Generate QMD query expansion training data")
    parser.add_argument("--count", type=int, default=100, help="Number of examples to generate")
    parser.add_argument("--output", type=str, default="data/qmd_expansion.jsonl", help="Output file path")
    parser.add_argument("--queries", type=str, help="Optional file with custom queries (one per line)")
    args = parser.parse_args()

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: ANTHROPIC_API_KEY environment variable not set")
        exit(1)

    client = anthropic.Anthropic(api_key=api_key)
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Load custom queries if provided
    custom_queries = []
    if args.queries and Path(args.queries).exists():
        custom_queries = Path(args.queries).read_text().strip().split("\n")
        print(f"Loaded {len(custom_queries)} custom queries")

    examples = []
    seen_queries = set()

    print(f"Generating {args.count} examples...")

    i = 0
    while len(examples) < args.count:
        # Use custom query or generate random one
        if custom_queries and i < len(custom_queries):
            query = custom_queries[i].strip()
        else:
            query = generate_random_query()

        i += 1

        # Skip duplicates
        if query in seen_queries:
            continue
        seen_queries.add(query)

        # Generate expansion
        output = generate_expansion(client, query)
        if output and validate_output(output):
            examples.append({"input": query, "output": output})
            print(f"[{len(examples)}/{args.count}] {query[:50]}...")
        else:
            print(f"  Skipped invalid output for: {query[:50]}...")

    # Write output
    with open(output_path, "w") as f:
        for example in examples:
            f.write(json.dumps(example) + "\n")

    print(f"\nGenerated {len(examples)} examples to {output_path}")


if __name__ == "__main__":
    main()