| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906 |
- #!/usr/bin/env python3
- """
- Data Quality Reviewer for Query Expansion Training Dataset
- This script identifies and flags/fixes semantic errors where technical terms
- are misunderstood. For example:
- - "gem find" expanded as "mineral hunt" instead of "ruby gem search"
- - "yarn spin" expanded as "wool twist" instead of "yarn package manager"
- The script uses contextual analysis to detect when technical terms
- are likely being used in a programming context vs. their everyday meaning.
- """
- import json
- import re
- from pathlib import Path
- from dataclasses import dataclass, field
- from typing import Optional
- from collections import defaultdict
- from dataset.schema import (
- normalize_output_items,
- output_items_to_text,
- parse_output_text,
- )
@dataclass
class TechnicalTerm:
    """An ambiguous word plus the data needed to review how it was expanded.

    Attributes:
        term: The ambiguous term itself (e.g. "liquid", "gem", "yarn").
        context_indicators: Words that suggest the term is used in a tech
            context.
        wrong_expansions: Patterns that indicate the term was interpreted in
            the wrong (non-technical) sense.
        correct_domain: The domain the term belongs to when it is technical.
        correct_lex: Correct "lex" expansions for the technical meaning.
        correct_vec: Correct "vec" expansions for the technical meaning.
    """

    term: str
    context_indicators: list[str]
    wrong_expansions: list[str]
    correct_domain: str
    correct_lex: list[str]
    correct_vec: list[str]
# Known technical terms that are commonly misunderstood.
#
# Each row of the table is, in order:
#   (term, context indicators, wrong expansions, correct domain,
#    correct lex expansions, correct vec expansions)
_TERM_TABLE = (
    (
        "liquid",
        ["shopify", "template", "filter", "tag", "theme", "jekyll"],
        ["fluid", "water", "pour", "drink", "beverage", "h2o", "wet"],
        "Shopify/Jekyll templating language",
        ["shopify template syntax", "liquid template filter"],
        ["shopify liquid templating language", "liquid template engine filters"],
    ),
    (
        "gem",
        ["ruby", "bundler", "install", "gemfile", "rails", "require"],
        ["mineral", "crystal", "jewel", "stone", "diamond", "jewelry", "precious"],
        "Ruby package manager",
        ["ruby gem package", "gem install command"],
        ["ruby gem package manager", "rubygems library installation"],
    ),
    (
        "yarn",
        ["npm", "package", "install", "node", "javascript", "react", "webpack"],
        ["thread", "wool", "knit", "spin", "textile", "fabric", "sew", "twist"],
        "JavaScript package manager",
        ["yarn package manager", "yarn install dependencies"],
        ["yarn javascript package manager", "yarn npm alternative"],
    ),
    (
        "hook",
        ["react", "use", "state", "effect", "component", "callback", "git"],
        ["fish", "fishing", "bait", "catch", "hang", "pirate"],
        "React hooks or Git hooks",
        ["react hooks api", "usestate useeffect"],
        ["react hooks state management", "react functional component hooks"],
    ),
    (
        "container",
        ["docker", "kubernetes", "k8s", "image", "orchestration", "pod"],
        ["box", "storage", "shipping", "cargo", "tupperware", "jar", "vessel"],
        "Docker/Kubernetes containers",
        ["docker container", "container image"],
        ["docker container virtualization", "container orchestration platform"],
    ),
    (
        "branch",
        ["git", "merge", "checkout", "commit", "main", "master", "repo"],
        ["tree", "limb", "wood", "leaf", "twig", "forest"],
        "Git version control",
        ["git branch", "git checkout branch"],
        ["git branch version control", "git branching workflow"],
    ),
    (
        "decorator",
        ["python", "@", "function", "wrapper", "class", "def"],
        ["interior", "design", "paint", "furniture", "decor", "ornament"],
        "Python decorators",
        ["python decorator function", "@decorator syntax"],
        ["python function decorators", "python decorator pattern"],
    ),
    (
        "bean",
        ["java", "spring", "injection", "dependency", "servlet", "ejb"],
        ["coffee", "food", "vegetable", "legume", "plant", "soy"],
        "Java Beans / Spring Beans",
        ["java bean class", "spring bean injection"],
        ["java enterprise beans", "spring dependency injection beans"],
    ),
    (
        "shell",
        ["bash", "script", "terminal", "command", "linux", "unix", "zsh"],
        ["seashell", "ocean", "beach", "clam", "oyster", "egg"],
        "Unix/Linux shell scripting",
        ["bash shell script", "shell command"],
        ["unix shell scripting", "bash command line shell"],
    ),
    (
        "rust",
        ["cargo", "crate", "ownership", "borrow", "lifetime", "unsafe"],
        ["oxidation", "metal", "corrosion", "decay", "iron", "orange"],
        "Rust programming language",
        ["rust programming language", "rust cargo package"],
        ["rust systems programming", "rust memory safety"],
    ),
    (
        "go",
        ["golang", "goroutine", "channel", "defer", "gofmt", "module"],
        ["travel", "move", "walk", "game", "board game", "leave", "depart"],
        "Go programming language",
        ["golang programming", "go language syntax"],
        ["go programming language", "golang concurrent programming"],
    ),
    (
        "swift",
        ["ios", "xcode", "apple", "uikit", "swiftui", "cocoa"],
        ["fast", "quick", "bird", "speed", "rapid", "taylor"],
        "Swift programming language",
        ["swift ios development", "swift programming language"],
        ["swift apple programming language", "swift ios app development"],
    ),
    (
        "pod",
        ["kubernetes", "k8s", "deployment", "service", "cluster", "node"],
        ["pea", "seed", "plant", "vegetable", "legume", "whale"],
        "Kubernetes pods",
        ["kubernetes pod", "k8s pod deployment"],
        ["kubernetes pod container group", "k8s pod orchestration"],
    ),
    (
        "redis",
        ["cache", "database", "key-value", "memory", "pub/sub", "queue"],
        [],  # "redis" doesn't have common wrong meanings
        "Redis in-memory database",
        ["redis cache", "redis database"],
        ["redis in-memory data store", "redis caching solution"],
    ),
    (
        "kafka",
        ["message", "stream", "queue", "broker", "topic", "producer", "consumer"],
        ["franz", "author", "writer", "novel", "metamorphosis", "literature"],
        "Apache Kafka message queue",
        ["apache kafka", "kafka message broker"],
        ["apache kafka streaming platform", "kafka message queue"],
    ),
    (
        "elastic",
        ["elasticsearch", "search", "index", "kibana", "logstash", "query"],
        ["stretch", "rubber", "flexible", "band", "bouncy"],
        "Elasticsearch",
        ["elasticsearch", "elastic search index"],
        ["elasticsearch full-text search", "elastic stack"],
    ),
    (
        "spark",
        ["apache", "hadoop", "data", "rdd", "dataframe", "pyspark"],
        ["fire", "ignite", "flame", "plug", "electricity"],
        "Apache Spark",
        ["apache spark", "spark data processing"],
        ["apache spark big data processing", "spark cluster computing"],
    ),
    (
        "flask",
        ["python", "web", "route", "api", "jinja", "werkzeug"],
        ["bottle", "container", "lab", "chemistry", "drink", "thermos"],
        "Flask web framework",
        ["flask python web framework", "flask api"],
        ["flask python web development", "flask microframework"],
    ),
    (
        "django",
        ["python", "web", "orm", "model", "view", "template"],
        ["jazz", "music", "reinhardt", "guitar", "movie", "western"],
        "Django web framework",
        ["django python framework", "django web development"],
        ["django python web framework", "django orm models"],
    ),
    (
        "rails",
        ["ruby", "gem", "activerecord", "model", "controller", "migration"],
        ["train", "track", "railroad", "railway", "metal"],
        "Ruby on Rails",
        ["ruby on rails", "rails web framework"],
        ["ruby on rails framework", "rails mvc architecture"],
    ),
    (
        "node",
        ["javascript", "npm", "express", "async", "require", "module"],
        ["lump", "knot", "bump", "growth", "junction"],
        "Node.js",
        ["node.js javascript", "nodejs runtime"],
        ["node.js javascript runtime", "nodejs server-side javascript"],
    ),
    (
        "maven",
        ["java", "pom", "dependency", "build", "artifact", "repository"],
        ["expert", "specialist", "connoisseur"],
        "Apache Maven",
        ["apache maven", "maven build tool"],
        ["apache maven java build", "maven dependency management"],
    ),
    (
        "gradle",
        ["java", "kotlin", "android", "build", "groovy", "task"],
        ["grade", "slope", "hill", "incline"],
        "Gradle build tool",
        ["gradle build tool", "gradle android"],
        ["gradle java build automation", "gradle kotlin dsl"],
    ),
    (
        "ant",
        ["java", "build", "xml", "target", "task"],
        ["insect", "bug", "colony", "hill", "picnic"],
        "Apache Ant build tool",
        ["apache ant", "ant build xml"],
        ["apache ant java build", "ant build automation"],
    ),
)

# Materialize the table into TechnicalTerm records, preserving row order.
KNOWN_TECHNICAL_TERMS = [
    TechnicalTerm(
        term=name,
        context_indicators=indicators,
        wrong_expansions=wrong,
        correct_domain=domain,
        correct_lex=lex,
        correct_vec=vec,
    )
    for name, indicators, wrong, domain, lex, vec in _TERM_TABLE
]
@dataclass
class Issue:
    """One problem detected in a single dataset example.

    Attributes:
        line_number: Line of the dataset file the example came from.
        input_text: The example's query text.
        output_text: The example's expansion text (callers may store a
            truncated preview).
        issue_type: Category label for the problem, e.g.
            "wrong_tech_expansion" or "ambiguous_term".
        technical_term: The ambiguous term that triggered the issue.
        wrong_expansion_found: The wrong-sense word found in the output.
        suggested_fix: Replacement output text, or None when no automatic
            fix is offered.
    """

    line_number: int
    input_text: str
    output_text: str
    issue_type: str
    technical_term: str
    wrong_expansion_found: str
    suggested_fix: Optional[str] = None
@dataclass
class AnalysisResult:
    """Accumulated outcome of scanning a dataset.

    Attributes:
        total_examples: Number of examples counted so far.
        issues_found: Every Issue collected during the scan.
        examples_with_correct_tech_terms: (line number, text) pairs; starts
            empty.
        term_statistics: Per-term counter; missing keys start at 0 because
            the default is a ``defaultdict(int)``.
    """

    total_examples: int = 0
    issues_found: list[Issue] = field(default_factory=list)
    examples_with_correct_tech_terms: list[tuple[int, str]] = field(default_factory=list)
    term_statistics: dict = field(default_factory=lambda: defaultdict(int))
def check_for_wrong_expansion(output_text: str, term: TechnicalTerm) -> Optional[str]:
    """Return the first wrong expansion of *term* that occurs in *output_text*.

    Matching is case-insensitive and works on whole words, optionally followed
    by a plural "s"/"es". A plain substring test would report "bug" for
    "debug", "iron" for "environment" or "stone" for "milestone", and every
    such false hit can cause a correct example to be flagged or rewritten.

    Args:
        output_text: Expansion text of a dataset example.
        term: Technical term whose ``wrong_expansions`` are searched for.

    Returns:
        The matching entry exactly as listed in ``term.wrong_expansions``
        (entries are tried in list order), or None when none occurs.
    """
    output_lower = output_text.lower()
    for wrong in term.wrong_expansions:
        # Lookarounds instead of \b so the check is also well defined for
        # entries that contain spaces or punctuation (e.g. "board game").
        pattern = rf"(?<!\w){re.escape(wrong.lower())}(?:e?s)?(?!\w)"
        if re.search(pattern, output_lower):
            return wrong
    return None
def has_tech_context(input_text: str, term: TechnicalTerm) -> bool:
    """Report whether *input_text* contains a tech-context indicator of *term*.

    Matching is case-insensitive. An indicator that starts with a letter or
    digit must begin at the start of a word, so "use" no longer fires inside
    "house", "git" inside "digital" or "orm" inside "format". The end of the
    indicator is deliberately left open so that compounds and inflections
    such as "usestate", "dockerfile" or "templates" are still recognized.
    Indicators that start with a symbol (e.g. "@") are matched anywhere.

    Args:
        input_text: Query text of a dataset example.
        term: Technical term whose ``context_indicators`` are searched for.

    Returns:
        True if at least one indicator matches, otherwise False.
    """
    input_lower = input_text.lower()
    for indicator in term.context_indicators:
        needle = indicator.lower()
        # Only anchor on a word boundary when the indicator itself starts with
        # a word character; "@" must still match when glued to an identifier.
        anchor = r"(?<!\w)" if needle[:1].isalnum() else ""
        if re.search(anchor + re.escape(needle), input_lower):
            return True
    return False
def is_likely_tech_query(input_text: str) -> bool:
    """Guess whether a query is tech-related from generic developer keywords.

    The query is lowercased and tested against several whole-word keyword
    groups (actions, tooling, language keywords, networking, data formats).
    Short queries such as "gem find" or "yarn spin" contain none of them and
    therefore stay ambiguous.

    Returns:
        True if any keyword group matches, otherwise False.
    """
    lowered = input_text.lower()
    keyword_groups = (
        r"\b(install|config|setup|build|run|debug|test|deploy|compile)\b",
        r"\b(api|cli|sdk|lib|pkg|npm|pip|cargo)\b",
        r"\b(func|class|method|var|const|let|def)\b",
        r"\b(http|https|url|port|host|server|client)\b",
        r"\b(json|xml|yaml|csv|sql|html|css|js)\b",
    )
    return any(re.search(group, lowered) for group in keyword_groups)
def has_non_tech_context(input_text: str, term: TechnicalTerm) -> bool:
    """Report whether *input_text* clearly uses *term* in its everyday sense.

    This avoids false positives for queries like "car rust" or "yarn spin".
    Context words are matched case-insensitively as whole words, optionally
    followed by a plural "s"/"es". Whole-word matching matters here: with a
    substring test "car" matches "cargo" (so Rust queries mentioning cargo
    were treated as non-technical), "run" matches "runtime" and "store"
    matches "restore". Other inflections (e.g. "knitting") only match when
    listed explicitly.

    Args:
        input_text: Query text of a dataset example.
        term: Technical term whose everyday meaning is being tested.

    Returns:
        True if a non-technical context word for the term occurs in the
        input; False otherwise, including for terms without an entry.
    """
    input_lower = input_text.lower()
    term_lower = term.term.lower()
    # Non-tech context indicators for each ambiguous term.
    non_tech_contexts = {
        "rust": [
            "car", "metal", "iron", "steel", "corrosion", "prevention",
            "remove", "body",
        ],
        "gem": [
            "gemstone", "jewelry", "jewel", "diamond", "precious", "stone",
            "cut", "shop", "buy", "wear",
        ],
        "yarn": [
            "knit", "crochet", "spin", "wool", "thread", "textile", "fabric",
            "sew", "weave",
        ],
        "hook": ["fishing", "crochet", "hang", "coat", "wall", "ceiling"],
        "container": [
            "storage", "plastic", "food", "shipping", "cargo", "kitchen", "box",
        ],
        "branch": ["tree", "bank", "library", "store", "office", "organization"],
        "decorator": [
            "interior", "home", "room", "house", "design", "party", "cake",
            "wedding",
        ],
        "bean": [
            "coffee", "soy", "kidney", "black", "green", "garden", "cooking",
            "food", "plant", "grow",
        ],
        "shell": [
            "sea", "beach", "egg", "nut", "turtle", "snail", "crab", "clam",
            "oyster",
        ],
        "spark": ["plug", "fire", "ignite", "car", "engine", "electric", "romance"],
        "go": ["travel", "vacation", "trip", "walk", "run", "leave", "visit", "tour"],
        "swift": ["taylor", "concert", "music", "singer", "speed", "fast", "bird"],
        "pod": ["pea", "whale", "orca", "dolphin", "vegetable", "seed", "plant"],
        "ant": ["insect", "colony", "fire", "carpenter", "pest", "bug", "picnic"],
        "node": ["lymph", "medical", "body", "tree", "network point"],
        "rails": ["train", "railroad", "railway", "track", "transit", "fence"],
        "flask": ["lab", "chemistry", "drink", "hip", "thermos", "bottle", "water"],
        "django": [
            "jazz", "music", "reinhardt", "guitar", "movie", "western",
            "unchained",
        ],
        "maven": ["expert", "connoisseur", "specialist", "guru"],
        "gradle": ["grade", "school", "slope"],
        "kafka": [
            "franz", "author", "novel", "metamorphosis", "literature", "writer",
            "book",
        ],
        "elastic": ["band", "rubber", "stretch", "flexible", "waist", "fabric"],
    }
    for context_word in non_tech_contexts.get(term_lower, ()):
        # Lookarounds rather than \b so multi-word entries ("network point")
        # are handled uniformly.
        pattern = rf"(?<!\w){re.escape(context_word.lower())}(?:e?s)?(?!\w)"
        if re.search(pattern, input_lower):
            return True
    return False
def _build_suggested_fix(term: TechnicalTerm) -> str:
    """Render the canned replacement output (two lex, two vec, one hyde line)."""
    lex = term.correct_lex
    vec = term.correct_vec
    # Reuse the first entry when a term defines only one expansion of a kind.
    second_lex = lex[1] if len(lex) > 1 else lex[0]
    second_vec = vec[1] if len(vec) > 1 else vec[0]
    return "\n".join(
        [
            f"lex: {lex[0]}",
            f"lex: {second_lex}",
            f"vec: {vec[0]}",
            f"vec: {second_vec}",
            f"hyde: {term.correct_domain} is a concept that provides functionality "
            "for software development.",
        ]
    )


def analyze_example(line_num: int, input_text: str, output_text: str) -> list[Issue]:
    """Analyze a single example for misread technical terms.

    For every known term that occurs in the input as a whole word (optionally
    plural) and whose output contains a wrong-sense expansion:

    * nothing is reported if the input clearly has a non-technical context;
    * a "wrong_tech_expansion" issue with a suggested fix is reported if the
      input looks technical;
    * an "ambiguous_term" issue without a fix is reported if the query has at
      most two words and one of them is exactly the term;
    * otherwise nothing is reported.

    The term is located with word boundaries rather than a substring test, so
    "go" is not detected inside "google", "ant" inside "want" or "pod" inside
    "podcast".

    Args:
        line_num: Line number of the example, recorded on each issue.
        input_text: Query text of the example.
        output_text: Expansion text of the example.

    Returns:
        The issues found, in the order of KNOWN_TECHNICAL_TERMS; may be empty.
    """
    issues: list[Issue] = []
    input_lower = input_text.lower()
    words = input_lower.split()
    # Issues only carry a preview of long outputs (200 characters + ellipsis).
    preview = output_text[:200] + "..." if len(output_text) > 200 else output_text

    for term in KNOWN_TECHNICAL_TERMS:
        term_lower = term.term.lower()

        # The input must contain the term as a word of its own.
        term_pattern = rf"(?<!\w){re.escape(term_lower)}(?:e?s)?(?!\w)"
        if not re.search(term_pattern, input_lower):
            continue

        wrong_expansion = check_for_wrong_expansion(output_text, term)
        if wrong_expansion is None:
            continue

        # Skip if the context clearly indicates non-technical usage.
        if has_non_tech_context(input_text, term):
            continue

        if has_tech_context(input_text, term) or is_likely_tech_query(input_text):
            # Definite tech context: offer a replacement output.
            issue_type = "wrong_tech_expansion"
            suggested_fix = _build_suggested_fix(term)
        elif len(words) <= 2 and term_lower in words:
            # Very short query with the term as a primary word (e.g.
            # "gem find"): truly ambiguous, flag without a fix.
            issue_type = "ambiguous_term"
            suggested_fix = None
        else:
            continue

        issues.append(
            Issue(
                line_number=line_num,
                input_text=input_text,
                output_text=preview,
                issue_type=issue_type,
                technical_term=term.term,
                wrong_expansion_found=wrong_expansion,
                suggested_fix=suggested_fix,
            )
        )
    return issues
def analyze_dataset(filepath: Path) -> AnalysisResult:
    """Analyze every example of a JSONL dataset for issues.

    Each non-blank line is parsed as JSON. The query is read from the "query"
    key, falling back to "input"; the "output" value is normalized (string
    outputs are parsed first) and rendered to text before analysis.

    Lines that are not valid JSON, or that are valid JSON but not an object,
    are reported with a warning and skipped instead of aborting the scan.

    Term statistics count an example for a term only when the term occurs in
    the query as a whole word (optionally plural), so "go" is not counted for
    "google".

    Args:
        filepath: Path of the JSONL dataset file (read as UTF-8).

    Returns:
        The populated AnalysisResult.
    """
    result = AnalysisResult()
    # Compile the per-term matchers once instead of once per line.
    term_matchers = [
        (
            term.term,
            re.compile(rf"(?<!\w){re.escape(term.term.lower())}(?:e?s)?(?!\w)"),
        )
        for term in KNOWN_TECHNICAL_TERMS
    ]
    with open(filepath, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                example = json.loads(line)
                if not isinstance(example, dict):
                    # A bare list/string/number has no "query"/"output" keys.
                    print(
                        f"Warning: Skipping line {line_num}: expected a JSON "
                        f"object, got {type(example).__name__}"
                    )
                    continue
                input_text = example.get("query", "") or example.get("input", "")
                output_raw = example.get("output", [])
                if isinstance(output_raw, str):
                    output_items = normalize_output_items(parse_output_text(output_raw))
                else:
                    output_items = normalize_output_items(output_raw)
                output_text = output_items_to_text(output_items)
                result.total_examples += 1

                # Analyze for issues
                result.issues_found.extend(
                    analyze_example(line_num, input_text, output_text)
                )

                # Track term statistics
                input_lower = input_text.lower()
                for name, matcher in term_matchers:
                    if matcher.search(input_lower):
                        result.term_statistics[name] += 1
            except json.JSONDecodeError as e:
                print(f"Warning: Could not parse line {line_num}: {e}")
    return result
def fix_example(example: dict, issues: list[Issue]) -> Optional[dict]:
    """Attempt to fix an example based on the issues identified for it.

    Only "wrong_tech_expansion" issues that carry a suggested fix are acted
    on; the first such issue wins (fixes for one example should be similar).

    Args:
        example: The parsed dataset record. It is not modified.
        issues: Issues found for this example.

    Returns:
        A shallow copy of *example* whose "output" is replaced by the parsed
        suggested fix and which additionally carries "_fixed",
        "_original_output" and "_fix_reason"; or None if no issue qualifies.
    """
    issue = next(
        (
            candidate
            for candidate in issues
            if candidate.issue_type == "wrong_tech_expansion"
            and candidate.suggested_fix
        ),
        None,
    )
    if issue is None:
        return None

    fixed = example.copy()
    fixed["output"] = normalize_output_items(parse_output_text(issue.suggested_fix))
    fixed["_fixed"] = True

    # Keep the replaced output, rendered as text, for later auditing.
    original_items = example.get("output", [])
    if isinstance(original_items, str):
        original_items = normalize_output_items(parse_output_text(original_items))
    fixed["_original_output"] = output_items_to_text(original_items)
    fixed["_fix_reason"] = (
        f"Technical term '{issue.technical_term}' was incorrectly expanded as '{issue.wrong_expansion_found}'"
    )
    return fixed
def generate_report(result: AnalysisResult) -> str:
    """Build the human-readable quality report for an analysis result.

    The report has a header with totals, the issues grouped by type (at most
    ten examples are listed per type, followed by a count of the rest), and
    the per-term occurrence counts sorted from most to least frequent.

    Returns:
        The report as one newline-joined string.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    report = [
        heavy_rule,
        "QUERY EXPANSION DATASET QUALITY REPORT",
        heavy_rule,
        "",
        f"Total examples analyzed: {result.total_examples}",
        f"Issues found: {len(result.issues_found)}",
        "",
    ]

    # Bucket issues by type, keeping first-seen order of the types.
    grouped = {}
    for found in result.issues_found:
        grouped.setdefault(found.issue_type, []).append(found)

    report += [light_rule, "ISSUES BY TYPE:", light_rule]
    for kind, members in grouped.items():
        report.append(f"\n{kind.upper()}: {len(members)} issues")
        report.append("-" * 40)
        # Show up to 10 examples per type
        for found in members[:10]:
            report.append(f"\n Line {found.line_number}:")
            report.append(f" Input: {found.input_text}")
            report.append(f" Technical term: '{found.technical_term}'")
            report.append(f" Wrong expansion found: '{found.wrong_expansion_found}'")
            if found.suggested_fix:
                report.append(" Suggested fix available: Yes")
        hidden = len(members) - 10
        if hidden > 0:
            report.append(f"\n ... and {hidden} more")

    # Term statistics, most frequent first
    report += ["\n" + light_rule, "TECHNICAL TERM OCCURRENCES IN DATASET:", light_rule]
    ranked = sorted(result.term_statistics.items(), key=lambda pair: -pair[1])
    report.extend(f" {name}: {hits} occurrences" for name, hits in ranked if hits > 0)

    report.append("\n" + heavy_rule)
    return "\n".join(report)
def save_cleaned_dataset(
    filepath: Path, output_path: Path, result: AnalysisResult
) -> tuple[int, int]:
    """Write a cleaned copy of the dataset, applying fixes and review flags.

    Every record is rewritten with its query under "query" (an "input" key is
    renamed when "query" is absent) and a normalized "output". Records with
    issues are either replaced by their fixed version or, when no automatic
    fix applies, marked with "_flagged" and "_flag_reason".

    Lines that cannot be parsed as JSON, and lines that parse to something
    other than a JSON object, are copied through unchanged.

    Args:
        filepath: Source JSONL dataset (read as UTF-8).
        output_path: Destination JSONL file (overwritten, UTF-8).
        result: Analysis whose issues are looked up by line number, so it
            must stem from the same file.

    Returns:
        A ``(fixed_count, flagged_count)`` tuple.
    """
    issues_by_line = defaultdict(list)
    for issue in result.issues_found:
        issues_by_line[issue.line_number].append(issue)

    fixed_count = 0
    flagged_count = 0
    with (
        open(filepath, "r", encoding="utf-8") as f_in,
        open(output_path, "w", encoding="utf-8") as f_out,
    ):
        for line_num, line in enumerate(f_in, 1):
            line = line.strip()
            if not line:
                continue
            try:
                example = json.loads(line)
                if not isinstance(example, dict):
                    # Not a dataset record; keep it as-is like unparseable lines.
                    f_out.write(line + "\n")
                    continue
                if "query" not in example and "input" in example:
                    example["query"] = example.pop("input")
                output_raw = example.get("output", [])
                if isinstance(output_raw, str):
                    example["output"] = normalize_output_items(
                        parse_output_text(output_raw)
                    )
                else:
                    example["output"] = normalize_output_items(output_raw)

                if line_num in issues_by_line:
                    issues = issues_by_line[line_num]
                    fixed = fix_example(example, issues)
                    if fixed:
                        f_out.write(json.dumps(fixed) + "\n")
                        fixed_count += 1
                    else:
                        # Flag but don't fix ambiguous cases
                        example["_flagged"] = True
                        example["_flag_reason"] = (
                            f"Ambiguous term '{issues[0].technical_term}' may need review"
                        )
                        f_out.write(json.dumps(example) + "\n")
                        flagged_count += 1
                else:
                    f_out.write(json.dumps(example) + "\n")
            except json.JSONDecodeError:
                # Keep problematic lines as-is
                f_out.write(line + "\n")
    return fixed_count, flagged_count
def main():
    """Run the full review: analyze, report, and write the cleaned dataset.

    All files live in the ``data`` directory next to this script. The report
    is printed and saved, the cleaned dataset is written, and a summary is
    printed.

    Returns:
        1 if the input dataset does not exist, otherwise 0.
    """
    data_dir = Path(__file__).parent / "data"
    input_path = data_dir / "qmd_expansion.jsonl"
    output_path = data_dir / "qmd_expansion_cleaned.jsonl"
    report_path = data_dir / "quality_report.txt"

    print(f"Analyzing dataset: {input_path}")
    print("-" * 50)
    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}")
        return 1

    result = analyze_dataset(input_path)

    # Show the report on screen and keep a copy on disk.
    report = generate_report(result)
    print(report)
    report_path.write_text(report, encoding="utf-8")
    print(f"\nReport saved to: {report_path}")

    fixed_count, flagged_count = save_cleaned_dataset(input_path, output_path, result)
    unchanged_count = result.total_examples - fixed_count - flagged_count
    print(f"\nCleaned dataset saved to: {output_path}")
    print(f" - Examples fixed: {fixed_count}")
    print(f" - Examples flagged for review: {flagged_count}")
    print(f" - Examples unchanged: {unchanged_count}")

    # Summary statistics
    issue_total = len(result.issues_found)
    definite = sum(
        1 for found in result.issues_found if found.issue_type == "wrong_tech_expansion"
    )
    ambiguous = sum(
        1 for found in result.issues_found if found.issue_type == "ambiguous_term"
    )
    rule = "=" * 50
    print("\n" + rule)
    print("SUMMARY")
    print(rule)
    print(f"Total examples: {result.total_examples}")
    print(f"Total issues found: {issue_total}")
    print(f" - Definite tech term errors: {definite}")
    print(f" - Ambiguous terms needing review: {ambiguous}")
    if issue_total > 0:
        error_rate = issue_total / result.total_examples * 100
        print(f"\nError rate: {error_rate:.2f}%")
    return 0
if __name__ == "__main__":
    # Use SystemExit directly: the bare exit() helper is injected by the site
    # module for interactive use and is not available under `python -S`.
    raise SystemExit(main())
|