/**
 * AST-aware chunking support via web-tree-sitter.
 *
 * Provides language detection, AST break point extraction for supported
 * code file types, function-range extraction, and a stub for future
 * symbol extraction.
 *
 * All functions degrade gracefully: parse failures or unsupported languages
 * return empty arrays, falling back to regex-only chunking.
 *
 * ## Dependency Note
 *
 * Grammar packages (tree-sitter-typescript, etc.) are listed as
 * optionalDependencies with pinned versions. They ship native prebuilds
 * and source files (~72 MB total) but QMD only uses the .wasm files
 * (~5 MB). If install size becomes a concern, the .wasm files can be
 * bundled directly in the repo (e.g. assets/grammars/) and resolved
 * via import.meta.url instead of require.resolve(), eliminating the
 * grammar packages entirely.
 */
import { createRequire } from "node:module";
import { extname } from "node:path";

/** Lower-cased file extension -> language key used by the tables below. */
const EXTENSION_MAP = {
  ".ts": "typescript",
  ".tsx": "tsx",
  ".js": "javascript",
  ".jsx": "tsx",
  ".mts": "typescript",
  ".cts": "typescript",
  ".mjs": "javascript",
  ".cjs": "javascript",
  ".py": "python",
  ".go": "go",
  ".rs": "rust",
  ".java": "java",
  ".kt": "kotlin",
  ".kts": "kotlin",
};

/**
 * Detect language from file path extension (case-insensitive).
 * Returns null for unsupported or unknown extensions (including .md).
 *
 * @param {string} filepath - Path whose extension selects the language.
 * @returns {string | null} Language key, or null when unsupported.
 */
export function detectLanguage(filepath) {
  const ext = extname(filepath).toLowerCase();
  return EXTENSION_MAP[ext] ?? null;
}

// =============================================================================
// Grammar Resolution
// =============================================================================

/**
 * Maps language to the npm package and wasm filename for the grammar.
 * Note that `javascript` is parsed with the TypeScript grammar.
 */
const GRAMMAR_MAP = {
  typescript: { pkg: "tree-sitter-typescript", wasm: "tree-sitter-typescript.wasm" },
  tsx: { pkg: "tree-sitter-typescript", wasm: "tree-sitter-tsx.wasm" },
  javascript: { pkg: "tree-sitter-typescript", wasm: "tree-sitter-typescript.wasm" },
  python: { pkg: "tree-sitter-python", wasm: "tree-sitter-python.wasm" },
  go: { pkg: "tree-sitter-go", wasm: "tree-sitter-go.wasm" },
  rust: { pkg: "tree-sitter-rust", wasm: "tree-sitter-rust.wasm" },
  java: { pkg: "tree-sitter-java", wasm: "tree-sitter-java.wasm" },
  kotlin: { pkg: "@tree-sitter-grammars/tree-sitter-kotlin", wasm: "tree-sitter-kotlin.wasm" },
};

// =============================================================================
// Per-Language Query Definitions
// =============================================================================

/** Query shared verbatim by the `typescript` and `tsx` languages. */
const TYPESCRIPT_QUERY = `
  (export_statement) @export
  (class_declaration) @class
  (function_declaration) @func
  (method_definition) @method
  (interface_declaration) @iface
  (type_alias_declaration) @type
  (enum_declaration) @enum
  (import_statement) @import
  (lexical_declaration (variable_declarator value: (arrow_function))) @func
  (lexical_declaration (variable_declarator value: (function_expression))) @func
`;

/**
 * Tree-sitter S-expression queries for each language.
 * Each capture name maps to a break point score via SCORE_MAP.
 *
 * For TypeScript/JavaScript, we match export_statement wrappers to get the
 * correct start position (before `export`), plus bare declarations for
 * non-exported code.
 */
const LANGUAGE_QUERIES = {
  typescript: TYPESCRIPT_QUERY,
  tsx: TYPESCRIPT_QUERY,
  javascript: `
    (export_statement) @export
    (class_declaration) @class
    (function_declaration) @func
    (method_definition) @method
    (import_statement) @import
    (lexical_declaration (variable_declarator value: (arrow_function))) @func
    (lexical_declaration (variable_declarator value: (function_expression))) @func
  `,
  python: `
    (class_definition) @class
    (function_definition) @func
    (decorated_definition) @decorated
    (import_statement) @import
    (import_from_statement) @import
  `,
  go: `
    (type_declaration) @type
    (function_declaration) @func
    (method_declaration) @method
    (import_declaration) @import
  `,
  rust: `
    (struct_item) @struct
    (impl_item) @impl
    (function_item) @func
    (trait_item) @trait
    (enum_item) @enum
    (use_declaration) @import
    (type_item) @type
    (mod_item) @mod
  `,
  java: `
    (class_declaration) @class
    (interface_declaration) @iface
    (enum_declaration) @enum
    (record_declaration) @class
    (annotation_type_declaration) @iface
    (method_declaration) @method
    (constructor_declaration) @method
    (import_declaration) @import
  `,
  kotlin: `
    (class_declaration) @class
    (object_declaration) @class
    (function_declaration) @func
    (type_alias) @type
    (import) @import
  `,
};

/**
 * Score mapping from capture names to break point scores.
 * Aligned with the markdown BREAK_PATTERNS scale (h1=100, h2=90, etc.)
 * so findBestCutoff() decay works unchanged.
 */
const SCORE_MAP = {
  class: 100,
  iface: 100,
  struct: 100,
  trait: 100,
  impl: 100,
  mod: 100,
  export: 90,
  func: 90,
  method: 90,
  decorated: 90,
  type: 80,
  enum: 80,
  import: 60,
};

/** Score used for a capture name that has no SCORE_MAP entry. */
const DEFAULT_CAPTURE_SCORE = 20;

// =============================================================================
// Parser Caching & Initialization
// =============================================================================

let ParserClass = null;
let LanguageClass = null;
let QueryClass = null;
let initPromise = null;

/** Languages that have already failed to load — warn only once per process. */
const failedLanguages = new Set();

/** Cached grammar load promises, keyed by wasm filename. */
const grammarCache = new Map();

/** Cached compiled queries per language. */
const queryCache = new Map();

/**
 * Initialize web-tree-sitter. Called once and cached; the same promise
 * (resolved or rejected) is returned to every caller.
 *
 * @returns {Promise<void>}
 */
async function ensureInit() {
  if (!initPromise) {
    initPromise = (async () => {
      const mod = await import("web-tree-sitter");
      ParserClass = mod.Parser;
      LanguageClass = mod.Language;
      QueryClass = mod.Query;
      await ParserClass.init();
    })();
  }
  return initPromise;
}

/**
 * Resolve the filesystem path to a grammar .wasm file.
 * Uses createRequire to resolve from installed dependency packages.
 *
 * @param {string} language - A key of GRAMMAR_MAP.
 * @returns {string} Resolved path to the .wasm file.
 * @throws If the grammar package cannot be resolved.
 */
function resolveGrammarPath(language) {
  const { pkg, wasm } = GRAMMAR_MAP[language];
  const require = createRequire(import.meta.url);
  return require.resolve(`${pkg}/${wasm}`);
}

/**
 * Load and cache a grammar for the given language.
 * Returns null on failure (logs once per language).
 *
 * @param {string} language - A key of GRAMMAR_MAP.
 * @returns {Promise<object | null>} The loaded grammar, or null.
 */
async function loadGrammar(language) {
  if (failedLanguages.has(language)) return null;

  const wasmKey = GRAMMAR_MAP[language].wasm;
  let pending = grammarCache.get(wasmKey);
  if (!pending) {
    // Wrapped in an async function so a synchronous resolve failure
    // surfaces as a rejection handled by the catch below.
    pending = (async () => LanguageClass.load(resolveGrammarPath(language)))();
    grammarCache.set(wasmKey, pending);
  }

  try {
    return await pending;
  } catch (err) {
    // Only evict the entry that actually failed: languages sharing a wasm
    // file may have installed a fresh attempt in the meantime.
    if (grammarCache.get(wasmKey) === pending) grammarCache.delete(wasmKey);
    // Concurrent callers awaiting the same rejected promise all land here;
    // the guard keeps the warning to once per language.
    if (!failedLanguages.has(language)) {
      failedLanguages.add(language);
      console.warn(`[qmd] Failed to load tree-sitter grammar for ${language}: ${err}`);
    }
    return null;
  }
}

/**
 * Get or create a compiled query for the given language.
 *
 * @param {string} language - A key of LANGUAGE_QUERIES.
 * @param {object} grammar - Grammar returned by loadGrammar().
 * @returns {object} The compiled (and cached) query.
 * @throws If the query fails to compile.
 */
function getQuery(language, grammar) {
  if (!queryCache.has(language)) {
    const source = LANGUAGE_QUERIES[language];
    const query = new QueryClass(grammar, source);
    queryCache.set(language, query);
  }
  return queryCache.get(language);
}

/**
 * Parse `content`, run the language query, and hand the captures to `visit`.
 *
 * The parser and tree are always released, even when query compilation,
 * capture extraction, or `visit` throws. `visit` must copy whatever it
 * needs out of the capture nodes before returning, because the tree is
 * deleted right after.
 *
 * @param {string} content - Source text to parse.
 * @param {string} language - Supported language key.
 * @param {(captures: Array<{name: string, node: object}>) => T} visit
 * @returns {Promise<T | null>} Result of `visit`, or null when the grammar
 *   is unavailable or the parser produced no tree.
 * @template T
 */
async function withCaptures(content, language, visit) {
  await ensureInit();
  const grammar = await loadGrammar(language);
  if (!grammar) return null;

  const parser = new ParserClass();
  let tree = null;
  try {
    parser.setLanguage(grammar);
    tree = parser.parse(content);
    if (!tree) return null;
    const query = getQuery(language, grammar);
    return visit(query.captures(tree.rootNode));
  } finally {
    tree?.delete();
    parser.delete();
  }
}

// =============================================================================
// AST Break Point Extraction
// =============================================================================

/**
 * Parse a source file and return break points at AST node boundaries.
 *
 * Returns an empty array for unsupported languages, parse failures,
 * or grammar loading failures. Never throws.
 *
 * @param {string} content - The file content to parse.
 * @param {string} filepath - The file path (used for language detection).
 * @returns {Promise<Array<{pos: number, score: number, type: string}>>}
 *   Break points sorted by `pos`, suitable for merging with regex break points.
 */
export async function getASTBreakPoints(content, filepath) {
  const language = detectLanguage(filepath);
  if (!language) return [];

  try {
    const breakPoints = await withCaptures(content, language, (captures) => {
      // Deduplicate: when several captures start at the same offset
      // (e.g. a wrapper and the declaration it wraps), keep only the
      // highest-scoring one. Captures at different offsets are all kept.
      const bestByPos = new Map();
      for (const { name, node } of captures) {
        const pos = node.startIndex;
        const score = SCORE_MAP[name] ?? DEFAULT_CAPTURE_SCORE;
        const existing = bestByPos.get(pos);
        if (!existing || score > existing.score) {
          bestByPos.set(pos, { pos, score, type: `ast:${name}` });
        }
      }
      return [...bestByPos.values()].sort((a, b) => a.pos - b.pos);
    });
    return breakPoints ?? [];
  } catch (err) {
    console.warn(`[qmd] AST parse failed for ${filepath}, falling back to regex: ${err instanceof Error ? err.message : err}`);
    return [];
  }
}

// =============================================================================
// Health / Status
// =============================================================================

/**
 * Check which tree-sitter grammars are available.
 * Returns a status object for each supported language.
 *
 * @returns {Promise<{available: boolean, languages: Array<{language: string, available: boolean, error?: string}>}>}
 *   `available` is true when at least one language loaded and its query compiled.
 */
export async function getASTStatus() {
  const supported = Object.keys(GRAMMAR_MAP);

  try {
    await ensureInit();
  } catch (err) {
    const error = `web-tree-sitter init failed: ${err instanceof Error ? err.message : err}`;
    return {
      available: false,
      languages: supported.map((language) => ({ language, available: false, error })),
    };
  }

  const languages = [];
  for (const lang of supported) {
    try {
      const grammar = await loadGrammar(lang);
      if (grammar) {
        // Also verify the query compiles
        getQuery(lang, grammar);
        languages.push({ language: lang, available: true });
      } else {
        languages.push({ language: lang, available: false, error: "grammar failed to load" });
      }
    } catch (err) {
      languages.push({
        language: lang,
        available: false,
        error: err instanceof Error ? err.message : String(err),
      });
    }
  }

  return {
    available: languages.some((l) => l.available),
    languages,
  };
}

// =============================================================================
// Function Range Extraction
// =============================================================================

/**
 * Capture names that denote a "function-like unit" — a chunk worth of
 * code that should stay together. Shared across all languages because
 * capture-name semantics (@class, @func, @method, @iface, etc.) are
 * normalized in `LANGUAGE_QUERIES`. Captures with names not in this
 * set (e.g. `import`) are ignored — they belong to the inter-range
 * gaps (char-chunked) instead.
 *
 * Language-agnostic by design so that `getASTFunctionRanges` works for
 * any current-or-future supported language without requiring a
 * per-language table edit.
 */
const FUNCTION_CAPTURE_NAMES = new Set([
  "export", // TS/JS: export_statement wrapping a decl — preserves outer start
  "class", // TS/JS/Py/Java/Kotlin/etc.
  "iface", // TS: interface_declaration; Java: interface / annotation_type_declaration
  "func", // function_declaration + arrow/function-expression lexical_declaration
  "method", // method_definition / method_declaration / constructor_declaration
  "type", // TS: type_alias_declaration; Go: type_declaration; Rust: type_item; Kotlin: type_alias
  "enum", // TS/Rust/Java: enum declarations
  "decorated", // Python: decorated_definition — preserves decorators
  "struct", // Rust
  "impl", // Rust
  "trait", // Rust
  "mod", // Rust
]);

/** Node types that extractNodeName() unwraps when found inside a wrapper node. */
const WRAPPED_DECLARATION_TYPES = new Set([
  "class_declaration",
  "function_declaration",
  "interface_declaration",
  "type_alias_declaration",
  "enum_declaration",
  "lexical_declaration",
  "function_definition",
  "class_definition",
]);

/**
 * Try to pull a human-readable name out of an AST node. Best-effort —
 * returns `undefined` when the node shape doesn't expose a simple name
 * child. Used for debugging / display and not for correctness.
 *
 * @param {object} node - A tree-sitter syntax node.
 * @returns {string | undefined} The name text, if one was found.
 */
function extractNodeName(node) {
  // Common shape: `(function_declaration name: (identifier))` etc.
  const nameChild = node.childForFieldName?.("name");
  if (nameChild?.text) return nameChild.text;

  // TS lexical_declaration: `const foo = () => ...` — first declarator's identifier.
  const declarator = node.namedChildren?.find((c) => c?.type === "variable_declarator");
  if (declarator) {
    const id = declarator.childForFieldName?.("name");
    if (id?.text) return id.text;
  }

  // export_statement / decorated_definition — recurse into the wrapped decl.
  const inner = node.namedChildren?.find((c) => c != null && WRAPPED_DECLARATION_TYPES.has(c.type));
  if (inner) return extractNodeName(inner);

  return undefined;
}

/**
 * Deduplicate overlapping ranges produced by the same AST pass.
 *
 * Tree-sitter emits multiple captures for the same region — e.g. an
 * `export class Foo {}` matches both `export` and `class`. We want ONE
 * range per region, preferring the outermost (earliest startIndex, largest
 * endIndex). When two captures start at the same position we keep the
 * one with the larger end (typically the wrapper — export/decorated).
 *
 * After this pass no two ranges overlap (strictly: for any a, b either
 * a.endIndex <= b.startIndex or b.endIndex <= a.startIndex).
 *
 * @param {Array<{startIndex: number, endIndex: number}>} ranges - Not mutated.
 * @returns {Array<object>} Non-overlapping ranges sorted by startIndex.
 *   The input array itself is returned when it is empty.
 */
function dedupeFunctionRanges(ranges) {
  if (ranges.length === 0) return ranges;

  const sorted = [...ranges].sort((a, b) => {
    if (a.startIndex !== b.startIndex) return a.startIndex - b.startIndex;
    return b.endIndex - a.endIndex; // larger wrapper wins at same start
  });

  const result = [];
  for (const r of sorted) {
    const last = result[result.length - 1];
    if (last && r.startIndex < last.endIndex) {
      // r is contained in or overlaps last — last is the outer/earlier range; drop r.
      continue;
    }
    result.push(r);
  }
  return result;
}

/**
 * Parse a source file and return offset ranges for every outermost
 * code unit that should be its own chunk under the `"function"` chunk
 * strategy.
 *
 * Returns an empty array for unsupported languages, parse failures, or
 * grammar loading failures. Never throws. Shares the grammar and query
 * caches with `getASTBreakPoints`.
 *
 * @param {string} content - The file content to parse.
 * @param {string} filepath - The file path (used for language detection).
 * @returns {Promise<Array<{startIndex: number, endIndex: number, type: string, name: (string | undefined)}>>}
 *   Non-overlapping ranges, sorted by startIndex.
 */
export async function getASTFunctionRanges(content, filepath) {
  const language = detectLanguage(filepath);
  if (!language) return [];

  try {
    const ranges = await withCaptures(content, language, (captures) => {
      const collected = [];
      for (const { name, node } of captures) {
        if (!FUNCTION_CAPTURE_NAMES.has(name)) continue;
        collected.push({
          startIndex: node.startIndex,
          endIndex: node.endIndex,
          type: `ast:${name}`,
          name: extractNodeName(node),
        });
      }
      return collected;
    });
    return ranges ? dedupeFunctionRanges(ranges) : [];
  } catch (err) {
    console.warn(`[qmd] AST function-range extraction failed for ${filepath}, returning empty: ${err instanceof Error ? err.message : err}`);
    return [];
  }
}

/**
 * Extract symbol metadata for code within a byte range.
 * Stubbed for Phase 2 — returns empty array.
 *
 * @returns {Array} Always an empty array for now.
 */
export function extractSymbols(_content, _language, _startPos, _endPos) {
  return [];
}