|
|
@@ -31,7 +31,15 @@ type QueryType = import("web-tree-sitter").Query;
|
|
|
// Language Detection
|
|
|
// =============================================================================
|
|
|
|
|
|
-export type SupportedLanguage = "typescript" | "tsx" | "javascript" | "python" | "go" | "rust";
|
|
|
/**
 * Identifiers of the languages this module can parse. Each member is used
 * as a key of the `Record<SupportedLanguage, ...>` tables in this file
 * (`GRAMMAR_MAP`, `LANGUAGE_QUERIES`), so adding a member here requires an
 * entry in each of those tables.
 */
export type SupportedLanguage =
  // TypeScript / JavaScript family
  | "typescript" | "tsx" | "javascript"
  // Other languages
  | "python" | "go" | "rust"
  // JVM languages
  | "java" | "kotlin";
|
|
|
|
|
|
const EXTENSION_MAP: Record<string, SupportedLanguage> = {
|
|
|
".ts": "typescript",
|
|
|
@@ -45,6 +53,9 @@ const EXTENSION_MAP: Record<string, SupportedLanguage> = {
|
|
|
".py": "python",
|
|
|
".go": "go",
|
|
|
".rs": "rust",
|
|
|
+ ".java": "java",
|
|
|
+ ".kt": "kotlin",
|
|
|
+ ".kts": "kotlin",
|
|
|
};
|
|
|
|
|
|
/**
|
|
|
@@ -70,6 +81,8 @@ const GRAMMAR_MAP: Record<SupportedLanguage, { pkg: string; wasm: string }> = {
|
|
|
python: { pkg: "tree-sitter-python", wasm: "tree-sitter-python.wasm" },
|
|
|
go: { pkg: "tree-sitter-go", wasm: "tree-sitter-go.wasm" },
|
|
|
rust: { pkg: "tree-sitter-rust", wasm: "tree-sitter-rust.wasm" },
|
|
|
+ java: { pkg: "tree-sitter-java", wasm: "tree-sitter-java.wasm" },
|
|
|
+ kotlin: { pkg: "@tree-sitter-grammars/tree-sitter-kotlin", wasm: "tree-sitter-kotlin.wasm" },
|
|
|
};
|
|
|
|
|
|
// =============================================================================
|
|
|
@@ -141,6 +154,23 @@ const LANGUAGE_QUERIES: Record<SupportedLanguage, string> = {
|
|
|
(type_item) @type
|
|
|
(mod_item) @mod
|
|
|
`,
|
|
|
+ java: `
|
|
|
+ (class_declaration) @class
|
|
|
+ (interface_declaration) @iface
|
|
|
+ (enum_declaration) @enum
|
|
|
+ (record_declaration) @class
|
|
|
+ (annotation_type_declaration) @iface
|
|
|
+ (method_declaration) @method
|
|
|
+ (constructor_declaration) @method
|
|
|
+ (import_declaration) @import
|
|
|
+ `,
|
|
|
+ kotlin: `
|
|
|
+ (class_declaration) @class
|
|
|
+ (object_declaration) @class
|
|
|
+ (function_declaration) @func
|
|
|
+ (type_alias) @type
|
|
|
+ (import) @import
|
|
|
+ `,
|
|
|
};
|
|
|
|
|
|
/**
|
|
|
@@ -362,6 +392,176 @@ export async function getASTStatus(): Promise<{
|
|
|
};
|
|
|
}
|
|
|
|
|
|
+// =============================================================================
|
|
|
+// Function-Level Range Extraction (Phase 2)
|
|
|
+// =============================================================================
|
|
|
+
|
|
|
/**
 * Half-open offset range `[startIndex, endIndex)` delimiting one
 * self-contained code unit (function, method, class, interface, struct,
 * impl, trait, type alias, ...).
 *
 * The `"function"` chunk strategy in store.ts emits one chunk per range
 * instead of cutting the file into character windows.
 *
 * NOTE(review): the offsets are copied straight from tree-sitter's
 * `node.startIndex` / `node.endIndex`. They are described as byte offsets
 * here; confirm whether the binding in use reports UTF-8 bytes or JS string
 * indices before slicing `content` with them.
 */
export interface FunctionRange {
  /** Offset of the first byte of the unit (inclusive). */
  startIndex: number;

  /** Offset one past the last byte of the unit (exclusive). */
  endIndex: number;

  /** Tree-sitter capture name with an `ast:` prefix, e.g. `"ast:class"`. */
  type: string;

  /** Symbol name when one could be extracted; best-effort, may be absent. */
  name?: string;
}
|
|
|
+
|
|
|
/**
 * Query capture names that mark a "function-like unit": a span of code that
 * should be kept together as one chunk.
 *
 * One set serves every language because `LANGUAGE_QUERIES` normalizes its
 * capture names (@class, @func, @method, @iface, ...). A capture whose name
 * is absent from this set (for example `import`) does not produce a range;
 * that text falls into the gaps between ranges, which are char-chunked.
 *
 * Keeping the set language-agnostic means a new `SupportedLanguage` works
 * with `getASTFunctionRanges` as long as its query reuses these capture
 * names — no per-language table has to be edited.
 */
const FUNCTION_CAPTURE_NAMES: ReadonlySet<string> = new Set<string>([
  // TS/JS: export_statement wrapping a declaration — keeps the outer start.
  "export",
  // TS/JS/Py/Java/Kotlin/etc.: class-like declarations.
  "class",
  // TS: interface_declaration; Java: interface / annotation_type_declaration.
  "iface",
  // function_declaration, plus arrow/function-expression lexical_declaration.
  "func",
  // method_definition / method_declaration / constructor_declaration.
  "method",
  // TS: type_alias_declaration; Go: type_declaration; Rust: type_item; Kotlin: type_alias.
  "type",
  // TS/Rust/Java: enum declarations.
  "enum",
  // Python: decorated_definition — keeps the decorators with the definition.
  "decorated",
  // Rust-only item kinds.
  "struct",
  "impl",
  "trait",
  "mod",
]);
|
|
|
+
|
|
|
/**
 * Node types that a wrapper node (`export_statement`, `decorated_definition`)
 * may contain and that are worth descending into when looking for a name.
 * Used only as a fallback when the wrapper exposes no `declaration` /
 * `definition` field.
 */
const WRAPPED_DECLARATION_TYPES: ReadonlySet<string> = new Set([
  "class_declaration",
  "function_declaration",
  "interface_declaration",
  "type_alias_declaration",
  "enum_declaration",
  "lexical_declaration",
  "function_definition",
  "class_definition",
]);

/**
 * Try to pull a human-readable name out of an AST node. Best-effort: the
 * result is meant for debugging / display and not for correctness.
 *
 * Lookup order:
 *  1. the node's own `name` field;
 *  2. the `name` field of the first `variable_declarator` child
 *     (`const foo = () => ...`);
 *  3. for wrapper nodes, the name of the wrapped declaration — found via the
 *     wrapper's `declaration` / `definition` field, or failing that via the
 *     first named child whose type is in `WRAPPED_DECLARATION_TYPES`.
 *
 * Following the field first means wrapped node types that are missing from
 * the fallback list (e.g. an exported abstract class or generator function)
 * still yield a name.
 *
 * @param node - The captured AST node to inspect.
 * @returns The symbol name, or `undefined` when none of the lookups above
 *          produces a non-empty name.
 */
function extractNodeName(node: import("web-tree-sitter").Node): string | undefined {
  // Common shape: `(function_declaration name: (identifier))` etc.
  const ownName = node.childForFieldName?.("name")?.text;
  if (ownName) return ownName;

  const namedChildren = node.namedChildren ?? [];

  // TS lexical_declaration: `const foo = () => ...` — first declarator's identifier.
  const declarator = namedChildren.find(child => child?.type === "variable_declarator");
  const declaredName = declarator?.childForFieldName?.("name")?.text;
  if (declaredName) return declaredName;

  // export_statement / decorated_definition — recurse into the wrapped decl.
  // Prefer the grammar's field; the type list is the original fallback.
  const wrapped =
    node.childForFieldName?.("declaration") ??
    node.childForFieldName?.("definition") ??
    namedChildren.find(child => child != null && WRAPPED_DECLARATION_TYPES.has(child.type));

  return wrapped ? extractNodeName(wrapped) : undefined;
}
|
|
|
+
|
|
|
/**
 * Collapse overlapping ranges from one AST pass into a list of disjoint
 * ranges, keeping the outermost range of every overlapping group.
 *
 * A single region often yields several captures — `export class Foo {}`
 * is captured both as `export` and as `class`. Candidates are ordered by
 * ascending `startIndex`, and for equal starts by descending `endIndex`, so
 * the widest range (typically the export/decorated wrapper) is seen first.
 * Any later candidate that begins before the end of the last kept range is
 * discarded.
 *
 * Postcondition: for any two returned ranges a and b, either
 * `a.endIndex <= b.startIndex` or `b.endIndex <= a.startIndex`.
 *
 * @param ranges - Candidate ranges in any order; the array is not mutated.
 * @returns The surviving ranges sorted by `startIndex`. An empty input is
 *          returned as-is (same array instance).
 */
function dedupeFunctionRanges(ranges: FunctionRange[]): FunctionRange[] {
  if (ranges.length === 0) return ranges;

  // Outer-first ordering: earlier start wins, wider range wins on ties.
  const ordered = ranges.slice().sort((a, b) =>
    a.startIndex !== b.startIndex
      ? a.startIndex - b.startIndex
      : b.endIndex - a.endIndex,
  );

  const kept: FunctionRange[] = [];
  // End offset of the most recently kept range; nothing is covered at first.
  let coveredUntil = Number.NEGATIVE_INFINITY;

  for (const candidate of ordered) {
    // Starts inside the previously kept range → nested or overlapping; skip.
    if (candidate.startIndex < coveredUntil) continue;

    kept.push(candidate);
    coveredUntil = candidate.endIndex;
  }

  return kept;
}
|
|
|
+
|
|
|
/**
 * Parse a source file and return offset ranges for every outermost code
 * unit that should become its own chunk under the `"function"` chunk
 * strategy.
 *
 * Steps: detect the language from `filepath`, load its grammar, parse
 * `content`, run the language's query, keep only captures whose name is in
 * `FUNCTION_CAPTURE_NAMES`, and collapse nested/overlapping captures with
 * `dedupeFunctionRanges`.
 *
 * Returns an empty array for unsupported languages, grammar loading
 * failures, or parse failures. Any exception raised along the way is logged
 * with `console.warn` and also turned into an empty array, so this function
 * never rejects.
 *
 * A parser is created for each call; it and the parsed tree are always
 * released via `delete()` — including when the query or capture processing
 * throws — so a failing file does not leak them. The grammar and query come
 * from `loadGrammar` / `getQuery` (presumably cached there — confirm).
 *
 * @param content - The file content to parse.
 * @param filepath - The file path (used for language detection and in the
 *                   warning message).
 * @returns Array of non-overlapping FunctionRange objects, sorted by startIndex.
 */
export async function getASTFunctionRanges(
  content: string,
  filepath: string,
): Promise<FunctionRange[]> {
  const language = detectLanguage(filepath);
  if (!language) return [];

  try {
    await ensureInit();

    const grammar = await loadGrammar(language);
    if (!grammar) return [];

    const parser = new ParserClass!();
    try {
      parser.setLanguage(grammar);

      const tree = parser.parse(content);
      if (!tree) return [];

      try {
        const query = getQuery(language, grammar);

        const ranges: FunctionRange[] = [];
        for (const cap of query.captures(tree.rootNode)) {
          // Non-unit captures (e.g. imports) are left to the gap chunker.
          if (!FUNCTION_CAPTURE_NAMES.has(cap.name)) continue;
          ranges.push({
            startIndex: cap.node.startIndex,
            endIndex: cap.node.endIndex,
            type: `ast:${cap.name}`,
            // Read the name now: nodes must not be touched after tree.delete().
            name: extractNodeName(cap.node),
          });
        }

        // Ranges are plain objects, so deduping does not need the tree.
        return dedupeFunctionRanges(ranges);
      } finally {
        tree.delete();
      }
    } finally {
      parser.delete();
    }
  } catch (err) {
    console.warn(`[qmd] AST function-range extraction failed for ${filepath}, returning empty: ${err instanceof Error ? err.message : err}`);
    return [];
  }
}
|
|
|
+
|
|
|
// =============================================================================
|
|
|
// Symbol Extraction (Phase 2 Stub)
|
|
|
// =============================================================================
|