ast.js 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. /**
  2. * AST-aware chunking support via web-tree-sitter.
  3. *
  4. * Provides language detection, AST break point extraction for supported
  5. * code file types, and a stub for future symbol extraction.
  6. *
  7. * All functions degrade gracefully: parse failures or unsupported languages
  8. * return empty arrays, falling back to regex-only chunking.
  9. *
  10. * ## Dependency Note
  11. *
  12. * Grammar packages (tree-sitter-typescript, etc.) are listed as
  13. * optionalDependencies with pinned versions. They ship native prebuilds
  14. * and source files (~72 MB total) but QMD only uses the .wasm files
  15. * (~5 MB). If install size becomes a concern, the .wasm files can be
  16. * bundled directly in the repo (e.g. assets/grammars/) and resolved
  17. * via import.meta.url instead of require.resolve(), eliminating the
  18. * grammar packages entirely.
  19. */
  20. import { createRequire } from "node:module";
  21. import { extname } from "node:path";
  /**
   * Lookup table from lowercase file extension (leading dot included) to the
   * language identifier used throughout this module.
   *
   * `.jsx` is routed to the "tsx" language rather than "javascript".
   */
  const EXTENSION_MAP = {
    // TypeScript family
    ".ts": "typescript",
    ".mts": "typescript",
    ".cts": "typescript",
    ".tsx": "tsx",
    // JavaScript family
    ".js": "javascript",
    ".mjs": "javascript",
    ".cjs": "javascript",
    ".jsx": "tsx",
    // Other supported languages
    ".py": "python",
    ".go": "go",
    ".rs": "rust",
    ".java": "java",
    ".kt": "kotlin",
    ".kts": "kotlin",
  };

  /**
   * Map a file path to a supported language using only its extension.
   *
   * The extension is compared case-insensitively.
   *
   * @param filepath - Path (or bare file name) of the file to classify.
   * @returns The language identifier, or null when the extension is missing
   *          or not listed in EXTENSION_MAP (e.g. `.md`).
   */
  export function detectLanguage(filepath) {
    const suffix = extname(filepath).toLowerCase();
    const language = EXTENSION_MAP[suffix];
    return language ?? null;
  }
  46. // =============================================================================
  47. // Grammar Resolution
  48. // =============================================================================
  /**
   * For every supported language: the npm package that ships its grammar and
   * the name of the .wasm file inside that package.
   *
   * "javascript" deliberately points at the same wasm file as "typescript".
   * Each row below is [language, package, wasm file]; row order determines the
   * key order of the resulting object.
   */
  const GRAMMAR_MAP = Object.fromEntries(
    [
      ["typescript", "tree-sitter-typescript", "tree-sitter-typescript.wasm"],
      ["tsx", "tree-sitter-typescript", "tree-sitter-tsx.wasm"],
      ["javascript", "tree-sitter-typescript", "tree-sitter-typescript.wasm"],
      ["python", "tree-sitter-python", "tree-sitter-python.wasm"],
      ["go", "tree-sitter-go", "tree-sitter-go.wasm"],
      ["rust", "tree-sitter-rust", "tree-sitter-rust.wasm"],
      ["java", "tree-sitter-java", "tree-sitter-java.wasm"],
      ["kotlin", "@tree-sitter-grammars/tree-sitter-kotlin", "tree-sitter-kotlin.wasm"],
    ].map(([language, pkg, wasm]) => [language, { pkg, wasm }]),
  );
  62. // =============================================================================
  63. // Per-Language Query Definitions
  64. // =============================================================================
  /**
   * Tree-sitter S-expression query source for each supported language.
   *
   * Every pattern ends in a capture name (`@class`, `@func`, ...); the capture
   * name is what SCORE_MAP uses to assign a break point score.
   *
   * For TypeScript/JavaScript, `export_statement` is matched in addition to the
   * bare declarations so that an exported declaration also yields a break point
   * at the position of the `export` keyword; the bare patterns cover
   * non-exported code.
   *
   * Each query is one pattern per line, wrapped in a leading and trailing
   * newline. Pattern order within a language is kept stable on purpose.
   */
  const LANGUAGE_QUERIES = (() => {
    // Join a list of patterns into a single newline-delimited query string.
    const toQuery = (patterns) => `\n${patterns.join("\n")}\n`;

    // TypeScript and TSX use exactly the same set of patterns.
    const typescriptFamily = toQuery([
      "(export_statement) @export",
      "(class_declaration) @class",
      "(function_declaration) @func",
      "(method_definition) @method",
      "(interface_declaration) @iface",
      "(type_alias_declaration) @type",
      "(enum_declaration) @enum",
      "(import_statement) @import",
      "(lexical_declaration (variable_declarator value: (arrow_function))) @func",
      "(lexical_declaration (variable_declarator value: (function_expression))) @func",
    ]);

    return {
      typescript: typescriptFamily,
      tsx: typescriptFamily,
      javascript: toQuery([
        "(export_statement) @export",
        "(class_declaration) @class",
        "(function_declaration) @func",
        "(method_definition) @method",
        "(import_statement) @import",
        "(lexical_declaration (variable_declarator value: (arrow_function))) @func",
        "(lexical_declaration (variable_declarator value: (function_expression))) @func",
      ]),
      python: toQuery([
        "(class_definition) @class",
        "(function_definition) @func",
        "(decorated_definition) @decorated",
        "(import_statement) @import",
        "(import_from_statement) @import",
      ]),
      go: toQuery([
        "(type_declaration) @type",
        "(function_declaration) @func",
        "(method_declaration) @method",
        "(import_declaration) @import",
      ]),
      rust: toQuery([
        "(struct_item) @struct",
        "(impl_item) @impl",
        "(function_item) @func",
        "(trait_item) @trait",
        "(enum_item) @enum",
        "(use_declaration) @import",
        "(type_item) @type",
        "(mod_item) @mod",
      ]),
      java: toQuery([
        "(class_declaration) @class",
        "(interface_declaration) @iface",
        "(enum_declaration) @enum",
        "(record_declaration) @class",
        "(annotation_type_declaration) @iface",
        "(method_declaration) @method",
        "(constructor_declaration) @method",
        "(import_declaration) @import",
      ]),
      kotlin: toQuery([
        "(class_declaration) @class",
        "(object_declaration) @class",
        "(function_declaration) @func",
        "(type_alias) @type",
        "(import) @import",
      ]),
    };
  })();
  /**
   * Break point score for each query capture name.
   *
   * The scale follows the markdown BREAK_PATTERNS scores (h1=100, h2=90, ...)
   * so that findBestCutoff() decay can be applied without changes. Capture
   * names are grouped here by the score tier they share.
   */
  const SCORE_MAP = Object.fromEntries(
    [
      [100, ["class", "iface", "struct", "trait", "impl", "mod"]],
      [90, ["export", "func", "method", "decorated"]],
      [80, ["type", "enum"]],
      [60, ["import"]],
    ].flatMap(([score, captureNames]) => captureNames.map((name) => [name, score])),
  );
  168. // =============================================================================
  169. // Parser Caching & Initialization
  170. // =============================================================================
  // web-tree-sitter exports, populated by ensureInit(); null until then.
  let ParserClass = null;
  let LanguageClass = null;
  let QueryClass = null;
  // The single in-flight/settled initialization promise; null before first use.
  let initPromise = null;
  /** Languages whose grammar has already failed to load. */
  const failedLanguages = new Set();
  /** Grammar load promises, keyed by wasm file name. */
  const grammarCache = new Map();
  /** Compiled queries, keyed by language. */
  const queryCache = new Map();

  /**
   * Load and initialize web-tree-sitter.
   *
   * The work is started on the first call only; every call resolves or rejects
   * together with that first attempt, because the promise is cached whether or
   * not it succeeds.
   *
   * @returns Promise that settles once ParserClass, LanguageClass and
   *          QueryClass are assigned and `Parser.init()` has finished.
   */
  async function ensureInit() {
    if (initPromise) {
      return initPromise;
    }
    const bootstrap = async () => {
      const treeSitter = await import("web-tree-sitter");
      ParserClass = treeSitter.Parser;
      LanguageClass = treeSitter.Language;
      QueryClass = treeSitter.Query;
      await ParserClass.init();
    };
    initPromise = bootstrap();
    return initPromise;
  }
  /**
   * Locate the grammar .wasm file for a language on disk.
   *
   * The file is looked up as `<package>/<wasm file>` through a `require`
   * function created relative to this module, so it is found inside the
   * installed grammar packages.
   *
   * @param language - A key of GRAMMAR_MAP.
   * @returns Absolute path of the resolved .wasm file.
   * @throws If the language is not in GRAMMAR_MAP or the file cannot be resolved.
   */
  function resolveGrammarPath(language) {
    const grammar = GRAMMAR_MAP[language];
    const specifier = `${grammar.pkg}/${grammar.wasm}`;
    return createRequire(import.meta.url).resolve(specifier);
  }
  /**
   * Load the grammar for a language, sharing one load per .wasm file.
   *
   * The cache is keyed by wasm file name, so languages that map to the same
   * file in GRAMMAR_MAP reuse a single load. A language that has failed once
   * is not retried for the rest of the process.
   *
   * Expects ensureInit() to have completed; otherwise LanguageClass is still
   * null and the load is treated as a failure.
   *
   * @param language - A key of GRAMMAR_MAP.
   * @returns The loaded grammar, or null when loading failed. The failure is
   *          logged once per language, even when several callers were waiting
   *          on the same load.
   */
  async function loadGrammar(language) {
    if (failedLanguages.has(language)) {
      return null;
    }
    const wasmKey = GRAMMAR_MAP[language].wasm;
    let pending = grammarCache.get(wasmKey);
    if (!pending) {
      pending = (async () => {
        const path = resolveGrammarPath(language);
        return LanguageClass.load(path);
      })();
      grammarCache.set(wasmKey, pending);
    }
    try {
      return await pending;
    }
    catch (err) {
      // Only evict the promise that actually failed: another language sharing
      // this wasm file may already have stored a fresh attempt.
      if (grammarCache.get(wasmKey) === pending) {
        grammarCache.delete(wasmKey);
      }
      // Concurrent callers all land here for the same rejection; warn once.
      if (!failedLanguages.has(language)) {
        failedLanguages.add(language);
        console.warn(`[qmd] Failed to load tree-sitter grammar for ${language}: ${err}`);
      }
      return null;
    }
  }
  /**
   * Return the compiled query for a language, compiling it on first use.
   *
   * Compiled queries are cached per language; the query source comes from
   * LANGUAGE_QUERIES.
   *
   * @param language - Language whose query is wanted.
   * @param grammar - Loaded grammar the query is compiled against.
   * @returns The cached or newly compiled query.
   * @throws Whatever the Query constructor throws when compilation fails;
   *         nothing is cached in that case.
   */
  function getQuery(language, grammar) {
    if (queryCache.has(language)) {
      return queryCache.get(language);
    }
    const compiled = new QueryClass(grammar, LANGUAGE_QUERIES[language]);
    queryCache.set(language, compiled);
    return queryCache.get(language);
  }
  240. // =============================================================================
  241. // AST Break Point Extraction
  242. // =============================================================================
  /**
   * Parse a source file and return break points at AST node boundaries.
   *
   * Returns an empty array for unsupported languages, parse failures,
   * or grammar loading failures. Never throws: any error (including a
   * filepath that cannot be inspected) is logged and turned into an empty
   * result so callers can fall back to regex-only chunking.
   *
   * @param content - The file content to parse.
   * @param filepath - The file path (used for language detection).
   * @returns Array of `{ pos, score, type }` break points sorted by ascending
   *          `pos`, with at most one entry per position. `type` is the
   *          capture name prefixed with `ast:`.
   */
  export async function getASTBreakPoints(content, filepath) {
    try {
      const language = detectLanguage(filepath);
      if (!language) {
        return [];
      }
      await ensureInit();
      const grammar = await loadGrammar(language);
      if (!grammar) {
        return [];
      }
      const parser = new ParserClass();
      let tree = null;
      try {
        parser.setLanguage(grammar);
        tree = parser.parse(content);
        if (!tree) {
          return [];
        }
        const captures = getQuery(language, grammar).captures(tree.rootNode);
        // Deduplicate: several patterns can capture nodes that start at the
        // same offset. Keep only the highest-scoring capture per offset; on a
        // tie the capture seen first wins. Captures at different offsets
        // (e.g. an export_statement and the declaration it wraps) are all kept.
        const bestByPos = new Map();
        for (const { name, node } of captures) {
          const pos = node.startIndex;
          // Capture names without an explicit score get a low default.
          const score = SCORE_MAP[name] ?? 20;
          const existing = bestByPos.get(pos);
          if (!existing || score > existing.score) {
            bestByPos.set(pos, { pos, score, type: `ast:${name}` });
          }
        }
        return Array.from(bestByPos.values()).sort((a, b) => a.pos - b.pos);
      }
      finally {
        // Always release the tree and parser, including when compiling or
        // running the query throws.
        tree?.delete();
        parser.delete();
      }
    }
    catch (err) {
      console.warn(`[qmd] AST parse failed for ${filepath}, falling back to regex: ${err instanceof Error ? err.message : err}`);
      return [];
    }
  }
  293. // =============================================================================
  294. // Health / Status
  295. // =============================================================================
  /**
   * Report which tree-sitter grammars can actually be used.
   *
   * Each language in GRAMMAR_MAP is probed in turn: its grammar is loaded and
   * its query is compiled. A language counts as available only if both steps
   * succeed. If web-tree-sitter itself cannot be initialized, every language
   * is reported unavailable with the init error.
   *
   * @returns `{ available, languages }` where `languages` holds one
   *          `{ language, available, error? }` entry per supported language
   *          and `available` is true when at least one language is usable.
   */
  export async function getASTStatus() {
    const supported = Object.keys(GRAMMAR_MAP);

    try {
      await ensureInit();
    }
    catch (err) {
      const reason = `web-tree-sitter init failed: ${err instanceof Error ? err.message : err}`;
      return {
        available: false,
        languages: supported.map((language) => ({
          language,
          available: false,
          error: reason,
        })),
      };
    }

    const languages = [];
    for (const language of supported) {
      let entry;
      try {
        const grammar = await loadGrammar(language);
        if (!grammar) {
          entry = { language, available: false, error: "grammar failed to load" };
        }
        else {
          // Compiling the query is part of the check; a failure here is
          // reported through the catch below.
          getQuery(language, grammar);
          entry = { language, available: true };
        }
      }
      catch (err) {
        entry = {
          language,
          available: false,
          error: err instanceof Error ? err.message : String(err),
        };
      }
      languages.push(entry);
    }

    const available = languages.some((entry) => entry.available);
    return { available, languages };
  }
  /**
   * Placeholder for symbol metadata extraction over a range of a file.
   *
   * Not implemented yet (planned for Phase 2): all arguments are ignored and
   * the result is always empty.
   *
   * @param _content - File content (unused).
   * @param _language - Language identifier (unused).
   * @param _startPos - Start of the range (unused).
   * @param _endPos - End of the range (unused).
   * @returns A new empty array.
   */
  export function extractSymbols(_content, _language, _startPos, _endPos) {
    const symbols = [];
    return symbols;
  }