stt.js 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  // ── STT Integration ──

  /**
   * Mutable client-side state for the speech-to-text panel.
   * The option fields are what sttSendSettings() forwards to the server;
   * the remaining fields hold the live connection flag and the transcript.
   */
  const sttState = {
    // Recognition options.
    enabled: false,
    language: "pl",
    timestamps: true,
    diarize: true,
    itn: true,
    detect_emotion: false,

    // Server-side voice activity detection options.
    server_vad: false,
    vad_threshold: 0.3,
    vad_pad_ms: 400,
    vad_min_ms: 100,

    // Live session data.
    connected: false,
    segments: [], // finalized segments, in arrival order
    partialText: "", // text of the current partial (not yet finalized) result
    speakerMap: {}, // diarization label (e.g. SPEAKER_00) -> {name, score}
  };
  /**
   * DOM elements used by the STT panel, looked up once by id.
   * Each entry below is [key on sttEls, element id in the page]; a missing
   * element yields null, exactly as document.getElementById reports it.
   */
  const sttEls = Object.fromEntries(
    [
      ["enabled", "sttEnabled"],
      ["status", "sttStatus"],
      ["controls", "sttControls"],
      ["language", "sttLanguage"],
      ["timestamps", "sttTimestamps"],
      ["diarize", "sttDiarize"],
      ["itn", "sttItn"],
      ["emotion", "sttEmotion"],
      ["vad", "sttVad"],
      ["vadControls", "sttVadControls"],
      ["vadThreshold", "sttVadThreshold"],
      ["vadThresholdValue", "sttVadThresholdValue"],
      ["vadPadMs", "sttVadPadMs"],
      ["vadPadMsValue", "sttVadPadMsValue"],
      ["output", "sttOutput"],
    ].map(([key, elementId]) => [key, document.getElementById(elementId)])
  );
  /**
   * Sends the current STT options to the server as one "client_message"
   * socket event of type "stt_settings". Every option is copied from
   * sttState under the same name with an "stt_" prefix.
   */
  function sttSendSettings() {
    const {
      enabled,
      language,
      timestamps,
      diarize,
      itn,
      detect_emotion,
      server_vad,
      vad_threshold,
      vad_pad_ms,
      vad_min_ms,
    } = sttState;

    const payload = {
      type: "stt_settings",
      stt_enabled: enabled,
      stt_language: language,
      stt_timestamps: timestamps,
      stt_diarize: diarize,
      stt_itn: itn,
      stt_detect_emotion: detect_emotion,
      stt_server_vad: server_vad,
      stt_vad_threshold: vad_threshold,
      stt_vad_pad_ms: vad_pad_ms,
      stt_vad_min_ms: vad_min_ms,
    };

    socket.emit("client_message", payload);
  }
  /**
   * Formats a duration in seconds as zero-padded "MM:SS".
   * Minutes are not wrapped into hours, so 3600 becomes "60:00".
   *
   * @param {number} seconds - Offset or duration in seconds.
   * @returns {string} The formatted time, or "" for any falsy input other than 0.
   */
  function sttFormatTime(seconds) {
    if (seconds !== 0 && !seconds) {
      return "";
    }
    const wholeMinutes = Math.floor(seconds / 60);
    const wholeSeconds = Math.floor(seconds % 60);
    return [wholeMinutes, wholeSeconds]
      .map((part) => String(part).padStart(2, "0"))
      .join(":");
  }
  /**
   * Picks one of eight CSS colour classes for a speaker label.
   * The first run of digits in the label, modulo 8, selects the class;
   * labels without digits fall back to index 0.
   *
   * @param {string} speakerId - Speaker label such as "SPEAKER_03".
   * @returns {string} "stt-speaker-N" with N in 0..7, or "" for an empty label.
   */
  function sttSpeakerColor(speakerId) {
    if (!speakerId) {
      return "";
    }
    const digits = speakerId.match(/(\d+)/);
    let paletteIndex = 0;
    if (digits) {
      paletteIndex = parseInt(digits[1], 10) % 8;
    }
    return `stt-speaker-${paletteIndex}`;
  }
  /**
   * Resolves the display name for a speaker label.
   * Uses the name stored in sttState.speakerMap when one exists and is not
   * the placeholder "Unknown"; otherwise rewrites the "SPEAKER_" prefix of
   * the label to "Mowca ".
   *
   * @param {string} speakerId - Speaker label such as "SPEAKER_00".
   * @returns {string} The display name, or "" for an empty label.
   */
  function sttSpeakerName(speakerId) {
    if (!speakerId) {
      return "";
    }
    const entry = sttState.speakerMap[speakerId];
    const hasKnownName = Boolean(entry && entry.name && entry.name !== "Unknown");
    return hasKnownName ? entry.name : speakerId.replace("SPEAKER_", "Mowca ");
  }
  /**
   * Maps a recognition confidence to a CSS class.
   *
   * @param {number} conf - Confidence score; thresholds are 0.85 and 0.6.
   * @returns {string} "conf-high" (>= 0.85), "conf-mid" (>= 0.6) or "conf-low"
   *   (anything else, including values that are not comparable numbers).
   */
  function sttConfidenceClass(conf) {
    const atLeastMid = conf >= 0.6;
    if (!atLeastMid) {
      return "conf-low";
    }
    return conf >= 0.85 ? "conf-high" : "conf-mid";
  }
  /**
   * Escapes a value so it can be interpolated into HTML text or a quoted
   * attribute without being parsed as markup.
   *
   * @param {*} value - Any value; it is converted with String() first.
   * @returns {string} The value with &, <, >, " and ' replaced by entities.
   */
  function sttEscapeHtml(value) {
    const entities = {
      "&": "&amp;",
      "<": "&lt;",
      ">": "&gt;",
      '"': "&quot;",
      "'": "&#39;",
    };
    return String(value).replace(/[&<>"']/g, function(ch) {
      return entities[ch];
    });
  }

  /**
   * Builds the DOM node for one transcript segment.
   *
   * Layout, in order: optional "[MM:SS]" timestamp from seg.duration; then
   * either one row per diarized speaker (when seg.speakers is non-empty and
   * diarization is on), or per-word spans coloured by confidence, or the
   * plain seg.text; then a confidence badge (final segments only) and the
   * emotion label (when emotion detection is on).
   *
   * All transcript-derived strings are HTML-escaped before being placed in
   * innerHTML, because they originate from the server and must not be
   * interpreted as markup.
   *
   * @param {object} seg - Segment message; fields read here are duration,
   *   speakers, words, text, confidence and emotion.
   * @param {boolean} isPartial - True for the in-progress partial result.
   * @returns {HTMLDivElement} A div with class "stt-segment" (plus "partial").
   */
  function sttRenderSegment(seg, isPartial) {
    const div = document.createElement("div");
    div.className = "stt-segment" + (isPartial ? " partial" : "");
    let html = "";

    // Timestamp
    if (seg.duration && sttState.timestamps) {
      html += '<span class="stt-timestamp">[' + sttEscapeHtml(sttFormatTime(seg.duration)) + ']</span>';
    }

    if (seg.speakers && seg.speakers.length > 0 && sttState.diarize) {
      // Speaker segments with diarization
      seg.speakers.forEach(function(sp) {
        const spName = sttEscapeHtml(sttSpeakerName(sp.speaker));
        const colorClass = sttEscapeHtml(sttSpeakerColor(sp.speaker));
        html += '<div style="margin: 0.2rem 0;">';
        html += '<span class="stt-speaker ' + colorClass + '">' + spName + '</span>';
        if (sp.start !== undefined && sttState.timestamps) {
          html += '<span class="stt-timestamp">' + sttEscapeHtml(sttFormatTime(sp.start)) +
            '-' + sttEscapeHtml(sttFormatTime(sp.end)) + '</span>';
        }
        html += '<span>' + sttEscapeHtml(sp.text || "") + '</span>';
        html += '</div>';
      });
    } else if (seg.words && seg.words.length > 0) {
      // Words with confidence
      seg.words.forEach(function(w) {
        // Only a missing confidence defaults to 1.0; a reported 0 must stay 0
        // so the word is shown as low confidence.
        const cls = sttConfidenceClass(w.confidence ?? 1.0);
        html += '<span class="stt-word ' + cls + '">' + sttEscapeHtml(w.word) + '</span> ';
      });
    } else {
      html += '<span>' + sttEscapeHtml(seg.text || "") + '</span>';
    }

    // Confidence badge
    if (seg.confidence && !isPartial) {
      html += '<span class="stt-confidence">' + Math.round(seg.confidence * 100) + '%</span>';
    }

    // Emotion
    if (seg.emotion && sttState.detect_emotion) {
      html += '<span class="stt-emotion">' + sttEscapeHtml(seg.emotion) + '</span>';
    }

    div.innerHTML = html;
    return div;
  }
  /**
   * Redraws the whole transcript into sttEls.output: clears it, appends one
   * node per finalized segment, then a trailing partial node if there is
   * partial text, and finally scrolls the container to the bottom.
   */
  function sttRenderAll() {
    const container = sttEls.output;
    container.innerHTML = "";

    for (const finalized of sttState.segments) {
      container.appendChild(sttRenderSegment(finalized, false));
    }

    const pendingText = sttState.partialText;
    if (pendingText) {
      container.appendChild(sttRenderSegment({ text: pendingText }, true));
    }

    // Keep the newest line in view.
    container.scrollTop = container.scrollHeight;
  }
  /**
   * Dispatches one incoming STT message by its `type`:
   *  - "stt_status":     stores the connection flag and refreshes the status badge;
   *  - "partial":        replaces the partial text and redraws;
   *  - "final":          clears the partial text, stores the message as a
   *                      finalized segment when it has text, merges any
   *                      speaker_map, and redraws;
   *  - "speaker_update": merges speaker_map (if present) and redraws.
   * Messages without a type, or with any other type, are ignored.
   *
   * @param {object} msg - Message object received on the "stt_message" event.
   */
  function sttHandleMessage(msg) {
    if (!msg || !msg.type) {
      return;
    }

    switch (msg.type) {
      case "stt_status":
        sttState.connected = Boolean(msg.connected);
        sttUpdateStatus();
        break;

      case "partial":
        sttState.partialText = msg.text || "";
        sttRenderAll();
        break;

      case "final":
        sttState.partialText = "";
        if (msg.text) {
          sttState.segments.push(msg);
        }
        if (msg.speaker_map) {
          Object.assign(sttState.speakerMap, msg.speaker_map);
        }
        sttRenderAll();
        break;

      case "speaker_update":
        if (msg.speaker_map) {
          Object.assign(sttState.speakerMap, msg.speaker_map);
          sttRenderAll();
        }
        break;

      default:
        break;
    }
  }
  /**
   * Updates the status badge text and background from sttState:
   * disabled, connected, or (enabled but not yet connected) connecting.
   */
  function sttUpdateStatus() {
    let label;
    let background;

    if (!sttState.enabled) {
      label = "STT: wylaczone";
      background = "";
    } else if (sttState.connected) {
      label = "STT: polaczone";
      background = "rgba(81, 207, 102, 0.25)";
    } else {
      label = "STT: laczenie...";
      background = "rgba(255, 210, 0, 0.2)";
    }

    sttEls.status.textContent = label;
    sttEls.status.style.background = background;
  }
  /**
   * Shows or hides the option panels via the "hidden" class: the main STT
   * controls are hidden while STT is disabled, and the VAD controls are
   * hidden while server-side VAD is off.
   */
  function sttRefreshControls() {
    const sttOff = !sttState.enabled;
    sttEls.controls.classList.toggle("hidden", sttOff);

    const vadOff = !sttState.server_vad;
    sttEls.vadControls.classList.toggle("hidden", vadOff);
  }
  /**
   * Attaches the event listeners for every STT control.
   *
   * Each handler copies the control's value into sttState and sends the
   * updated settings to the server. Additional effects:
   *  - disabling STT also resets the connection flag, transcript and
   *    speaker map, then redraws;
   *  - changing the language clears the transcript and redraws;
   *  - the timestamps, diarization and emotion toggles redraw the
   *    transcript, because sttRenderSegment reads those flags and the
   *    already-displayed segments would otherwise stay stale until the
   *    next server message;
   *  - the VAD sliders update their numeric label on "input" and only send
   *    settings on "change" (i.e. when the user commits the value).
   */
  function sttBindControls() {
    sttEls.enabled.addEventListener("change", function() {
      sttState.enabled = sttEls.enabled.checked;
      if (!sttState.enabled) {
        sttState.connected = false;
        sttState.segments = [];
        sttState.partialText = "";
        sttState.speakerMap = {};
        sttRenderAll();
      }
      sttRefreshControls();
      sttUpdateStatus();
      sttSendSettings();
    });

    sttEls.language.addEventListener("change", function() {
      sttState.language = sttEls.language.value;
      // Clear transcript on language change
      sttState.segments = [];
      sttState.partialText = "";
      sttRenderAll();
      sttSendSettings();
    });

    // Display-affecting toggles: redraw so existing segments reflect the flag.
    sttEls.timestamps.addEventListener("change", function() {
      sttState.timestamps = sttEls.timestamps.checked;
      sttRenderAll();
      sttSendSettings();
    });
    sttEls.diarize.addEventListener("change", function() {
      sttState.diarize = sttEls.diarize.checked;
      sttRenderAll();
      sttSendSettings();
    });
    sttEls.emotion.addEventListener("change", function() {
      sttState.detect_emotion = sttEls.emotion.checked;
      sttRenderAll();
      sttSendSettings();
    });

    // This flag is not read by the renderer, so no redraw is needed.
    sttEls.itn.addEventListener("change", function() {
      sttState.itn = sttEls.itn.checked;
      sttSendSettings();
    });

    sttEls.vad.addEventListener("change", function() {
      sttState.server_vad = sttEls.vad.checked;
      sttRefreshControls();
      sttSendSettings();
    });

    sttEls.vadThreshold.addEventListener("input", function() {
      sttState.vad_threshold = parseFloat(sttEls.vadThreshold.value);
      sttEls.vadThresholdValue.textContent = sttState.vad_threshold.toFixed(2);
    });
    sttEls.vadThreshold.addEventListener("change", sttSendSettings);

    sttEls.vadPadMs.addEventListener("input", function() {
      sttState.vad_pad_ms = parseInt(sttEls.vadPadMs.value, 10);
      sttEls.vadPadMsValue.textContent = String(sttState.vad_pad_ms);
    });
    sttEls.vadPadMs.addEventListener("change", sttSendSettings);
  }
  /**
   * Applies server-reported STT settings to sttState and mirrors the
   * resulting state into the form controls.
   *
   * Only keys that are present (not undefined) in `data` overwrite state;
   * booleans go through Boolean(), numeric VAD values through Number(), and
   * the language is stored as received. The controls are then synced from
   * sttState, followed by a panel-visibility and status-badge refresh.
   *
   * @param {object} data - Object that may carry stt_* keys and stt_connected.
   */
  function sttLoadFromStatus(data) {
    const asIs = (value) => value;
    // [key in `data`, field in sttState, coercion applied to the value]
    const fieldMap = [
      ["stt_enabled", "enabled", Boolean],
      ["stt_language", "language", asIs],
      ["stt_timestamps", "timestamps", Boolean],
      ["stt_diarize", "diarize", Boolean],
      ["stt_itn", "itn", Boolean],
      ["stt_detect_emotion", "detect_emotion", Boolean],
      ["stt_server_vad", "server_vad", Boolean],
      ["stt_vad_threshold", "vad_threshold", Number],
      ["stt_vad_pad_ms", "vad_pad_ms", Number],
      ["stt_vad_min_ms", "vad_min_ms", Number],
      ["stt_connected", "connected", Boolean],
    ];
    for (const [wireKey, stateKey, coerce] of fieldMap) {
      const incoming = data[wireKey];
      if (incoming !== undefined) {
        sttState[stateKey] = coerce(incoming);
      }
    }

    // Checkboxes and the language selector.
    sttEls.enabled.checked = sttState.enabled;
    sttEls.language.value = sttState.language;
    sttEls.timestamps.checked = sttState.timestamps;
    sttEls.diarize.checked = sttState.diarize;
    sttEls.itn.checked = sttState.itn;
    sttEls.emotion.checked = sttState.detect_emotion;
    sttEls.vad.checked = sttState.server_vad;

    // Sliders and their numeric labels.
    sttEls.vadThreshold.value = String(sttState.vad_threshold);
    sttEls.vadThresholdValue.textContent = sttState.vad_threshold.toFixed(2);
    const padMsText = String(sttState.vad_pad_ms);
    sttEls.vadPadMs.value = padMsText;
    sttEls.vadPadMsValue.textContent = padMsText;

    sttRefreshControls();
    sttUpdateStatus();
  }
  // Route server-pushed STT events to the STT message dispatcher.
  socket.on("stt_message", sttHandleMessage);

  // Replace the page's loadStatus with a version that also loads STT state.
  // The previous function is kept in _origLoadStatus; nothing visible in this
  // section calls it.
  var _origLoadStatus = loadStatus;

  /**
   * Fetches /api/status, copies the reported audio settings into `state`,
   * updates the audio status label, syncs the UI, and finally hands the same
   * payload to sttLoadFromStatus.
   */
  loadStatus = async function() {
    const response = await fetch("/api/status");
    const data = await response.json();

    const settings = data.settings;
    if (settings) {
      Object.assign(state, {
        mode: settings.mode,
        gain_db: settings.gain_db,
        agc: settings.agc,
        attack_ms: settings.attack_ms,
        release_ms: settings.release_ms,
        noise_suppression: Boolean(settings.noise_suppression),
        speech_gate: Boolean(settings.speech_gate),
        hum_filter: Boolean(settings.hum_filter),
        limiter: Boolean(settings.limiter),
        beam_clarity: Boolean(settings.beam_clarity),
        hifi_mode: Boolean(settings.hifi_mode),
        hifi_mic: settings.hifi_mic || "mic1",
        angle: settings.angle,
        auto_beam: Boolean(settings.auto_beam),
        monitor_on: Boolean(settings.monitor_on),
        monitor_source: settings.monitor_source || "beam",
        sample_rate: settings.sample_rate,
      });
    }

    // Fall back to the manual angle, then 0, when no auto-beam angle is reported.
    state.auto_angle = Number(data.auto_beam_angle_deg ?? state.angle ?? 0);
    state.speech_detected = false;
    state.recording = Boolean(data.recording);

    let audioLabel;
    if (data.audio_error) {
      audioLabel = "Audio: blad (" + data.audio_error + ")";
    } else if (data.audio_running) {
      audioLabel = "Audio: aktywne";
    } else {
      audioLabel = "Audio: zatrzymane";
    }
    els.audioStatus.textContent = audioLabel;

    syncUiFromState();
    sttLoadFromStatus(data);
  };
  // Also hook into the "status" WebSocket event: every existing "status"
  // listener is removed and replaced by one that additionally loads STT state.
  var _origStatusHandler = null;
  socket.off("status");
  socket.on("status", function(payload) {
    const settings = payload && payload.settings;
    if (settings) {
      Object.assign(state, {
        mode: settings.mode,
        gain_db: settings.gain_db,
        agc: settings.agc,
        attack_ms: settings.attack_ms,
        release_ms: settings.release_ms,
        noise_suppression: Boolean(settings.noise_suppression),
        speech_gate: Boolean(settings.speech_gate),
        hum_filter: Boolean(settings.hum_filter),
        limiter: Boolean(settings.limiter),
        beam_clarity: Boolean(settings.beam_clarity),
        hifi_mode: Boolean(settings.hifi_mode),
        hifi_mic: settings.hifi_mic || "mic1",
        angle: settings.angle,
        auto_beam: Boolean(settings.auto_beam),
        monitor_on: Boolean(settings.monitor_on),
        monitor_source: settings.monitor_source || "beam",
        sample_rate: settings.sample_rate,
      });
      syncUiFromState();
    }
    // STT fields are read from the payload itself; an absent payload is
    // treated as an empty object so only the controls get re-synced.
    sttLoadFromStatus(payload || {});
  });
  // When the server acknowledges applied STT settings, reload state from the
  // settings it echoes back. Other acknowledgements are ignored.
  socket.on("server_ack", function(payload) {
    const isSttAck = Boolean(payload) && payload.type === "stt_settings_applied";
    if (!isSttAck || !payload.settings) {
      return;
    }
    sttLoadFromStatus(payload.settings);
  });

  // Initialize STT controls
  sttBindControls();