|
@@ -0,0 +1,371 @@
|
|
|
|
|
+
|
|
|
|
|
+// ── STT Integration ──
|
|
|
|
|
+
|
|
|
|
|
/**
 * Mutable client-side state for the speech-to-text (STT) panel.
 *
 * The option fields are mirrored to the server as `stt_*` keys by
 * sttSendSettings() and refreshed from server payloads by sttLoadFromStatus().
 */
const sttState = {
  // --- Recognition options ---
  enabled: false,
  language: "pl",
  timestamps: true,       // show time stamps when rendering segments
  diarize: true,          // render per-speaker rows when a segment has speakers
  itn: true,              // presumably inverse text normalization - confirm with server docs
  detect_emotion: false,  // show the emotion label of a segment

  // --- Server-side VAD options (presumably voice activity detection) ---
  server_vad: false,
  vad_threshold: 0.3,
  vad_pad_ms: 400,
  vad_min_ms: 100,

  // --- Runtime state ---
  connected: false,
  segments: [],     // finalized segments, in arrival order
  partialText: "",  // text of the current partial (not yet final) result
  speakerMap: {},   // speaker id (e.g. SPEAKER_00) -> { name, score }
};
|
|
|
|
|
+
|
|
|
|
|
/**
 * References to the STT panel's DOM elements, resolved once by element id
 * when this script is evaluated. An entry is null if its id is not in the page.
 */
const sttEls = (function collectSttElements() {
  const byId = (id) => document.getElementById(id);
  return {
    enabled: byId("sttEnabled"),
    status: byId("sttStatus"),
    controls: byId("sttControls"),
    language: byId("sttLanguage"),
    timestamps: byId("sttTimestamps"),
    diarize: byId("sttDiarize"),
    itn: byId("sttItn"),
    emotion: byId("sttEmotion"),
    vad: byId("sttVad"),
    vadControls: byId("sttVadControls"),
    vadThreshold: byId("sttVadThreshold"),
    vadThresholdValue: byId("sttVadThresholdValue"),
    vadPadMs: byId("sttVadPadMs"),
    vadPadMsValue: byId("sttVadPadMsValue"),
    output: byId("sttOutput"),
  };
})();
|
|
|
|
|
+
|
|
|
|
|
/**
 * Sends the current STT options to the server.
 *
 * Emits one "client_message" event of type "stt_settings" whose fields are
 * the option values from sttState, each prefixed with `stt_`.
 */
function sttSendSettings() {
  const {
    enabled,
    language,
    timestamps,
    diarize,
    itn,
    detect_emotion,
    server_vad,
    vad_threshold,
    vad_pad_ms,
    vad_min_ms,
  } = sttState;

  const payload = {
    type: "stt_settings",
    stt_enabled: enabled,
    stt_language: language,
    stt_timestamps: timestamps,
    stt_diarize: diarize,
    stt_itn: itn,
    stt_detect_emotion: detect_emotion,
    stt_server_vad: server_vad,
    stt_vad_threshold: vad_threshold,
    stt_vad_pad_ms: vad_pad_ms,
    stt_vad_min_ms: vad_min_ms,
  };

  socket.emit("client_message", payload);
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Formats a time offset given in seconds as zero-padded "MM:SS".
 *
 * Minutes are not wrapped at 60 (3700 seconds gives "61:40"); fractional
 * seconds are truncated.
 *
 * @param {number|string|null|undefined} seconds - Offset in seconds; numeric
 *   strings are accepted.
 * @returns {string} The formatted time, or "" when the input is missing or
 *   is not a finite number.
 */
function sttFormatTime(seconds) {
  // 0 is a valid time; every other falsy value means "no time available".
  if (!seconds && seconds !== 0) return "";

  // Reject values such as "abc" or Infinity instead of rendering "NaN:NaN".
  const total = Number(seconds);
  if (!Number.isFinite(total)) return "";

  const minutes = Math.floor(total / 60);
  const secs = Math.floor(total % 60);
  return String(minutes).padStart(2, "0") + ":" + String(secs).padStart(2, "0");
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Maps a speaker id to one of eight CSS color classes.
 *
 * The first run of digits in the id selects the class
 * ("stt-speaker-0" .. "stt-speaker-7", index taken modulo 8). Ids without
 * digits use index 0.
 *
 * @param {string|number|null|undefined} speakerId - Speaker id such as "SPEAKER_03".
 * @returns {string} The CSS class name, or "" when the id is falsy.
 */
function sttSpeakerColor(speakerId) {
  if (!speakerId) return "";

  // Coerce first so a numeric id does not throw on .match().
  const digits = String(speakerId).match(/\d+/);
  const parsed = digits ? parseInt(digits[0], 10) : 0;

  // An absurdly long digit run parses to Infinity; fall back to index 0.
  const idx = Number.isFinite(parsed) ? parsed % 8 : 0;
  return "stt-speaker-" + idx;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Returns the display name for a speaker id.
 *
 * Uses the name stored in sttState.speakerMap when one exists and is not
 * the placeholder "Unknown"; otherwise derives a label from the id by
 * replacing the "SPEAKER_" prefix with "Mowca ".
 *
 * @param {string|number|null|undefined} speakerId - Speaker id such as "SPEAKER_00".
 * @returns {*} The mapped name (returned as stored), the derived label, or
 *   "" when the id is falsy.
 */
function sttSpeakerName(speakerId) {
  if (!speakerId) return "";

  // Coerce so a non-string id does not throw on .replace() below.
  const id = String(speakerId);

  // Only consult own entries: an id like "constructor" must not resolve to
  // a member inherited from Object.prototype.
  const map = sttState.speakerMap;
  const mapped = Object.prototype.hasOwnProperty.call(map, id) ? map[id] : undefined;

  if (mapped && mapped.name && mapped.name !== "Unknown") {
    return mapped.name;
  }
  return id.replace("SPEAKER_", "Mowca ");
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Picks the CSS class for a confidence value.
 *
 * @param {number} conf - Confidence score.
 * @returns {string} "conf-high" for values >= 0.85, "conf-mid" for values
 *   >= 0.6, otherwise "conf-low" (including non-comparable values).
 */
function sttConfidenceClass(conf) {
  // Tiers are ordered from the highest threshold down; the first match wins.
  const tiers = [
    [0.85, "conf-high"],
    [0.6, "conf-mid"],
  ];
  const hit = tiers.find(([minimum]) => conf >= minimum);
  return hit ? hit[1] : "conf-low";
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Builds the DOM element for one transcript segment.
 *
 * Layout, in order:
 *   1. "[MM:SS]" from seg.duration, when timestamps are enabled;
 *   2. either one row per entry of seg.speakers (when diarization is enabled
 *      and speakers are present), or one span per entry of seg.words colored
 *      by confidence, or a single span with seg.text;
 *   3. a percentage badge from seg.confidence (final segments only);
 *   4. the emotion label, when emotion detection is enabled.
 *
 * All text coming from the segment is inserted through textContent / text
 * nodes, never parsed as HTML, so transcript content cannot inject markup.
 *
 * @param {Object} seg - Segment data (text, duration, speakers, words,
 *   confidence, emotion - all optional).
 * @param {boolean} isPartial - True for the in-progress (non-final) segment.
 * @returns {HTMLDivElement} A detached "stt-segment" element.
 */
function sttRenderSegment(seg, isPartial) {
  const root = document.createElement("div");
  root.className = "stt-segment" + (isPartial ? " partial" : "");

  // Appends <span class="className">text</span> to parent.
  const addSpan = (parent, className, text) => {
    const span = document.createElement("span");
    if (className) span.className = className;
    span.textContent = String(text);
    parent.appendChild(span);
    return span;
  };

  // Timestamp
  if (seg.duration && sttState.timestamps) {
    addSpan(root, "stt-timestamp", "[" + sttFormatTime(seg.duration) + "]");
  }

  const speakers = seg.speakers;
  if (speakers && speakers.length > 0 && sttState.diarize) {
    // Speaker segments with diarization: one row per speaker turn.
    speakers.forEach((sp) => {
      const row = document.createElement("div");
      row.style.margin = "0.2rem 0";

      addSpan(row, "stt-speaker " + sttSpeakerColor(sp.speaker), sttSpeakerName(sp.speaker));
      if (sp.start !== undefined && sttState.timestamps) {
        addSpan(row, "stt-timestamp", sttFormatTime(sp.start) + "-" + sttFormatTime(sp.end));
      }
      addSpan(row, "", sp.text || "");

      root.appendChild(row);
    });
  } else if (seg.words && seg.words.length > 0) {
    // Words with confidence. `??` keeps a reported confidence of 0 as 0
    // (lowest tier); only a missing value defaults to 1.0.
    seg.words.forEach((w) => {
      addSpan(root, "stt-word " + sttConfidenceClass(w.confidence ?? 1.0), w.word);
      root.appendChild(document.createTextNode(" "));
    });
  } else {
    addSpan(root, "", seg.text || "");
  }

  // Confidence badge
  if (seg.confidence && !isPartial) {
    addSpan(root, "stt-confidence", Math.round(seg.confidence * 100) + "%");
  }

  // Emotion
  if (seg.emotion && sttState.detect_emotion) {
    addSpan(root, "stt-emotion", seg.emotion);
  }

  return root;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Rebuilds the whole transcript view from sttState.
 *
 * Clears the output element, appends one element per finalized segment,
 * then the partial text (if any) as a partial segment, and finally scrolls
 * the output to its bottom.
 */
function sttRenderAll() {
  const container = sttEls.output;
  container.innerHTML = "";

  sttState.segments.forEach((segment) => {
    const node = sttRenderSegment(segment, false);
    container.appendChild(node);
  });

  const pendingText = sttState.partialText;
  if (pendingText) {
    const pendingNode = sttRenderSegment({ text: pendingText }, true);
    container.appendChild(pendingNode);
  }

  // Keep the newest content visible.
  container.scrollTop = container.scrollHeight;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Applies one STT message from the server to sttState and refreshes the UI.
 *
 * Handled message types:
 *   - "stt_status":     stores the connection flag, updates the status badge;
 *   - "partial":        replaces the partial text, re-renders;
 *   - "final":          clears the partial text, stores the message as a
 *                       segment when it has text, merges its speaker_map,
 *                       re-renders;
 *   - "speaker_update": merges speaker_map (when present), re-renders.
 * Messages without a type, or with any other type, are ignored.
 *
 * @param {Object} msg - Message received on the "stt_message" socket event.
 */
function sttHandleMessage(msg) {
  if (!msg || !msg.type) return;

  switch (msg.type) {
    case "stt_status":
      sttState.connected = Boolean(msg.connected);
      sttUpdateStatus();
      break;

    case "partial":
      sttState.partialText = msg.text || "";
      sttRenderAll();
      break;

    case "final":
      sttState.partialText = "";
      if (msg.text) {
        // The whole message object is kept as the segment.
        sttState.segments.push(msg);
      }
      if (msg.speaker_map) {
        Object.assign(sttState.speakerMap, msg.speaker_map);
      }
      sttRenderAll();
      break;

    case "speaker_update":
      if (msg.speaker_map) {
        Object.assign(sttState.speakerMap, msg.speaker_map);
        sttRenderAll();
      }
      break;

    default:
      break;
  }
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Refreshes the STT status badge (text and background color) from
 * sttState.enabled and sttState.connected.
 */
function sttUpdateStatus() {
  let label;
  let background;

  if (!sttState.enabled) {
    label = "STT: wylaczone";
    background = "";
  } else if (sttState.connected) {
    label = "STT: polaczone";
    background = "rgba(81, 207, 102, 0.25)";
  } else {
    // Enabled but not connected yet.
    label = "STT: laczenie...";
    background = "rgba(255, 210, 0, 0.2)";
  }

  const badge = sttEls.status;
  badge.textContent = label;
  badge.style.background = background;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Shows or hides the dependent control groups by toggling the "hidden"
 * class: the STT controls follow sttState.enabled and the VAD controls
 * follow sttState.server_vad.
 */
function sttRefreshControls() {
  const hideControls = !sttState.enabled;
  sttEls.controls.classList.toggle("hidden", hideControls);

  const hideVadControls = !sttState.server_vad;
  sttEls.vadControls.classList.toggle("hidden", hideVadControls);
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Attaches event listeners to the STT controls.
 *
 * Every control copies its value into sttState and pushes the settings to
 * the server via sttSendSettings(). In addition:
 *   - turning STT off resets the connection flag, transcript and speaker map;
 *   - changing the language clears the transcript;
 *   - the VAD checkbox also refreshes control visibility;
 *   - the two VAD sliders update state and their value label on "input"
 *     and only send settings on "change".
 */
function sttBindControls() {
  // Master on/off switch.
  sttEls.enabled.addEventListener("change", () => {
    const isOn = sttEls.enabled.checked;
    sttState.enabled = isOn;
    if (!isOn) {
      sttState.connected = false;
      sttState.segments = [];
      sttState.partialText = "";
      sttState.speakerMap = {};
      sttRenderAll();
    }
    sttRefreshControls();
    sttUpdateStatus();
    sttSendSettings();
  });

  // Language selector: clear transcript on language change.
  sttEls.language.addEventListener("change", () => {
    sttState.language = sttEls.language.value;
    sttState.segments = [];
    sttState.partialText = "";
    sttRenderAll();
    sttSendSettings();
  });

  // Plain checkboxes: [key in sttEls, key in sttState].
  const checkboxes = [
    ["timestamps", "timestamps"],
    ["diarize", "diarize"],
    ["itn", "itn"],
    ["emotion", "detect_emotion"],
  ];
  checkboxes.forEach(([elKey, stateKey]) => {
    sttEls[elKey].addEventListener("change", () => {
      sttState[stateKey] = sttEls[elKey].checked;
      sttSendSettings();
    });
  });

  // Server VAD checkbox also controls visibility of the VAD sliders.
  sttEls.vad.addEventListener("change", () => {
    sttState.server_vad = sttEls.vad.checked;
    sttRefreshControls();
    sttSendSettings();
  });

  // VAD threshold slider.
  sttEls.vadThreshold.addEventListener("input", () => {
    const threshold = parseFloat(sttEls.vadThreshold.value);
    sttState.vad_threshold = threshold;
    sttEls.vadThresholdValue.textContent = threshold.toFixed(2);
  });
  sttEls.vadThreshold.addEventListener("change", sttSendSettings);

  // VAD padding slider.
  sttEls.vadPadMs.addEventListener("input", () => {
    const padMs = parseInt(sttEls.vadPadMs.value, 10);
    sttState.vad_pad_ms = padMs;
    sttEls.vadPadMsValue.textContent = String(padMs);
  });
  sttEls.vadPadMs.addEventListener("change", sttSendSettings);
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Copies STT fields from a server payload into sttState, then syncs the
 * form controls, control visibility and status badge to the new state.
 *
 * Only `stt_*` keys that are present (not undefined) overwrite state; the
 * controls are always re-synced, even when the payload has no STT keys.
 *
 * @param {Object} data - Payload that may contain `stt_*` fields.
 */
function sttLoadFromStatus(data) {
  const asIs = (value) => value;

  // [payload key, sttState key, coercion], applied in this order.
  const incoming = [
    ["stt_enabled", "enabled", Boolean],
    ["stt_language", "language", asIs],
    ["stt_timestamps", "timestamps", Boolean],
    ["stt_diarize", "diarize", Boolean],
    ["stt_itn", "itn", Boolean],
    ["stt_detect_emotion", "detect_emotion", Boolean],
    ["stt_server_vad", "server_vad", Boolean],
    ["stt_vad_threshold", "vad_threshold", Number],
    ["stt_vad_pad_ms", "vad_pad_ms", Number],
    ["stt_vad_min_ms", "vad_min_ms", Number],
    ["stt_connected", "connected", Boolean],
  ];
  for (const [payloadKey, stateKey, coerce] of incoming) {
    const value = data[payloadKey];
    if (value !== undefined) {
      sttState[stateKey] = coerce(value);
    }
  }

  // Mirror the state into the form controls.
  sttEls.enabled.checked = sttState.enabled;
  sttEls.language.value = sttState.language;
  sttEls.timestamps.checked = sttState.timestamps;
  sttEls.diarize.checked = sttState.diarize;
  sttEls.itn.checked = sttState.itn;
  sttEls.emotion.checked = sttState.detect_emotion;
  sttEls.vad.checked = sttState.server_vad;

  const threshold = sttState.vad_threshold;
  sttEls.vadThreshold.value = String(threshold);
  sttEls.vadThresholdValue.textContent = threshold.toFixed(2);

  const padText = String(sttState.vad_pad_ms);
  sttEls.vadPadMs.value = padText;
  sttEls.vadPadMsValue.textContent = padText;

  sttRefreshControls();
  sttUpdateStatus();
}
|
|
|
|
|
+
|
|
|
|
|
// Route every "stt_message" socket event to the STT message handler.
socket.on("stt_message", sttHandleMessage);
|
|
|
|
|
+
|
|
|
|
|
// Replace the page's loadStatus with a version that also loads STT state
// from the same /api/status response. The previous implementation is kept
// in _origLoadStatus but is not called by the replacement.
var _origLoadStatus = loadStatus;
loadStatus = async function() {
  const response = await fetch("/api/status");
  const data = await response.json();

  const settings = data.settings;
  if (settings) {
    const asIs = (value) => value;
    const orDefault = (fallback) => (value) => value || fallback;

    // [settings key, conversion], copied into `state` in this order.
    const fields = [
      ["mode", asIs],
      ["gain_db", asIs],
      ["agc", asIs],
      ["attack_ms", asIs],
      ["release_ms", asIs],
      ["noise_suppression", Boolean],
      ["speech_gate", Boolean],
      ["hum_filter", Boolean],
      ["limiter", Boolean],
      ["beam_clarity", Boolean],
      ["hifi_mode", Boolean],
      ["hifi_mic", orDefault("mic1")],
      ["angle", asIs],
      ["auto_beam", Boolean],
      ["monitor_on", Boolean],
      ["monitor_source", orDefault("beam")],
      ["sample_rate", asIs],
    ];
    for (const [key, convert] of fields) {
      state[key] = convert(settings[key]);
    }
  }

  // Fall back to the manual angle, then to 0, when no auto angle is reported.
  state.auto_angle = Number(data.auto_beam_angle_deg ?? state.angle ?? 0);
  state.speech_detected = false;
  state.recording = Boolean(data.recording);

  let audioLabel;
  if (data.audio_error) {
    audioLabel = "Audio: blad (" + data.audio_error + ")";
  } else if (data.audio_running) {
    audioLabel = "Audio: aktywne";
  } else {
    audioLabel = "Audio: zatrzymane";
  }
  els.audioStatus.textContent = audioLabel;

  syncUiFromState();
  sttLoadFromStatus(data);
};
|
|
|
|
|
+
|
|
|
|
|
// Replace all existing "status" socket listeners with a single one that
// copies the audio settings into `state` and also refreshes the STT state.
var _origStatusHandler = null;
socket.off("status");
socket.on("status", function(payload) {
  const settings = payload && payload.settings;

  if (settings) {
    const asIs = (value) => value;
    const orDefault = (fallback) => (value) => value || fallback;

    // [settings key, conversion], copied into `state` in this order.
    const fields = [
      ["mode", asIs],
      ["gain_db", asIs],
      ["agc", asIs],
      ["attack_ms", asIs],
      ["release_ms", asIs],
      ["noise_suppression", Boolean],
      ["speech_gate", Boolean],
      ["hum_filter", Boolean],
      ["limiter", Boolean],
      ["beam_clarity", Boolean],
      ["hifi_mode", Boolean],
      ["hifi_mic", orDefault("mic1")],
      ["angle", asIs],
      ["auto_beam", Boolean],
      ["monitor_on", Boolean],
      ["monitor_source", orDefault("beam")],
      ["sample_rate", asIs],
    ];
    for (const [key, convert] of fields) {
      state[key] = convert(settings[key]);
    }
    syncUiFromState();
  }

  // STT fields are read from the payload itself, not from payload.settings.
  sttLoadFromStatus(payload || {});
});
|
|
|
|
|
+
|
|
|
|
|
// When the server acknowledges applied STT settings, reload the STT state
// from the settings it echoes back.
socket.on("server_ack", function(payload) {
  if (!payload) return;
  if (payload.type !== "stt_settings_applied") return;
  if (!payload.settings) return;

  sttLoadFromStatus(payload.settings);
});

// Wire the STT controls to their event handlers.
sttBindControls();
|