Pārlūkot izejas kodu

Add live STT integration with stt.mm.mk

- stt_bridge.py: WebSocket client bridging processed audio to stt.mm.mk
- app.py: STT settings handling via client_message/stt_settings
- audio_capture.py: feed processed audio (post-DSP) to STT bridge
- Frontend: STT panel with language, timestamps, diarization, ITN,
  emotion detection, server VAD controls, and live transcript display
- requirements.txt: add websocket-client dependency
Paweł Chodaczek 1 mēnesi atpakaļ
vecāks
revīzija
978a800506
7 mainītis faili ar 762 papildinājumiem un 4 dzēšanām
  1. 29 4
      app.py
  2. 17 0
      audio_capture.py
  3. 1 0
      requirements.txt
  4. 371 0
      static/stt.js
  5. 81 0
      static/style.css
  6. 201 0
      stt_bridge.py
  7. 62 0
      templates/index.html

+ 29 - 4
app.py

@@ -1,4 +1,4 @@
-from __future__ import annotations
+from __future__ import annotations
 
 from dataclasses import asdict
 from pathlib import Path
@@ -9,6 +9,7 @@ from flask_socketio import SocketIO, emit
 
 from audio_capture import AudioEngine
 from recorder import list_recordings, resolve_recording_path
+from stt_bridge import SttBridge
 
 
 BASE_DIR = Path(__file__).resolve().parent
@@ -20,6 +21,17 @@ app.config["SECRET_KEY"] = os.environ.get("MIC_SYSTEM_SECRET", "mic-system-dev")
 socketio = SocketIO(app, cors_allowed_origins="*", async_mode="eventlet")
 audio_engine = AudioEngine(RECORDINGS_DIR)
 
+
+def _stt_message_callback(msg) -> None:
+    """Forward one message from the STT bridge to the browser side.
+
+    Called from SttBridge background thread when STT sends a message.
+    The payload is re-emitted unchanged as a Socket.IO "stt_message" event.
+    """
+    socketio.emit("stt_message", msg)
+
+
+stt_bridge = SttBridge(on_message=_stt_message_callback)
+
+# Attach STT bridge to audio engine so it receives processed audio
+audio_engine.set_stt_bridge(stt_bridge)
+
 _stream_task_started = False
 audio_start_error = ""
 
@@ -54,6 +66,7 @@ def api_status():
     ensure_audio_started()
     status = audio_engine.get_status()
     status["audio_error"] = audio_start_error
+    status.update(stt_bridge.get_settings())
     return jsonify(status)
 
 
@@ -95,8 +108,18 @@ def _handle_command(payload: dict) -> dict:
 
     if msg_type == "settings":
         settings = audio_engine.update_settings(payload)
+        # Handle STT settings
+        stt_keys = {k: v for k, v in payload.items() if k.startswith("stt_")}
+        if stt_keys:
+            stt_settings = stt_bridge.update_settings(**stt_keys)
+            settings.update(stt_settings)
         return {"type": "settings_applied", "settings": settings}
 
+    if msg_type == "stt_settings":
+        stt_keys = {k: v for k, v in payload.items() if k.startswith("stt_")}
+        stt_settings = stt_bridge.update_settings(**stt_keys)
+        return {"type": "stt_settings_applied", "settings": stt_settings}
+
     if msg_type == "record_start":
         source = str(payload.get("source", "mic1"))
         duration_sec = payload.get("duration_sec")
@@ -119,7 +142,7 @@ def _handle_command(payload: dict) -> dict:
             socketio.emit("recordings_updated", {"ok": True})
         return {"type": "recording_deleted", "filename": filename}
 
-    raise ValueError(f"Unsupported message type: {msg_type}")
+    raise ValueError("Unsupported message type: " + str(msg_type))
 
 
 @socketio.on("connect")
@@ -132,7 +155,9 @@ def ws_connect():
         socketio.start_background_task(stream_audio_packets)
         _stream_task_started = True
 
-    emit("status", audio_engine.get_status())
+    status = audio_engine.get_status()
+    status.update(stt_bridge.get_settings())
+    emit("status", status)
     if audio_start_error:
         emit("server_error", {"message": audio_start_error})
 
@@ -161,4 +186,4 @@ if __name__ == "__main__":
     ensure_audio_started()
     host = os.environ.get("MIC_SYSTEM_HOST", "0.0.0.0")
     port = int(os.environ.get("MIC_SYSTEM_PORT", "5000"))
-    socketio.run(app, host=host, port=port)
+    socketio.run(app, host=host, port=port)

+ 17 - 0
audio_capture.py

@@ -183,6 +183,7 @@ class AudioEngine:
         self._agc_beam = AgcProcessor(self._settings.sample_rate)
 
         self._latest_frame: dict[str, object] = self._make_empty_frame()
+        self._stt_bridge = None
 
     def start(self) -> None:
         with self._lock:
@@ -214,6 +215,9 @@ class AudioEngine:
         with self._lock:
             return self._running
 
+    def set_stt_bridge(self, bridge) -> None:
+        """Attach the object that should receive processed audio for STT.
+
+        The reference is stored as-is (no validation, no locking). The
+        processing path calls ``feed_audio(signal, rate)`` on it whenever
+        it is not None.
+        """
+        self._stt_bridge = bridge
+
     def get_settings(self) -> dict[str, object]:
         with self._lock:
             return asdict(self._settings)
@@ -744,6 +748,19 @@ class AudioEngine:
             "recording": rec_status.recording,
             "rec_duration": rec_status.duration_sec,
         }
+        # Feed processed audio to STT bridge
+        if self._stt_bridge is not None:
+            if settings.mode == 'beamforming':
+                stt_signal = beam_proc
+            elif settings.mode == 'mono_mix':
+                stt_signal = mono_mix_proc
+            else:
+                stt_signal = mic1_proc
+            try:
+                self._stt_bridge.feed_audio(stt_signal, processing_rate)
+            except Exception:
+                pass
+
         with self._lock:
             self._latest_frame = frame
 

+ 1 - 0
requirements.txt

@@ -4,3 +4,4 @@ eventlet
 numpy
 scipy
 sounddevice
+websocket-client

+ 371 - 0
static/stt.js

@@ -0,0 +1,371 @@
+
+// ── STT Integration ──
+
+// Client-side copy of the STT configuration plus the live transcript.
+// The setting defaults are the same values as SttSettings in stt_bridge.py;
+// sttLoadFromStatus() overwrites them with whatever the server reports.
+const sttState = {
+    enabled: false,
+    language: "pl",
+    timestamps: true,
+    diarize: true,
+    itn: true,
+    detect_emotion: false,
+    server_vad: false,
+    vad_threshold: 0.3,
+    vad_pad_ms: 400,
+    vad_min_ms: 100,   // sent to the server but has no control in the panel
+    connected: false,
+    segments: [],      // finalized segments
+    partialText: "",   // current partial
+    speakerMap: {},    // SPEAKER_00 -> {name, score}
+};
+
+// DOM nodes of the STT panel, looked up once at script load.
+// The ids are the ones declared in templates/index.html.
+const sttEls = {
+    enabled: document.getElementById("sttEnabled"),
+    status: document.getElementById("sttStatus"),
+    controls: document.getElementById("sttControls"),
+    language: document.getElementById("sttLanguage"),
+    timestamps: document.getElementById("sttTimestamps"),
+    diarize: document.getElementById("sttDiarize"),
+    itn: document.getElementById("sttItn"),
+    emotion: document.getElementById("sttEmotion"),
+    vad: document.getElementById("sttVad"),
+    vadControls: document.getElementById("sttVadControls"),
+    vadThreshold: document.getElementById("sttVadThreshold"),
+    vadThresholdValue: document.getElementById("sttVadThresholdValue"),
+    vadPadMs: document.getElementById("sttVadPadMs"),
+    vadPadMsValue: document.getElementById("sttVadPadMsValue"),
+    output: document.getElementById("sttOutput"),
+};
+
+function sttSendSettings() {
+    socket.emit("client_message", {
+        type: "stt_settings",
+        stt_enabled: sttState.enabled,
+        stt_language: sttState.language,
+        stt_timestamps: sttState.timestamps,
+        stt_diarize: sttState.diarize,
+        stt_itn: sttState.itn,
+        stt_detect_emotion: sttState.detect_emotion,
+        stt_server_vad: sttState.server_vad,
+        stt_vad_threshold: sttState.vad_threshold,
+        stt_vad_pad_ms: sttState.vad_pad_ms,
+        stt_vad_min_ms: sttState.vad_min_ms,
+    });
+}
+
+function sttFormatTime(seconds) {
+    if (!seconds && seconds !== 0) return "";
+    var m = Math.floor(seconds / 60);
+    var s = Math.floor(seconds % 60);
+    return String(m).padStart(2, "0") + ":" + String(s).padStart(2, "0");
+}
+
+function sttSpeakerColor(speakerId) {
+    if (!speakerId) return "";
+    var match = speakerId.match(/(\d+)/);
+    var idx = match ? parseInt(match[1], 10) % 8 : 0;
+    return "stt-speaker-" + idx;
+}
+
+function sttSpeakerName(speakerId) {
+    if (!speakerId) return "";
+    var mapped = sttState.speakerMap[speakerId];
+    if (mapped && mapped.name && mapped.name !== "Unknown") {
+        return mapped.name;
+    }
+    return speakerId.replace("SPEAKER_", "Mowca ");
+}
+
+function sttConfidenceClass(conf) {
+    if (conf >= 0.85) return "conf-high";
+    if (conf >= 0.6) return "conf-mid";
+    return "conf-low";
+}
+
+function sttRenderSegment(seg, isPartial) {
+    var div = document.createElement("div");
+    div.className = "stt-segment" + (isPartial ? " partial" : "");
+
+    var html = "";
+
+    // Timestamp
+    if (seg.duration && sttState.timestamps) {
+        html += '<span class="stt-timestamp">[' + sttFormatTime(seg.duration) + ']</span>';
+    }
+
+    // Speaker segments with diarization
+    if (seg.speakers && seg.speakers.length > 0 && sttState.diarize) {
+        seg.speakers.forEach(function(sp) {
+            var spName = sttSpeakerName(sp.speaker);
+            var colorClass = sttSpeakerColor(sp.speaker);
+            html += '<div style="margin: 0.2rem 0;">';
+            html += '<span class="stt-speaker ' + colorClass + '">' + spName + '</span>';
+            if (sp.start !== undefined && sttState.timestamps) {
+                html += '<span class="stt-timestamp">' + sttFormatTime(sp.start) + '-' + sttFormatTime(sp.end) + '</span>';
+            }
+            html += '<span>' + (sp.text || "") + '</span>';
+            html += '</div>';
+        });
+    } else {
+        // Words with confidence
+        if (seg.words && seg.words.length > 0) {
+            seg.words.forEach(function(w) {
+                var cls = sttConfidenceClass(w.confidence || 1.0);
+                html += '<span class="stt-word ' + cls + '">' + w.word + '</span> ';
+            });
+        } else {
+            html += '<span>' + (seg.text || "") + '</span>';
+        }
+    }
+
+    // Confidence badge
+    if (seg.confidence && !isPartial) {
+        html += '<span class="stt-confidence">' + Math.round(seg.confidence * 100) + '%</span>';
+    }
+
+    // Emotion
+    if (seg.emotion && sttState.detect_emotion) {
+        html += '<span class="stt-emotion">' + seg.emotion + '</span>';
+    }
+
+    div.innerHTML = html;
+    return div;
+}
+
+function sttRenderAll() {
+    var output = sttEls.output;
+    output.innerHTML = "";
+
+    sttState.segments.forEach(function(seg) {
+        output.appendChild(sttRenderSegment(seg, false));
+    });
+
+    if (sttState.partialText) {
+        output.appendChild(sttRenderSegment({ text: sttState.partialText }, true));
+    }
+
+    // Auto-scroll to bottom
+    output.scrollTop = output.scrollHeight;
+}
+
+function sttHandleMessage(msg) {
+    if (!msg || !msg.type) return;
+
+    if (msg.type === "stt_status") {
+        sttState.connected = Boolean(msg.connected);
+        sttUpdateStatus();
+        return;
+    }
+
+    if (msg.type === "partial") {
+        sttState.partialText = msg.text || "";
+        sttRenderAll();
+        return;
+    }
+
+    if (msg.type === "final") {
+        sttState.partialText = "";
+        if (msg.text) {
+            sttState.segments.push(msg);
+        }
+        if (msg.speaker_map) {
+            Object.assign(sttState.speakerMap, msg.speaker_map);
+        }
+        sttRenderAll();
+        return;
+    }
+
+    if (msg.type === "speaker_update" && msg.speaker_map) {
+        Object.assign(sttState.speakerMap, msg.speaker_map);
+        sttRenderAll();
+        return;
+    }
+}
+
+function sttUpdateStatus() {
+    if (!sttState.enabled) {
+        sttEls.status.textContent = "STT: wylaczone";
+        sttEls.status.style.background = "";
+    } else if (sttState.connected) {
+        sttEls.status.textContent = "STT: polaczone";
+        sttEls.status.style.background = "rgba(81, 207, 102, 0.25)";
+    } else {
+        sttEls.status.textContent = "STT: laczenie...";
+        sttEls.status.style.background = "rgba(255, 210, 0, 0.2)";
+    }
+}
+
+// Show the option block only while STT is enabled, and the VAD sliders only
+// while server VAD is on, by toggling the "hidden" class.
+function sttRefreshControls() {
+    sttEls.controls.classList.toggle("hidden", !sttState.enabled);
+    sttEls.vadControls.classList.toggle("hidden", !sttState.server_vad);
+}
+
+// Attach the event listeners for every control in the STT panel.
+// Each handler copies the control's value into sttState and then sends the
+// full settings object to the server via sttSendSettings().
+function sttBindControls() {
+    sttEls.enabled.addEventListener("change", function() {
+        sttState.enabled = sttEls.enabled.checked;
+        if (!sttState.enabled) {
+            // Turning STT off discards the transcript and speaker names.
+            sttState.connected = false;
+            sttState.segments = [];
+            sttState.partialText = "";
+            sttState.speakerMap = {};
+            sttRenderAll();
+        }
+        sttRefreshControls();
+        sttUpdateStatus();
+        sttSendSettings();
+    });
+
+    sttEls.language.addEventListener("change", function() {
+        sttState.language = sttEls.language.value;
+        // Clear transcript on language change
+        sttState.segments = [];
+        sttState.partialText = "";
+        sttRenderAll();
+        sttSendSettings();
+    });
+
+    // The four option checkboxes below only update state and notify the
+    // server; segments already on screen are not re-rendered here.
+    sttEls.timestamps.addEventListener("change", function() {
+        sttState.timestamps = sttEls.timestamps.checked;
+        sttSendSettings();
+    });
+
+    sttEls.diarize.addEventListener("change", function() {
+        sttState.diarize = sttEls.diarize.checked;
+        sttSendSettings();
+    });
+
+    sttEls.itn.addEventListener("change", function() {
+        sttState.itn = sttEls.itn.checked;
+        sttSendSettings();
+    });
+
+    sttEls.emotion.addEventListener("change", function() {
+        sttState.detect_emotion = sttEls.emotion.checked;
+        sttSendSettings();
+    });
+
+    sttEls.vad.addEventListener("change", function() {
+        sttState.server_vad = sttEls.vad.checked;
+        sttRefreshControls();
+        sttSendSettings();
+    });
+
+    // Sliders: "input" updates the label while dragging; the settings are
+    // only sent on "change", i.e. once the value is committed.
+    sttEls.vadThreshold.addEventListener("input", function() {
+        sttState.vad_threshold = parseFloat(sttEls.vadThreshold.value);
+        sttEls.vadThresholdValue.textContent = sttState.vad_threshold.toFixed(2);
+    });
+    sttEls.vadThreshold.addEventListener("change", sttSendSettings);
+
+    sttEls.vadPadMs.addEventListener("input", function() {
+        sttState.vad_pad_ms = parseInt(sttEls.vadPadMs.value, 10);
+        sttEls.vadPadMsValue.textContent = String(sttState.vad_pad_ms);
+    });
+    sttEls.vadPadMs.addEventListener("change", sttSendSettings);
+}
+
+// Apply server-reported STT settings to sttState and to the panel controls.
+// data is an object with optional "stt_*" keys (the shape returned by
+// SttBridge.get_settings); keys that are undefined leave the current value
+// untouched. The controls are always re-synced from sttState afterwards.
+function sttLoadFromStatus(data) {
+    if (data.stt_enabled !== undefined) sttState.enabled = Boolean(data.stt_enabled);
+    if (data.stt_language !== undefined) sttState.language = data.stt_language;
+    if (data.stt_timestamps !== undefined) sttState.timestamps = Boolean(data.stt_timestamps);
+    if (data.stt_diarize !== undefined) sttState.diarize = Boolean(data.stt_diarize);
+    if (data.stt_itn !== undefined) sttState.itn = Boolean(data.stt_itn);
+    if (data.stt_detect_emotion !== undefined) sttState.detect_emotion = Boolean(data.stt_detect_emotion);
+    if (data.stt_server_vad !== undefined) sttState.server_vad = Boolean(data.stt_server_vad);
+    if (data.stt_vad_threshold !== undefined) sttState.vad_threshold = Number(data.stt_vad_threshold);
+    if (data.stt_vad_pad_ms !== undefined) sttState.vad_pad_ms = Number(data.stt_vad_pad_ms);
+    if (data.stt_vad_min_ms !== undefined) sttState.vad_min_ms = Number(data.stt_vad_min_ms);
+    if (data.stt_connected !== undefined) sttState.connected = Boolean(data.stt_connected);
+
+    // Mirror the (possibly updated) state into the form controls.
+    sttEls.enabled.checked = sttState.enabled;
+    sttEls.language.value = sttState.language;
+    sttEls.timestamps.checked = sttState.timestamps;
+    sttEls.diarize.checked = sttState.diarize;
+    sttEls.itn.checked = sttState.itn;
+    sttEls.emotion.checked = sttState.detect_emotion;
+    sttEls.vad.checked = sttState.server_vad;
+    sttEls.vadThreshold.value = String(sttState.vad_threshold);
+    sttEls.vadThresholdValue.textContent = sttState.vad_threshold.toFixed(2);
+    sttEls.vadPadMs.value = String(sttState.vad_pad_ms);
+    sttEls.vadPadMsValue.textContent = String(sttState.vad_pad_ms);
+
+    sttRefreshControls();
+    sttUpdateStatus();
+}
+
+// Hook into existing socket events
+socket.on("stt_message", sttHandleMessage);
+
+// Replace the global loadStatus with a version that also loads STT state.
+// The new function fetches /api/status itself, copies the audio settings
+// into the app's `state`, updates the audio status label, and finally
+// passes the same response to sttLoadFromStatus().
+// NOTE(review): _origLoadStatus keeps a reference to the previous
+// implementation but is never called in this file, so the body below is a
+// full re-implementation, not a wrapper. It presumably duplicates logic from
+// app.js and has to be kept in sync with it - confirm.
+// NOTE(review): the assignment assumes app.js declares loadStatus as a
+// reassignable global binding - confirm.
+var _origLoadStatus = loadStatus;
+loadStatus = async function() {
+    var response = await fetch("/api/status");
+    var data = await response.json();
+
+    if (data.settings) {
+        state.mode = data.settings.mode;
+        state.gain_db = data.settings.gain_db;
+        state.agc = data.settings.agc;
+        state.attack_ms = data.settings.attack_ms;
+        state.release_ms = data.settings.release_ms;
+        state.noise_suppression = Boolean(data.settings.noise_suppression);
+        state.speech_gate = Boolean(data.settings.speech_gate);
+        state.hum_filter = Boolean(data.settings.hum_filter);
+        state.limiter = Boolean(data.settings.limiter);
+        state.beam_clarity = Boolean(data.settings.beam_clarity);
+        state.hifi_mode = Boolean(data.settings.hifi_mode);
+        state.hifi_mic = data.settings.hifi_mic || "mic1";
+        state.angle = data.settings.angle;
+        state.auto_beam = Boolean(data.settings.auto_beam);
+        state.monitor_on = Boolean(data.settings.monitor_on);
+        state.monitor_source = data.settings.monitor_source || "beam";
+        state.sample_rate = data.settings.sample_rate;
+    }
+    // Fall back to the manual angle, then 0, when no auto-beam angle is reported.
+    state.auto_angle = Number(data.auto_beam_angle_deg ?? state.angle ?? 0);
+    state.speech_detected = false;
+    state.recording = Boolean(data.recording);
+
+    // An audio error takes precedence over the running/stopped label.
+    els.audioStatus.textContent = data.audio_error
+        ? "Audio: blad (" + data.audio_error + ")"
+        : data.audio_running
+            ? "Audio: aktywne"
+            : "Audio: zatrzymane";
+
+    syncUiFromState();
+    sttLoadFromStatus(data);
+};
+
+// Replace the "status" socket handler with one that also loads STT state.
+// socket.off("status") is called with the event name only, which removes
+// the listeners registered earlier for that event; the handler below then
+// re-applies the audio settings itself before handing the payload to
+// sttLoadFromStatus().
+// NOTE(review): _origStatusHandler is initialised to null and never used.
+var _origStatusHandler = null;
+socket.off("status");
+socket.on("status", function(payload) {
+    if (payload && payload.settings) {
+        state.mode = payload.settings.mode;
+        state.gain_db = payload.settings.gain_db;
+        state.agc = payload.settings.agc;
+        state.attack_ms = payload.settings.attack_ms;
+        state.release_ms = payload.settings.release_ms;
+        state.noise_suppression = Boolean(payload.settings.noise_suppression);
+        state.speech_gate = Boolean(payload.settings.speech_gate);
+        state.hum_filter = Boolean(payload.settings.hum_filter);
+        state.limiter = Boolean(payload.settings.limiter);
+        state.beam_clarity = Boolean(payload.settings.beam_clarity);
+        state.hifi_mode = Boolean(payload.settings.hifi_mode);
+        state.hifi_mic = payload.settings.hifi_mic || "mic1";
+        state.angle = payload.settings.angle;
+        state.auto_beam = Boolean(payload.settings.auto_beam);
+        state.monitor_on = Boolean(payload.settings.monitor_on);
+        state.monitor_source = payload.settings.monitor_source || "beam";
+        state.sample_rate = payload.settings.sample_rate;
+        syncUiFromState();
+    }
+    // STT keys sit at the top level of the payload, next to "settings".
+    sttLoadFromStatus(payload || {});
+});
+
+// When the server acknowledges an "stt_settings" command, adopt the settings
+// it reports back (type "stt_settings_applied") as the authoritative state.
+// NOTE(review): assumes the server wraps _handle_command's return value in
+// a "server_ack" event - that code is not visible here; confirm.
+socket.on("server_ack", function(payload) {
+    if (payload && payload.type === "stt_settings_applied" && payload.settings) {
+        sttLoadFromStatus(payload.settings);
+    }
+});
+
+// Initialize STT controls
+sttBindControls();

+ 81 - 0
static/style.css

@@ -313,3 +313,84 @@ td {
         align-items: flex-start;
     }
 }
+
+/* ── STT Panel ── */
+.stt-panel .stt-output {
+    max-height: 400px;
+    overflow-y: auto;
+    padding: 0.8rem;
+    background: rgba(0, 0, 0, 0.25);
+    border-radius: 8px;
+    margin-top: 0.8rem;
+    font-size: 0.92rem;
+    line-height: 1.6;
+    min-height: 60px;
+}
+
+.stt-output:empty::after {
+    content: "Transkrypcja pojawi sie tutaj...";
+    color: rgba(255,255,255,0.25);
+    font-style: italic;
+}
+
+.stt-segment {
+    margin-bottom: 0.6rem;
+    padding: 0.4rem 0;
+    border-bottom: 1px solid rgba(255,255,255,0.06);
+}
+
+.stt-segment:last-child {
+    border-bottom: none;
+}
+
+.stt-segment.partial {
+    opacity: 0.6;
+    font-style: italic;
+}
+
+.stt-speaker {
+    display: inline-block;
+    padding: 0.1rem 0.5rem;
+    border-radius: 4px;
+    font-size: 0.78rem;
+    font-weight: 600;
+    margin-right: 0.4rem;
+    color: #fff;
+}
+
+.stt-speaker-0 { background: #4a9eff; }
+.stt-speaker-1 { background: #ff6b6b; }
+.stt-speaker-2 { background: #51cf66; }
+.stt-speaker-3 { background: #ffd43b; color: #333; }
+.stt-speaker-4 { background: #cc5de8; }
+.stt-speaker-5 { background: #ff922b; }
+.stt-speaker-6 { background: #20c997; }
+.stt-speaker-7 { background: #f06595; }
+
+.stt-timestamp {
+    color: rgba(255,255,255,0.35);
+    font-size: 0.75rem;
+    margin-right: 0.4rem;
+    font-variant-numeric: tabular-nums;
+}
+
+.stt-confidence {
+    color: rgba(255,255,255,0.3);
+    font-size: 0.72rem;
+    margin-left: 0.3rem;
+}
+
+.stt-emotion {
+    font-size: 0.72rem;
+    margin-left: 0.4rem;
+    padding: 0.05rem 0.35rem;
+    border-radius: 3px;
+    background: rgba(255,255,255,0.08);
+}
+
+.stt-word {
+    transition: background 0.15s;
+}
+.stt-word.conf-high { }
+.stt-word.conf-mid { color: #ffd43b; }
+.stt-word.conf-low { color: #ff6b6b; }

+ 201 - 0
stt_bridge.py

@@ -0,0 +1,201 @@
+"""Bridge between AudioEngine and stt.mm.mk WebSocket STT service.
+
+Connects to wss://stt.mm.mk/ws/transcribe, streams processed audio,
+and forwards transcription results back via a callback.
+"""
+from __future__ import annotations
+
+import json
+import logging
+import threading
+import time
+from dataclasses import dataclass
+
+import numpy as np
+
+logger = logging.getLogger("stt_bridge")
+
+try:
+    import websocket as ws_client  # websocket-client library
+except ImportError:
+    ws_client = None
+    logger.warning("websocket-client not installed — STT bridge unavailable")
+
+
+@dataclass
+class SttSettings:
+    """User-tunable options for one STT streaming session.
+
+    SttBridge._build_url turns these values into query parameters of the
+    WebSocket URL.
+    """
+
+    enabled: bool = False  # whether the bridge should connect and stream
+    language: str = "pl"  # sent as the ``language`` query parameter
+    timestamps: bool = True  # adds ``timestamps=1`` when set
+    diarize: bool = True  # adds ``diarize=1`` when set
+    itn: bool = True  # adds ``itn=1`` when set
+    detect_emotion: bool = False  # adds ``detect_emotion=1`` when set
+    server_vad: bool = False  # adds ``vad=1`` plus the three vad_* values
+    vad_threshold: float = 0.3  # only sent when server_vad is on
+    vad_pad_ms: int = 400  # only sent when server_vad is on
+    vad_min_ms: int = 100  # only sent when server_vad is on
+
+
+class SttBridge:
+    """Manages WebSocket connection to stt.mm.mk and streams audio."""
+
+    STT_URL = "wss://stt.mm.mk/ws/transcribe"
+
+    def __init__(self, on_message=None):
+        self._lock = threading.Lock()
+        self._settings = SttSettings()
+        self._on_message = on_message
+        self._ws = None
+        self._ws_thread: threading.Thread | None = None
+        self._connected = False
+        self._should_run = False
+        self._sample_rate = 16000
+
+    def get_settings(self) -> dict:
+        with self._lock:
+            return {
+                "stt_enabled": self._settings.enabled,
+                "stt_language": self._settings.language,
+                "stt_timestamps": self._settings.timestamps,
+                "stt_diarize": self._settings.diarize,
+                "stt_itn": self._settings.itn,
+                "stt_detect_emotion": self._settings.detect_emotion,
+                "stt_server_vad": self._settings.server_vad,
+                "stt_vad_threshold": self._settings.vad_threshold,
+                "stt_vad_pad_ms": self._settings.vad_pad_ms,
+                "stt_vad_min_ms": self._settings.vad_min_ms,
+                "stt_connected": self._connected,
+            }
+
+    def update_settings(self, **kwargs) -> dict:
+        reconnect_keys = {
+            "language", "timestamps", "diarize", "itn",
+            "detect_emotion", "server_vad", "vad_threshold",
+            "vad_pad_ms", "vad_min_ms",
+        }
+        changed_enabled = False
+        need_reconnect = False
+
+        with self._lock:
+            for key, val in kwargs.items():
+                attr = key.replace("stt_", "")
+                if hasattr(self._settings, attr):
+                    old = getattr(self._settings, attr)
+                    setattr(self._settings, attr, type(old)(val))
+                    if attr == "enabled" and old != self._settings.enabled:
+                        changed_enabled = True
+                    if attr in reconnect_keys:
+                        need_reconnect = True
+
+        if changed_enabled:
+            if self._settings.enabled:
+                self._start_connection()
+            else:
+                self._stop_connection()
+        elif self._settings.enabled and self._connected and need_reconnect:
+            self._stop_connection()
+            self._start_connection()
+
+        return self.get_settings()
+
+    def feed_audio(self, audio: np.ndarray, sample_rate: int) -> None:
+        """Feed processed audio (post-beamforming/AGC) to STT."""
+        if not self._connected or not self._settings.enabled:
+            return
+
+        self._sample_rate = sample_rate
+        pcm16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
+
+        try:
+            if self._ws and self._connected:
+                self._ws.send(pcm16.tobytes(), opcode=0x2)
+        except Exception:
+            pass
+
+    def _build_url(self) -> str:
+        s = self._settings
+        parts = [
+            "language=" + s.language,
+            "rate=" + str(self._sample_rate),
+            "stream_id=mic-system-" + str(int(time.time())),
+        ]
+        if s.timestamps:
+            parts.append("timestamps=1")
+        if s.diarize:
+            parts.append("diarize=1")
+        if s.itn:
+            parts.append("itn=1")
+        if s.detect_emotion:
+            parts.append("detect_emotion=1")
+        if s.server_vad:
+            parts.append("vad=1")
+            parts.append("vad_threshold=" + str(s.vad_threshold))
+            parts.append("vad_pad_ms=" + str(s.vad_pad_ms))
+            parts.append("vad_min_ms=" + str(s.vad_min_ms))
+        return self.STT_URL + "?" + "&".join(parts)
+
+    def _start_connection(self):
+        if ws_client is None:
+            logger.error("websocket-client not installed")
+            return
+
+        self._should_run = True
+        url = self._build_url()
+        logger.info("STT connecting to %s", url)
+
+        bridge = self
+
+        def on_open(ws):
+            bridge._connected = True
+            logger.info("STT WebSocket connected")
+            if bridge._on_message:
+                bridge._on_message({"type": "stt_status", "connected": True})
+
+        def on_message(ws, message):
+            try:
+                msg = json.loads(message)
+                if bridge._on_message:
+                    bridge._on_message(msg)
+            except Exception as e:
+                logger.error("STT message parse error: %s", e)
+
+        def on_error(ws, error):
+            logger.error("STT WebSocket error: %s", error)
+
+        def on_close(ws, close_status_code, close_msg):
+            bridge._connected = False
+            logger.info("STT WebSocket closed: %s %s", close_status_code, close_msg)
+            if bridge._on_message:
+                bridge._on_message({"type": "stt_status", "connected": False})
+            if bridge._should_run:
+                time.sleep(2)
+                if bridge._should_run:
+                    bridge._start_connection()
+
+        self._ws = ws_client.WebSocketApp(
+            url,
+            on_open=on_open,
+            on_message=on_message,
+            on_error=on_error,
+            on_close=on_close,
+        )
+        self._ws_thread = threading.Thread(
+            target=self._ws.run_forever,
+            kwargs={"ping_interval": 20, "ping_timeout": 10},
+            daemon=True,
+        )
+        self._ws_thread.start()
+
+    def _stop_connection(self):
+        self._should_run = False
+        self._connected = False
+        if self._ws:
+            try:
+                self._ws.close()
+            except Exception:
+                pass
+            self._ws = None
+        self._ws_thread = None
+
+    def stop(self):
+        self._stop_connection()

+ 62 - 0
templates/index.html

@@ -165,6 +165,67 @@
             </div>
         </section>
 
+        
+        <section class="stt-panel card">
+            <h2>Transkrypcja (STT)</h2>
+            <div class="group">
+                <label><input id="sttEnabled" type="checkbox"> Wlacz transkrypcje na zywo (stt.mm.mk)</label>
+                <span id="sttStatus" class="pill">STT: wylaczone</span>
+            </div>
+
+            <div id="sttControls" class="hidden">
+                <div class="row two-col">
+                    <div class="group">
+                        <label for="sttLanguage">Jezyk</label>
+                        <select id="sttLanguage">
+                            <option value="pl" selected>Polski</option>
+                            <option value="en">English</option>
+                            <option value="de">Deutsch</option>
+                            <option value="fr">Francais</option>
+                            <option value="es">Espanol</option>
+                            <option value="it">Italiano</option>
+                            <option value="pt">Portugues</option>
+                            <option value="nl">Nederlands</option>
+                            <option value="el">Ellinika</option>
+                            <option value="zh">Zhongwen</option>
+                            <option value="ja">Nihongo</option>
+                            <option value="ko">Hangugeo</option>
+                            <option value="vi">Tieng Viet</option>
+                            <option value="ar">Arabiyya</option>
+                        </select>
+                    </div>
+                    <div class="group">
+                        <label>Opcje</label>
+                        <div class="radio-row">
+                            <label><input id="sttTimestamps" type="checkbox" checked> Timestampy</label>
+                            <label><input id="sttDiarize" type="checkbox" checked> Diaryzacja</label>
+                            <label><input id="sttItn" type="checkbox" checked> ITN</label>
+                            <label><input id="sttEmotion" type="checkbox"> Emocje</label>
+                        </div>
+                    </div>
+                </div>
+
+                <div class="group">
+                    <label>Server VAD</label>
+                    <div class="radio-row">
+                        <label><input id="sttVad" type="checkbox"> Wlacz server VAD</label>
+                    </div>
+                </div>
+                <div id="sttVadControls" class="row two-col hidden">
+                    <div class="group">
+                        <label for="sttVadThreshold">VAD threshold: <span id="sttVadThresholdValue">0.3</span></label>
+                        <input id="sttVadThreshold" type="range" min="0.1" max="0.9" value="0.3" step="0.05">
+                    </div>
+                    <div class="group">
+                        <label for="sttVadPadMs">VAD pad: <span id="sttVadPadMsValue">400</span> ms</label>
+                        <input id="sttVadPadMs" type="range" min="100" max="1000" value="400" step="50">
+                    </div>
+                </div>
+            </div>
+
+            <div id="sttOutput" class="stt-output"></div>
+        </section>
+
         <section class="files card">
             <h2>Nagrania</h2>
             <div class="table-wrap">
@@ -186,5 +247,6 @@
 
     <script src="{{ url_for('static', filename='vendor/socket.io.min.js') }}"></script>
     <script src="{{ url_for('static', filename='app.js') }}"></script>
+    <script src="{{ url_for('static', filename='stt.js') }}"></script>
 </body>
 </html>