audio_capture.py 47 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185
  1. from __future__ import annotations
  2. from dataclasses import asdict, dataclass
  3. from datetime import datetime
  4. import base64
  5. from collections import deque
  6. from concurrent.futures import Future, ProcessPoolExecutor
  7. import math
  8. import os
  9. from pathlib import Path
  10. import threading
  11. import time
  12. import numpy as np
  13. from scipy import signal
  14. import sounddevice as sd
  15. from agc import AgcProcessor
  16. from beamforming import beamform_delay_and_sum
  17. from recorder import RecorderStatus, WavRecorder
  18. def _gcc_phat_job(sig: np.ndarray, refsig: np.ndarray, sample_rate: int, max_tau: float) -> float:
  19. n = sig.size + refsig.size
  20. sig_fft = np.fft.rfft(sig, n=n)
  21. ref_fft = np.fft.rfft(refsig, n=n)
  22. cross = sig_fft * np.conj(ref_fft)
  23. denom = np.abs(cross)
  24. cross = cross / np.maximum(denom, 1e-10)
  25. cc = np.fft.irfft(cross, n=n)
  26. max_shift = int(min(n // 2, max_tau * sample_rate))
  27. if max_shift <= 0:
  28. return 0.0
  29. cc_window = np.concatenate((cc[-max_shift:], cc[: max_shift + 1]))
  30. shift = int(np.argmax(np.abs(cc_window)) - max_shift)
  31. return float(shift) / float(sample_rate)
  32. def _estimate_speech_angle_job(
  33. mic1: np.ndarray,
  34. mic2: np.ndarray,
  35. sample_rate: int,
  36. mic_spacing: float,
  37. prev_angle_deg: float,
  38. prev_noise_floor: float,
  39. ) -> tuple[float, bool, float]:
  40. if mic1.size < 64 or mic2.size < 64:
  41. return float(prev_angle_deg), False, float(prev_noise_floor)
  42. high = min(3400.0, 0.45 * sample_rate)
  43. low = min(300.0, high * 0.5)
  44. if high <= low + 1.0:
  45. return float(prev_angle_deg), False, float(prev_noise_floor)
  46. try:
  47. sos = signal.butter(4, [low, high], btype="bandpass", fs=sample_rate, output="sos")
  48. except ValueError:
  49. return float(prev_angle_deg), False, float(prev_noise_floor)
  50. speech1 = signal.sosfilt(sos, mic1).astype(np.float32, copy=False)
  51. speech2 = signal.sosfilt(sos, mic2).astype(np.float32, copy=False)
  52. speech_energy = 0.5 * (np.mean(speech1 * speech1) + np.mean(speech2 * speech2))
  53. full_energy = 0.5 * (np.mean(mic1 * mic1) + np.mean(mic2 * mic2))
  54. speech_ratio = float(speech_energy / max(full_energy, 1e-12))
  55. noise_floor = 0.995 * float(prev_noise_floor) + 0.005 * float(speech_energy)
  56. speech_threshold = max(2.5e-7, noise_floor * 2.0)
  57. speech_detected = bool(speech_energy > speech_threshold and speech_ratio > 0.08)
  58. if not speech_detected:
  59. return float(prev_angle_deg), False, float(noise_floor)
  60. max_tau = mic_spacing / 343.0
  61. tau = _gcc_phat_job(speech1, speech2, sample_rate, max_tau=max_tau)
  62. sin_theta = np.clip((tau * 343.0) / max(mic_spacing, 1e-6), -1.0, 1.0)
  63. raw_angle = float(np.rad2deg(np.arcsin(sin_theta)))
  64. raw_angle = float(np.clip(raw_angle, -90.0, 90.0))
  65. angle = 0.88 * float(prev_angle_deg) + 0.12 * raw_angle
  66. return float(angle), True, float(noise_floor)
  67. ALLOWED_SAMPLE_RATES = {16000, 22050, 24000, 32000, 48000}
  68. ALLOWED_MODES = {"mic1", "mic2", "mono_mix", "beamforming"}
  69. ALLOWED_SOURCES = {"mic1", "mic2", "mono_mix", "beam", "compare_all", "hifi_raw"}
  70. ALLOWED_MONITOR_SOURCES = {"mic1", "mic2", "mono_mix", "beam"}
  71. ALLOWED_HIFI_MICS = {"mic1", "mic2"}
  72. @dataclass
  73. class AudioSettings:
  74. mode: str = "beamforming"
  75. gain_db: float = 0.0
  76. agc: bool = True
  77. attack_ms: float = 6.0
  78. release_ms: float = 280.0
  79. noise_suppression: bool = True
  80. speech_gate: bool = False
  81. hum_filter: bool = True
  82. limiter: bool = True
  83. beam_clarity: bool = True
  84. hifi_mode: bool = False
  85. hifi_mic: str = "mic1"
  86. angle: float = 0.0
  87. auto_beam: bool = True
  88. monitor_on: bool = False
  89. monitor_source: str = "beam"
  90. sample_rate: int = 16000
  91. class AudioEngine:
  92. CHANNELS = 2
  93. BIT_DEPTH = 32
  94. CHUNK_SIZE = 2048
  95. HARDWARE_SAMPLE_RATE = 48000
  96. RING_DIAMETER_M = 0.06
  97. RING_SLOTS = 4
  98. SLOT_STEP = 1 # neighboring slots (90 degrees). Use 2 for opposite mics.
  99. MIC_SPACING = RING_DIAMETER_M * math.sin(math.pi * SLOT_STEP / RING_SLOTS)
  100. STARTUP_IGNORE_SECONDS = 0.30
  101. SPEECH_GATE_HOLD_SECONDS = 0.85
  102. SPEECH_GATE_FLOOR = 0.55
  103. SPEECH_GATE_ATTACK_SECONDS = 0.012
  104. SPEECH_GATE_RELEASE_SECONDS = 0.360
  105. NOISE_SUPPRESS_OPEN_FLOOR = 0.72
  106. NOISE_SUPPRESS_CLOSED_FLOOR = 0.40
  107. NOISE_SUPPRESS_OPEN_STRENGTH = 0.30
  108. NOISE_SUPPRESS_CLOSED_STRENGTH = 0.55
  109. HUM_HPF_CUTOFF_HZ = 75.0
  110. HUM_NOTCH_HZ = 50.0
  111. HUM_NOTCH_Q = 22.0
  112. BEAM_CLARITY_BLEND = 0.22
  113. BEAM_PRESENCE_BOOST = 0.20
  114. def __init__(self, recordings_dir: Path) -> None:
  115. self._lock = threading.Lock()
  116. self._stream: sd.InputStream | None = None
  117. self._stream_channels = self.CHANNELS
  118. self._input_device_name = "unknown"
  119. self._settings = AudioSettings()
  120. self._running = False
  121. self._startup_deadline = 0.0
  122. self._auto_angle_deg = 0.0
  123. self._noise_floor = 1e-7
  124. self._vad_noise_floor = 1e-7
  125. self._speech_sos_cache: dict[int, np.ndarray | None] = {}
  126. self._hpf_sos_cache: dict[int, np.ndarray | None] = {}
  127. self._notch_cache: dict[int, tuple[np.ndarray, np.ndarray] | None] = {}
  128. self._hpf_state: dict[tuple[str, int], np.ndarray] = {}
  129. self._notch_state: dict[tuple[str, int], np.ndarray] = {}
  130. self._last_speech_detected = False
  131. self._angle_update_counter = 0
  132. self._angle_update_interval = 4
  133. cpu_total = max(1, int(os.cpu_count() or 1))
  134. self._cpu_workers = max(1, min(4, cpu_total))
  135. self._angle_workers = max(1, min(3, self._cpu_workers - 1))
  136. self._max_pending_angle_jobs = max(1, min(2, self._angle_workers))
  137. self._angle_executor: ProcessPoolExecutor | None = ProcessPoolExecutor(max_workers=self._angle_workers)
  138. self._angle_futures: deque[Future] = deque()
  139. self._speech_gate_hold_chunks = 0
  140. self._speech_gate_gain = 1.0
  141. self._ns_noise_power = {
  142. "mic1": 1e-7,
  143. "mic2": 1e-7,
  144. "mono_mix": 1e-7,
  145. "beam": 1e-7,
  146. }
  147. self._presence_prev = {"beam": 0.0}
  148. self._recordings_dir = Path(recordings_dir)
  149. self._recorder = WavRecorder(recordings_dir)
  150. self._compare_recorders: dict[str, WavRecorder] = {}
  151. self._compare_filenames: dict[str, str] = {}
  152. self._record_duration_limit_sec: float | None = None
  153. self._auto_stop_requested = False
  154. self._monitor_queue: deque[np.ndarray] = deque()
  155. self._monitor_queue_samples = 0
  156. self._agc_mic1 = AgcProcessor(self._settings.sample_rate)
  157. self._agc_mic2 = AgcProcessor(self._settings.sample_rate)
  158. self._agc_beam = AgcProcessor(self._settings.sample_rate)
  159. self._latest_frame: dict[str, object] = self._make_empty_frame()
  160. self._stt_bridge = None
  161. def start(self) -> None:
  162. with self._lock:
  163. if self._running:
  164. return
  165. self._open_stream()
  166. def stop(self) -> None:
  167. with self._lock:
  168. stream = self._stream
  169. self._stream = None
  170. self._running = False
  171. angle_executor = self._angle_executor
  172. self._angle_executor = None
  173. angle_futures = list(self._angle_futures)
  174. self._angle_futures.clear()
  175. if stream is not None:
  176. stream.stop()
  177. stream.close()
  178. self.stop_recording()
  179. for fut in angle_futures:
  180. fut.cancel()
  181. if angle_executor is not None:
  182. angle_executor.shutdown(wait=False, cancel_futures=True)
  183. def is_running(self) -> bool:
  184. with self._lock:
  185. return self._running
  186. def set_stt_bridge(self, bridge) -> None:
  187. self._stt_bridge = bridge
  188. def get_settings(self) -> dict[str, object]:
  189. with self._lock:
  190. return asdict(self._settings)
  191. def get_status(self) -> dict[str, object]:
  192. rec_status = self._current_recording_status()
  193. return {
  194. "recording": rec_status.recording,
  195. "mic_count": self.CHANNELS,
  196. "hardware_sample_rate": self.HARDWARE_SAMPLE_RATE,
  197. "mic_spacing_m": self.MIC_SPACING,
  198. "auto_beam_angle_deg": self._auto_angle_deg,
  199. "input_device": self._input_device_name,
  200. "settings": self.get_settings(),
  201. "recording_status": asdict(rec_status),
  202. "audio_running": self.is_running(),
  203. }
  204. def get_latest_packet(self) -> dict[str, object]:
  205. empty = np.empty(0, dtype=np.float32)
  206. with self._lock:
  207. frame = dict(self._latest_frame)
  208. mic1 = np.asarray(frame.get("mic1", empty), dtype=np.float32)
  209. mic2 = np.asarray(frame.get("mic2", empty), dtype=np.float32)
  210. beam = np.asarray(frame.get("beam", empty), dtype=np.float32)
  211. mono_mix = np.asarray(frame.get("mono_mix", empty), dtype=np.float32)
  212. show_mic2 = bool(frame.get("show_mic2", True))
  213. show_beam = bool(frame.get("show_beam", False))
  214. show_mono_mix = bool(frame.get("show_mono_mix", False))
  215. beam_angle_deg = float(frame.get("beam_angle_deg", 0.0))
  216. auto_beam = bool(frame.get("auto_beam", True))
  217. speech_detected = bool(frame.get("speech_detected", False))
  218. speech_gate_open = bool(frame.get("speech_gate_open", True))
  219. hifi_mode = bool(frame.get("hifi_mode", False))
  220. monitor_on = bool(frame.get("monitor_on", False))
  221. monitor_source = str(frame.get("monitor_source", "beam"))
  222. recording = bool(frame.get("recording", False))
  223. rec_duration = float(frame.get("rec_duration", 0.0))
  224. monitor_rate = int(self.HARDWARE_SAMPLE_RATE if hifi_mode else self._settings.sample_rate)
  225. monitor_chunk = None
  226. if monitor_on:
  227. monitor_chunk = self._pop_monitor_chunk(max_samples=max(512, int(monitor_rate * 0.08)))
  228. packet = {
  229. "type": "audio_data",
  230. "mic1": self._downsample_for_ui(mic1).tolist(),
  231. "mic2": self._downsample_for_ui(mic2).tolist() if show_mic2 else [],
  232. "beam": self._downsample_for_ui(beam).tolist() if show_beam else [],
  233. "mono_mix": self._downsample_for_ui(mono_mix).tolist() if show_mono_mix else [],
  234. "rms_mic1": self._rms(mic1),
  235. "rms_mic2": self._rms(mic2) if show_mic2 else 0.0,
  236. "rms_beam": self._rms(beam) if show_beam else 0.0,
  237. "rms_mono_mix": self._rms(mono_mix) if show_mono_mix else 0.0,
  238. "beam_angle_deg": beam_angle_deg,
  239. "auto_beam": auto_beam,
  240. "speech_detected": speech_detected,
  241. "speech_gate_open": speech_gate_open,
  242. "hifi_mode": hifi_mode,
  243. "monitor_on": monitor_on,
  244. "monitor_source": monitor_source,
  245. "monitor_sr": monitor_rate,
  246. "monitor_chunk_b64": "",
  247. "recording": recording,
  248. "rec_duration": rec_duration,
  249. }
  250. if monitor_chunk is not None and monitor_chunk.size > 0:
  251. packet["monitor_chunk_b64"] = self._encode_pcm16_base64(monitor_chunk)
  252. return packet
  253. def update_settings(self, updates: dict[str, object]) -> dict[str, object]:
  254. with self._lock:
  255. current = self._settings
  256. mode = str(updates.get("mode", current.mode))
  257. if mode not in ALLOWED_MODES:
  258. mode = current.mode
  259. sample_rate = int(updates.get("sample_rate", current.sample_rate))
  260. if sample_rate not in ALLOWED_SAMPLE_RATES:
  261. sample_rate = current.sample_rate
  262. gain_db = float(updates.get("gain_db", current.gain_db))
  263. gain_db = float(np.clip(gain_db, 0.0, 30.0))
  264. agc = bool(updates.get("agc", current.agc))
  265. attack_ms = float(updates.get("attack_ms", current.attack_ms))
  266. attack_ms = float(np.clip(attack_ms, 1.0, 50.0))
  267. release_ms = float(updates.get("release_ms", current.release_ms))
  268. release_ms = float(np.clip(release_ms, 50.0, 1000.0))
  269. noise_suppression = bool(updates.get("noise_suppression", current.noise_suppression))
  270. speech_gate = bool(updates.get("speech_gate", current.speech_gate))
  271. hum_filter = bool(updates.get("hum_filter", current.hum_filter))
  272. limiter = bool(updates.get("limiter", current.limiter))
  273. beam_clarity = bool(updates.get("beam_clarity", current.beam_clarity))
  274. hifi_mode = bool(updates.get("hifi_mode", current.hifi_mode))
  275. hifi_mic = str(updates.get("hifi_mic", current.hifi_mic))
  276. if hifi_mic not in ALLOWED_HIFI_MICS:
  277. hifi_mic = current.hifi_mic
  278. angle = float(updates.get("angle", current.angle))
  279. angle = float(np.clip(angle, -90.0, 90.0))
  280. auto_beam = bool(updates.get("auto_beam", current.auto_beam))
  281. monitor_on = bool(updates.get("monitor_on", current.monitor_on))
  282. monitor_source = str(updates.get("monitor_source", current.monitor_source))
  283. if monitor_source not in ALLOWED_MONITOR_SOURCES:
  284. monitor_source = current.monitor_source
  285. if hifi_mode:
  286. monitor_on = False
  287. self._settings = AudioSettings(
  288. mode=mode,
  289. gain_db=gain_db,
  290. agc=agc,
  291. attack_ms=attack_ms,
  292. release_ms=release_ms,
  293. noise_suppression=noise_suppression,
  294. speech_gate=speech_gate,
  295. hum_filter=hum_filter,
  296. limiter=limiter,
  297. beam_clarity=beam_clarity,
  298. hifi_mode=hifi_mode,
  299. hifi_mic=hifi_mic,
  300. angle=angle,
  301. auto_beam=auto_beam,
  302. monitor_on=monitor_on,
  303. monitor_source=monitor_source,
  304. sample_rate=sample_rate,
  305. )
  306. if not auto_beam:
  307. self._auto_angle_deg = angle
  308. if not monitor_on or hifi_mode:
  309. self._clear_monitor_queue_locked()
  310. self._agc_mic1.update(
  311. sample_rate=sample_rate,
  312. attack_ms=attack_ms,
  313. release_ms=release_ms,
  314. )
  315. self._agc_mic2.update(
  316. sample_rate=sample_rate,
  317. attack_ms=attack_ms,
  318. release_ms=release_ms,
  319. )
  320. self._agc_beam.update(
  321. sample_rate=sample_rate,
  322. attack_ms=attack_ms,
  323. release_ms=release_ms,
  324. )
  325. return self.get_settings()
  326. def start_recording(self, source: str, duration_sec: float | None = None) -> dict[str, object]:
  327. source = source.lower().strip()
  328. if source not in ALLOWED_SOURCES:
  329. raise ValueError("Invalid recording source")
  330. if self._current_recording_status().recording:
  331. raise RuntimeError("Recording is already active")
  332. limit = None
  333. if duration_sec is not None:
  334. try:
  335. duration_value = float(duration_sec)
  336. except (TypeError, ValueError):
  337. duration_value = 0.0
  338. if duration_value > 0.0:
  339. limit = float(np.clip(duration_value, 1.0, 3600.0))
  340. with self._lock:
  341. self._record_duration_limit_sec = limit
  342. self._auto_stop_requested = False
  343. settings = self.get_settings()
  344. sample_rate = int(settings["sample_rate"])
  345. hifi_mode = bool(settings.get("hifi_mode", False))
  346. hifi_mic = str(settings.get("hifi_mic", "mic1"))
  347. if hifi_mic not in ALLOWED_HIFI_MICS:
  348. hifi_mic = "mic1"
  349. if hifi_mode:
  350. timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
  351. filename = self._recorder.start(
  352. source="hifi_raw",
  353. sample_rate=self.HARDWARE_SAMPLE_RATE,
  354. channels=1,
  355. filename=f"rec_{timestamp}_hifi_{hifi_mic}_48k.wav",
  356. )
  357. with self._lock:
  358. self._compare_recorders = {}
  359. self._compare_filenames = {}
  360. return {
  361. "source": "hifi_raw",
  362. "filenames": [filename],
  363. "duration_limit_sec": limit or 0.0,
  364. }
  365. if source == "hifi_raw":
  366. source = hifi_mic
  367. if source == "compare_all":
  368. timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
  369. recorders = {
  370. "mic1": WavRecorder(self._recordings_dir),
  371. "mono_mix": WavRecorder(self._recordings_dir),
  372. "beam": WavRecorder(self._recordings_dir),
  373. }
  374. filenames = {
  375. "mic1": f"rec_{timestamp}_mic1.wav",
  376. "mono_mix": f"rec_{timestamp}_mono_mix.wav",
  377. "beam": f"rec_{timestamp}_beam.wav",
  378. }
  379. for key, recorder in recorders.items():
  380. recorder.start(
  381. source=key,
  382. sample_rate=sample_rate,
  383. channels=1,
  384. filename=filenames[key],
  385. )
  386. with self._lock:
  387. self._compare_recorders = recorders
  388. self._compare_filenames = filenames
  389. return {
  390. "source": source,
  391. "filenames": [filenames["mic1"], filenames["mono_mix"], filenames["beam"]],
  392. "duration_limit_sec": limit or 0.0,
  393. }
  394. channels = 1
  395. filename = self._recorder.start(
  396. source=source,
  397. sample_rate=sample_rate,
  398. channels=channels,
  399. )
  400. with self._lock:
  401. self._compare_recorders = {}
  402. self._compare_filenames = {}
  403. return {
  404. "source": source,
  405. "filenames": [filename],
  406. "duration_limit_sec": limit or 0.0,
  407. }
  408. def stop_recording(self) -> RecorderStatus:
  409. with self._lock:
  410. compare_recorders = self._compare_recorders
  411. compare_filenames = dict(self._compare_filenames)
  412. self._compare_recorders = {}
  413. self._compare_filenames = {}
  414. self._record_duration_limit_sec = None
  415. self._auto_stop_requested = False
  416. if compare_recorders:
  417. max_duration = 0.0
  418. sample_rate = self._settings.sample_rate
  419. for recorder in compare_recorders.values():
  420. status = recorder.stop()
  421. max_duration = max(max_duration, status.duration_sec)
  422. sample_rate = status.sample_rate
  423. return RecorderStatus(
  424. recording=False,
  425. filename=",".join(compare_filenames.values()),
  426. duration_sec=max_duration,
  427. channels=1,
  428. sample_rate=sample_rate,
  429. source="compare_all",
  430. )
  431. return self._recorder.stop()
  432. def get_recording_status(self) -> RecorderStatus:
  433. return self._current_recording_status()
  434. def _current_recording_status(self) -> RecorderStatus:
  435. with self._lock:
  436. compare_recorders = dict(self._compare_recorders)
  437. compare_filenames = dict(self._compare_filenames)
  438. if compare_recorders:
  439. statuses = [recorder.get_status() for recorder in compare_recorders.values()]
  440. is_recording = any(status.recording for status in statuses)
  441. duration = max((status.duration_sec for status in statuses), default=0.0)
  442. sample_rate = statuses[0].sample_rate if statuses else self._settings.sample_rate
  443. return RecorderStatus(
  444. recording=is_recording,
  445. filename=",".join(compare_filenames.values()),
  446. duration_sec=duration,
  447. channels=1,
  448. sample_rate=sample_rate,
  449. source="compare_all",
  450. )
  451. return self._recorder.get_status()
  452. def _open_stream(self) -> None:
  453. device_idx, device_name, input_channels = self._resolve_input_device()
  454. stream = sd.InputStream(
  455. samplerate=self.HARDWARE_SAMPLE_RATE,
  456. device=device_idx,
  457. channels=input_channels,
  458. dtype="int32",
  459. blocksize=self.CHUNK_SIZE,
  460. callback=self._audio_callback,
  461. )
  462. stream.start()
  463. with self._lock:
  464. self._stream = stream
  465. self._stream_channels = input_channels
  466. self._input_device_name = device_name
  467. self._running = True
  468. self._startup_deadline = time.monotonic() + self.STARTUP_IGNORE_SECONDS
  469. def _restart_stream(self) -> None:
  470. with self._lock:
  471. stream = self._stream
  472. self._stream = None
  473. self._running = False
  474. if stream is not None:
  475. stream.stop()
  476. stream.close()
  477. self._open_stream()
    def _audio_callback(self, indata: np.ndarray, frames: int, time_info: dict, status: sd.CallbackFlags) -> None:
        """Process one block of captured audio (the stream's callback).

        Steps, in order: convert the block to float, resample to the
        processing rate (unless hi-fi mode), build the mic1 / mic2 / mono-mix /
        beam signals through the enabled processing stages, write to any
        active recording, queue monitor audio, hand audio to the STT bridge
        and publish the result as the latest frame.

        Args:
            indata: Captured samples, indexed as ``[sample, channel]``.
            frames: Unused.
            time_info: Unused.
            status: Stream status flags; a block with any flag set is dropped.
        """
        del frames, time_info
        # Any reported status flag causes the whole block to be skipped.
        if status:
            return
        with self._lock:
            if not self._running:
                return
            # Drop audio captured during the start-up ignore window.
            if time.monotonic() < self._startup_deadline:
                return
            settings = self._settings
            stream_channels = self._stream_channels
        # Shift right by 8 bits and scale by 2**23 to get floats in [-1, 1].
        # NOTE(review): presumably the device delivers 24-bit samples
        # left-justified in 32-bit words; confirm against the hardware.
        pcm32 = indata.astype(np.int32, copy=False)
        pcm24 = pcm32 >> 8
        normalized = np.clip(pcm24.astype(np.float32) / 8388608.0, -1.0, 1.0)
        mic1_raw = normalized[:, 0]
        # With a single input channel, mic2 is a copy of mic1.
        mic2_raw = normalized[:, 1] if stream_channels > 1 else mic1_raw.copy()
        # Hi-fi mode stays at the hardware rate; otherwise resample to the
        # configured processing rate.
        processing_rate = self.HARDWARE_SAMPLE_RATE if settings.hifi_mode else settings.sample_rate
        if (not settings.hifi_mode) and settings.sample_rate != self.HARDWARE_SAMPLE_RATE:
            mic1_raw = self._resample_audio(mic1_raw, self.HARDWARE_SAMPLE_RATE, settings.sample_rate)
            mic2_raw = self._resample_audio(mic2_raw, self.HARDWARE_SAMPLE_RATE, settings.sample_rate)
            # Trim both channels to the same length after resampling.
            common_len = min(mic1_raw.size, mic2_raw.size)
            mic1_raw = mic1_raw[:common_len]
            mic2_raw = mic2_raw[:common_len]
        if settings.hifi_mode:
            # Hi-fi mode bypasses all processing: the selected microphone is
            # used for the mic1, mono-mix and beam slots.
            hifi_mic = settings.hifi_mic if settings.hifi_mic in ALLOWED_HIFI_MICS else "mic1"
            hifi_signal = mic1_raw if hifi_mic == "mic1" else mic2_raw
            mic1_proc = hifi_signal
            mic2_proc = mic2_raw if hifi_mic == "mic1" else mic1_raw
            mono_mix_proc = hifi_signal
            beam_proc = hifi_signal
            angle_to_use = 0.0
            speech_active = True
            gate_open = True
        else:
            # Apply the manual gain (dB to linear) and clip to [-1, 1].
            gain_linear = 10.0 ** (settings.gain_db / 20.0)
            mic1_base = np.clip(mic1_raw * gain_linear, -1.0, 1.0)
            mic2_base = np.clip(mic2_raw * gain_linear, -1.0, 1.0)
            mono_base = np.clip(0.5 * (mic1_base + mic2_base), -1.0, 1.0)
            angle_to_use = settings.angle
            speech_detected = False
            if settings.mode == "beamforming" and settings.auto_beam:
                # Collect finished angle jobs; only the most recently queued
                # finished result is used, unfinished jobs stay pending.
                latest_done: Future | None = None
                pending: deque[Future] = deque()
                while self._angle_futures:
                    fut = self._angle_futures.popleft()
                    if fut.done():
                        latest_done = fut
                    else:
                        pending.append(fut)
                self._angle_futures = pending
                if latest_done is not None:
                    try:
                        angle_est, speech_est, noise_floor = latest_done.result(timeout=0)
                        self._auto_angle_deg = float(angle_est)
                        self._last_speech_detected = bool(speech_est)
                        self._noise_floor = max(float(noise_floor), 1e-9)
                    except Exception:
                        # A failed or cancelled job keeps the previous estimate.
                        pass
                # Submit a new job every `_angle_update_interval` blocks, as
                # long as the number of pending jobs is below the limit.
                self._angle_update_counter += 1
                if (
                    self._angle_update_counter >= self._angle_update_interval
                    and len(self._angle_futures) < self._max_pending_angle_jobs
                    and self._angle_executor is not None
                ):
                    self._angle_update_counter = 0
                    try:
                        fut = self._angle_executor.submit(
                            _estimate_speech_angle_job,
                            mic1_base.astype(np.float32, copy=True),
                            mic2_base.astype(np.float32, copy=True),
                            int(processing_rate),
                            float(self.MIC_SPACING),
                            float(self._auto_angle_deg),
                            float(self._noise_floor),
                        )
                        self._angle_futures.append(fut)
                    except Exception:
                        # Submission failures are ignored; the last known
                        # angle stays in use.
                        pass
                angle_to_use = self._auto_angle_deg
                speech_detected = self._last_speech_detected
            else:
                # Auto tracking is not in use: cancel any queued angle jobs.
                while self._angle_futures:
                    self._angle_futures.popleft().cancel()
                if not settings.auto_beam:
                    self._auto_angle_deg = settings.angle
                speech_detected = False
            # Speech counts as active if either the angle job or the fast
            # in-callback estimate reports it.
            speech_presence = self._estimate_speech_presence_fast(mic1_base, mic2_base)
            speech_active = bool(speech_detected or speech_presence)
            beam_proc = beamform_delay_and_sum(
                mic1_base,
                mic2_base,
                angle_deg=angle_to_use,
                sample_rate=processing_rate,
                mic_spacing=self.MIC_SPACING,
            )
            if settings.beam_clarity:
                beam_proc = self._apply_beam_clarity_blend(beam_proc, mono_base)
            gate_open, gate_gain = self._update_speech_gate(
                speech_detected=speech_active,
                sample_rate=processing_rate,
                chunk_len=max(beam_proc.size, 1),
                enabled=settings.speech_gate,
            )
            if settings.agc:
                mic1_proc = self._agc_mic1.process(mic1_base, speech_hint=gate_open)
                mic2_proc = self._agc_mic2.process(mic2_base, speech_hint=gate_open)
                beam_proc = self._agc_beam.process(beam_proc, speech_hint=gate_open)
            else:
                mic1_proc = mic1_base
                mic2_proc = mic2_base
            mono_mix_proc = np.clip(0.5 * (mic1_proc + mic2_proc), -1.0, 1.0)
            if settings.hum_filter:
                mic1_proc = self._apply_hum_filter(mic1_proc, processing_rate, "mic1")
                mic2_proc = self._apply_hum_filter(mic2_proc, processing_rate, "mic2")
                beam_proc = self._apply_hum_filter(beam_proc, processing_rate, "beam")
                # The mono mix is rebuilt from the filtered microphones and
                # then filtered again under its own "mono_mix" key.
                mono_mix_proc = np.clip(0.5 * (mic1_proc + mic2_proc), -1.0, 1.0)
                mono_mix_proc = self._apply_hum_filter(mono_mix_proc, processing_rate, "mono_mix")
            if settings.noise_suppression:
                mic1_proc = self._apply_noise_suppression(mic1_proc, "mic1", speech_active=gate_open)
                mic2_proc = self._apply_noise_suppression(mic2_proc, "mic2", speech_active=gate_open)
                mono_mix_proc = self._apply_noise_suppression(mono_mix_proc, "mono_mix", speech_active=gate_open)
                beam_proc = self._apply_noise_suppression(beam_proc, "beam", speech_active=gate_open)
            if settings.speech_gate:
                mic1_proc = np.clip(mic1_proc * gate_gain, -1.0, 1.0).astype(np.float32, copy=False)
                mic2_proc = np.clip(mic2_proc * gate_gain, -1.0, 1.0).astype(np.float32, copy=False)
                mono_mix_proc = np.clip(mono_mix_proc * gate_gain, -1.0, 1.0).astype(np.float32, copy=False)
                beam_proc = np.clip(beam_proc * gate_gain, -1.0, 1.0).astype(np.float32, copy=False)
            if settings.limiter:
                mic1_proc = self._apply_limiter(mic1_proc)
                mic2_proc = self._apply_limiter(mic2_proc)
                mono_mix_proc = self._apply_limiter(mono_mix_proc)
                beam_proc = self._apply_limiter(beam_proc)
        rec_status = self._current_recording_status()
        if rec_status.recording:
            self._write_recording_chunk(rec_status.source, mic1_raw, mic2_raw, mic1_proc, mono_mix_proc, beam_proc)
            # Re-read the status so the duration includes the block just written.
            rec_status = self._current_recording_status()
        should_auto_stop = False
        with self._lock:
            duration_limit = self._record_duration_limit_sec
            if (
                rec_status.recording
                and duration_limit is not None
                and rec_status.duration_sec >= duration_limit
                and not self._auto_stop_requested
            ):
                # The flag ensures the stop is requested only once per recording.
                self._auto_stop_requested = True
                should_auto_stop = True
        if should_auto_stop:
            # The stop runs on a separate daemon thread, not in this callback.
            threading.Thread(target=self.stop_recording, daemon=True).start()
        show_beam = (settings.mode == "beamforming") and (not settings.hifi_mode)
        show_mono_mix = (settings.mode == "mono_mix") and (not settings.hifi_mode)
        show_mic2 = not settings.hifi_mode
        if settings.monitor_on:
            monitor_map = {
                "mic1": mic1_proc,
                "mic2": mic2_proc,
                "mono_mix": mono_mix_proc,
                "beam": beam_proc,
            }
            # Unknown monitor sources fall back to the beam signal.
            monitor_signal = monitor_map.get(settings.monitor_source, beam_proc)
            with self._lock:
                self._push_monitor_chunk_locked(monitor_signal, processing_rate)
        frame = {
            "mic1": mic1_proc.astype(np.float32, copy=False),
            "mic2": mic2_proc.astype(np.float32, copy=False),
            "beam": beam_proc.astype(np.float32, copy=False),
            "mono_mix": mono_mix_proc.astype(np.float32, copy=False),
            "show_mic2": show_mic2,
            "show_beam": show_beam,
            "show_mono_mix": show_mono_mix,
            "beam_angle_deg": float(angle_to_use),
            "auto_beam": settings.auto_beam,
            "speech_detected": speech_active,
            "speech_gate_open": gate_open,
            "hifi_mode": settings.hifi_mode,
            "monitor_on": settings.monitor_on,
            "monitor_source": settings.monitor_source,
            "recording": rec_status.recording,
            "rec_duration": rec_status.duration_sec,
        }
        # Feed processed audio to the STT bridge; the signal follows the mode.
        if self._stt_bridge is not None:
            if settings.mode == 'beamforming':
                stt_signal = beam_proc
            elif settings.mode == 'mono_mix':
                stt_signal = mono_mix_proc
            else:
                stt_signal = mic1_proc
            try:
                self._stt_bridge.feed_audio(stt_signal, processing_rate)
            except Exception:
                # Errors raised by the bridge are ignored here.
                pass
        with self._lock:
            self._latest_frame = frame
  672. def _write_recording_chunk(
  673. self,
  674. source: str,
  675. mic1_raw: np.ndarray,
  676. mic2_raw: np.ndarray,
  677. mic1_proc: np.ndarray,
  678. mono_mix_proc: np.ndarray,
  679. beam_proc: np.ndarray,
  680. ) -> None:
  681. if source == "compare_all":
  682. with self._lock:
  683. mic1_rec = self._compare_recorders.get("mic1")
  684. mix_rec = self._compare_recorders.get("mono_mix")
  685. beam_rec = self._compare_recorders.get("beam")
  686. if mic1_rec is not None:
  687. mic1_rec.write(mic1_proc)
  688. if mix_rec is not None:
  689. mix_rec.write(mono_mix_proc)
  690. if beam_rec is not None:
  691. beam_rec.write(beam_proc)
  692. return
  693. if source == "mic1":
  694. self._recorder.write(mic1_raw)
  695. return
  696. if source == "mic2":
  697. self._recorder.write(mic2_raw)
  698. return
  699. if source == "mono_mix":
  700. self._recorder.write(mono_mix_proc)
  701. return
  702. if source == "beam":
  703. self._recorder.write(beam_proc)
  704. return
  705. if source == "hifi_raw":
  706. with self._lock:
  707. hifi_mic = self._settings.hifi_mic
  708. if hifi_mic == "mic2":
  709. self._recorder.write(mic2_raw)
  710. else:
  711. self._recorder.write(mic1_raw)
  712. return
  713. def _apply_beam_clarity_blend(self, beam: np.ndarray, mono: np.ndarray) -> np.ndarray:
  714. if beam.size == 0 or mono.size == 0:
  715. return beam.astype(np.float32, copy=False)
  716. n = min(beam.size, mono.size)
  717. if n <= 0:
  718. return beam.astype(np.float32, copy=False)
  719. blend = float(np.clip(self.BEAM_CLARITY_BLEND, 0.0, 0.5))
  720. out = ((1.0 - blend) * beam[:n] + blend * mono[:n]).astype(np.float32, copy=False)
  721. prev = float(self._presence_prev.get("beam", 0.0))
  722. hp = np.empty_like(out)
  723. hp[0] = out[0] - prev
  724. if out.size > 1:
  725. hp[1:] = out[1:] - out[:-1]
  726. self._presence_prev["beam"] = float(out[-1])
  727. out = out + float(self.BEAM_PRESENCE_BOOST) * hp
  728. return np.clip(out, -1.0, 1.0).astype(np.float32, copy=False)
  729. def _update_speech_gate(
  730. self,
  731. *,
  732. speech_detected: bool,
  733. sample_rate: int,
  734. chunk_len: int,
  735. enabled: bool,
  736. ) -> tuple[bool, float]:
  737. if not enabled:
  738. self._speech_gate_hold_chunks = 0
  739. self._speech_gate_gain = 1.0
  740. return True, 1.0
  741. chunk_seconds = max(float(chunk_len) / float(sample_rate), 1e-6)
  742. hold_chunks = max(1, int(round(self.SPEECH_GATE_HOLD_SECONDS / chunk_seconds)))
  743. if speech_detected:
  744. self._speech_gate_hold_chunks = hold_chunks
  745. elif self._speech_gate_hold_chunks > 0:
  746. self._speech_gate_hold_chunks -= 1
  747. gate_open = bool(speech_detected or self._speech_gate_hold_chunks > 0)
  748. target_gain = 1.0 if gate_open else self.SPEECH_GATE_FLOOR
  749. tau = self.SPEECH_GATE_ATTACK_SECONDS if target_gain > self._speech_gate_gain else self.SPEECH_GATE_RELEASE_SECONDS
  750. coeff = np.exp(-chunk_seconds / max(tau, 1e-4))
  751. self._speech_gate_gain = float(coeff * self._speech_gate_gain + (1.0 - coeff) * target_gain)
  752. return gate_open, self._speech_gate_gain
  753. def _apply_noise_suppression(self, audio: np.ndarray, key: str, *, speech_active: bool) -> np.ndarray:
  754. if audio.size == 0:
  755. return audio.astype(np.float32, copy=False)
  756. power = float(np.mean(audio * audio, dtype=np.float64))
  757. prev_noise = float(self._ns_noise_power.get(key, 1e-7))
  758. alpha = 0.01 if speech_active else 0.07
  759. noise = (1.0 - alpha) * prev_noise + alpha * power
  760. noise = max(noise, 1e-9)
  761. self._ns_noise_power[key] = noise
  762. ratio = noise / max(power, 1e-9)
  763. if speech_active:
  764. strength = self.NOISE_SUPPRESS_OPEN_STRENGTH
  765. floor = self.NOISE_SUPPRESS_OPEN_FLOOR
  766. else:
  767. strength = self.NOISE_SUPPRESS_CLOSED_STRENGTH
  768. floor = self.NOISE_SUPPRESS_CLOSED_FLOOR
  769. gain = float(np.clip(1.0 - strength * ratio, floor, 1.0))
  770. out = audio.astype(np.float32, copy=False) * gain
  771. return np.clip(out, -1.0, 1.0).astype(np.float32, copy=False)
  772. def _apply_hum_filter(self, audio: np.ndarray, sample_rate: int, channel_key: str) -> np.ndarray:
  773. if audio.size == 0:
  774. return audio.astype(np.float32, copy=False)
  775. out = audio.astype(np.float32, copy=False)
  776. state_key = (channel_key, int(sample_rate))
  777. sos = self._get_hpf_sos(sample_rate)
  778. if sos is not None:
  779. zi = self._hpf_state.get(state_key)
  780. if zi is None or zi.shape != (sos.shape[0], 2):
  781. zi = np.zeros((sos.shape[0], 2), dtype=np.float32)
  782. out, zi_new = signal.sosfilt(sos, out, zi=zi)
  783. self._hpf_state[state_key] = zi_new.astype(np.float32, copy=False)
  784. out = out.astype(np.float32, copy=False)
  785. notch = self._get_notch_coeff(sample_rate)
  786. if notch is not None:
  787. b, a = notch
  788. zi = self._notch_state.get(state_key)
  789. expected = max(len(a), len(b)) - 1
  790. if zi is None or zi.size != expected:
  791. zi = np.zeros(expected, dtype=np.float32)
  792. out, zi_new = signal.lfilter(b, a, out, zi=zi)
  793. self._notch_state[state_key] = zi_new.astype(np.float32, copy=False)
  794. out = out.astype(np.float32, copy=False)
  795. return out
  796. def _get_hpf_sos(self, sample_rate: int) -> np.ndarray | None:
  797. cached = self._hpf_sos_cache.get(sample_rate, "__missing__")
  798. if isinstance(cached, np.ndarray):
  799. return cached
  800. if cached is None:
  801. return None
  802. cutoff = min(self.HUM_HPF_CUTOFF_HZ, 0.45 * sample_rate)
  803. if cutoff < 20.0:
  804. self._hpf_sos_cache[sample_rate] = None
  805. return None
  806. try:
  807. sos = signal.butter(2, cutoff, btype="highpass", fs=sample_rate, output="sos")
  808. except ValueError:
  809. self._hpf_sos_cache[sample_rate] = None
  810. return None
  811. self._hpf_sos_cache[sample_rate] = sos
  812. return sos
  813. def _get_notch_coeff(self, sample_rate: int) -> tuple[np.ndarray, np.ndarray] | None:
  814. cached = self._notch_cache.get(sample_rate, "__missing__")
  815. if isinstance(cached, tuple):
  816. return cached
  817. if cached is None:
  818. return None
  819. nyquist = 0.5 * sample_rate
  820. if nyquist <= self.HUM_NOTCH_HZ * 1.2:
  821. self._notch_cache[sample_rate] = None
  822. return None
  823. w0 = self.HUM_NOTCH_HZ / nyquist
  824. try:
  825. b, a = signal.iirnotch(w0, self.HUM_NOTCH_Q)
  826. except ValueError:
  827. self._notch_cache[sample_rate] = None
  828. return None
  829. coeff = (b.astype(np.float32), a.astype(np.float32))
  830. self._notch_cache[sample_rate] = coeff
  831. return coeff
  832. @staticmethod
  833. def _apply_limiter(audio: np.ndarray) -> np.ndarray:
  834. if audio.size == 0:
  835. return audio.astype(np.float32, copy=False)
  836. x = np.clip(audio.astype(np.float32, copy=False), -1.0, 1.0)
  837. threshold = 0.82
  838. abs_x = np.abs(x)
  839. if not np.any(abs_x > threshold):
  840. return x.astype(np.float32, copy=False)
  841. out = x.copy()
  842. over = abs_x > threshold
  843. norm = (abs_x[over] - threshold) / max(1.0 - threshold, 1e-6)
  844. compressed = threshold + (1.0 - threshold) * (np.tanh(2.2 * norm) / np.tanh(2.2))
  845. out[over] = np.sign(x[over]) * compressed
  846. return out.astype(np.float32, copy=False)
  847. @staticmethod
  848. def _downsample_for_ui(audio: np.ndarray, target_points: int = 320) -> np.ndarray:
  849. if audio.size <= target_points:
  850. return audio.astype(np.float32, copy=False)
  851. step = int(np.ceil(audio.size / target_points))
  852. sampled = audio[::step]
  853. if sampled.size > target_points:
  854. sampled = sampled[:target_points]
  855. return sampled.astype(np.float32, copy=False)
  856. @staticmethod
  857. def _rms(audio: np.ndarray) -> float:
  858. if audio.size == 0:
  859. return 0.0
  860. return float(np.sqrt(np.mean(np.square(audio), dtype=np.float64)))
  861. @staticmethod
  862. def _resample_audio(audio: np.ndarray, source_rate: int, target_rate: int) -> np.ndarray:
  863. if audio.size == 0 or source_rate == target_rate:
  864. return audio.astype(np.float32, copy=False)
  865. if source_rate == 48000 and target_rate == 16000:
  866. usable = audio.size - (audio.size % 3)
  867. if usable <= 0:
  868. return audio.astype(np.float32, copy=False)
  869. # Fast decimation-by-3 tuned for speech workloads on Zero 2W.
  870. grouped = audio[:usable].reshape(-1, 3)
  871. return grouped.mean(axis=1, dtype=np.float32).astype(np.float32, copy=False)
  872. gcd = math.gcd(source_rate, target_rate)
  873. up = target_rate // gcd
  874. down = source_rate // gcd
  875. resampled = signal.resample_poly(audio, up=up, down=down)
  876. return resampled.astype(np.float32, copy=False)
  def _resolve_input_device(self) -> tuple[int | None, str, int]:
      """Pick the capture device and return ``(index, name, channels)``.

      Selection order:
        1. The sounddevice default input, if it reports at least one input
           channel.
        2. The first device with at least ``self.CHANNELS`` inputs whose
           name contains one of ``"google"``, ``"voicehat"``, ``"i2s"`` or
           ``"mic"`` (case-insensitive).
        3. The first device with at least ``self.CHANNELS`` inputs.
        4. The first device with any input channel.

      ``channels`` is ``self.CHANNELS`` capped at what the chosen device
      offers.

      Raises:
          RuntimeError: if no device exposes an input channel.
      """
      try:
          default_input = sd.default.device[0]
          if isinstance(default_input, int) and default_input >= 0:
              info = sd.query_devices(default_input, "input")
              max_inputs = int(info.get("max_input_channels", 0))
              if max_inputs > 0:
                  return (
                      int(default_input),
                      str(info.get("name", f"device-{default_input}")),
                      min(self.CHANNELS, max_inputs),
                  )
      except Exception:
          # Probing the default device is best-effort: any failure here just
          # means we fall through to scanning every device below.
          pass

      name_tags = ("google", "voicehat", "i2s", "mic")
      preferred: tuple[int, dict] | None = None
      full_width: tuple[int, dict] | None = None  # first device with enough channels
      any_input: tuple[int, dict] | None = None   # first device with any input at all
      for idx, raw_info in enumerate(sd.query_devices()):
          info = dict(raw_info)
          max_inputs = int(info.get("max_input_channels", 0))
          if max_inputs <= 0:
              continue
          candidate = (idx, info)
          if any_input is None:
              any_input = candidate
          if max_inputs < self.CHANNELS:
              continue
          name = str(info.get("name", "")).lower()
          if any(tag in name for tag in name_tags):
              preferred = candidate
              break
          if full_width is None:
              full_width = candidate

      # A narrow device is used only when nothing offers self.CHANNELS inputs.
      # Previously the first input device always became the fallback, even
      # when a later device had enough channels.
      chosen = preferred or full_width or any_input
      if chosen is None:
          raise RuntimeError("No audio input device available")

      idx, info = chosen
      max_inputs = int(info.get("max_input_channels", 1))
      channels = min(self.CHANNELS, max_inputs)
      return int(idx), str(info.get("name", f"device-{idx}")), channels
  911. def _push_monitor_chunk_locked(self, chunk: np.ndarray, sample_rate: int) -> None:
  912. samples = chunk.astype(np.float32, copy=True)
  913. if samples.size == 0:
  914. return
  915. self._monitor_queue.append(samples)
  916. self._monitor_queue_samples += samples.size
  917. max_samples = max(sample_rate * 2, 2048)
  918. while self._monitor_queue_samples > max_samples and self._monitor_queue:
  919. dropped = self._monitor_queue.popleft()
  920. self._monitor_queue_samples -= dropped.size
  921. def _pop_monitor_chunk(self, max_samples: int) -> np.ndarray | None:
  922. if self._monitor_queue_samples <= 0 or not self._monitor_queue:
  923. return None
  924. take: list[np.ndarray] = []
  925. collected = 0
  926. while self._monitor_queue and collected < max_samples:
  927. chunk = self._monitor_queue[0]
  928. remaining = max_samples - collected
  929. if chunk.size <= remaining:
  930. take.append(chunk)
  931. collected += chunk.size
  932. self._monitor_queue.popleft()
  933. else:
  934. take.append(chunk[:remaining])
  935. self._monitor_queue[0] = chunk[remaining:]
  936. collected += remaining
  937. break
  938. self._monitor_queue_samples -= collected
  939. if not take:
  940. return None
  941. if len(take) == 1:
  942. return take[0]
  943. return np.concatenate(take).astype(np.float32, copy=False)
  944. @staticmethod
  945. def _encode_pcm16_base64(audio: np.ndarray) -> str:
  946. pcm16 = (np.clip(audio, -1.0, 1.0) * 32767.0).astype(np.int16)
  947. return base64.b64encode(pcm16.tobytes()).decode("ascii")
  948. def _clear_monitor_queue_locked(self) -> None:
  949. self._monitor_queue.clear()
  950. self._monitor_queue_samples = 0
  951. def _estimate_speech_presence_fast(self, mic1: np.ndarray, mic2: np.ndarray) -> bool:
  952. if mic1.size == 0 or mic2.size == 0:
  953. return False
  954. energy = 0.5 * (np.mean(mic1 * mic1) + np.mean(mic2 * mic2))
  955. energy = float(max(energy, 1e-10))
  956. if energy < self._vad_noise_floor:
  957. alpha = 0.05
  958. else:
  959. alpha = 0.004
  960. self._vad_noise_floor = (1.0 - alpha) * self._vad_noise_floor + alpha * energy
  961. threshold = max(2.2e-7, self._vad_noise_floor * 1.9)
  962. return bool(energy > threshold)
  963. @staticmethod
  964. def _gcc_phat(sig: np.ndarray, refsig: np.ndarray, sample_rate: int, max_tau: float) -> float:
  965. n = sig.size + refsig.size
  966. sig_fft = np.fft.rfft(sig, n=n)
  967. ref_fft = np.fft.rfft(refsig, n=n)
  968. cross = sig_fft * np.conj(ref_fft)
  969. denom = np.abs(cross)
  970. cross = cross / np.maximum(denom, 1e-10)
  971. cc = np.fft.irfft(cross, n=n)
  972. max_shift = int(min(n // 2, max_tau * sample_rate))
  973. if max_shift <= 0:
  974. return 0.0
  975. cc_window = np.concatenate((cc[-max_shift:], cc[: max_shift + 1]))
  976. shift = int(np.argmax(np.abs(cc_window)) - max_shift)
  977. return float(shift) / float(sample_rate)
  978. def _estimate_speech_angle(self, mic1: np.ndarray, mic2: np.ndarray, sample_rate: int) -> tuple[float, bool]:
  979. if mic1.size < 64 or mic2.size < 64:
  980. return self._auto_angle_deg, False
  981. high = min(3400.0, 0.45 * sample_rate)
  982. low = min(300.0, high * 0.5)
  983. if high <= low + 1.0:
  984. return self._auto_angle_deg, False
  985. sos = self._get_speech_sos(sample_rate, low, high)
  986. if sos is None:
  987. return self._auto_angle_deg, False
  988. speech1 = signal.sosfilt(sos, mic1).astype(np.float32, copy=False)
  989. speech2 = signal.sosfilt(sos, mic2).astype(np.float32, copy=False)
  990. speech_energy = 0.5 * (np.mean(speech1 * speech1) + np.mean(speech2 * speech2))
  991. full_energy = 0.5 * (np.mean(mic1 * mic1) + np.mean(mic2 * mic2))
  992. speech_ratio = float(speech_energy / max(full_energy, 1e-12))
  993. self._noise_floor = 0.995 * self._noise_floor + 0.005 * float(speech_energy)
  994. speech_threshold = max(2.5e-7, self._noise_floor * 2.0)
  995. speech_detected = bool(speech_energy > speech_threshold and speech_ratio > 0.08)
  996. if not speech_detected:
  997. return self._auto_angle_deg, False
  998. max_tau = self.MIC_SPACING / 343.0
  999. tau = self._gcc_phat(speech1, speech2, sample_rate, max_tau=max_tau)
  1000. sin_theta = np.clip((tau * 343.0) / self.MIC_SPACING, -1.0, 1.0)
  1001. raw_angle = float(np.rad2deg(np.arcsin(sin_theta)))
  1002. raw_angle = float(np.clip(raw_angle, -90.0, 90.0))
  1003. self._auto_angle_deg = 0.88 * self._auto_angle_deg + 0.12 * raw_angle
  1004. return self._auto_angle_deg, True
  1005. def _get_speech_sos(self, sample_rate: int, low: float, high: float) -> np.ndarray | None:
  1006. cached = self._speech_sos_cache.get(sample_rate, "__missing__")
  1007. if isinstance(cached, np.ndarray):
  1008. return cached
  1009. if cached is None:
  1010. return None
  1011. try:
  1012. sos = signal.butter(4, [low, high], btype="bandpass", fs=sample_rate, output="sos")
  1013. except ValueError:
  1014. self._speech_sos_cache[sample_rate] = None
  1015. return None
  1016. self._speech_sos_cache[sample_rate] = sos
  1017. return sos
  1018. @staticmethod
  1019. def _make_empty_frame() -> dict[str, object]:
  1020. empty = np.empty(0, dtype=np.float32)
  1021. return {
  1022. "mic1": empty,
  1023. "mic2": empty,
  1024. "beam": empty,
  1025. "mono_mix": empty,
  1026. "show_mic2": True,
  1027. "show_beam": False,
  1028. "show_mono_mix": False,
  1029. "beam_angle_deg": 0.0,
  1030. "auto_beam": True,
  1031. "speech_detected": False,
  1032. "speech_gate_open": True,
  1033. "hifi_mode": False,
  1034. "monitor_on": False,
  1035. "monitor_source": "beam",
  1036. "recording": False,
  1037. "rec_duration": 0.0,
  1038. }