audio_capture.py 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168
  1. from __future__ import annotations
  2. from dataclasses import asdict, dataclass
  3. from datetime import datetime
  4. import base64
  5. from collections import deque
  6. from concurrent.futures import Future, ProcessPoolExecutor
  7. import math
  8. import os
  9. from pathlib import Path
  10. import threading
  11. import time
  12. import numpy as np
  13. from scipy import signal
  14. import sounddevice as sd
  15. from agc import AgcProcessor
  16. from beamforming import beamform_delay_and_sum
  17. from recorder import RecorderStatus, WavRecorder
  18. def _gcc_phat_job(sig: np.ndarray, refsig: np.ndarray, sample_rate: int, max_tau: float) -> float:
  19. n = sig.size + refsig.size
  20. sig_fft = np.fft.rfft(sig, n=n)
  21. ref_fft = np.fft.rfft(refsig, n=n)
  22. cross = sig_fft * np.conj(ref_fft)
  23. denom = np.abs(cross)
  24. cross = cross / np.maximum(denom, 1e-10)
  25. cc = np.fft.irfft(cross, n=n)
  26. max_shift = int(min(n // 2, max_tau * sample_rate))
  27. if max_shift <= 0:
  28. return 0.0
  29. cc_window = np.concatenate((cc[-max_shift:], cc[: max_shift + 1]))
  30. shift = int(np.argmax(np.abs(cc_window)) - max_shift)
  31. return float(shift) / float(sample_rate)
  32. def _estimate_speech_angle_job(
  33. mic1: np.ndarray,
  34. mic2: np.ndarray,
  35. sample_rate: int,
  36. mic_spacing: float,
  37. prev_angle_deg: float,
  38. prev_noise_floor: float,
  39. ) -> tuple[float, bool, float]:
  40. if mic1.size < 64 or mic2.size < 64:
  41. return float(prev_angle_deg), False, float(prev_noise_floor)
  42. high = min(3400.0, 0.45 * sample_rate)
  43. low = min(300.0, high * 0.5)
  44. if high <= low + 1.0:
  45. return float(prev_angle_deg), False, float(prev_noise_floor)
  46. try:
  47. sos = signal.butter(4, [low, high], btype="bandpass", fs=sample_rate, output="sos")
  48. except ValueError:
  49. return float(prev_angle_deg), False, float(prev_noise_floor)
  50. speech1 = signal.sosfilt(sos, mic1).astype(np.float32, copy=False)
  51. speech2 = signal.sosfilt(sos, mic2).astype(np.float32, copy=False)
  52. speech_energy = 0.5 * (np.mean(speech1 * speech1) + np.mean(speech2 * speech2))
  53. full_energy = 0.5 * (np.mean(mic1 * mic1) + np.mean(mic2 * mic2))
  54. speech_ratio = float(speech_energy / max(full_energy, 1e-12))
  55. noise_floor = 0.995 * float(prev_noise_floor) + 0.005 * float(speech_energy)
  56. speech_threshold = max(2.5e-7, noise_floor * 2.0)
  57. speech_detected = bool(speech_energy > speech_threshold and speech_ratio > 0.08)
  58. if not speech_detected:
  59. return float(prev_angle_deg), False, float(noise_floor)
  60. max_tau = mic_spacing / 343.0
  61. tau = _gcc_phat_job(speech1, speech2, sample_rate, max_tau=max_tau)
  62. sin_theta = np.clip((tau * 343.0) / max(mic_spacing, 1e-6), -1.0, 1.0)
  63. raw_angle = float(np.rad2deg(np.arcsin(sin_theta)))
  64. raw_angle = float(np.clip(raw_angle, -90.0, 90.0))
  65. angle = 0.88 * float(prev_angle_deg) + 0.12 * raw_angle
  66. return float(angle), True, float(noise_floor)
  67. ALLOWED_SAMPLE_RATES = {16000, 22050, 24000, 32000, 48000}
  68. ALLOWED_MODES = {"mic1", "mic2", "mono_mix", "beamforming"}
  69. ALLOWED_SOURCES = {"mic1", "mic2", "mono_mix", "beam", "compare_all", "hifi_raw"}
  70. ALLOWED_MONITOR_SOURCES = {"mic1", "mic2", "mono_mix", "beam"}
  71. ALLOWED_HIFI_MICS = {"mic1", "mic2"}
@dataclass
class AudioSettings:
    """User-adjustable processing settings held by ``AudioEngine``.

    The values below are the defaults. ``AudioEngine.update_settings()``
    validates and clamps every field before a new instance replaces the
    current one.
    """

    # Processing/display mode; update_settings() only accepts ALLOWED_MODES.
    mode: str = "beamforming"
    # Input gain in dB; clamped to 0..30 by update_settings().
    gain_db: float = 0.0
    # Enables the per-channel AGC processors.
    agc: bool = True
    # AGC attack time in ms; clamped to 1..50 by update_settings().
    attack_ms: float = 6.0
    # AGC release time in ms; clamped to 50..1000 by update_settings().
    release_ms: float = 280.0
    # Enables the noise-suppression stage.
    noise_suppression: bool = True
    # Enables the speech gate (gain reduction when no speech is detected).
    speech_gate: bool = False
    # Enables the hum filter stage.
    hum_filter: bool = True
    # Enables the output limiter.
    limiter: bool = True
    # Enables blending of the mono mix into the beam output.
    beam_clarity: bool = True
    # Hi-fi mode: bypasses the processing chain and runs at the hardware rate.
    hifi_mode: bool = False
    # Microphone used in hi-fi mode; update_settings() only accepts ALLOWED_HIFI_MICS.
    hifi_mic: str = "mic1"
    # Manual beam angle in degrees; clamped to -90..90 by update_settings().
    angle: float = 0.0
    # When True the beam angle is taken from the automatic estimator.
    auto_beam: bool = True
    # Enables the monitor audio queue; update_settings() forces it off in hi-fi mode.
    monitor_on: bool = False
    # Signal fed to the monitor; update_settings() only accepts ALLOWED_MONITOR_SOURCES.
    monitor_source: str = "beam"
    # Processing sample rate in Hz; update_settings() only accepts ALLOWED_SAMPLE_RATES.
    sample_rate: int = 16000
class AudioEngine:
    """Capture engine for a two-microphone input device.

    Opens a ``sounddevice`` input stream, processes every callback block
    (gain, beamforming, AGC, filters, gate, limiter), keeps the most recent
    processed frame for UI polling, and feeds ``WavRecorder`` instances while
    a recording is active.
    """

    # Nominal number of microphones; reported as "mic_count" by get_status().
    CHANNELS = 2
    # NOTE(review): not referenced in the visible part of the file; the
    # callback reads int32 blocks and rescales them as 24-bit values.
    BIT_DEPTH = 32
    # Block size (frames) requested from the input stream.
    CHUNK_SIZE = 2048
    # Sample rate (Hz) at which the input stream is opened.
    HARDWARE_SAMPLE_RATE = 48000
    # Geometry of the microphone ring; presumably metres - confirm.
    RING_DIAMETER_M = 0.06
    RING_SLOTS = 4
    SLOT_STEP = 1  # neighboring slots (90 degrees). Use 2 for opposite mics.
    # Chord length between two ring slots that are SLOT_STEP apart.
    MIC_SPACING = RING_DIAMETER_M * math.sin(math.pi * SLOT_STEP / RING_SLOTS)
    # Callback blocks are discarded for this long after the stream starts.
    STARTUP_IGNORE_SECONDS = 0.30
    # Speech-gate tuning (hold time is converted to chunks in _update_speech_gate;
    # the remaining values are presumably used further down the file - confirm).
    SPEECH_GATE_HOLD_SECONDS = 0.85
    SPEECH_GATE_FLOOR = 0.55
    SPEECH_GATE_ATTACK_SECONDS = 0.012
    SPEECH_GATE_RELEASE_SECONDS = 0.360
    # Noise-suppression tuning for the gate-open / gate-closed cases
    # (consumers are outside the visible part of the file).
    NOISE_SUPPRESS_OPEN_FLOOR = 0.72
    NOISE_SUPPRESS_CLOSED_FLOOR = 0.40
    NOISE_SUPPRESS_OPEN_STRENGTH = 0.30
    NOISE_SUPPRESS_CLOSED_STRENGTH = 0.55
    # Hum-filter tuning: high-pass cutoff and notch centre/Q
    # (50 Hz is presumably the mains frequency - confirm for the target region).
    HUM_HPF_CUTOFF_HZ = 75.0
    HUM_NOTCH_HZ = 50.0
    HUM_NOTCH_Q = 22.0
    # _apply_beam_clarity_blend(): share of the mono mix blended into the beam
    # (clipped to 0..0.5 there) and weight of the first-difference boost.
    BEAM_CLARITY_BLEND = 0.22
    BEAM_PRESENCE_BOOST = 0.20
  114. def __init__(self, recordings_dir: Path) -> None:
  115. self._lock = threading.Lock()
  116. self._stream: sd.InputStream | None = None
  117. self._stream_channels = self.CHANNELS
  118. self._input_device_name = "unknown"
  119. self._settings = AudioSettings()
  120. self._running = False
  121. self._startup_deadline = 0.0
  122. self._auto_angle_deg = 0.0
  123. self._noise_floor = 1e-7
  124. self._vad_noise_floor = 1e-7
  125. self._speech_sos_cache: dict[int, np.ndarray | None] = {}
  126. self._hpf_sos_cache: dict[int, np.ndarray | None] = {}
  127. self._notch_cache: dict[int, tuple[np.ndarray, np.ndarray] | None] = {}
  128. self._hpf_state: dict[tuple[str, int], np.ndarray] = {}
  129. self._notch_state: dict[tuple[str, int], np.ndarray] = {}
  130. self._last_speech_detected = False
  131. self._angle_update_counter = 0
  132. self._angle_update_interval = 4
  133. cpu_total = max(1, int(os.cpu_count() or 1))
  134. self._cpu_workers = max(1, min(4, cpu_total))
  135. self._angle_workers = max(1, min(3, self._cpu_workers - 1))
  136. self._max_pending_angle_jobs = max(1, min(2, self._angle_workers))
  137. self._angle_executor: ProcessPoolExecutor | None = ProcessPoolExecutor(max_workers=self._angle_workers)
  138. self._angle_futures: deque[Future] = deque()
  139. self._speech_gate_hold_chunks = 0
  140. self._speech_gate_gain = 1.0
  141. self._ns_noise_power = {
  142. "mic1": 1e-7,
  143. "mic2": 1e-7,
  144. "mono_mix": 1e-7,
  145. "beam": 1e-7,
  146. }
  147. self._presence_prev = {"beam": 0.0}
  148. self._recordings_dir = Path(recordings_dir)
  149. self._recorder = WavRecorder(recordings_dir)
  150. self._compare_recorders: dict[str, WavRecorder] = {}
  151. self._compare_filenames: dict[str, str] = {}
  152. self._record_duration_limit_sec: float | None = None
  153. self._auto_stop_requested = False
  154. self._monitor_queue: deque[np.ndarray] = deque()
  155. self._monitor_queue_samples = 0
  156. self._agc_mic1 = AgcProcessor(self._settings.sample_rate)
  157. self._agc_mic2 = AgcProcessor(self._settings.sample_rate)
  158. self._agc_beam = AgcProcessor(self._settings.sample_rate)
  159. self._latest_frame: dict[str, object] = self._make_empty_frame()
  160. def start(self) -> None:
  161. with self._lock:
  162. if self._running:
  163. return
  164. self._open_stream()
  165. def stop(self) -> None:
  166. with self._lock:
  167. stream = self._stream
  168. self._stream = None
  169. self._running = False
  170. angle_executor = self._angle_executor
  171. self._angle_executor = None
  172. angle_futures = list(self._angle_futures)
  173. self._angle_futures.clear()
  174. if stream is not None:
  175. stream.stop()
  176. stream.close()
  177. self.stop_recording()
  178. for fut in angle_futures:
  179. fut.cancel()
  180. if angle_executor is not None:
  181. angle_executor.shutdown(wait=False, cancel_futures=True)
  182. def is_running(self) -> bool:
  183. with self._lock:
  184. return self._running
  185. def get_settings(self) -> dict[str, object]:
  186. with self._lock:
  187. return asdict(self._settings)
  188. def get_status(self) -> dict[str, object]:
  189. rec_status = self._current_recording_status()
  190. return {
  191. "recording": rec_status.recording,
  192. "mic_count": self.CHANNELS,
  193. "hardware_sample_rate": self.HARDWARE_SAMPLE_RATE,
  194. "mic_spacing_m": self.MIC_SPACING,
  195. "auto_beam_angle_deg": self._auto_angle_deg,
  196. "input_device": self._input_device_name,
  197. "settings": self.get_settings(),
  198. "recording_status": asdict(rec_status),
  199. "audio_running": self.is_running(),
  200. }
  201. def get_latest_packet(self) -> dict[str, object]:
  202. empty = np.empty(0, dtype=np.float32)
  203. with self._lock:
  204. frame = dict(self._latest_frame)
  205. mic1 = np.asarray(frame.get("mic1", empty), dtype=np.float32)
  206. mic2 = np.asarray(frame.get("mic2", empty), dtype=np.float32)
  207. beam = np.asarray(frame.get("beam", empty), dtype=np.float32)
  208. mono_mix = np.asarray(frame.get("mono_mix", empty), dtype=np.float32)
  209. show_mic2 = bool(frame.get("show_mic2", True))
  210. show_beam = bool(frame.get("show_beam", False))
  211. show_mono_mix = bool(frame.get("show_mono_mix", False))
  212. beam_angle_deg = float(frame.get("beam_angle_deg", 0.0))
  213. auto_beam = bool(frame.get("auto_beam", True))
  214. speech_detected = bool(frame.get("speech_detected", False))
  215. speech_gate_open = bool(frame.get("speech_gate_open", True))
  216. hifi_mode = bool(frame.get("hifi_mode", False))
  217. monitor_on = bool(frame.get("monitor_on", False))
  218. monitor_source = str(frame.get("monitor_source", "beam"))
  219. recording = bool(frame.get("recording", False))
  220. rec_duration = float(frame.get("rec_duration", 0.0))
  221. monitor_rate = int(self.HARDWARE_SAMPLE_RATE if hifi_mode else self._settings.sample_rate)
  222. monitor_chunk = None
  223. if monitor_on:
  224. monitor_chunk = self._pop_monitor_chunk(max_samples=max(512, int(monitor_rate * 0.08)))
  225. packet = {
  226. "type": "audio_data",
  227. "mic1": self._downsample_for_ui(mic1).tolist(),
  228. "mic2": self._downsample_for_ui(mic2).tolist() if show_mic2 else [],
  229. "beam": self._downsample_for_ui(beam).tolist() if show_beam else [],
  230. "mono_mix": self._downsample_for_ui(mono_mix).tolist() if show_mono_mix else [],
  231. "rms_mic1": self._rms(mic1),
  232. "rms_mic2": self._rms(mic2) if show_mic2 else 0.0,
  233. "rms_beam": self._rms(beam) if show_beam else 0.0,
  234. "rms_mono_mix": self._rms(mono_mix) if show_mono_mix else 0.0,
  235. "beam_angle_deg": beam_angle_deg,
  236. "auto_beam": auto_beam,
  237. "speech_detected": speech_detected,
  238. "speech_gate_open": speech_gate_open,
  239. "hifi_mode": hifi_mode,
  240. "monitor_on": monitor_on,
  241. "monitor_source": monitor_source,
  242. "monitor_sr": monitor_rate,
  243. "monitor_chunk_b64": "",
  244. "recording": recording,
  245. "rec_duration": rec_duration,
  246. }
  247. if monitor_chunk is not None and monitor_chunk.size > 0:
  248. packet["monitor_chunk_b64"] = self._encode_pcm16_base64(monitor_chunk)
  249. return packet
  250. def update_settings(self, updates: dict[str, object]) -> dict[str, object]:
  251. with self._lock:
  252. current = self._settings
  253. mode = str(updates.get("mode", current.mode))
  254. if mode not in ALLOWED_MODES:
  255. mode = current.mode
  256. sample_rate = int(updates.get("sample_rate", current.sample_rate))
  257. if sample_rate not in ALLOWED_SAMPLE_RATES:
  258. sample_rate = current.sample_rate
  259. gain_db = float(updates.get("gain_db", current.gain_db))
  260. gain_db = float(np.clip(gain_db, 0.0, 30.0))
  261. agc = bool(updates.get("agc", current.agc))
  262. attack_ms = float(updates.get("attack_ms", current.attack_ms))
  263. attack_ms = float(np.clip(attack_ms, 1.0, 50.0))
  264. release_ms = float(updates.get("release_ms", current.release_ms))
  265. release_ms = float(np.clip(release_ms, 50.0, 1000.0))
  266. noise_suppression = bool(updates.get("noise_suppression", current.noise_suppression))
  267. speech_gate = bool(updates.get("speech_gate", current.speech_gate))
  268. hum_filter = bool(updates.get("hum_filter", current.hum_filter))
  269. limiter = bool(updates.get("limiter", current.limiter))
  270. beam_clarity = bool(updates.get("beam_clarity", current.beam_clarity))
  271. hifi_mode = bool(updates.get("hifi_mode", current.hifi_mode))
  272. hifi_mic = str(updates.get("hifi_mic", current.hifi_mic))
  273. if hifi_mic not in ALLOWED_HIFI_MICS:
  274. hifi_mic = current.hifi_mic
  275. angle = float(updates.get("angle", current.angle))
  276. angle = float(np.clip(angle, -90.0, 90.0))
  277. auto_beam = bool(updates.get("auto_beam", current.auto_beam))
  278. monitor_on = bool(updates.get("monitor_on", current.monitor_on))
  279. monitor_source = str(updates.get("monitor_source", current.monitor_source))
  280. if monitor_source not in ALLOWED_MONITOR_SOURCES:
  281. monitor_source = current.monitor_source
  282. if hifi_mode:
  283. monitor_on = False
  284. self._settings = AudioSettings(
  285. mode=mode,
  286. gain_db=gain_db,
  287. agc=agc,
  288. attack_ms=attack_ms,
  289. release_ms=release_ms,
  290. noise_suppression=noise_suppression,
  291. speech_gate=speech_gate,
  292. hum_filter=hum_filter,
  293. limiter=limiter,
  294. beam_clarity=beam_clarity,
  295. hifi_mode=hifi_mode,
  296. hifi_mic=hifi_mic,
  297. angle=angle,
  298. auto_beam=auto_beam,
  299. monitor_on=monitor_on,
  300. monitor_source=monitor_source,
  301. sample_rate=sample_rate,
  302. )
  303. if not auto_beam:
  304. self._auto_angle_deg = angle
  305. if not monitor_on or hifi_mode:
  306. self._clear_monitor_queue_locked()
  307. self._agc_mic1.update(
  308. sample_rate=sample_rate,
  309. attack_ms=attack_ms,
  310. release_ms=release_ms,
  311. )
  312. self._agc_mic2.update(
  313. sample_rate=sample_rate,
  314. attack_ms=attack_ms,
  315. release_ms=release_ms,
  316. )
  317. self._agc_beam.update(
  318. sample_rate=sample_rate,
  319. attack_ms=attack_ms,
  320. release_ms=release_ms,
  321. )
  322. return self.get_settings()
  323. def start_recording(self, source: str, duration_sec: float | None = None) -> dict[str, object]:
  324. source = source.lower().strip()
  325. if source not in ALLOWED_SOURCES:
  326. raise ValueError("Invalid recording source")
  327. if self._current_recording_status().recording:
  328. raise RuntimeError("Recording is already active")
  329. limit = None
  330. if duration_sec is not None:
  331. try:
  332. duration_value = float(duration_sec)
  333. except (TypeError, ValueError):
  334. duration_value = 0.0
  335. if duration_value > 0.0:
  336. limit = float(np.clip(duration_value, 1.0, 3600.0))
  337. with self._lock:
  338. self._record_duration_limit_sec = limit
  339. self._auto_stop_requested = False
  340. settings = self.get_settings()
  341. sample_rate = int(settings["sample_rate"])
  342. hifi_mode = bool(settings.get("hifi_mode", False))
  343. hifi_mic = str(settings.get("hifi_mic", "mic1"))
  344. if hifi_mic not in ALLOWED_HIFI_MICS:
  345. hifi_mic = "mic1"
  346. if hifi_mode:
  347. timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
  348. filename = self._recorder.start(
  349. source="hifi_raw",
  350. sample_rate=self.HARDWARE_SAMPLE_RATE,
  351. channels=1,
  352. filename=f"rec_{timestamp}_hifi_{hifi_mic}_48k.wav",
  353. )
  354. with self._lock:
  355. self._compare_recorders = {}
  356. self._compare_filenames = {}
  357. return {
  358. "source": "hifi_raw",
  359. "filenames": [filename],
  360. "duration_limit_sec": limit or 0.0,
  361. }
  362. if source == "hifi_raw":
  363. source = hifi_mic
  364. if source == "compare_all":
  365. timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
  366. recorders = {
  367. "mic1": WavRecorder(self._recordings_dir),
  368. "mono_mix": WavRecorder(self._recordings_dir),
  369. "beam": WavRecorder(self._recordings_dir),
  370. }
  371. filenames = {
  372. "mic1": f"rec_{timestamp}_mic1.wav",
  373. "mono_mix": f"rec_{timestamp}_mono_mix.wav",
  374. "beam": f"rec_{timestamp}_beam.wav",
  375. }
  376. for key, recorder in recorders.items():
  377. recorder.start(
  378. source=key,
  379. sample_rate=sample_rate,
  380. channels=1,
  381. filename=filenames[key],
  382. )
  383. with self._lock:
  384. self._compare_recorders = recorders
  385. self._compare_filenames = filenames
  386. return {
  387. "source": source,
  388. "filenames": [filenames["mic1"], filenames["mono_mix"], filenames["beam"]],
  389. "duration_limit_sec": limit or 0.0,
  390. }
  391. channels = 1
  392. filename = self._recorder.start(
  393. source=source,
  394. sample_rate=sample_rate,
  395. channels=channels,
  396. )
  397. with self._lock:
  398. self._compare_recorders = {}
  399. self._compare_filenames = {}
  400. return {
  401. "source": source,
  402. "filenames": [filename],
  403. "duration_limit_sec": limit or 0.0,
  404. }
  405. def stop_recording(self) -> RecorderStatus:
  406. with self._lock:
  407. compare_recorders = self._compare_recorders
  408. compare_filenames = dict(self._compare_filenames)
  409. self._compare_recorders = {}
  410. self._compare_filenames = {}
  411. self._record_duration_limit_sec = None
  412. self._auto_stop_requested = False
  413. if compare_recorders:
  414. max_duration = 0.0
  415. sample_rate = self._settings.sample_rate
  416. for recorder in compare_recorders.values():
  417. status = recorder.stop()
  418. max_duration = max(max_duration, status.duration_sec)
  419. sample_rate = status.sample_rate
  420. return RecorderStatus(
  421. recording=False,
  422. filename=",".join(compare_filenames.values()),
  423. duration_sec=max_duration,
  424. channels=1,
  425. sample_rate=sample_rate,
  426. source="compare_all",
  427. )
  428. return self._recorder.stop()
  429. def get_recording_status(self) -> RecorderStatus:
  430. return self._current_recording_status()
  431. def _current_recording_status(self) -> RecorderStatus:
  432. with self._lock:
  433. compare_recorders = dict(self._compare_recorders)
  434. compare_filenames = dict(self._compare_filenames)
  435. if compare_recorders:
  436. statuses = [recorder.get_status() for recorder in compare_recorders.values()]
  437. is_recording = any(status.recording for status in statuses)
  438. duration = max((status.duration_sec for status in statuses), default=0.0)
  439. sample_rate = statuses[0].sample_rate if statuses else self._settings.sample_rate
  440. return RecorderStatus(
  441. recording=is_recording,
  442. filename=",".join(compare_filenames.values()),
  443. duration_sec=duration,
  444. channels=1,
  445. sample_rate=sample_rate,
  446. source="compare_all",
  447. )
  448. return self._recorder.get_status()
  449. def _open_stream(self) -> None:
  450. device_idx, device_name, input_channels = self._resolve_input_device()
  451. stream = sd.InputStream(
  452. samplerate=self.HARDWARE_SAMPLE_RATE,
  453. device=device_idx,
  454. channels=input_channels,
  455. dtype="int32",
  456. blocksize=self.CHUNK_SIZE,
  457. callback=self._audio_callback,
  458. )
  459. stream.start()
  460. with self._lock:
  461. self._stream = stream
  462. self._stream_channels = input_channels
  463. self._input_device_name = device_name
  464. self._running = True
  465. self._startup_deadline = time.monotonic() + self.STARTUP_IGNORE_SECONDS
  466. def _restart_stream(self) -> None:
  467. with self._lock:
  468. stream = self._stream
  469. self._stream = None
  470. self._running = False
  471. if stream is not None:
  472. stream.stop()
  473. stream.close()
  474. self._open_stream()
    def _audio_callback(self, indata: np.ndarray, frames: int, time_info: dict, status: sd.CallbackFlags) -> None:
        """Process one block delivered by the input stream.

        Converts the block to float, optionally resamples it, runs either the
        hi-fi pass-through or the full processing chain (gain, automatic or
        manual beam steering, beamforming, AGC, hum filter, noise suppression,
        speech gate, limiter), feeds the active recording and the monitor
        queue, and publishes the result as the latest frame.

        Args:
            indata: Block of input samples, indexed as (frame, channel).
            frames: Unused.
            time_info: Unused.
            status: Stream status flags; a truthy value discards the block.
        """
        del frames, time_info
        # Any reported stream condition causes the whole block to be dropped.
        if status:
            return
        with self._lock:
            if not self._running:
                return
            # Skip audio that arrives right after the stream was opened.
            if time.monotonic() < self._startup_deadline:
                return
            settings = self._settings
            stream_channels = self._stream_channels
        # Drop the low 8 bits and scale by 2**23, i.e. treat samples as 24-bit
        # values (presumably 24-bit audio left-justified in 32-bit words - confirm).
        pcm32 = indata.astype(np.int32, copy=False)
        pcm24 = pcm32 >> 8
        normalized = np.clip(pcm24.astype(np.float32) / 8388608.0, -1.0, 1.0)
        mic1_raw = normalized[:, 0]
        # A mono device is handled by duplicating channel 0.
        mic2_raw = normalized[:, 1] if stream_channels > 1 else mic1_raw.copy()
        # Hi-fi mode stays at the hardware rate; otherwise work at the configured rate.
        processing_rate = self.HARDWARE_SAMPLE_RATE if settings.hifi_mode else settings.sample_rate
        if (not settings.hifi_mode) and settings.sample_rate != self.HARDWARE_SAMPLE_RATE:
            mic1_raw = self._resample_audio(mic1_raw, self.HARDWARE_SAMPLE_RATE, settings.sample_rate)
            mic2_raw = self._resample_audio(mic2_raw, self.HARDWARE_SAMPLE_RATE, settings.sample_rate)
            # Keep both channels the same length after resampling.
            common_len = min(mic1_raw.size, mic2_raw.size)
            mic1_raw = mic1_raw[:common_len]
            mic2_raw = mic2_raw[:common_len]
        if settings.hifi_mode:
            # Hi-fi: no processing; the selected microphone is used for every output.
            hifi_mic = settings.hifi_mic if settings.hifi_mic in ALLOWED_HIFI_MICS else "mic1"
            hifi_signal = mic1_raw if hifi_mic == "mic1" else mic2_raw
            mic1_proc = hifi_signal
            mic2_proc = mic2_raw if hifi_mic == "mic1" else mic1_raw
            mono_mix_proc = hifi_signal
            beam_proc = hifi_signal
            angle_to_use = 0.0
            speech_active = True
            gate_open = True
        else:
            # Static input gain, then a plain average as the mono reference.
            gain_linear = 10.0 ** (settings.gain_db / 20.0)
            mic1_base = np.clip(mic1_raw * gain_linear, -1.0, 1.0)
            mic2_base = np.clip(mic2_raw * gain_linear, -1.0, 1.0)
            mono_base = np.clip(0.5 * (mic1_base + mic2_base), -1.0, 1.0)
            angle_to_use = settings.angle
            speech_detected = False
            if settings.mode == "beamforming" and settings.auto_beam:
                # Collect finished angle jobs; only the most recent finished
                # one is used, unfinished ones stay queued.
                latest_done: Future | None = None
                pending: deque[Future] = deque()
                while self._angle_futures:
                    fut = self._angle_futures.popleft()
                    if fut.done():
                        latest_done = fut
                    else:
                        pending.append(fut)
                self._angle_futures = pending
                if latest_done is not None:
                    try:
                        angle_est, speech_est, noise_floor = latest_done.result(timeout=0)
                        self._auto_angle_deg = float(angle_est)
                        self._last_speech_detected = bool(speech_est)
                        self._noise_floor = max(float(noise_floor), 1e-9)
                    except Exception:
                        # A failed or cancelled job keeps the previous estimate.
                        pass
                # Submit a new job every _angle_update_interval blocks, as long
                # as the number of jobs in flight stays below the limit.
                self._angle_update_counter += 1
                if (
                    self._angle_update_counter >= self._angle_update_interval
                    and len(self._angle_futures) < self._max_pending_angle_jobs
                    and self._angle_executor is not None
                ):
                    self._angle_update_counter = 0
                    try:
                        fut = self._angle_executor.submit(
                            _estimate_speech_angle_job,
                            mic1_base.astype(np.float32, copy=True),
                            mic2_base.astype(np.float32, copy=True),
                            int(processing_rate),
                            float(self.MIC_SPACING),
                            float(self._auto_angle_deg),
                            float(self._noise_floor),
                        )
                        self._angle_futures.append(fut)
                    except Exception:
                        # Submission is best-effort (e.g. pool already shut down).
                        pass
                angle_to_use = self._auto_angle_deg
                speech_detected = self._last_speech_detected
            else:
                # Auto steering is off: drop outstanding jobs.
                while self._angle_futures:
                    self._angle_futures.popleft().cancel()
                if not settings.auto_beam:
                    self._auto_angle_deg = settings.angle
                speech_detected = False
            # Speech is considered active if either the angle job or the fast
            # in-callback detector reports it.
            speech_presence = self._estimate_speech_presence_fast(mic1_base, mic2_base)
            speech_active = bool(speech_detected or speech_presence)
            beam_proc = beamform_delay_and_sum(
                mic1_base,
                mic2_base,
                angle_deg=angle_to_use,
                sample_rate=processing_rate,
                mic_spacing=self.MIC_SPACING,
            )
            if settings.beam_clarity:
                beam_proc = self._apply_beam_clarity_blend(beam_proc, mono_base)
            gate_open, gate_gain = self._update_speech_gate(
                speech_detected=speech_active,
                sample_rate=processing_rate,
                chunk_len=max(beam_proc.size, 1),
                enabled=settings.speech_gate,
            )
            if settings.agc:
                mic1_proc = self._agc_mic1.process(mic1_base, speech_hint=gate_open)
                mic2_proc = self._agc_mic2.process(mic2_base, speech_hint=gate_open)
                beam_proc = self._agc_beam.process(beam_proc, speech_hint=gate_open)
            else:
                mic1_proc = mic1_base
                mic2_proc = mic2_base
            mono_mix_proc = np.clip(0.5 * (mic1_proc + mic2_proc), -1.0, 1.0)
            if settings.hum_filter:
                mic1_proc = self._apply_hum_filter(mic1_proc, processing_rate, "mic1")
                mic2_proc = self._apply_hum_filter(mic2_proc, processing_rate, "mic2")
                beam_proc = self._apply_hum_filter(beam_proc, processing_rate, "beam")
                # The mono mix is rebuilt from the filtered channels and then
                # filtered again under its own state key.
                mono_mix_proc = np.clip(0.5 * (mic1_proc + mic2_proc), -1.0, 1.0)
                mono_mix_proc = self._apply_hum_filter(mono_mix_proc, processing_rate, "mono_mix")
            if settings.noise_suppression:
                mic1_proc = self._apply_noise_suppression(mic1_proc, "mic1", speech_active=gate_open)
                mic2_proc = self._apply_noise_suppression(mic2_proc, "mic2", speech_active=gate_open)
                mono_mix_proc = self._apply_noise_suppression(mono_mix_proc, "mono_mix", speech_active=gate_open)
                beam_proc = self._apply_noise_suppression(beam_proc, "beam", speech_active=gate_open)
            if settings.speech_gate:
                mic1_proc = np.clip(mic1_proc * gate_gain, -1.0, 1.0).astype(np.float32, copy=False)
                mic2_proc = np.clip(mic2_proc * gate_gain, -1.0, 1.0).astype(np.float32, copy=False)
                mono_mix_proc = np.clip(mono_mix_proc * gate_gain, -1.0, 1.0).astype(np.float32, copy=False)
                beam_proc = np.clip(beam_proc * gate_gain, -1.0, 1.0).astype(np.float32, copy=False)
            # NOTE(review): the nesting of this stage was reconstructed from a
            # flattened listing; confirm the limiter belongs to the non-hi-fi branch.
            if settings.limiter:
                mic1_proc = self._apply_limiter(mic1_proc)
                mic2_proc = self._apply_limiter(mic2_proc)
                mono_mix_proc = self._apply_limiter(mono_mix_proc)
                beam_proc = self._apply_limiter(beam_proc)
        rec_status = self._current_recording_status()
        if rec_status.recording:
            self._write_recording_chunk(rec_status.source, mic1_raw, mic2_raw, mic1_proc, mono_mix_proc, beam_proc)
            # Re-read so the duration includes the chunk just written.
            rec_status = self._current_recording_status()
        should_auto_stop = False
        with self._lock:
            duration_limit = self._record_duration_limit_sec
            if (
                rec_status.recording
                and duration_limit is not None
                and rec_status.duration_sec >= duration_limit
                and not self._auto_stop_requested
            ):
                # The flag makes sure the stop is requested only once.
                self._auto_stop_requested = True
                should_auto_stop = True
        if should_auto_stop:
            # Stop on a separate thread so the callback is not blocked by it.
            threading.Thread(target=self.stop_recording, daemon=True).start()
        show_beam = (settings.mode == "beamforming") and (not settings.hifi_mode)
        show_mono_mix = (settings.mode == "mono_mix") and (not settings.hifi_mode)
        show_mic2 = not settings.hifi_mode
        if settings.monitor_on:
            monitor_map = {
                "mic1": mic1_proc,
                "mic2": mic2_proc,
                "mono_mix": mono_mix_proc,
                "beam": beam_proc,
            }
            # Unknown monitor sources fall back to the beam signal.
            monitor_signal = monitor_map.get(settings.monitor_source, beam_proc)
            with self._lock:
                self._push_monitor_chunk_locked(monitor_signal, processing_rate)
        frame = {
            "mic1": mic1_proc.astype(np.float32, copy=False),
            "mic2": mic2_proc.astype(np.float32, copy=False),
            "beam": beam_proc.astype(np.float32, copy=False),
            "mono_mix": mono_mix_proc.astype(np.float32, copy=False),
            "show_mic2": show_mic2,
            "show_beam": show_beam,
            "show_mono_mix": show_mono_mix,
            "beam_angle_deg": float(angle_to_use),
            "auto_beam": settings.auto_beam,
            "speech_detected": speech_active,
            "speech_gate_open": gate_open,
            "hifi_mode": settings.hifi_mode,
            "monitor_on": settings.monitor_on,
            "monitor_source": settings.monitor_source,
            "recording": rec_status.recording,
            "rec_duration": rec_status.duration_sec,
        }
        # Publish the frame for get_latest_packet().
        with self._lock:
            self._latest_frame = frame
  657. def _write_recording_chunk(
  658. self,
  659. source: str,
  660. mic1_raw: np.ndarray,
  661. mic2_raw: np.ndarray,
  662. mic1_proc: np.ndarray,
  663. mono_mix_proc: np.ndarray,
  664. beam_proc: np.ndarray,
  665. ) -> None:
  666. if source == "compare_all":
  667. with self._lock:
  668. mic1_rec = self._compare_recorders.get("mic1")
  669. mix_rec = self._compare_recorders.get("mono_mix")
  670. beam_rec = self._compare_recorders.get("beam")
  671. if mic1_rec is not None:
  672. mic1_rec.write(mic1_proc)
  673. if mix_rec is not None:
  674. mix_rec.write(mono_mix_proc)
  675. if beam_rec is not None:
  676. beam_rec.write(beam_proc)
  677. return
  678. if source == "mic1":
  679. self._recorder.write(mic1_raw)
  680. return
  681. if source == "mic2":
  682. self._recorder.write(mic2_raw)
  683. return
  684. if source == "mono_mix":
  685. self._recorder.write(mono_mix_proc)
  686. return
  687. if source == "beam":
  688. self._recorder.write(beam_proc)
  689. return
  690. if source == "hifi_raw":
  691. with self._lock:
  692. hifi_mic = self._settings.hifi_mic
  693. if hifi_mic == "mic2":
  694. self._recorder.write(mic2_raw)
  695. else:
  696. self._recorder.write(mic1_raw)
  697. return
  698. def _apply_beam_clarity_blend(self, beam: np.ndarray, mono: np.ndarray) -> np.ndarray:
  699. if beam.size == 0 or mono.size == 0:
  700. return beam.astype(np.float32, copy=False)
  701. n = min(beam.size, mono.size)
  702. if n <= 0:
  703. return beam.astype(np.float32, copy=False)
  704. blend = float(np.clip(self.BEAM_CLARITY_BLEND, 0.0, 0.5))
  705. out = ((1.0 - blend) * beam[:n] + blend * mono[:n]).astype(np.float32, copy=False)
  706. prev = float(self._presence_prev.get("beam", 0.0))
  707. hp = np.empty_like(out)
  708. hp[0] = out[0] - prev
  709. if out.size > 1:
  710. hp[1:] = out[1:] - out[:-1]
  711. self._presence_prev["beam"] = float(out[-1])
  712. out = out + float(self.BEAM_PRESENCE_BOOST) * hp
  713. return np.clip(out, -1.0, 1.0).astype(np.float32, copy=False)
  714. def _update_speech_gate(
  715. self,
  716. *,
  717. speech_detected: bool,
  718. sample_rate: int,
  719. chunk_len: int,
  720. enabled: bool,
  721. ) -> tuple[bool, float]:
  722. if not enabled:
  723. self._speech_gate_hold_chunks = 0
  724. self._speech_gate_gain = 1.0
  725. return True, 1.0
  726. chunk_seconds = max(float(chunk_len) / float(sample_rate), 1e-6)
  727. hold_chunks = max(1, int(round(self.SPEECH_GATE_HOLD_SECONDS / chunk_seconds)))
  728. if speech_detected:
  729. self._speech_gate_hold_chunks = hold_chunks
  730. elif self._speech_gate_hold_chunks > 0:
  731. self._speech_gate_hold_chunks -= 1
  732. gate_open = bool(speech_detected or self._speech_gate_hold_chunks > 0)
  733. target_gain = 1.0 if gate_open else self.SPEECH_GATE_FLOOR
  734. tau = self.SPEECH_GATE_ATTACK_SECONDS if target_gain > self._speech_gate_gain else self.SPEECH_GATE_RELEASE_SECONDS
  735. coeff = np.exp(-chunk_seconds / max(tau, 1e-4))
  736. self._speech_gate_gain = float(coeff * self._speech_gate_gain + (1.0 - coeff) * target_gain)
  737. return gate_open, self._speech_gate_gain
  738. def _apply_noise_suppression(self, audio: np.ndarray, key: str, *, speech_active: bool) -> np.ndarray:
  739. if audio.size == 0:
  740. return audio.astype(np.float32, copy=False)
  741. power = float(np.mean(audio * audio, dtype=np.float64))
  742. prev_noise = float(self._ns_noise_power.get(key, 1e-7))
  743. alpha = 0.01 if speech_active else 0.07
  744. noise = (1.0 - alpha) * prev_noise + alpha * power
  745. noise = max(noise, 1e-9)
  746. self._ns_noise_power[key] = noise
  747. ratio = noise / max(power, 1e-9)
  748. if speech_active:
  749. strength = self.NOISE_SUPPRESS_OPEN_STRENGTH
  750. floor = self.NOISE_SUPPRESS_OPEN_FLOOR
  751. else:
  752. strength = self.NOISE_SUPPRESS_CLOSED_STRENGTH
  753. floor = self.NOISE_SUPPRESS_CLOSED_FLOOR
  754. gain = float(np.clip(1.0 - strength * ratio, floor, 1.0))
  755. out = audio.astype(np.float32, copy=False) * gain
  756. return np.clip(out, -1.0, 1.0).astype(np.float32, copy=False)
  757. def _apply_hum_filter(self, audio: np.ndarray, sample_rate: int, channel_key: str) -> np.ndarray:
  758. if audio.size == 0:
  759. return audio.astype(np.float32, copy=False)
  760. out = audio.astype(np.float32, copy=False)
  761. state_key = (channel_key, int(sample_rate))
  762. sos = self._get_hpf_sos(sample_rate)
  763. if sos is not None:
  764. zi = self._hpf_state.get(state_key)
  765. if zi is None or zi.shape != (sos.shape[0], 2):
  766. zi = np.zeros((sos.shape[0], 2), dtype=np.float32)
  767. out, zi_new = signal.sosfilt(sos, out, zi=zi)
  768. self._hpf_state[state_key] = zi_new.astype(np.float32, copy=False)
  769. out = out.astype(np.float32, copy=False)
  770. notch = self._get_notch_coeff(sample_rate)
  771. if notch is not None:
  772. b, a = notch
  773. zi = self._notch_state.get(state_key)
  774. expected = max(len(a), len(b)) - 1
  775. if zi is None or zi.size != expected:
  776. zi = np.zeros(expected, dtype=np.float32)
  777. out, zi_new = signal.lfilter(b, a, out, zi=zi)
  778. self._notch_state[state_key] = zi_new.astype(np.float32, copy=False)
  779. out = out.astype(np.float32, copy=False)
  780. return out
  781. def _get_hpf_sos(self, sample_rate: int) -> np.ndarray | None:
  782. cached = self._hpf_sos_cache.get(sample_rate, "__missing__")
  783. if isinstance(cached, np.ndarray):
  784. return cached
  785. if cached is None:
  786. return None
  787. cutoff = min(self.HUM_HPF_CUTOFF_HZ, 0.45 * sample_rate)
  788. if cutoff < 20.0:
  789. self._hpf_sos_cache[sample_rate] = None
  790. return None
  791. try:
  792. sos = signal.butter(2, cutoff, btype="highpass", fs=sample_rate, output="sos")
  793. except ValueError:
  794. self._hpf_sos_cache[sample_rate] = None
  795. return None
  796. self._hpf_sos_cache[sample_rate] = sos
  797. return sos
  798. def _get_notch_coeff(self, sample_rate: int) -> tuple[np.ndarray, np.ndarray] | None:
  799. cached = self._notch_cache.get(sample_rate, "__missing__")
  800. if isinstance(cached, tuple):
  801. return cached
  802. if cached is None:
  803. return None
  804. nyquist = 0.5 * sample_rate
  805. if nyquist <= self.HUM_NOTCH_HZ * 1.2:
  806. self._notch_cache[sample_rate] = None
  807. return None
  808. w0 = self.HUM_NOTCH_HZ / nyquist
  809. try:
  810. b, a = signal.iirnotch(w0, self.HUM_NOTCH_Q)
  811. except ValueError:
  812. self._notch_cache[sample_rate] = None
  813. return None
  814. coeff = (b.astype(np.float32), a.astype(np.float32))
  815. self._notch_cache[sample_rate] = coeff
  816. return coeff
  817. @staticmethod
  818. def _apply_limiter(audio: np.ndarray) -> np.ndarray:
  819. if audio.size == 0:
  820. return audio.astype(np.float32, copy=False)
  821. x = np.clip(audio.astype(np.float32, copy=False), -1.0, 1.0)
  822. threshold = 0.82
  823. abs_x = np.abs(x)
  824. if not np.any(abs_x > threshold):
  825. return x.astype(np.float32, copy=False)
  826. out = x.copy()
  827. over = abs_x > threshold
  828. norm = (abs_x[over] - threshold) / max(1.0 - threshold, 1e-6)
  829. compressed = threshold + (1.0 - threshold) * (np.tanh(2.2 * norm) / np.tanh(2.2))
  830. out[over] = np.sign(x[over]) * compressed
  831. return out.astype(np.float32, copy=False)
  832. @staticmethod
  833. def _downsample_for_ui(audio: np.ndarray, target_points: int = 320) -> np.ndarray:
  834. if audio.size <= target_points:
  835. return audio.astype(np.float32, copy=False)
  836. step = int(np.ceil(audio.size / target_points))
  837. sampled = audio[::step]
  838. if sampled.size > target_points:
  839. sampled = sampled[:target_points]
  840. return sampled.astype(np.float32, copy=False)
  841. @staticmethod
  842. def _rms(audio: np.ndarray) -> float:
  843. if audio.size == 0:
  844. return 0.0
  845. return float(np.sqrt(np.mean(np.square(audio), dtype=np.float64)))
  846. @staticmethod
  847. def _resample_audio(audio: np.ndarray, source_rate: int, target_rate: int) -> np.ndarray:
  848. if audio.size == 0 or source_rate == target_rate:
  849. return audio.astype(np.float32, copy=False)
  850. if source_rate == 48000 and target_rate == 16000:
  851. usable = audio.size - (audio.size % 3)
  852. if usable <= 0:
  853. return audio.astype(np.float32, copy=False)
  854. # Fast decimation-by-3 tuned for speech workloads on Zero 2W.
  855. grouped = audio[:usable].reshape(-1, 3)
  856. return grouped.mean(axis=1, dtype=np.float32).astype(np.float32, copy=False)
  857. gcd = math.gcd(source_rate, target_rate)
  858. up = target_rate // gcd
  859. down = source_rate // gcd
  860. resampled = signal.resample_poly(audio, up=up, down=down)
  861. return resampled.astype(np.float32, copy=False)
  862. def _resolve_input_device(self) -> tuple[int | None, str, int]:
  863. try:
  864. default_input = sd.default.device[0]
  865. if isinstance(default_input, int) and default_input >= 0:
  866. info = sd.query_devices(default_input, "input")
  867. max_inputs = int(info.get("max_input_channels", 0))
  868. if max_inputs > 0:
  869. return int(default_input), str(info.get("name", f"device-{default_input}")), min(self.CHANNELS, max_inputs)
  870. except Exception:
  871. pass
  872. all_devices = sd.query_devices()
  873. preferred: tuple[int, dict] | None = None
  874. fallback: tuple[int, dict] | None = None
  875. for idx, raw_info in enumerate(all_devices):
  876. info = dict(raw_info)
  877. max_inputs = int(info.get("max_input_channels", 0))
  878. if max_inputs <= 0:
  879. continue
  880. device = (idx, info)
  881. name = str(info.get("name", "")).lower()
  882. if max_inputs >= self.CHANNELS and any(tag in name for tag in ("google", "voicehat", "i2s", "mic")):
  883. preferred = device
  884. break
  885. if max_inputs >= self.CHANNELS and fallback is None:
  886. fallback = device
  887. if fallback is None:
  888. fallback = device
  889. chosen = preferred or fallback
  890. if chosen is None:
  891. raise RuntimeError("No audio input device available")
  892. idx, info = chosen
  893. max_inputs = int(info.get("max_input_channels", 1))
  894. channels = min(self.CHANNELS, max_inputs)
  895. return int(idx), str(info.get("name", f"device-{idx}")), channels
  896. def _push_monitor_chunk_locked(self, chunk: np.ndarray, sample_rate: int) -> None:
  897. samples = chunk.astype(np.float32, copy=True)
  898. if samples.size == 0:
  899. return
  900. self._monitor_queue.append(samples)
  901. self._monitor_queue_samples += samples.size
  902. max_samples = max(sample_rate * 2, 2048)
  903. while self._monitor_queue_samples > max_samples and self._monitor_queue:
  904. dropped = self._monitor_queue.popleft()
  905. self._monitor_queue_samples -= dropped.size
  906. def _pop_monitor_chunk(self, max_samples: int) -> np.ndarray | None:
  907. if self._monitor_queue_samples <= 0 or not self._monitor_queue:
  908. return None
  909. take: list[np.ndarray] = []
  910. collected = 0
  911. while self._monitor_queue and collected < max_samples:
  912. chunk = self._monitor_queue[0]
  913. remaining = max_samples - collected
  914. if chunk.size <= remaining:
  915. take.append(chunk)
  916. collected += chunk.size
  917. self._monitor_queue.popleft()
  918. else:
  919. take.append(chunk[:remaining])
  920. self._monitor_queue[0] = chunk[remaining:]
  921. collected += remaining
  922. break
  923. self._monitor_queue_samples -= collected
  924. if not take:
  925. return None
  926. if len(take) == 1:
  927. return take[0]
  928. return np.concatenate(take).astype(np.float32, copy=False)
  929. @staticmethod
  930. def _encode_pcm16_base64(audio: np.ndarray) -> str:
  931. pcm16 = (np.clip(audio, -1.0, 1.0) * 32767.0).astype(np.int16)
  932. return base64.b64encode(pcm16.tobytes()).decode("ascii")
  933. def _clear_monitor_queue_locked(self) -> None:
  934. self._monitor_queue.clear()
  935. self._monitor_queue_samples = 0
  936. def _estimate_speech_presence_fast(self, mic1: np.ndarray, mic2: np.ndarray) -> bool:
  937. if mic1.size == 0 or mic2.size == 0:
  938. return False
  939. energy = 0.5 * (np.mean(mic1 * mic1) + np.mean(mic2 * mic2))
  940. energy = float(max(energy, 1e-10))
  941. if energy < self._vad_noise_floor:
  942. alpha = 0.05
  943. else:
  944. alpha = 0.004
  945. self._vad_noise_floor = (1.0 - alpha) * self._vad_noise_floor + alpha * energy
  946. threshold = max(2.2e-7, self._vad_noise_floor * 1.9)
  947. return bool(energy > threshold)
  948. @staticmethod
  949. def _gcc_phat(sig: np.ndarray, refsig: np.ndarray, sample_rate: int, max_tau: float) -> float:
  950. n = sig.size + refsig.size
  951. sig_fft = np.fft.rfft(sig, n=n)
  952. ref_fft = np.fft.rfft(refsig, n=n)
  953. cross = sig_fft * np.conj(ref_fft)
  954. denom = np.abs(cross)
  955. cross = cross / np.maximum(denom, 1e-10)
  956. cc = np.fft.irfft(cross, n=n)
  957. max_shift = int(min(n // 2, max_tau * sample_rate))
  958. if max_shift <= 0:
  959. return 0.0
  960. cc_window = np.concatenate((cc[-max_shift:], cc[: max_shift + 1]))
  961. shift = int(np.argmax(np.abs(cc_window)) - max_shift)
  962. return float(shift) / float(sample_rate)
  963. def _estimate_speech_angle(self, mic1: np.ndarray, mic2: np.ndarray, sample_rate: int) -> tuple[float, bool]:
  964. if mic1.size < 64 or mic2.size < 64:
  965. return self._auto_angle_deg, False
  966. high = min(3400.0, 0.45 * sample_rate)
  967. low = min(300.0, high * 0.5)
  968. if high <= low + 1.0:
  969. return self._auto_angle_deg, False
  970. sos = self._get_speech_sos(sample_rate, low, high)
  971. if sos is None:
  972. return self._auto_angle_deg, False
  973. speech1 = signal.sosfilt(sos, mic1).astype(np.float32, copy=False)
  974. speech2 = signal.sosfilt(sos, mic2).astype(np.float32, copy=False)
  975. speech_energy = 0.5 * (np.mean(speech1 * speech1) + np.mean(speech2 * speech2))
  976. full_energy = 0.5 * (np.mean(mic1 * mic1) + np.mean(mic2 * mic2))
  977. speech_ratio = float(speech_energy / max(full_energy, 1e-12))
  978. self._noise_floor = 0.995 * self._noise_floor + 0.005 * float(speech_energy)
  979. speech_threshold = max(2.5e-7, self._noise_floor * 2.0)
  980. speech_detected = bool(speech_energy > speech_threshold and speech_ratio > 0.08)
  981. if not speech_detected:
  982. return self._auto_angle_deg, False
  983. max_tau = self.MIC_SPACING / 343.0
  984. tau = self._gcc_phat(speech1, speech2, sample_rate, max_tau=max_tau)
  985. sin_theta = np.clip((tau * 343.0) / self.MIC_SPACING, -1.0, 1.0)
  986. raw_angle = float(np.rad2deg(np.arcsin(sin_theta)))
  987. raw_angle = float(np.clip(raw_angle, -90.0, 90.0))
  988. self._auto_angle_deg = 0.88 * self._auto_angle_deg + 0.12 * raw_angle
  989. return self._auto_angle_deg, True
  990. def _get_speech_sos(self, sample_rate: int, low: float, high: float) -> np.ndarray | None:
  991. cached = self._speech_sos_cache.get(sample_rate, "__missing__")
  992. if isinstance(cached, np.ndarray):
  993. return cached
  994. if cached is None:
  995. return None
  996. try:
  997. sos = signal.butter(4, [low, high], btype="bandpass", fs=sample_rate, output="sos")
  998. except ValueError:
  999. self._speech_sos_cache[sample_rate] = None
  1000. return None
  1001. self._speech_sos_cache[sample_rate] = sos
  1002. return sos
  1003. @staticmethod
  1004. def _make_empty_frame() -> dict[str, object]:
  1005. empty = np.empty(0, dtype=np.float32)
  1006. return {
  1007. "mic1": empty,
  1008. "mic2": empty,
  1009. "beam": empty,
  1010. "mono_mix": empty,
  1011. "show_mic2": True,
  1012. "show_beam": False,
  1013. "show_mono_mix": False,
  1014. "beam_angle_deg": 0.0,
  1015. "auto_beam": True,
  1016. "speech_detected": False,
  1017. "speech_gate_open": True,
  1018. "hifi_mode": False,
  1019. "monitor_on": False,
  1020. "monitor_source": "beam",
  1021. "recording": False,
  1022. "rec_duration": 0.0,
  1023. }