From 669bfac7221846be2d154c7804deab0ea966f1a6 Mon Sep 17 00:00:00 2001 From: imbytecat Date: Mon, 2 Mar 2026 07:42:45 +0800 Subject: [PATCH] =?UTF-8?q?refactor:=20=E4=BD=BF=E7=94=A8=20@picovoice/web?= =?UTF-8?q?-voice-processor=20=E6=9B=BF=E6=8D=A2=E6=89=8B=E5=86=99?= =?UTF-8?q?=E9=9F=B3=E9=A2=91=E9=87=87=E9=9B=86=E7=AE=A1=E7=BA=BF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 引入 WebVoiceProcessor 处理 getUserMedia、AudioContext 生命周期和 WASM 重采样 - 删除自定义 AudioWorklet (audio-processor.ts) 和线性插值重采样器 (resample.ts) - 改善音频采集稳定性:自动检测 AudioContext suspended/closed 状态并重建 - 更精确的错误提示:区分权限拒绝、设备未找到、设备异常 --- AGENTS.md | 11 +-- web/bun.lock | 7 ++ web/package.json | 1 + web/src/hooks/useRecorder.ts | 112 +++++++++++++---------------- web/src/lib/resample.ts | 23 ------ web/src/workers/audio-processor.ts | 88 ----------------------- 6 files changed, 61 insertions(+), 181 deletions(-) delete mode 100644 web/src/lib/resample.ts delete mode 100644 web/src/workers/audio-processor.ts diff --git a/AGENTS.md b/AGENTS.md index 9669154..7526cce 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -84,17 +84,12 @@ web/ app-store.ts # Zustand store: connection, recording, preview, history, toast hooks/ useWebSocket.ts # WS client hook: connect, reconnect, message dispatch - useRecorder.ts # Audio pipeline hook: getUserMedia, AudioWorklet, resample + useRecorder.ts # Audio pipeline hook: WebVoiceProcessor (16kHz Int16 PCM capture) components/ StatusBadge.tsx # Connection status indicator PreviewBox.tsx # Real-time transcription preview MicButton.tsx # Push-to-talk button with animations HistoryList.tsx # Transcription history with re-send - Toast.tsx # Auto-dismiss toast notifications - lib/ - resample.ts # Linear interpolation resampler (native rate → 16kHz Int16) - workers/ - audio-processor.ts # AudioWorklet: PCM capture, 200ms frame accumulation ``` ## Code Style — Go @@ -158,8 +153,8 @@ Per-connection loggers via `slog.With("remote", addr)`. - Custom hooks for imperative APIs: `useWebSocket`, `useRecorder` - Zustand `getState()` in hooks/callbacks to avoid stale closures - Pointer Events for touch/mouse (not touch + mouse separately) -- AudioWorklet for audio capture (not MediaRecorder) -- `?worker&url` Vite import for AudioWorklet files +- @picovoice/web-voice-processor for audio capture (16kHz Int16 PCM, WASM resampling) +- WebVoiceProcessor handles getUserMedia, AudioContext lifecycle, cross-browser compat - WebSocket: binary for audio frames, JSON text for control messages - Tailwind CSS v4 with `@theme` design tokens; minimal custom CSS (keyframes only) diff --git a/web/bun.lock b/web/bun.lock index 9273a7e..bafeff6 100644 --- a/web/bun.lock +++ b/web/bun.lock @@ -5,6 +5,7 @@ "": { "name": "web", "dependencies": { + "@picovoice/web-voice-processor": "^4.0.9", "partysocket": "^1.1.16", "react": "^19.2.4", "react-dom": "^19.2.4", @@ -296,6 +297,10 @@ "@jridgewell/trace-mapping": ["@jridgewell/trace-mapping@0.3.31", "", { "dependencies": { "@jridgewell/resolve-uri": "^3.1.0", "@jridgewell/sourcemap-codec": "^1.4.14" } }, "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw=="], + "@picovoice/web-utils": ["@picovoice/web-utils@1.3.1", "", { "dependencies": { "commander": "^9.2.0" }, "bin": { "pvbase64": "scripts/base64.js" } }, "sha512-jcDqdULtTm+yJrnHDjg64hARup+Z4wNkYuXHNx6EM8+qZkweBq9UA6XJrHAlUkPnlkso4JWjaIKhz3x8vZcd3g=="], + + "@picovoice/web-voice-processor": ["@picovoice/web-voice-processor@4.0.9", "", { "dependencies": { "@picovoice/web-utils": "=1.3.1" } }, "sha512-20pdkFjtuiojAdLIkNHXt4YgpRnlUePFW+gfkeCb+J+2XTRDGOI50+aJzL95p6QjDzGXsO7PZhlz7yDofOvZtg=="], + "@rolldown/pluginutils": ["@rolldown/pluginutils@1.0.0-rc.3", "", {}, "sha512-eybk3TjzzzV97Dlj5c+XrBFW57eTNhzod66y9HrBlzJ6NsCrWCp/2kaPS3K9wJmurBC0Tdw4yPjXKZqlznim3Q=="], "@rollup/plugin-babel": ["@rollup/plugin-babel@5.3.1", "", { "dependencies": { "@babel/helper-module-imports": "^7.10.4", "@rollup/pluginutils": "^3.1.0" }, "peerDependencies": { "@babel/core": "^7.0.0", "@types/babel__core": "^7.1.9", "rollup": "^1.20.0||^2.0.0" }, "optionalPeers": ["@types/babel__core"] }, "sha512-WFfdLWU/xVWKeRQnKmIAQULUI7Il0gZnBIH/ZFO069wYIfPu+8zrfp/KMW0atmELoRDq8FbiP3VCss9MhCut7Q=="], @@ -918,6 +923,8 @@ "zustand": ["zustand@5.0.11", "", { "peerDependencies": { "@types/react": ">=18.0.0", "immer": ">=9.0.6", "react": ">=18.0.0", "use-sync-external-store": ">=1.2.0" }, "optionalPeers": ["@types/react", "immer", "react", "use-sync-external-store"] }, "sha512-fdZY+dk7zn/vbWNCYmzZULHRrss0jx5pPFiOuMZ/5HJN6Yv3u+1Wswy/4MpZEkEGhtNH+pwxZB8OKgUBPzYAGg=="], + "@picovoice/web-utils/commander": ["commander@9.5.0", "", {}, "sha512-KRs7WVDKg86PWiuAqhDrAQnTXZKraVcCc6vFdL14qrZ/DcWwuRo7VoiYXalXO7S5GKpqYiVEwCbgFDfxNHKJBQ=="], + "@rollup/plugin-babel/rollup": ["rollup@2.80.0", "", { "optionalDependencies": { "fsevents": "~2.3.2" }, "bin": { "rollup": "dist/bin/rollup" } }, "sha512-cIFJOD1DESzpjOBl763Kp1AH7UE/0fcdHe6rZXUdQ9c50uvgigvW97u3IcSeBwOkgqL/PXPBktBCh0KEu5L8XQ=="], "@rollup/plugin-node-resolve/@rollup/pluginutils": ["@rollup/pluginutils@5.3.0", "", { "dependencies": { "@types/estree": "^1.0.0", "estree-walker": "^2.0.2", "picomatch": "^4.0.2" }, "peerDependencies": { "rollup": "^1.20.0||^2.0.0||^3.0.0||^4.0.0" }, "optionalPeers": ["rollup"] }, "sha512-5EdhGZtnu3V88ces7s53hhfK5KSASnJZv8Lulpc04cWO3REESroJXg73DFsOmgbU2BhwV0E20bu2IDZb3VKW4Q=="], diff --git a/web/package.json b/web/package.json index 87674d4..734e807 100644 --- a/web/package.json +++ b/web/package.json @@ -22,6 +22,7 @@ "vite-plugin-pwa": "^1.2.0" }, "dependencies": { + "@picovoice/web-voice-processor": "^4.0.9", "partysocket": "^1.1.16", "react": "^19.2.4", "react-dom": "^19.2.4", diff --git a/web/src/hooks/useRecorder.ts b/web/src/hooks/useRecorder.ts index fd7cf8d..dc32767 100644 --- a/web/src/hooks/useRecorder.ts +++ b/web/src/hooks/useRecorder.ts @@ -1,8 +1,13 @@ +import { WebVoiceProcessor } from "@picovoice/web-voice-processor"; import { useCallback, useRef } from "react"; import { toast } from "sonner"; -import { resampleTo16kInt16 } from "../lib/resample"; import { useAppStore } from "../stores/app-store"; -import audioProcessorUrl from "../workers/audio-processor.ts?worker&url"; + +/** + * ~200ms frames at 16kHz = 3200 samples. + * Doubao bigmodel_async recommends 200ms packets for optimal performance. + */ +const FRAME_LENGTH = 3200; interface UseRecorderOptions { sendJSON: (obj: Record) => void; @@ -10,10 +15,10 @@ interface UseRecorderOptions { } export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) { - const audioCtxRef = useRef(null); - const workletRef = useRef(null); - const streamRef = useRef(null); const abortRef = useRef(null); + const engineRef = useRef<{ onmessage: (e: MessageEvent) => void } | null>( + null, + ); // Keep stable refs so callbacks never go stale const sendJSONRef = useRef(sendJSON); @@ -21,16 +26,6 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) { sendJSONRef.current = sendJSON; sendBinaryRef.current = sendBinary; - const initAudio = useCallback(async () => { - if (audioCtxRef.current) return; - // Use device native sample rate — we resample to 16kHz in software - const ctx = new AudioContext(); - // Chrome requires resume() after user gesture - if (ctx.state === "suspended") await ctx.resume(); - await ctx.audioWorklet.addModule(audioProcessorUrl); - audioCtxRef.current = ctx; - }, []); - const startRecording = useCallback(async () => { const store = useAppStore.getState(); if (store.recording || store.pendingStart) return; @@ -40,48 +35,32 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) { abortRef.current = abort; try { - await initAudio(); - if (abort.signal.aborted) { - store.setPendingStart(false); - return; - } - - const ctx = audioCtxRef.current as AudioContext; - if (ctx.state === "suspended") await ctx.resume(); - if (abort.signal.aborted) { - store.setPendingStart(false); - return; - } - - const stream = await navigator.mediaDevices.getUserMedia({ - audio: { - echoCancellation: true, - noiseSuppression: true, - channelCount: 1, + // Create an engine that receives Int16Array @ 16kHz from WebVoiceProcessor + const engine = { + onmessage: (e: MessageEvent) => { + if (e.data.command === "process") { + sendBinaryRef.current(e.data.inputFrame as Int16Array); + } }, + }; + engineRef.current = engine; + + WebVoiceProcessor.setOptions({ + frameLength: FRAME_LENGTH, + outputSampleRate: 16000, }); + + // subscribe() handles getUserMedia + AudioContext lifecycle internally. + // It checks for closed/suspended AudioContext and re-creates as needed. + await WebVoiceProcessor.subscribe(engine); + if (abort.signal.aborted) { - stream.getTracks().forEach((t) => { - t.stop(); - }); + await WebVoiceProcessor.unsubscribe(engine); + engineRef.current = null; store.setPendingStart(false); return; } - streamRef.current = stream; - const source = ctx.createMediaStreamSource(stream); - const worklet = new AudioWorkletNode(ctx, "audio-processor"); - worklet.port.onmessage = (e: MessageEvent) => { - if (e.data.type === "audio") { - sendBinaryRef.current( - resampleTo16kInt16(e.data.samples, e.data.sampleRate), - ); - } - }; - source.connect(worklet); - worklet.port.postMessage({ command: "start" }); - workletRef.current = worklet; - store.setPendingStart(false); abortRef.current = null; store.setRecording(true); @@ -90,9 +69,24 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) { } catch (err) { useAppStore.getState().setPendingStart(false); abortRef.current = null; - toast.error(`麦克风错误: ${(err as Error).message}`); + engineRef.current = null; + + const error = err as Error; + switch (error.name) { + case "PermissionError": + toast.error("麦克风权限被拒绝"); + break; + case "DeviceMissingError": + toast.error("未找到麦克风设备"); + break; + case "DeviceReadError": + toast.error("麦克风设备异常,请检查连接"); + break; + default: + toast.error(`麦克风错误: ${error.message}`); + } } - }, [initAudio]); + }, []); const stopRecording = useCallback(() => { const store = useAppStore.getState(); @@ -107,16 +101,10 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) { if (!store.recording) return; store.setRecording(false); - if (workletRef.current) { - workletRef.current.port.postMessage({ command: "stop" }); - workletRef.current.disconnect(); - workletRef.current = null; - } - if (streamRef.current) { - streamRef.current.getTracks().forEach((t) => { - t.stop(); - }); - streamRef.current = null; + if (engineRef.current) { + // Fire-and-forget: state is already updated, cleanup is async + WebVoiceProcessor.unsubscribe(engineRef.current); + engineRef.current = null; } sendJSONRef.current({ type: "stop" }); diff --git a/web/src/lib/resample.ts b/web/src/lib/resample.ts deleted file mode 100644 index 7f127b9..0000000 --- a/web/src/lib/resample.ts +++ /dev/null @@ -1,23 +0,0 @@ -/** - * Linear interpolation resampler: native sample rate -> 16kHz 16-bit mono PCM. - */ -const TARGET_SAMPLE_RATE = 16000; - -export function resampleTo16kInt16( - float32: Float32Array, - srcRate: number, -): Int16Array { - const ratio = srcRate / TARGET_SAMPLE_RATE; - const outLen = Math.floor(float32.length / ratio); - const out = new Int16Array(outLen); - for (let i = 0; i < outLen; i++) { - const srcIdx = i * ratio; - const lo = Math.floor(srcIdx); - const hi = Math.min(lo + 1, float32.length - 1); - const frac = srcIdx - lo; - const sample = float32[lo] + frac * (float32[hi] - float32[lo]); - // Clamp to [-1, 1] then scale to Int16 - out[i] = Math.max(-32768, Math.min(32767, Math.round(sample * 32767))); - } - return out; -} diff --git a/web/src/workers/audio-processor.ts b/web/src/workers/audio-processor.ts deleted file mode 100644 index 3c8f3f4..0000000 --- a/web/src/workers/audio-processor.ts +++ /dev/null @@ -1,88 +0,0 @@ -/** - * AudioWorklet processor for VoicePaste. - * - * Captures raw Float32 PCM from the microphone, accumulates samples into - * ~200ms frames, and posts them to the main thread for resampling + WS send. - * - * Communication: - * Main → Processor: { command: "start" | "stop" } - * Processor → Main: { type: "audio", samples: Float32Array, sampleRate: number } - */ - -// AudioWorkletGlobalScope globals (not in standard lib) -declare const sampleRate: number; -declare class AudioWorkletProcessor { - readonly port: MessagePort; - constructor(); - process( - inputs: Float32Array[][], - outputs: Float32Array[][], - parameters: Record, - ): boolean; -} -declare function registerProcessor( - name: string, - ctor: new () => AudioWorkletProcessor, -): void; - -class VoicePasteProcessor extends AudioWorkletProcessor { - private recording = false; - private buffer: Float32Array[] = []; - private bufferLen = 0; - private readonly frameSize: number; - - constructor() { - super(); - // ~200ms worth of samples at current sample rate - this.frameSize = Math.floor(sampleRate * 0.2); - - this.port.onmessage = (e: MessageEvent) => { - if (e.data.command === "start") { - this.recording = true; - this.buffer = []; - this.bufferLen = 0; - } else if (e.data.command === "stop") { - if (this.bufferLen > 0) { - this.flush(); - } - this.recording = false; - } - }; - } - - process(inputs: Float32Array[][]): boolean { - if (!this.recording) return true; - - const input = inputs[0]; - if (!input || !input[0]) return true; - - const channelData = input[0]; - this.buffer.push(new Float32Array(channelData)); - this.bufferLen += channelData.length; - - if (this.bufferLen >= this.frameSize) { - this.flush(); - } - - return true; - } - - private flush(): void { - const merged = new Float32Array(this.bufferLen); - let offset = 0; - for (const chunk of this.buffer) { - merged.set(chunk, offset); - offset += chunk.length; - } - - this.port.postMessage( - { type: "audio", samples: merged, sampleRate: sampleRate }, - [merged.buffer], - ); - - this.buffer = []; - this.bufferLen = 0; - } -} - -registerProcessor("audio-processor", VoicePasteProcessor);