refactor: 使用 @picovoice/web-voice-processor 替换手写音频采集管线

- 引入 WebVoiceProcessor 处理 getUserMedia、AudioContext 生命周期和 WASM 重采样
- 删除自定义 AudioWorklet (audio-processor.ts) 和线性插值重采样器 (resample.ts)
- 改善音频采集稳定性:自动检测 AudioContext suspended/closed 状态并重建
- 更精确的错误提示:区分权限拒绝、设备未找到、设备异常
This commit is contained in:
2026-03-02 07:42:45 +08:00
parent 677ef35ff7
commit 669bfac722
6 changed files with 61 additions and 181 deletions

View File

@@ -84,17 +84,12 @@ web/
app-store.ts # Zustand store: connection, recording, preview, history, toast
hooks/
useWebSocket.ts # WS client hook: connect, reconnect, message dispatch
useRecorder.ts # Audio pipeline hook: getUserMedia, AudioWorklet, resample
useRecorder.ts # Audio pipeline hook: WebVoiceProcessor (16kHz Int16 PCM capture)
components/
StatusBadge.tsx # Connection status indicator
PreviewBox.tsx # Real-time transcription preview
MicButton.tsx # Push-to-talk button with animations
HistoryList.tsx # Transcription history with re-send
Toast.tsx # Auto-dismiss toast notifications
lib/
resample.ts # Linear interpolation resampler (native rate → 16kHz Int16)
workers/
audio-processor.ts # AudioWorklet: PCM capture, 200ms frame accumulation
```
## Code Style — Go
@@ -158,8 +153,8 @@ Per-connection loggers via `slog.With("remote", addr)`.
- Custom hooks for imperative APIs: `useWebSocket`, `useRecorder`
- Zustand `getState()` in hooks/callbacks to avoid stale closures
- Pointer Events for touch/mouse (not touch + mouse separately)
- AudioWorklet for audio capture (not MediaRecorder)
- `?worker&url` Vite import for AudioWorklet files
- @picovoice/web-voice-processor for audio capture (16kHz Int16 PCM, WASM resampling)
- WebVoiceProcessor handles getUserMedia, AudioContext lifecycle, cross-browser compat
- WebSocket: binary for audio frames, JSON text for control messages
- Tailwind CSS v4 with `@theme` design tokens; minimal custom CSS (keyframes only)

View File

@@ -5,6 +5,7 @@
"": {
"name": "web",
"dependencies": {
"@picovoice/web-voice-processor": "^4.0.9",
"partysocket": "^1.1.16",
"react": "^19.2.4",
"react-dom": "^19.2.4",
@@ -296,6 +297,10 @@
"@jridgewell/trace-mapping": ["@jridgewell/trace-mapping@0.3.31", "", { "dependencies": { "@jridgewell/resolve-uri": "^3.1.0", "@jridgewell/sourcemap-codec": "^1.4.14" } }, "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw=="],
"@picovoice/web-utils": ["@picovoice/web-utils@1.3.1", "", { "dependencies": { "commander": "^9.2.0" }, "bin": { "pvbase64": "scripts/base64.js" } }, "sha512-jcDqdULtTm+yJrnHDjg64hARup+Z4wNkYuXHNx6EM8+qZkweBq9UA6XJrHAlUkPnlkso4JWjaIKhz3x8vZcd3g=="],
"@picovoice/web-voice-processor": ["@picovoice/web-voice-processor@4.0.9", "", { "dependencies": { "@picovoice/web-utils": "=1.3.1" } }, "sha512-20pdkFjtuiojAdLIkNHXt4YgpRnlUePFW+gfkeCb+J+2XTRDGOI50+aJzL95p6QjDzGXsO7PZhlz7yDofOvZtg=="],
"@rolldown/pluginutils": ["@rolldown/pluginutils@1.0.0-rc.3", "", {}, "sha512-eybk3TjzzzV97Dlj5c+XrBFW57eTNhzod66y9HrBlzJ6NsCrWCp/2kaPS3K9wJmurBC0Tdw4yPjXKZqlznim3Q=="],
"@rollup/plugin-babel": ["@rollup/plugin-babel@5.3.1", "", { "dependencies": { "@babel/helper-module-imports": "^7.10.4", "@rollup/pluginutils": "^3.1.0" }, "peerDependencies": { "@babel/core": "^7.0.0", "@types/babel__core": "^7.1.9", "rollup": "^1.20.0||^2.0.0" }, "optionalPeers": ["@types/babel__core"] }, "sha512-WFfdLWU/xVWKeRQnKmIAQULUI7Il0gZnBIH/ZFO069wYIfPu+8zrfp/KMW0atmELoRDq8FbiP3VCss9MhCut7Q=="],
@@ -918,6 +923,8 @@
"zustand": ["zustand@5.0.11", "", { "peerDependencies": { "@types/react": ">=18.0.0", "immer": ">=9.0.6", "react": ">=18.0.0", "use-sync-external-store": ">=1.2.0" }, "optionalPeers": ["@types/react", "immer", "react", "use-sync-external-store"] }, "sha512-fdZY+dk7zn/vbWNCYmzZULHRrss0jx5pPFiOuMZ/5HJN6Yv3u+1Wswy/4MpZEkEGhtNH+pwxZB8OKgUBPzYAGg=="],
"@picovoice/web-utils/commander": ["commander@9.5.0", "", {}, "sha512-KRs7WVDKg86PWiuAqhDrAQnTXZKraVcCc6vFdL14qrZ/DcWwuRo7VoiYXalXO7S5GKpqYiVEwCbgFDfxNHKJBQ=="],
"@rollup/plugin-babel/rollup": ["rollup@2.80.0", "", { "optionalDependencies": { "fsevents": "~2.3.2" }, "bin": { "rollup": "dist/bin/rollup" } }, "sha512-cIFJOD1DESzpjOBl763Kp1AH7UE/0fcdHe6rZXUdQ9c50uvgigvW97u3IcSeBwOkgqL/PXPBktBCh0KEu5L8XQ=="],
"@rollup/plugin-node-resolve/@rollup/pluginutils": ["@rollup/pluginutils@5.3.0", "", { "dependencies": { "@types/estree": "^1.0.0", "estree-walker": "^2.0.2", "picomatch": "^4.0.2" }, "peerDependencies": { "rollup": "^1.20.0||^2.0.0||^3.0.0||^4.0.0" }, "optionalPeers": ["rollup"] }, "sha512-5EdhGZtnu3V88ces7s53hhfK5KSASnJZv8Lulpc04cWO3REESroJXg73DFsOmgbU2BhwV0E20bu2IDZb3VKW4Q=="],

View File

@@ -22,6 +22,7 @@
"vite-plugin-pwa": "^1.2.0"
},
"dependencies": {
"@picovoice/web-voice-processor": "^4.0.9",
"partysocket": "^1.1.16",
"react": "^19.2.4",
"react-dom": "^19.2.4",

View File

@@ -1,8 +1,13 @@
import { WebVoiceProcessor } from "@picovoice/web-voice-processor";
import { useCallback, useRef } from "react";
import { toast } from "sonner";
import { resampleTo16kInt16 } from "../lib/resample";
import { useAppStore } from "../stores/app-store";
import audioProcessorUrl from "../workers/audio-processor.ts?worker&url";
/**
* ~200ms frames at 16kHz = 3200 samples.
* Doubao bigmodel_async recommends 200ms packets for optimal performance.
*/
const FRAME_LENGTH = 3200;
interface UseRecorderOptions {
sendJSON: (obj: Record<string, unknown>) => void;
@@ -10,10 +15,10 @@ interface UseRecorderOptions {
}
export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
const audioCtxRef = useRef<AudioContext | null>(null);
const workletRef = useRef<AudioWorkletNode | null>(null);
const streamRef = useRef<MediaStream | null>(null);
const abortRef = useRef<AbortController | null>(null);
const engineRef = useRef<{ onmessage: (e: MessageEvent) => void } | null>(
null,
);
// Keep stable refs so callbacks never go stale
const sendJSONRef = useRef(sendJSON);
@@ -21,16 +26,6 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
sendJSONRef.current = sendJSON;
sendBinaryRef.current = sendBinary;
const initAudio = useCallback(async () => {
if (audioCtxRef.current) return;
// Use device native sample rate — we resample to 16kHz in software
const ctx = new AudioContext();
// Chrome requires resume() after user gesture
if (ctx.state === "suspended") await ctx.resume();
await ctx.audioWorklet.addModule(audioProcessorUrl);
audioCtxRef.current = ctx;
}, []);
const startRecording = useCallback(async () => {
const store = useAppStore.getState();
if (store.recording || store.pendingStart) return;
@@ -40,48 +35,32 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
abortRef.current = abort;
try {
await initAudio();
if (abort.signal.aborted) {
store.setPendingStart(false);
return;
}
const ctx = audioCtxRef.current as AudioContext;
if (ctx.state === "suspended") await ctx.resume();
if (abort.signal.aborted) {
store.setPendingStart(false);
return;
}
const stream = await navigator.mediaDevices.getUserMedia({
audio: {
echoCancellation: true,
noiseSuppression: true,
channelCount: 1,
// Create an engine that receives Int16Array @ 16kHz from WebVoiceProcessor
const engine = {
onmessage: (e: MessageEvent) => {
if (e.data.command === "process") {
sendBinaryRef.current(e.data.inputFrame as Int16Array);
}
},
};
engineRef.current = engine;
WebVoiceProcessor.setOptions({
frameLength: FRAME_LENGTH,
outputSampleRate: 16000,
});
// subscribe() handles getUserMedia + AudioContext lifecycle internally.
// It checks for closed/suspended AudioContext and re-creates as needed.
await WebVoiceProcessor.subscribe(engine);
if (abort.signal.aborted) {
stream.getTracks().forEach((t) => {
t.stop();
});
await WebVoiceProcessor.unsubscribe(engine);
engineRef.current = null;
store.setPendingStart(false);
return;
}
streamRef.current = stream;
const source = ctx.createMediaStreamSource(stream);
const worklet = new AudioWorkletNode(ctx, "audio-processor");
worklet.port.onmessage = (e: MessageEvent) => {
if (e.data.type === "audio") {
sendBinaryRef.current(
resampleTo16kInt16(e.data.samples, e.data.sampleRate),
);
}
};
source.connect(worklet);
worklet.port.postMessage({ command: "start" });
workletRef.current = worklet;
store.setPendingStart(false);
abortRef.current = null;
store.setRecording(true);
@@ -90,9 +69,24 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
} catch (err) {
useAppStore.getState().setPendingStart(false);
abortRef.current = null;
toast.error(`麦克风错误: ${(err as Error).message}`);
engineRef.current = null;
const error = err as Error;
switch (error.name) {
case "PermissionError":
toast.error("麦克风权限被拒绝");
break;
case "DeviceMissingError":
toast.error("未找到麦克风设备");
break;
case "DeviceReadError":
toast.error("麦克风设备异常,请检查连接");
break;
default:
toast.error(`麦克风错误: ${error.message}`);
}
}
}, [initAudio]);
}, []);
const stopRecording = useCallback(() => {
const store = useAppStore.getState();
@@ -107,16 +101,10 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
if (!store.recording) return;
store.setRecording(false);
if (workletRef.current) {
workletRef.current.port.postMessage({ command: "stop" });
workletRef.current.disconnect();
workletRef.current = null;
}
if (streamRef.current) {
streamRef.current.getTracks().forEach((t) => {
t.stop();
});
streamRef.current = null;
if (engineRef.current) {
// Fire-and-forget: state is already updated, cleanup is async
WebVoiceProcessor.unsubscribe(engineRef.current);
engineRef.current = null;
}
sendJSONRef.current({ type: "stop" });

View File

@@ -1,23 +0,0 @@
/**
* Linear interpolation resampler: native sample rate -> 16kHz 16-bit mono PCM.
*/
const TARGET_SAMPLE_RATE = 16000;

/**
 * Downsample mono Float32 PCM from an arbitrary source rate to 16kHz
 * signed 16-bit PCM, linearly interpolating between neighboring samples.
 *
 * @param float32 - Raw PCM samples, nominally in [-1, 1].
 * @param srcRate - Sample rate of `float32` in Hz.
 * @returns 16kHz Int16 PCM. Samples are scaled by 32767 and then clamped
 *          to the Int16 range, so out-of-range input cannot overflow.
 */
export function resampleTo16kInt16(
  float32: Float32Array,
  srcRate: number,
): Int16Array {
  const step = srcRate / TARGET_SAMPLE_RATE;
  const result = new Int16Array(Math.floor(float32.length / step));
  for (let i = 0; i < result.length; i++) {
    // Fractional read position in the source buffer for output sample i.
    const pos = i * step;
    const left = Math.floor(pos);
    const right = Math.min(left + 1, float32.length - 1);
    const t = pos - left;
    const interpolated = float32[left] + t * (float32[right] - float32[left]);
    // Scale to Int16 first, then clamp the scaled value into range.
    const scaled = Math.round(interpolated * 32767);
    result[i] = Math.max(-32768, Math.min(32767, scaled));
  }
  return result;
}

View File

@@ -1,88 +0,0 @@
/**
* AudioWorklet processor for VoicePaste.
*
* Captures raw Float32 PCM from the microphone, accumulates samples into
* ~200ms frames, and posts them to the main thread for resampling + WS send.
*
* Communication:
* Main → Processor: { command: "start" | "stop" }
* Processor → Main: { type: "audio", samples: Float32Array, sampleRate: number }
*/
// AudioWorkletGlobalScope globals (not in standard lib)
// Ambient declarations so this file type-checks: these names exist only
// inside the AudioWorkletGlobalScope at runtime and are not in lib.dom.
// Shapes mirror the Web Audio AudioWorklet spec — keep them in sync.
declare const sampleRate: number; // sample rate of the owning AudioContext
declare class AudioWorkletProcessor {
  // Message channel back to the main thread (AudioWorkletNode.port peer).
  readonly port: MessagePort;
  constructor();
  // Invoked by the audio engine per render quantum; returning true keeps
  // the processor alive (per the Web Audio spec).
  process(
    inputs: Float32Array[][],
    outputs: Float32Array[][],
    parameters: Record<string, Float32Array>,
  ): boolean;
}
// Registers the processor under the name passed to new AudioWorkletNode().
declare function registerProcessor(
  name: string,
  ctor: new () => AudioWorkletProcessor,
): void;
/**
 * AudioWorklet processor: captures raw Float32 PCM from channel 0,
 * accumulates ~200ms of samples, and posts each frame to the main thread.
 *
 * Protocol:
 *   Main -> Processor: { command: "start" | "stop" }
 *   Processor -> Main: { type: "audio", samples: Float32Array, sampleRate: number }
 */
class VoicePasteProcessor extends AudioWorkletProcessor {
  private active = false;
  private chunks: Float32Array[] = [];
  private queuedSamples = 0;
  private readonly samplesPerFrame: number;

  constructor() {
    super();
    // ~200ms worth of samples at the worklet's native rate.
    this.samplesPerFrame = Math.floor(sampleRate * 0.2);
    this.port.onmessage = (e: MessageEvent) => {
      const { command } = e.data;
      if (command === "start") {
        // Drop any stale audio from a previous session before arming.
        this.chunks = [];
        this.queuedSamples = 0;
        this.active = true;
      } else if (command === "stop") {
        // Emit the partial trailing frame so no audio is lost.
        if (this.queuedSamples > 0) {
          this.emitFrame();
        }
        this.active = false;
      }
    };
  }

  process(inputs: Float32Array[][]): boolean {
    if (this.active) {
      const firstInput = inputs[0];
      const channel = firstInput?.[0];
      if (channel) {
        // Copy: the engine reuses the underlying buffer between quanta.
        this.chunks.push(channel.slice());
        this.queuedSamples += channel.length;
        if (this.queuedSamples >= this.samplesPerFrame) {
          this.emitFrame();
        }
      }
    }
    // Always return true to keep the processor alive.
    return true;
  }

  // Concatenate queued chunks into one frame and transfer it to the main thread.
  private emitFrame(): void {
    const frame = new Float32Array(this.queuedSamples);
    let cursor = 0;
    for (const part of this.chunks) {
      frame.set(part, cursor);
      cursor += part.length;
    }
    // Transfer the buffer (second arg) to avoid copying across threads.
    this.port.postMessage(
      { type: "audio", samples: frame, sampleRate: sampleRate },
      [frame.buffer],
    );
    this.chunks = [];
    this.queuedSamples = 0;
  }
}
registerProcessor("audio-processor", VoicePasteProcessor);