refactor: 使用 @picovoice/web-voice-processor 替换手写音频采集管线

- 引入 WebVoiceProcessor 处理 getUserMedia、AudioContext 生命周期和 WASM 重采样
- 删除自定义 AudioWorklet (audio-processor.ts) 和线性插值重采样器 (resample.ts)
- 改善音频采集稳定性:自动检测 AudioContext suspended/closed 状态并重建
- 更精确的错误提示:区分权限拒绝、设备未找到、设备异常
This commit is contained in:
2026-03-02 07:42:45 +08:00
parent 677ef35ff7
commit 669bfac722
6 changed files with 61 additions and 181 deletions

View File

@@ -1,8 +1,13 @@
import { WebVoiceProcessor } from "@picovoice/web-voice-processor";
import { useCallback, useRef } from "react";
import { toast } from "sonner";
import { resampleTo16kInt16 } from "../lib/resample";
import { useAppStore } from "../stores/app-store";
import audioProcessorUrl from "../workers/audio-processor.ts?worker&url";
/**
* ~200ms frames at 16kHz = 3200 samples.
* Doubao bigmodel_async recommends 200ms packets for optimal performance.
*/
const FRAME_LENGTH = 3200;
interface UseRecorderOptions {
sendJSON: (obj: Record<string, unknown>) => void;
@@ -10,10 +15,10 @@ interface UseRecorderOptions {
}
export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
const audioCtxRef = useRef<AudioContext | null>(null);
const workletRef = useRef<AudioWorkletNode | null>(null);
const streamRef = useRef<MediaStream | null>(null);
const abortRef = useRef<AbortController | null>(null);
const engineRef = useRef<{ onmessage: (e: MessageEvent) => void } | null>(
null,
);
// Keep stable refs so callbacks never go stale
const sendJSONRef = useRef(sendJSON);
@@ -21,16 +26,6 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
sendJSONRef.current = sendJSON;
sendBinaryRef.current = sendBinary;
const initAudio = useCallback(async () => {
if (audioCtxRef.current) return;
// Use device native sample rate — we resample to 16kHz in software
const ctx = new AudioContext();
// Chrome requires resume() after user gesture
if (ctx.state === "suspended") await ctx.resume();
await ctx.audioWorklet.addModule(audioProcessorUrl);
audioCtxRef.current = ctx;
}, []);
const startRecording = useCallback(async () => {
const store = useAppStore.getState();
if (store.recording || store.pendingStart) return;
@@ -40,48 +35,32 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
abortRef.current = abort;
try {
await initAudio();
if (abort.signal.aborted) {
store.setPendingStart(false);
return;
}
const ctx = audioCtxRef.current as AudioContext;
if (ctx.state === "suspended") await ctx.resume();
if (abort.signal.aborted) {
store.setPendingStart(false);
return;
}
const stream = await navigator.mediaDevices.getUserMedia({
audio: {
echoCancellation: true,
noiseSuppression: true,
channelCount: 1,
// Create an engine that receives Int16Array @ 16kHz from WebVoiceProcessor
const engine = {
onmessage: (e: MessageEvent) => {
if (e.data.command === "process") {
sendBinaryRef.current(e.data.inputFrame as Int16Array);
}
},
};
engineRef.current = engine;
WebVoiceProcessor.setOptions({
frameLength: FRAME_LENGTH,
outputSampleRate: 16000,
});
// subscribe() handles getUserMedia + AudioContext lifecycle internally.
// It checks for closed/suspended AudioContext and re-creates as needed.
await WebVoiceProcessor.subscribe(engine);
if (abort.signal.aborted) {
stream.getTracks().forEach((t) => {
t.stop();
});
await WebVoiceProcessor.unsubscribe(engine);
engineRef.current = null;
store.setPendingStart(false);
return;
}
streamRef.current = stream;
const source = ctx.createMediaStreamSource(stream);
const worklet = new AudioWorkletNode(ctx, "audio-processor");
worklet.port.onmessage = (e: MessageEvent) => {
if (e.data.type === "audio") {
sendBinaryRef.current(
resampleTo16kInt16(e.data.samples, e.data.sampleRate),
);
}
};
source.connect(worklet);
worklet.port.postMessage({ command: "start" });
workletRef.current = worklet;
store.setPendingStart(false);
abortRef.current = null;
store.setRecording(true);
@@ -90,9 +69,24 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
} catch (err) {
useAppStore.getState().setPendingStart(false);
abortRef.current = null;
toast.error(`麦克风错误: ${(err as Error).message}`);
engineRef.current = null;
const error = err as Error;
switch (error.name) {
case "PermissionError":
toast.error("麦克风权限被拒绝");
break;
case "DeviceMissingError":
toast.error("未找到麦克风设备");
break;
case "DeviceReadError":
toast.error("麦克风设备异常,请检查连接");
break;
default:
toast.error(`麦克风错误: ${error.message}`);
}
}
}, [initAudio]);
}, []);
const stopRecording = useCallback(() => {
const store = useAppStore.getState();
@@ -107,16 +101,10 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
if (!store.recording) return;
store.setRecording(false);
if (workletRef.current) {
workletRef.current.port.postMessage({ command: "stop" });
workletRef.current.disconnect();
workletRef.current = null;
}
if (streamRef.current) {
streamRef.current.getTracks().forEach((t) => {
t.stop();
});
streamRef.current = null;
if (engineRef.current) {
// Fire-and-forget: state is already updated, cleanup is async
WebVoiceProcessor.unsubscribe(engineRef.current);
engineRef.current = null;
}
sendJSONRef.current({ type: "stop" });

View File

@@ -1,23 +0,0 @@
/**
 * Downmix-free linear-interpolation resampler.
 *
 * Converts mono Float32 PCM captured at the device's native rate into
 * 16 kHz signed 16-bit PCM, the format the ASR backend expects.
 */
const TARGET_SAMPLE_RATE = 16000;

/**
 * Resample `float32` from `srcRate` Hz to 16 kHz and quantize to Int16.
 *
 * @param float32 - mono PCM samples, nominally in [-1, 1]
 * @param srcRate - sample rate (Hz) the input was captured at
 * @returns 16 kHz Int16 PCM; length is floor(input length / rate ratio)
 */
export function resampleTo16kInt16(
  float32: Float32Array,
  srcRate: number,
): Int16Array {
  const step = srcRate / TARGET_SAMPLE_RATE;
  const outputLength = Math.floor(float32.length / step);
  const pcm = new Int16Array(outputLength);
  const lastIndex = float32.length - 1;

  for (let outIdx = 0; outIdx < outputLength; outIdx++) {
    // Fractional read position in the source buffer for this output sample.
    const position = outIdx * step;
    const left = Math.floor(position);
    const right = Math.min(left + 1, lastIndex);
    const t = position - left;
    // Linear interpolation between the two neighbouring source samples.
    const value = float32[left] + t * (float32[right] - float32[left]);
    // Scale to Int16 range, then clamp so out-of-range floats can't wrap.
    pcm[outIdx] = Math.min(32767, Math.max(-32768, Math.round(value * 32767)));
  }
  return pcm;
}

View File

@@ -1,88 +0,0 @@
/**
* AudioWorklet processor for VoicePaste.
*
* Captures raw Float32 PCM from the microphone, accumulates samples into
* ~200ms frames, and posts them to the main thread for resampling + WS send.
*
* Communication:
* Main → Processor: { command: "start" | "stop" }
* Processor → Main: { type: "audio", samples: Float32Array, sampleRate: number }
*/
// AudioWorkletGlobalScope globals (not in standard lib).
// TypeScript's default lib does not include the worklet global scope, so we
// declare the runtime-provided names ourselves. These are type-only ambient
// declarations — they emit no JavaScript.

// Sample rate (Hz) of the AudioContext this worklet belongs to.
declare const sampleRate: number;

// Minimal typing of the runtime base class for worklet processors.
declare class AudioWorkletProcessor {
readonly port: MessagePort;
constructor();
process(
inputs: Float32Array[][],
outputs: Float32Array[][],
parameters: Record<string, Float32Array>,
): boolean;
}

// Registers a processor class under the name AudioWorkletNode looks up.
declare function registerProcessor(
name: string,
ctor: new () => AudioWorkletProcessor,
): void;
/**
 * Microphone capture processor: batches raw Float32 PCM into ~200ms frames
 * and forwards them to the main thread for resampling + WebSocket send.
 *
 * Protocol (unchanged):
 *   Main → Processor: { command: "start" | "stop" }
 *   Processor → Main: { type: "audio", samples: Float32Array, sampleRate: number }
 */
class VoicePasteProcessor extends AudioWorkletProcessor {
  private capturing = false;
  private chunks: Float32Array[] = [];
  private queuedSamples = 0;
  // Number of samples that make up one ~200ms frame at the native rate.
  private readonly samplesPerFrame: number;

  constructor() {
    super();
    this.samplesPerFrame = Math.floor(sampleRate * 0.2);
    this.port.onmessage = (e: MessageEvent) => {
      const { command } = e.data;
      if (command === "start") {
        this.capturing = true;
        this.chunks = [];
        this.queuedSamples = 0;
      } else if (command === "stop") {
        // Push out any partial frame before going idle.
        if (this.queuedSamples > 0) {
          this.emitFrame();
        }
        this.capturing = false;
      }
    };
  }

  process(inputs: Float32Array[][]): boolean {
    if (!this.capturing) return true;
    const mono = inputs[0]?.[0];
    if (!mono) return true;
    // Copy: the engine reuses the underlying buffer between render quanta.
    this.chunks.push(new Float32Array(mono));
    this.queuedSamples += mono.length;
    if (this.queuedSamples >= this.samplesPerFrame) {
      this.emitFrame();
    }
    return true;
  }

  // Concatenate queued chunks into one frame and transfer it to main thread.
  private emitFrame(): void {
    const frame = new Float32Array(this.queuedSamples);
    let cursor = 0;
    for (const part of this.chunks) {
      frame.set(part, cursor);
      cursor += part.length;
    }
    // Transfer list avoids a structured-clone copy of the frame buffer.
    this.port.postMessage(
      { type: "audio", samples: frame, sampleRate: sampleRate },
      [frame.buffer],
    );
    this.chunks = [];
    this.queuedSamples = 0;
  }
}

registerProcessor("audio-processor", VoicePasteProcessor);