refactor: 使用 @picovoice/web-voice-processor 替换手写音频采集管线
- 引入 WebVoiceProcessor 处理 getUserMedia、AudioContext 生命周期和 WASM 重采样
- 删除自定义 AudioWorklet (audio-processor.ts) 和线性插值重采样器 (resample.ts)
- 改善音频采集稳定性:自动检测 AudioContext suspended/closed 状态并重建
- 更精确的错误提示:区分权限拒绝、设备未找到、设备异常
This commit is contained in:
11
AGENTS.md
11
AGENTS.md
@@ -84,17 +84,12 @@ web/
|
||||
app-store.ts # Zustand store: connection, recording, preview, history, toast
|
||||
hooks/
|
||||
useWebSocket.ts # WS client hook: connect, reconnect, message dispatch
|
||||
useRecorder.ts # Audio pipeline hook: getUserMedia, AudioWorklet, resample
|
||||
useRecorder.ts # Audio pipeline hook: WebVoiceProcessor (16kHz Int16 PCM capture)
|
||||
components/
|
||||
StatusBadge.tsx # Connection status indicator
|
||||
PreviewBox.tsx # Real-time transcription preview
|
||||
MicButton.tsx # Push-to-talk button with animations
|
||||
HistoryList.tsx # Transcription history with re-send
|
||||
Toast.tsx # Auto-dismiss toast notifications
|
||||
lib/
|
||||
resample.ts # Linear interpolation resampler (native rate → 16kHz Int16)
|
||||
workers/
|
||||
audio-processor.ts # AudioWorklet: PCM capture, 200ms frame accumulation
|
||||
```
|
||||
|
||||
## Code Style — Go
|
||||
@@ -158,8 +153,8 @@ Per-connection loggers via `slog.With("remote", addr)`.
|
||||
- Custom hooks for imperative APIs: `useWebSocket`, `useRecorder`
|
||||
- Zustand `getState()` in hooks/callbacks to avoid stale closures
|
||||
- Pointer Events for touch/mouse (not touch + mouse separately)
|
||||
- AudioWorklet for audio capture (not MediaRecorder)
|
||||
- `?worker&url` Vite import for AudioWorklet files
|
||||
- @picovoice/web-voice-processor for audio capture (16kHz Int16 PCM, WASM resampling)
|
||||
- WebVoiceProcessor handles getUserMedia, AudioContext lifecycle, cross-browser compat
|
||||
- WebSocket: binary for audio frames, JSON text for control messages
|
||||
- Tailwind CSS v4 with `@theme` design tokens; minimal custom CSS (keyframes only)
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
"": {
|
||||
"name": "web",
|
||||
"dependencies": {
|
||||
"@picovoice/web-voice-processor": "^4.0.9",
|
||||
"partysocket": "^1.1.16",
|
||||
"react": "^19.2.4",
|
||||
"react-dom": "^19.2.4",
|
||||
@@ -296,6 +297,10 @@
|
||||
|
||||
"@jridgewell/trace-mapping": ["@jridgewell/trace-mapping@0.3.31", "", { "dependencies": { "@jridgewell/resolve-uri": "^3.1.0", "@jridgewell/sourcemap-codec": "^1.4.14" } }, "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw=="],
|
||||
|
||||
"@picovoice/web-utils": ["@picovoice/web-utils@1.3.1", "", { "dependencies": { "commander": "^9.2.0" }, "bin": { "pvbase64": "scripts/base64.js" } }, "sha512-jcDqdULtTm+yJrnHDjg64hARup+Z4wNkYuXHNx6EM8+qZkweBq9UA6XJrHAlUkPnlkso4JWjaIKhz3x8vZcd3g=="],
|
||||
|
||||
"@picovoice/web-voice-processor": ["@picovoice/web-voice-processor@4.0.9", "", { "dependencies": { "@picovoice/web-utils": "=1.3.1" } }, "sha512-20pdkFjtuiojAdLIkNHXt4YgpRnlUePFW+gfkeCb+J+2XTRDGOI50+aJzL95p6QjDzGXsO7PZhlz7yDofOvZtg=="],
|
||||
|
||||
"@rolldown/pluginutils": ["@rolldown/pluginutils@1.0.0-rc.3", "", {}, "sha512-eybk3TjzzzV97Dlj5c+XrBFW57eTNhzod66y9HrBlzJ6NsCrWCp/2kaPS3K9wJmurBC0Tdw4yPjXKZqlznim3Q=="],
|
||||
|
||||
"@rollup/plugin-babel": ["@rollup/plugin-babel@5.3.1", "", { "dependencies": { "@babel/helper-module-imports": "^7.10.4", "@rollup/pluginutils": "^3.1.0" }, "peerDependencies": { "@babel/core": "^7.0.0", "@types/babel__core": "^7.1.9", "rollup": "^1.20.0||^2.0.0" }, "optionalPeers": ["@types/babel__core"] }, "sha512-WFfdLWU/xVWKeRQnKmIAQULUI7Il0gZnBIH/ZFO069wYIfPu+8zrfp/KMW0atmELoRDq8FbiP3VCss9MhCut7Q=="],
|
||||
@@ -918,6 +923,8 @@
|
||||
|
||||
"zustand": ["zustand@5.0.11", "", { "peerDependencies": { "@types/react": ">=18.0.0", "immer": ">=9.0.6", "react": ">=18.0.0", "use-sync-external-store": ">=1.2.0" }, "optionalPeers": ["@types/react", "immer", "react", "use-sync-external-store"] }, "sha512-fdZY+dk7zn/vbWNCYmzZULHRrss0jx5pPFiOuMZ/5HJN6Yv3u+1Wswy/4MpZEkEGhtNH+pwxZB8OKgUBPzYAGg=="],
|
||||
|
||||
"@picovoice/web-utils/commander": ["commander@9.5.0", "", {}, "sha512-KRs7WVDKg86PWiuAqhDrAQnTXZKraVcCc6vFdL14qrZ/DcWwuRo7VoiYXalXO7S5GKpqYiVEwCbgFDfxNHKJBQ=="],
|
||||
|
||||
"@rollup/plugin-babel/rollup": ["rollup@2.80.0", "", { "optionalDependencies": { "fsevents": "~2.3.2" }, "bin": { "rollup": "dist/bin/rollup" } }, "sha512-cIFJOD1DESzpjOBl763Kp1AH7UE/0fcdHe6rZXUdQ9c50uvgigvW97u3IcSeBwOkgqL/PXPBktBCh0KEu5L8XQ=="],
|
||||
|
||||
"@rollup/plugin-node-resolve/@rollup/pluginutils": ["@rollup/pluginutils@5.3.0", "", { "dependencies": { "@types/estree": "^1.0.0", "estree-walker": "^2.0.2", "picomatch": "^4.0.2" }, "peerDependencies": { "rollup": "^1.20.0||^2.0.0||^3.0.0||^4.0.0" }, "optionalPeers": ["rollup"] }, "sha512-5EdhGZtnu3V88ces7s53hhfK5KSASnJZv8Lulpc04cWO3REESroJXg73DFsOmgbU2BhwV0E20bu2IDZb3VKW4Q=="],
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
"vite-plugin-pwa": "^1.2.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"@picovoice/web-voice-processor": "^4.0.9",
|
||||
"partysocket": "^1.1.16",
|
||||
"react": "^19.2.4",
|
||||
"react-dom": "^19.2.4",
|
||||
|
||||
@@ -1,8 +1,13 @@
|
||||
import { WebVoiceProcessor } from "@picovoice/web-voice-processor";
|
||||
import { useCallback, useRef } from "react";
|
||||
import { toast } from "sonner";
|
||||
import { resampleTo16kInt16 } from "../lib/resample";
|
||||
import { useAppStore } from "../stores/app-store";
|
||||
import audioProcessorUrl from "../workers/audio-processor.ts?worker&url";
|
||||
|
||||
/**
|
||||
* ~200ms frames at 16kHz = 3200 samples.
|
||||
* Doubao bigmodel_async recommends 200ms packets for optimal performance.
|
||||
*/
|
||||
const FRAME_LENGTH = 3200;
|
||||
|
||||
interface UseRecorderOptions {
|
||||
sendJSON: (obj: Record<string, unknown>) => void;
|
||||
@@ -10,10 +15,10 @@ interface UseRecorderOptions {
|
||||
}
|
||||
|
||||
export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
|
||||
const audioCtxRef = useRef<AudioContext | null>(null);
|
||||
const workletRef = useRef<AudioWorkletNode | null>(null);
|
||||
const streamRef = useRef<MediaStream | null>(null);
|
||||
const abortRef = useRef<AbortController | null>(null);
|
||||
const engineRef = useRef<{ onmessage: (e: MessageEvent) => void } | null>(
|
||||
null,
|
||||
);
|
||||
|
||||
// Keep stable refs so callbacks never go stale
|
||||
const sendJSONRef = useRef(sendJSON);
|
||||
@@ -21,16 +26,6 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
|
||||
sendJSONRef.current = sendJSON;
|
||||
sendBinaryRef.current = sendBinary;
|
||||
|
||||
const initAudio = useCallback(async () => {
|
||||
if (audioCtxRef.current) return;
|
||||
// Use device native sample rate — we resample to 16kHz in software
|
||||
const ctx = new AudioContext();
|
||||
// Chrome requires resume() after user gesture
|
||||
if (ctx.state === "suspended") await ctx.resume();
|
||||
await ctx.audioWorklet.addModule(audioProcessorUrl);
|
||||
audioCtxRef.current = ctx;
|
||||
}, []);
|
||||
|
||||
const startRecording = useCallback(async () => {
|
||||
const store = useAppStore.getState();
|
||||
if (store.recording || store.pendingStart) return;
|
||||
@@ -40,48 +35,32 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
|
||||
abortRef.current = abort;
|
||||
|
||||
try {
|
||||
await initAudio();
|
||||
if (abort.signal.aborted) {
|
||||
store.setPendingStart(false);
|
||||
return;
|
||||
}
|
||||
|
||||
const ctx = audioCtxRef.current as AudioContext;
|
||||
if (ctx.state === "suspended") await ctx.resume();
|
||||
if (abort.signal.aborted) {
|
||||
store.setPendingStart(false);
|
||||
return;
|
||||
}
|
||||
|
||||
const stream = await navigator.mediaDevices.getUserMedia({
|
||||
audio: {
|
||||
echoCancellation: true,
|
||||
noiseSuppression: true,
|
||||
channelCount: 1,
|
||||
// Create an engine that receives Int16Array @ 16kHz from WebVoiceProcessor
|
||||
const engine = {
|
||||
onmessage: (e: MessageEvent) => {
|
||||
if (e.data.command === "process") {
|
||||
sendBinaryRef.current(e.data.inputFrame as Int16Array);
|
||||
}
|
||||
},
|
||||
};
|
||||
engineRef.current = engine;
|
||||
|
||||
WebVoiceProcessor.setOptions({
|
||||
frameLength: FRAME_LENGTH,
|
||||
outputSampleRate: 16000,
|
||||
});
|
||||
|
||||
// subscribe() handles getUserMedia + AudioContext lifecycle internally.
|
||||
// It checks for closed/suspended AudioContext and re-creates as needed.
|
||||
await WebVoiceProcessor.subscribe(engine);
|
||||
|
||||
if (abort.signal.aborted) {
|
||||
stream.getTracks().forEach((t) => {
|
||||
t.stop();
|
||||
});
|
||||
await WebVoiceProcessor.unsubscribe(engine);
|
||||
engineRef.current = null;
|
||||
store.setPendingStart(false);
|
||||
return;
|
||||
}
|
||||
|
||||
streamRef.current = stream;
|
||||
const source = ctx.createMediaStreamSource(stream);
|
||||
const worklet = new AudioWorkletNode(ctx, "audio-processor");
|
||||
worklet.port.onmessage = (e: MessageEvent) => {
|
||||
if (e.data.type === "audio") {
|
||||
sendBinaryRef.current(
|
||||
resampleTo16kInt16(e.data.samples, e.data.sampleRate),
|
||||
);
|
||||
}
|
||||
};
|
||||
source.connect(worklet);
|
||||
worklet.port.postMessage({ command: "start" });
|
||||
workletRef.current = worklet;
|
||||
|
||||
store.setPendingStart(false);
|
||||
abortRef.current = null;
|
||||
store.setRecording(true);
|
||||
@@ -90,9 +69,24 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
|
||||
} catch (err) {
|
||||
useAppStore.getState().setPendingStart(false);
|
||||
abortRef.current = null;
|
||||
toast.error(`麦克风错误: ${(err as Error).message}`);
|
||||
engineRef.current = null;
|
||||
|
||||
const error = err as Error;
|
||||
switch (error.name) {
|
||||
case "PermissionError":
|
||||
toast.error("麦克风权限被拒绝");
|
||||
break;
|
||||
case "DeviceMissingError":
|
||||
toast.error("未找到麦克风设备");
|
||||
break;
|
||||
case "DeviceReadError":
|
||||
toast.error("麦克风设备异常,请检查连接");
|
||||
break;
|
||||
default:
|
||||
toast.error(`麦克风错误: ${error.message}`);
|
||||
}
|
||||
}
|
||||
}, [initAudio]);
|
||||
}, []);
|
||||
|
||||
const stopRecording = useCallback(() => {
|
||||
const store = useAppStore.getState();
|
||||
@@ -107,16 +101,10 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
|
||||
if (!store.recording) return;
|
||||
store.setRecording(false);
|
||||
|
||||
if (workletRef.current) {
|
||||
workletRef.current.port.postMessage({ command: "stop" });
|
||||
workletRef.current.disconnect();
|
||||
workletRef.current = null;
|
||||
}
|
||||
if (streamRef.current) {
|
||||
streamRef.current.getTracks().forEach((t) => {
|
||||
t.stop();
|
||||
});
|
||||
streamRef.current = null;
|
||||
if (engineRef.current) {
|
||||
// Fire-and-forget: state is already updated, cleanup is async
|
||||
WebVoiceProcessor.unsubscribe(engineRef.current);
|
||||
engineRef.current = null;
|
||||
}
|
||||
|
||||
sendJSONRef.current({ type: "stop" });
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
/**
|
||||
* Linear interpolation resampler: native sample rate -> 16kHz 16-bit mono PCM.
|
||||
*/
|
||||
const TARGET_SAMPLE_RATE = 16000;

/**
 * Converts mono Float32 PCM at an arbitrary sample rate to 16 kHz signed
 * 16-bit PCM using linear interpolation between neighbouring input samples.
 *
 * No low-pass filtering is applied before decimation, and each call is
 * independent (no state is carried across frame boundaries).
 *
 * @param float32 - Input samples, nominally in the range [-1, 1]. Values
 *   outside that range saturate at the Int16 limits.
 * @param srcRate - Sample rate of `float32` in Hz. Must be finite and > 0.
 * @returns A new Int16Array of `floor(float32.length * 16000 / srcRate)`
 *   samples; empty when the input is empty.
 * @throws RangeError if `srcRate` is not a finite positive number.
 */
export function resampleTo16kInt16(
  float32: Float32Array,
  srcRate: number,
): Int16Array {
  // Fail with a clear message instead of the opaque "Invalid typed array
  // length" that a zero/negative rate would otherwise trigger below.
  if (!Number.isFinite(srcRate) || srcRate <= 0) {
    throw new RangeError(
      `srcRate must be a finite positive number, got ${srcRate}`,
    );
  }

  const ratio = srcRate / TARGET_SAMPLE_RATE;
  const outLen = Math.floor(float32.length / ratio);
  const out = new Int16Array(outLen);
  const lastIdx = float32.length - 1;

  for (let i = 0; i < outLen; i++) {
    const srcIdx = i * ratio;
    // Clamp both taps to the last valid index so rounding error in
    // `i * ratio` can never read past the end of the input.
    const lo = Math.min(Math.floor(srcIdx), lastIdx);
    const hi = Math.min(lo + 1, lastIdx);
    const frac = srcIdx - lo;
    const a = float32[lo] ?? 0;
    const b = float32[hi] ?? 0;
    const sample = a + frac * (b - a);
    // Scale to the Int16 range, then clamp so out-of-range input saturates
    // instead of wrapping around.
    out[i] = Math.max(-32768, Math.min(32767, Math.round(sample * 32767)));
  }
  return out;
}
|
||||
@@ -1,88 +0,0 @@
|
||||
/**
|
||||
* AudioWorklet processor for VoicePaste.
|
||||
*
|
||||
* Captures raw Float32 PCM from the microphone, accumulates samples into
|
||||
* ~200ms frames, and posts them to the main thread for resampling + WS send.
|
||||
*
|
||||
* Communication:
|
||||
* Main → Processor: { command: "start" | "stop" }
|
||||
* Processor → Main: { type: "audio", samples: Float32Array, sampleRate: number }
|
||||
*/
|
||||
|
||||
// AudioWorkletGlobalScope globals (not in standard lib)
|
||||
declare const sampleRate: number;
|
||||
declare class AudioWorkletProcessor {
|
||||
readonly port: MessagePort;
|
||||
constructor();
|
||||
process(
|
||||
inputs: Float32Array[][],
|
||||
outputs: Float32Array[][],
|
||||
parameters: Record<string, Float32Array>,
|
||||
): boolean;
|
||||
}
|
||||
declare function registerProcessor(
|
||||
name: string,
|
||||
ctor: new () => AudioWorkletProcessor,
|
||||
): void;
|
||||
|
||||
/**
 * AudioWorklet processor that captures the first channel of its first input
 * as Float32 PCM, accumulates roughly 200 ms of samples, and posts each
 * accumulated frame to the main thread.
 *
 * Messages accepted on `port`:
 *   { command: "start" } - reset the buffer and begin capturing.
 *   { command: "stop" }  - flush any buffered samples and stop capturing.
 *
 * Messages posted on `port`:
 *   { type: "audio", samples: Float32Array, sampleRate: number }
 *
 * After "stop" has been handled, `process()` returns false so the node is no
 * longer forced to stay alive once it is disconnected. Callers should treat a
 * stopped processor as single-use and create a new node for the next
 * session. NOTE(review): whether a later "start" resumes capture on the same
 * node is engine-dependent - confirm before relying on it.
 */
class VoicePasteProcessor extends AudioWorkletProcessor {
  private recording = false;
  // True once "stop" has been handled and no "start" has followed.
  private finished = false;
  private buffer: Float32Array[] = [];
  private bufferLen = 0;
  private readonly frameSize: number;

  constructor() {
    super();
    // ~200ms worth of samples at current sample rate
    this.frameSize = Math.floor(sampleRate * 0.2);

    this.port.onmessage = (e: MessageEvent) => {
      if (e.data.command === "start") {
        this.recording = true;
        this.finished = false;
        this.buffer = [];
        this.bufferLen = 0;
      } else if (e.data.command === "stop") {
        // Deliver the trailing partial frame before going idle.
        if (this.bufferLen > 0) {
          this.flush();
        }
        this.recording = false;
        this.finished = true;
      }
    };
  }

  /**
   * Called by the audio engine once per render quantum.
   *
   * @param inputs - Input buffers; only `inputs[0][0]` (first input, first
   *   channel) is read.
   * @returns `true` to keep the processor alive while idle-before-start or
   *   recording; `false` once the processor has been stopped.
   */
  process(inputs: Float32Array[][]): boolean {
    // Before "start": stay alive and wait. After "stop": allow teardown.
    if (!this.recording) return !this.finished;

    const input = inputs[0];
    if (!input || !input[0]) return true;

    const channelData = input[0];
    // Copy: the engine may reuse the underlying buffer on the next call.
    this.buffer.push(new Float32Array(channelData));
    this.bufferLen += channelData.length;

    if (this.bufferLen >= this.frameSize) {
      this.flush();
    }

    return true;
  }

  /** Concatenates buffered chunks, posts them as one frame, and resets the buffer. */
  private flush(): void {
    const merged = new Float32Array(this.bufferLen);
    let offset = 0;
    for (const chunk of this.buffer) {
      merged.set(chunk, offset);
      offset += chunk.length;
    }

    // Transfer the backing buffer to avoid copying it across threads.
    this.port.postMessage(
      { type: "audio", samples: merged, sampleRate: sampleRate },
      [merged.buffer],
    );

    this.buffer = [];
    this.bufferLen = 0;
  }
}
|
||||
|
||||
registerProcessor("audio-processor", VoicePasteProcessor);
|
||||
Reference in New Issue
Block a user