From 669bfac7221846be2d154c7804deab0ea966f1a6 Mon Sep 17 00:00:00 2001
From: imbytecat <imbytecat@gmail.com>
Date: Mon, 2 Mar 2026 07:42:45 +0800
Subject: [PATCH] =?UTF-8?q?refactor:=20=E4=BD=BF=E7=94=A8=20@picovoice/web?=
 =?UTF-8?q?-voice-processor=20=E6=9B=BF=E6=8D=A2=E6=89=8B=E5=86=99?=
 =?UTF-8?q?=E9=9F=B3=E9=A2=91=E9=87=87=E9=9B=86=E7=AE=A1=E7=BA=BF?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 引入 WebVoiceProcessor 处理 getUserMedia、AudioContext 生命周期和 WASM 重采样
- 删除自定义 AudioWorklet (audio-processor.ts) 和线性插值重采样器 (resample.ts)
- 改善音频采集稳定性：自动检测 AudioContext suspended/closed 状态并重建
- 更精确的错误提示：区分权限拒绝、设备未找到、设备异常
---
 AGENTS.md                          |  11 +--
 web/bun.lock                       |   7 ++
 web/package.json                   |   1 +
 web/src/hooks/useRecorder.ts       | 112 +++++++++++++----------------
 web/src/lib/resample.ts            |  23 ------
 web/src/workers/audio-processor.ts |  88 -----------------------
 6 files changed, 61 insertions(+), 181 deletions(-)
 delete mode 100644 web/src/lib/resample.ts
 delete mode 100644 web/src/workers/audio-processor.ts

diff --git a/AGENTS.md b/AGENTS.md
index 9669154..7526cce 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -84,17 +84,12 @@ web/
       app-store.ts           # Zustand store: connection, recording, preview, history, toast
     hooks/
       useWebSocket.ts        # WS client hook: connect, reconnect, message dispatch
-      useRecorder.ts         # Audio pipeline hook: getUserMedia, AudioWorklet, resample
+      useRecorder.ts         # Audio pipeline hook: WebVoiceProcessor (16kHz Int16 PCM capture)
     components/
       StatusBadge.tsx         # Connection status indicator
       PreviewBox.tsx          # Real-time transcription preview
       MicButton.tsx           # Push-to-talk button with animations
       HistoryList.tsx         # Transcription history with re-send
-      Toast.tsx               # Auto-dismiss toast notifications
-    lib/
-      resample.ts             # Linear interpolation resampler (native rate → 16kHz Int16)
-    workers/
-      audio-processor.ts      # AudioWorklet: PCM capture, 200ms frame accumulation
 ```
 
 ## Code Style — Go
@@ -158,8 +153,8 @@ Per-connection loggers via `slog.With("remote", addr)`.
 - Custom hooks for imperative APIs: `useWebSocket`, `useRecorder`
 - Zustand `getState()` in hooks/callbacks to avoid stale closures
 - Pointer Events for touch/mouse (not touch + mouse separately)
-- AudioWorklet for audio capture (not MediaRecorder)
-- `?worker&url` Vite import for AudioWorklet files
+- @picovoice/web-voice-processor for audio capture (16kHz Int16 PCM, WASM resampling)
+- WebVoiceProcessor handles getUserMedia, AudioContext lifecycle, cross-browser compat
 - WebSocket: binary for audio frames, JSON text for control messages
 - Tailwind CSS v4 with `@theme` design tokens; minimal custom CSS (keyframes only)
 
diff --git a/web/bun.lock b/web/bun.lock
index 9273a7e..bafeff6 100644
--- a/web/bun.lock
+++ b/web/bun.lock
@@ -5,6 +5,7 @@
     "": {
       "name": "web",
       "dependencies": {
+        "@picovoice/web-voice-processor": "^4.0.9",
         "partysocket": "^1.1.16",
         "react": "^19.2.4",
         "react-dom": "^19.2.4",
@@ -296,6 +297,10 @@
 
     "@jridgewell/trace-mapping": ["@jridgewell/trace-mapping@0.3.31", "", { "dependencies": { "@jridgewell/resolve-uri": "^3.1.0", "@jridgewell/sourcemap-codec": "^1.4.14" } }, "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw=="],
 
+    "@picovoice/web-utils": ["@picovoice/web-utils@1.3.1", "", { "dependencies": { "commander": "^9.2.0" }, "bin": { "pvbase64": "scripts/base64.js" } }, "sha512-jcDqdULtTm+yJrnHDjg64hARup+Z4wNkYuXHNx6EM8+qZkweBq9UA6XJrHAlUkPnlkso4JWjaIKhz3x8vZcd3g=="],
+
+    "@picovoice/web-voice-processor": ["@picovoice/web-voice-processor@4.0.9", "", { "dependencies": { "@picovoice/web-utils": "=1.3.1" } }, "sha512-20pdkFjtuiojAdLIkNHXt4YgpRnlUePFW+gfkeCb+J+2XTRDGOI50+aJzL95p6QjDzGXsO7PZhlz7yDofOvZtg=="],
+
     "@rolldown/pluginutils": ["@rolldown/pluginutils@1.0.0-rc.3", "", {}, "sha512-eybk3TjzzzV97Dlj5c+XrBFW57eTNhzod66y9HrBlzJ6NsCrWCp/2kaPS3K9wJmurBC0Tdw4yPjXKZqlznim3Q=="],
 
     "@rollup/plugin-babel": ["@rollup/plugin-babel@5.3.1", "", { "dependencies": { "@babel/helper-module-imports": "^7.10.4", "@rollup/pluginutils": "^3.1.0" }, "peerDependencies": { "@babel/core": "^7.0.0", "@types/babel__core": "^7.1.9", "rollup": "^1.20.0||^2.0.0" }, "optionalPeers": ["@types/babel__core"] }, "sha512-WFfdLWU/xVWKeRQnKmIAQULUI7Il0gZnBIH/ZFO069wYIfPu+8zrfp/KMW0atmELoRDq8FbiP3VCss9MhCut7Q=="],
@@ -918,6 +923,8 @@
 
     "zustand": ["zustand@5.0.11", "", { "peerDependencies": { "@types/react": ">=18.0.0", "immer": ">=9.0.6", "react": ">=18.0.0", "use-sync-external-store": ">=1.2.0" }, "optionalPeers": ["@types/react", "immer", "react", "use-sync-external-store"] }, "sha512-fdZY+dk7zn/vbWNCYmzZULHRrss0jx5pPFiOuMZ/5HJN6Yv3u+1Wswy/4MpZEkEGhtNH+pwxZB8OKgUBPzYAGg=="],
 
+    "@picovoice/web-utils/commander": ["commander@9.5.0", "", {}, "sha512-KRs7WVDKg86PWiuAqhDrAQnTXZKraVcCc6vFdL14qrZ/DcWwuRo7VoiYXalXO7S5GKpqYiVEwCbgFDfxNHKJBQ=="],
+
     "@rollup/plugin-babel/rollup": ["rollup@2.80.0", "", { "optionalDependencies": { "fsevents": "~2.3.2" }, "bin": { "rollup": "dist/bin/rollup" } }, "sha512-cIFJOD1DESzpjOBl763Kp1AH7UE/0fcdHe6rZXUdQ9c50uvgigvW97u3IcSeBwOkgqL/PXPBktBCh0KEu5L8XQ=="],
 
     "@rollup/plugin-node-resolve/@rollup/pluginutils": ["@rollup/pluginutils@5.3.0", "", { "dependencies": { "@types/estree": "^1.0.0", "estree-walker": "^2.0.2", "picomatch": "^4.0.2" }, "peerDependencies": { "rollup": "^1.20.0||^2.0.0||^3.0.0||^4.0.0" }, "optionalPeers": ["rollup"] }, "sha512-5EdhGZtnu3V88ces7s53hhfK5KSASnJZv8Lulpc04cWO3REESroJXg73DFsOmgbU2BhwV0E20bu2IDZb3VKW4Q=="],
diff --git a/web/package.json b/web/package.json
index 87674d4..734e807 100644
--- a/web/package.json
+++ b/web/package.json
@@ -22,6 +22,7 @@
 		"vite-plugin-pwa": "^1.2.0"
 	},
 	"dependencies": {
+		"@picovoice/web-voice-processor": "^4.0.9",
 		"partysocket": "^1.1.16",
 		"react": "^19.2.4",
 		"react-dom": "^19.2.4",
diff --git a/web/src/hooks/useRecorder.ts b/web/src/hooks/useRecorder.ts
index fd7cf8d..dc32767 100644
--- a/web/src/hooks/useRecorder.ts
+++ b/web/src/hooks/useRecorder.ts
@@ -1,8 +1,13 @@
+import { WebVoiceProcessor } from "@picovoice/web-voice-processor";
 import { useCallback, useRef } from "react";
 import { toast } from "sonner";
-import { resampleTo16kInt16 } from "../lib/resample";
 import { useAppStore } from "../stores/app-store";
-import audioProcessorUrl from "../workers/audio-processor.ts?worker&url";
+
+/**
+ * Frame length in samples: 200 ms at 16 kHz = 3200 samples per frame.
+ * Doubao bigmodel_async recommends 200ms packets for optimal performance.
+ */
+const FRAME_LENGTH = 3200;
 
 interface UseRecorderOptions {
 	sendJSON: (obj: Record<string, unknown>) => void;
@@ -10,10 +15,10 @@ interface UseRecorderOptions {
 }
 
 export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
-	const audioCtxRef = useRef<AudioContext | null>(null);
-	const workletRef = useRef<AudioWorkletNode | null>(null);
-	const streamRef = useRef<MediaStream | null>(null);
 	const abortRef = useRef<AbortController | null>(null);
+	const engineRef = useRef<{ onmessage: (e: MessageEvent) => void } | null>(
+		null,
+	);
 
 	// Keep stable refs so callbacks never go stale
 	const sendJSONRef = useRef(sendJSON);
@@ -21,16 +26,6 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
 	sendJSONRef.current = sendJSON;
 	sendBinaryRef.current = sendBinary;
 
-	const initAudio = useCallback(async () => {
-		if (audioCtxRef.current) return;
-		// Use device native sample rate — we resample to 16kHz in software
-		const ctx = new AudioContext();
-		// Chrome requires resume() after user gesture
-		if (ctx.state === "suspended") await ctx.resume();
-		await ctx.audioWorklet.addModule(audioProcessorUrl);
-		audioCtxRef.current = ctx;
-	}, []);
-
 	const startRecording = useCallback(async () => {
 		const store = useAppStore.getState();
 		if (store.recording || store.pendingStart) return;
@@ -40,48 +35,32 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
 		abortRef.current = abort;
 
 		try {
-			await initAudio();
-			if (abort.signal.aborted) {
-				store.setPendingStart(false);
-				return;
-			}
-
-			const ctx = audioCtxRef.current as AudioContext;
-			if (ctx.state === "suspended") await ctx.resume();
-			if (abort.signal.aborted) {
-				store.setPendingStart(false);
-				return;
-			}
-
-			const stream = await navigator.mediaDevices.getUserMedia({
-				audio: {
-					echoCancellation: true,
-					noiseSuppression: true,
-					channelCount: 1,
+			// Create an engine that receives Int16Array @ 16kHz from WebVoiceProcessor
+			const engine = {
+				onmessage: (e: MessageEvent) => {
+					if (e.data.command === "process") {
+						sendBinaryRef.current(e.data.inputFrame as Int16Array);
+					}
 				},
+			};
+			engineRef.current = engine;
+
+			WebVoiceProcessor.setOptions({
+				frameLength: FRAME_LENGTH,
+				outputSampleRate: 16000,
 			});
+
+			// subscribe() handles getUserMedia + AudioContext lifecycle internally.
+			// It checks for closed/suspended AudioContext and re-creates as needed.
+			await WebVoiceProcessor.subscribe(engine);
+
 			if (abort.signal.aborted) {
-				stream.getTracks().forEach((t) => {
-					t.stop();
-				});
+				await WebVoiceProcessor.unsubscribe(engine);
+				engineRef.current = null;
 				store.setPendingStart(false);
 				return;
 			}
 
-			streamRef.current = stream;
-			const source = ctx.createMediaStreamSource(stream);
-			const worklet = new AudioWorkletNode(ctx, "audio-processor");
-			worklet.port.onmessage = (e: MessageEvent) => {
-				if (e.data.type === "audio") {
-					sendBinaryRef.current(
-						resampleTo16kInt16(e.data.samples, e.data.sampleRate),
-					);
-				}
-			};
-			source.connect(worklet);
-			worklet.port.postMessage({ command: "start" });
-			workletRef.current = worklet;
-
 			store.setPendingStart(false);
 			abortRef.current = null;
 			store.setRecording(true);
@@ -90,9 +69,24 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
 		} catch (err) {
 			useAppStore.getState().setPendingStart(false);
 			abortRef.current = null;
-			toast.error(`麦克风错误: ${(err as Error).message}`);
+			engineRef.current = null;
+
+			const error = err as Error;
+			switch (error.name) {
+				case "PermissionError":
+					toast.error("麦克风权限被拒绝");
+					break;
+				case "DeviceMissingError":
+					toast.error("未找到麦克风设备");
+					break;
+				case "DeviceReadError":
+					toast.error("麦克风设备异常，请检查连接");
+					break;
+				default:
+					toast.error(`麦克风错误: ${error.message}`);
+			}
 		}
-	}, [initAudio]);
+	}, []);
 
 	const stopRecording = useCallback(() => {
 		const store = useAppStore.getState();
@@ -107,16 +101,10 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
 		if (!store.recording) return;
 		store.setRecording(false);
 
-		if (workletRef.current) {
-			workletRef.current.port.postMessage({ command: "stop" });
-			workletRef.current.disconnect();
-			workletRef.current = null;
-		}
-		if (streamRef.current) {
-			streamRef.current.getTracks().forEach((t) => {
-				t.stop();
-			});
-			streamRef.current = null;
+		if (engineRef.current) {
+			// Fire-and-forget: state is already updated; unsubscribe() is async and a rejection is NOT caught here (TODO: add .catch)
+			WebVoiceProcessor.unsubscribe(engineRef.current);
+			engineRef.current = null;
 		}
 
 		sendJSONRef.current({ type: "stop" });
diff --git a/web/src/lib/resample.ts b/web/src/lib/resample.ts
deleted file mode 100644
index 7f127b9..0000000
--- a/web/src/lib/resample.ts
+++ /dev/null
@@ -1,23 +0,0 @@
-/**
- * Linear interpolation resampler: native sample rate -> 16kHz 16-bit mono PCM.
- */
-const TARGET_SAMPLE_RATE = 16000;
-
-export function resampleTo16kInt16(
-	float32: Float32Array,
-	srcRate: number,
-): Int16Array {
-	const ratio = srcRate / TARGET_SAMPLE_RATE;
-	const outLen = Math.floor(float32.length / ratio);
-	const out = new Int16Array(outLen);
-	for (let i = 0; i < outLen; i++) {
-		const srcIdx = i * ratio;
-		const lo = Math.floor(srcIdx);
-		const hi = Math.min(lo + 1, float32.length - 1);
-		const frac = srcIdx - lo;
-		const sample = float32[lo] + frac * (float32[hi] - float32[lo]);
-		// Clamp to [-1, 1] then scale to Int16
-		out[i] = Math.max(-32768, Math.min(32767, Math.round(sample * 32767)));
-	}
-	return out;
-}
diff --git a/web/src/workers/audio-processor.ts b/web/src/workers/audio-processor.ts
deleted file mode 100644
index 3c8f3f4..0000000
--- a/web/src/workers/audio-processor.ts
+++ /dev/null
@@ -1,88 +0,0 @@
-/**
- * AudioWorklet processor for VoicePaste.
- *
- * Captures raw Float32 PCM from the microphone, accumulates samples into
- * ~200ms frames, and posts them to the main thread for resampling + WS send.
- *
- * Communication:
- *   Main → Processor: { command: "start" | "stop" }
- *   Processor → Main: { type: "audio", samples: Float32Array, sampleRate: number }
- */
-
-// AudioWorkletGlobalScope globals (not in standard lib)
-declare const sampleRate: number;
-declare class AudioWorkletProcessor {
-	readonly port: MessagePort;
-	constructor();
-	process(
-		inputs: Float32Array[][],
-		outputs: Float32Array[][],
-		parameters: Record<string, Float32Array>,
-	): boolean;
-}
-declare function registerProcessor(
-	name: string,
-	ctor: new () => AudioWorkletProcessor,
-): void;
-
-class VoicePasteProcessor extends AudioWorkletProcessor {
-	private recording = false;
-	private buffer: Float32Array[] = [];
-	private bufferLen = 0;
-	private readonly frameSize: number;
-
-	constructor() {
-		super();
-		// ~200ms worth of samples at current sample rate
-		this.frameSize = Math.floor(sampleRate * 0.2);
-
-		this.port.onmessage = (e: MessageEvent) => {
-			if (e.data.command === "start") {
-				this.recording = true;
-				this.buffer = [];
-				this.bufferLen = 0;
-			} else if (e.data.command === "stop") {
-				if (this.bufferLen > 0) {
-					this.flush();
-				}
-				this.recording = false;
-			}
-		};
-	}
-
-	process(inputs: Float32Array[][]): boolean {
-		if (!this.recording) return true;
-
-		const input = inputs[0];
-		if (!input || !input[0]) return true;
-
-		const channelData = input[0];
-		this.buffer.push(new Float32Array(channelData));
-		this.bufferLen += channelData.length;
-
-		if (this.bufferLen >= this.frameSize) {
-			this.flush();
-		}
-
-		return true;
-	}
-
-	private flush(): void {
-		const merged = new Float32Array(this.bufferLen);
-		let offset = 0;
-		for (const chunk of this.buffer) {
-			merged.set(chunk, offset);
-			offset += chunk.length;
-		}
-
-		this.port.postMessage(
-			{ type: "audio", samples: merged, sampleRate: sampleRate },
-			[merged.buffer],
-		);
-
-		this.buffer = [];
-		this.bufferLen = 0;
-	}
-}
-
-registerProcessor("audio-processor", VoicePasteProcessor);