Compare commits

..

5 Commits

15 changed files with 876 additions and 178 deletions

2
go.mod
View File

@@ -54,6 +54,8 @@ require (
github.com/vcaesar/keycode v0.10.1 // indirect
github.com/vcaesar/screenshot v0.11.1 // indirect
github.com/vcaesar/tt v0.20.1 // indirect
github.com/vmihailenco/msgpack/v5 v5.4.1 // indirect
github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect
github.com/yusufpapurcu/wmi v1.2.4 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/crypto v0.48.0 // indirect

4
go.sum
View File

@@ -120,6 +120,10 @@ github.com/vcaesar/screenshot v0.11.1 h1:GgPuN89XC4Yh38dLx4quPlSo3YiWWhwIria/j3L
github.com/vcaesar/screenshot v0.11.1/go.mod h1:gJNwHBiP1v1v7i8TQ4yV1XJtcyn2I/OJL7OziVQkwjs=
github.com/vcaesar/tt v0.20.1 h1:D/jUeeVCNbq3ad8M7hhtB3J9x5RZ6I1n1eZ0BJp7M+4=
github.com/vcaesar/tt v0.20.1/go.mod h1:cH2+AwGAJm19Wa6xvEa+0r+sXDJBT0QgNQey6mwqLeU=
github.com/vmihailenco/msgpack/v5 v5.4.1 h1:cQriyiUvjTwOHg8QZaPihLWeRAAVoCpE00IUPn0Bjt8=
github.com/vmihailenco/msgpack/v5 v5.4.1/go.mod h1:GaZTsDaehaPpQVyxrf5mtQlH+pc21PIudVV/E3rRQok=
github.com/vmihailenco/tagparser/v2 v2.0.0 h1:y09buUbR+b5aycVFQs/g70pqKVZNBmxwAhO7/IwNM9g=
github.com/vmihailenco/tagparser/v2 v2.0.0/go.mod h1:Wri+At7QHww0WTrCBeu4J6bNtoV6mEfg5OIWRZA9qds=
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU=

View File

@@ -2,40 +2,45 @@ package ws
import (
"encoding/json"
"fmt"
"log/slog"
"sync"
"time"
"github.com/gofiber/contrib/v3/websocket"
"github.com/gofiber/fiber/v3"
"github.com/vmihailenco/msgpack/v5"
)
// session holds the state for a single WebSocket connection.
const wsReadTimeout = 75 * time.Second
type session struct {
conn *websocket.Conn
log *slog.Logger
resultCh chan ServerMsg
previewMu sync.Mutex
previewText string
sendAudio func([]byte)
cleanup func()
active bool
conn *websocket.Conn
log *slog.Logger
resultCh chan ServerMsg
stateMu sync.Mutex
writeMu sync.Mutex
previewMu sync.Mutex
state string
sessionID string
seq int64
lastAudioSeq uint64
previewText string
sendAudio func([]byte)
cleanup func()
}
// PasteFunc is called when the server should paste text into the focused app.
type PasteFunc func(text string) error
// ASRFactory creates an ASR session. It returns a channel that receives
// partial/final results, and a function to send audio frames.
// The cleanup function must be called when the session ends.
type ASRFactory func(resultCh chan<- ServerMsg) (sendAudio func(pcm []byte), cleanup func(), err error)
// Handler holds dependencies for WebSocket connections.
type Handler struct {
token string
pasteFunc PasteFunc
asrFactory ASRFactory
}
// NewHandler creates a WS handler with the given dependencies.
func NewHandler(token string, pasteFn PasteFunc, asrFn ASRFactory) *Handler {
return &Handler{
token: token,
@@ -44,150 +49,350 @@ func NewHandler(token string, pasteFn PasteFunc, asrFn ASRFactory) *Handler {
}
}
// Register adds the /ws route to the Fiber app.
func (h *Handler) Register(app *fiber.App) {
// Token check middleware (before upgrade)
app.Use("/ws", func(c fiber.Ctx) error {
if !websocket.IsWebSocketUpgrade(c) {
return fiber.ErrUpgradeRequired
}
if h.token != "" {
q := c.Query("token")
if q != h.token {
return c.Status(fiber.StatusUnauthorized).SendString("invalid token")
}
}
return c.Next()
})
app.Use("/ws", func(c fiber.Ctx) error {
if !websocket.IsWebSocketUpgrade(c) {
return fiber.ErrUpgradeRequired
}
if h.token != "" {
q := c.Query("token")
if q != h.token {
return c.Status(fiber.StatusUnauthorized).SendString("invalid token")
}
}
return c.Next()
})
app.Get("/ws", websocket.New(h.handleConn))
app.Get("/ws", websocket.New(h.handleConn))
}
func (h *Handler) handleConn(c *websocket.Conn) {
sess := &session{
conn: c,
log: slog.With("remote", c.RemoteAddr().String()),
resultCh: make(chan ServerMsg, 32),
conn: c,
log: slog.With("remote", c.RemoteAddr().String()),
resultCh: make(chan ServerMsg, 64),
state: StateIdle,
sessionID: "",
seq: 1,
lastAudioSeq: 0,
}
sess.log.Info("ws connected")
defer sess.log.Info("ws disconnected")
defer close(sess.resultCh)
defer sess.cleanupASR()
defer sess.forceStop()
var wg sync.WaitGroup
wg.Add(1)
go sess.writerLoop(&wg)
defer wg.Wait()
_ = sess.writeJSON(ServerMsg{Type: MsgReady, Message: "ok"})
_ = sess.writeJSON(ServerMsg{Type: MsgState, State: StateIdle})
for {
_ = c.SetReadDeadline(time.Now().Add(wsReadTimeout))
mt, data, err := c.ReadMessage()
if err != nil {
break
}
if mt == websocket.BinaryMessage {
switch mt {
case websocket.BinaryMessage:
sess.handleAudioFrame(data)
} else if mt == websocket.TextMessage {
case websocket.TextMessage:
h.handleTextMessage(sess, data)
}
}
}
func (s *session) writerLoop(wg *sync.WaitGroup) {
defer wg.Done()
for msg := range s.resultCh {
if msg.Type == MsgPartial || msg.Type == MsgFinal {
s.previewMu.Lock()
s.previewText = msg.Text
preview := ServerMsg{Type: msg.Type, Text: s.previewText}
s.previewMu.Unlock()
if err := s.conn.WriteMessage(websocket.TextMessage, preview.Bytes()); err != nil {
s.log.Warn("ws write error", "err", err)
return
}
continue
s.stateMu.Lock()
msg.SessionID = s.sessionID
msg.Seq = s.seq
s.seq++
s.stateMu.Unlock()
}
if err := s.conn.WriteMessage(websocket.TextMessage, msg.Bytes()); err != nil {
if err := s.writeJSON(msg); err != nil {
s.log.Warn("ws write error", "err", err)
return
}
}
}
func (s *session) handleAudioFrame(data []byte) {
if s.active && s.sendAudio != nil {
s.sendAudio(data)
}
func (s *session) writeJSON(msg ServerMsg) error {
s.writeMu.Lock()
defer s.writeMu.Unlock()
return s.conn.WriteMessage(websocket.TextMessage, msg.Bytes())
}
func (s *session) handleAudioFrame(data []byte) {
packet, err := decodeAudioPacket(data)
if err != nil {
s.log.Warn("invalid audio packet", "err", err)
return
}
s.stateMu.Lock()
if s.state != StateRecording || s.sendAudio == nil {
s.stateMu.Unlock()
return
}
if packet.SessionID != s.sessionID {
s.stateMu.Unlock()
return
}
if packet.Seq <= s.lastAudioSeq {
s.stateMu.Unlock()
return
}
lastSeq := s.lastAudioSeq
if lastSeq > 0 && packet.Seq > lastSeq+1 {
s.log.Warn(
"audio seq gap",
"session_id",
s.sessionID,
"last",
lastSeq,
"curr",
packet.Seq,
)
}
s.lastAudioSeq = packet.Seq
sendAudio := s.sendAudio
s.stateMu.Unlock()
sendAudio(packet.PCM)
}
type audioPacket struct {
Version int `msgpack:"v"`
SessionID string `msgpack:"sessionId"`
Seq uint64 `msgpack:"seq"`
PCM []byte `msgpack:"pcm"`
}
func decodeAudioPacket(data []byte) (audioPacket, error) {
var packet audioPacket
if err := msgpack.Unmarshal(data, &packet); err != nil {
return audioPacket{}, fmt.Errorf("msgpack decode failed: %w", err)
}
if packet.Version != 1 {
return audioPacket{}, fmt.Errorf("unsupported version: %d", packet.Version)
}
if packet.SessionID == "" || len(packet.SessionID) > 96 {
return audioPacket{}, fmt.Errorf("invalid session id")
}
if packet.Seq == 0 {
return audioPacket{}, fmt.Errorf("invalid seq")
}
if len(packet.PCM) == 0 || len(packet.PCM)%2 != 0 {
return audioPacket{}, fmt.Errorf("invalid pcm size")
}
return packet, nil
}
func (h *Handler) handleTextMessage(s *session, data []byte) {
var msg ClientMsg
if err := json.Unmarshal(data, &msg); err != nil {
s.log.Warn("invalid json", "err", err)
return
}
switch msg.Type {
case MsgHello:
_ = s.writeJSON(ServerMsg{Type: MsgReady, Message: "ok"})
s.stateMu.Lock()
state := s.state
sid := s.sessionID
s.stateMu.Unlock()
_ = s.writeJSON(ServerMsg{Type: MsgState, State: state, SessionID: sid})
case MsgPing:
_ = s.writeJSON(ServerMsg{Type: MsgPong, TS: msg.TS})
case MsgStart:
h.handleStart(s)
h.handleStart(s, msg)
case MsgStop:
h.handleStop(s)
h.handleStop(s, msg)
case MsgPaste:
h.handlePaste(s, msg.Text)
h.handlePaste(s, msg.Text, msg.SessionID)
default:
h.sendError(s, "bad_message", "unsupported message type", true, "")
}
}
func (h *Handler) handleStart(s *session) {
if s.active {
func (h *Handler) handleStart(s *session, msg ClientMsg) {
if msg.SessionID == "" || len(msg.SessionID) > 96 {
h.sendError(s, "invalid_session", "invalid sessionId", false, "")
return
}
// Future extension: support runtime dynamic hotwords (Phase 2)
// if len(msg.Hotwords) > 0 {
// // Priority: runtime hotwords > config.yaml hotwords
// // Need to modify asrFactory signature to pass msg.Hotwords
// }
s.stateMu.Lock()
if s.state != StateIdle {
current := s.sessionID
s.stateMu.Unlock()
h.sendError(s, "busy", "session is not idle", true, current)
return
}
s.state = StateRecording
s.sessionID = msg.SessionID
s.seq = 1
s.lastAudioSeq = 0
s.sendAudio = nil
s.cleanup = nil
s.stateMu.Unlock()
s.previewMu.Lock()
s.previewText = ""
s.previewMu.Unlock()
sa, cl, err := h.asrFactory(s.resultCh)
if err != nil {
s.log.Error("asr start failed", "err", err)
s.resultCh <- ServerMsg{Type: MsgError, Message: "ASR start failed"}
s.stateMu.Lock()
s.state = StateIdle
s.sessionID = ""
s.sendAudio = nil
s.cleanup = nil
s.stateMu.Unlock()
h.sendError(s, "start_failed", "ASR start failed", true, "")
_ = s.writeJSON(ServerMsg{Type: MsgState, State: StateIdle})
return
}
s.stateMu.Lock()
if s.sessionID != msg.SessionID || s.state != StateRecording {
s.stateMu.Unlock()
cl()
return
}
s.sendAudio = sa
s.cleanup = cl
s.active = true
s.log.Info("recording started")
s.stateMu.Unlock()
_ = s.writeJSON(ServerMsg{Type: MsgStartAck, SessionID: msg.SessionID})
_ = s.writeJSON(ServerMsg{Type: MsgState, State: StateRecording, SessionID: msg.SessionID})
s.log.Info("recording started", "session_id", msg.SessionID)
}
func (h *Handler) handleStop(s *session) {
if !s.active {
func (h *Handler) handleStop(s *session, msg ClientMsg) {
s.stateMu.Lock()
if s.state == StateIdle {
s.stateMu.Unlock()
return
}
s.cleanupASR()
s.sendAudio = nil
s.active = false
if s.state == StateStopping {
sid := s.sessionID
s.stateMu.Unlock()
_ = s.writeJSON(ServerMsg{Type: MsgStopAck, SessionID: sid})
return
}
if msg.SessionID != "" && msg.SessionID != s.sessionID {
current := s.sessionID
s.stateMu.Unlock()
h.sendError(s, "session_mismatch", "stop sessionId mismatch", false, current)
return
}
s.state = StateStopping
sid := s.sessionID
s.stateMu.Unlock()
_ = s.writeJSON(ServerMsg{Type: MsgStopAck, SessionID: sid})
_ = s.writeJSON(ServerMsg{Type: MsgState, State: StateStopping, SessionID: sid})
go h.finalizeStop(s, sid)
}
func (h *Handler) finalizeStop(s *session, sid string) {
cleanup := s.detachCleanup(sid)
if cleanup != nil {
cleanup()
}
s.previewMu.Lock()
finalText := s.previewText
s.previewText = ""
s.previewMu.Unlock()
if finalText != "" && h.pasteFunc != nil {
if err := h.pasteFunc(finalText); err != nil {
s.log.Error("auto-paste failed", "err", err)
h.sendError(s, "paste_failed", "auto-paste failed", true, sid)
} else {
s.resultCh <- ServerMsg{Type: MsgPasted}
_ = s.writeJSON(ServerMsg{Type: MsgPasted, SessionID: sid})
}
}
s.log.Info("recording stopped")
s.stateMu.Lock()
if s.sessionID == sid {
s.state = StateIdle
s.sessionID = ""
s.seq = 1
s.lastAudioSeq = 0
s.sendAudio = nil
s.cleanup = nil
}
s.stateMu.Unlock()
_ = s.writeJSON(ServerMsg{Type: MsgState, State: StateIdle})
s.log.Info("recording stopped", "session_id", sid)
}
func (h *Handler) handlePaste(s *session, text string) {
func (h *Handler) handlePaste(s *session, text, sid string) {
if text == "" {
return
}
if h.pasteFunc != nil {
if err := h.pasteFunc(text); err != nil {
s.log.Error("paste failed", "err", err)
s.resultCh <- ServerMsg{Type: MsgError, Message: "paste failed"}
h.sendError(s, "paste_failed", "paste failed", true, sid)
} else {
s.resultCh <- ServerMsg{Type: MsgPasted}
_ = s.writeJSON(ServerMsg{Type: MsgPasted, SessionID: sid})
}
}
}
func (s *session) cleanupASR() {
if s.cleanup != nil {
s.cleanup()
s.cleanup = nil
func (h *Handler) sendError(s *session, code, message string, retryable bool, sid string) {
err := s.writeJSON(ServerMsg{
Type: MsgError,
Code: code,
Message: message,
Retryable: retryable,
SessionID: sid,
})
if err != nil {
s.log.Warn("send error message failed", "err", err)
}
}
}
func (s *session) detachCleanup(sid string) func() {
s.stateMu.Lock()
defer s.stateMu.Unlock()
if sid != "" && s.sessionID != sid {
return nil
}
cleanup := s.cleanup
s.cleanup = nil
s.sendAudio = nil
return cleanup
}
func (s *session) forceStop() {
cleanup := s.detachCleanup("")
if cleanup != nil {
cleanup()
}
s.stateMu.Lock()
s.state = StateIdle
s.sessionID = ""
s.seq = 1
s.lastAudioSeq = 0
s.stateMu.Unlock()
}

View File

@@ -8,15 +8,22 @@ import "encoding/json"
type MsgType string
const (
MsgStart MsgType = "start" // Begin recording session
MsgStop MsgType = "stop" // End recording session
MsgPaste MsgType = "paste" // Re-paste a history item
MsgHello MsgType = "hello"
MsgStart MsgType = "start"
MsgStop MsgType = "stop"
MsgPaste MsgType = "paste"
MsgPing MsgType = "ping"
MsgPong MsgType = "pong"
)
// ClientMsg is a JSON control message from the phone.
type ClientMsg struct {
Type MsgType `json:"type"`
Text string `json:"text,omitempty"` // Only for "paste"
Type MsgType `json:"type"`
SessionID string `json:"sessionId,omitempty"`
Seq int64 `json:"seq,omitempty"`
Text string `json:"text,omitempty"` // Only for "paste"
Version int `json:"version,omitempty"`
TS int64 `json:"ts,omitempty"`
// Future extension: dynamic hotwords (Phase 2)
// Hotwords []string `json:"hotwords,omitempty"`
}
@@ -24,17 +31,33 @@ type ClientMsg struct {
// ── Server → Client messages ──
const (
MsgPartial MsgType = "partial" // Interim ASR result
MsgFinal MsgType = "final" // Final ASR result
MsgPasted MsgType = "pasted" // Paste confirmed
MsgError MsgType = "error" // Error notification
MsgReady MsgType = "ready"
MsgState MsgType = "state"
MsgStartAck MsgType = "start_ack"
MsgStopAck MsgType = "stop_ack"
MsgPartial MsgType = "partial"
MsgFinal MsgType = "final"
MsgPasted MsgType = "pasted"
MsgError MsgType = "error"
)
const (
StateIdle = "idle"
StateRecording = "recording"
StateStopping = "stopping"
)
// ServerMsg is a JSON message sent to the phone.
type ServerMsg struct {
Type MsgType `json:"type"`
Text string `json:"text,omitempty"`
Message string `json:"message,omitempty"` // For errors
Type MsgType `json:"type"`
State string `json:"state,omitempty"`
SessionID string `json:"sessionId,omitempty"`
Seq int64 `json:"seq,omitempty"`
Text string `json:"text,omitempty"`
Message string `json:"message,omitempty"` // For errors
Code string `json:"code,omitempty"`
Retryable bool `json:"retryable,omitempty"`
TS int64 `json:"ts,omitempty"`
}
func (m ServerMsg) Bytes() []byte {

View File

@@ -5,6 +5,7 @@
"": {
"name": "web",
"dependencies": {
"@msgpack/msgpack": "^3.1.3",
"@picovoice/web-voice-processor": "^4.0.9",
"partysocket": "^1.1.16",
"react": "^19.2.4",
@@ -297,6 +298,8 @@
"@jridgewell/trace-mapping": ["@jridgewell/trace-mapping@0.3.31", "", { "dependencies": { "@jridgewell/resolve-uri": "^3.1.0", "@jridgewell/sourcemap-codec": "^1.4.14" } }, "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw=="],
"@msgpack/msgpack": ["@msgpack/msgpack@3.1.3", "", {}, "sha512-47XIizs9XZXvuJgoaJUIE2lFoID8ugvc0jzSHP+Ptfk8nTbnR8g788wv48N03Kx0UkAv559HWRQ3yzOgzlRNUA=="],
"@picovoice/web-utils": ["@picovoice/web-utils@1.3.1", "", { "dependencies": { "commander": "^9.2.0" }, "bin": { "pvbase64": "scripts/base64.js" } }, "sha512-jcDqdULtTm+yJrnHDjg64hARup+Z4wNkYuXHNx6EM8+qZkweBq9UA6XJrHAlUkPnlkso4JWjaIKhz3x8vZcd3g=="],
"@picovoice/web-voice-processor": ["@picovoice/web-voice-processor@4.0.9", "", { "dependencies": { "@picovoice/web-utils": "=1.3.1" } }, "sha512-20pdkFjtuiojAdLIkNHXt4YgpRnlUePFW+gfkeCb+J+2XTRDGOI50+aJzL95p6QjDzGXsO7PZhlz7yDofOvZtg=="],

View File

@@ -22,6 +22,7 @@
"vite-plugin-pwa": "^1.2.0"
},
"dependencies": {
"@msgpack/msgpack": "^3.1.3",
"@picovoice/web-voice-processor": "^4.0.9",
"partysocket": "^1.1.16",
"react": "^19.2.4",

View File

@@ -1,3 +1,4 @@
import { useEffect } from "react";
import { Toaster } from "sonner";
import { HistoryList } from "./components/HistoryList";
import { MicButton } from "./components/MicButton";
@@ -5,13 +6,36 @@ import { PreviewBox } from "./components/PreviewBox";
import { StatusBadge } from "./components/StatusBadge";
import { useRecorder } from "./hooks/useRecorder";
import { useWebSocket } from "./hooks/useWebSocket";
import { useAppStore } from "./stores/app-store";
export function App() {
const { sendJSON, sendBinary } = useWebSocket();
const { startRecording, stopRecording } = useRecorder({
sendJSON,
sendBinary,
const { requestStart, sendStop, sendPaste, sendAudioFrame } = useWebSocket();
const { prewarm, startRecording, stopRecording } = useRecorder({
requestStart,
sendStop,
sendAudioFrame,
});
const micReady = useAppStore((s) => s.micReady);
useEffect(() => {
const forceStopOnBackground = () => {
const state = useAppStore.getState();
if (state.recording || state.pendingStart) {
stopRecording();
}
};
const onVisibility = () => {
if (document.hidden) forceStopOnBackground();
};
document.addEventListener("visibilitychange", onVisibility);
window.addEventListener("pagehide", forceStopOnBackground);
return () => {
document.removeEventListener("visibilitychange", onVisibility);
window.removeEventListener("pagehide", forceStopOnBackground);
};
}, [stopRecording]);
return (
<>
@@ -24,8 +48,17 @@ export function App() {
</header>
<PreviewBox />
{!micReady ? (
<button
type="button"
onClick={() => void prewarm()}
className="mb-3 rounded-xl border border-edge bg-surface px-4 py-3 font-medium text-fg text-sm"
>
{"先点我准备麦克风(首次会弹权限)"}
</button>
) : null}
<MicButton onStart={startRecording} onStop={stopRecording} />
<HistoryList sendJSON={sendJSON} />
<HistoryList sendPaste={sendPaste} />
</div>
<Toaster theme="dark" position="bottom-center" />
</>

View File

@@ -8,19 +8,19 @@ function formatTime(ts: number): string {
}
interface HistoryListProps {
sendJSON: (obj: Record<string, unknown>) => void;
sendPaste: (text: string) => void;
}
export function HistoryList({ sendJSON }: HistoryListProps) {
export function HistoryList({ sendPaste }: HistoryListProps) {
const history = useAppStore((s) => s.history);
const clearHistory = useAppStore((s) => s.clearHistory);
const handleItemClick = useCallback(
(text: string) => {
sendJSON({ type: "paste", text });
sendPaste(text);
toast.info("发送粘贴…");
},
[sendJSON],
[sendPaste],
);
return (

View File

@@ -8,9 +8,13 @@ interface MicButtonProps {
export function MicButton({ onStart, onStop }: MicButtonProps) {
const connected = useAppStore((s) => s.connectionStatus === "connected");
const micReady = useAppStore((s) => s.micReady);
const recording = useAppStore((s) => s.recording);
const pendingStart = useAppStore((s) => s.pendingStart);
const isActive = recording || pendingStart;
const stopping = useAppStore((s) => s.stopping);
const weakNetwork = useAppStore((s) => s.weakNetwork);
const isActive = recording || pendingStart || stopping;
const disabled = !connected || !micReady || stopping;
const handlePointerDown = useCallback(
(e: React.PointerEvent<HTMLButtonElement>) => {
@@ -46,6 +50,8 @@ export function MicButton({ onStart, onStop }: MicButtonProps) {
"cursor-not-allowed border-edge bg-linear-to-br from-surface-hover to-surface text-fg-secondary opacity-30 shadow-[0_2px_12px_rgba(0,0,0,0.3),inset_0_1px_0_rgba(255,255,255,0.04)]";
const activeClasses =
"animate-mic-breathe scale-[1.06] border-accent-hover bg-accent text-white shadow-[0_0_32px_rgba(99,102,241,0.35),0_0_80px_rgba(99,102,241,0.2)]";
const weakClasses =
"border-amber-400 bg-linear-to-br from-amber-400/15 to-surface text-amber-200 shadow-[0_0_22px_rgba(251,191,36,0.28)]";
const idleClasses =
"border-edge bg-linear-to-br from-surface-hover to-surface text-fg-secondary shadow-[0_2px_12px_rgba(0,0,0,0.3),inset_0_1px_0_rgba(255,255,255,0.04)]";
@@ -54,13 +60,15 @@ export function MicButton({ onStart, onStop }: MicButtonProps) {
<div className="relative flex touch-none items-center justify-center">
<button
type="button"
disabled={!connected}
disabled={disabled}
className={`relative z-1 flex size-24 cursor-pointer touch-none select-none items-center justify-center rounded-full border-2 transition-all duration-[250ms] ease-[cubic-bezier(0.4,0,0.2,1)] ${
!connected
disabled
? disabledClasses
: isActive
? activeClasses
: idleClasses
: weakNetwork
? weakClasses
: isActive
? activeClasses
: idleClasses
}`}
onPointerDown={handlePointerDown}
onPointerUp={handlePointerUp}
@@ -98,7 +106,17 @@ export function MicButton({ onStart, onStop }: MicButtonProps) {
))}
</div>
</div>
<p className="font-medium text-fg-dim text-sm">{"按住说话"}</p>
<p className="font-medium text-fg-dim text-sm">
{!micReady
? "请先准备麦克风"
: !connected
? "连接中断,等待重连"
: stopping
? "收尾中…"
: weakNetwork
? "网络波动,已启用缓冲"
: "按住说话"}
</p>
</section>
);
}

View File

@@ -3,7 +3,11 @@ import { useAppStore } from "../stores/app-store";
export function PreviewBox() {
const text = useAppStore((s) => s.previewText);
const active = useAppStore((s) => s.previewActive);
const weakNetwork = useAppStore((s) => s.weakNetwork);
const recording = useAppStore((s) => s.recording);
const hasText = text.length > 0;
const placeholder =
weakNetwork && recording ? "网络波动中,音频缓冲后发送…" : "按住说话…";
return (
<section className="shrink-0 pb-3">
@@ -17,7 +21,7 @@ export function PreviewBox() {
<p
className={`break-words text-base leading-relaxed ${hasText ? "" : "text-fg-dim"}`}
>
{hasText ? text : "按住说话…"}
{hasText ? text : placeholder}
</p>
</div>
</section>

View File

@@ -20,7 +20,12 @@ const statusConfig = {
export function StatusBadge() {
const status = useAppStore((s) => s.connectionStatus);
const { text, dotClass, borderClass } = statusConfig[status];
const weakNetwork = useAppStore((s) => s.weakNetwork);
const { dotClass, borderClass } = statusConfig[status];
const text =
status === "connected" && weakNetwork
? "网络波动"
: statusConfig[status].text;
return (
<div

View File

@@ -3,71 +3,116 @@ import { useCallback, useRef } from "react";
import { toast } from "sonner";
import { useAppStore } from "../stores/app-store";
/**
* ~200ms frames at 16kHz = 3200 samples.
* Doubao bigmodel_async recommends 200ms packets for optimal performance.
*/
const FRAME_LENGTH = 3200;
interface UseRecorderOptions {
sendJSON: (obj: Record<string, unknown>) => void;
sendBinary: (data: Int16Array) => void;
requestStart: () => Promise<string | null>;
sendStop: (sessionId: string | null) => void;
sendAudioFrame: (sessionId: string, seq: number, data: Int16Array) => boolean;
}
export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
let optionsInitialized = false;
export function useRecorder({
requestStart,
sendStop,
sendAudioFrame,
}: UseRecorderOptions) {
const abortRef = useRef<AbortController | null>(null);
const engineRef = useRef<{ onmessage: (e: MessageEvent) => void } | null>(
null,
);
const warmedRef = useRef(false);
const audioSeqRef = useRef(1);
// Keep stable refs so callbacks never go stale
const sendJSONRef = useRef(sendJSON);
const sendBinaryRef = useRef(sendBinary);
sendJSONRef.current = sendJSON;
sendBinaryRef.current = sendBinary;
const initOptions = useCallback(() => {
if (optionsInitialized) return;
WebVoiceProcessor.setOptions({
frameLength: FRAME_LENGTH,
outputSampleRate: 16000,
});
optionsInitialized = true;
}, []);
const prewarm = useCallback(async (): Promise<boolean> => {
if (warmedRef.current) return true;
initOptions();
const warmEngine = {
onmessage: () => {},
};
try {
await WebVoiceProcessor.subscribe(warmEngine);
await WebVoiceProcessor.unsubscribe(warmEngine);
warmedRef.current = true;
useAppStore.getState().setMicReady(true);
return true;
} catch (err) {
const error = err as Error;
toast.error(`麦克风准备失败: ${error.message}`);
return false;
}
}, [initOptions]);
const startRecording = useCallback(async () => {
const store = useAppStore.getState();
if (store.recording || store.pendingStart) return;
if (store.recording || store.pendingStart || store.stopping) return;
store.setPendingStart(true);
store.setStopping(false);
const abort = new AbortController();
abortRef.current = abort;
try {
// Create an engine that receives Int16Array @ 16kHz from WebVoiceProcessor
const warmed = await prewarm();
if (!warmed) {
store.setPendingStart(false);
abortRef.current = null;
return;
}
const sessionId = await requestStart();
if (!sessionId) {
store.setPendingStart(false);
abortRef.current = null;
toast.error("启动会话失败,请重试");
return;
}
const engine = {
onmessage: (e: MessageEvent) => {
if (e.data.command === "process") {
sendBinaryRef.current(e.data.inputFrame as Int16Array);
}
if (e.data.command !== "process") return;
const seq = audioSeqRef.current;
audioSeqRef.current += 1;
sendAudioFrame(sessionId, seq, e.data.inputFrame as Int16Array);
},
};
engineRef.current = engine;
audioSeqRef.current = 1;
WebVoiceProcessor.setOptions({
frameLength: FRAME_LENGTH,
outputSampleRate: 16000,
});
// subscribe() handles getUserMedia + AudioContext lifecycle internally.
// It checks for closed/suspended AudioContext and re-creates as needed.
await WebVoiceProcessor.subscribe(engine);
if (abort.signal.aborted) {
await WebVoiceProcessor.unsubscribe(engine);
engineRef.current = null;
sendStop(sessionId);
store.setPendingStart(false);
abortRef.current = null;
return;
}
store.setActiveSessionId(sessionId);
store.setPendingStart(false);
abortRef.current = null;
store.setRecording(true);
sendJSONRef.current({ type: "start" });
store.setStopping(false);
store.clearPreview();
abortRef.current = null;
} catch (err) {
useAppStore.getState().setPendingStart(false);
store.setPendingStart(false);
store.setRecording(false);
store.setStopping(false);
abortRef.current = null;
engineRef.current = null;
@@ -86,7 +131,7 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
toast.error(`麦克风错误: ${error.message}`);
}
}
}, []);
}, [prewarm, requestStart, sendAudioFrame, sendStop]);
const stopRecording = useCallback(() => {
const store = useAppStore.getState();
@@ -100,15 +145,16 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
if (!store.recording) return;
store.setRecording(false);
store.setStopping(true);
audioSeqRef.current = 1;
if (engineRef.current) {
// Fire-and-forget: state is already updated, cleanup is async
WebVoiceProcessor.unsubscribe(engineRef.current);
engineRef.current = null;
}
sendJSONRef.current({ type: "stop" });
}, []);
sendStop(store.activeSessionId);
}, [sendStop]);
return { startRecording, stopRecording };
return { prewarm, startRecording, stopRecording };
}

View File

@@ -1,8 +1,31 @@
import { encode } from "@msgpack/msgpack";
import { WebSocket as ReconnectingWebSocket } from "partysocket";
import { useCallback, useEffect, useRef } from "react";
import { toast } from "sonner";
import type { ClientMsg, ServerMsg } from "../protocol";
import { useAppStore } from "../stores/app-store";
const HEARTBEAT_INTERVAL_MS = 15000;
const HEARTBEAT_TIMEOUT_MS = 10000;
const START_TIMEOUT_MS = 5000;
const AUDIO_PACKET_VERSION = 1;
const MAX_AUDIO_QUEUE = 6;
const WEAK_NETWORK_TOAST_INTERVAL_MS = 4000;
const WS_BACKPRESSURE_BYTES = 128 * 1024;
interface QueuedAudioPacket {
sessionId: string;
seq: number;
buffer: ArrayBuffer;
}
interface AudioPacketPayload {
v: number;
sessionId: string;
seq: number;
pcm: Uint8Array;
}
function getWsUrl(): string {
const proto = location.protocol === "https:" ? "wss:" : "ws:";
const params = new URLSearchParams(location.search);
@@ -11,75 +34,355 @@ function getWsUrl(): string {
return `${proto}//${location.host}/ws${q}`;
}
function createSessionId(): string {
if (typeof crypto !== "undefined" && "randomUUID" in crypto) {
return crypto.randomUUID();
}
return `${Date.now()}-${Math.random().toString(16).slice(2)}`;
}
function encodeAudioPacket(
sessionId: string,
seq: number,
pcm: Int16Array,
): ArrayBuffer {
const sidBytes = new TextEncoder().encode(sessionId);
if (sidBytes.length === 0 || sidBytes.length > 96) {
throw new Error("invalid sessionId length");
}
const pcmBytes = new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength);
const payload: AudioPacketPayload = {
v: AUDIO_PACKET_VERSION,
sessionId,
seq,
pcm: pcmBytes,
};
const packed = encode(payload);
return packed.buffer.slice(
packed.byteOffset,
packed.byteOffset + packed.byteLength,
);
}
export function useWebSocket() {
const wsRef = useRef<ReconnectingWebSocket | null>(null);
const heartbeatTimerRef = useRef<number | null>(null);
const heartbeatTimeoutRef = useRef<number | null>(null);
const pendingStartRef = useRef<{
sessionId: string;
resolve: (sessionId: string | null) => void;
timer: number;
} | null>(null);
const audioQueueRef = useRef<QueuedAudioPacket[]>([]);
const lastWeakToastAtRef = useRef(0);
const sendJSON = useCallback((obj: Record<string, unknown>) => {
const ws = wsRef.current;
if (ws?.readyState === WebSocket.OPEN) {
ws.send(JSON.stringify(obj));
const clearHeartbeat = useCallback(() => {
if (heartbeatTimerRef.current !== null) {
window.clearInterval(heartbeatTimerRef.current);
heartbeatTimerRef.current = null;
}
if (heartbeatTimeoutRef.current !== null) {
window.clearTimeout(heartbeatTimeoutRef.current);
heartbeatTimeoutRef.current = null;
}
}, []);
const sendBinary = useCallback((data: Int16Array) => {
const sendControl = useCallback((msg: ClientMsg): boolean => {
const ws = wsRef.current;
if (ws?.readyState === WebSocket.OPEN) {
ws.send(data.buffer as ArrayBuffer);
ws.send(JSON.stringify(msg));
return true;
}
return false;
}, []);
const notifyWeakNetwork = useCallback(() => {
const now = Date.now();
if (now - lastWeakToastAtRef.current < WEAK_NETWORK_TOAST_INTERVAL_MS) {
return;
}
lastWeakToastAtRef.current = now;
toast.warning("网络波动,正在缓冲音频…");
}, []);
const flushAudioQueue = useCallback(() => {
const ws = wsRef.current;
if (ws?.readyState !== WebSocket.OPEN) return;
const activeSessionId = useAppStore.getState().activeSessionId;
if (!activeSessionId) {
audioQueueRef.current = [];
useAppStore.getState().setWeakNetwork(false);
return;
}
const remaining: QueuedAudioPacket[] = [];
for (const packet of audioQueueRef.current) {
if (packet.sessionId !== activeSessionId) {
continue;
}
if (
ws.readyState !== WebSocket.OPEN ||
ws.bufferedAmount > WS_BACKPRESSURE_BYTES
) {
remaining.push(packet);
continue;
}
ws.send(packet.buffer);
}
audioQueueRef.current = remaining;
if (audioQueueRef.current.length === 0) {
useAppStore.getState().setWeakNetwork(false);
}
}, []);
const sendAudioFrame = useCallback(
(sessionId: string, seq: number, pcm: Int16Array): boolean => {
const store = useAppStore.getState();
if (!store.recording || store.activeSessionId !== sessionId) {
return false;
}
let buffer: ArrayBuffer;
try {
buffer = encodeAudioPacket(sessionId, seq, pcm);
} catch {
return false;
}
const ws = wsRef.current;
if (
ws?.readyState === WebSocket.OPEN &&
ws.bufferedAmount <= WS_BACKPRESSURE_BYTES
) {
ws.send(buffer);
if (audioQueueRef.current.length === 0) {
useAppStore.getState().setWeakNetwork(false);
}
return true;
}
if (audioQueueRef.current.length >= MAX_AUDIO_QUEUE) {
audioQueueRef.current.shift();
}
audioQueueRef.current.push({ sessionId, seq, buffer });
useAppStore.getState().setWeakNetwork(true);
notifyWeakNetwork();
return false;
},
[notifyWeakNetwork],
);
const resolvePendingStart = useCallback((sessionId: string | null) => {
const pending = pendingStartRef.current;
if (!pending) return;
window.clearTimeout(pending.timer);
pending.resolve(sessionId);
pendingStartRef.current = null;
}, []);
const handleServerMessage = useCallback(
(raw: string) => {
let msg: ServerMsg;
try {
msg = JSON.parse(raw) as ServerMsg;
} catch {
return;
}
const store = useAppStore.getState();
switch (msg.type) {
case "ready":
break;
case "state":
if (msg.state === "idle") {
store.setRecording(false);
store.setStopping(false);
store.setPendingStart(false);
store.setActiveSessionId(null);
}
if (msg.state === "stopping") {
store.setStopping(true);
}
break;
case "start_ack":
if (
pendingStartRef.current &&
msg.sessionId === pendingStartRef.current.sessionId
) {
resolvePendingStart(msg.sessionId ?? null);
}
break;
case "stop_ack":
store.setStopping(true);
break;
case "partial": {
const activeSessionId = store.activeSessionId;
if (!activeSessionId || msg.sessionId !== activeSessionId) break;
store.setPreview(msg.text || "", false);
break;
}
case "final": {
const activeSessionId = store.activeSessionId;
if (!activeSessionId || msg.sessionId !== activeSessionId) break;
store.setPreview(msg.text || "", true);
if (msg.text) store.addHistory(msg.text);
break;
}
case "pasted":
toast.success("已粘贴");
break;
case "error":
store.setRecording(false);
store.setPendingStart(false);
store.setStopping(false);
resolvePendingStart(null);
toast.error(msg.message || "服务错误");
break;
case "pong":
if (heartbeatTimeoutRef.current !== null) {
window.clearTimeout(heartbeatTimeoutRef.current);
heartbeatTimeoutRef.current = null;
}
break;
}
},
[resolvePendingStart],
);
const startHeartbeat = useCallback(() => {
clearHeartbeat();
heartbeatTimerRef.current = window.setInterval(() => {
if (document.hidden) return;
const now = Date.now();
if (!sendControl({ type: "ping", ts: now })) return;
if (heartbeatTimeoutRef.current !== null) {
window.clearTimeout(heartbeatTimeoutRef.current);
}
heartbeatTimeoutRef.current = window.setTimeout(() => {
wsRef.current?.close();
}, HEARTBEAT_TIMEOUT_MS);
}, HEARTBEAT_INTERVAL_MS);
}, [clearHeartbeat, sendControl]);
useEffect(() => {
useAppStore.getState().setConnectionStatus("connecting");
const ws = new ReconnectingWebSocket(getWsUrl(), undefined, {
minReconnectionDelay: 1000,
maxReconnectionDelay: 16000,
minReconnectionDelay: 400,
maxReconnectionDelay: 6000,
});
ws.binaryType = "arraybuffer";
wsRef.current = ws;
useAppStore.getState().setConnectionStatus("connecting");
ws.onopen = () => {
useAppStore.getState().setConnectionStatus("connected");
const store = useAppStore.getState();
store.setConnectionStatus("connected");
store.setWeakNetwork(false);
sendControl({ type: "hello", version: 2 });
startHeartbeat();
flushAudioQueue();
};
ws.onmessage = (e: MessageEvent) => {
if (typeof e.data !== "string") return;
try {
const msg = JSON.parse(e.data);
const store = useAppStore.getState();
switch (msg.type) {
case "partial":
store.setPreview(msg.text || "", false);
break;
case "final":
store.setPreview(msg.text || "", true);
if (msg.text) store.addHistory(msg.text);
break;
case "pasted":
toast.success("已粘贴");
break;
case "error":
toast.error(msg.message || "错误");
break;
}
} catch {
// Ignore malformed messages
if (typeof e.data === "string") {
handleServerMessage(e.data);
}
};
ws.onclose = () => {
clearHeartbeat();
resolvePendingStart(null);
const store = useAppStore.getState();
if (store.recording || store.pendingStart) {
toast.warning("连接中断,本次录音已结束");
}
store.setConnectionStatus("disconnected");
if (store.recording) store.setRecording(false);
if (store.pendingStart) store.setPendingStart(false);
store.setWeakNetwork(store.recording || audioQueueRef.current.length > 0);
store.setRecording(false);
store.setPendingStart(false);
store.setStopping(false);
store.setActiveSessionId(null);
audioQueueRef.current = [];
};
wsRef.current = ws;
ws.onerror = () => {
ws.close();
};
const onVisibilityChange = () => {
if (!document.hidden && wsRef.current?.readyState !== WebSocket.OPEN) {
wsRef.current?.close();
}
};
document.addEventListener("visibilitychange", onVisibilityChange);
return () => {
ws.close();
document.removeEventListener("visibilitychange", onVisibilityChange);
clearHeartbeat();
resolvePendingStart(null);
wsRef.current?.close();
wsRef.current = null;
};
}, []);
}, [
clearHeartbeat,
flushAudioQueue,
handleServerMessage,
resolvePendingStart,
sendControl,
startHeartbeat,
]);
return { sendJSON, sendBinary };
const requestStart = useCallback((): Promise<string | null> => {
const sessionId = createSessionId();
return new Promise<string | null>((resolve) => {
if (pendingStartRef.current) {
resolve(null);
return;
}
const timer = window.setTimeout(() => {
if (pendingStartRef.current?.sessionId === sessionId) {
pendingStartRef.current = null;
resolve(null);
}
}, START_TIMEOUT_MS);
pendingStartRef.current = { sessionId, resolve, timer };
const ok = sendControl({ type: "start", sessionId, version: 2 });
if (!ok) {
resolvePendingStart(null);
}
});
}, [resolvePendingStart, sendControl]);
const sendStop = useCallback(
(sessionId: string | null) => {
if (!sessionId) return;
audioQueueRef.current = audioQueueRef.current.filter(
(packet) => packet.sessionId !== sessionId,
);
if (audioQueueRef.current.length === 0) {
useAppStore.getState().setWeakNetwork(false);
}
sendControl({ type: "stop", sessionId });
},
[sendControl],
);
const sendPaste = useCallback(
(text: string) => {
const store = useAppStore.getState();
if (store.connectionStatus !== "connected") {
toast.warning("当前未连接,无法发送粘贴");
return;
}
const sessionId = useAppStore.getState().activeSessionId ?? undefined;
sendControl({ type: "paste", text, sessionId });
},
[sendControl],
);
return { requestStart, sendStop, sendPaste, sendAudioFrame };
}

35
web/src/protocol.ts Normal file
View File

@@ -0,0 +1,35 @@
export type ClientMsgType = "hello" | "start" | "stop" | "paste" | "ping";
export type ServerMsgType =
| "ready"
| "state"
| "start_ack"
| "stop_ack"
| "partial"
| "final"
| "pasted"
| "error"
| "pong";
export type SessionState = "idle" | "recording" | "stopping";
export interface ClientMsg {
type: ClientMsgType;
sessionId?: string;
seq?: number;
text?: string;
version?: number;
ts?: number;
}
export interface ServerMsg {
type: ServerMsgType;
state?: SessionState;
sessionId?: string;
seq?: number;
text?: string;
message?: string;
code?: string;
retryable?: boolean;
ts?: number;
}

View File

@@ -13,9 +13,13 @@ type ConnectionStatus = "connected" | "disconnected" | "connecting";
interface AppState {
// Connection
connectionStatus: ConnectionStatus;
weakNetwork: boolean;
// Recording
recording: boolean;
pendingStart: boolean;
stopping: boolean;
micReady: boolean;
activeSessionId: string | null;
// Preview
previewText: string;
previewActive: boolean;
@@ -24,8 +28,12 @@ interface AppState {
// Actions
setConnectionStatus: (status: ConnectionStatus) => void;
setWeakNetwork: (weak: boolean) => void;
setRecording: (recording: boolean) => void;
setPendingStart: (pending: boolean) => void;
setStopping: (stopping: boolean) => void;
setMicReady: (ready: boolean) => void;
setActiveSessionId: (sessionId: string | null) => void;
setPreview: (text: string, isFinal: boolean) => void;
clearPreview: () => void;
addHistory: (text: string) => void;
@@ -36,15 +44,23 @@ export const useAppStore = create<AppState>()(
persist(
(set, get) => ({
connectionStatus: "connecting",
weakNetwork: false,
recording: false,
pendingStart: false,
stopping: false,
micReady: false,
activeSessionId: null,
previewText: "",
previewActive: false,
history: [],
setConnectionStatus: (connectionStatus) => set({ connectionStatus }),
setWeakNetwork: (weakNetwork) => set({ weakNetwork }),
setRecording: (recording) => set({ recording }),
setPendingStart: (pendingStart) => set({ pendingStart }),
setStopping: (stopping) => set({ stopping }),
setMicReady: (micReady) => set({ micReady }),
setActiveSessionId: (activeSessionId) => set({ activeSessionId }),
setPreview: (text, isFinal) =>
set({