Compare commits

...

5 Commits

15 changed files with 876 additions and 178 deletions

2
go.mod
View File

@@ -54,6 +54,8 @@ require (
github.com/vcaesar/keycode v0.10.1 // indirect github.com/vcaesar/keycode v0.10.1 // indirect
github.com/vcaesar/screenshot v0.11.1 // indirect github.com/vcaesar/screenshot v0.11.1 // indirect
github.com/vcaesar/tt v0.20.1 // indirect github.com/vcaesar/tt v0.20.1 // indirect
github.com/vmihailenco/msgpack/v5 v5.4.1 // indirect
github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect
github.com/yusufpapurcu/wmi v1.2.4 // indirect github.com/yusufpapurcu/wmi v1.2.4 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/crypto v0.48.0 // indirect golang.org/x/crypto v0.48.0 // indirect

4
go.sum
View File

@@ -120,6 +120,10 @@ github.com/vcaesar/screenshot v0.11.1 h1:GgPuN89XC4Yh38dLx4quPlSo3YiWWhwIria/j3L
github.com/vcaesar/screenshot v0.11.1/go.mod h1:gJNwHBiP1v1v7i8TQ4yV1XJtcyn2I/OJL7OziVQkwjs= github.com/vcaesar/screenshot v0.11.1/go.mod h1:gJNwHBiP1v1v7i8TQ4yV1XJtcyn2I/OJL7OziVQkwjs=
github.com/vcaesar/tt v0.20.1 h1:D/jUeeVCNbq3ad8M7hhtB3J9x5RZ6I1n1eZ0BJp7M+4= github.com/vcaesar/tt v0.20.1 h1:D/jUeeVCNbq3ad8M7hhtB3J9x5RZ6I1n1eZ0BJp7M+4=
github.com/vcaesar/tt v0.20.1/go.mod h1:cH2+AwGAJm19Wa6xvEa+0r+sXDJBT0QgNQey6mwqLeU= github.com/vcaesar/tt v0.20.1/go.mod h1:cH2+AwGAJm19Wa6xvEa+0r+sXDJBT0QgNQey6mwqLeU=
github.com/vmihailenco/msgpack/v5 v5.4.1 h1:cQriyiUvjTwOHg8QZaPihLWeRAAVoCpE00IUPn0Bjt8=
github.com/vmihailenco/msgpack/v5 v5.4.1/go.mod h1:GaZTsDaehaPpQVyxrf5mtQlH+pc21PIudVV/E3rRQok=
github.com/vmihailenco/tagparser/v2 v2.0.0 h1:y09buUbR+b5aycVFQs/g70pqKVZNBmxwAhO7/IwNM9g=
github.com/vmihailenco/tagparser/v2 v2.0.0/go.mod h1:Wri+At7QHww0WTrCBeu4J6bNtoV6mEfg5OIWRZA9qds=
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU= github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU=

View File

@@ -2,40 +2,45 @@ package ws
import ( import (
"encoding/json" "encoding/json"
"fmt"
"log/slog" "log/slog"
"sync" "sync"
"time"
"github.com/gofiber/contrib/v3/websocket" "github.com/gofiber/contrib/v3/websocket"
"github.com/gofiber/fiber/v3" "github.com/gofiber/fiber/v3"
"github.com/vmihailenco/msgpack/v5"
) )
// session holds the state for a single WebSocket connection. const wsReadTimeout = 75 * time.Second
type session struct { type session struct {
conn *websocket.Conn conn *websocket.Conn
log *slog.Logger log *slog.Logger
resultCh chan ServerMsg resultCh chan ServerMsg
stateMu sync.Mutex
writeMu sync.Mutex
previewMu sync.Mutex previewMu sync.Mutex
state string
sessionID string
seq int64
lastAudioSeq uint64
previewText string previewText string
sendAudio func([]byte) sendAudio func([]byte)
cleanup func() cleanup func()
active bool
} }
// PasteFunc is called when the server should paste text into the focused app.
type PasteFunc func(text string) error type PasteFunc func(text string) error
// ASRFactory creates an ASR session. It returns a channel that receives
// partial/final results, and a function to send audio frames.
// The cleanup function must be called when the session ends.
type ASRFactory func(resultCh chan<- ServerMsg) (sendAudio func(pcm []byte), cleanup func(), err error) type ASRFactory func(resultCh chan<- ServerMsg) (sendAudio func(pcm []byte), cleanup func(), err error)
// Handler holds dependencies for WebSocket connections.
type Handler struct { type Handler struct {
token string token string
pasteFunc PasteFunc pasteFunc PasteFunc
asrFactory ASRFactory asrFactory ASRFactory
} }
// NewHandler creates a WS handler with the given dependencies.
func NewHandler(token string, pasteFn PasteFunc, asrFn ASRFactory) *Handler { func NewHandler(token string, pasteFn PasteFunc, asrFn ASRFactory) *Handler {
return &Handler{ return &Handler{
token: token, token: token,
@@ -44,9 +49,7 @@ func NewHandler(token string, pasteFn PasteFunc, asrFn ASRFactory) *Handler {
} }
} }
// Register adds the /ws route to the Fiber app.
func (h *Handler) Register(app *fiber.App) { func (h *Handler) Register(app *fiber.App) {
// Token check middleware (before upgrade)
app.Use("/ws", func(c fiber.Ctx) error { app.Use("/ws", func(c fiber.Ctx) error {
if !websocket.IsWebSocketUpgrade(c) { if !websocket.IsWebSocketUpgrade(c) {
return fiber.ErrUpgradeRequired return fiber.ErrUpgradeRequired
@@ -67,127 +70,329 @@ func (h *Handler) handleConn(c *websocket.Conn) {
sess := &session{ sess := &session{
conn: c, conn: c,
log: slog.With("remote", c.RemoteAddr().String()), log: slog.With("remote", c.RemoteAddr().String()),
resultCh: make(chan ServerMsg, 32), resultCh: make(chan ServerMsg, 64),
state: StateIdle,
sessionID: "",
seq: 1,
lastAudioSeq: 0,
} }
sess.log.Info("ws connected") sess.log.Info("ws connected")
defer sess.log.Info("ws disconnected") defer sess.log.Info("ws disconnected")
defer close(sess.resultCh) defer close(sess.resultCh)
defer sess.cleanupASR() defer sess.forceStop()
var wg sync.WaitGroup var wg sync.WaitGroup
wg.Add(1) wg.Add(1)
go sess.writerLoop(&wg) go sess.writerLoop(&wg)
defer wg.Wait() defer wg.Wait()
_ = sess.writeJSON(ServerMsg{Type: MsgReady, Message: "ok"})
_ = sess.writeJSON(ServerMsg{Type: MsgState, State: StateIdle})
for { for {
_ = c.SetReadDeadline(time.Now().Add(wsReadTimeout))
mt, data, err := c.ReadMessage() mt, data, err := c.ReadMessage()
if err != nil { if err != nil {
break break
} }
if mt == websocket.BinaryMessage { switch mt {
case websocket.BinaryMessage:
sess.handleAudioFrame(data) sess.handleAudioFrame(data)
} else if mt == websocket.TextMessage { case websocket.TextMessage:
h.handleTextMessage(sess, data) h.handleTextMessage(sess, data)
} }
} }
} }
func (s *session) writerLoop(wg *sync.WaitGroup) { func (s *session) writerLoop(wg *sync.WaitGroup) {
defer wg.Done() defer wg.Done()
for msg := range s.resultCh { for msg := range s.resultCh {
if msg.Type == MsgPartial || msg.Type == MsgFinal { if msg.Type == MsgPartial || msg.Type == MsgFinal {
s.previewMu.Lock() s.previewMu.Lock()
s.previewText = msg.Text s.previewText = msg.Text
preview := ServerMsg{Type: msg.Type, Text: s.previewText}
s.previewMu.Unlock() s.previewMu.Unlock()
if err := s.conn.WriteMessage(websocket.TextMessage, preview.Bytes()); err != nil {
s.log.Warn("ws write error", "err", err) s.stateMu.Lock()
return msg.SessionID = s.sessionID
msg.Seq = s.seq
s.seq++
s.stateMu.Unlock()
} }
continue
} if err := s.writeJSON(msg); err != nil {
if err := s.conn.WriteMessage(websocket.TextMessage, msg.Bytes()); err != nil {
s.log.Warn("ws write error", "err", err) s.log.Warn("ws write error", "err", err)
return return
} }
} }
} }
func (s *session) writeJSON(msg ServerMsg) error {
s.writeMu.Lock()
defer s.writeMu.Unlock()
return s.conn.WriteMessage(websocket.TextMessage, msg.Bytes())
}
func (s *session) handleAudioFrame(data []byte) { func (s *session) handleAudioFrame(data []byte) {
if s.active && s.sendAudio != nil { packet, err := decodeAudioPacket(data)
s.sendAudio(data) if err != nil {
s.log.Warn("invalid audio packet", "err", err)
return
} }
s.stateMu.Lock()
if s.state != StateRecording || s.sendAudio == nil {
s.stateMu.Unlock()
return
}
if packet.SessionID != s.sessionID {
s.stateMu.Unlock()
return
}
if packet.Seq <= s.lastAudioSeq {
s.stateMu.Unlock()
return
}
lastSeq := s.lastAudioSeq
if lastSeq > 0 && packet.Seq > lastSeq+1 {
s.log.Warn(
"audio seq gap",
"session_id",
s.sessionID,
"last",
lastSeq,
"curr",
packet.Seq,
)
}
s.lastAudioSeq = packet.Seq
sendAudio := s.sendAudio
s.stateMu.Unlock()
sendAudio(packet.PCM)
} }
type audioPacket struct {
Version int `msgpack:"v"`
SessionID string `msgpack:"sessionId"`
Seq uint64 `msgpack:"seq"`
PCM []byte `msgpack:"pcm"`
}
func decodeAudioPacket(data []byte) (audioPacket, error) {
var packet audioPacket
if err := msgpack.Unmarshal(data, &packet); err != nil {
return audioPacket{}, fmt.Errorf("msgpack decode failed: %w", err)
}
if packet.Version != 1 {
return audioPacket{}, fmt.Errorf("unsupported version: %d", packet.Version)
}
if packet.SessionID == "" || len(packet.SessionID) > 96 {
return audioPacket{}, fmt.Errorf("invalid session id")
}
if packet.Seq == 0 {
return audioPacket{}, fmt.Errorf("invalid seq")
}
if len(packet.PCM) == 0 || len(packet.PCM)%2 != 0 {
return audioPacket{}, fmt.Errorf("invalid pcm size")
}
return packet, nil
}
func (h *Handler) handleTextMessage(s *session, data []byte) { func (h *Handler) handleTextMessage(s *session, data []byte) {
var msg ClientMsg var msg ClientMsg
if err := json.Unmarshal(data, &msg); err != nil { if err := json.Unmarshal(data, &msg); err != nil {
s.log.Warn("invalid json", "err", err) s.log.Warn("invalid json", "err", err)
return return
} }
switch msg.Type { switch msg.Type {
case MsgHello:
_ = s.writeJSON(ServerMsg{Type: MsgReady, Message: "ok"})
s.stateMu.Lock()
state := s.state
sid := s.sessionID
s.stateMu.Unlock()
_ = s.writeJSON(ServerMsg{Type: MsgState, State: state, SessionID: sid})
case MsgPing:
_ = s.writeJSON(ServerMsg{Type: MsgPong, TS: msg.TS})
case MsgStart: case MsgStart:
h.handleStart(s) h.handleStart(s, msg)
case MsgStop: case MsgStop:
h.handleStop(s) h.handleStop(s, msg)
case MsgPaste: case MsgPaste:
h.handlePaste(s, msg.Text) h.handlePaste(s, msg.Text, msg.SessionID)
default:
h.sendError(s, "bad_message", "unsupported message type", true, "")
} }
} }
func (h *Handler) handleStart(s *session) {
if s.active { func (h *Handler) handleStart(s *session, msg ClientMsg) {
if msg.SessionID == "" || len(msg.SessionID) > 96 {
h.sendError(s, "invalid_session", "invalid sessionId", false, "")
return return
} }
// Future extension: support runtime dynamic hotwords (Phase 2)
// if len(msg.Hotwords) > 0 { s.stateMu.Lock()
// // Priority: runtime hotwords > config.yaml hotwords if s.state != StateIdle {
// // Need to modify asrFactory signature to pass msg.Hotwords current := s.sessionID
// } s.stateMu.Unlock()
h.sendError(s, "busy", "session is not idle", true, current)
return
}
s.state = StateRecording
s.sessionID = msg.SessionID
s.seq = 1
s.lastAudioSeq = 0
s.sendAudio = nil
s.cleanup = nil
s.stateMu.Unlock()
s.previewMu.Lock() s.previewMu.Lock()
s.previewText = "" s.previewText = ""
s.previewMu.Unlock() s.previewMu.Unlock()
sa, cl, err := h.asrFactory(s.resultCh) sa, cl, err := h.asrFactory(s.resultCh)
if err != nil { if err != nil {
s.log.Error("asr start failed", "err", err) s.stateMu.Lock()
s.resultCh <- ServerMsg{Type: MsgError, Message: "ASR start failed"} s.state = StateIdle
s.sessionID = ""
s.sendAudio = nil
s.cleanup = nil
s.stateMu.Unlock()
h.sendError(s, "start_failed", "ASR start failed", true, "")
_ = s.writeJSON(ServerMsg{Type: MsgState, State: StateIdle})
return
}
s.stateMu.Lock()
if s.sessionID != msg.SessionID || s.state != StateRecording {
s.stateMu.Unlock()
cl()
return return
} }
s.sendAudio = sa s.sendAudio = sa
s.cleanup = cl s.cleanup = cl
s.active = true s.stateMu.Unlock()
s.log.Info("recording started")
_ = s.writeJSON(ServerMsg{Type: MsgStartAck, SessionID: msg.SessionID})
_ = s.writeJSON(ServerMsg{Type: MsgState, State: StateRecording, SessionID: msg.SessionID})
s.log.Info("recording started", "session_id", msg.SessionID)
} }
func (h *Handler) handleStop(s *session) {
if !s.active { func (h *Handler) handleStop(s *session, msg ClientMsg) {
s.stateMu.Lock()
if s.state == StateIdle {
s.stateMu.Unlock()
return return
} }
s.cleanupASR() if s.state == StateStopping {
s.sendAudio = nil sid := s.sessionID
s.active = false s.stateMu.Unlock()
_ = s.writeJSON(ServerMsg{Type: MsgStopAck, SessionID: sid})
return
}
if msg.SessionID != "" && msg.SessionID != s.sessionID {
current := s.sessionID
s.stateMu.Unlock()
h.sendError(s, "session_mismatch", "stop sessionId mismatch", false, current)
return
}
s.state = StateStopping
sid := s.sessionID
s.stateMu.Unlock()
_ = s.writeJSON(ServerMsg{Type: MsgStopAck, SessionID: sid})
_ = s.writeJSON(ServerMsg{Type: MsgState, State: StateStopping, SessionID: sid})
go h.finalizeStop(s, sid)
}
func (h *Handler) finalizeStop(s *session, sid string) {
cleanup := s.detachCleanup(sid)
if cleanup != nil {
cleanup()
}
s.previewMu.Lock() s.previewMu.Lock()
finalText := s.previewText finalText := s.previewText
s.previewText = "" s.previewText = ""
s.previewMu.Unlock() s.previewMu.Unlock()
if finalText != "" && h.pasteFunc != nil { if finalText != "" && h.pasteFunc != nil {
if err := h.pasteFunc(finalText); err != nil { if err := h.pasteFunc(finalText); err != nil {
s.log.Error("auto-paste failed", "err", err) h.sendError(s, "paste_failed", "auto-paste failed", true, sid)
} else { } else {
s.resultCh <- ServerMsg{Type: MsgPasted} _ = s.writeJSON(ServerMsg{Type: MsgPasted, SessionID: sid})
} }
} }
s.log.Info("recording stopped")
s.stateMu.Lock()
if s.sessionID == sid {
s.state = StateIdle
s.sessionID = ""
s.seq = 1
s.lastAudioSeq = 0
s.sendAudio = nil
s.cleanup = nil
}
s.stateMu.Unlock()
_ = s.writeJSON(ServerMsg{Type: MsgState, State: StateIdle})
s.log.Info("recording stopped", "session_id", sid)
} }
func (h *Handler) handlePaste(s *session, text string) {
func (h *Handler) handlePaste(s *session, text, sid string) {
if text == "" { if text == "" {
return return
} }
if h.pasteFunc != nil { if h.pasteFunc != nil {
if err := h.pasteFunc(text); err != nil { if err := h.pasteFunc(text); err != nil {
s.log.Error("paste failed", "err", err) h.sendError(s, "paste_failed", "paste failed", true, sid)
s.resultCh <- ServerMsg{Type: MsgError, Message: "paste failed"}
} else { } else {
s.resultCh <- ServerMsg{Type: MsgPasted} _ = s.writeJSON(ServerMsg{Type: MsgPasted, SessionID: sid})
} }
} }
} }
func (s *session) cleanupASR() {
if s.cleanup != nil { func (h *Handler) sendError(s *session, code, message string, retryable bool, sid string) {
s.cleanup() err := s.writeJSON(ServerMsg{
Type: MsgError,
Code: code,
Message: message,
Retryable: retryable,
SessionID: sid,
})
if err != nil {
s.log.Warn("send error message failed", "err", err)
}
}
func (s *session) detachCleanup(sid string) func() {
s.stateMu.Lock()
defer s.stateMu.Unlock()
if sid != "" && s.sessionID != sid {
return nil
}
cleanup := s.cleanup
s.cleanup = nil s.cleanup = nil
} s.sendAudio = nil
return cleanup
}
func (s *session) forceStop() {
cleanup := s.detachCleanup("")
if cleanup != nil {
cleanup()
}
s.stateMu.Lock()
s.state = StateIdle
s.sessionID = ""
s.seq = 1
s.lastAudioSeq = 0
s.stateMu.Unlock()
} }

View File

@@ -8,15 +8,22 @@ import "encoding/json"
type MsgType string type MsgType string
const ( const (
MsgStart MsgType = "start" // Begin recording session MsgHello MsgType = "hello"
MsgStop MsgType = "stop" // End recording session MsgStart MsgType = "start"
MsgPaste MsgType = "paste" // Re-paste a history item MsgStop MsgType = "stop"
MsgPaste MsgType = "paste"
MsgPing MsgType = "ping"
MsgPong MsgType = "pong"
) )
// ClientMsg is a JSON control message from the phone. // ClientMsg is a JSON control message from the phone.
type ClientMsg struct { type ClientMsg struct {
Type MsgType `json:"type"` Type MsgType `json:"type"`
SessionID string `json:"sessionId,omitempty"`
Seq int64 `json:"seq,omitempty"`
Text string `json:"text,omitempty"` // Only for "paste" Text string `json:"text,omitempty"` // Only for "paste"
Version int `json:"version,omitempty"`
TS int64 `json:"ts,omitempty"`
// Future extension: dynamic hotwords (Phase 2) // Future extension: dynamic hotwords (Phase 2)
// Hotwords []string `json:"hotwords,omitempty"` // Hotwords []string `json:"hotwords,omitempty"`
} }
@@ -24,17 +31,33 @@ type ClientMsg struct {
// ── Server → Client messages ── // ── Server → Client messages ──
const ( const (
MsgPartial MsgType = "partial" // Interim ASR result MsgReady MsgType = "ready"
MsgFinal MsgType = "final" // Final ASR result MsgState MsgType = "state"
MsgPasted MsgType = "pasted" // Paste confirmed MsgStartAck MsgType = "start_ack"
MsgError MsgType = "error" // Error notification MsgStopAck MsgType = "stop_ack"
MsgPartial MsgType = "partial"
MsgFinal MsgType = "final"
MsgPasted MsgType = "pasted"
MsgError MsgType = "error"
)
const (
StateIdle = "idle"
StateRecording = "recording"
StateStopping = "stopping"
) )
// ServerMsg is a JSON message sent to the phone. // ServerMsg is a JSON message sent to the phone.
type ServerMsg struct { type ServerMsg struct {
Type MsgType `json:"type"` Type MsgType `json:"type"`
State string `json:"state,omitempty"`
SessionID string `json:"sessionId,omitempty"`
Seq int64 `json:"seq,omitempty"`
Text string `json:"text,omitempty"` Text string `json:"text,omitempty"`
Message string `json:"message,omitempty"` // For errors Message string `json:"message,omitempty"` // For errors
Code string `json:"code,omitempty"`
Retryable bool `json:"retryable,omitempty"`
TS int64 `json:"ts,omitempty"`
} }
func (m ServerMsg) Bytes() []byte { func (m ServerMsg) Bytes() []byte {

View File

@@ -5,6 +5,7 @@
"": { "": {
"name": "web", "name": "web",
"dependencies": { "dependencies": {
"@msgpack/msgpack": "^3.1.3",
"@picovoice/web-voice-processor": "^4.0.9", "@picovoice/web-voice-processor": "^4.0.9",
"partysocket": "^1.1.16", "partysocket": "^1.1.16",
"react": "^19.2.4", "react": "^19.2.4",
@@ -297,6 +298,8 @@
"@jridgewell/trace-mapping": ["@jridgewell/trace-mapping@0.3.31", "", { "dependencies": { "@jridgewell/resolve-uri": "^3.1.0", "@jridgewell/sourcemap-codec": "^1.4.14" } }, "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw=="], "@jridgewell/trace-mapping": ["@jridgewell/trace-mapping@0.3.31", "", { "dependencies": { "@jridgewell/resolve-uri": "^3.1.0", "@jridgewell/sourcemap-codec": "^1.4.14" } }, "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw=="],
"@msgpack/msgpack": ["@msgpack/msgpack@3.1.3", "", {}, "sha512-47XIizs9XZXvuJgoaJUIE2lFoID8ugvc0jzSHP+Ptfk8nTbnR8g788wv48N03Kx0UkAv559HWRQ3yzOgzlRNUA=="],
"@picovoice/web-utils": ["@picovoice/web-utils@1.3.1", "", { "dependencies": { "commander": "^9.2.0" }, "bin": { "pvbase64": "scripts/base64.js" } }, "sha512-jcDqdULtTm+yJrnHDjg64hARup+Z4wNkYuXHNx6EM8+qZkweBq9UA6XJrHAlUkPnlkso4JWjaIKhz3x8vZcd3g=="], "@picovoice/web-utils": ["@picovoice/web-utils@1.3.1", "", { "dependencies": { "commander": "^9.2.0" }, "bin": { "pvbase64": "scripts/base64.js" } }, "sha512-jcDqdULtTm+yJrnHDjg64hARup+Z4wNkYuXHNx6EM8+qZkweBq9UA6XJrHAlUkPnlkso4JWjaIKhz3x8vZcd3g=="],
"@picovoice/web-voice-processor": ["@picovoice/web-voice-processor@4.0.9", "", { "dependencies": { "@picovoice/web-utils": "=1.3.1" } }, "sha512-20pdkFjtuiojAdLIkNHXt4YgpRnlUePFW+gfkeCb+J+2XTRDGOI50+aJzL95p6QjDzGXsO7PZhlz7yDofOvZtg=="], "@picovoice/web-voice-processor": ["@picovoice/web-voice-processor@4.0.9", "", { "dependencies": { "@picovoice/web-utils": "=1.3.1" } }, "sha512-20pdkFjtuiojAdLIkNHXt4YgpRnlUePFW+gfkeCb+J+2XTRDGOI50+aJzL95p6QjDzGXsO7PZhlz7yDofOvZtg=="],

View File

@@ -22,6 +22,7 @@
"vite-plugin-pwa": "^1.2.0" "vite-plugin-pwa": "^1.2.0"
}, },
"dependencies": { "dependencies": {
"@msgpack/msgpack": "^3.1.3",
"@picovoice/web-voice-processor": "^4.0.9", "@picovoice/web-voice-processor": "^4.0.9",
"partysocket": "^1.1.16", "partysocket": "^1.1.16",
"react": "^19.2.4", "react": "^19.2.4",

View File

@@ -1,3 +1,4 @@
import { useEffect } from "react";
import { Toaster } from "sonner"; import { Toaster } from "sonner";
import { HistoryList } from "./components/HistoryList"; import { HistoryList } from "./components/HistoryList";
import { MicButton } from "./components/MicButton"; import { MicButton } from "./components/MicButton";
@@ -5,13 +6,36 @@ import { PreviewBox } from "./components/PreviewBox";
import { StatusBadge } from "./components/StatusBadge"; import { StatusBadge } from "./components/StatusBadge";
import { useRecorder } from "./hooks/useRecorder"; import { useRecorder } from "./hooks/useRecorder";
import { useWebSocket } from "./hooks/useWebSocket"; import { useWebSocket } from "./hooks/useWebSocket";
import { useAppStore } from "./stores/app-store";
export function App() { export function App() {
const { sendJSON, sendBinary } = useWebSocket(); const { requestStart, sendStop, sendPaste, sendAudioFrame } = useWebSocket();
const { startRecording, stopRecording } = useRecorder({ const { prewarm, startRecording, stopRecording } = useRecorder({
sendJSON, requestStart,
sendBinary, sendStop,
sendAudioFrame,
}); });
const micReady = useAppStore((s) => s.micReady);
useEffect(() => {
const forceStopOnBackground = () => {
const state = useAppStore.getState();
if (state.recording || state.pendingStart) {
stopRecording();
}
};
const onVisibility = () => {
if (document.hidden) forceStopOnBackground();
};
document.addEventListener("visibilitychange", onVisibility);
window.addEventListener("pagehide", forceStopOnBackground);
return () => {
document.removeEventListener("visibilitychange", onVisibility);
window.removeEventListener("pagehide", forceStopOnBackground);
};
}, [stopRecording]);
return ( return (
<> <>
@@ -24,8 +48,17 @@ export function App() {
</header> </header>
<PreviewBox /> <PreviewBox />
{!micReady ? (
<button
type="button"
onClick={() => void prewarm()}
className="mb-3 rounded-xl border border-edge bg-surface px-4 py-3 font-medium text-fg text-sm"
>
{"先点我准备麦克风(首次会弹权限)"}
</button>
) : null}
<MicButton onStart={startRecording} onStop={stopRecording} /> <MicButton onStart={startRecording} onStop={stopRecording} />
<HistoryList sendJSON={sendJSON} /> <HistoryList sendPaste={sendPaste} />
</div> </div>
<Toaster theme="dark" position="bottom-center" /> <Toaster theme="dark" position="bottom-center" />
</> </>

View File

@@ -8,19 +8,19 @@ function formatTime(ts: number): string {
} }
interface HistoryListProps { interface HistoryListProps {
sendJSON: (obj: Record<string, unknown>) => void; sendPaste: (text: string) => void;
} }
export function HistoryList({ sendJSON }: HistoryListProps) { export function HistoryList({ sendPaste }: HistoryListProps) {
const history = useAppStore((s) => s.history); const history = useAppStore((s) => s.history);
const clearHistory = useAppStore((s) => s.clearHistory); const clearHistory = useAppStore((s) => s.clearHistory);
const handleItemClick = useCallback( const handleItemClick = useCallback(
(text: string) => { (text: string) => {
sendJSON({ type: "paste", text }); sendPaste(text);
toast.info("发送粘贴…"); toast.info("发送粘贴…");
}, },
[sendJSON], [sendPaste],
); );
return ( return (

View File

@@ -8,9 +8,13 @@ interface MicButtonProps {
export function MicButton({ onStart, onStop }: MicButtonProps) { export function MicButton({ onStart, onStop }: MicButtonProps) {
const connected = useAppStore((s) => s.connectionStatus === "connected"); const connected = useAppStore((s) => s.connectionStatus === "connected");
const micReady = useAppStore((s) => s.micReady);
const recording = useAppStore((s) => s.recording); const recording = useAppStore((s) => s.recording);
const pendingStart = useAppStore((s) => s.pendingStart); const pendingStart = useAppStore((s) => s.pendingStart);
const isActive = recording || pendingStart; const stopping = useAppStore((s) => s.stopping);
const weakNetwork = useAppStore((s) => s.weakNetwork);
const isActive = recording || pendingStart || stopping;
const disabled = !connected || !micReady || stopping;
const handlePointerDown = useCallback( const handlePointerDown = useCallback(
(e: React.PointerEvent<HTMLButtonElement>) => { (e: React.PointerEvent<HTMLButtonElement>) => {
@@ -46,6 +50,8 @@ export function MicButton({ onStart, onStop }: MicButtonProps) {
"cursor-not-allowed border-edge bg-linear-to-br from-surface-hover to-surface text-fg-secondary opacity-30 shadow-[0_2px_12px_rgba(0,0,0,0.3),inset_0_1px_0_rgba(255,255,255,0.04)]"; "cursor-not-allowed border-edge bg-linear-to-br from-surface-hover to-surface text-fg-secondary opacity-30 shadow-[0_2px_12px_rgba(0,0,0,0.3),inset_0_1px_0_rgba(255,255,255,0.04)]";
const activeClasses = const activeClasses =
"animate-mic-breathe scale-[1.06] border-accent-hover bg-accent text-white shadow-[0_0_32px_rgba(99,102,241,0.35),0_0_80px_rgba(99,102,241,0.2)]"; "animate-mic-breathe scale-[1.06] border-accent-hover bg-accent text-white shadow-[0_0_32px_rgba(99,102,241,0.35),0_0_80px_rgba(99,102,241,0.2)]";
const weakClasses =
"border-amber-400 bg-linear-to-br from-amber-400/15 to-surface text-amber-200 shadow-[0_0_22px_rgba(251,191,36,0.28)]";
const idleClasses = const idleClasses =
"border-edge bg-linear-to-br from-surface-hover to-surface text-fg-secondary shadow-[0_2px_12px_rgba(0,0,0,0.3),inset_0_1px_0_rgba(255,255,255,0.04)]"; "border-edge bg-linear-to-br from-surface-hover to-surface text-fg-secondary shadow-[0_2px_12px_rgba(0,0,0,0.3),inset_0_1px_0_rgba(255,255,255,0.04)]";
@@ -54,10 +60,12 @@ export function MicButton({ onStart, onStop }: MicButtonProps) {
<div className="relative flex touch-none items-center justify-center"> <div className="relative flex touch-none items-center justify-center">
<button <button
type="button" type="button"
disabled={!connected} disabled={disabled}
className={`relative z-1 flex size-24 cursor-pointer touch-none select-none items-center justify-center rounded-full border-2 transition-all duration-[250ms] ease-[cubic-bezier(0.4,0,0.2,1)] ${ className={`relative z-1 flex size-24 cursor-pointer touch-none select-none items-center justify-center rounded-full border-2 transition-all duration-[250ms] ease-[cubic-bezier(0.4,0,0.2,1)] ${
!connected disabled
? disabledClasses ? disabledClasses
: weakNetwork
? weakClasses
: isActive : isActive
? activeClasses ? activeClasses
: idleClasses : idleClasses
@@ -98,7 +106,17 @@ export function MicButton({ onStart, onStop }: MicButtonProps) {
))} ))}
</div> </div>
</div> </div>
<p className="font-medium text-fg-dim text-sm">{"按住说话"}</p> <p className="font-medium text-fg-dim text-sm">
{!micReady
? "请先准备麦克风"
: !connected
? "连接中断,等待重连"
: stopping
? "收尾中…"
: weakNetwork
? "网络波动,已启用缓冲"
: "按住说话"}
</p>
</section> </section>
); );
} }

View File

@@ -3,7 +3,11 @@ import { useAppStore } from "../stores/app-store";
export function PreviewBox() { export function PreviewBox() {
const text = useAppStore((s) => s.previewText); const text = useAppStore((s) => s.previewText);
const active = useAppStore((s) => s.previewActive); const active = useAppStore((s) => s.previewActive);
const weakNetwork = useAppStore((s) => s.weakNetwork);
const recording = useAppStore((s) => s.recording);
const hasText = text.length > 0; const hasText = text.length > 0;
const placeholder =
weakNetwork && recording ? "网络波动中,音频缓冲后发送…" : "按住说话…";
return ( return (
<section className="shrink-0 pb-3"> <section className="shrink-0 pb-3">
@@ -17,7 +21,7 @@ export function PreviewBox() {
<p <p
className={`break-words text-base leading-relaxed ${hasText ? "" : "text-fg-dim"}`} className={`break-words text-base leading-relaxed ${hasText ? "" : "text-fg-dim"}`}
> >
{hasText ? text : "按住说话…"} {hasText ? text : placeholder}
</p> </p>
</div> </div>
</section> </section>

View File

@@ -20,7 +20,12 @@ const statusConfig = {
export function StatusBadge() { export function StatusBadge() {
const status = useAppStore((s) => s.connectionStatus); const status = useAppStore((s) => s.connectionStatus);
const { text, dotClass, borderClass } = statusConfig[status]; const weakNetwork = useAppStore((s) => s.weakNetwork);
const { dotClass, borderClass } = statusConfig[status];
const text =
status === "connected" && weakNetwork
? "网络波动"
: statusConfig[status].text;
return ( return (
<div <div

View File

@@ -3,71 +3,116 @@ import { useCallback, useRef } from "react";
import { toast } from "sonner"; import { toast } from "sonner";
import { useAppStore } from "../stores/app-store"; import { useAppStore } from "../stores/app-store";
/**
* ~200ms frames at 16kHz = 3200 samples.
* Doubao bigmodel_async recommends 200ms packets for optimal performance.
*/
const FRAME_LENGTH = 3200; const FRAME_LENGTH = 3200;
interface UseRecorderOptions { interface UseRecorderOptions {
sendJSON: (obj: Record<string, unknown>) => void; requestStart: () => Promise<string | null>;
sendBinary: (data: Int16Array) => void; sendStop: (sessionId: string | null) => void;
sendAudioFrame: (sessionId: string, seq: number, data: Int16Array) => boolean;
} }
export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) { let optionsInitialized = false;
export function useRecorder({
requestStart,
sendStop,
sendAudioFrame,
}: UseRecorderOptions) {
const abortRef = useRef<AbortController | null>(null); const abortRef = useRef<AbortController | null>(null);
const engineRef = useRef<{ onmessage: (e: MessageEvent) => void } | null>( const engineRef = useRef<{ onmessage: (e: MessageEvent) => void } | null>(
null, null,
); );
const warmedRef = useRef(false);
const audioSeqRef = useRef(1);
// Keep stable refs so callbacks never go stale const initOptions = useCallback(() => {
const sendJSONRef = useRef(sendJSON); if (optionsInitialized) return;
const sendBinaryRef = useRef(sendBinary);
sendJSONRef.current = sendJSON;
sendBinaryRef.current = sendBinary;
const startRecording = useCallback(async () => {
const store = useAppStore.getState();
if (store.recording || store.pendingStart) return;
store.setPendingStart(true);
const abort = new AbortController();
abortRef.current = abort;
try {
// Create an engine that receives Int16Array @ 16kHz from WebVoiceProcessor
const engine = {
onmessage: (e: MessageEvent) => {
if (e.data.command === "process") {
sendBinaryRef.current(e.data.inputFrame as Int16Array);
}
},
};
engineRef.current = engine;
WebVoiceProcessor.setOptions({ WebVoiceProcessor.setOptions({
frameLength: FRAME_LENGTH, frameLength: FRAME_LENGTH,
outputSampleRate: 16000, outputSampleRate: 16000,
}); });
optionsInitialized = true;
}, []);
const prewarm = useCallback(async (): Promise<boolean> => {
if (warmedRef.current) return true;
initOptions();
const warmEngine = {
onmessage: () => {},
};
try {
await WebVoiceProcessor.subscribe(warmEngine);
await WebVoiceProcessor.unsubscribe(warmEngine);
warmedRef.current = true;
useAppStore.getState().setMicReady(true);
return true;
} catch (err) {
const error = err as Error;
toast.error(`麦克风准备失败: ${error.message}`);
return false;
}
}, [initOptions]);
const startRecording = useCallback(async () => {
const store = useAppStore.getState();
if (store.recording || store.pendingStart || store.stopping) return;
store.setPendingStart(true);
store.setStopping(false);
const abort = new AbortController();
abortRef.current = abort;
try {
const warmed = await prewarm();
if (!warmed) {
store.setPendingStart(false);
abortRef.current = null;
return;
}
const sessionId = await requestStart();
if (!sessionId) {
store.setPendingStart(false);
abortRef.current = null;
toast.error("启动会话失败,请重试");
return;
}
const engine = {
onmessage: (e: MessageEvent) => {
if (e.data.command !== "process") return;
const seq = audioSeqRef.current;
audioSeqRef.current += 1;
sendAudioFrame(sessionId, seq, e.data.inputFrame as Int16Array);
},
};
engineRef.current = engine;
audioSeqRef.current = 1;
// subscribe() handles getUserMedia + AudioContext lifecycle internally.
// It checks for closed/suspended AudioContext and re-creates as needed.
await WebVoiceProcessor.subscribe(engine); await WebVoiceProcessor.subscribe(engine);
if (abort.signal.aborted) { if (abort.signal.aborted) {
await WebVoiceProcessor.unsubscribe(engine); await WebVoiceProcessor.unsubscribe(engine);
engineRef.current = null; engineRef.current = null;
sendStop(sessionId);
store.setPendingStart(false); store.setPendingStart(false);
abortRef.current = null;
return; return;
} }
store.setActiveSessionId(sessionId);
store.setPendingStart(false); store.setPendingStart(false);
abortRef.current = null;
store.setRecording(true); store.setRecording(true);
sendJSONRef.current({ type: "start" }); store.setStopping(false);
store.clearPreview(); store.clearPreview();
abortRef.current = null;
} catch (err) { } catch (err) {
useAppStore.getState().setPendingStart(false); store.setPendingStart(false);
store.setRecording(false);
store.setStopping(false);
abortRef.current = null; abortRef.current = null;
engineRef.current = null; engineRef.current = null;
@@ -86,7 +131,7 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
toast.error(`麦克风错误: ${error.message}`); toast.error(`麦克风错误: ${error.message}`);
} }
} }
}, []); }, [prewarm, requestStart, sendAudioFrame, sendStop]);
const stopRecording = useCallback(() => { const stopRecording = useCallback(() => {
const store = useAppStore.getState(); const store = useAppStore.getState();
@@ -100,15 +145,16 @@ export function useRecorder({ sendJSON, sendBinary }: UseRecorderOptions) {
if (!store.recording) return; if (!store.recording) return;
store.setRecording(false); store.setRecording(false);
store.setStopping(true);
audioSeqRef.current = 1;
if (engineRef.current) { if (engineRef.current) {
// Fire-and-forget: state is already updated, cleanup is async
WebVoiceProcessor.unsubscribe(engineRef.current); WebVoiceProcessor.unsubscribe(engineRef.current);
engineRef.current = null; engineRef.current = null;
} }
sendJSONRef.current({ type: "stop" }); sendStop(store.activeSessionId);
}, []); }, [sendStop]);
return { startRecording, stopRecording }; return { prewarm, startRecording, stopRecording };
} }

View File

@@ -1,8 +1,31 @@
import { encode } from "@msgpack/msgpack";
import { WebSocket as ReconnectingWebSocket } from "partysocket"; import { WebSocket as ReconnectingWebSocket } from "partysocket";
import { useCallback, useEffect, useRef } from "react"; import { useCallback, useEffect, useRef } from "react";
import { toast } from "sonner"; import { toast } from "sonner";
import type { ClientMsg, ServerMsg } from "../protocol";
import { useAppStore } from "../stores/app-store"; import { useAppStore } from "../stores/app-store";
const HEARTBEAT_INTERVAL_MS = 15000;
const HEARTBEAT_TIMEOUT_MS = 10000;
const START_TIMEOUT_MS = 5000;
const AUDIO_PACKET_VERSION = 1;
const MAX_AUDIO_QUEUE = 6;
const WEAK_NETWORK_TOAST_INTERVAL_MS = 4000;
const WS_BACKPRESSURE_BYTES = 128 * 1024;
interface QueuedAudioPacket {
sessionId: string;
seq: number;
buffer: ArrayBuffer;
}
interface AudioPacketPayload {
v: number;
sessionId: string;
seq: number;
pcm: Uint8Array;
}
function getWsUrl(): string { function getWsUrl(): string {
const proto = location.protocol === "https:" ? "wss:" : "ws:"; const proto = location.protocol === "https:" ? "wss:" : "ws:";
const params = new URLSearchParams(location.search); const params = new URLSearchParams(location.search);
@@ -11,75 +34,355 @@ function getWsUrl(): string {
return `${proto}//${location.host}/ws${q}`; return `${proto}//${location.host}/ws${q}`;
} }
function createSessionId(): string {
if (typeof crypto !== "undefined" && "randomUUID" in crypto) {
return crypto.randomUUID();
}
return `${Date.now()}-${Math.random().toString(16).slice(2)}`;
}
function encodeAudioPacket(
sessionId: string,
seq: number,
pcm: Int16Array,
): ArrayBuffer {
const sidBytes = new TextEncoder().encode(sessionId);
if (sidBytes.length === 0 || sidBytes.length > 96) {
throw new Error("invalid sessionId length");
}
const pcmBytes = new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength);
const payload: AudioPacketPayload = {
v: AUDIO_PACKET_VERSION,
sessionId,
seq,
pcm: pcmBytes,
};
const packed = encode(payload);
return packed.buffer.slice(
packed.byteOffset,
packed.byteOffset + packed.byteLength,
);
}
export function useWebSocket() { export function useWebSocket() {
const wsRef = useRef<ReconnectingWebSocket | null>(null); const wsRef = useRef<ReconnectingWebSocket | null>(null);
const heartbeatTimerRef = useRef<number | null>(null);
const heartbeatTimeoutRef = useRef<number | null>(null);
const pendingStartRef = useRef<{
sessionId: string;
resolve: (sessionId: string | null) => void;
timer: number;
} | null>(null);
const audioQueueRef = useRef<QueuedAudioPacket[]>([]);
const lastWeakToastAtRef = useRef(0);
const sendJSON = useCallback((obj: Record<string, unknown>) => { const clearHeartbeat = useCallback(() => {
const ws = wsRef.current; if (heartbeatTimerRef.current !== null) {
if (ws?.readyState === WebSocket.OPEN) { window.clearInterval(heartbeatTimerRef.current);
ws.send(JSON.stringify(obj)); heartbeatTimerRef.current = null;
}
if (heartbeatTimeoutRef.current !== null) {
window.clearTimeout(heartbeatTimeoutRef.current);
heartbeatTimeoutRef.current = null;
} }
}, []); }, []);
const sendBinary = useCallback((data: Int16Array) => { const sendControl = useCallback((msg: ClientMsg): boolean => {
const ws = wsRef.current; const ws = wsRef.current;
if (ws?.readyState === WebSocket.OPEN) { if (ws?.readyState === WebSocket.OPEN) {
ws.send(data.buffer as ArrayBuffer); ws.send(JSON.stringify(msg));
return true;
}
return false;
}, []);
const notifyWeakNetwork = useCallback(() => {
const now = Date.now();
if (now - lastWeakToastAtRef.current < WEAK_NETWORK_TOAST_INTERVAL_MS) {
return;
}
lastWeakToastAtRef.current = now;
toast.warning("网络波动,正在缓冲音频…");
}, []);
const flushAudioQueue = useCallback(() => {
const ws = wsRef.current;
if (ws?.readyState !== WebSocket.OPEN) return;
const activeSessionId = useAppStore.getState().activeSessionId;
if (!activeSessionId) {
audioQueueRef.current = [];
useAppStore.getState().setWeakNetwork(false);
return;
}
const remaining: QueuedAudioPacket[] = [];
for (const packet of audioQueueRef.current) {
if (packet.sessionId !== activeSessionId) {
continue;
}
if (
ws.readyState !== WebSocket.OPEN ||
ws.bufferedAmount > WS_BACKPRESSURE_BYTES
) {
remaining.push(packet);
continue;
}
ws.send(packet.buffer);
}
audioQueueRef.current = remaining;
if (audioQueueRef.current.length === 0) {
useAppStore.getState().setWeakNetwork(false);
} }
}, []); }, []);
useEffect(() => { const sendAudioFrame = useCallback(
useAppStore.getState().setConnectionStatus("connecting"); (sessionId: string, seq: number, pcm: Int16Array): boolean => {
const store = useAppStore.getState();
if (!store.recording || store.activeSessionId !== sessionId) {
return false;
}
const ws = new ReconnectingWebSocket(getWsUrl(), undefined, { let buffer: ArrayBuffer;
minReconnectionDelay: 1000,
maxReconnectionDelay: 16000,
});
ws.binaryType = "arraybuffer";
ws.onopen = () => {
useAppStore.getState().setConnectionStatus("connected");
};
ws.onmessage = (e: MessageEvent) => {
if (typeof e.data !== "string") return;
try { try {
const msg = JSON.parse(e.data); buffer = encodeAudioPacket(sessionId, seq, pcm);
} catch {
return false;
}
const ws = wsRef.current;
if (
ws?.readyState === WebSocket.OPEN &&
ws.bufferedAmount <= WS_BACKPRESSURE_BYTES
) {
ws.send(buffer);
if (audioQueueRef.current.length === 0) {
useAppStore.getState().setWeakNetwork(false);
}
return true;
}
if (audioQueueRef.current.length >= MAX_AUDIO_QUEUE) {
audioQueueRef.current.shift();
}
audioQueueRef.current.push({ sessionId, seq, buffer });
useAppStore.getState().setWeakNetwork(true);
notifyWeakNetwork();
return false;
},
[notifyWeakNetwork],
);
const resolvePendingStart = useCallback((sessionId: string | null) => {
const pending = pendingStartRef.current;
if (!pending) return;
window.clearTimeout(pending.timer);
pending.resolve(sessionId);
pendingStartRef.current = null;
}, []);
const handleServerMessage = useCallback(
(raw: string) => {
let msg: ServerMsg;
try {
msg = JSON.parse(raw) as ServerMsg;
} catch {
return;
}
const store = useAppStore.getState(); const store = useAppStore.getState();
switch (msg.type) { switch (msg.type) {
case "partial": case "ready":
break;
case "state":
if (msg.state === "idle") {
store.setRecording(false);
store.setStopping(false);
store.setPendingStart(false);
store.setActiveSessionId(null);
}
if (msg.state === "stopping") {
store.setStopping(true);
}
break;
case "start_ack":
if (
pendingStartRef.current &&
msg.sessionId === pendingStartRef.current.sessionId
) {
resolvePendingStart(msg.sessionId ?? null);
}
break;
case "stop_ack":
store.setStopping(true);
break;
case "partial": {
const activeSessionId = store.activeSessionId;
if (!activeSessionId || msg.sessionId !== activeSessionId) break;
store.setPreview(msg.text || "", false); store.setPreview(msg.text || "", false);
break; break;
case "final": }
case "final": {
const activeSessionId = store.activeSessionId;
if (!activeSessionId || msg.sessionId !== activeSessionId) break;
store.setPreview(msg.text || "", true); store.setPreview(msg.text || "", true);
if (msg.text) store.addHistory(msg.text); if (msg.text) store.addHistory(msg.text);
break; break;
}
case "pasted": case "pasted":
toast.success("已粘贴"); toast.success("已粘贴");
break; break;
case "error": case "error":
toast.error(msg.message || "错误"); store.setRecording(false);
store.setPendingStart(false);
store.setStopping(false);
resolvePendingStart(null);
toast.error(msg.message || "服务错误");
break;
case "pong":
if (heartbeatTimeoutRef.current !== null) {
window.clearTimeout(heartbeatTimeoutRef.current);
heartbeatTimeoutRef.current = null;
}
break; break;
} }
} catch { },
// Ignore malformed messages [resolvePendingStart],
);
const startHeartbeat = useCallback(() => {
clearHeartbeat();
heartbeatTimerRef.current = window.setInterval(() => {
if (document.hidden) return;
const now = Date.now();
if (!sendControl({ type: "ping", ts: now })) return;
if (heartbeatTimeoutRef.current !== null) {
window.clearTimeout(heartbeatTimeoutRef.current);
}
heartbeatTimeoutRef.current = window.setTimeout(() => {
wsRef.current?.close();
}, HEARTBEAT_TIMEOUT_MS);
}, HEARTBEAT_INTERVAL_MS);
}, [clearHeartbeat, sendControl]);
useEffect(() => {
const ws = new ReconnectingWebSocket(getWsUrl(), undefined, {
minReconnectionDelay: 400,
maxReconnectionDelay: 6000,
});
ws.binaryType = "arraybuffer";
wsRef.current = ws;
useAppStore.getState().setConnectionStatus("connecting");
ws.onopen = () => {
const store = useAppStore.getState();
store.setConnectionStatus("connected");
store.setWeakNetwork(false);
sendControl({ type: "hello", version: 2 });
startHeartbeat();
flushAudioQueue();
};
ws.onmessage = (e: MessageEvent) => {
if (typeof e.data === "string") {
handleServerMessage(e.data);
} }
}; };
ws.onclose = () => { ws.onclose = () => {
clearHeartbeat();
resolvePendingStart(null);
const store = useAppStore.getState(); const store = useAppStore.getState();
if (store.recording || store.pendingStart) {
toast.warning("连接中断,本次录音已结束");
}
store.setConnectionStatus("disconnected"); store.setConnectionStatus("disconnected");
if (store.recording) store.setRecording(false); store.setWeakNetwork(store.recording || audioQueueRef.current.length > 0);
if (store.pendingStart) store.setPendingStart(false); store.setRecording(false);
store.setPendingStart(false);
store.setStopping(false);
store.setActiveSessionId(null);
audioQueueRef.current = [];
}; };
wsRef.current = ws; ws.onerror = () => {
ws.close();
};
const onVisibilityChange = () => {
if (!document.hidden && wsRef.current?.readyState !== WebSocket.OPEN) {
wsRef.current?.close();
}
};
document.addEventListener("visibilitychange", onVisibilityChange);
return () => { return () => {
ws.close(); document.removeEventListener("visibilitychange", onVisibilityChange);
clearHeartbeat();
resolvePendingStart(null);
wsRef.current?.close();
wsRef.current = null; wsRef.current = null;
}; };
}, []); }, [
clearHeartbeat,
flushAudioQueue,
handleServerMessage,
resolvePendingStart,
sendControl,
startHeartbeat,
]);
return { sendJSON, sendBinary }; const requestStart = useCallback((): Promise<string | null> => {
const sessionId = createSessionId();
return new Promise<string | null>((resolve) => {
if (pendingStartRef.current) {
resolve(null);
return;
}
const timer = window.setTimeout(() => {
if (pendingStartRef.current?.sessionId === sessionId) {
pendingStartRef.current = null;
resolve(null);
}
}, START_TIMEOUT_MS);
pendingStartRef.current = { sessionId, resolve, timer };
const ok = sendControl({ type: "start", sessionId, version: 2 });
if (!ok) {
resolvePendingStart(null);
}
});
}, [resolvePendingStart, sendControl]);
const sendStop = useCallback(
(sessionId: string | null) => {
if (!sessionId) return;
audioQueueRef.current = audioQueueRef.current.filter(
(packet) => packet.sessionId !== sessionId,
);
if (audioQueueRef.current.length === 0) {
useAppStore.getState().setWeakNetwork(false);
}
sendControl({ type: "stop", sessionId });
},
[sendControl],
);
const sendPaste = useCallback(
(text: string) => {
const store = useAppStore.getState();
if (store.connectionStatus !== "connected") {
toast.warning("当前未连接,无法发送粘贴");
return;
}
const sessionId = useAppStore.getState().activeSessionId ?? undefined;
sendControl({ type: "paste", text, sessionId });
},
[sendControl],
);
return { requestStart, sendStop, sendPaste, sendAudioFrame };
} }

35
web/src/protocol.ts Normal file
View File

@@ -0,0 +1,35 @@
export type ClientMsgType = "hello" | "start" | "stop" | "paste" | "ping";
export type ServerMsgType =
| "ready"
| "state"
| "start_ack"
| "stop_ack"
| "partial"
| "final"
| "pasted"
| "error"
| "pong";
export type SessionState = "idle" | "recording" | "stopping";
export interface ClientMsg {
type: ClientMsgType;
sessionId?: string;
seq?: number;
text?: string;
version?: number;
ts?: number;
}
export interface ServerMsg {
type: ServerMsgType;
state?: SessionState;
sessionId?: string;
seq?: number;
text?: string;
message?: string;
code?: string;
retryable?: boolean;
ts?: number;
}

View File

@@ -13,9 +13,13 @@ type ConnectionStatus = "connected" | "disconnected" | "connecting";
interface AppState { interface AppState {
// Connection // Connection
connectionStatus: ConnectionStatus; connectionStatus: ConnectionStatus;
weakNetwork: boolean;
// Recording // Recording
recording: boolean; recording: boolean;
pendingStart: boolean; pendingStart: boolean;
stopping: boolean;
micReady: boolean;
activeSessionId: string | null;
// Preview // Preview
previewText: string; previewText: string;
previewActive: boolean; previewActive: boolean;
@@ -24,8 +28,12 @@ interface AppState {
// Actions // Actions
setConnectionStatus: (status: ConnectionStatus) => void; setConnectionStatus: (status: ConnectionStatus) => void;
setWeakNetwork: (weak: boolean) => void;
setRecording: (recording: boolean) => void; setRecording: (recording: boolean) => void;
setPendingStart: (pending: boolean) => void; setPendingStart: (pending: boolean) => void;
setStopping: (stopping: boolean) => void;
setMicReady: (ready: boolean) => void;
setActiveSessionId: (sessionId: string | null) => void;
setPreview: (text: string, isFinal: boolean) => void; setPreview: (text: string, isFinal: boolean) => void;
clearPreview: () => void; clearPreview: () => void;
addHistory: (text: string) => void; addHistory: (text: string) => void;
@@ -36,15 +44,23 @@ export const useAppStore = create<AppState>()(
persist( persist(
(set, get) => ({ (set, get) => ({
connectionStatus: "connecting", connectionStatus: "connecting",
weakNetwork: false,
recording: false, recording: false,
pendingStart: false, pendingStart: false,
stopping: false,
micReady: false,
activeSessionId: null,
previewText: "", previewText: "",
previewActive: false, previewActive: false,
history: [], history: [],
setConnectionStatus: (connectionStatus) => set({ connectionStatus }), setConnectionStatus: (connectionStatus) => set({ connectionStatus }),
setWeakNetwork: (weakNetwork) => set({ weakNetwork }),
setRecording: (recording) => set({ recording }), setRecording: (recording) => set({ recording }),
setPendingStart: (pendingStart) => set({ pendingStart }), setPendingStart: (pendingStart) => set({ pendingStart }),
setStopping: (stopping) => set({ stopping }),
setMicReady: (micReady) => set({ micReady }),
setActiveSessionId: (activeSessionId) => set({ activeSessionId }),
setPreview: (text, isFinal) => setPreview: (text, isFinal) =>
set({ set({