diff --git a/config.example.yaml b/config.example.yaml index 6191f27..334a1fa 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -3,8 +3,8 @@ # 火山引擎豆包 ASR 配置 doubao: - app_key: "" # env: DOUBAO_APP_KEY - access_key: "" # env: DOUBAO_ACCESS_KEY + app_id: "" # env: DOUBAO_APP_ID + access_token: "" # env: DOUBAO_ACCESS_TOKEN resource_id: "volc.seedasr.sauc.duration" # env: DOUBAO_RESOURCE_ID # 服务配置 diff --git a/internal/asr/client.go b/internal/asr/client.go index 3f0f553..fa2b8b6 100644 --- a/internal/asr/client.go +++ b/internal/asr/client.go @@ -13,15 +13,15 @@ import ( ) const ( - doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream" + doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_async" writeTimeout = 10 * time.Second readTimeout = 30 * time.Second ) // Config holds Doubao ASR connection parameters. type Config struct { - AppKey string - AccessKey string + AppID string + AccessToken string ResourceID string } @@ -39,8 +39,8 @@ type Client struct { func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) { connID := uuid.New().String() headers := http.Header{ - "X-Api-App-Key": {cfg.AppKey}, - "X-Api-Access-Key": {cfg.AccessKey}, + "X-Api-App-Key": {cfg.AppID}, + "X-Api-Access-Key": {cfg.AccessToken}, "X-Api-Resource-Id": {cfg.ResourceID}, "X-Api-Connect-Id": {connID}, } @@ -68,13 +68,14 @@ func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) { Channel: 1, }, Request: RequestMeta{ - ModelName: "seedasr-2.0", - EnableITN: true, - EnablePUNC: true, - EnableDDC: true, - ShowUtterances: false, - ResultType: "single", - EndWindowSize: 2000, + ModelName: "seedasr-2.0", + EnableITN: true, + EnablePUNC: true, + EnableDDC: true, + ShowUtterances: true, + ResultType: "full", + EnableNonstream: true, + EndWindowSize: 800, }, } data, err := EncodeFullClientRequest(req) @@ -132,13 +133,13 @@ func (c *Client) readLoop(resultCh chan<- wsMsg.ServerMsg) { resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgError, Message: resp.ErrMsg} return } - // nostream mode: may return intermediate results every ~15s + // bigmodel_async with enable_nonstream: returns both streaming (partial) and definite (final) results text := resp.Text if text != "" { if resp.IsLast { resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgFinal, Text: text} } else { - // Intermediate result (>15s audio) — preview only, don't paste + // Intermediate streaming result (first pass) — preview only resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgPartial, Text: text} } } diff --git a/internal/asr/protocol.go b/internal/asr/protocol.go index 4bb9bb1..fad30e8 100644 --- a/internal/asr/protocol.go +++ b/internal/asr/protocol.go @@ -104,13 +104,14 @@ type AudioMeta struct { } type RequestMeta struct { - ModelName string `json:"model_name"` - EnableITN bool `json:"enable_itn"` - EnablePUNC bool `json:"enable_punc"` - EnableDDC bool `json:"enable_ddc"` - ShowUtterances bool `json:"show_utterances"` - ResultType string `json:"result_type,omitempty"` - EndWindowSize int `json:"end_window_size,omitempty"` + ModelName string `json:"model_name"` + EnableITN bool `json:"enable_itn"` + EnablePUNC bool `json:"enable_punc"` + EnableDDC bool `json:"enable_ddc"` + ShowUtterances bool `json:"show_utterances"` + ResultType string `json:"result_type,omitempty"` + EnableNonstream bool `json:"enable_nonstream,omitempty"` + EndWindowSize int `json:"end_window_size,omitempty"` } // EncodeFullClientRequest builds the binary message for the initial handshake. // nostream mode: header(4) + payload_size(4) + gzip(json) diff --git a/internal/config/config.go b/internal/config/config.go index 9290b74..25fd2dd 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -6,8 +6,8 @@ import ( // DoubaoConfig holds 火山引擎豆包 ASR credentials. type DoubaoConfig struct { - AppKey string `yaml:"app_key"` - AccessKey string `yaml:"access_key"` + AppID string `yaml:"app_id"` + AccessToken string `yaml:"access_token"` ResourceID string `yaml:"resource_id"` } diff --git a/internal/config/load.go b/internal/config/load.go index 38dcc81..681e611 100644 --- a/internal/config/load.go +++ b/internal/config/load.go @@ -41,11 +41,11 @@ func Load(configPath string) (Config, error) { // applyEnv overrides config fields with environment variables. func applyEnv(cfg *Config) { - if v := os.Getenv("DOUBAO_APP_KEY"); v != "" { - cfg.Doubao.AppKey = v + if v := os.Getenv("DOUBAO_APP_ID"); v != "" { + cfg.Doubao.AppID = v } - if v := os.Getenv("DOUBAO_ACCESS_KEY"); v != "" { - cfg.Doubao.AccessKey = v + if v := os.Getenv("DOUBAO_ACCESS_TOKEN"); v != "" { + cfg.Doubao.AccessToken = v } if v := os.Getenv("DOUBAO_RESOURCE_ID"); v != "" { cfg.Doubao.ResourceID = v @@ -62,11 +62,11 @@ func applyEnv(cfg *Config) { // validate checks required fields. func validate(cfg Config) error { - if cfg.Doubao.AppKey == "" { - return fmt.Errorf("doubao.app_key is required (set DOUBAO_APP_KEY or config.yaml)") + if cfg.Doubao.AppID == "" { + return fmt.Errorf("doubao.app_id is required (set DOUBAO_APP_ID or config.yaml)") } - if cfg.Doubao.AccessKey == "" { - return fmt.Errorf("doubao.access_key is required (set DOUBAO_ACCESS_KEY or config.yaml)") + if cfg.Doubao.AccessToken == "" { + return fmt.Errorf("doubao.access_token is required (set DOUBAO_ACCESS_TOKEN or config.yaml)") } return nil } diff --git a/internal/ws/handler.go b/internal/ws/handler.go index 6580af4..77df8e4 100644 --- a/internal/ws/handler.go +++ b/internal/ws/handler.go @@ -62,21 +62,21 @@ func (h *Handler) handleConn(c *websocket.Conn) { defer close(resultCh) // Writer goroutine: single writer to avoid concurrent writes - // Accumulates all result texts; paste is triggered by stop, not by ASR final. + // bigmodel_async with enable_nonstream: server returns full text each time (not incremental) + // We replace preview text on each update instead of accumulating. var wg sync.WaitGroup - var accMu sync.Mutex - var accText string + var previewMu sync.Mutex + var previewText string wg.Add(1) go func() { defer wg.Done() for msg := range resultCh { - // Accumulate text from both partial and final results + // Replace preview text with latest result (full mode) if msg.Type == MsgPartial || msg.Type == MsgFinal { - accMu.Lock() - accText += msg.Text - // Send accumulated preview to phone - preview := ServerMsg{Type: msg.Type, Text: accText} - accMu.Unlock() + previewMu.Lock() + previewText = msg.Text + preview := ServerMsg{Type: msg.Type, Text: previewText} + previewMu.Unlock() if err := c.WriteMessage(websocket.TextMessage, preview.Bytes()); err != nil { log.Warn("ws write error", "err", err) return @@ -128,10 +128,10 @@ func (h *Handler) handleConn(c *websocket.Conn) { if active { continue } - // Reset accumulated text for new session - accMu.Lock() - accText = "" - accMu.Unlock() + // Reset preview text for new session + previewMu.Lock() + previewText = "" + previewMu.Unlock() sa, cl, err := h.asrFactory(resultCh) if err != nil { log.Error("asr start failed", "err", err) @@ -154,11 +154,11 @@ func (h *Handler) handleConn(c *websocket.Conn) { } sendAudio = nil active = false - // Now paste the accumulated text - accMu.Lock() - finalText := accText - accText = "" - accMu.Unlock() + // Paste the final preview text + previewMu.Lock() + finalText := previewText + previewText = "" + previewMu.Unlock() if finalText != "" && h.pasteFunc != nil { if err := h.pasteFunc(finalText); err != nil { log.Error("auto-paste failed", "err", err) diff --git a/main.go b/main.go index ba1f4da..43a4067 100644 --- a/main.go +++ b/main.go @@ -110,8 +110,8 @@ func main() { srv := server.New(token, lanIP, webContent, serverTLSCfg) // Build ASR factory from config asrCfg := asr.Config{ - AppKey: cfg.Doubao.AppKey, - AccessKey: cfg.Doubao.AccessKey, + AppID: cfg.Doubao.AppID, + AccessToken: cfg.Doubao.AccessToken, ResourceID: cfg.Doubao.ResourceID, } asrFactory := func(resultCh chan<- ws.ServerMsg) (func([]byte), func(), error) {