feat: 启用豆包二遍识别模式以提升实时性和准确率

- 切换到 bigmodel_async endpoint 并启用 enable_nonstream
- 第一遍流式识别提供实时文字预览
- VAD 分句后自动触发第二遍非流式识别提升准确率
- 修改文本处理逻辑从累加改为替换（适配 full 模式）
- 统一配置字段命名：app_key → app_id, access_key → access_token
2026-03-01 21:34:54 +08:00
parent e4b5841c93
commit 8c7b9b45fd
7 changed files with 55 additions and 53 deletions
@@ -13,15 +13,15 @@ import (
 )

 const (
-	doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream"
+	doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_async"
 	writeTimeout   = 10 * time.Second
 	readTimeout    = 30 * time.Second
 )

 // Config holds Doubao ASR connection parameters.
 type Config struct {
-	AppKey     string
-	AccessKey  string
+	AppID       string
+	AccessToken string
 	ResourceID string
 }

@@ -39,8 +39,8 @@ type Client struct {
 func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) {
 	connID := uuid.New().String()
 	headers := http.Header{
-		"X-Api-App-Key":    {cfg.AppKey},
-		"X-Api-Access-Key": {cfg.AccessKey},
+		"X-Api-App-Key":    {cfg.AppID},
+		"X-Api-Access-Key": {cfg.AccessToken},
 		"X-Api-Resource-Id": {cfg.ResourceID},
 		"X-Api-Connect-Id": {connID},
 	}
@@ -68,13 +68,14 @@ func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) {
 			Channel: 1,
 		},
 		Request: RequestMeta{
-			ModelName:      "seedasr-2.0",
-			EnableITN:      true,
-			EnablePUNC:     true,
-			EnableDDC:      true,
-			ShowUtterances: false,
-			ResultType:     "single",
-			EndWindowSize:  2000,
+			ModelName:         "seedasr-2.0",
+			EnableITN:         true,
+			EnablePUNC:        true,
+			EnableDDC:         true,
+			ShowUtterances:    true,
+			ResultType:        "full",
+			EnableNonstream:   true,
+			EndWindowSize:     800,
 		},
 	}
 	data, err := EncodeFullClientRequest(req)
@@ -132,13 +133,13 @@ func (c *Client) readLoop(resultCh chan<- wsMsg.ServerMsg) {
 			resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgError, Message: resp.ErrMsg}
 			return
 		}
-		// nostream mode: may return intermediate results every ~15s
+		// bigmodel_async with enable_nonstream: returns both streaming (partial) and definite (final) results
 		text := resp.Text
 		if text != "" {
 			if resp.IsLast {
 				resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgFinal, Text: text}
 			} else {
-				// Intermediate result (>15s audio) — preview only, don't paste
+				// Intermediate streaming result (first pass) — preview only
 				resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgPartial, Text: text}
 			}
 		}
@@ -104,13 +104,14 @@ type AudioMeta struct {
 }

 type RequestMeta struct {
-	ModelName      string `json:"model_name"`
-	EnableITN      bool   `json:"enable_itn"`
-	EnablePUNC     bool   `json:"enable_punc"`
-	EnableDDC      bool   `json:"enable_ddc"`
-	ShowUtterances bool   `json:"show_utterances"`
-	ResultType     string `json:"result_type,omitempty"`
-	EndWindowSize  int    `json:"end_window_size,omitempty"`
+	ModelName        string `json:"model_name"`
+	EnableITN        bool   `json:"enable_itn"`
+	EnablePUNC       bool   `json:"enable_punc"`
+	EnableDDC        bool   `json:"enable_ddc"`
+	ShowUtterances   bool   `json:"show_utterances"`
+	ResultType       string `json:"result_type,omitempty"`
+	EnableNonstream  bool   `json:"enable_nonstream,omitempty"`
+	EndWindowSize    int    `json:"end_window_size,omitempty"`
 }
 // EncodeFullClientRequest builds the binary message for the initial handshake.
 // nostream mode: header(4) + payload_size(4) + gzip(json)