feat: 启用豆包二遍识别模式以提升实时性和准确率

- 切换到 bigmodel_async endpoint 并启用 enable_nonstream
- 第一遍流式识别提供实时文字预览
- VAD 分句后自动触发第二遍非流式识别提升准确率
- 修改文本处理逻辑从累加改为替换(适配 full 模式)
- 统一配置字段命名:app_key → app_id, access_key → access_token
This commit is contained in:
2026-03-01 21:34:54 +08:00
parent e4b5841c93
commit 8c7b9b45fd
7 changed files with 55 additions and 53 deletions

View File

@@ -13,15 +13,15 @@ import (
)
// Connection constants for the Doubao ASR WebSocket client.
//
// NOTE(review): this listing is a diff rendered without +/- markers, so
// doubaoEndpoint appears twice. Per the commit message, the endpoint is being
// switched to bigmodel_async; the bigmodel_nostream line is presumably the
// removed side of the diff — confirm against the real file.
const (
// Endpoint ending in bigmodel_nostream (presumably the pre-change value).
doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream"
// Endpoint ending in bigmodel_async (the value the commit message switches to).
doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_async"
// writeTimeout is 10 seconds; presumably bounds WebSocket writes — usage is
// not visible in this view, TODO confirm.
writeTimeout = 10 * time.Second
// readTimeout is 30 seconds; presumably bounds WebSocket reads — usage is
// not visible in this view, TODO confirm.
readTimeout = 30 * time.Second
)
// Config holds Doubao ASR connection parameters.
//
// NOTE(review): this listing is a diff rendered without +/- markers, so both
// the old field names (AppKey, AccessKey) and the new ones (AppID,
// AccessToken) are shown. The commit message describes the rename
// app_key -> app_id and access_key -> access_token; the old pair is
// presumably the removed side of the diff — confirm against the real file.
type Config struct {
// AppKey is the pre-rename counterpart of AppID.
AppKey string
// AccessKey is the pre-rename counterpart of AccessToken.
AccessKey string
// AppID is sent by Dial as the "X-Api-App-Key" request header.
AppID string
// AccessToken is sent by Dial as the "X-Api-Access-Key" request header.
AccessToken string
// ResourceID is sent by Dial as the "X-Api-Resource-Id" request header.
ResourceID string
}
@@ -39,8 +39,8 @@ type Client struct {
func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) {
connID := uuid.New().String()
headers := http.Header{
"X-Api-App-Key": {cfg.AppKey},
"X-Api-Access-Key": {cfg.AccessKey},
"X-Api-App-Key": {cfg.AppID},
"X-Api-Access-Key": {cfg.AccessToken},
"X-Api-Resource-Id": {cfg.ResourceID},
"X-Api-Connect-Id": {connID},
}
@@ -68,13 +68,14 @@ func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) {
Channel: 1,
},
Request: RequestMeta{
ModelName: "seedasr-2.0",
EnableITN: true,
EnablePUNC: true,
EnableDDC: true,
ShowUtterances: false,
ResultType: "single",
EndWindowSize: 2000,
ModelName: "seedasr-2.0",
EnableITN: true,
EnablePUNC: true,
EnableDDC: true,
ShowUtterances: true,
ResultType: "full",
EnableNonstream: true,
EndWindowSize: 800,
},
}
data, err := EncodeFullClientRequest(req)
@@ -132,13 +133,13 @@ func (c *Client) readLoop(resultCh chan<- wsMsg.ServerMsg) {
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgError, Message: resp.ErrMsg}
return
}
// nostream mode: may return intermediate results every ~15s
// bigmodel_async with enable_nonstream: returns both streaming (partial) and definite (final) results
text := resp.Text
if text != "" {
if resp.IsLast {
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgFinal, Text: text}
} else {
// Intermediate result (>15s audio) — preview only, don't paste
// Intermediate streaming result (first pass) — preview only
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgPartial, Text: text}
}
}