feat: 启用豆包二遍识别模式以提升实时性和准确率

- 切换到 bigmodel_async endpoint 并启用 enable_nonstream
- 第一遍流式识别提供实时文字预览
- VAD 分句后自动触发第二遍非流式识别,以提升准确率
- 将文本处理逻辑从累加改为替换(适配 full 模式)
- 统一配置字段命名:app_key → app_id, access_key → access_token
This commit is contained in:
2026-03-01 21:34:54 +08:00
parent e4b5841c93
commit 8c7b9b45fd
7 changed files with 55 additions and 53 deletions

View File

@@ -3,8 +3,8 @@
# 火山引擎豆包 ASR 配置 # 火山引擎豆包 ASR 配置
doubao: doubao:
app_key: "" # env: DOUBAO_APP_KEY app_id: "" # env: DOUBAO_APP_ID
access_key: "" # env: DOUBAO_ACCESS_KEY access_token: "" # env: DOUBAO_ACCESS_TOKEN
resource_id: "volc.seedasr.sauc.duration" # env: DOUBAO_RESOURCE_ID resource_id: "volc.seedasr.sauc.duration" # env: DOUBAO_RESOURCE_ID
# 服务配置 # 服务配置

View File

@@ -13,15 +13,15 @@ import (
) )
const ( const (
doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream" doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_async"
writeTimeout = 10 * time.Second writeTimeout = 10 * time.Second
readTimeout = 30 * time.Second readTimeout = 30 * time.Second
) )
// Config holds Doubao ASR connection parameters. // Config holds Doubao ASR connection parameters.
type Config struct { type Config struct {
AppKey string AppID string
AccessKey string AccessToken string
ResourceID string ResourceID string
} }
@@ -39,8 +39,8 @@ type Client struct {
func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) { func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) {
connID := uuid.New().String() connID := uuid.New().String()
headers := http.Header{ headers := http.Header{
"X-Api-App-Key": {cfg.AppKey}, "X-Api-App-Key": {cfg.AppID},
"X-Api-Access-Key": {cfg.AccessKey}, "X-Api-Access-Key": {cfg.AccessToken},
"X-Api-Resource-Id": {cfg.ResourceID}, "X-Api-Resource-Id": {cfg.ResourceID},
"X-Api-Connect-Id": {connID}, "X-Api-Connect-Id": {connID},
} }
@@ -68,13 +68,14 @@ func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) {
Channel: 1, Channel: 1,
}, },
Request: RequestMeta{ Request: RequestMeta{
ModelName: "seedasr-2.0", ModelName: "seedasr-2.0",
EnableITN: true, EnableITN: true,
EnablePUNC: true, EnablePUNC: true,
EnableDDC: true, EnableDDC: true,
ShowUtterances: false, ShowUtterances: true,
ResultType: "single", ResultType: "full",
EndWindowSize: 2000, EnableNonstream: true,
EndWindowSize: 800,
}, },
} }
data, err := EncodeFullClientRequest(req) data, err := EncodeFullClientRequest(req)
@@ -132,13 +133,13 @@ func (c *Client) readLoop(resultCh chan<- wsMsg.ServerMsg) {
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgError, Message: resp.ErrMsg} resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgError, Message: resp.ErrMsg}
return return
} }
// nostream mode: may return intermediate results every ~15s // bigmodel_async with enable_nonstream: returns both streaming (partial) and definite (final) results
text := resp.Text text := resp.Text
if text != "" { if text != "" {
if resp.IsLast { if resp.IsLast {
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgFinal, Text: text} resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgFinal, Text: text}
} else { } else {
// Intermediate result (>15s audio) — preview only, don't paste // Intermediate streaming result (first pass) — preview only
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgPartial, Text: text} resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgPartial, Text: text}
} }
} }

View File

@@ -104,13 +104,14 @@ type AudioMeta struct {
} }
type RequestMeta struct { type RequestMeta struct {
ModelName string `json:"model_name"` ModelName string `json:"model_name"`
EnableITN bool `json:"enable_itn"` EnableITN bool `json:"enable_itn"`
EnablePUNC bool `json:"enable_punc"` EnablePUNC bool `json:"enable_punc"`
EnableDDC bool `json:"enable_ddc"` EnableDDC bool `json:"enable_ddc"`
ShowUtterances bool `json:"show_utterances"` ShowUtterances bool `json:"show_utterances"`
ResultType string `json:"result_type,omitempty"` ResultType string `json:"result_type,omitempty"`
EndWindowSize int `json:"end_window_size,omitempty"` EnableNonstream bool `json:"enable_nonstream,omitempty"`
EndWindowSize int `json:"end_window_size,omitempty"`
} }
// EncodeFullClientRequest builds the binary message for the initial handshake. // EncodeFullClientRequest builds the binary message for the initial handshake.
// nostream mode: header(4) + payload_size(4) + gzip(json) // nostream mode: header(4) + payload_size(4) + gzip(json)

View File

@@ -6,8 +6,8 @@ import (
// DoubaoConfig holds 火山引擎豆包 ASR credentials. // DoubaoConfig holds 火山引擎豆包 ASR credentials.
type DoubaoConfig struct { type DoubaoConfig struct {
AppKey string `yaml:"app_key"` AppID string `yaml:"app_id"`
AccessKey string `yaml:"access_key"` AccessToken string `yaml:"access_token"`
ResourceID string `yaml:"resource_id"` ResourceID string `yaml:"resource_id"`
} }

View File

@@ -41,11 +41,11 @@ func Load(configPath string) (Config, error) {
// applyEnv overrides config fields with environment variables. // applyEnv overrides config fields with environment variables.
func applyEnv(cfg *Config) { func applyEnv(cfg *Config) {
if v := os.Getenv("DOUBAO_APP_KEY"); v != "" { if v := os.Getenv("DOUBAO_APP_ID"); v != "" {
cfg.Doubao.AppKey = v cfg.Doubao.AppID = v
} }
if v := os.Getenv("DOUBAO_ACCESS_KEY"); v != "" { if v := os.Getenv("DOUBAO_ACCESS_TOKEN"); v != "" {
cfg.Doubao.AccessKey = v cfg.Doubao.AccessToken = v
} }
if v := os.Getenv("DOUBAO_RESOURCE_ID"); v != "" { if v := os.Getenv("DOUBAO_RESOURCE_ID"); v != "" {
cfg.Doubao.ResourceID = v cfg.Doubao.ResourceID = v
@@ -62,11 +62,11 @@ func applyEnv(cfg *Config) {
// validate checks required fields. // validate checks required fields.
func validate(cfg Config) error { func validate(cfg Config) error {
if cfg.Doubao.AppKey == "" { if cfg.Doubao.AppID == "" {
return fmt.Errorf("doubao.app_key is required (set DOUBAO_APP_KEY or config.yaml)") return fmt.Errorf("doubao.app_id is required (set DOUBAO_APP_ID or config.yaml)")
} }
if cfg.Doubao.AccessKey == "" { if cfg.Doubao.AccessToken == "" {
return fmt.Errorf("doubao.access_key is required (set DOUBAO_ACCESS_KEY or config.yaml)") return fmt.Errorf("doubao.access_token is required (set DOUBAO_ACCESS_TOKEN or config.yaml)")
} }
return nil return nil
} }

View File

@@ -62,21 +62,21 @@ func (h *Handler) handleConn(c *websocket.Conn) {
defer close(resultCh) defer close(resultCh)
// Writer goroutine: single writer to avoid concurrent writes // Writer goroutine: single writer to avoid concurrent writes
// Accumulates all result texts; paste is triggered by stop, not by ASR final. // bigmodel_async with enable_nonstream: server returns full text each time (not incremental)
// We replace preview text on each update instead of accumulating.
var wg sync.WaitGroup var wg sync.WaitGroup
var accMu sync.Mutex var previewMu sync.Mutex
var accText string var previewText string
wg.Add(1) wg.Add(1)
go func() { go func() {
defer wg.Done() defer wg.Done()
for msg := range resultCh { for msg := range resultCh {
// Accumulate text from both partial and final results // Replace preview text with latest result (full mode)
if msg.Type == MsgPartial || msg.Type == MsgFinal { if msg.Type == MsgPartial || msg.Type == MsgFinal {
accMu.Lock() previewMu.Lock()
accText += msg.Text previewText = msg.Text
// Send accumulated preview to phone preview := ServerMsg{Type: msg.Type, Text: previewText}
preview := ServerMsg{Type: msg.Type, Text: accText} previewMu.Unlock()
accMu.Unlock()
if err := c.WriteMessage(websocket.TextMessage, preview.Bytes()); err != nil { if err := c.WriteMessage(websocket.TextMessage, preview.Bytes()); err != nil {
log.Warn("ws write error", "err", err) log.Warn("ws write error", "err", err)
return return
@@ -128,10 +128,10 @@ func (h *Handler) handleConn(c *websocket.Conn) {
if active { if active {
continue continue
} }
// Reset accumulated text for new session // Reset preview text for new session
accMu.Lock() previewMu.Lock()
accText = "" previewText = ""
accMu.Unlock() previewMu.Unlock()
sa, cl, err := h.asrFactory(resultCh) sa, cl, err := h.asrFactory(resultCh)
if err != nil { if err != nil {
log.Error("asr start failed", "err", err) log.Error("asr start failed", "err", err)
@@ -154,11 +154,11 @@ func (h *Handler) handleConn(c *websocket.Conn) {
} }
sendAudio = nil sendAudio = nil
active = false active = false
// Now paste the accumulated text // Paste the final preview text
accMu.Lock() previewMu.Lock()
finalText := accText finalText := previewText
accText = "" previewText = ""
accMu.Unlock() previewMu.Unlock()
if finalText != "" && h.pasteFunc != nil { if finalText != "" && h.pasteFunc != nil {
if err := h.pasteFunc(finalText); err != nil { if err := h.pasteFunc(finalText); err != nil {
log.Error("auto-paste failed", "err", err) log.Error("auto-paste failed", "err", err)

View File

@@ -110,8 +110,8 @@ func main() {
srv := server.New(token, lanIP, webContent, serverTLSCfg) srv := server.New(token, lanIP, webContent, serverTLSCfg)
// Build ASR factory from config // Build ASR factory from config
asrCfg := asr.Config{ asrCfg := asr.Config{
AppKey: cfg.Doubao.AppKey, AppID: cfg.Doubao.AppID,
AccessKey: cfg.Doubao.AccessKey, AccessToken: cfg.Doubao.AccessToken,
ResourceID: cfg.Doubao.ResourceID, ResourceID: cfg.Doubao.ResourceID,
} }
asrFactory := func(resultCh chan<- ws.ServerMsg) (func([]byte), func(), error) { asrFactory := func(resultCh chan<- ws.ServerMsg) (func([]byte), func(), error) {