feat: 启用豆包二遍识别模式以提升实时性和准确率
- 切换到 bigmodel_async endpoint 并启用 enable_nonstream
- 第一遍流式识别提供实时文字预览
- VAD 分句后自动触发第二遍非流式识别提升准确率
- 修改文本处理逻辑从累加改为替换(适配 full 模式)
- 统一配置字段命名:app_key → app_id, access_key → access_token
This commit is contained in:
@@ -13,15 +13,15 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream"
|
||||
doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_async"
|
||||
writeTimeout = 10 * time.Second
|
||||
readTimeout = 30 * time.Second
|
||||
)
|
||||
|
||||
// Config holds Doubao ASR connection parameters.
|
||||
type Config struct {
|
||||
AppKey string
|
||||
AccessKey string
|
||||
AppID string
|
||||
AccessToken string
|
||||
ResourceID string
|
||||
}
|
||||
|
||||
@@ -39,8 +39,8 @@ type Client struct {
|
||||
func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) {
|
||||
connID := uuid.New().String()
|
||||
headers := http.Header{
|
||||
"X-Api-App-Key": {cfg.AppKey},
|
||||
"X-Api-Access-Key": {cfg.AccessKey},
|
||||
"X-Api-App-Key": {cfg.AppID},
|
||||
"X-Api-Access-Key": {cfg.AccessToken},
|
||||
"X-Api-Resource-Id": {cfg.ResourceID},
|
||||
"X-Api-Connect-Id": {connID},
|
||||
}
|
||||
@@ -68,13 +68,14 @@ func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) {
|
||||
Channel: 1,
|
||||
},
|
||||
Request: RequestMeta{
|
||||
ModelName: "seedasr-2.0",
|
||||
EnableITN: true,
|
||||
EnablePUNC: true,
|
||||
EnableDDC: true,
|
||||
ShowUtterances: false,
|
||||
ResultType: "single",
|
||||
EndWindowSize: 2000,
|
||||
ModelName: "seedasr-2.0",
|
||||
EnableITN: true,
|
||||
EnablePUNC: true,
|
||||
EnableDDC: true,
|
||||
ShowUtterances: true,
|
||||
ResultType: "full",
|
||||
EnableNonstream: true,
|
||||
EndWindowSize: 800,
|
||||
},
|
||||
}
|
||||
data, err := EncodeFullClientRequest(req)
|
||||
@@ -132,13 +133,13 @@ func (c *Client) readLoop(resultCh chan<- wsMsg.ServerMsg) {
|
||||
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgError, Message: resp.ErrMsg}
|
||||
return
|
||||
}
|
||||
// nostream mode: may return intermediate results every ~15s
|
||||
// bigmodel_async with enable_nonstream: returns both streaming (partial) and definite (final) results
|
||||
text := resp.Text
|
||||
if text != "" {
|
||||
if resp.IsLast {
|
||||
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgFinal, Text: text}
|
||||
} else {
|
||||
// Intermediate result (>15s audio) — preview only, don't paste
|
||||
// Intermediate streaming result (first pass) — preview only
|
||||
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgPartial, Text: text}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -104,13 +104,14 @@ type AudioMeta struct {
|
||||
}
|
||||
|
||||
type RequestMeta struct {
|
||||
ModelName string `json:"model_name"`
|
||||
EnableITN bool `json:"enable_itn"`
|
||||
EnablePUNC bool `json:"enable_punc"`
|
||||
EnableDDC bool `json:"enable_ddc"`
|
||||
ShowUtterances bool `json:"show_utterances"`
|
||||
ResultType string `json:"result_type,omitempty"`
|
||||
EndWindowSize int `json:"end_window_size,omitempty"`
|
||||
ModelName string `json:"model_name"`
|
||||
EnableITN bool `json:"enable_itn"`
|
||||
EnablePUNC bool `json:"enable_punc"`
|
||||
EnableDDC bool `json:"enable_ddc"`
|
||||
ShowUtterances bool `json:"show_utterances"`
|
||||
ResultType string `json:"result_type,omitempty"`
|
||||
EnableNonstream bool `json:"enable_nonstream,omitempty"`
|
||||
EndWindowSize int `json:"end_window_size,omitempty"`
|
||||
}
|
||||
// EncodeFullClientRequest builds the binary message for the initial handshake.
|
||||
// nostream mode: header(4) + payload_size(4) + gzip(json)
|
||||
|
||||
Reference in New Issue
Block a user