feat: 启用豆包二遍识别模式以提升实时性和准确率
- 切换到 bigmodel_async endpoint 并启用 enable_nonstream
- 第一遍流式识别提供实时文字预览
- VAD 分句后自动触发第二遍非流式识别提升准确率
- 修改文本处理逻辑从累加改为替换(适配 full 模式)
- 统一配置字段命名:app_key → app_id, access_key → access_token
This commit is contained in:
@@ -3,8 +3,8 @@
|
|||||||
|
|
||||||
# 火山引擎豆包 ASR 配置
|
# 火山引擎豆包 ASR 配置
|
||||||
doubao:
|
doubao:
|
||||||
app_key: "" # env: DOUBAO_APP_KEY
|
app_id: "" # env: DOUBAO_APP_ID
|
||||||
access_key: "" # env: DOUBAO_ACCESS_KEY
|
access_token: "" # env: DOUBAO_ACCESS_TOKEN
|
||||||
resource_id: "volc.seedasr.sauc.duration" # env: DOUBAO_RESOURCE_ID
|
resource_id: "volc.seedasr.sauc.duration" # env: DOUBAO_RESOURCE_ID
|
||||||
|
|
||||||
# 服务配置
|
# 服务配置
|
||||||
|
|||||||
@@ -13,15 +13,15 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream"
|
doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_async"
|
||||||
writeTimeout = 10 * time.Second
|
writeTimeout = 10 * time.Second
|
||||||
readTimeout = 30 * time.Second
|
readTimeout = 30 * time.Second
|
||||||
)
|
)
|
||||||
|
|
||||||
// Config holds Doubao ASR connection parameters.
|
// Config holds Doubao ASR connection parameters.
|
||||||
type Config struct {
|
type Config struct {
|
||||||
AppKey string
|
AppID string
|
||||||
AccessKey string
|
AccessToken string
|
||||||
ResourceID string
|
ResourceID string
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -39,8 +39,8 @@ type Client struct {
|
|||||||
func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) {
|
func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) {
|
||||||
connID := uuid.New().String()
|
connID := uuid.New().String()
|
||||||
headers := http.Header{
|
headers := http.Header{
|
||||||
"X-Api-App-Key": {cfg.AppKey},
|
"X-Api-App-Key": {cfg.AppID},
|
||||||
"X-Api-Access-Key": {cfg.AccessKey},
|
"X-Api-Access-Key": {cfg.AccessToken},
|
||||||
"X-Api-Resource-Id": {cfg.ResourceID},
|
"X-Api-Resource-Id": {cfg.ResourceID},
|
||||||
"X-Api-Connect-Id": {connID},
|
"X-Api-Connect-Id": {connID},
|
||||||
}
|
}
|
||||||
@@ -72,9 +72,10 @@ func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) {
|
|||||||
EnableITN: true,
|
EnableITN: true,
|
||||||
EnablePUNC: true,
|
EnablePUNC: true,
|
||||||
EnableDDC: true,
|
EnableDDC: true,
|
||||||
ShowUtterances: false,
|
ShowUtterances: true,
|
||||||
ResultType: "single",
|
ResultType: "full",
|
||||||
EndWindowSize: 2000,
|
EnableNonstream: true,
|
||||||
|
EndWindowSize: 800,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
data, err := EncodeFullClientRequest(req)
|
data, err := EncodeFullClientRequest(req)
|
||||||
@@ -132,13 +133,13 @@ func (c *Client) readLoop(resultCh chan<- wsMsg.ServerMsg) {
|
|||||||
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgError, Message: resp.ErrMsg}
|
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgError, Message: resp.ErrMsg}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// nostream mode: may return intermediate results every ~15s
|
// bigmodel_async with enable_nonstream: returns both streaming (partial) and definite (final) results
|
||||||
text := resp.Text
|
text := resp.Text
|
||||||
if text != "" {
|
if text != "" {
|
||||||
if resp.IsLast {
|
if resp.IsLast {
|
||||||
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgFinal, Text: text}
|
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgFinal, Text: text}
|
||||||
} else {
|
} else {
|
||||||
// Intermediate result (>15s audio) — preview only, don't paste
|
// Intermediate streaming result (first pass) — preview only
|
||||||
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgPartial, Text: text}
|
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgPartial, Text: text}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -110,6 +110,7 @@ type RequestMeta struct {
|
|||||||
EnableDDC bool `json:"enable_ddc"`
|
EnableDDC bool `json:"enable_ddc"`
|
||||||
ShowUtterances bool `json:"show_utterances"`
|
ShowUtterances bool `json:"show_utterances"`
|
||||||
ResultType string `json:"result_type,omitempty"`
|
ResultType string `json:"result_type,omitempty"`
|
||||||
|
EnableNonstream bool `json:"enable_nonstream,omitempty"`
|
||||||
EndWindowSize int `json:"end_window_size,omitempty"`
|
EndWindowSize int `json:"end_window_size,omitempty"`
|
||||||
}
|
}
|
||||||
// EncodeFullClientRequest builds the binary message for the initial handshake.
|
// EncodeFullClientRequest builds the binary message for the initial handshake.
|
||||||
|
|||||||
@@ -6,8 +6,8 @@ import (
|
|||||||
|
|
||||||
// DoubaoConfig holds 火山引擎豆包 ASR credentials.
|
// DoubaoConfig holds 火山引擎豆包 ASR credentials.
|
||||||
type DoubaoConfig struct {
|
type DoubaoConfig struct {
|
||||||
AppKey string `yaml:"app_key"`
|
AppID string `yaml:"app_id"`
|
||||||
AccessKey string `yaml:"access_key"`
|
AccessToken string `yaml:"access_token"`
|
||||||
ResourceID string `yaml:"resource_id"`
|
ResourceID string `yaml:"resource_id"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -41,11 +41,11 @@ func Load(configPath string) (Config, error) {
|
|||||||
|
|
||||||
// applyEnv overrides config fields with environment variables.
|
// applyEnv overrides config fields with environment variables.
|
||||||
func applyEnv(cfg *Config) {
|
func applyEnv(cfg *Config) {
|
||||||
if v := os.Getenv("DOUBAO_APP_KEY"); v != "" {
|
if v := os.Getenv("DOUBAO_APP_ID"); v != "" {
|
||||||
cfg.Doubao.AppKey = v
|
cfg.Doubao.AppID = v
|
||||||
}
|
}
|
||||||
if v := os.Getenv("DOUBAO_ACCESS_KEY"); v != "" {
|
if v := os.Getenv("DOUBAO_ACCESS_TOKEN"); v != "" {
|
||||||
cfg.Doubao.AccessKey = v
|
cfg.Doubao.AccessToken = v
|
||||||
}
|
}
|
||||||
if v := os.Getenv("DOUBAO_RESOURCE_ID"); v != "" {
|
if v := os.Getenv("DOUBAO_RESOURCE_ID"); v != "" {
|
||||||
cfg.Doubao.ResourceID = v
|
cfg.Doubao.ResourceID = v
|
||||||
@@ -62,11 +62,11 @@ func applyEnv(cfg *Config) {
|
|||||||
|
|
||||||
// validate checks required fields.
|
// validate checks required fields.
|
||||||
func validate(cfg Config) error {
|
func validate(cfg Config) error {
|
||||||
if cfg.Doubao.AppKey == "" {
|
if cfg.Doubao.AppID == "" {
|
||||||
return fmt.Errorf("doubao.app_key is required (set DOUBAO_APP_KEY or config.yaml)")
|
return fmt.Errorf("doubao.app_id is required (set DOUBAO_APP_ID or config.yaml)")
|
||||||
}
|
}
|
||||||
if cfg.Doubao.AccessKey == "" {
|
if cfg.Doubao.AccessToken == "" {
|
||||||
return fmt.Errorf("doubao.access_key is required (set DOUBAO_ACCESS_KEY or config.yaml)")
|
return fmt.Errorf("doubao.access_token is required (set DOUBAO_ACCESS_TOKEN or config.yaml)")
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -62,21 +62,21 @@ func (h *Handler) handleConn(c *websocket.Conn) {
|
|||||||
defer close(resultCh)
|
defer close(resultCh)
|
||||||
|
|
||||||
// Writer goroutine: single writer to avoid concurrent writes
|
// Writer goroutine: single writer to avoid concurrent writes
|
||||||
// Accumulates all result texts; paste is triggered by stop, not by ASR final.
|
// bigmodel_async with enable_nonstream: server returns full text each time (not incremental)
|
||||||
|
// We replace preview text on each update instead of accumulating.
|
||||||
var wg sync.WaitGroup
|
var wg sync.WaitGroup
|
||||||
var accMu sync.Mutex
|
var previewMu sync.Mutex
|
||||||
var accText string
|
var previewText string
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go func() {
|
go func() {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
for msg := range resultCh {
|
for msg := range resultCh {
|
||||||
// Accumulate text from both partial and final results
|
// Replace preview text with latest result (full mode)
|
||||||
if msg.Type == MsgPartial || msg.Type == MsgFinal {
|
if msg.Type == MsgPartial || msg.Type == MsgFinal {
|
||||||
accMu.Lock()
|
previewMu.Lock()
|
||||||
accText += msg.Text
|
previewText = msg.Text
|
||||||
// Send accumulated preview to phone
|
preview := ServerMsg{Type: msg.Type, Text: previewText}
|
||||||
preview := ServerMsg{Type: msg.Type, Text: accText}
|
previewMu.Unlock()
|
||||||
accMu.Unlock()
|
|
||||||
if err := c.WriteMessage(websocket.TextMessage, preview.Bytes()); err != nil {
|
if err := c.WriteMessage(websocket.TextMessage, preview.Bytes()); err != nil {
|
||||||
log.Warn("ws write error", "err", err)
|
log.Warn("ws write error", "err", err)
|
||||||
return
|
return
|
||||||
@@ -128,10 +128,10 @@ func (h *Handler) handleConn(c *websocket.Conn) {
|
|||||||
if active {
|
if active {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// Reset accumulated text for new session
|
// Reset preview text for new session
|
||||||
accMu.Lock()
|
previewMu.Lock()
|
||||||
accText = ""
|
previewText = ""
|
||||||
accMu.Unlock()
|
previewMu.Unlock()
|
||||||
sa, cl, err := h.asrFactory(resultCh)
|
sa, cl, err := h.asrFactory(resultCh)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("asr start failed", "err", err)
|
log.Error("asr start failed", "err", err)
|
||||||
@@ -154,11 +154,11 @@ func (h *Handler) handleConn(c *websocket.Conn) {
|
|||||||
}
|
}
|
||||||
sendAudio = nil
|
sendAudio = nil
|
||||||
active = false
|
active = false
|
||||||
// Now paste the accumulated text
|
// Paste the final preview text
|
||||||
accMu.Lock()
|
previewMu.Lock()
|
||||||
finalText := accText
|
finalText := previewText
|
||||||
accText = ""
|
previewText = ""
|
||||||
accMu.Unlock()
|
previewMu.Unlock()
|
||||||
if finalText != "" && h.pasteFunc != nil {
|
if finalText != "" && h.pasteFunc != nil {
|
||||||
if err := h.pasteFunc(finalText); err != nil {
|
if err := h.pasteFunc(finalText); err != nil {
|
||||||
log.Error("auto-paste failed", "err", err)
|
log.Error("auto-paste failed", "err", err)
|
||||||
|
|||||||
4
main.go
4
main.go
@@ -110,8 +110,8 @@ func main() {
|
|||||||
srv := server.New(token, lanIP, webContent, serverTLSCfg)
|
srv := server.New(token, lanIP, webContent, serverTLSCfg)
|
||||||
// Build ASR factory from config
|
// Build ASR factory from config
|
||||||
asrCfg := asr.Config{
|
asrCfg := asr.Config{
|
||||||
AppKey: cfg.Doubao.AppKey,
|
AppID: cfg.Doubao.AppID,
|
||||||
AccessKey: cfg.Doubao.AccessKey,
|
AccessToken: cfg.Doubao.AccessToken,
|
||||||
ResourceID: cfg.Doubao.ResourceID,
|
ResourceID: cfg.Doubao.ResourceID,
|
||||||
}
|
}
|
||||||
asrFactory := func(resultCh chan<- ws.ServerMsg) (func([]byte), func(), error) {
|
asrFactory := func(resultCh chan<- ws.ServerMsg) (func([]byte), func(), error) {
|
||||||
|
|||||||
Reference in New Issue
Block a user