- 移除本地热词列表配置,改为直接使用豆包控制台的热词表 ID - 删除 internal/asr/hotwords.go(不再需要本地解析) - 简化 client.go 逻辑,直接传递 boosting_table_id - 移除 protocol.go 中的 boosting_table_name 字段 - 更新配置示例,添加控制台链接说明 使用方法: 1. 在豆包控制台创建热词表:https://console.volcengine.com/speech/hotword 2. 复制热词表 ID 到 config.yaml 的 boosting_table_id 字段
175 lines
4.3 KiB
Go
175 lines
4.3 KiB
Go
package asr
|
||
|
||
import (
|
||
"fmt"
|
||
"log/slog"
|
||
"net/http"
|
||
"sync"
|
||
"time"
|
||
|
||
"github.com/fasthttp/websocket"
|
||
"github.com/google/uuid"
|
||
wsMsg "github.com/imbytecat/voicepaste/internal/ws"
|
||
)
|
||
|
||
const (
|
||
doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_async"
|
||
writeTimeout = 10 * time.Second
|
||
readTimeout = 30 * time.Second
|
||
)
|
||
|
||
// Config holds Doubao ASR connection parameters.
|
||
type Config struct {
|
||
AppID string
|
||
AccessToken string
|
||
ResourceID string
|
||
BoostingTableID string // 热词表 ID(从控制台创建)
|
||
}
|
||
|
||
// Client manages a single ASR session with Doubao.
|
||
type Client struct {
|
||
cfg Config
|
||
conn *websocket.Conn
|
||
mu sync.Mutex
|
||
closed bool
|
||
closeCh chan struct{}
|
||
log *slog.Logger
|
||
}
|
||
// Dial connects to Doubao ASR and sends the initial FullClientRequest.
|
||
// resultCh receives partial/final results. Caller must call Close() when done.
|
||
func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) {
|
||
connID := uuid.New().String()
|
||
headers := http.Header{
|
||
"X-Api-App-Key": {cfg.AppID},
|
||
"X-Api-Access-Key": {cfg.AccessToken},
|
||
"X-Api-Resource-Id": {cfg.ResourceID},
|
||
"X-Api-Connect-Id": {connID},
|
||
}
|
||
dialer := websocket.Dialer{
|
||
HandshakeTimeout: 10 * time.Second,
|
||
}
|
||
conn, _, err := dialer.Dial(doubaoEndpoint, headers)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("dial doubao: %w", err)
|
||
}
|
||
c := &Client{
|
||
cfg: cfg,
|
||
conn: conn,
|
||
closeCh: make(chan struct{}),
|
||
log: slog.With("conn_id", connID),
|
||
}
|
||
// Send FullClientRequest
|
||
req := &FullClientRequest{
|
||
User: UserMeta{UID: connID},
|
||
Audio: AudioMeta{
|
||
Format: "pcm",
|
||
Codec: "raw",
|
||
Rate: 16000,
|
||
Bits: 16,
|
||
Channel: 1,
|
||
},
|
||
Request: RequestMeta{
|
||
ModelName: "seedasr-2.0",
|
||
EnableITN: true,
|
||
EnablePUNC: true,
|
||
EnableDDC: true,
|
||
ShowUtterances: true,
|
||
ResultType: "full",
|
||
EnableNonstream: true,
|
||
EndWindowSize: 800,
|
||
BoostingTableID: cfg.BoostingTableID,
|
||
},
|
||
}
|
||
data, err := EncodeFullClientRequest(req)
|
||
if err != nil {
|
||
conn.Close()
|
||
return nil, fmt.Errorf("encode full request: %w", err)
|
||
}
|
||
_ = conn.SetWriteDeadline(time.Now().Add(writeTimeout))
|
||
if err := conn.WriteMessage(websocket.BinaryMessage, data); err != nil {
|
||
conn.Close()
|
||
return nil, fmt.Errorf("send full request: %w", err)
|
||
}
|
||
// Start read loop
|
||
go c.readLoop(resultCh)
|
||
return c, nil
|
||
}
|
||
|
||
// SendAudio sends a PCM audio frame to Doubao.
|
||
func (c *Client) SendAudio(pcm []byte, last bool) error {
|
||
c.mu.Lock()
|
||
defer c.mu.Unlock()
|
||
if c.closed {
|
||
return fmt.Errorf("client closed")
|
||
}
|
||
data, err := EncodeAudioFrame(pcm, last)
|
||
if err != nil {
|
||
return fmt.Errorf("encode audio: %w", err)
|
||
}
|
||
_ = c.conn.SetWriteDeadline(time.Now().Add(writeTimeout))
|
||
return c.conn.WriteMessage(websocket.BinaryMessage, data)
|
||
}
|
||
// readLoop reads server responses and forwards them to resultCh.
|
||
func (c *Client) readLoop(resultCh chan<- wsMsg.ServerMsg) {
|
||
defer func() {
|
||
c.conn.Close()
|
||
c.mu.Lock()
|
||
c.closed = true
|
||
c.mu.Unlock()
|
||
close(c.closeCh)
|
||
}()
|
||
for {
|
||
_ = c.conn.SetReadDeadline(time.Now().Add(readTimeout))
|
||
_, data, err := c.conn.ReadMessage()
|
||
if err != nil {
|
||
c.log.Debug("asr read done", "err", err)
|
||
return
|
||
}
|
||
resp, err := ParseResponse(data)
|
||
if err != nil {
|
||
c.log.Warn("parse asr response", "err", err)
|
||
continue
|
||
}
|
||
if resp.Code != 0 {
|
||
c.log.Error("asr error", "code", resp.Code, "msg", resp.ErrMsg)
|
||
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgError, Message: resp.ErrMsg}
|
||
return
|
||
}
|
||
// bigmodel_async with enable_nonstream: returns both streaming (partial) and definite (final) results
|
||
text := resp.Text
|
||
if text != "" {
|
||
if resp.IsLast {
|
||
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgFinal, Text: text}
|
||
} else {
|
||
// Intermediate streaming result (first pass) — preview only
|
||
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgPartial, Text: text}
|
||
}
|
||
}
|
||
if resp.IsLast {
|
||
return
|
||
}
|
||
}
|
||
}
|
||
// Finish sends the last audio frame and waits for ASR to return final results.
|
||
func (c *Client) Finish() {
|
||
c.mu.Lock()
|
||
if c.closed {
|
||
c.mu.Unlock()
|
||
return
|
||
}
|
||
c.mu.Unlock()
|
||
_ = c.SendAudio(nil, true)
|
||
<-c.closeCh
|
||
}
|
||
|
||
// Close forcefully shuts down the ASR connection.
|
||
func (c *Client) Close() {
|
||
c.mu.Lock()
|
||
if c.closed {
|
||
c.mu.Unlock()
|
||
return
|
||
}
|
||
c.conn.Close()
|
||
c.mu.Unlock()
|
||
<-c.closeCh
|
||
} |