feat: ASR 从双向流式切换为流式输入模式(bigmodel_nostream)

- endpoint 从 bigmodel_async 改为 bigmodel_nostream
- 二进制协议去掉 sequence 字段,初始请求和音频帧均不带序号
- 最后一帧使用 FlagLastNoSeq 标志
- RequestMeta 新增 result_type=single、end_window_size=400
- ShowUtterances 关闭(nostream 模式不需要)
- readLoop 简化:不再通过 Utterances 的 Definite 字段判断是否为最终结果,也不再发送 MsgPartial;nostream 模式下只要返回文本非空,就直接作为 MsgFinal 结果返回
This commit is contained in:
2026-03-01 06:12:58 +08:00
parent ce1ff2d04d
commit 350e405fac
2 changed files with 18 additions and 30 deletions

View File

@@ -5,7 +5,6 @@ import (
"log/slog" "log/slog"
"net/http" "net/http"
"sync" "sync"
"sync/atomic"
"time" "time"
"github.com/fasthttp/websocket" "github.com/fasthttp/websocket"
@@ -14,7 +13,7 @@ import (
) )
const ( const (
doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_async" doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream"
writeTimeout = 10 * time.Second writeTimeout = 10 * time.Second
readTimeout = 30 * time.Second readTimeout = 30 * time.Second
) )
@@ -30,7 +29,6 @@ type Config struct {
type Client struct { type Client struct {
cfg Config cfg Config
conn *websocket.Conn conn *websocket.Conn
seq atomic.Int32
mu sync.Mutex mu sync.Mutex
closed bool closed bool
closeCh chan struct{} closeCh chan struct{}
@@ -74,11 +72,12 @@ func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) {
EnableITN: true, EnableITN: true,
EnablePUNC: true, EnablePUNC: true,
EnableDDC: true, EnableDDC: true,
ShowUtterances: true, ShowUtterances: false,
ResultType: "single",
EndWindowSize: 400,
}, },
} }
c.seq.Store(1) data, err := EncodeFullClientRequest(req)
data, err := EncodeFullClientRequest(req, c.seq.Load())
if err != nil { if err != nil {
conn.Close() conn.Close()
return nil, fmt.Errorf("encode full request: %w", err) return nil, fmt.Errorf("encode full request: %w", err)
@@ -100,8 +99,7 @@ func (c *Client) SendAudio(pcm []byte, last bool) error {
if c.closed { if c.closed {
return fmt.Errorf("client closed") return fmt.Errorf("client closed")
} }
seq := c.seq.Add(1) data, err := EncodeAudioFrame(pcm, last)
data, err := EncodeAudioFrame(seq, pcm, last)
if err != nil { if err != nil {
return fmt.Errorf("encode audio: %w", err) return fmt.Errorf("encode audio: %w", err)
} }
@@ -134,20 +132,10 @@ func (c *Client) readLoop(resultCh chan<- wsMsg.ServerMsg) {
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgError, Message: resp.ErrMsg} resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgError, Message: resp.ErrMsg}
return return
} }
// Determine if this is a final result by checking utterances // nostream mode: result comes after last audio packet or >15s
isFinal := false
text := resp.Text text := resp.Text
for _, u := range resp.Utterances { if text != "" {
if u.Definite {
isFinal = true
text = u.Text
break
}
}
if isFinal {
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgFinal, Text: text} resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgFinal, Text: text}
} else if text != "" {
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgPartial, Text: text}
} }
if resp.IsLast { if resp.IsLast {
return return

View File

@@ -109,9 +109,12 @@ type RequestMeta struct {
EnablePUNC bool `json:"enable_punc"` EnablePUNC bool `json:"enable_punc"`
EnableDDC bool `json:"enable_ddc"` EnableDDC bool `json:"enable_ddc"`
ShowUtterances bool `json:"show_utterances"` ShowUtterances bool `json:"show_utterances"`
ResultType string `json:"result_type,omitempty"`
EndWindowSize int `json:"end_window_size,omitempty"`
} }
// EncodeFullClientRequest builds the binary message for the initial handshake. // EncodeFullClientRequest builds the binary message for the initial handshake.
func EncodeFullClientRequest(req *FullClientRequest, seq int32) ([]byte, error) { // nostream mode: header(4) + payload_size(4) + gzip(json)
func EncodeFullClientRequest(req *FullClientRequest) ([]byte, error) {
payloadJSON, err := json.Marshal(req) payloadJSON, err := json.Marshal(req)
if err != nil { if err != nil {
return nil, fmt.Errorf("marshal request: %w", err) return nil, fmt.Errorf("marshal request: %w", err)
@@ -121,20 +124,18 @@ func EncodeFullClientRequest(req *FullClientRequest, seq int32) ([]byte, error)
return nil, fmt.Errorf("gzip compress: %w", err) return nil, fmt.Errorf("gzip compress: %w", err)
} }
var buf bytes.Buffer var buf bytes.Buffer
buf.Write(encodeHeader(MsgFullClientRequest, FlagPosSeq, SerJSON, CompGzip)) buf.Write(encodeHeader(MsgFullClientRequest, FlagNoSeq, SerJSON, CompGzip))
_ = binary.Write(&buf, binary.BigEndian, seq)
_ = binary.Write(&buf, binary.BigEndian, int32(len(compressed))) _ = binary.Write(&buf, binary.BigEndian, int32(len(compressed)))
buf.Write(compressed) buf.Write(compressed)
return buf.Bytes(), nil return buf.Bytes(), nil
} }
// EncodeAudioFrame builds a binary audio-only request. // EncodeAudioFrame builds a binary audio-only request.
// If last is true, seq is sent as negative to signal end of stream. // nostream mode: header(4) + payload_size(4) + gzip(pcm)
func EncodeAudioFrame(seq int32, pcm []byte, last bool) ([]byte, error) { // last=true sets FlagLastNoSeq to signal end of stream.
flags := FlagPosSeq func EncodeAudioFrame(pcm []byte, last bool) ([]byte, error) {
wireSeq := seq flags := FlagNoSeq
if last { if last {
flags = FlagNegSeq flags = FlagLastNoSeq
wireSeq = -seq
} }
compressed, err := gzipCompress(pcm) compressed, err := gzipCompress(pcm)
if err != nil { if err != nil {
@@ -142,7 +143,6 @@ func EncodeAudioFrame(seq int32, pcm []byte, last bool) ([]byte, error) {
} }
var buf bytes.Buffer var buf bytes.Buffer
buf.Write(encodeHeader(MsgAudioOnlyRequest, flags, SerNone, CompGzip)) buf.Write(encodeHeader(MsgAudioOnlyRequest, flags, SerNone, CompGzip))
_ = binary.Write(&buf, binary.BigEndian, wireSeq)
_ = binary.Write(&buf, binary.BigEndian, int32(len(compressed))) _ = binary.Write(&buf, binary.BigEndian, int32(len(compressed)))
buf.Write(compressed) buf.Write(compressed)
return buf.Bytes(), nil return buf.Bytes(), nil