From 350e405facb029f740cd354827538205b92d5660 Mon Sep 17 00:00:00 2001 From: imbytecat Date: Sun, 1 Mar 2026 06:12:58 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20ASR=20=E4=BB=8E=E5=8F=8C=E5=90=91?= =?UTF-8?q?=E6=B5=81=E5=BC=8F=E5=88=87=E6=8D=A2=E4=B8=BA=E6=B5=81=E5=BC=8F?= =?UTF-8?q?=E8=BE=93=E5=85=A5=E6=A8=A1=E5=BC=8F=EF=BC=88bigmodel=5Fnostrea?= =?UTF-8?q?m=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - endpoint 从 bigmodel_async 改为 bigmodel_nostream - 二进制协议去掉 sequence 字段,初始请求和音频帧均不带序号 - 最后一帧使用 FlagLastNoSeq 标志 - RequestMeta 新增 result_type=single、end_window_size=400 - ShowUtterances 关闭(nostream 模式不需要) - readLoop 简化:nostream 模式下直接返回 final 结果 --- internal/asr/client.go | 28 ++++++++-------------------- internal/asr/protocol.go | 20 ++++++++++---------- 2 files changed, 18 insertions(+), 30 deletions(-) diff --git a/internal/asr/client.go b/internal/asr/client.go index baf763c..13e26dc 100644 --- a/internal/asr/client.go +++ b/internal/asr/client.go @@ -5,7 +5,6 @@ import ( "log/slog" "net/http" "sync" - "sync/atomic" "time" "github.com/fasthttp/websocket" @@ -14,7 +13,7 @@ import ( ) const ( - doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_async" + doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream" writeTimeout = 10 * time.Second readTimeout = 30 * time.Second ) @@ -30,7 +29,6 @@ type Config struct { type Client struct { cfg Config conn *websocket.Conn - seq atomic.Int32 mu sync.Mutex closed bool closeCh chan struct{} @@ -74,11 +72,12 @@ func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) { EnableITN: true, EnablePUNC: true, EnableDDC: true, - ShowUtterances: true, + ShowUtterances: false, + ResultType: "single", + EndWindowSize: 400, }, } - c.seq.Store(1) - data, err := EncodeFullClientRequest(req, c.seq.Load()) + data, err := EncodeFullClientRequest(req) if err != nil { conn.Close() return nil, fmt.Errorf("encode full request: %w", err) @@ -100,8 +99,7 @@ func (c *Client) SendAudio(pcm []byte, last bool) error { if c.closed { return fmt.Errorf("client closed") } - seq := c.seq.Add(1) - data, err := EncodeAudioFrame(seq, pcm, last) + data, err := EncodeAudioFrame(pcm, last) if err != nil { return fmt.Errorf("encode audio: %w", err) } @@ -134,20 +132,10 @@ func (c *Client) readLoop(resultCh chan<- wsMsg.ServerMsg) { resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgError, Message: resp.ErrMsg} return } - // Determine if this is a final result by checking utterances - isFinal := false + // nostream mode: result comes after last audio packet or >15s text := resp.Text - for _, u := range resp.Utterances { - if u.Definite { - isFinal = true - text = u.Text - break - } - } - if isFinal { + if text != "" { resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgFinal, Text: text} - } else if text != "" { - resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgPartial, Text: text} } if resp.IsLast { return diff --git a/internal/asr/protocol.go b/internal/asr/protocol.go index cd6a340..4bb9bb1 100644 --- a/internal/asr/protocol.go +++ b/internal/asr/protocol.go @@ -109,9 +109,12 @@ type RequestMeta struct { EnablePUNC bool `json:"enable_punc"` EnableDDC bool `json:"enable_ddc"` ShowUtterances bool `json:"show_utterances"` + ResultType string `json:"result_type,omitempty"` + EndWindowSize int `json:"end_window_size,omitempty"` } // EncodeFullClientRequest builds the binary message for the initial handshake. -func EncodeFullClientRequest(req *FullClientRequest, seq int32) ([]byte, error) { +// nostream mode: header(4) + payload_size(4) + gzip(json) + func EncodeFullClientRequest(req *FullClientRequest) ([]byte, error) { payloadJSON, err := json.Marshal(req) if err != nil { return nil, fmt.Errorf("marshal request: %w", err) @@ -121,20 +124,18 @@ func EncodeFullClientRequest(req *FullClientRequest, seq int32) ([]byte, error) return nil, fmt.Errorf("gzip compress: %w", err) } var buf bytes.Buffer - buf.Write(encodeHeader(MsgFullClientRequest, FlagPosSeq, SerJSON, CompGzip)) - _ = binary.Write(&buf, binary.BigEndian, seq) + buf.Write(encodeHeader(MsgFullClientRequest, FlagNoSeq, SerJSON, CompGzip)) _ = binary.Write(&buf, binary.BigEndian, int32(len(compressed))) buf.Write(compressed) return buf.Bytes(), nil } // EncodeAudioFrame builds a binary audio-only request. -// If last is true, seq is sent as negative to signal end of stream. -func EncodeAudioFrame(seq int32, pcm []byte, last bool) ([]byte, error) { - flags := FlagPosSeq - wireSeq := seq +// nostream mode: header(4) + payload_size(4) + gzip(pcm) +// last=true sets FlagLastNoSeq to signal end of stream. +func EncodeAudioFrame(pcm []byte, last bool) ([]byte, error) { + flags := FlagNoSeq if last { - flags = FlagNegSeq - wireSeq = -seq + flags = FlagLastNoSeq } compressed, err := gzipCompress(pcm) if err != nil { @@ -142,7 +143,6 @@ func EncodeAudioFrame(seq int32, pcm []byte, last bool) ([]byte, error) { } var buf bytes.Buffer buf.Write(encodeHeader(MsgAudioOnlyRequest, flags, SerNone, CompGzip)) - _ = binary.Write(&buf, binary.BigEndian, wireSeq) _ = binary.Write(&buf, binary.BigEndian, int32(len(compressed))) buf.Write(compressed) return buf.Bytes(), nil