feat: ASR 从双向流式切换为流式输入模式(bigmodel_nostream)
- endpoint 从 bigmodel_async 改为 bigmodel_nostream
- 二进制协议去掉 sequence 字段,初始请求和音频帧均不带序号
- 最后一帧使用 FlagLastNoSeq 标志
- RequestMeta 新增 result_type=single、end_window_size=400
- ShowUtterances 关闭(nostream 模式不需要)
- readLoop 简化:nostream 模式下直接返回 final 结果
This commit is contained in:
@@ -5,7 +5,6 @@ import (
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/fasthttp/websocket"
|
||||
@@ -14,7 +13,7 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_async"
|
||||
doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream"
|
||||
writeTimeout = 10 * time.Second
|
||||
readTimeout = 30 * time.Second
|
||||
)
|
||||
@@ -30,7 +29,6 @@ type Config struct {
|
||||
type Client struct {
|
||||
cfg Config
|
||||
conn *websocket.Conn
|
||||
seq atomic.Int32
|
||||
mu sync.Mutex
|
||||
closed bool
|
||||
closeCh chan struct{}
|
||||
@@ -74,11 +72,12 @@ func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) {
|
||||
EnableITN: true,
|
||||
EnablePUNC: true,
|
||||
EnableDDC: true,
|
||||
ShowUtterances: true,
|
||||
ShowUtterances: false,
|
||||
ResultType: "single",
|
||||
EndWindowSize: 400,
|
||||
},
|
||||
}
|
||||
c.seq.Store(1)
|
||||
data, err := EncodeFullClientRequest(req, c.seq.Load())
|
||||
data, err := EncodeFullClientRequest(req)
|
||||
if err != nil {
|
||||
conn.Close()
|
||||
return nil, fmt.Errorf("encode full request: %w", err)
|
||||
@@ -100,8 +99,7 @@ func (c *Client) SendAudio(pcm []byte, last bool) error {
|
||||
if c.closed {
|
||||
return fmt.Errorf("client closed")
|
||||
}
|
||||
seq := c.seq.Add(1)
|
||||
data, err := EncodeAudioFrame(seq, pcm, last)
|
||||
data, err := EncodeAudioFrame(pcm, last)
|
||||
if err != nil {
|
||||
return fmt.Errorf("encode audio: %w", err)
|
||||
}
|
||||
@@ -134,20 +132,10 @@ func (c *Client) readLoop(resultCh chan<- wsMsg.ServerMsg) {
|
||||
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgError, Message: resp.ErrMsg}
|
||||
return
|
||||
}
|
||||
// Determine if this is a final result by checking utterances
|
||||
isFinal := false
|
||||
// nostream mode: result comes after last audio packet or >15s
|
||||
text := resp.Text
|
||||
for _, u := range resp.Utterances {
|
||||
if u.Definite {
|
||||
isFinal = true
|
||||
text = u.Text
|
||||
break
|
||||
}
|
||||
}
|
||||
if isFinal {
|
||||
if text != "" {
|
||||
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgFinal, Text: text}
|
||||
} else if text != "" {
|
||||
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgPartial, Text: text}
|
||||
}
|
||||
if resp.IsLast {
|
||||
return
|
||||
|
||||
@@ -109,9 +109,12 @@ type RequestMeta struct {
|
||||
EnablePUNC bool `json:"enable_punc"`
|
||||
EnableDDC bool `json:"enable_ddc"`
|
||||
ShowUtterances bool `json:"show_utterances"`
|
||||
ResultType string `json:"result_type,omitempty"`
|
||||
EndWindowSize int `json:"end_window_size,omitempty"`
|
||||
}
|
||||
// EncodeFullClientRequest builds the binary message for the initial handshake.
|
||||
func EncodeFullClientRequest(req *FullClientRequest, seq int32) ([]byte, error) {
|
||||
// nostream mode: header(4) + payload_size(4) + gzip(json)
|
||||
func EncodeFullClientRequest(req *FullClientRequest) ([]byte, error) {
|
||||
payloadJSON, err := json.Marshal(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("marshal request: %w", err)
|
||||
@@ -121,20 +124,18 @@ func EncodeFullClientRequest(req *FullClientRequest, seq int32) ([]byte, error)
|
||||
return nil, fmt.Errorf("gzip compress: %w", err)
|
||||
}
|
||||
var buf bytes.Buffer
|
||||
buf.Write(encodeHeader(MsgFullClientRequest, FlagPosSeq, SerJSON, CompGzip))
|
||||
_ = binary.Write(&buf, binary.BigEndian, seq)
|
||||
buf.Write(encodeHeader(MsgFullClientRequest, FlagNoSeq, SerJSON, CompGzip))
|
||||
_ = binary.Write(&buf, binary.BigEndian, int32(len(compressed)))
|
||||
buf.Write(compressed)
|
||||
return buf.Bytes(), nil
|
||||
}
|
||||
// EncodeAudioFrame builds a binary audio-only request.
|
||||
// If last is true, seq is sent as negative to signal end of stream.
|
||||
func EncodeAudioFrame(seq int32, pcm []byte, last bool) ([]byte, error) {
|
||||
flags := FlagPosSeq
|
||||
wireSeq := seq
|
||||
// nostream mode: header(4) + payload_size(4) + gzip(pcm)
|
||||
// last=true sets FlagLastNoSeq to signal end of stream.
|
||||
func EncodeAudioFrame(pcm []byte, last bool) ([]byte, error) {
|
||||
flags := FlagNoSeq
|
||||
if last {
|
||||
flags = FlagNegSeq
|
||||
wireSeq = -seq
|
||||
flags = FlagLastNoSeq
|
||||
}
|
||||
compressed, err := gzipCompress(pcm)
|
||||
if err != nil {
|
||||
@@ -142,7 +143,6 @@ func EncodeAudioFrame(seq int32, pcm []byte, last bool) ([]byte, error) {
|
||||
}
|
||||
var buf bytes.Buffer
|
||||
buf.Write(encodeHeader(MsgAudioOnlyRequest, flags, SerNone, CompGzip))
|
||||
_ = binary.Write(&buf, binary.BigEndian, wireSeq)
|
||||
_ = binary.Write(&buf, binary.BigEndian, int32(len(compressed)))
|
||||
buf.Write(compressed)
|
||||
return buf.Bytes(), nil
|
||||
|
||||
Reference in New Issue
Block a user