feat: ASR 从双向流式切换为流式输入模式(bigmodel_nostream)
- endpoint 从 bigmodel_async 改为 bigmodel_nostream
- 二进制协议去掉 sequence 字段,初始请求和音频帧均不带序号
- 最后一帧使用 FlagLastNoSeq 标志
- RequestMeta 新增 result_type=single、end_window_size=400
- ShowUtterances 关闭(nostream 模式不需要)
- readLoop 简化:nostream 模式下直接返回 final 结果
This commit is contained in:
@@ -5,7 +5,6 @@ import (
|
|||||||
"log/slog"
|
"log/slog"
|
||||||
"net/http"
|
"net/http"
|
||||||
"sync"
|
"sync"
|
||||||
"sync/atomic"
|
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/fasthttp/websocket"
|
"github.com/fasthttp/websocket"
|
||||||
@@ -14,7 +13,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_async"
|
doubaoEndpoint = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream"
|
||||||
writeTimeout = 10 * time.Second
|
writeTimeout = 10 * time.Second
|
||||||
readTimeout = 30 * time.Second
|
readTimeout = 30 * time.Second
|
||||||
)
|
)
|
||||||
@@ -30,7 +29,6 @@ type Config struct {
|
|||||||
type Client struct {
|
type Client struct {
|
||||||
cfg Config
|
cfg Config
|
||||||
conn *websocket.Conn
|
conn *websocket.Conn
|
||||||
seq atomic.Int32
|
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
closed bool
|
closed bool
|
||||||
closeCh chan struct{}
|
closeCh chan struct{}
|
||||||
@@ -74,11 +72,12 @@ func Dial(cfg Config, resultCh chan<- wsMsg.ServerMsg) (*Client, error) {
|
|||||||
EnableITN: true,
|
EnableITN: true,
|
||||||
EnablePUNC: true,
|
EnablePUNC: true,
|
||||||
EnableDDC: true,
|
EnableDDC: true,
|
||||||
ShowUtterances: true,
|
ShowUtterances: false,
|
||||||
|
ResultType: "single",
|
||||||
|
EndWindowSize: 400,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
c.seq.Store(1)
|
data, err := EncodeFullClientRequest(req)
|
||||||
data, err := EncodeFullClientRequest(req, c.seq.Load())
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
conn.Close()
|
conn.Close()
|
||||||
return nil, fmt.Errorf("encode full request: %w", err)
|
return nil, fmt.Errorf("encode full request: %w", err)
|
||||||
@@ -100,8 +99,7 @@ func (c *Client) SendAudio(pcm []byte, last bool) error {
|
|||||||
if c.closed {
|
if c.closed {
|
||||||
return fmt.Errorf("client closed")
|
return fmt.Errorf("client closed")
|
||||||
}
|
}
|
||||||
seq := c.seq.Add(1)
|
data, err := EncodeAudioFrame(pcm, last)
|
||||||
data, err := EncodeAudioFrame(seq, pcm, last)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("encode audio: %w", err)
|
return fmt.Errorf("encode audio: %w", err)
|
||||||
}
|
}
|
||||||
@@ -134,20 +132,10 @@ func (c *Client) readLoop(resultCh chan<- wsMsg.ServerMsg) {
|
|||||||
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgError, Message: resp.ErrMsg}
|
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgError, Message: resp.ErrMsg}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// Determine if this is a final result by checking utterances
|
// nostream mode: result comes after last audio packet or >15s
|
||||||
isFinal := false
|
|
||||||
text := resp.Text
|
text := resp.Text
|
||||||
for _, u := range resp.Utterances {
|
if text != "" {
|
||||||
if u.Definite {
|
|
||||||
isFinal = true
|
|
||||||
text = u.Text
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if isFinal {
|
|
||||||
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgFinal, Text: text}
|
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgFinal, Text: text}
|
||||||
} else if text != "" {
|
|
||||||
resultCh <- wsMsg.ServerMsg{Type: wsMsg.MsgPartial, Text: text}
|
|
||||||
}
|
}
|
||||||
if resp.IsLast {
|
if resp.IsLast {
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -109,9 +109,12 @@ type RequestMeta struct {
|
|||||||
EnablePUNC bool `json:"enable_punc"`
|
EnablePUNC bool `json:"enable_punc"`
|
||||||
EnableDDC bool `json:"enable_ddc"`
|
EnableDDC bool `json:"enable_ddc"`
|
||||||
ShowUtterances bool `json:"show_utterances"`
|
ShowUtterances bool `json:"show_utterances"`
|
||||||
|
ResultType string `json:"result_type,omitempty"`
|
||||||
|
EndWindowSize int `json:"end_window_size,omitempty"`
|
||||||
}
|
}
|
||||||
// EncodeFullClientRequest builds the binary message for the initial handshake.
|
// EncodeFullClientRequest builds the binary message for the initial handshake.
|
||||||
func EncodeFullClientRequest(req *FullClientRequest, seq int32) ([]byte, error) {
|
// nostream mode: header(4) + payload_size(4) + gzip(json)
|
||||||
|
func EncodeFullClientRequest(req *FullClientRequest) ([]byte, error) {
|
||||||
payloadJSON, err := json.Marshal(req)
|
payloadJSON, err := json.Marshal(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("marshal request: %w", err)
|
return nil, fmt.Errorf("marshal request: %w", err)
|
||||||
@@ -121,20 +124,18 @@ func EncodeFullClientRequest(req *FullClientRequest, seq int32) ([]byte, error)
|
|||||||
return nil, fmt.Errorf("gzip compress: %w", err)
|
return nil, fmt.Errorf("gzip compress: %w", err)
|
||||||
}
|
}
|
||||||
var buf bytes.Buffer
|
var buf bytes.Buffer
|
||||||
buf.Write(encodeHeader(MsgFullClientRequest, FlagPosSeq, SerJSON, CompGzip))
|
buf.Write(encodeHeader(MsgFullClientRequest, FlagNoSeq, SerJSON, CompGzip))
|
||||||
_ = binary.Write(&buf, binary.BigEndian, seq)
|
|
||||||
_ = binary.Write(&buf, binary.BigEndian, int32(len(compressed)))
|
_ = binary.Write(&buf, binary.BigEndian, int32(len(compressed)))
|
||||||
buf.Write(compressed)
|
buf.Write(compressed)
|
||||||
return buf.Bytes(), nil
|
return buf.Bytes(), nil
|
||||||
}
|
}
|
||||||
// EncodeAudioFrame builds a binary audio-only request.
|
// EncodeAudioFrame builds a binary audio-only request.
|
||||||
// If last is true, seq is sent as negative to signal end of stream.
|
// nostream mode: header(4) + payload_size(4) + gzip(pcm)
|
||||||
func EncodeAudioFrame(seq int32, pcm []byte, last bool) ([]byte, error) {
|
// last=true sets FlagLastNoSeq to signal end of stream.
|
||||||
flags := FlagPosSeq
|
func EncodeAudioFrame(pcm []byte, last bool) ([]byte, error) {
|
||||||
wireSeq := seq
|
flags := FlagNoSeq
|
||||||
if last {
|
if last {
|
||||||
flags = FlagNegSeq
|
flags = FlagLastNoSeq
|
||||||
wireSeq = -seq
|
|
||||||
}
|
}
|
||||||
compressed, err := gzipCompress(pcm)
|
compressed, err := gzipCompress(pcm)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -142,7 +143,6 @@ func EncodeAudioFrame(seq int32, pcm []byte, last bool) ([]byte, error) {
|
|||||||
}
|
}
|
||||||
var buf bytes.Buffer
|
var buf bytes.Buffer
|
||||||
buf.Write(encodeHeader(MsgAudioOnlyRequest, flags, SerNone, CompGzip))
|
buf.Write(encodeHeader(MsgAudioOnlyRequest, flags, SerNone, CompGzip))
|
||||||
_ = binary.Write(&buf, binary.BigEndian, wireSeq)
|
|
||||||
_ = binary.Write(&buf, binary.BigEndian, int32(len(compressed)))
|
_ = binary.Write(&buf, binary.BigEndian, int32(len(compressed)))
|
||||||
buf.Write(compressed)
|
buf.Write(compressed)
|
||||||
return buf.Bytes(), nil
|
return buf.Bytes(), nil
|
||||||
|
|||||||
Reference in New Issue
Block a user