Files
voicepaste/internal/asr/protocol.go
imbytecat b786d9f90b feat: 实现本地热词管理,移除平台绑定
- 使用 corpus.context 参数直接传递热词列表(豆包文档支持)
- 移除 boosting_table_id 配置,避免绑定火山引擎控制台
- 实现 BuildHotwordsContext 函数,将本地热词转换为 JSON 格式
- 热词配置完全本地化,便于迁移到其他 ASR 平台

配置示例:
  hotwords:
    - 张三
    - 李四
    - VoicePaste

程序自动转换为豆包 API 要求的格式:
{"hotwords":[{"word":"张三"},{"word":"李四"},{"word":"VoicePaste"}]}
2026-03-02 01:36:14 +08:00

243 lines
6.4 KiB
Go

// Package asr implements the Doubao (豆包) Seed-ASR-2.0 streaming speech
// recognition client using the custom binary WebSocket protocol.
package asr
import (
"bytes"
"compress/gzip"
"encoding/binary"
"encoding/json"
"fmt"
"io"
)
// ── Protocol constants ──

// Values packed into byte 0 of every message header built by encodeHeader.
const (
	// protocolVersion fills the upper nibble of byte 0.
	protocolVersion = 0x01
	// headerSizeUnit fills the lower nibble of byte 0 and expresses the
	// header length in 4-byte units: 1 * 4 = 4 bytes.
	headerSizeUnit = 0x01
)
// MessageType identifies the kind of message. It is carried in the upper
// nibble of header byte 1.
type MessageType uint8

// Message types understood by this package, written as 4-bit nibbles.
const (
	// MsgFullClientRequest marks the initial JSON request sent by the client.
	MsgFullClientRequest MessageType = 0b0001
	// MsgAudioOnlyRequest marks a client message that carries audio data only.
	MsgAudioOnlyRequest MessageType = 0b0010
	// MsgFullServerResponse marks a server message with a JSON result payload.
	MsgFullServerResponse MessageType = 0b1001
	// MsgServerError marks a server message with an error code and message.
	MsgServerError MessageType = 0b1111
)
// MessageFlags carries the type-specific flags stored in the lower nibble of
// header byte 1.
type MessageFlags uint8

// Flag values, written as 4-bit nibbles.
const (
	// FlagNoSeq: no sequence number, not the last packet.
	FlagNoSeq MessageFlags = 0b0000
	// FlagPosSeq: a sequence number follows the header.
	FlagPosSeq MessageFlags = 0b0001
	// FlagLastNoSeq: last packet, no sequence number.
	FlagLastNoSeq MessageFlags = 0b0010
	// FlagNegSeq: last packet, with a sequence number.
	FlagNegSeq MessageFlags = 0b0011
)
// Payload serialization methods, stored in the upper nibble of header byte 2.
const (
	// SerNone: the payload is not serialized (raw bytes).
	SerNone uint8 = 0b0000
	// SerJSON: the payload is JSON.
	SerJSON uint8 = 0b0001
)
// Payload compression methods, stored in the lower nibble of header byte 2.
const (
	// CompNone: the payload is not compressed.
	CompNone uint8 = 0b0000
	// CompGzip: the payload is gzip-compressed.
	CompGzip uint8 = 0b0001
)
// ── Header encoding ──

// encodeHeader builds the fixed 4-byte message header:
//
//	byte 0: protocol version (upper nibble) | header size unit (lower nibble)
//	byte 1: message type (upper nibble)     | flags (lower nibble)
//	byte 2: serialization (upper nibble)    | compression (lower nibble)
//	byte 3: reserved, always zero
func encodeHeader(mt MessageType, flags MessageFlags, ser, comp uint8) []byte {
	hdr := make([]byte, 4)
	hdr[0] = protocolVersion<<4 | headerSizeUnit
	hdr[1] = uint8(mt)<<4 | uint8(flags)
	hdr[2] = ser<<4 | comp
	// hdr[3] is the reserved byte and keeps its zero value.
	return hdr
}
// ── Gzip helpers ──

// gzipCompress returns data as a complete gzip stream. Closing the writer
// flushes the remaining compressed bytes and the gzip trailer, so the result
// is only returned once Close has succeeded.
func gzipCompress(data []byte) ([]byte, error) {
	out := new(bytes.Buffer)
	zw := gzip.NewWriter(out)

	_, err := zw.Write(data)
	if err == nil {
		err = zw.Close()
	}
	if err != nil {
		return nil, err
	}
	return out.Bytes(), nil
}
// gzipDecompress inflates a gzip stream and returns the decompressed bytes.
// It fails when data does not start with a valid gzip header; an error hit
// while reading is returned together with whatever was read up to that point.
func gzipDecompress(data []byte) ([]byte, error) {
	zr, err := gzip.NewReader(bytes.NewReader(data))
	if err != nil {
		return nil, err
	}
	defer zr.Close()

	plain, err := io.ReadAll(zr)
	return plain, err
}
// ── Request builders ──

// FullClientRequest is the JSON document sent as the payload of the initial
// handshake message (see EncodeFullClientRequest). Field order here is the
// order in which the keys are marshalled.
type FullClientRequest struct {
	// User identifies the caller.
	User UserMeta `json:"user"`
	// Audio describes the audio stream that will follow.
	Audio AudioMeta `json:"audio"`
	// Request holds the recognition options.
	Request RequestMeta `json:"request"`
}
// UserMeta is the "user" section of FullClientRequest.
type UserMeta struct {
	// UID is sent as "uid".
	UID string `json:"uid"`
}
// AudioMeta is the "audio" section of FullClientRequest and describes the
// audio that the client is going to stream. All fields are always marshalled,
// including zero values.
// NOTE(review): accepted values and units are defined by the ASR service —
// confirm against its documentation.
type AudioMeta struct {
	Format  string `json:"format"`  // container format
	Codec   string `json:"codec"`   // audio codec
	Rate    int    `json:"rate"`    // sample rate
	Bits    int    `json:"bits"`    // bits per sample
	Channel int    `json:"channel"` // channel count
}
// Corpus carries hotword and context configuration for a request.
type Corpus struct {
	// Context holds the hotword list, passed through directly as a JSON
	// string, e.g. {"hotwords":[{"word":"VoicePaste"}]}. It is omitted from
	// the request when empty.
	Context string `json:"context,omitempty"`
}
// RequestMeta is the "request" section of FullClientRequest. The first five
// fields are always marshalled; the remaining ones are dropped from the JSON
// when they hold their zero value.
// NOTE(review): the meaning of each option is defined by the ASR service —
// confirm against its documentation.
type RequestMeta struct {
	ModelName      string `json:"model_name"`
	EnableITN      bool   `json:"enable_itn"`
	EnablePUNC     bool   `json:"enable_punc"`
	EnableDDC      bool   `json:"enable_ddc"`
	ShowUtterances bool   `json:"show_utterances"`

	ResultType      string `json:"result_type,omitempty"`
	EnableNonstream bool   `json:"enable_nonstream,omitempty"`
	EndWindowSize   int    `json:"end_window_size,omitempty"`
	// Corpus is the corpus / hotword configuration; nil leaves it out.
	Corpus *Corpus `json:"corpus,omitempty"`
}
// EncodeFullClientRequest serializes req into the binary message that opens
// a session.
//
// Wire layout (nostream mode): header(4) + payload_size(4) + gzip(json),
// where payload_size is the big-endian length of the compressed JSON.
//
// It returns an error when req cannot be marshalled to JSON or the JSON
// cannot be gzip-compressed.
func EncodeFullClientRequest(req *FullClientRequest) ([]byte, error) {
	raw, err := json.Marshal(req)
	if err != nil {
		return nil, fmt.Errorf("marshal request: %w", err)
	}

	body, err := gzipCompress(raw)
	if err != nil {
		return nil, fmt.Errorf("gzip compress: %w", err)
	}

	// Big-endian length prefix of the compressed payload.
	var size [4]byte
	binary.BigEndian.PutUint32(size[:], uint32(len(body)))

	frame := bytes.NewBuffer(make([]byte, 0, 8+len(body)))
	frame.Write(encodeHeader(MsgFullClientRequest, FlagNoSeq, SerJSON, CompGzip))
	frame.Write(size[:])
	frame.Write(body)
	return frame.Bytes(), nil
}
// EncodeAudioFrame builds a binary audio-only request.
// nostream mode: header(4) + payload_size(4) + gzip(pcm)
// last=true sets FlagLastNoSeq to signal end of stream.
func EncodeAudioFrame(pcm []byte, last bool) ([]byte, error) {
flags := FlagNoSeq
if last {
flags = FlagLastNoSeq
}
compressed, err := gzipCompress(pcm)
if err != nil {
return nil, fmt.Errorf("gzip audio: %w", err)
}
var buf bytes.Buffer
buf.Write(encodeHeader(MsgAudioOnlyRequest, flags, SerNone, CompGzip))
_ = binary.Write(&buf, binary.BigEndian, int32(len(compressed)))
buf.Write(compressed)
return buf.Bytes(), nil
}
// ── Response parsing ──

// ServerResponse is the decoded form of one server message, as produced by
// ParseResponse.
type ServerResponse struct {
	// Code is the error code taken from a MsgServerError message; it stays
	// zero for other message types.
	Code int
	// IsLast reports whether the message flags marked it as the last packet.
	IsLast bool
	// Text is the "result.text" value of a MsgFullServerResponse payload.
	Text string
	// Utterances is the "result.utterances" list of the same payload.
	Utterances []Utterance
	// ErrMsg is the message body of a MsgServerError message.
	ErrMsg string
}
// Utterance is one entry of the "utterances" list in a recognition result.
type Utterance struct {
	// Text is the recognized text of this utterance.
	Text string `json:"text"`
	// Definite mirrors the "definite" flag of the result.
	// NOTE(review): presumably true once the utterance is final — confirm
	// against the ASR service documentation.
	Definite bool `json:"definite"`
}
// responsePayload mirrors the JSON body of a MsgFullServerResponse message.
// Only the fields listed here are decoded; anything else in the document is
// ignored by encoding/json.
type responsePayload struct {
	// Result holds the recognition output copied into ServerResponse.
	Result struct {
		Text       string      `json:"text"`
		Utterances []Utterance `json:"utterances"`
	} `json:"result"`

	// AudioInfo is decoded but not surfaced by ParseResponse.
	// NOTE(review): the unit of Duration is not visible here — confirm
	// against the ASR service documentation.
	AudioInfo struct {
		Duration int `json:"duration"`
	} `json:"audio_info"`
}
// ParseResponse decodes one binary message received from the server.
//
// The fixed part of the header is read as follows:
//
//	byte 0, lower nibble: header size in 4-byte units
//	byte 1, upper nibble: message type
//	byte 1, lower nibble: message flags
//	byte 2, lower nibble: payload compression
//
// After the header, a 4-byte sequence number is skipped when flag bit 0 is
// set, and a 4-byte event field is skipped when flag bit 2 is set. What
// follows depends on the message type:
//
//   - MsgFullServerResponse: payload_size(4) + payload. The payload is
//     gunzipped when compression is CompGzip and decoded as JSON into Text
//     and Utterances.
//   - MsgServerError: error_code(4) + payload_size(4) + payload. The code is
//     stored in Code and the (optionally gunzipped) payload in ErrMsg. A
//     server-reported error is returned as a populated ServerResponse with a
//     nil Go error.
//
// Any other message type yields a response with only IsLast filled in.
// IsLast is set when the flags equal FlagNegSeq or FlagLastNoSeq.
//
// An error is returned when msg is shorter than 4 bytes, when the header
// size field is smaller than 4 bytes or larger than msg, or when the payload
// cannot be decompressed or unmarshalled. Messages too short to carry a size
// field, or with an empty payload, are not errors: the partially filled
// response is returned.
func ParseResponse(msg []byte) (*ServerResponse, error) {
	if len(msg) < 4 {
		return nil, fmt.Errorf("message too short: %d bytes", len(msg))
	}

	// The size nibble can claim up to 60 bytes. Reject values smaller than
	// the fixed header or running past the message instead of slicing out of
	// range (which would panic) or re-reading header bytes as payload.
	headerSize := int(msg[0]&0x0F) * 4
	if headerSize < 4 || headerSize > len(msg) {
		return nil, fmt.Errorf("invalid header size: %d bytes in a %d-byte message", headerSize, len(msg))
	}

	msgType := MessageType(msg[1] >> 4)
	flags := MessageFlags(msg[1] & 0x0F)
	compression := msg[2] & 0x0F
	payload := msg[headerSize:]

	resp := &ServerResponse{
		IsLast: flags == FlagNegSeq || flags == FlagLastNoSeq,
	}

	// Skip the sequence number when flag bit 0 is set.
	if flags&0x01 != 0 && len(payload) >= 4 {
		payload = payload[4:]
	}
	// Skip the event field when flag bit 2 is set.
	if flags&0x04 != 0 && len(payload) >= 4 {
		payload = payload[4:]
	}

	switch msgType {
	case MsgFullServerResponse:
		if len(payload) < 4 {
			return resp, nil
		}
		size := binary.BigEndian.Uint32(payload[:4])
		payload = payload[4:]
		if size == 0 || len(payload) == 0 {
			return resp, nil
		}

		body, err := sizedPayload(payload, size, compression)
		if err != nil {
			return nil, fmt.Errorf("decompress response: %w", err)
		}
		var rp responsePayload
		if err := json.Unmarshal(body, &rp); err != nil {
			return nil, fmt.Errorf("unmarshal response: %w", err)
		}
		resp.Text = rp.Result.Text
		resp.Utterances = rp.Result.Utterances

	case MsgServerError:
		if len(payload) < 8 {
			return resp, nil
		}
		resp.Code = int(binary.BigEndian.Uint32(payload[:4]))
		size := binary.BigEndian.Uint32(payload[4:8])
		payload = payload[8:]
		if size > 0 && len(payload) > 0 {
			body, err := sizedPayload(payload, size, compression)
			if err != nil {
				return nil, fmt.Errorf("decompress error: %w", err)
			}
			resp.ErrMsg = string(body)
		}
	}

	return resp, nil
}

// sizedPayload limits body to its declared size and gunzips it when
// compression is CompGzip. A declared size larger than the bytes actually
// present is tolerated: whatever is available is used.
func sizedPayload(body []byte, size uint32, compression uint8) ([]byte, error) {
	if uint64(size) < uint64(len(body)) {
		// Drop bytes beyond the declared payload so they cannot corrupt
		// JSON or gzip decoding.
		body = body[:size]
	}
	if compression != CompGzip {
		return body, nil
	}
	return gzipDecompress(body)
}