Files
go-telegram-bot/elevenlabs.go
T
HugeFrog24 265f6676d8 Design
2026-03-05 08:41:48 +01:00

116 lines
3.7 KiB
Go

package main
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"mime/multipart"
"net/http"
tgbot "github.com/go-telegram/bot"
)
const (
elevenLabsTTSURL = "https://api.elevenlabs.io/v1/text-to-speech/"
elevenLabsSTTURL = "https://api.elevenlabs.io/v1/speech-to-text"
elevenLabsDefaultModel = "eleven_multilingual_v2"
)
// generateSpeech converts text to an mp3 audio stream via ElevenLabs TTS.
func (b *Bot) generateSpeech(ctx context.Context, text string) (io.Reader, error) {
model := b.config.ElevenLabsModel
if model == "" {
model = elevenLabsDefaultModel
}
body, err := json.Marshal(map[string]string{
"text": text,
"model_id": model,
})
if err != nil {
return nil, fmt.Errorf("elevenlabs TTS marshal error: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost,
elevenLabsTTSURL+b.config.ElevenLabsVoiceID, bytes.NewReader(body))
if err != nil {
return nil, fmt.Errorf("elevenlabs TTS request error: %w", err)
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("xi-api-key", b.config.ElevenLabsAPIKey)
resp, err := http.DefaultClient.Do(req)
if err != nil {
return nil, fmt.Errorf("elevenlabs TTS error: %w", err)
}
if resp.StatusCode != http.StatusOK {
defer resp.Body.Close()
errBody, _ := io.ReadAll(resp.Body)
return nil, fmt.Errorf("elevenlabs TTS error: status %d: %s", resp.StatusCode, errBody)
}
return resp.Body, nil
}
// transcribeVoice downloads a Telegram voice file and transcribes it via ElevenLabs STT.
// Uses a direct multipart HTTP call instead of the SDK wrapper to avoid a bug in the
// ogen-generated encoder: AdditionalFormats (nil slice) is always written as an empty
// string with Content-Type: application/json, which ElevenLabs rejects with 400.
func (b *Bot) transcribeVoice(ctx context.Context, fileID string) (string, error) {
// 1. Resolve and download the voice file from Telegram.
fileInfo, err := b.tgBot.GetFile(ctx, &tgbot.GetFileParams{FileID: fileID})
if err != nil {
return "", fmt.Errorf("telegram GetFile error: %w", err)
}
downloadURL := b.tgBot.FileDownloadLink(fileInfo)
audioResp, err := http.Get(downloadURL) //nolint:noctx
if err != nil {
return "", fmt.Errorf("voice download error: %w", err)
}
defer audioResp.Body.Close()
// 2. Build multipart body with binary audio — bypasses SDK encoding issues.
var buf bytes.Buffer
mw := multipart.NewWriter(&buf)
if err := mw.WriteField("model_id", "scribe_v1"); err != nil {
return "", fmt.Errorf("multipart write error: %w", err)
}
part, err := mw.CreateFormFile("file", "audio.ogg")
if err != nil {
return "", fmt.Errorf("multipart create file error: %w", err)
}
if _, err := io.Copy(part, audioResp.Body); err != nil {
return "", fmt.Errorf("multipart copy error: %w", err)
}
if err := mw.Close(); err != nil {
return "", fmt.Errorf("multipart close error: %w", err)
}
// 3. POST to ElevenLabs STT.
req, err := http.NewRequestWithContext(ctx, http.MethodPost,
elevenLabsSTTURL, &buf)
if err != nil {
return "", fmt.Errorf("create STT request error: %w", err)
}
req.Header.Set("Content-Type", mw.FormDataContentType())
req.Header.Set("xi-api-key", b.config.ElevenLabsAPIKey)
sttResp, err := http.DefaultClient.Do(req)
if err != nil {
return "", fmt.Errorf("elevenlabs STT request error: %w", err)
}
defer sttResp.Body.Close()
if sttResp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(sttResp.Body)
return "", fmt.Errorf("elevenlabs STT error: status %d: %s", sttResp.StatusCode, body)
}
var result struct {
Text string `json:"text"`
}
if err := json.NewDecoder(sttResp.Body).Decode(&result); err != nil {
return "", fmt.Errorf("elevenlabs STT decode error: %w", err)
}
return result.Text, nil
}