129 lines
3.5 KiB
Go
129 lines
3.5 KiB
Go
package speech
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"mime/multipart"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/paramah/gw_telegram/internal/domain/port"
|
|
)
|
|
|
|
// whisperAPIURL is the OpenAI audio transcription endpoint that Transcribe
// sends its POST requests to.
const whisperAPIURL = "https://api.openai.com/v1/audio/transcriptions"
|
|
|
|
// OpenAIWhisper bundles the credentials, request settings and HTTP client
// used to call the Whisper transcription endpoint.
type OpenAIWhisper struct {
	// apiKey is sent as the bearer token; model and language become the
	// "model" and (only when non-empty) "language" form fields of a request.
	apiKey, model, language string

	// client is the HTTP client every transcription request goes through.
	client *http.Client
}
|
|
|
|
func NewOpenAIWhisper(apiKey, model, language string) *OpenAIWhisper {
|
|
return &OpenAIWhisper{
|
|
apiKey: apiKey,
|
|
model: model,
|
|
language: language,
|
|
client: &http.Client{Timeout: 60 * time.Second},
|
|
}
|
|
}
|
|
|
|
// whisperVerboseResponse is returned by Whisper API when response_format=verbose_json.
|
|
type whisperVerboseResponse struct {
|
|
Text string `json:"text"`
|
|
Segments []whisperSegment `json:"segments"`
|
|
}
|
|
|
|
// whisperSegment is one timed piece of a verbose Whisper transcription.
type whisperSegment struct {
	// Start and End delimit the segment; buildVTT formats both as offsets in
	// seconds.
	Start float64 `json:"start"`
	End   float64 `json:"end"`
	// Text is the segment's transcript, decoded from the "text" key.
	Text string `json:"text"`
}
|
|
|
|
func (w *OpenAIWhisper) Transcribe(ctx context.Context, audioData []byte, mimeType string) (port.TranscriptionResult, error) {
|
|
var buf bytes.Buffer
|
|
mw := multipart.NewWriter(&buf)
|
|
|
|
fw, err := mw.CreateFormFile("file", "audio.wav")
|
|
if err != nil {
|
|
return port.TranscriptionResult{}, fmt.Errorf("create form file: %w", err)
|
|
}
|
|
if _, err := fw.Write(audioData); err != nil {
|
|
return port.TranscriptionResult{}, fmt.Errorf("write audio data: %w", err)
|
|
}
|
|
if err := mw.WriteField("model", w.model); err != nil {
|
|
return port.TranscriptionResult{}, fmt.Errorf("write model field: %w", err)
|
|
}
|
|
if err := mw.WriteField("response_format", "verbose_json"); err != nil {
|
|
return port.TranscriptionResult{}, fmt.Errorf("write response_format: %w", err)
|
|
}
|
|
if w.language != "" {
|
|
if err := mw.WriteField("language", w.language); err != nil {
|
|
return port.TranscriptionResult{}, fmt.Errorf("write language field: %w", err)
|
|
}
|
|
}
|
|
mw.Close()
|
|
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodPost, whisperAPIURL, &buf)
|
|
if err != nil {
|
|
return port.TranscriptionResult{}, fmt.Errorf("create whisper request: %w", err)
|
|
}
|
|
req.Header.Set("Authorization", "Bearer "+w.apiKey)
|
|
req.Header.Set("Content-Type", mw.FormDataContentType())
|
|
|
|
resp, err := w.client.Do(req)
|
|
if err != nil {
|
|
return port.TranscriptionResult{}, fmt.Errorf("whisper API call: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return port.TranscriptionResult{}, fmt.Errorf("read whisper response: %w", err)
|
|
}
|
|
if resp.StatusCode != http.StatusOK {
|
|
return port.TranscriptionResult{}, fmt.Errorf("whisper API error %d: %s", resp.StatusCode, string(body))
|
|
}
|
|
|
|
var result whisperVerboseResponse
|
|
if err := json.Unmarshal(body, &result); err != nil {
|
|
return port.TranscriptionResult{}, fmt.Errorf("parse whisper response: %w", err)
|
|
}
|
|
|
|
return port.TranscriptionResult{
|
|
Text: result.Text,
|
|
VTT: buildVTT(result.Segments),
|
|
}, nil
|
|
}
|
|
|
|
// buildVTT generates WebVTT content from Whisper segments.
|
|
func buildVTT(segments []whisperSegment) string {
|
|
if len(segments) == 0 {
|
|
return ""
|
|
}
|
|
var sb strings.Builder
|
|
sb.WriteString("WEBVTT\n\n")
|
|
for _, seg := range segments {
|
|
sb.WriteString(fmt.Sprintf("%s --> %s\n%s\n\n",
|
|
formatVTTTime(seg.Start),
|
|
formatVTTTime(seg.End),
|
|
strings.TrimSpace(seg.Text),
|
|
))
|
|
}
|
|
return sb.String()
|
|
}
|
|
|
|
// formatVTTTime converts an offset in seconds to a WebVTT timestamp of the
// form HH:MM:SS.mmm.
//
// The offset is rounded to the nearest millisecond rather than truncated, so
// values that are not exactly representable as a float64 (for example 1.001)
// do not lose their last millisecond. Negative and NaN offsets cannot be
// expressed as a timestamp and are rendered as "00:00:00.000". The hours
// field grows beyond two digits when the offset reaches 100 hours.
func formatVTTTime(seconds float64) string {
	// Written as a negated comparison so that NaN is caught as well.
	if !(seconds > 0) {
		return "00:00:00.000"
	}

	// Adding 0.5 before the truncating conversion rounds to the nearest
	// millisecond; this is only valid because seconds is positive here.
	totalMs := int64(seconds*1000 + 0.5)
	totalSec := totalMs / 1000

	return fmt.Sprintf("%02d:%02d:%02d.%03d",
		totalSec/3600,
		totalSec/60%60,
		totalSec%60,
		totalMs%1000,
	)
}
|