package speech import ( "bytes" "context" "encoding/json" "fmt" "io" "mime/multipart" "net/http" "strings" "time" "github.com/paramah/gw_telegram/internal/domain/port" ) const whisperAPIURL = "https://api.openai.com/v1/audio/transcriptions" type OpenAIWhisper struct { apiKey string model string language string client *http.Client } func NewOpenAIWhisper(apiKey, model, language string) *OpenAIWhisper { return &OpenAIWhisper{ apiKey: apiKey, model: model, language: language, client: &http.Client{Timeout: 60 * time.Second}, } } // whisperVerboseResponse is returned by Whisper API when response_format=verbose_json. type whisperVerboseResponse struct { Text string `json:"text"` Segments []whisperSegment `json:"segments"` } type whisperSegment struct { Start float64 `json:"start"` End float64 `json:"end"` Text string `json:"text"` } func (w *OpenAIWhisper) Transcribe(ctx context.Context, audioData []byte, mimeType string) (port.TranscriptionResult, error) { var buf bytes.Buffer mw := multipart.NewWriter(&buf) fw, err := mw.CreateFormFile("file", "audio.wav") if err != nil { return port.TranscriptionResult{}, fmt.Errorf("create form file: %w", err) } if _, err := fw.Write(audioData); err != nil { return port.TranscriptionResult{}, fmt.Errorf("write audio data: %w", err) } if err := mw.WriteField("model", w.model); err != nil { return port.TranscriptionResult{}, fmt.Errorf("write model field: %w", err) } if err := mw.WriteField("response_format", "verbose_json"); err != nil { return port.TranscriptionResult{}, fmt.Errorf("write response_format: %w", err) } if w.language != "" { if err := mw.WriteField("language", w.language); err != nil { return port.TranscriptionResult{}, fmt.Errorf("write language field: %w", err) } } mw.Close() req, err := http.NewRequestWithContext(ctx, http.MethodPost, whisperAPIURL, &buf) if err != nil { return port.TranscriptionResult{}, fmt.Errorf("create whisper request: %w", err) } req.Header.Set("Authorization", "Bearer "+w.apiKey) req.Header.Set("Content-Type", mw.FormDataContentType()) resp, err := w.client.Do(req) if err != nil { return port.TranscriptionResult{}, fmt.Errorf("whisper API call: %w", err) } defer resp.Body.Close() body, err := io.ReadAll(resp.Body) if err != nil { return port.TranscriptionResult{}, fmt.Errorf("read whisper response: %w", err) } if resp.StatusCode != http.StatusOK { return port.TranscriptionResult{}, fmt.Errorf("whisper API error %d: %s", resp.StatusCode, string(body)) } var result whisperVerboseResponse if err := json.Unmarshal(body, &result); err != nil { return port.TranscriptionResult{}, fmt.Errorf("parse whisper response: %w", err) } return port.TranscriptionResult{ Text: result.Text, VTT: buildVTT(result.Segments), }, nil } // buildVTT generates WebVTT content from Whisper segments. func buildVTT(segments []whisperSegment) string { if len(segments) == 0 { return "" } var sb strings.Builder sb.WriteString("WEBVTT\n\n") for _, seg := range segments { sb.WriteString(fmt.Sprintf("%s --> %s\n%s\n\n", formatVTTTime(seg.Start), formatVTTTime(seg.End), strings.TrimSpace(seg.Text), )) } return sb.String() } // formatVTTTime converts seconds to WebVTT timestamp (HH:MM:SS.mmm). func formatVTTTime(seconds float64) string { d := time.Duration(seconds * float64(time.Second)) h := int(d.Hours()) m := int(d.Minutes()) % 60 s := int(d.Seconds()) % 60 ms := int(d.Milliseconds()) % 1000 return fmt.Sprintf("%02d:%02d:%02d.%03d", h, m, s, ms) }