From 516ceb58824a9595bc86be31be5ea20deda2f7fc Mon Sep 17 00:00:00 2001 From: Aleksander Cynarski Date: Thu, 16 Apr 2026 19:50:38 +0200 Subject: [PATCH] feat: persist voice samples to disk, generate VTT, cleanup after analysis --- .env.example | 1 + cmd/bot/main.go | 7 +- .../usecase/handle_voice_message.go | 75 ++++++++++++--- .../usecase/handle_voice_message_test.go | 94 +++++++++++++------ internal/config/config.go | 11 ++- internal/domain/entity/voice_sample.go | 17 ++++ internal/domain/port/speech_transcriber.go | 10 +- internal/domain/port/voice_file_store.go | 12 +++ .../infrastructure/speech/audio_converter.go | 41 ++++++-- .../speech/local_voice_file_store.go | 57 +++++++++++ .../infrastructure/speech/openai_whisper.go | 75 +++++++++++---- test/testutil/fake_speech_transcriber.go | 16 ++-- test/testutil/fake_voice_file_store.go | 33 +++++++ 13 files changed, 366 insertions(+), 83 deletions(-) create mode 100644 internal/domain/entity/voice_sample.go create mode 100644 internal/domain/port/voice_file_store.go create mode 100644 internal/infrastructure/speech/local_voice_file_store.go create mode 100644 test/testutil/fake_voice_file_store.go diff --git a/.env.example b/.env.example index 4d77b9c..775fc38 100644 --- a/.env.example +++ b/.env.example @@ -9,6 +9,7 @@ OPENAI_API_KEY=your_openai_key_here STT_PROVIDER=openai WHISPER_MODEL=whisper-1 WHISPER_LANGUAGE= +VOICE_STORE_PATH=/tmp/gw_telegram/voice REDIS_URL=redis://localhost:6379 SESSION_TTL=24 diff --git a/cmd/bot/main.go b/cmd/bot/main.go index 78b28cf..f1c5a62 100644 --- a/cmd/bot/main.go +++ b/cmd/bot/main.go @@ -87,10 +87,15 @@ func main() { // Speech transcriber := speech.NewOpenAIWhisper(cfg.Speech.OpenAIKey, cfg.Speech.WhisperModel, cfg.Speech.Language) converter := speech.NewFFmpegConverter(cfg.Speech.FFmpegPath, "") + voiceFileStore, err := speech.NewLocalVoiceFileStore(cfg.Speech.VoiceStorePath) + if err != nil { + logger.Error("failed to initialise voice file store", "error", err) + os.Exit(1) + } // Use cases textUC := usecase.NewHandleTextMessage(intentRouter, dispatcher, sessionStore, gateway, logger) - voiceUC := usecase.NewHandleVoiceMessage(downloader, converter, transcriber, textUC, gateway, logger) + voiceUC := usecase.NewHandleVoiceMessage(downloader, converter, transcriber, voiceFileStore, textUC, gateway, logger) // Handler + poller handler := interfaces.NewTelegramHandler(textUC, voiceUC, logger) diff --git a/internal/application/usecase/handle_voice_message.go b/internal/application/usecase/handle_voice_message.go index 9144f34..533f731 100644 --- a/internal/application/usecase/handle_voice_message.go +++ b/internal/application/usecase/handle_voice_message.go @@ -4,8 +4,11 @@ import ( "context" "fmt" "log/slog" + "time" + "github.com/google/uuid" "github.com/paramah/gw_telegram/internal/application/dto" + "github.com/paramah/gw_telegram/internal/domain/entity" "github.com/paramah/gw_telegram/internal/domain/port" ) @@ -13,12 +16,13 @@ type HandleVoiceMessage struct { downloader port.FileDownloader converter AudioConverter transcriber port.SpeechTranscriber + fileStore port.VoiceFileStore textHandler *HandleTextMessage gateway port.MessageGateway logger *slog.Logger } -// AudioConverter converts audio between formats (OGG -> WAV etc.) +// AudioConverter converts audio between formats (OGG → WAV etc.) type AudioConverter interface { Convert(ctx context.Context, input []byte, fromMime, toMime string) ([]byte, error) } @@ -27,6 +31,7 @@ func NewHandleVoiceMessage( downloader port.FileDownloader, converter AudioConverter, transcriber port.SpeechTranscriber, + fileStore port.VoiceFileStore, textHandler *HandleTextMessage, gateway port.MessageGateway, logger *slog.Logger, @@ -35,6 +40,7 @@ func NewHandleVoiceMessage( downloader: downloader, converter: converter, transcriber: transcriber, + fileStore: fileStore, textHandler: textHandler, gateway: gateway, logger: logger, @@ -44,6 +50,15 @@ func NewHandleVoiceMessage( func (h *HandleVoiceMessage) Execute(ctx context.Context, in dto.IncomingMessageDTO) error { _ = h.gateway.SendTyping(ctx, in.ChatID) + sample := entity.VoiceSample{ + ID: uuid.New().String(), + UserID: in.UserID, + ChatID: in.ChatID, + FileIDTg: in.VoiceFileID, + CreatedAt: time.Now(), + } + + // Step 1: download OGG from Telegram and persist to disk audioBytes, mimeType, err := h.downloader.Download(ctx, in.VoiceFileID) if err != nil { h.logger.ErrorContext(ctx, "voice download failed", "error", err, "file_id", in.VoiceFileID) @@ -51,30 +66,68 @@ func (h *HandleVoiceMessage) Execute(ctx context.Context, in dto.IncomingMessage return fmt.Errorf("handle voice: download: %w", err) } - // Convert OGG Opus (Telegram default) to WAV for Whisper + sample.OGGPath, err = h.fileStore.SaveRaw(ctx, sample.ID, audioBytes) + if err != nil { + h.logger.WarnContext(ctx, "failed to persist raw audio", "error", err, "sample_id", sample.ID) + } + + // Step 2: convert OGG → WAV and persist to disk if mimeType == "audio/ogg" || mimeType == "" { - wavBytes, err := h.converter.Convert(ctx, audioBytes, "audio/ogg", "audio/wav") - if err != nil { - h.logger.WarnContext(ctx, "audio conversion failed, trying raw", "error", err) - // fall through with original bytes + wavBytes, convErr := h.converter.Convert(ctx, audioBytes, "audio/ogg", "audio/wav") + if convErr != nil { + h.logger.WarnContext(ctx, "audio conversion failed, proceeding with raw audio", "error", convErr) } else { audioBytes = wavBytes mimeType = "audio/wav" + + sample.WAVPath, err = h.fileStore.SaveConverted(ctx, sample.ID, wavBytes) + if err != nil { + h.logger.WarnContext(ctx, "failed to persist converted audio", "error", err, "sample_id", sample.ID) + } } } - transcript, err := h.transcriber.Transcribe(ctx, audioBytes, mimeType) + // Step 3: transcribe — receive both text and VTT + result, err := h.transcriber.Transcribe(ctx, audioBytes, mimeType) if err != nil { - h.logger.ErrorContext(ctx, "transcription failed", "error", err) + h.logger.ErrorContext(ctx, "transcription failed", "error", err, "sample_id", sample.ID) _ = h.gateway.SendText(ctx, in.ChatID, "Sorry, I couldn't understand the voice message. Please try sending text instead.") + _ = h.fileStore.Cleanup(ctx, sample.ID) return fmt.Errorf("handle voice: transcribe: %w", err) } - h.logger.InfoContext(ctx, "voice transcribed", "user_id", in.UserID, "length", len(transcript)) + sample.Transcript = result.Text + // Step 4: persist VTT file + if result.VTT != "" { + sample.VTTPath, err = h.fileStore.SaveVTT(ctx, sample.ID, result.VTT) + if err != nil { + h.logger.WarnContext(ctx, "failed to persist vtt", "error", err, "sample_id", sample.ID) + } + } + + h.logger.InfoContext(ctx, "voice sample analysed", + "user_id", in.UserID, + "sample_id", sample.ID, + "ogg_path", sample.OGGPath, + "wav_path", sample.WAVPath, + "vtt_path", sample.VTTPath, + "transcript_len", len(result.Text), + ) + + // Step 5: route transcript as a regular text message textIn := in - textIn.Text = transcript + textIn.Text = result.Text textIn.IsVoice = false - return h.textHandler.Execute(ctx, textIn) + if err := h.textHandler.Execute(ctx, textIn); err != nil { + return err + } + + // Step 6: cleanup only after successful analysis and dispatch + if cleanErr := h.fileStore.Cleanup(ctx, sample.ID); cleanErr != nil { + h.logger.WarnContext(ctx, "failed to cleanup voice files", "error", cleanErr, "sample_id", sample.ID) + } + + return nil } diff --git a/internal/application/usecase/handle_voice_message_test.go b/internal/application/usecase/handle_voice_message_test.go index 13cf1d2..dd5349c 100644 --- a/internal/application/usecase/handle_voice_message_test.go +++ b/internal/application/usecase/handle_voice_message_test.go @@ -10,6 +10,7 @@ import ( "github.com/paramah/gw_telegram/internal/application/dto" "github.com/paramah/gw_telegram/internal/application/usecase" "github.com/paramah/gw_telegram/internal/domain/entity" + "github.com/paramah/gw_telegram/internal/domain/port" "github.com/paramah/gw_telegram/test/testutil" ) @@ -24,25 +25,44 @@ func validVoiceDTO() dto.IncomingMessageDTO { } } +func newVoiceUC( + downloader *testutil.FakeFileDownloader, + transcriber *testutil.FakeSpeechTranscriber, + fileStore *testutil.FakeVoiceFileStore, + router *testutil.FakeIntentRouter, + dispatcher *testutil.FakeWorkflowDispatcher, + gateway *testutil.FakeMessageGateway, +) *usecase.HandleVoiceMessage { + textUC := usecase.NewHandleTextMessage(router, dispatcher, &testutil.FakeSessionStore{}, gateway, newTestLogger()) + return usecase.NewHandleVoiceMessage( + downloader, + &testutil.FakeAudioConverter{}, + transcriber, + fileStore, + textUC, + gateway, + newTestLogger(), + ) +} + func TestHandleVoiceMessage_Execute_HappyPath(t *testing.T) { - downloader := &testutil.FakeFileDownloader{ - Data: []byte("fake-ogg-audio"), - MimeType: "audio/ogg", + downloader := &testutil.FakeFileDownloader{Data: []byte("fake-ogg"), MimeType: "audio/ogg"} + transcriber := &testutil.FakeSpeechTranscriber{ + Result: port.TranscriptionResult{ + Text: "I need help with my order", + VTT: "WEBVTT\n\n00:00:00.000 --> 00:00:02.000\nI need help with my order\n\n", + }, } - converter := &testutil.FakeAudioConverter{Output: []byte("fake-wav-audio")} - transcriber := &testutil.FakeSpeechTranscriber{Transcript: "I need help with my order"} - router := &testutil.FakeIntentRouter{ - RouteResult: entity.Route{IntentName: "order_inquiry"}, + fileStore := &testutil.FakeVoiceFileStore{ + SaveRawPath: "/tmp/abc.ogg", + SaveConvertedPath: "/tmp/abc.wav", + SaveVTTPath: "/tmp/abc.vtt", } - dispatcher := &testutil.FakeWorkflowDispatcher{ - Response: entity.WorkflowResponse{ReplyText: "Order status: shipped"}, - } - sessions := &testutil.FakeSessionStore{} + router := &testutil.FakeIntentRouter{RouteResult: entity.Route{IntentName: "order_inquiry"}} + dispatcher := &testutil.FakeWorkflowDispatcher{Response: entity.WorkflowResponse{ReplyText: "Order status: shipped"}} gateway := &testutil.FakeMessageGateway{} - textUC := usecase.NewHandleTextMessage(router, dispatcher, sessions, gateway, newTestLogger()) - uc := usecase.NewHandleVoiceMessage(downloader, converter, transcriber, textUC, gateway, newTestLogger()) - + uc := newVoiceUC(downloader, transcriber, fileStore, router, dispatcher, gateway) err := uc.Execute(context.Background(), validVoiceDTO()) require.NoError(t, err) @@ -50,50 +70,64 @@ func TestHandleVoiceMessage_Execute_HappyPath(t *testing.T) { assert.Equal(t, 1, dispatcher.CallCount) assert.Equal(t, "I need help with my order", dispatcher.LastRequest.MessageText) assert.Equal(t, "Order status: shipped", gateway.LastSentText) + assert.Equal(t, 1, fileStore.CleanupCallCount) } func TestHandleVoiceMessage_Execute_DownloadFails(t *testing.T) { - downloader := &testutil.FakeFileDownloader{Error: errors.New("download error")} + downloader := &testutil.FakeFileDownloader{Error: errors.New("network error")} transcriber := &testutil.FakeSpeechTranscriber{} + fileStore := &testutil.FakeVoiceFileStore{} gateway := &testutil.FakeMessageGateway{} - textUC := usecase.NewHandleTextMessage( + uc := newVoiceUC(downloader, transcriber, fileStore, &testutil.FakeIntentRouter{}, &testutil.FakeWorkflowDispatcher{}, - &testutil.FakeSessionStore{}, gateway, - newTestLogger(), ) - uc := usecase.NewHandleVoiceMessage(downloader, &testutil.FakeAudioConverter{}, transcriber, textUC, gateway, newTestLogger()) - err := uc.Execute(context.Background(), validVoiceDTO()) assert.Error(t, err) assert.Equal(t, 0, transcriber.CallCount) - assert.NotEmpty(t, gateway.LastSentText) // error message sent + assert.NotEmpty(t, gateway.LastSentText) + assert.Equal(t, 0, fileStore.CleanupCallCount) } func TestHandleVoiceMessage_Execute_TranscriptionFails(t *testing.T) { - downloader := &testutil.FakeFileDownloader{ - Data: []byte("audio"), - MimeType: "audio/ogg", - } + downloader := &testutil.FakeFileDownloader{Data: []byte("audio"), MimeType: "audio/ogg"} transcriber := &testutil.FakeSpeechTranscriber{Error: errors.New("whisper error")} + fileStore := &testutil.FakeVoiceFileStore{} gateway := &testutil.FakeMessageGateway{} dispatcher := &testutil.FakeWorkflowDispatcher{} - textUC := usecase.NewHandleTextMessage( + uc := newVoiceUC(downloader, transcriber, fileStore, &testutil.FakeIntentRouter{}, dispatcher, - &testutil.FakeSessionStore{}, gateway, - newTestLogger(), ) - uc := usecase.NewHandleVoiceMessage(downloader, &testutil.FakeAudioConverter{}, transcriber, textUC, gateway, newTestLogger()) - err := uc.Execute(context.Background(), validVoiceDTO()) assert.Error(t, err) assert.Equal(t, 0, dispatcher.CallCount) assert.NotEmpty(t, gateway.LastSentText) + assert.Equal(t, 1, fileStore.CleanupCallCount) +} + +func TestHandleVoiceMessage_Execute_NoVTTWhenEmpty(t *testing.T) { + downloader := &testutil.FakeFileDownloader{Data: []byte("audio"), MimeType: "audio/ogg"} + transcriber := &testutil.FakeSpeechTranscriber{ + Result: port.TranscriptionResult{Text: "Hello", VTT: ""}, + } + fileStore := &testutil.FakeVoiceFileStore{} + gateway := &testutil.FakeMessageGateway{} + + uc := newVoiceUC(downloader, transcriber, fileStore, + &testutil.FakeIntentRouter{RouteResult: entity.Route{IntentName: "general_query"}}, + &testutil.FakeWorkflowDispatcher{Response: entity.WorkflowResponse{ReplyText: "OK"}}, + gateway, + ) + err := uc.Execute(context.Background(), validVoiceDTO()) + + require.NoError(t, err) + // SaveVTT should NOT have been called — VTTPath stays empty + assert.Empty(t, fileStore.SaveVTTPath) } diff --git a/internal/config/config.go b/internal/config/config.go index 1ba4151..60932d2 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -30,11 +30,12 @@ type N8nConfig struct { } type SpeechConfig struct { - Provider string `envconfig:"STT_PROVIDER" default:"openai"` - OpenAIKey string `envconfig:"OPENAI_API_KEY"` - WhisperModel string `envconfig:"WHISPER_MODEL" default:"whisper-1"` - Language string `envconfig:"WHISPER_LANGUAGE" default:""` - FFmpegPath string `envconfig:"FFMPEG_PATH" default:"ffmpeg"` + Provider string `envconfig:"STT_PROVIDER" default:"openai"` + OpenAIKey string `envconfig:"OPENAI_API_KEY"` + WhisperModel string `envconfig:"WHISPER_MODEL" default:"whisper-1"` + Language string `envconfig:"WHISPER_LANGUAGE" default:""` + FFmpegPath string `envconfig:"FFMPEG_PATH" default:"ffmpeg"` + VoiceStorePath string `envconfig:"VOICE_STORE_PATH" default:"/tmp/gw_telegram/voice"` } type RedisConfig struct { diff --git a/internal/domain/entity/voice_sample.go b/internal/domain/entity/voice_sample.go new file mode 100644 index 0000000..e06cd3c --- /dev/null +++ b/internal/domain/entity/voice_sample.go @@ -0,0 +1,17 @@ +package entity + +import "time" + +// VoiceSample tracks all files produced during voice message processing. +// Files are kept on disk until Cleanup() is called after successful analysis. +type VoiceSample struct { + ID string + UserID int64 + ChatID int64 + FileIDTg string // original Telegram file ID + OGGPath string // downloaded raw audio + WAVPath string // ffmpeg-converted audio + VTTPath string // WebVTT subtitle file generated by Whisper + Transcript string + CreatedAt time.Time +} diff --git a/internal/domain/port/speech_transcriber.go b/internal/domain/port/speech_transcriber.go index a828b9a..a09c153 100644 --- a/internal/domain/port/speech_transcriber.go +++ b/internal/domain/port/speech_transcriber.go @@ -2,6 +2,12 @@ package port import "context" -type SpeechTranscriber interface { - Transcribe(ctx context.Context, audioData []byte, mimeType string) (string, error) +// TranscriptionResult holds both the plain-text transcript and the WebVTT subtitle content. +type TranscriptionResult struct { + Text string + VTT string // WebVTT format, empty if not available +} + +type SpeechTranscriber interface { + Transcribe(ctx context.Context, audioData []byte, mimeType string) (TranscriptionResult, error) } diff --git a/internal/domain/port/voice_file_store.go b/internal/domain/port/voice_file_store.go new file mode 100644 index 0000000..e4d1887 --- /dev/null +++ b/internal/domain/port/voice_file_store.go @@ -0,0 +1,12 @@ +package port + +import "context" + +// VoiceFileStore manages lifecycle of voice sample files on disk. +// Files are persisted until Cleanup is called after successful analysis. +type VoiceFileStore interface { + SaveRaw(ctx context.Context, sampleID string, data []byte) (path string, err error) + SaveConverted(ctx context.Context, sampleID string, data []byte) (path string, err error) + SaveVTT(ctx context.Context, sampleID string, content string) (path string, err error) + Cleanup(ctx context.Context, sampleID string) error +} diff --git a/internal/infrastructure/speech/audio_converter.go b/internal/infrastructure/speech/audio_converter.go index 50f09da..1a3b6fa 100644 --- a/internal/infrastructure/speech/audio_converter.go +++ b/internal/infrastructure/speech/audio_converter.go @@ -26,6 +26,9 @@ func NewFFmpegConverter(ffmpegPath, tempDir string) *FFmpegConverter { return &FFmpegConverter{ffmpegPath: ffmpegPath, tempDir: tempDir} } +// Convert converts audio bytes between formats and returns the converted bytes. +// Intermediate temp files are cleaned up automatically — use ConvertFile when +// you need to keep the output on disk. func (c *FFmpegConverter) Convert(ctx context.Context, input []byte, fromMime, toMime string) ([]byte, error) { id := uuid.New().String() inFile := filepath.Join(c.tempDir, id+".input") @@ -38,6 +41,32 @@ func (c *FFmpegConverter) Convert(ctx context.Context, input []byte, fromMime, t return nil, fmt.Errorf("write temp input: %w", err) } + if err := c.runFFmpeg(ctx, inFile, outFile); err != nil { + return nil, err + } + + out, err := os.ReadFile(outFile) + if err != nil { + return nil, fmt.Errorf("read converted file: %w", err) + } + return out, nil +} + +// ConvertFile converts audio bytes and writes the WAV output to outPath. +// The caller is responsible for deleting outPath when no longer needed. +func (c *FFmpegConverter) ConvertFile(ctx context.Context, input []byte, outPath string) error { + id := uuid.New().String() + inFile := filepath.Join(c.tempDir, id+".input") + + defer os.Remove(inFile) + + if err := os.WriteFile(inFile, input, 0600); err != nil { + return fmt.Errorf("write temp input: %w", err) + } + return c.runFFmpeg(ctx, inFile, outPath) +} + +func (c *FFmpegConverter) runFFmpeg(ctx context.Context, inFile, outFile string) error { cmd := exec.CommandContext(ctx, c.ffmpegPath, "-i", inFile, "-ar", "16000", @@ -46,18 +75,10 @@ func (c *FFmpegConverter) Convert(ctx context.Context, input []byte, fromMime, t "-y", outFile, ) - var stderr bytes.Buffer cmd.Stderr = &stderr - if err := cmd.Run(); err != nil { - return nil, fmt.Errorf("ffmpeg conversion: %w: %s", err, stderr.String()) + return fmt.Errorf("ffmpeg conversion: %w: %s", err, stderr.String()) } - - out, err := os.ReadFile(outFile) - if err != nil { - return nil, fmt.Errorf("read converted file: %w", err) - } - - return out, nil + return nil } diff --git a/internal/infrastructure/speech/local_voice_file_store.go b/internal/infrastructure/speech/local_voice_file_store.go new file mode 100644 index 0000000..fb2a0d7 --- /dev/null +++ b/internal/infrastructure/speech/local_voice_file_store.go @@ -0,0 +1,57 @@ +package speech + +import ( + "context" + "fmt" + "os" + "path/filepath" +) + +// LocalVoiceFileStore persists voice sample files on the local filesystem. +// All files for a sample share a common prefix: {storageDir}/{sampleID}.{ext} +type LocalVoiceFileStore struct { + storageDir string +} + +func NewLocalVoiceFileStore(storageDir string) (*LocalVoiceFileStore, error) { + if err := os.MkdirAll(storageDir, 0750); err != nil { + return nil, fmt.Errorf("create voice storage dir: %w", err) + } + return &LocalVoiceFileStore{storageDir: storageDir}, nil +} + +func (s *LocalVoiceFileStore) SaveRaw(_ context.Context, sampleID string, data []byte) (string, error) { + path := filepath.Join(s.storageDir, sampleID+".ogg") + if err := os.WriteFile(path, data, 0600); err != nil { + return "", fmt.Errorf("save raw audio: %w", err) + } + return path, nil +} + +func (s *LocalVoiceFileStore) SaveConverted(_ context.Context, sampleID string, data []byte) (string, error) { + path := filepath.Join(s.storageDir, sampleID+".wav") + if err := os.WriteFile(path, data, 0600); err != nil { + return "", fmt.Errorf("save converted audio: %w", err) + } + return path, nil +} + +func (s *LocalVoiceFileStore) SaveVTT(_ context.Context, sampleID string, content string) (string, error) { + path := filepath.Join(s.storageDir, sampleID+".vtt") + if err := os.WriteFile(path, []byte(content), 0600); err != nil { + return "", fmt.Errorf("save vtt: %w", err) + } + return path, nil +} + +// Cleanup removes all files associated with a sample ID (.ogg, .wav, .vtt). +// Missing files are silently ignored. +func (s *LocalVoiceFileStore) Cleanup(_ context.Context, sampleID string) error { + for _, ext := range []string{".ogg", ".wav", ".vtt"} { + path := filepath.Join(s.storageDir, sampleID+ext) + if err := os.Remove(path); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("cleanup %s: %w", ext, err) + } + } + return nil +} diff --git a/internal/infrastructure/speech/openai_whisper.go b/internal/infrastructure/speech/openai_whisper.go index 9807a1c..7ee9ee8 100644 --- a/internal/infrastructure/speech/openai_whisper.go +++ b/internal/infrastructure/speech/openai_whisper.go @@ -8,7 +8,10 @@ import ( "io" "mime/multipart" "net/http" + "strings" "time" + + "github.com/paramah/gw_telegram/internal/domain/port" ) const whisperAPIURL = "https://api.openai.com/v1/audio/transcriptions" @@ -29,61 +32,97 @@ func NewOpenAIWhisper(apiKey, model, language string) *OpenAIWhisper { } } -type whisperResponse struct { - Text string `json:"text"` +// whisperVerboseResponse is returned by Whisper API when response_format=verbose_json. +type whisperVerboseResponse struct { + Text string `json:"text"` + Segments []whisperSegment `json:"segments"` } -func (w *OpenAIWhisper) Transcribe(ctx context.Context, audioData []byte, mimeType string) (string, error) { +type whisperSegment struct { + Start float64 `json:"start"` + End float64 `json:"end"` + Text string `json:"text"` +} + +func (w *OpenAIWhisper) Transcribe(ctx context.Context, audioData []byte, mimeType string) (port.TranscriptionResult, error) { var buf bytes.Buffer mw := multipart.NewWriter(&buf) fw, err := mw.CreateFormFile("file", "audio.wav") if err != nil { - return "", fmt.Errorf("create form file: %w", err) + return port.TranscriptionResult{}, fmt.Errorf("create form file: %w", err) } if _, err := fw.Write(audioData); err != nil { - return "", fmt.Errorf("write audio data: %w", err) + return port.TranscriptionResult{}, fmt.Errorf("write audio data: %w", err) } - if err := mw.WriteField("model", w.model); err != nil { - return "", fmt.Errorf("write model field: %w", err) + return port.TranscriptionResult{}, fmt.Errorf("write model field: %w", err) } - if err := mw.WriteField("response_format", "json"); err != nil { - return "", fmt.Errorf("write response_format: %w", err) + if err := mw.WriteField("response_format", "verbose_json"); err != nil { + return port.TranscriptionResult{}, fmt.Errorf("write response_format: %w", err) } if w.language != "" { if err := mw.WriteField("language", w.language); err != nil { - return "", fmt.Errorf("write language field: %w", err) + return port.TranscriptionResult{}, fmt.Errorf("write language field: %w", err) } } mw.Close() req, err := http.NewRequestWithContext(ctx, http.MethodPost, whisperAPIURL, &buf) if err != nil { - return "", fmt.Errorf("create whisper request: %w", err) + return port.TranscriptionResult{}, fmt.Errorf("create whisper request: %w", err) } req.Header.Set("Authorization", "Bearer "+w.apiKey) req.Header.Set("Content-Type", mw.FormDataContentType()) resp, err := w.client.Do(req) if err != nil { - return "", fmt.Errorf("whisper API call: %w", err) + return port.TranscriptionResult{}, fmt.Errorf("whisper API call: %w", err) } defer resp.Body.Close() body, err := io.ReadAll(resp.Body) if err != nil { - return "", fmt.Errorf("read whisper response: %w", err) + return port.TranscriptionResult{}, fmt.Errorf("read whisper response: %w", err) } - if resp.StatusCode != http.StatusOK { - return "", fmt.Errorf("whisper API error %d: %s", resp.StatusCode, string(body)) + return port.TranscriptionResult{}, fmt.Errorf("whisper API error %d: %s", resp.StatusCode, string(body)) } - var result whisperResponse + var result whisperVerboseResponse if err := json.Unmarshal(body, &result); err != nil { - return "", fmt.Errorf("parse whisper response: %w", err) + return port.TranscriptionResult{}, fmt.Errorf("parse whisper response: %w", err) } - return result.Text, nil + return port.TranscriptionResult{ + Text: result.Text, + VTT: buildVTT(result.Segments), + }, nil +} + +// buildVTT generates WebVTT content from Whisper segments. +func buildVTT(segments []whisperSegment) string { + if len(segments) == 0 { + return "" + } + var sb strings.Builder + sb.WriteString("WEBVTT\n\n") + for _, seg := range segments { + sb.WriteString(fmt.Sprintf("%s --> %s\n%s\n\n", + formatVTTTime(seg.Start), + formatVTTTime(seg.End), + strings.TrimSpace(seg.Text), + )) + } + return sb.String() +} + +// formatVTTTime converts seconds to WebVTT timestamp (HH:MM:SS.mmm). +func formatVTTTime(seconds float64) string { + d := time.Duration(seconds * float64(time.Second)) + h := int(d.Hours()) + m := int(d.Minutes()) % 60 + s := int(d.Seconds()) % 60 + ms := int(d.Milliseconds()) % 1000 + return fmt.Sprintf("%02d:%02d:%02d.%03d", h, m, s, ms) } diff --git a/test/testutil/fake_speech_transcriber.go b/test/testutil/fake_speech_transcriber.go index 567393b..210da56 100644 --- a/test/testutil/fake_speech_transcriber.go +++ b/test/testutil/fake_speech_transcriber.go @@ -1,14 +1,18 @@ package testutil -import "context" +import ( + "context" + + "github.com/paramah/gw_telegram/internal/domain/port" +) type FakeSpeechTranscriber struct { - Transcript string - Error error - CallCount int + Result port.TranscriptionResult + Error error + CallCount int } -func (f *FakeSpeechTranscriber) Transcribe(_ context.Context, _ []byte, _ string) (string, error) { +func (f *FakeSpeechTranscriber) Transcribe(_ context.Context, _ []byte, _ string) (port.TranscriptionResult, error) { f.CallCount++ - return f.Transcript, f.Error + return f.Result, f.Error } diff --git a/test/testutil/fake_voice_file_store.go b/test/testutil/fake_voice_file_store.go new file mode 100644 index 0000000..6c02588 --- /dev/null +++ b/test/testutil/fake_voice_file_store.go @@ -0,0 +1,33 @@ +package testutil + +import "context" + +type FakeVoiceFileStore struct { + SaveRawPath string + SaveConvertedPath string + SaveVTTPath string + SaveRawError error + SaveConvertedError error + SaveVTTError error + CleanupError error + CleanupCallCount int + LastCleanupID string +} + +func (f *FakeVoiceFileStore) SaveRaw(_ context.Context, _ string, _ []byte) (string, error) { + return f.SaveRawPath, f.SaveRawError +} + +func (f *FakeVoiceFileStore) SaveConverted(_ context.Context, _ string, _ []byte) (string, error) { + return f.SaveConvertedPath, f.SaveConvertedError +} + +func (f *FakeVoiceFileStore) SaveVTT(_ context.Context, _ string, _ string) (string, error) { + return f.SaveVTTPath, f.SaveVTTError +} + +func (f *FakeVoiceFileStore) Cleanup(_ context.Context, sampleID string) error { + f.CleanupCallCount++ + f.LastCleanupID = sampleID + return f.CleanupError +}