feat: persist voice samples to disk, generate VTT, cleanup after analysis
This commit is contained in:
@@ -4,8 +4,11 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/paramah/gw_telegram/internal/application/dto"
|
||||
"github.com/paramah/gw_telegram/internal/domain/entity"
|
||||
"github.com/paramah/gw_telegram/internal/domain/port"
|
||||
)
|
||||
|
||||
@@ -13,12 +16,13 @@ type HandleVoiceMessage struct {
|
||||
downloader port.FileDownloader
|
||||
converter AudioConverter
|
||||
transcriber port.SpeechTranscriber
|
||||
fileStore port.VoiceFileStore
|
||||
textHandler *HandleTextMessage
|
||||
gateway port.MessageGateway
|
||||
logger *slog.Logger
|
||||
}
|
||||
|
||||
// AudioConverter converts audio between formats (OGG -> WAV etc.)
|
||||
// AudioConverter converts audio between formats (OGG → WAV etc.)
|
||||
type AudioConverter interface {
|
||||
// Convert receives the encoded audio in input together with its source
// MIME type (fromMime) and the requested target MIME type (toMime), and
// returns the converted bytes or an error.
Convert(ctx context.Context, input []byte, fromMime, toMime string) ([]byte, error)
|
||||
}
|
||||
@@ -27,6 +31,7 @@ func NewHandleVoiceMessage(
|
||||
downloader port.FileDownloader,
|
||||
converter AudioConverter,
|
||||
transcriber port.SpeechTranscriber,
|
||||
fileStore port.VoiceFileStore,
|
||||
textHandler *HandleTextMessage,
|
||||
gateway port.MessageGateway,
|
||||
logger *slog.Logger,
|
||||
@@ -35,6 +40,7 @@ func NewHandleVoiceMessage(
|
||||
downloader: downloader,
|
||||
converter: converter,
|
||||
transcriber: transcriber,
|
||||
fileStore: fileStore,
|
||||
textHandler: textHandler,
|
||||
gateway: gateway,
|
||||
logger: logger,
|
||||
@@ -44,6 +50,15 @@ func NewHandleVoiceMessage(
|
||||
func (h *HandleVoiceMessage) Execute(ctx context.Context, in dto.IncomingMessageDTO) error {
|
||||
_ = h.gateway.SendTyping(ctx, in.ChatID)
|
||||
|
||||
sample := entity.VoiceSample{
|
||||
ID: uuid.New().String(),
|
||||
UserID: in.UserID,
|
||||
ChatID: in.ChatID,
|
||||
FileIDTg: in.VoiceFileID,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
|
||||
// Step 1: download OGG from Telegram and persist to disk
|
||||
audioBytes, mimeType, err := h.downloader.Download(ctx, in.VoiceFileID)
|
||||
if err != nil {
|
||||
h.logger.ErrorContext(ctx, "voice download failed", "error", err, "file_id", in.VoiceFileID)
|
||||
@@ -51,30 +66,68 @@ func (h *HandleVoiceMessage) Execute(ctx context.Context, in dto.IncomingMessage
|
||||
return fmt.Errorf("handle voice: download: %w", err)
|
||||
}
|
||||
|
||||
// Convert OGG Opus (Telegram default) to WAV for Whisper
|
||||
sample.OGGPath, err = h.fileStore.SaveRaw(ctx, sample.ID, audioBytes)
|
||||
if err != nil {
|
||||
h.logger.WarnContext(ctx, "failed to persist raw audio", "error", err, "sample_id", sample.ID)
|
||||
}
|
||||
|
||||
// Step 2: convert OGG → WAV and persist to disk
|
||||
if mimeType == "audio/ogg" || mimeType == "" {
|
||||
wavBytes, err := h.converter.Convert(ctx, audioBytes, "audio/ogg", "audio/wav")
|
||||
if err != nil {
|
||||
h.logger.WarnContext(ctx, "audio conversion failed, trying raw", "error", err)
|
||||
// fall through with original bytes
|
||||
wavBytes, convErr := h.converter.Convert(ctx, audioBytes, "audio/ogg", "audio/wav")
|
||||
if convErr != nil {
|
||||
h.logger.WarnContext(ctx, "audio conversion failed, proceeding with raw audio", "error", convErr)
|
||||
} else {
|
||||
audioBytes = wavBytes
|
||||
mimeType = "audio/wav"
|
||||
|
||||
sample.WAVPath, err = h.fileStore.SaveConverted(ctx, sample.ID, wavBytes)
|
||||
if err != nil {
|
||||
h.logger.WarnContext(ctx, "failed to persist converted audio", "error", err, "sample_id", sample.ID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
transcript, err := h.transcriber.Transcribe(ctx, audioBytes, mimeType)
|
||||
// Step 3: transcribe — receive both text and VTT
|
||||
result, err := h.transcriber.Transcribe(ctx, audioBytes, mimeType)
|
||||
if err != nil {
|
||||
h.logger.ErrorContext(ctx, "transcription failed", "error", err)
|
||||
h.logger.ErrorContext(ctx, "transcription failed", "error", err, "sample_id", sample.ID)
|
||||
_ = h.gateway.SendText(ctx, in.ChatID, "Sorry, I couldn't understand the voice message. Please try sending text instead.")
|
||||
_ = h.fileStore.Cleanup(ctx, sample.ID)
|
||||
return fmt.Errorf("handle voice: transcribe: %w", err)
|
||||
}
|
||||
|
||||
h.logger.InfoContext(ctx, "voice transcribed", "user_id", in.UserID, "length", len(transcript))
|
||||
sample.Transcript = result.Text
|
||||
|
||||
// Step 4: persist VTT file
|
||||
if result.VTT != "" {
|
||||
sample.VTTPath, err = h.fileStore.SaveVTT(ctx, sample.ID, result.VTT)
|
||||
if err != nil {
|
||||
h.logger.WarnContext(ctx, "failed to persist vtt", "error", err, "sample_id", sample.ID)
|
||||
}
|
||||
}
|
||||
|
||||
h.logger.InfoContext(ctx, "voice sample analysed",
|
||||
"user_id", in.UserID,
|
||||
"sample_id", sample.ID,
|
||||
"ogg_path", sample.OGGPath,
|
||||
"wav_path", sample.WAVPath,
|
||||
"vtt_path", sample.VTTPath,
|
||||
"transcript_len", len(result.Text),
|
||||
)
|
||||
|
||||
// Step 5: route transcript as a regular text message
|
||||
textIn := in
|
||||
textIn.Text = transcript
|
||||
textIn.Text = result.Text
|
||||
textIn.IsVoice = false
|
||||
|
||||
return h.textHandler.Execute(ctx, textIn)
|
||||
if err := h.textHandler.Execute(ctx, textIn); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Step 6: cleanup only after successful analysis and dispatch
|
||||
if cleanErr := h.fileStore.Cleanup(ctx, sample.ID); cleanErr != nil {
|
||||
h.logger.WarnContext(ctx, "failed to cleanup voice files", "error", cleanErr, "sample_id", sample.ID)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"github.com/paramah/gw_telegram/internal/application/dto"
|
||||
"github.com/paramah/gw_telegram/internal/application/usecase"
|
||||
"github.com/paramah/gw_telegram/internal/domain/entity"
|
||||
"github.com/paramah/gw_telegram/internal/domain/port"
|
||||
"github.com/paramah/gw_telegram/test/testutil"
|
||||
)
|
||||
|
||||
@@ -24,25 +25,44 @@ func validVoiceDTO() dto.IncomingMessageDTO {
|
||||
}
|
||||
}
|
||||
|
||||
// newVoiceUC assembles a HandleVoiceMessage use case from the supplied fakes.
//
// The text handler it delegates to is built from router, dispatcher and
// gateway plus a fresh zero-value FakeSessionStore. The audio converter is
// always a zero-value FakeAudioConverter, so tests using this helper cannot
// customise the session store or the converter.
func newVoiceUC(
|
||||
downloader *testutil.FakeFileDownloader,
|
||||
transcriber *testutil.FakeSpeechTranscriber,
|
||||
fileStore *testutil.FakeVoiceFileStore,
|
||||
router *testutil.FakeIntentRouter,
|
||||
dispatcher *testutil.FakeWorkflowDispatcher,
|
||||
gateway *testutil.FakeMessageGateway,
|
||||
) *usecase.HandleVoiceMessage {
|
||||
// The same gateway instance is shared by the text and voice use cases.
textUC := usecase.NewHandleTextMessage(router, dispatcher, &testutil.FakeSessionStore{}, gateway, newTestLogger())
|
||||
return usecase.NewHandleVoiceMessage(
|
||||
downloader,
|
||||
&testutil.FakeAudioConverter{},
|
||||
transcriber,
|
||||
fileStore,
|
||||
textUC,
|
||||
gateway,
|
||||
newTestLogger(),
|
||||
)
|
||||
}
|
||||
|
||||
func TestHandleVoiceMessage_Execute_HappyPath(t *testing.T) {
|
||||
downloader := &testutil.FakeFileDownloader{
|
||||
Data: []byte("fake-ogg-audio"),
|
||||
MimeType: "audio/ogg",
|
||||
downloader := &testutil.FakeFileDownloader{Data: []byte("fake-ogg"), MimeType: "audio/ogg"}
|
||||
transcriber := &testutil.FakeSpeechTranscriber{
|
||||
Result: port.TranscriptionResult{
|
||||
Text: "I need help with my order",
|
||||
VTT: "WEBVTT\n\n00:00:00.000 --> 00:00:02.000\nI need help with my order\n\n",
|
||||
},
|
||||
}
|
||||
converter := &testutil.FakeAudioConverter{Output: []byte("fake-wav-audio")}
|
||||
transcriber := &testutil.FakeSpeechTranscriber{Transcript: "I need help with my order"}
|
||||
router := &testutil.FakeIntentRouter{
|
||||
RouteResult: entity.Route{IntentName: "order_inquiry"},
|
||||
fileStore := &testutil.FakeVoiceFileStore{
|
||||
SaveRawPath: "/tmp/abc.ogg",
|
||||
SaveConvertedPath: "/tmp/abc.wav",
|
||||
SaveVTTPath: "/tmp/abc.vtt",
|
||||
}
|
||||
dispatcher := &testutil.FakeWorkflowDispatcher{
|
||||
Response: entity.WorkflowResponse{ReplyText: "Order status: shipped"},
|
||||
}
|
||||
sessions := &testutil.FakeSessionStore{}
|
||||
router := &testutil.FakeIntentRouter{RouteResult: entity.Route{IntentName: "order_inquiry"}}
|
||||
dispatcher := &testutil.FakeWorkflowDispatcher{Response: entity.WorkflowResponse{ReplyText: "Order status: shipped"}}
|
||||
gateway := &testutil.FakeMessageGateway{}
|
||||
|
||||
textUC := usecase.NewHandleTextMessage(router, dispatcher, sessions, gateway, newTestLogger())
|
||||
uc := usecase.NewHandleVoiceMessage(downloader, converter, transcriber, textUC, gateway, newTestLogger())
|
||||
|
||||
uc := newVoiceUC(downloader, transcriber, fileStore, router, dispatcher, gateway)
|
||||
err := uc.Execute(context.Background(), validVoiceDTO())
|
||||
|
||||
require.NoError(t, err)
|
||||
@@ -50,50 +70,64 @@ func TestHandleVoiceMessage_Execute_HappyPath(t *testing.T) {
|
||||
assert.Equal(t, 1, dispatcher.CallCount)
|
||||
assert.Equal(t, "I need help with my order", dispatcher.LastRequest.MessageText)
|
||||
assert.Equal(t, "Order status: shipped", gateway.LastSentText)
|
||||
assert.Equal(t, 1, fileStore.CleanupCallCount)
|
||||
}
|
||||
|
||||
func TestHandleVoiceMessage_Execute_DownloadFails(t *testing.T) {
|
||||
downloader := &testutil.FakeFileDownloader{Error: errors.New("download error")}
|
||||
downloader := &testutil.FakeFileDownloader{Error: errors.New("network error")}
|
||||
transcriber := &testutil.FakeSpeechTranscriber{}
|
||||
fileStore := &testutil.FakeVoiceFileStore{}
|
||||
gateway := &testutil.FakeMessageGateway{}
|
||||
|
||||
textUC := usecase.NewHandleTextMessage(
|
||||
uc := newVoiceUC(downloader, transcriber, fileStore,
|
||||
&testutil.FakeIntentRouter{},
|
||||
&testutil.FakeWorkflowDispatcher{},
|
||||
&testutil.FakeSessionStore{},
|
||||
gateway,
|
||||
newTestLogger(),
|
||||
)
|
||||
uc := usecase.NewHandleVoiceMessage(downloader, &testutil.FakeAudioConverter{}, transcriber, textUC, gateway, newTestLogger())
|
||||
|
||||
err := uc.Execute(context.Background(), validVoiceDTO())
|
||||
|
||||
assert.Error(t, err)
|
||||
assert.Equal(t, 0, transcriber.CallCount)
|
||||
assert.NotEmpty(t, gateway.LastSentText) // error message sent
|
||||
assert.NotEmpty(t, gateway.LastSentText)
|
||||
assert.Equal(t, 0, fileStore.CleanupCallCount)
|
||||
}
|
||||
|
||||
func TestHandleVoiceMessage_Execute_TranscriptionFails(t *testing.T) {
|
||||
downloader := &testutil.FakeFileDownloader{
|
||||
Data: []byte("audio"),
|
||||
MimeType: "audio/ogg",
|
||||
}
|
||||
downloader := &testutil.FakeFileDownloader{Data: []byte("audio"), MimeType: "audio/ogg"}
|
||||
transcriber := &testutil.FakeSpeechTranscriber{Error: errors.New("whisper error")}
|
||||
fileStore := &testutil.FakeVoiceFileStore{}
|
||||
gateway := &testutil.FakeMessageGateway{}
|
||||
dispatcher := &testutil.FakeWorkflowDispatcher{}
|
||||
|
||||
textUC := usecase.NewHandleTextMessage(
|
||||
uc := newVoiceUC(downloader, transcriber, fileStore,
|
||||
&testutil.FakeIntentRouter{},
|
||||
dispatcher,
|
||||
&testutil.FakeSessionStore{},
|
||||
gateway,
|
||||
newTestLogger(),
|
||||
)
|
||||
uc := usecase.NewHandleVoiceMessage(downloader, &testutil.FakeAudioConverter{}, transcriber, textUC, gateway, newTestLogger())
|
||||
|
||||
err := uc.Execute(context.Background(), validVoiceDTO())
|
||||
|
||||
assert.Error(t, err)
|
||||
assert.Equal(t, 0, dispatcher.CallCount)
|
||||
assert.NotEmpty(t, gateway.LastSentText)
|
||||
assert.Equal(t, 1, fileStore.CleanupCallCount)
|
||||
}
|
||||
|
||||
// TestHandleVoiceMessage_Execute_NoVTTWhenEmpty runs Execute with a
// transcription result whose VTT field is empty and expects it to succeed;
// the intent is that no VTT file is written in that case.
func TestHandleVoiceMessage_Execute_NoVTTWhenEmpty(t *testing.T) {
|
||||
downloader := &testutil.FakeFileDownloader{Data: []byte("audio"), MimeType: "audio/ogg"}
|
||||
transcriber := &testutil.FakeSpeechTranscriber{
|
||||
Result: port.TranscriptionResult{Text: "Hello", VTT: ""},
|
||||
}
|
||||
fileStore := &testutil.FakeVoiceFileStore{}
|
||||
gateway := &testutil.FakeMessageGateway{}
|
||||
|
||||
uc := newVoiceUC(downloader, transcriber, fileStore,
|
||||
&testutil.FakeIntentRouter{RouteResult: entity.Route{IntentName: "general_query"}},
|
||||
&testutil.FakeWorkflowDispatcher{Response: entity.WorkflowResponse{ReplyText: "OK"}},
|
||||
gateway,
|
||||
)
|
||||
err := uc.Execute(context.Background(), validVoiceDTO())
|
||||
|
||||
require.NoError(t, err)
|
||||
// SaveVTT should NOT have been called for an empty VTT payload.
// NOTE(review): SaveVTTPath is set as a configured value on the fake in the
// happy-path test and is left at its zero value here, so this assertion
// presumably passes whether or not SaveVTT was invoked. Confirm against
// FakeVoiceFileStore and, if it exposes a call counter for SaveVTT, assert
// on that instead.
|
||||
assert.Empty(t, fileStore.SaveVTTPath)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user