package usecase

import (
	"context"
	"fmt"
	"log/slog"
	"time"

	"github.com/google/uuid"
	"github.com/paramah/gw_telegram/internal/application/dto"
	"github.com/paramah/gw_telegram/internal/domain/entity"
	"github.com/paramah/gw_telegram/internal/domain/port"
)

// HandleVoiceMessage is the use case for an incoming voice message: it
// downloads the audio, optionally converts it to WAV, transcribes it and
// hands the transcript to the text-message use case.
type HandleVoiceMessage struct {
	downloader  port.FileDownloader
	converter   AudioConverter
	transcriber port.SpeechTranscriber
	fileStore   port.VoiceFileStore
	textHandler *HandleTextMessage
	gateway     port.MessageGateway
	logger      *slog.Logger
}

// AudioConverter converts audio between formats (OGG → WAV etc.)
type AudioConverter interface {
	Convert(ctx context.Context, input []byte, fromMime, toMime string) ([]byte, error)
}

// NewHandleVoiceMessage wires the collaborators into a HandleVoiceMessage.
// The arguments are stored as given; no validation is performed.
func NewHandleVoiceMessage(
	downloader port.FileDownloader,
	converter AudioConverter,
	transcriber port.SpeechTranscriber,
	fileStore port.VoiceFileStore,
	textHandler *HandleTextMessage,
	gateway port.MessageGateway,
	logger *slog.Logger,
) *HandleVoiceMessage {
	return &HandleVoiceMessage{
		downloader:  downloader,
		converter:   converter,
		transcriber: transcriber,
		fileStore:   fileStore,
		textHandler: textHandler,
		gateway:     gateway,
		logger:      logger,
	}
}

// Execute processes one voice message end to end.
//
// Download and transcription failures are reported to the chat, logged and
// returned wrapped with a "handle voice: <step>" prefix. Failures to persist
// intermediate files or to convert the audio are logged as warnings and do
// not abort processing. A failure of the text handler is returned wrapped and
// leaves the stored files in place; they are removed only after a successful
// dispatch or after a transcription failure.
func (h *HandleVoiceMessage) Execute(ctx context.Context, in dto.IncomingMessageDTO) error {
	// The typing indicator is best-effort; its failure must not abort processing.
	_ = h.gateway.SendTyping(ctx, in.ChatID)

	sample := entity.VoiceSample{
		ID:        uuid.New().String(),
		UserID:    in.UserID,
		ChatID:    in.ChatID,
		FileIDTg:  in.VoiceFileID,
		CreatedAt: time.Now(),
	}

	// Step 1: download OGG from Telegram and persist to disk
	audioBytes, mimeType, err := h.downloader.Download(ctx, in.VoiceFileID)
	if err != nil {
		h.logger.ErrorContext(ctx, "voice download failed", "error", err, "file_id", in.VoiceFileID)
		// Best-effort notification; the download error is the one returned.
		_ = h.gateway.SendText(ctx, in.ChatID, "Sorry, I couldn't download the audio message. Please try again.")
		return fmt.Errorf("handle voice: download: %w", err)
	}

	sample.OGGPath, err = h.fileStore.SaveRaw(ctx, sample.ID, audioBytes)
	if err != nil {
		h.logger.WarnContext(ctx, "failed to persist raw audio", "error", err, "sample_id", sample.ID)
	}

	// Step 2: convert OGG → WAV and persist to disk.
	// An empty MIME type is treated as OGG.
	if mimeType == "audio/ogg" || mimeType == "" {
		wavBytes, convErr := h.converter.Convert(ctx, audioBytes, "audio/ogg", "audio/wav")
		if convErr != nil {
			// Fall through with the raw bytes and the original MIME type.
			h.logger.WarnContext(ctx, "audio conversion failed, proceeding with raw audio", "error", convErr)
		} else {
			audioBytes = wavBytes
			mimeType = "audio/wav"

			sample.WAVPath, err = h.fileStore.SaveConverted(ctx, sample.ID, wavBytes)
			if err != nil {
				h.logger.WarnContext(ctx, "failed to persist converted audio", "error", err, "sample_id", sample.ID)
			}
		}
	}

	// Step 3: transcribe — receive both text and VTT
	result, err := h.transcriber.Transcribe(ctx, audioBytes, mimeType)
	if err != nil {
		h.logger.ErrorContext(ctx, "transcription failed", "error", err, "sample_id", sample.ID)
		// Best-effort notification; the transcription error is the one returned.
		_ = h.gateway.SendText(ctx, in.ChatID, "Sorry, I couldn't understand the voice message. Please try sending text instead.")
		h.cleanupSample(ctx, sample.ID)
		return fmt.Errorf("handle voice: transcribe: %w", err)
	}
	sample.Transcript = result.Text

	// Step 4: persist VTT file
	if result.VTT != "" {
		sample.VTTPath, err = h.fileStore.SaveVTT(ctx, sample.ID, result.VTT)
		if err != nil {
			h.logger.WarnContext(ctx, "failed to persist vtt", "error", err, "sample_id", sample.ID)
		}
	}

	h.logger.InfoContext(ctx, "voice sample analysed",
		"user_id", in.UserID,
		"sample_id", sample.ID,
		"ogg_path", sample.OGGPath,
		"wav_path", sample.WAVPath,
		"vtt_path", sample.VTTPath,
		"transcript_len", len(result.Text),
	)

	// Step 5: route transcript as a regular text message
	textIn := in
	textIn.Text = result.Text
	textIn.IsVoice = false
	if err := h.textHandler.Execute(ctx, textIn); err != nil {
		// Files are intentionally kept here: cleanup happens only after a
		// successful dispatch (step 6).
		return fmt.Errorf("handle voice: dispatch: %w", err)
	}

	// Step 6: cleanup only after successful analysis and dispatch
	h.cleanupSample(ctx, sample.ID)
	return nil
}

// cleanupSample asks the file store to clean up the given sample and logs a
// warning if that fails; cleanup failures never affect the caller's result.
func (h *HandleVoiceMessage) cleanupSample(ctx context.Context, sampleID string) {
	if err := h.fileStore.Cleanup(ctx, sampleID); err != nil {
		h.logger.WarnContext(ctx, "failed to cleanup voice files", "error", err, "sample_id", sampleID)
	}
}