package usecase import ( "context" "fmt" "log/slog" "github.com/paramah/gw_telegram/internal/application/dto" "github.com/paramah/gw_telegram/internal/domain/port" ) type HandleVoiceMessage struct { downloader port.FileDownloader converter AudioConverter transcriber port.SpeechTranscriber textHandler *HandleTextMessage gateway port.MessageGateway logger *slog.Logger } // AudioConverter converts audio between formats (OGG -> WAV etc.) type AudioConverter interface { Convert(ctx context.Context, input []byte, fromMime, toMime string) ([]byte, error) } func NewHandleVoiceMessage( downloader port.FileDownloader, converter AudioConverter, transcriber port.SpeechTranscriber, textHandler *HandleTextMessage, gateway port.MessageGateway, logger *slog.Logger, ) *HandleVoiceMessage { return &HandleVoiceMessage{ downloader: downloader, converter: converter, transcriber: transcriber, textHandler: textHandler, gateway: gateway, logger: logger, } } func (h *HandleVoiceMessage) Execute(ctx context.Context, in dto.IncomingMessageDTO) error { _ = h.gateway.SendTyping(ctx, in.ChatID) audioBytes, mimeType, err := h.downloader.Download(ctx, in.VoiceFileID) if err != nil { h.logger.ErrorContext(ctx, "voice download failed", "error", err, "file_id", in.VoiceFileID) _ = h.gateway.SendText(ctx, in.ChatID, "Sorry, I couldn't download the audio message. Please try again.") return fmt.Errorf("handle voice: download: %w", err) } // Convert OGG Opus (Telegram default) to WAV for Whisper if mimeType == "audio/ogg" || mimeType == "" { wavBytes, err := h.converter.Convert(ctx, audioBytes, "audio/ogg", "audio/wav") if err != nil { h.logger.WarnContext(ctx, "audio conversion failed, trying raw", "error", err) // fall through with original bytes } else { audioBytes = wavBytes mimeType = "audio/wav" } } transcript, err := h.transcriber.Transcribe(ctx, audioBytes, mimeType) if err != nil { h.logger.ErrorContext(ctx, "transcription failed", "error", err) _ = h.gateway.SendText(ctx, in.ChatID, "Sorry, I couldn't understand the voice message. Please try sending text instead.") return fmt.Errorf("handle voice: transcribe: %w", err) } h.logger.InfoContext(ctx, "voice transcribed", "user_id", in.UserID, "length", len(transcript)) textIn := in textIn.Text = transcript textIn.IsVoice = false return h.textHandler.Execute(ctx, textIn) }