feat: persist voice samples to disk, generate VTT, cleanup after analysis
This commit is contained in:
@@ -9,6 +9,7 @@ OPENAI_API_KEY=your_openai_key_here
|
|||||||
STT_PROVIDER=openai
|
STT_PROVIDER=openai
|
||||||
WHISPER_MODEL=whisper-1
|
WHISPER_MODEL=whisper-1
|
||||||
WHISPER_LANGUAGE=
|
WHISPER_LANGUAGE=
|
||||||
|
VOICE_STORE_PATH=/tmp/gw_telegram/voice
|
||||||
|
|
||||||
REDIS_URL=redis://localhost:6379
|
REDIS_URL=redis://localhost:6379
|
||||||
SESSION_TTL=24
|
SESSION_TTL=24
|
||||||
|
|||||||
@@ -87,10 +87,15 @@ func main() {
|
|||||||
// Speech
|
// Speech
|
||||||
transcriber := speech.NewOpenAIWhisper(cfg.Speech.OpenAIKey, cfg.Speech.WhisperModel, cfg.Speech.Language)
|
transcriber := speech.NewOpenAIWhisper(cfg.Speech.OpenAIKey, cfg.Speech.WhisperModel, cfg.Speech.Language)
|
||||||
converter := speech.NewFFmpegConverter(cfg.Speech.FFmpegPath, "")
|
converter := speech.NewFFmpegConverter(cfg.Speech.FFmpegPath, "")
|
||||||
|
voiceFileStore, err := speech.NewLocalVoiceFileStore(cfg.Speech.VoiceStorePath)
|
||||||
|
if err != nil {
|
||||||
|
logger.Error("failed to initialise voice file store", "error", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
// Use cases
|
// Use cases
|
||||||
textUC := usecase.NewHandleTextMessage(intentRouter, dispatcher, sessionStore, gateway, logger)
|
textUC := usecase.NewHandleTextMessage(intentRouter, dispatcher, sessionStore, gateway, logger)
|
||||||
voiceUC := usecase.NewHandleVoiceMessage(downloader, converter, transcriber, textUC, gateway, logger)
|
voiceUC := usecase.NewHandleVoiceMessage(downloader, converter, transcriber, voiceFileStore, textUC, gateway, logger)
|
||||||
|
|
||||||
// Handler + poller
|
// Handler + poller
|
||||||
handler := interfaces.NewTelegramHandler(textUC, voiceUC, logger)
|
handler := interfaces.NewTelegramHandler(textUC, voiceUC, logger)
|
||||||
|
|||||||
@@ -4,8 +4,11 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/google/uuid"
|
||||||
"github.com/paramah/gw_telegram/internal/application/dto"
|
"github.com/paramah/gw_telegram/internal/application/dto"
|
||||||
|
"github.com/paramah/gw_telegram/internal/domain/entity"
|
||||||
"github.com/paramah/gw_telegram/internal/domain/port"
|
"github.com/paramah/gw_telegram/internal/domain/port"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -13,12 +16,13 @@ type HandleVoiceMessage struct {
|
|||||||
downloader port.FileDownloader
|
downloader port.FileDownloader
|
||||||
converter AudioConverter
|
converter AudioConverter
|
||||||
transcriber port.SpeechTranscriber
|
transcriber port.SpeechTranscriber
|
||||||
|
fileStore port.VoiceFileStore
|
||||||
textHandler *HandleTextMessage
|
textHandler *HandleTextMessage
|
||||||
gateway port.MessageGateway
|
gateway port.MessageGateway
|
||||||
logger *slog.Logger
|
logger *slog.Logger
|
||||||
}
|
}
|
||||||
|
|
||||||
// AudioConverter converts audio between formats (OGG -> WAV etc.)
|
// AudioConverter converts audio between formats (OGG → WAV etc.)
|
||||||
type AudioConverter interface {
|
type AudioConverter interface {
|
||||||
Convert(ctx context.Context, input []byte, fromMime, toMime string) ([]byte, error)
|
Convert(ctx context.Context, input []byte, fromMime, toMime string) ([]byte, error)
|
||||||
}
|
}
|
||||||
@@ -27,6 +31,7 @@ func NewHandleVoiceMessage(
|
|||||||
downloader port.FileDownloader,
|
downloader port.FileDownloader,
|
||||||
converter AudioConverter,
|
converter AudioConverter,
|
||||||
transcriber port.SpeechTranscriber,
|
transcriber port.SpeechTranscriber,
|
||||||
|
fileStore port.VoiceFileStore,
|
||||||
textHandler *HandleTextMessage,
|
textHandler *HandleTextMessage,
|
||||||
gateway port.MessageGateway,
|
gateway port.MessageGateway,
|
||||||
logger *slog.Logger,
|
logger *slog.Logger,
|
||||||
@@ -35,6 +40,7 @@ func NewHandleVoiceMessage(
|
|||||||
downloader: downloader,
|
downloader: downloader,
|
||||||
converter: converter,
|
converter: converter,
|
||||||
transcriber: transcriber,
|
transcriber: transcriber,
|
||||||
|
fileStore: fileStore,
|
||||||
textHandler: textHandler,
|
textHandler: textHandler,
|
||||||
gateway: gateway,
|
gateway: gateway,
|
||||||
logger: logger,
|
logger: logger,
|
||||||
@@ -44,6 +50,15 @@ func NewHandleVoiceMessage(
|
|||||||
func (h *HandleVoiceMessage) Execute(ctx context.Context, in dto.IncomingMessageDTO) error {
|
func (h *HandleVoiceMessage) Execute(ctx context.Context, in dto.IncomingMessageDTO) error {
|
||||||
_ = h.gateway.SendTyping(ctx, in.ChatID)
|
_ = h.gateway.SendTyping(ctx, in.ChatID)
|
||||||
|
|
||||||
|
sample := entity.VoiceSample{
|
||||||
|
ID: uuid.New().String(),
|
||||||
|
UserID: in.UserID,
|
||||||
|
ChatID: in.ChatID,
|
||||||
|
FileIDTg: in.VoiceFileID,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 1: download OGG from Telegram and persist to disk
|
||||||
audioBytes, mimeType, err := h.downloader.Download(ctx, in.VoiceFileID)
|
audioBytes, mimeType, err := h.downloader.Download(ctx, in.VoiceFileID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
h.logger.ErrorContext(ctx, "voice download failed", "error", err, "file_id", in.VoiceFileID)
|
h.logger.ErrorContext(ctx, "voice download failed", "error", err, "file_id", in.VoiceFileID)
|
||||||
@@ -51,30 +66,68 @@ func (h *HandleVoiceMessage) Execute(ctx context.Context, in dto.IncomingMessage
|
|||||||
return fmt.Errorf("handle voice: download: %w", err)
|
return fmt.Errorf("handle voice: download: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert OGG Opus (Telegram default) to WAV for Whisper
|
sample.OGGPath, err = h.fileStore.SaveRaw(ctx, sample.ID, audioBytes)
|
||||||
|
if err != nil {
|
||||||
|
h.logger.WarnContext(ctx, "failed to persist raw audio", "error", err, "sample_id", sample.ID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 2: convert OGG → WAV and persist to disk
|
||||||
if mimeType == "audio/ogg" || mimeType == "" {
|
if mimeType == "audio/ogg" || mimeType == "" {
|
||||||
wavBytes, err := h.converter.Convert(ctx, audioBytes, "audio/ogg", "audio/wav")
|
wavBytes, convErr := h.converter.Convert(ctx, audioBytes, "audio/ogg", "audio/wav")
|
||||||
if err != nil {
|
if convErr != nil {
|
||||||
h.logger.WarnContext(ctx, "audio conversion failed, trying raw", "error", err)
|
h.logger.WarnContext(ctx, "audio conversion failed, proceeding with raw audio", "error", convErr)
|
||||||
// fall through with original bytes
|
|
||||||
} else {
|
} else {
|
||||||
audioBytes = wavBytes
|
audioBytes = wavBytes
|
||||||
mimeType = "audio/wav"
|
mimeType = "audio/wav"
|
||||||
|
|
||||||
|
sample.WAVPath, err = h.fileStore.SaveConverted(ctx, sample.ID, wavBytes)
|
||||||
|
if err != nil {
|
||||||
|
h.logger.WarnContext(ctx, "failed to persist converted audio", "error", err, "sample_id", sample.ID)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
transcript, err := h.transcriber.Transcribe(ctx, audioBytes, mimeType)
|
// Step 3: transcribe — receive both text and VTT
|
||||||
|
result, err := h.transcriber.Transcribe(ctx, audioBytes, mimeType)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
h.logger.ErrorContext(ctx, "transcription failed", "error", err)
|
h.logger.ErrorContext(ctx, "transcription failed", "error", err, "sample_id", sample.ID)
|
||||||
_ = h.gateway.SendText(ctx, in.ChatID, "Sorry, I couldn't understand the voice message. Please try sending text instead.")
|
_ = h.gateway.SendText(ctx, in.ChatID, "Sorry, I couldn't understand the voice message. Please try sending text instead.")
|
||||||
|
_ = h.fileStore.Cleanup(ctx, sample.ID)
|
||||||
return fmt.Errorf("handle voice: transcribe: %w", err)
|
return fmt.Errorf("handle voice: transcribe: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
h.logger.InfoContext(ctx, "voice transcribed", "user_id", in.UserID, "length", len(transcript))
|
sample.Transcript = result.Text
|
||||||
|
|
||||||
|
// Step 4: persist VTT file
|
||||||
|
if result.VTT != "" {
|
||||||
|
sample.VTTPath, err = h.fileStore.SaveVTT(ctx, sample.ID, result.VTT)
|
||||||
|
if err != nil {
|
||||||
|
h.logger.WarnContext(ctx, "failed to persist vtt", "error", err, "sample_id", sample.ID)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
h.logger.InfoContext(ctx, "voice sample analysed",
|
||||||
|
"user_id", in.UserID,
|
||||||
|
"sample_id", sample.ID,
|
||||||
|
"ogg_path", sample.OGGPath,
|
||||||
|
"wav_path", sample.WAVPath,
|
||||||
|
"vtt_path", sample.VTTPath,
|
||||||
|
"transcript_len", len(result.Text),
|
||||||
|
)
|
||||||
|
|
||||||
|
// Step 5: route transcript as a regular text message
|
||||||
textIn := in
|
textIn := in
|
||||||
textIn.Text = transcript
|
textIn.Text = result.Text
|
||||||
textIn.IsVoice = false
|
textIn.IsVoice = false
|
||||||
|
|
||||||
return h.textHandler.Execute(ctx, textIn)
|
if err := h.textHandler.Execute(ctx, textIn); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 6: cleanup only after successful analysis and dispatch
|
||||||
|
if cleanErr := h.fileStore.Cleanup(ctx, sample.ID); cleanErr != nil {
|
||||||
|
h.logger.WarnContext(ctx, "failed to cleanup voice files", "error", cleanErr, "sample_id", sample.ID)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ import (
|
|||||||
"github.com/paramah/gw_telegram/internal/application/dto"
|
"github.com/paramah/gw_telegram/internal/application/dto"
|
||||||
"github.com/paramah/gw_telegram/internal/application/usecase"
|
"github.com/paramah/gw_telegram/internal/application/usecase"
|
||||||
"github.com/paramah/gw_telegram/internal/domain/entity"
|
"github.com/paramah/gw_telegram/internal/domain/entity"
|
||||||
|
"github.com/paramah/gw_telegram/internal/domain/port"
|
||||||
"github.com/paramah/gw_telegram/test/testutil"
|
"github.com/paramah/gw_telegram/test/testutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -24,25 +25,44 @@ func validVoiceDTO() dto.IncomingMessageDTO {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func newVoiceUC(
|
||||||
|
downloader *testutil.FakeFileDownloader,
|
||||||
|
transcriber *testutil.FakeSpeechTranscriber,
|
||||||
|
fileStore *testutil.FakeVoiceFileStore,
|
||||||
|
router *testutil.FakeIntentRouter,
|
||||||
|
dispatcher *testutil.FakeWorkflowDispatcher,
|
||||||
|
gateway *testutil.FakeMessageGateway,
|
||||||
|
) *usecase.HandleVoiceMessage {
|
||||||
|
textUC := usecase.NewHandleTextMessage(router, dispatcher, &testutil.FakeSessionStore{}, gateway, newTestLogger())
|
||||||
|
return usecase.NewHandleVoiceMessage(
|
||||||
|
downloader,
|
||||||
|
&testutil.FakeAudioConverter{},
|
||||||
|
transcriber,
|
||||||
|
fileStore,
|
||||||
|
textUC,
|
||||||
|
gateway,
|
||||||
|
newTestLogger(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
func TestHandleVoiceMessage_Execute_HappyPath(t *testing.T) {
|
func TestHandleVoiceMessage_Execute_HappyPath(t *testing.T) {
|
||||||
downloader := &testutil.FakeFileDownloader{
|
downloader := &testutil.FakeFileDownloader{Data: []byte("fake-ogg"), MimeType: "audio/ogg"}
|
||||||
Data: []byte("fake-ogg-audio"),
|
transcriber := &testutil.FakeSpeechTranscriber{
|
||||||
MimeType: "audio/ogg",
|
Result: port.TranscriptionResult{
|
||||||
|
Text: "I need help with my order",
|
||||||
|
VTT: "WEBVTT\n\n00:00:00.000 --> 00:00:02.000\nI need help with my order\n\n",
|
||||||
|
},
|
||||||
}
|
}
|
||||||
converter := &testutil.FakeAudioConverter{Output: []byte("fake-wav-audio")}
|
fileStore := &testutil.FakeVoiceFileStore{
|
||||||
transcriber := &testutil.FakeSpeechTranscriber{Transcript: "I need help with my order"}
|
SaveRawPath: "/tmp/abc.ogg",
|
||||||
router := &testutil.FakeIntentRouter{
|
SaveConvertedPath: "/tmp/abc.wav",
|
||||||
RouteResult: entity.Route{IntentName: "order_inquiry"},
|
SaveVTTPath: "/tmp/abc.vtt",
|
||||||
}
|
}
|
||||||
dispatcher := &testutil.FakeWorkflowDispatcher{
|
router := &testutil.FakeIntentRouter{RouteResult: entity.Route{IntentName: "order_inquiry"}}
|
||||||
Response: entity.WorkflowResponse{ReplyText: "Order status: shipped"},
|
dispatcher := &testutil.FakeWorkflowDispatcher{Response: entity.WorkflowResponse{ReplyText: "Order status: shipped"}}
|
||||||
}
|
|
||||||
sessions := &testutil.FakeSessionStore{}
|
|
||||||
gateway := &testutil.FakeMessageGateway{}
|
gateway := &testutil.FakeMessageGateway{}
|
||||||
|
|
||||||
textUC := usecase.NewHandleTextMessage(router, dispatcher, sessions, gateway, newTestLogger())
|
uc := newVoiceUC(downloader, transcriber, fileStore, router, dispatcher, gateway)
|
||||||
uc := usecase.NewHandleVoiceMessage(downloader, converter, transcriber, textUC, gateway, newTestLogger())
|
|
||||||
|
|
||||||
err := uc.Execute(context.Background(), validVoiceDTO())
|
err := uc.Execute(context.Background(), validVoiceDTO())
|
||||||
|
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
@@ -50,50 +70,64 @@ func TestHandleVoiceMessage_Execute_HappyPath(t *testing.T) {
|
|||||||
assert.Equal(t, 1, dispatcher.CallCount)
|
assert.Equal(t, 1, dispatcher.CallCount)
|
||||||
assert.Equal(t, "I need help with my order", dispatcher.LastRequest.MessageText)
|
assert.Equal(t, "I need help with my order", dispatcher.LastRequest.MessageText)
|
||||||
assert.Equal(t, "Order status: shipped", gateway.LastSentText)
|
assert.Equal(t, "Order status: shipped", gateway.LastSentText)
|
||||||
|
assert.Equal(t, 1, fileStore.CleanupCallCount)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestHandleVoiceMessage_Execute_DownloadFails(t *testing.T) {
|
func TestHandleVoiceMessage_Execute_DownloadFails(t *testing.T) {
|
||||||
downloader := &testutil.FakeFileDownloader{Error: errors.New("download error")}
|
downloader := &testutil.FakeFileDownloader{Error: errors.New("network error")}
|
||||||
transcriber := &testutil.FakeSpeechTranscriber{}
|
transcriber := &testutil.FakeSpeechTranscriber{}
|
||||||
|
fileStore := &testutil.FakeVoiceFileStore{}
|
||||||
gateway := &testutil.FakeMessageGateway{}
|
gateway := &testutil.FakeMessageGateway{}
|
||||||
|
|
||||||
textUC := usecase.NewHandleTextMessage(
|
uc := newVoiceUC(downloader, transcriber, fileStore,
|
||||||
&testutil.FakeIntentRouter{},
|
&testutil.FakeIntentRouter{},
|
||||||
&testutil.FakeWorkflowDispatcher{},
|
&testutil.FakeWorkflowDispatcher{},
|
||||||
&testutil.FakeSessionStore{},
|
|
||||||
gateway,
|
gateway,
|
||||||
newTestLogger(),
|
|
||||||
)
|
)
|
||||||
uc := usecase.NewHandleVoiceMessage(downloader, &testutil.FakeAudioConverter{}, transcriber, textUC, gateway, newTestLogger())
|
|
||||||
|
|
||||||
err := uc.Execute(context.Background(), validVoiceDTO())
|
err := uc.Execute(context.Background(), validVoiceDTO())
|
||||||
|
|
||||||
assert.Error(t, err)
|
assert.Error(t, err)
|
||||||
assert.Equal(t, 0, transcriber.CallCount)
|
assert.Equal(t, 0, transcriber.CallCount)
|
||||||
assert.NotEmpty(t, gateway.LastSentText) // error message sent
|
assert.NotEmpty(t, gateway.LastSentText)
|
||||||
|
assert.Equal(t, 0, fileStore.CleanupCallCount)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestHandleVoiceMessage_Execute_TranscriptionFails(t *testing.T) {
|
func TestHandleVoiceMessage_Execute_TranscriptionFails(t *testing.T) {
|
||||||
downloader := &testutil.FakeFileDownloader{
|
downloader := &testutil.FakeFileDownloader{Data: []byte("audio"), MimeType: "audio/ogg"}
|
||||||
Data: []byte("audio"),
|
|
||||||
MimeType: "audio/ogg",
|
|
||||||
}
|
|
||||||
transcriber := &testutil.FakeSpeechTranscriber{Error: errors.New("whisper error")}
|
transcriber := &testutil.FakeSpeechTranscriber{Error: errors.New("whisper error")}
|
||||||
|
fileStore := &testutil.FakeVoiceFileStore{}
|
||||||
gateway := &testutil.FakeMessageGateway{}
|
gateway := &testutil.FakeMessageGateway{}
|
||||||
dispatcher := &testutil.FakeWorkflowDispatcher{}
|
dispatcher := &testutil.FakeWorkflowDispatcher{}
|
||||||
|
|
||||||
textUC := usecase.NewHandleTextMessage(
|
uc := newVoiceUC(downloader, transcriber, fileStore,
|
||||||
&testutil.FakeIntentRouter{},
|
&testutil.FakeIntentRouter{},
|
||||||
dispatcher,
|
dispatcher,
|
||||||
&testutil.FakeSessionStore{},
|
|
||||||
gateway,
|
gateway,
|
||||||
newTestLogger(),
|
|
||||||
)
|
)
|
||||||
uc := usecase.NewHandleVoiceMessage(downloader, &testutil.FakeAudioConverter{}, transcriber, textUC, gateway, newTestLogger())
|
|
||||||
|
|
||||||
err := uc.Execute(context.Background(), validVoiceDTO())
|
err := uc.Execute(context.Background(), validVoiceDTO())
|
||||||
|
|
||||||
assert.Error(t, err)
|
assert.Error(t, err)
|
||||||
assert.Equal(t, 0, dispatcher.CallCount)
|
assert.Equal(t, 0, dispatcher.CallCount)
|
||||||
assert.NotEmpty(t, gateway.LastSentText)
|
assert.NotEmpty(t, gateway.LastSentText)
|
||||||
|
assert.Equal(t, 1, fileStore.CleanupCallCount)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleVoiceMessage_Execute_NoVTTWhenEmpty(t *testing.T) {
|
||||||
|
downloader := &testutil.FakeFileDownloader{Data: []byte("audio"), MimeType: "audio/ogg"}
|
||||||
|
transcriber := &testutil.FakeSpeechTranscriber{
|
||||||
|
Result: port.TranscriptionResult{Text: "Hello", VTT: ""},
|
||||||
|
}
|
||||||
|
fileStore := &testutil.FakeVoiceFileStore{}
|
||||||
|
gateway := &testutil.FakeMessageGateway{}
|
||||||
|
|
||||||
|
uc := newVoiceUC(downloader, transcriber, fileStore,
|
||||||
|
&testutil.FakeIntentRouter{RouteResult: entity.Route{IntentName: "general_query"}},
|
||||||
|
&testutil.FakeWorkflowDispatcher{Response: entity.WorkflowResponse{ReplyText: "OK"}},
|
||||||
|
gateway,
|
||||||
|
)
|
||||||
|
err := uc.Execute(context.Background(), validVoiceDTO())
|
||||||
|
|
||||||
|
require.NoError(t, err)
|
||||||
|
// SaveVTT should NOT have been called — VTTPath stays empty
|
||||||
|
assert.Empty(t, fileStore.SaveVTTPath)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -30,11 +30,12 @@ type N8nConfig struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type SpeechConfig struct {
|
type SpeechConfig struct {
|
||||||
Provider string `envconfig:"STT_PROVIDER" default:"openai"`
|
Provider string `envconfig:"STT_PROVIDER" default:"openai"`
|
||||||
OpenAIKey string `envconfig:"OPENAI_API_KEY"`
|
OpenAIKey string `envconfig:"OPENAI_API_KEY"`
|
||||||
WhisperModel string `envconfig:"WHISPER_MODEL" default:"whisper-1"`
|
WhisperModel string `envconfig:"WHISPER_MODEL" default:"whisper-1"`
|
||||||
Language string `envconfig:"WHISPER_LANGUAGE" default:""`
|
Language string `envconfig:"WHISPER_LANGUAGE" default:""`
|
||||||
FFmpegPath string `envconfig:"FFMPEG_PATH" default:"ffmpeg"`
|
FFmpegPath string `envconfig:"FFMPEG_PATH" default:"ffmpeg"`
|
||||||
|
VoiceStorePath string `envconfig:"VOICE_STORE_PATH" default:"/tmp/gw_telegram/voice"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type RedisConfig struct {
|
type RedisConfig struct {
|
||||||
|
|||||||
17
internal/domain/entity/voice_sample.go
Normal file
17
internal/domain/entity/voice_sample.go
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
package entity
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// VoiceSample tracks all files produced during voice message processing.
|
||||||
|
// Files are kept on disk until Cleanup() is called after successful analysis.
|
||||||
|
type VoiceSample struct {
|
||||||
|
ID string
|
||||||
|
UserID int64
|
||||||
|
ChatID int64
|
||||||
|
FileIDTg string // original Telegram file ID
|
||||||
|
OGGPath string // downloaded raw audio
|
||||||
|
WAVPath string // ffmpeg-converted audio
|
||||||
|
VTTPath string // WebVTT subtitle file generated by Whisper
|
||||||
|
Transcript string
|
||||||
|
CreatedAt time.Time
|
||||||
|
}
|
||||||
@@ -2,6 +2,12 @@ package port
|
|||||||
|
|
||||||
import "context"
|
import "context"
|
||||||
|
|
||||||
type SpeechTranscriber interface {
|
// TranscriptionResult holds both the plain-text transcript and the WebVTT subtitle content.
|
||||||
Transcribe(ctx context.Context, audioData []byte, mimeType string) (string, error)
|
type TranscriptionResult struct {
|
||||||
|
Text string
|
||||||
|
VTT string // WebVTT format, empty if not available
|
||||||
|
}
|
||||||
|
|
||||||
|
type SpeechTranscriber interface {
|
||||||
|
Transcribe(ctx context.Context, audioData []byte, mimeType string) (TranscriptionResult, error)
|
||||||
}
|
}
|
||||||
|
|||||||
12
internal/domain/port/voice_file_store.go
Normal file
12
internal/domain/port/voice_file_store.go
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
package port
|
||||||
|
|
||||||
|
import "context"
|
||||||
|
|
||||||
|
// VoiceFileStore manages lifecycle of voice sample files on disk.
|
||||||
|
// Files are persisted until Cleanup is called after successful analysis.
|
||||||
|
type VoiceFileStore interface {
|
||||||
|
SaveRaw(ctx context.Context, sampleID string, data []byte) (path string, err error)
|
||||||
|
SaveConverted(ctx context.Context, sampleID string, data []byte) (path string, err error)
|
||||||
|
SaveVTT(ctx context.Context, sampleID string, content string) (path string, err error)
|
||||||
|
Cleanup(ctx context.Context, sampleID string) error
|
||||||
|
}
|
||||||
@@ -26,6 +26,9 @@ func NewFFmpegConverter(ffmpegPath, tempDir string) *FFmpegConverter {
|
|||||||
return &FFmpegConverter{ffmpegPath: ffmpegPath, tempDir: tempDir}
|
return &FFmpegConverter{ffmpegPath: ffmpegPath, tempDir: tempDir}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Convert converts audio bytes between formats and returns the converted bytes.
|
||||||
|
// Intermediate temp files are cleaned up automatically — use ConvertFile when
|
||||||
|
// you need to keep the output on disk.
|
||||||
func (c *FFmpegConverter) Convert(ctx context.Context, input []byte, fromMime, toMime string) ([]byte, error) {
|
func (c *FFmpegConverter) Convert(ctx context.Context, input []byte, fromMime, toMime string) ([]byte, error) {
|
||||||
id := uuid.New().String()
|
id := uuid.New().String()
|
||||||
inFile := filepath.Join(c.tempDir, id+".input")
|
inFile := filepath.Join(c.tempDir, id+".input")
|
||||||
@@ -38,6 +41,32 @@ func (c *FFmpegConverter) Convert(ctx context.Context, input []byte, fromMime, t
|
|||||||
return nil, fmt.Errorf("write temp input: %w", err)
|
return nil, fmt.Errorf("write temp input: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if err := c.runFFmpeg(ctx, inFile, outFile); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
out, err := os.ReadFile(outFile)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("read converted file: %w", err)
|
||||||
|
}
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ConvertFile converts audio bytes and writes the WAV output to outPath.
|
||||||
|
// The caller is responsible for deleting outPath when no longer needed.
|
||||||
|
func (c *FFmpegConverter) ConvertFile(ctx context.Context, input []byte, outPath string) error {
|
||||||
|
id := uuid.New().String()
|
||||||
|
inFile := filepath.Join(c.tempDir, id+".input")
|
||||||
|
|
||||||
|
defer os.Remove(inFile)
|
||||||
|
|
||||||
|
if err := os.WriteFile(inFile, input, 0600); err != nil {
|
||||||
|
return fmt.Errorf("write temp input: %w", err)
|
||||||
|
}
|
||||||
|
return c.runFFmpeg(ctx, inFile, outPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *FFmpegConverter) runFFmpeg(ctx context.Context, inFile, outFile string) error {
|
||||||
cmd := exec.CommandContext(ctx, c.ffmpegPath,
|
cmd := exec.CommandContext(ctx, c.ffmpegPath,
|
||||||
"-i", inFile,
|
"-i", inFile,
|
||||||
"-ar", "16000",
|
"-ar", "16000",
|
||||||
@@ -46,18 +75,10 @@ func (c *FFmpegConverter) Convert(ctx context.Context, input []byte, fromMime, t
|
|||||||
"-y",
|
"-y",
|
||||||
outFile,
|
outFile,
|
||||||
)
|
)
|
||||||
|
|
||||||
var stderr bytes.Buffer
|
var stderr bytes.Buffer
|
||||||
cmd.Stderr = &stderr
|
cmd.Stderr = &stderr
|
||||||
|
|
||||||
if err := cmd.Run(); err != nil {
|
if err := cmd.Run(); err != nil {
|
||||||
return nil, fmt.Errorf("ffmpeg conversion: %w: %s", err, stderr.String())
|
return fmt.Errorf("ffmpeg conversion: %w: %s", err, stderr.String())
|
||||||
}
|
}
|
||||||
|
return nil
|
||||||
out, err := os.ReadFile(outFile)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("read converted file: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return out, nil
|
|
||||||
}
|
}
|
||||||
|
|||||||
57
internal/infrastructure/speech/local_voice_file_store.go
Normal file
57
internal/infrastructure/speech/local_voice_file_store.go
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
package speech
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
)
|
||||||
|
|
||||||
|
// LocalVoiceFileStore persists voice sample files on the local filesystem.
|
||||||
|
// All files for a sample share a common prefix: {storageDir}/{sampleID}.{ext}
|
||||||
|
type LocalVoiceFileStore struct {
|
||||||
|
storageDir string
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewLocalVoiceFileStore(storageDir string) (*LocalVoiceFileStore, error) {
|
||||||
|
if err := os.MkdirAll(storageDir, 0750); err != nil {
|
||||||
|
return nil, fmt.Errorf("create voice storage dir: %w", err)
|
||||||
|
}
|
||||||
|
return &LocalVoiceFileStore{storageDir: storageDir}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *LocalVoiceFileStore) SaveRaw(_ context.Context, sampleID string, data []byte) (string, error) {
|
||||||
|
path := filepath.Join(s.storageDir, sampleID+".ogg")
|
||||||
|
if err := os.WriteFile(path, data, 0600); err != nil {
|
||||||
|
return "", fmt.Errorf("save raw audio: %w", err)
|
||||||
|
}
|
||||||
|
return path, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *LocalVoiceFileStore) SaveConverted(_ context.Context, sampleID string, data []byte) (string, error) {
|
||||||
|
path := filepath.Join(s.storageDir, sampleID+".wav")
|
||||||
|
if err := os.WriteFile(path, data, 0600); err != nil {
|
||||||
|
return "", fmt.Errorf("save converted audio: %w", err)
|
||||||
|
}
|
||||||
|
return path, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *LocalVoiceFileStore) SaveVTT(_ context.Context, sampleID string, content string) (string, error) {
|
||||||
|
path := filepath.Join(s.storageDir, sampleID+".vtt")
|
||||||
|
if err := os.WriteFile(path, []byte(content), 0600); err != nil {
|
||||||
|
return "", fmt.Errorf("save vtt: %w", err)
|
||||||
|
}
|
||||||
|
return path, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cleanup removes all files associated with a sample ID (.ogg, .wav, .vtt).
|
||||||
|
// Missing files are silently ignored.
|
||||||
|
func (s *LocalVoiceFileStore) Cleanup(_ context.Context, sampleID string) error {
|
||||||
|
for _, ext := range []string{".ogg", ".wav", ".vtt"} {
|
||||||
|
path := filepath.Join(s.storageDir, sampleID+ext)
|
||||||
|
if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
|
||||||
|
return fmt.Errorf("cleanup %s: %w", ext, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
@@ -8,7 +8,10 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"mime/multipart"
|
"mime/multipart"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/paramah/gw_telegram/internal/domain/port"
|
||||||
)
|
)
|
||||||
|
|
||||||
const whisperAPIURL = "https://api.openai.com/v1/audio/transcriptions"
|
const whisperAPIURL = "https://api.openai.com/v1/audio/transcriptions"
|
||||||
@@ -29,61 +32,97 @@ func NewOpenAIWhisper(apiKey, model, language string) *OpenAIWhisper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
type whisperResponse struct {
|
// whisperVerboseResponse is returned by Whisper API when response_format=verbose_json.
|
||||||
Text string `json:"text"`
|
type whisperVerboseResponse struct {
|
||||||
|
Text string `json:"text"`
|
||||||
|
Segments []whisperSegment `json:"segments"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (w *OpenAIWhisper) Transcribe(ctx context.Context, audioData []byte, mimeType string) (string, error) {
|
type whisperSegment struct {
|
||||||
|
Start float64 `json:"start"`
|
||||||
|
End float64 `json:"end"`
|
||||||
|
Text string `json:"text"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *OpenAIWhisper) Transcribe(ctx context.Context, audioData []byte, mimeType string) (port.TranscriptionResult, error) {
|
||||||
var buf bytes.Buffer
|
var buf bytes.Buffer
|
||||||
mw := multipart.NewWriter(&buf)
|
mw := multipart.NewWriter(&buf)
|
||||||
|
|
||||||
fw, err := mw.CreateFormFile("file", "audio.wav")
|
fw, err := mw.CreateFormFile("file", "audio.wav")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("create form file: %w", err)
|
return port.TranscriptionResult{}, fmt.Errorf("create form file: %w", err)
|
||||||
}
|
}
|
||||||
if _, err := fw.Write(audioData); err != nil {
|
if _, err := fw.Write(audioData); err != nil {
|
||||||
return "", fmt.Errorf("write audio data: %w", err)
|
return port.TranscriptionResult{}, fmt.Errorf("write audio data: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := mw.WriteField("model", w.model); err != nil {
|
if err := mw.WriteField("model", w.model); err != nil {
|
||||||
return "", fmt.Errorf("write model field: %w", err)
|
return port.TranscriptionResult{}, fmt.Errorf("write model field: %w", err)
|
||||||
}
|
}
|
||||||
if err := mw.WriteField("response_format", "json"); err != nil {
|
if err := mw.WriteField("response_format", "verbose_json"); err != nil {
|
||||||
return "", fmt.Errorf("write response_format: %w", err)
|
return port.TranscriptionResult{}, fmt.Errorf("write response_format: %w", err)
|
||||||
}
|
}
|
||||||
if w.language != "" {
|
if w.language != "" {
|
||||||
if err := mw.WriteField("language", w.language); err != nil {
|
if err := mw.WriteField("language", w.language); err != nil {
|
||||||
return "", fmt.Errorf("write language field: %w", err)
|
return port.TranscriptionResult{}, fmt.Errorf("write language field: %w", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
mw.Close()
|
mw.Close()
|
||||||
|
|
||||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, whisperAPIURL, &buf)
|
req, err := http.NewRequestWithContext(ctx, http.MethodPost, whisperAPIURL, &buf)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("create whisper request: %w", err)
|
return port.TranscriptionResult{}, fmt.Errorf("create whisper request: %w", err)
|
||||||
}
|
}
|
||||||
req.Header.Set("Authorization", "Bearer "+w.apiKey)
|
req.Header.Set("Authorization", "Bearer "+w.apiKey)
|
||||||
req.Header.Set("Content-Type", mw.FormDataContentType())
|
req.Header.Set("Content-Type", mw.FormDataContentType())
|
||||||
|
|
||||||
resp, err := w.client.Do(req)
|
resp, err := w.client.Do(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("whisper API call: %w", err)
|
return port.TranscriptionResult{}, fmt.Errorf("whisper API call: %w", err)
|
||||||
}
|
}
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
|
|
||||||
body, err := io.ReadAll(resp.Body)
|
body, err := io.ReadAll(resp.Body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("read whisper response: %w", err)
|
return port.TranscriptionResult{}, fmt.Errorf("read whisper response: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if resp.StatusCode != http.StatusOK {
|
if resp.StatusCode != http.StatusOK {
|
||||||
return "", fmt.Errorf("whisper API error %d: %s", resp.StatusCode, string(body))
|
return port.TranscriptionResult{}, fmt.Errorf("whisper API error %d: %s", resp.StatusCode, string(body))
|
||||||
}
|
}
|
||||||
|
|
||||||
var result whisperResponse
|
var result whisperVerboseResponse
|
||||||
if err := json.Unmarshal(body, &result); err != nil {
|
if err := json.Unmarshal(body, &result); err != nil {
|
||||||
return "", fmt.Errorf("parse whisper response: %w", err)
|
return port.TranscriptionResult{}, fmt.Errorf("parse whisper response: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
return result.Text, nil
|
return port.TranscriptionResult{
|
||||||
|
Text: result.Text,
|
||||||
|
VTT: buildVTT(result.Segments),
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildVTT generates WebVTT content from Whisper segments.
|
||||||
|
func buildVTT(segments []whisperSegment) string {
|
||||||
|
if len(segments) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
var sb strings.Builder
|
||||||
|
sb.WriteString("WEBVTT\n\n")
|
||||||
|
for _, seg := range segments {
|
||||||
|
sb.WriteString(fmt.Sprintf("%s --> %s\n%s\n\n",
|
||||||
|
formatVTTTime(seg.Start),
|
||||||
|
formatVTTTime(seg.End),
|
||||||
|
strings.TrimSpace(seg.Text),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
return sb.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
// formatVTTTime converts seconds to WebVTT timestamp (HH:MM:SS.mmm).
|
||||||
|
func formatVTTTime(seconds float64) string {
|
||||||
|
d := time.Duration(seconds * float64(time.Second))
|
||||||
|
h := int(d.Hours())
|
||||||
|
m := int(d.Minutes()) % 60
|
||||||
|
s := int(d.Seconds()) % 60
|
||||||
|
ms := int(d.Milliseconds()) % 1000
|
||||||
|
return fmt.Sprintf("%02d:%02d:%02d.%03d", h, m, s, ms)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,14 +1,18 @@
|
|||||||
package testutil
|
package testutil
|
||||||
|
|
||||||
import "context"
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
"github.com/paramah/gw_telegram/internal/domain/port"
|
||||||
|
)
|
||||||
|
|
||||||
type FakeSpeechTranscriber struct {
|
type FakeSpeechTranscriber struct {
|
||||||
Transcript string
|
Result port.TranscriptionResult
|
||||||
Error error
|
Error error
|
||||||
CallCount int
|
CallCount int
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f *FakeSpeechTranscriber) Transcribe(_ context.Context, _ []byte, _ string) (string, error) {
|
func (f *FakeSpeechTranscriber) Transcribe(_ context.Context, _ []byte, _ string) (port.TranscriptionResult, error) {
|
||||||
f.CallCount++
|
f.CallCount++
|
||||||
return f.Transcript, f.Error
|
return f.Result, f.Error
|
||||||
}
|
}
|
||||||
|
|||||||
33
test/testutil/fake_voice_file_store.go
Normal file
33
test/testutil/fake_voice_file_store.go
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
package testutil
|
||||||
|
|
||||||
|
import "context"
|
||||||
|
|
||||||
|
type FakeVoiceFileStore struct {
|
||||||
|
SaveRawPath string
|
||||||
|
SaveConvertedPath string
|
||||||
|
SaveVTTPath string
|
||||||
|
SaveRawError error
|
||||||
|
SaveConvertedError error
|
||||||
|
SaveVTTError error
|
||||||
|
CleanupError error
|
||||||
|
CleanupCallCount int
|
||||||
|
LastCleanupID string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *FakeVoiceFileStore) SaveRaw(_ context.Context, _ string, _ []byte) (string, error) {
|
||||||
|
return f.SaveRawPath, f.SaveRawError
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *FakeVoiceFileStore) SaveConverted(_ context.Context, _ string, _ []byte) (string, error) {
|
||||||
|
return f.SaveConvertedPath, f.SaveConvertedError
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *FakeVoiceFileStore) SaveVTT(_ context.Context, _ string, _ string) (string, error) {
|
||||||
|
return f.SaveVTTPath, f.SaveVTTError
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *FakeVoiceFileStore) Cleanup(_ context.Context, sampleID string) error {
|
||||||
|
f.CleanupCallCount++
|
||||||
|
f.LastCleanupID = sampleID
|
||||||
|
return f.CleanupError
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user