feat: persist voice samples to disk, generate VTT, cleanup after analysis
This commit is contained in:
@@ -26,6 +26,9 @@ func NewFFmpegConverter(ffmpegPath, tempDir string) *FFmpegConverter {
|
||||
return &FFmpegConverter{ffmpegPath: ffmpegPath, tempDir: tempDir}
|
||||
}
|
||||
|
||||
// Convert converts audio bytes between formats and returns the converted bytes.
|
||||
// Intermediate temp files are cleaned up automatically — use ConvertFile when
|
||||
// you need to keep the output on disk.
|
||||
func (c *FFmpegConverter) Convert(ctx context.Context, input []byte, fromMime, toMime string) ([]byte, error) {
|
||||
id := uuid.New().String()
|
||||
inFile := filepath.Join(c.tempDir, id+".input")
|
||||
@@ -38,6 +41,32 @@ func (c *FFmpegConverter) Convert(ctx context.Context, input []byte, fromMime, t
|
||||
return nil, fmt.Errorf("write temp input: %w", err)
|
||||
}
|
||||
|
||||
if err := c.runFFmpeg(ctx, inFile, outFile); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
out, err := os.ReadFile(outFile)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read converted file: %w", err)
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// ConvertFile converts audio bytes and writes the WAV output to outPath.
|
||||
// The caller is responsible for deleting outPath when no longer needed.
|
||||
func (c *FFmpegConverter) ConvertFile(ctx context.Context, input []byte, outPath string) error {
|
||||
id := uuid.New().String()
|
||||
inFile := filepath.Join(c.tempDir, id+".input")
|
||||
|
||||
defer os.Remove(inFile)
|
||||
|
||||
if err := os.WriteFile(inFile, input, 0600); err != nil {
|
||||
return fmt.Errorf("write temp input: %w", err)
|
||||
}
|
||||
return c.runFFmpeg(ctx, inFile, outPath)
|
||||
}
|
||||
|
||||
func (c *FFmpegConverter) runFFmpeg(ctx context.Context, inFile, outFile string) error {
|
||||
cmd := exec.CommandContext(ctx, c.ffmpegPath,
|
||||
"-i", inFile,
|
||||
"-ar", "16000",
|
||||
@@ -46,18 +75,10 @@ func (c *FFmpegConverter) Convert(ctx context.Context, input []byte, fromMime, t
|
||||
"-y",
|
||||
outFile,
|
||||
)
|
||||
|
||||
var stderr bytes.Buffer
|
||||
cmd.Stderr = &stderr
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
return nil, fmt.Errorf("ffmpeg conversion: %w: %s", err, stderr.String())
|
||||
return fmt.Errorf("ffmpeg conversion: %w: %s", err, stderr.String())
|
||||
}
|
||||
|
||||
out, err := os.ReadFile(outFile)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read converted file: %w", err)
|
||||
}
|
||||
|
||||
return out, nil
|
||||
return nil
|
||||
}
|
||||
|
||||
57
internal/infrastructure/speech/local_voice_file_store.go
Normal file
57
internal/infrastructure/speech/local_voice_file_store.go
Normal file
@@ -0,0 +1,57 @@
|
||||
package speech
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
// LocalVoiceFileStore persists voice sample files on the local filesystem.
// All files for a sample share a common prefix: {storageDir}/{sampleID}.{ext}
//
// NOTE(review): sampleID is joined into the file name without validation —
// confirm that callers only pass identifiers free of path separators.
type LocalVoiceFileStore struct {
	storageDir string
}

// NewLocalVoiceFileStore returns a store rooted at storageDir, creating the
// directory (and any missing parents) with mode 0750 if it does not exist.
func NewLocalVoiceFileStore(storageDir string) (*LocalVoiceFileStore, error) {
	if err := os.MkdirAll(storageDir, 0750); err != nil {
		return nil, fmt.Errorf("create voice storage dir: %w", err)
	}
	return &LocalVoiceFileStore{storageDir: storageDir}, nil
}

// SaveRaw writes data to {storageDir}/{sampleID}.ogg and returns the path of
// the written file.
func (s *LocalVoiceFileStore) SaveRaw(_ context.Context, sampleID string, data []byte) (string, error) {
	return s.writeSample(sampleID, ".ogg", "save raw audio", data)
}

// SaveConverted writes data to {storageDir}/{sampleID}.wav and returns the
// path of the written file.
func (s *LocalVoiceFileStore) SaveConverted(_ context.Context, sampleID string, data []byte) (string, error) {
	return s.writeSample(sampleID, ".wav", "save converted audio", data)
}

// SaveVTT writes content to {storageDir}/{sampleID}.vtt and returns the path
// of the written file.
func (s *LocalVoiceFileStore) SaveVTT(_ context.Context, sampleID string, content string) (string, error) {
	return s.writeSample(sampleID, ".vtt", "save vtt", []byte(content))
}

// Cleanup removes all files associated with a sample ID (.ogg, .wav, .vtt).
// Missing files are silently ignored.
//
// Every extension is attempted even when an earlier removal fails, so a single
// undeletable file does not leave the sample's other files behind. The first
// failure encountered is returned.
func (s *LocalVoiceFileStore) Cleanup(_ context.Context, sampleID string) error {
	var firstErr error
	for _, ext := range []string{".ogg", ".wav", ".vtt"} {
		err := os.Remove(s.samplePath(sampleID, ext))
		if err == nil || os.IsNotExist(err) {
			continue
		}
		if firstErr == nil {
			firstErr = fmt.Errorf("cleanup %s: %w", ext, err)
		}
	}
	return firstErr
}

// samplePath builds the on-disk location of a sample's file with the given
// extension (including the leading dot).
func (s *LocalVoiceFileStore) samplePath(sampleID, ext string) string {
	return filepath.Join(s.storageDir, sampleID+ext)
}

// writeSample writes data to the sample's file for ext with mode 0600 and
// returns its path. op prefixes the wrapped error on failure.
func (s *LocalVoiceFileStore) writeSample(sampleID, ext, op string, data []byte) (string, error) {
	path := s.samplePath(sampleID, ext)
	if err := os.WriteFile(path, data, 0600); err != nil {
		return "", fmt.Errorf("%s: %w", op, err)
	}
	return path, nil
}
|
||||
@@ -8,7 +8,10 @@ import (
|
||||
"io"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/paramah/gw_telegram/internal/domain/port"
|
||||
)
|
||||
|
||||
const whisperAPIURL = "https://api.openai.com/v1/audio/transcriptions"
|
||||
@@ -29,61 +32,97 @@ func NewOpenAIWhisper(apiKey, model, language string) *OpenAIWhisper {
|
||||
}
|
||||
}
|
||||
|
||||
type whisperResponse struct {
|
||||
Text string `json:"text"`
|
||||
// whisperVerboseResponse is returned by Whisper API when response_format=verbose_json.
|
||||
type whisperVerboseResponse struct {
|
||||
Text string `json:"text"`
|
||||
Segments []whisperSegment `json:"segments"`
|
||||
}
|
||||
|
||||
func (w *OpenAIWhisper) Transcribe(ctx context.Context, audioData []byte, mimeType string) (string, error) {
|
||||
type whisperSegment struct {
|
||||
Start float64 `json:"start"`
|
||||
End float64 `json:"end"`
|
||||
Text string `json:"text"`
|
||||
}
|
||||
|
||||
func (w *OpenAIWhisper) Transcribe(ctx context.Context, audioData []byte, mimeType string) (port.TranscriptionResult, error) {
|
||||
var buf bytes.Buffer
|
||||
mw := multipart.NewWriter(&buf)
|
||||
|
||||
fw, err := mw.CreateFormFile("file", "audio.wav")
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("create form file: %w", err)
|
||||
return port.TranscriptionResult{}, fmt.Errorf("create form file: %w", err)
|
||||
}
|
||||
if _, err := fw.Write(audioData); err != nil {
|
||||
return "", fmt.Errorf("write audio data: %w", err)
|
||||
return port.TranscriptionResult{}, fmt.Errorf("write audio data: %w", err)
|
||||
}
|
||||
|
||||
if err := mw.WriteField("model", w.model); err != nil {
|
||||
return "", fmt.Errorf("write model field: %w", err)
|
||||
return port.TranscriptionResult{}, fmt.Errorf("write model field: %w", err)
|
||||
}
|
||||
if err := mw.WriteField("response_format", "json"); err != nil {
|
||||
return "", fmt.Errorf("write response_format: %w", err)
|
||||
if err := mw.WriteField("response_format", "verbose_json"); err != nil {
|
||||
return port.TranscriptionResult{}, fmt.Errorf("write response_format: %w", err)
|
||||
}
|
||||
if w.language != "" {
|
||||
if err := mw.WriteField("language", w.language); err != nil {
|
||||
return "", fmt.Errorf("write language field: %w", err)
|
||||
return port.TranscriptionResult{}, fmt.Errorf("write language field: %w", err)
|
||||
}
|
||||
}
|
||||
mw.Close()
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, whisperAPIURL, &buf)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("create whisper request: %w", err)
|
||||
return port.TranscriptionResult{}, fmt.Errorf("create whisper request: %w", err)
|
||||
}
|
||||
req.Header.Set("Authorization", "Bearer "+w.apiKey)
|
||||
req.Header.Set("Content-Type", mw.FormDataContentType())
|
||||
|
||||
resp, err := w.client.Do(req)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("whisper API call: %w", err)
|
||||
return port.TranscriptionResult{}, fmt.Errorf("whisper API call: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("read whisper response: %w", err)
|
||||
return port.TranscriptionResult{}, fmt.Errorf("read whisper response: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("whisper API error %d: %s", resp.StatusCode, string(body))
|
||||
return port.TranscriptionResult{}, fmt.Errorf("whisper API error %d: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
var result whisperResponse
|
||||
var result whisperVerboseResponse
|
||||
if err := json.Unmarshal(body, &result); err != nil {
|
||||
return "", fmt.Errorf("parse whisper response: %w", err)
|
||||
return port.TranscriptionResult{}, fmt.Errorf("parse whisper response: %w", err)
|
||||
}
|
||||
|
||||
return result.Text, nil
|
||||
return port.TranscriptionResult{
|
||||
Text: result.Text,
|
||||
VTT: buildVTT(result.Segments),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// buildVTT generates WebVTT content from Whisper segments.
|
||||
func buildVTT(segments []whisperSegment) string {
|
||||
if len(segments) == 0 {
|
||||
return ""
|
||||
}
|
||||
var sb strings.Builder
|
||||
sb.WriteString("WEBVTT\n\n")
|
||||
for _, seg := range segments {
|
||||
sb.WriteString(fmt.Sprintf("%s --> %s\n%s\n\n",
|
||||
formatVTTTime(seg.Start),
|
||||
formatVTTTime(seg.End),
|
||||
strings.TrimSpace(seg.Text),
|
||||
))
|
||||
}
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
// formatVTTTime converts seconds to WebVTT timestamp (HH:MM:SS.mmm).
//
// The value is rounded to the nearest millisecond rather than truncated, so
// inputs that are not exactly representable in binary floating point (for
// example 1.005) do not come out one millisecond short. Negative or NaN input
// is clamped to "00:00:00.000", since the timestamp format has no sign.
func formatVTTTime(seconds float64) string {
	// Written as a negated comparison so that NaN is clamped as well.
	if !(seconds > 0) {
		return "00:00:00.000"
	}

	// Adding 0.5 before the truncating conversion rounds to nearest.
	d := time.Duration(seconds*1000+0.5) * time.Millisecond

	h := int64(d / time.Hour)
	m := int64(d % time.Hour / time.Minute)
	s := int64(d % time.Minute / time.Second)
	ms := int64(d % time.Second / time.Millisecond)
	return fmt.Sprintf("%02d:%02d:%02d.%03d", h, m, s, ms)
}
|
||||
|
||||
Reference in New Issue
Block a user