feat: handles submitted audio

2026-07-12 07:42:47 +02:00 · 2025-07-29 18:39:26 +02:00
parent b8272d519d
commit 33300d3193
10 changed files with 69 additions and 6 deletions
@@ -22,7 +22,7 @@ use std::io::{Seek, SeekFrom};
 use tempfile::NamedTempFile;
 use tracing::{error, info};

-use crate::utils::image_parsing::extract_text_from_image;
+use crate::utils::{audio_transcription::transcribe_audio_file, image_parsing::extract_text_from_image};

 pub async fn to_text_content(
    ingestion_payload: IngestionPayload,
@@ -231,6 +231,10 @@ async fn extract_text_from_file(
            let content = tokio::fs::read_to_string(&file_info.path).await?;
            Ok(content)
        }
+        "audio/mpeg" | "audio/mp3" | "audio/wav" | "audio/x-wav" | "audio/webm" | "audio/mp4" | "audio/ogg" | "audio/flac" => {
+
+            transcribe_audio_file(&file_info.path, db_client, openai_client).await
+        }
        // Handle other MIME types as needed
        _ => Err(AppError::NotFound(file_info.mime_type.clone())),
    }
@@ -0,0 +1,32 @@
+use async_openai::types::{CreateTranscriptionRequestArgs, AudioResponseFormat};
+use common::{
+    error::AppError,
+    storage::{
+        db::SurrealDbClient,
+        types::system_settings::SystemSettings,
+    },
+};
+
+/// Transcribes the audio file at `file_path` with the model named in the current system settings and returns the transcript text.
+pub async fn transcribe_audio_file(
+    file_path: &str,
+    db_client: &SurrealDbClient,
+    openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
+) -> Result<String, AppError> {
+    let system_settings = SystemSettings::get_current(db_client).await?; // settings lookup failures propagate to the caller
+    let model = system_settings.voice_processing_model; // model is read from settings, not hard-coded
+
+    let request = CreateTranscriptionRequestArgs::default()
+        .file(file_path)
+        .model(model)
+        .response_format(AudioResponseFormat::Json)
+        .build()?; // builder errors are converted into the function's error type by `?`
+
+    let response = openai_client
+        .audio()
+        .transcribe(request)
+        .await
+        .map_err(|e| AppError::Processing(format!("Audio transcription failed: {}", e)))?; // API failures are reported as `AppError::Processing`
+    Ok(response.text)
+}
+
@@ -1,5 +1,6 @@
 pub mod image_parsing;
 pub mod llm_instructions;
+pub mod audio_transcription;

 use common::error::AppError;
 use std::collections::HashMap;