feat: handles submitted audio

2026-07-21 20:18:41 +02:00 · 2025-07-29 18:39:26 +02:00
parent b8272d519d
commit 33300d3193
10 changed files with 69 additions and 6 deletions
@@ -10,7 +10,7 @@
 ## Demo deployment
-To test *Minne* out, enter [this](https://minne-demo.stark.pub) read-only demo deployment to view and test functionality out.
+To try _Minne_ out, visit [this](https://minne-demo.stark.pub) read-only demo deployment to explore its functionality.
 ## The "Why" Behind Minne
@@ -70,7 +70,7 @@ This is a great way to manage Minne and its SurrealDB dependency together.
 1. Create a `docker-compose.yml` file:
   ```yaml
-   version: '3.8'
+   version: "3.8"
   services:
     minne:
       image: ghcr.io/perstarkse/minne:latest # Pulls the latest pre-built image
@@ -88,7 +88,7 @@ This is a great way to manage Minne and its SurrealDB dependency together.
         SURREALDB_DATABASE: "minne_db"
         SURREALDB_NAMESPACE: "minne_ns"
         OPENAI_API_KEY: "your_openai_api_key_here" # IMPORTANT: Replace with your actual key
-         #OPENAI_BASE_URL: "your_ollama_address" # Uncomment this and change it to override the default openai base url 
+         #OPENAI_BASE_URL: "your_ollama_address" # Uncomment this and change it to override the default openai base url
         HTTP_PORT: 3000
         DATA_DIR: "/data" # Data directory inside the container
         RUST_LOG: "minne=info,tower_http=info" # Example logging level
@@ -177,7 +177,7 @@ Binaries for Windows, macOS, and Linux (combined `main` version) are available o
     ```bash
     cargo run --release --bin worker
     ```
-   The compiled binaries will be in `target/release/`.
+     The compiled binaries will be in `target/release/`.
 ## Configuration
@@ -229,7 +229,7 @@ Once Minne is running:
 1. Access the web interface at `http://localhost:3000` (or your configured port).
 1. On iOS, consider setting up the [Minne iOS Shortcut](https://www.icloud.com/shortcuts/9aa960600ec14329837ba4169f57a166) for effortless content sending. **Add the shortcut, replace the [insert_url] and the [insert_api_key] snippets**.
-1. Start adding notes, URLs and explore your growing knowledge graph.
+1. Add notes, URLs, and **audio files**, then explore your growing knowledge graph.
 1. Engage with the chat interface to query your saved content.
 1. Try the experimental visual graph explorer to see connections.
@@ -0,0 +1,5 @@
 DEFINE FIELD IF NOT EXISTS voice_processing_model ON system_settings TYPE string;
 UPDATE system_settings:current SET
    voice_processing_model = "whisper-1"
 WHERE voice_processing_model == NONE; 
@@ -0,0 +1 @@
 {"schemas":"--- original\n+++ modified\n@@ -160,6 +160,7 @@\n DEFINE FIELD IF NOT EXISTS query_system_prompt ON system_settings TYPE string;\n DEFINE FIELD IF NOT EXISTS ingestion_system_prompt ON system_settings TYPE string;\n DEFINE FIELD IF NOT EXISTS image_processing_prompt ON system_settings TYPE string;\n+DEFINE FIELD IF NOT EXISTS voice_processing_model ON system_settings TYPE string;\n\n # Defines the schema for the 'text_chunk' table.\n\n","events":null}
@@ -13,3 +13,4 @@ DEFINE FIELD IF NOT EXISTS embedding_dimensions ON system_settings TYPE int;
 DEFINE FIELD IF NOT EXISTS query_system_prompt ON system_settings TYPE string;
 DEFINE FIELD IF NOT EXISTS ingestion_system_prompt ON system_settings TYPE string;
 DEFINE FIELD IF NOT EXISTS image_processing_prompt ON system_settings TYPE string;
 DEFINE FIELD IF NOT EXISTS voice_processing_model ON system_settings TYPE string;
@@ -17,6 +17,7 @@ pub struct SystemSettings {
    pub ingestion_system_prompt: String,
    pub image_processing_model: String,
    pub image_processing_prompt: String,
    pub voice_processing_model: String,
 }
 impl StoredObject for SystemSettings {
@@ -128,6 +128,7 @@ pub struct ModelSettingsInput {
    query_model: String,
    processing_model: String,
    image_processing_model: String,
    voice_processing_model: String,
    embedding_model: String,
    embedding_dimensions: Option<u32>,
 }
@@ -159,6 +160,7 @@ pub async fn update_model_settings(
        query_model: input.query_model,
        processing_model: input.processing_model,
        image_processing_model: input.image_processing_model,
        voice_processing_model: input.voice_processing_model,
        embedding_model: input.embedding_model,
        // Use new dimensions if provided, otherwise retain the current ones.
        embedding_dimensions: input
@@ -107,6 +107,22 @@
          </p>
        </div>
        <!--
          Voice Processing Model
          Dropdown submitted as the `voice_processing_model` form field.
          One <option> is rendered per entry in `available_models.data`; the entry whose
          id equals `settings.voice_processing_model` is marked as selected.
          The currently stored value is also echoed below the dropdown.
        -->
        <div class="form-control mb-4">
          <label class="label">
            <span class="label-text">Voice Processing Model</span>
          </label>
          <select name="voice_processing_model" class="select select-bordered w-full">
            {% for model in available_models.data %}
            <option value="{{model.id}}" {% if settings.voice_processing_model==model.id %} selected {% endif %}>{{model.id}}</option>
            {% endfor %}
          </select>
          <p class="text-xs text-gray-500 mt-1">
            Current used:
            <span class="font-mono">{{settings.voice_processing_model}}</span>
          </p>
        </div>
        <!-- Embedding Model -->
        <div class="form-control mb-4">
          <label class="label">
@@ -22,7 +22,7 @@ use std::io::{Seek, SeekFrom};
 use tempfile::NamedTempFile;
 use tracing::{error, info};
-use crate::utils::image_parsing::extract_text_from_image;
+use crate::utils::{audio_transcription::transcribe_audio_file, image_parsing::extract_text_from_image};
 pub async fn to_text_content(
    ingestion_payload: IngestionPayload,
@@ -231,6 +231,10 @@ async fn extract_text_from_file(
            let content = tokio::fs::read_to_string(&file_info.path).await?;
            Ok(content)
        }
        "audio/mpeg" | "audio/mp3" | "audio/wav" | "audio/x-wav" | "audio/webm" | "audio/mp4" | "audio/ogg" | "audio/flac" => {
            transcribe_audio_file(&file_info.path, db_client, openai_client).await
        }
        // Handle other MIME types as needed
        _ => Err(AppError::NotFound(file_info.mime_type.clone())),
    }
@@ -0,0 +1,32 @@
 use async_openai::types::{CreateTranscriptionRequestArgs, AudioResponseFormat};
 use common::{
    error::AppError,
    storage::{
        db::SurrealDbClient,
        types::system_settings::SystemSettings,
    },
 };
 /// Sends the audio file at `file_path` to the transcription endpoint and returns its text.
 ///
 /// The model name is taken from the `voice_processing_model` field of the current
 /// [`SystemSettings`], which is loaded from the database on every call. The request asks
 /// for the JSON response format and only the `text` field of the response is returned.
 ///
 /// # Errors
 ///
 /// - Propagates any error from loading the system settings.
 /// - Propagates any error from building the transcription request.
 /// - Returns [`AppError::Processing`] when the transcription call itself fails.
 pub async fn transcribe_audio_file(
    file_path: &str,
    db_client: &SurrealDbClient,
    openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
 ) -> Result<String, AppError> {
    // The model is user-configurable, so it is read from the stored settings each time.
    let settings = SystemSettings::get_current(db_client).await?;

    let transcription_request = CreateTranscriptionRequestArgs::default()
        .file(file_path)
        .model(settings.voice_processing_model)
        .response_format(AudioResponseFormat::Json)
        .build()?;

    match openai_client.audio().transcribe(transcription_request).await {
        Ok(transcription) => Ok(transcription.text),
        Err(err) => Err(AppError::Processing(format!(
            "Audio transcription failed: {}",
            err
        ))),
    }
 }
@@ -1,5 +1,6 @@
 pub mod image_parsing;
 pub mod llm_instructions;
 pub mod audio_transcription;
 use common::error::AppError;
 use std::collections::HashMap;
		`@@ -0,0 +1 @@`
							`{"schemas":"--- original\n+++ modified\n@@ -160,6 +160,7 @@\n DEFINE FIELD IF NOT EXISTS query_system_prompt ON system_settings TYPE string;\n DEFINE FIELD IF NOT EXISTS ingestion_system_prompt ON system_settings TYPE string;\n DEFINE FIELD IF NOT EXISTS image_processing_prompt ON system_settings TYPE string;\n+DEFINE FIELD IF NOT EXISTS voice_processing_model ON system_settings TYPE string;\n\n # Defines the schema for the 'text_chunk' table.\n\n","events":null}`