feat: handles submitted audio

This commit is contained in:
Per Stark
2025-07-29 18:39:26 +02:00
parent b8272d519d
commit 33300d3193
10 changed files with 69 additions and 6 deletions

View File

@@ -10,7 +10,7 @@
## Demo deployment ## Demo deployment
To test *Minne* out, enter [this](https://minne-demo.stark.pub) read-only demo deployment to view and test functionality out. To test _Minne_ out, enter [this](https://minne-demo.stark.pub) read-only demo deployment to view and test functionality out.
## The "Why" Behind Minne ## The "Why" Behind Minne
@@ -70,7 +70,7 @@ This is a great way to manage Minne and its SurrealDB dependency together.
1. Create a `docker-compose.yml` file: 1. Create a `docker-compose.yml` file:
```yaml ```yaml
version: '3.8' version: "3.8"
services: services:
minne: minne:
image: ghcr.io/perstarkse/minne:latest # Pulls the latest pre-built image image: ghcr.io/perstarkse/minne:latest # Pulls the latest pre-built image
@@ -177,7 +177,7 @@ Binaries for Windows, macOS, and Linux (combined `main` version) are available o
```bash ```bash
cargo run --release --bin worker cargo run --release --bin worker
``` ```
The compiled binaries will be in `target/release/`. The compiled binaries will be in `target/release/`.
## Configuration ## Configuration
@@ -229,7 +229,7 @@ Once Minne is running:
1. Access the web interface at `http://localhost:3000` (or your configured port). 1. Access the web interface at `http://localhost:3000` (or your configured port).
1. On iOS, consider setting up the [Minne iOS Shortcut](https://www.icloud.com/shortcuts/9aa960600ec14329837ba4169f57a166) for effortless content sending. **Add the shortcut, replace the [insert_url] and the [insert_api_key] snippets**. 1. On iOS, consider setting up the [Minne iOS Shortcut](https://www.icloud.com/shortcuts/9aa960600ec14329837ba4169f57a166) for effortless content sending. **Add the shortcut, replace the [insert_url] and the [insert_api_key] snippets**.
1. Start adding notes, URLs and explore your growing knowledge graph. 1. Add notes, URLs, **audio files**, and explore your growing knowledge graph.
1. Engage with the chat interface to query your saved content. 1. Engage with the chat interface to query your saved content.
1. Try the experimental visual graph explorer to see connections. 1. Try the experimental visual graph explorer to see connections.

View File

@@ -0,0 +1,5 @@
-- Adds the voice_processing_model string field to the system_settings table.
DEFINE FIELD IF NOT EXISTS voice_processing_model ON system_settings TYPE string;
-- Backfills the system_settings:current record with "whisper-1", but only when
-- the field has no value yet, so an already configured model is left untouched.
UPDATE system_settings:current SET
voice_processing_model = "whisper-1"
WHERE voice_processing_model == NONE;

View File

@@ -0,0 +1 @@
{"schemas":"--- original\n+++ modified\n@@ -160,6 +160,7 @@\n DEFINE FIELD IF NOT EXISTS query_system_prompt ON system_settings TYPE string;\n DEFINE FIELD IF NOT EXISTS ingestion_system_prompt ON system_settings TYPE string;\n DEFINE FIELD IF NOT EXISTS image_processing_prompt ON system_settings TYPE string;\n+DEFINE FIELD IF NOT EXISTS voice_processing_model ON system_settings TYPE string;\n\n # Defines the schema for the 'text_chunk' table.\n\n","events":null}

View File

@@ -13,3 +13,4 @@ DEFINE FIELD IF NOT EXISTS embedding_dimensions ON system_settings TYPE int;
DEFINE FIELD IF NOT EXISTS query_system_prompt ON system_settings TYPE string; DEFINE FIELD IF NOT EXISTS query_system_prompt ON system_settings TYPE string;
DEFINE FIELD IF NOT EXISTS ingestion_system_prompt ON system_settings TYPE string; DEFINE FIELD IF NOT EXISTS ingestion_system_prompt ON system_settings TYPE string;
DEFINE FIELD IF NOT EXISTS image_processing_prompt ON system_settings TYPE string; DEFINE FIELD IF NOT EXISTS image_processing_prompt ON system_settings TYPE string;
DEFINE FIELD IF NOT EXISTS voice_processing_model ON system_settings TYPE string;

View File

@@ -17,6 +17,7 @@ pub struct SystemSettings {
pub ingestion_system_prompt: String, pub ingestion_system_prompt: String,
pub image_processing_model: String, pub image_processing_model: String,
pub image_processing_prompt: String, pub image_processing_prompt: String,
pub voice_processing_model: String,
} }
impl StoredObject for SystemSettings { impl StoredObject for SystemSettings {

View File

@@ -128,6 +128,7 @@ pub struct ModelSettingsInput {
query_model: String, query_model: String,
processing_model: String, processing_model: String,
image_processing_model: String, image_processing_model: String,
voice_processing_model: String,
embedding_model: String, embedding_model: String,
embedding_dimensions: Option<u32>, embedding_dimensions: Option<u32>,
} }
@@ -159,6 +160,7 @@ pub async fn update_model_settings(
query_model: input.query_model, query_model: input.query_model,
processing_model: input.processing_model, processing_model: input.processing_model,
image_processing_model: input.image_processing_model, image_processing_model: input.image_processing_model,
voice_processing_model: input.voice_processing_model,
embedding_model: input.embedding_model, embedding_model: input.embedding_model,
// Use new dimensions if provided, otherwise retain the current ones. // Use new dimensions if provided, otherwise retain the current ones.
embedding_dimensions: input embedding_dimensions: input

View File

@@ -107,6 +107,22 @@
</p> </p>
</div> </div>
<!-- Voice Processing Model -->
<div class="form-control mb-4">
<label class="label">
<span class="label-text">Voice Processing Model</span>
</label>
<select name="voice_processing_model" class="select select-bordered w-full">
{% for model in available_models.data %}
<option value="{{model.id}}" {% if settings.voice_processing_model==model.id %} selected {% endif %}>{{model.id}}</option>
{% endfor %}
</select>
<p class="text-xs text-gray-500 mt-1">
Current used:
<span class="font-mono">{{settings.voice_processing_model}}</span>
</p>
</div>
<!-- Embedding Model --> <!-- Embedding Model -->
<div class="form-control mb-4"> <div class="form-control mb-4">
<label class="label"> <label class="label">

View File

@@ -22,7 +22,7 @@ use std::io::{Seek, SeekFrom};
use tempfile::NamedTempFile; use tempfile::NamedTempFile;
use tracing::{error, info}; use tracing::{error, info};
use crate::utils::image_parsing::extract_text_from_image; use crate::utils::{audio_transcription::transcribe_audio_file, image_parsing::extract_text_from_image};
pub async fn to_text_content( pub async fn to_text_content(
ingestion_payload: IngestionPayload, ingestion_payload: IngestionPayload,
@@ -231,6 +231,10 @@ async fn extract_text_from_file(
let content = tokio::fs::read_to_string(&file_info.path).await?; let content = tokio::fs::read_to_string(&file_info.path).await?;
Ok(content) Ok(content)
} }
"audio/mpeg" | "audio/mp3" | "audio/wav" | "audio/x-wav" | "audio/webm" | "audio/mp4" | "audio/ogg" | "audio/flac" => {
transcribe_audio_file(&file_info.path, db_client, openai_client).await
}
// Handle other MIME types as needed // Handle other MIME types as needed
_ => Err(AppError::NotFound(file_info.mime_type.clone())), _ => Err(AppError::NotFound(file_info.mime_type.clone())),
} }

View File

@@ -0,0 +1,32 @@
use async_openai::types::{CreateTranscriptionRequestArgs, AudioResponseFormat};
use common::{
error::AppError,
storage::{
db::SurrealDbClient,
types::system_settings::SystemSettings,
},
};
/// Transcribes the audio file at `file_path` and returns the recognised text.
///
/// The model name is read from the `voice_processing_model` field of the current
/// [`SystemSettings`] record, which is fetched through `db_client` on every call.
/// The request is sent through the audio API of `openai_client` and asks for the
/// JSON response format; only the `text` field of the response is returned.
///
/// # Errors
///
/// Propagates any error from loading the system settings or from building the
/// transcription request, and returns [`AppError::Processing`] when the
/// transcription call itself fails.
pub async fn transcribe_audio_file(
    file_path: &str,
    db_client: &SurrealDbClient,
    openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
) -> Result<String, AppError> {
    // Look the model up per call so a settings change applies to the next file.
    let voice_model = SystemSettings::get_current(db_client)
        .await?
        .voice_processing_model;

    let transcription_request = CreateTranscriptionRequestArgs::default()
        .file(file_path)
        .model(voice_model)
        .response_format(AudioResponseFormat::Json)
        .build()?;

    match openai_client
        .audio()
        .transcribe(transcription_request)
        .await
    {
        Ok(transcription) => Ok(transcription.text),
        Err(e) => Err(AppError::Processing(format!(
            "Audio transcription failed: {}",
            e
        ))),
    }
}

View File

@@ -1,5 +1,6 @@
pub mod image_parsing; pub mod image_parsing;
pub mod llm_instructions; pub mod llm_instructions;
pub mod audio_transcription;
use common::error::AppError; use common::error::AppError;
use std::collections::HashMap; use std::collections::HashMap;