feat: handles submitted audio

This commit is contained in:
Per Stark
2025-07-29 18:39:26 +02:00
parent b8272d519d
commit 33300d3193
10 changed files with 69 additions and 6 deletions
+5 -5
View File
@@ -10,7 +10,7 @@
## Demo deployment ## Demo deployment
To test *Minne* out, enter [this](https://minne-demo.stark.pub) read-only demo deployment to view and test functionality out. To test _Minne_ out, enter [this](https://minne-demo.stark.pub) read-only demo deployment to view and test functionality out.
## The "Why" Behind Minne ## The "Why" Behind Minne
@@ -70,7 +70,7 @@ This is a great way to manage Minne and its SurrealDB dependency together.
1. Create a `docker-compose.yml` file: 1. Create a `docker-compose.yml` file:
```yaml ```yaml
version: '3.8' version: "3.8"
services: services:
minne: minne:
image: ghcr.io/perstarkse/minne:latest # Pulls the latest pre-built image image: ghcr.io/perstarkse/minne:latest # Pulls the latest pre-built image
@@ -88,7 +88,7 @@ This is a great way to manage Minne and its SurrealDB dependency together.
SURREALDB_DATABASE: "minne_db" SURREALDB_DATABASE: "minne_db"
SURREALDB_NAMESPACE: "minne_ns" SURREALDB_NAMESPACE: "minne_ns"
OPENAI_API_KEY: "your_openai_api_key_here" # IMPORTANT: Replace with your actual key OPENAI_API_KEY: "your_openai_api_key_here" # IMPORTANT: Replace with your actual key
#OPENAI_BASE_URL: "your_ollama_address" # Uncomment this and change it to override the default openai base url #OPENAI_BASE_URL: "your_ollama_address" # Uncomment this and change it to override the default openai base url
HTTP_PORT: 3000 HTTP_PORT: 3000
DATA_DIR: "/data" # Data directory inside the container DATA_DIR: "/data" # Data directory inside the container
RUST_LOG: "minne=info,tower_http=info" # Example logging level RUST_LOG: "minne=info,tower_http=info" # Example logging level
@@ -177,7 +177,7 @@ Binaries for Windows, macOS, and Linux (combined `main` version) are available o
```bash ```bash
cargo run --release --bin worker cargo run --release --bin worker
``` ```
The compiled binaries will be in `target/release/`. The compiled binaries will be in `target/release/`.
## Configuration ## Configuration
@@ -229,7 +229,7 @@ Once Minne is running:
1. Access the web interface at `http://localhost:3000` (or your configured port). 1. Access the web interface at `http://localhost:3000` (or your configured port).
1. On iOS, consider setting up the [Minne iOS Shortcut](https://www.icloud.com/shortcuts/9aa960600ec14329837ba4169f57a166) for effortless content sending. **Add the shortcut, replace the [insert_url] and the [insert_api_key] snippets**. 1. On iOS, consider setting up the [Minne iOS Shortcut](https://www.icloud.com/shortcuts/9aa960600ec14329837ba4169f57a166) for effortless content sending. **Add the shortcut, replace the [insert_url] and the [insert_api_key] snippets**.
1. Start adding notes, URLs and explore your growing knowledge graph. 1. Add notes, URLs, **audio files**, and explore your growing knowledge graph.
1. Engage with the chat interface to query your saved content. 1. Engage with the chat interface to query your saved content.
1. Try the experimental visual graph explorer to see connections. 1. Try the experimental visual graph explorer to see connections.
@@ -0,0 +1,5 @@
# Adds the `voice_processing_model` string field to the system_settings table.
# The field names the model used when transcribing submitted audio.
DEFINE FIELD IF NOT EXISTS voice_processing_model ON system_settings TYPE string;
# Backfill: give the existing `current` settings record a default model,
# but only where the field has not been set yet.
UPDATE system_settings:current SET
voice_processing_model = "whisper-1"
WHERE voice_processing_model == NONE;
@@ -0,0 +1 @@
{"schemas":"--- original\n+++ modified\n@@ -160,6 +160,7 @@\n DEFINE FIELD IF NOT EXISTS query_system_prompt ON system_settings TYPE string;\n DEFINE FIELD IF NOT EXISTS ingestion_system_prompt ON system_settings TYPE string;\n DEFINE FIELD IF NOT EXISTS image_processing_prompt ON system_settings TYPE string;\n+DEFINE FIELD IF NOT EXISTS voice_processing_model ON system_settings TYPE string;\n\n # Defines the schema for the 'text_chunk' table.\n\n","events":null}
+1
View File
@@ -13,3 +13,4 @@ DEFINE FIELD IF NOT EXISTS embedding_dimensions ON system_settings TYPE int;
DEFINE FIELD IF NOT EXISTS query_system_prompt ON system_settings TYPE string; DEFINE FIELD IF NOT EXISTS query_system_prompt ON system_settings TYPE string;
DEFINE FIELD IF NOT EXISTS ingestion_system_prompt ON system_settings TYPE string; DEFINE FIELD IF NOT EXISTS ingestion_system_prompt ON system_settings TYPE string;
DEFINE FIELD IF NOT EXISTS image_processing_prompt ON system_settings TYPE string; DEFINE FIELD IF NOT EXISTS image_processing_prompt ON system_settings TYPE string;
DEFINE FIELD IF NOT EXISTS voice_processing_model ON system_settings TYPE string;
@@ -17,6 +17,7 @@ pub struct SystemSettings {
pub ingestion_system_prompt: String, pub ingestion_system_prompt: String,
pub image_processing_model: String, pub image_processing_model: String,
pub image_processing_prompt: String, pub image_processing_prompt: String,
pub voice_processing_model: String,
} }
impl StoredObject for SystemSettings { impl StoredObject for SystemSettings {
+2
View File
@@ -128,6 +128,7 @@ pub struct ModelSettingsInput {
query_model: String, query_model: String,
processing_model: String, processing_model: String,
image_processing_model: String, image_processing_model: String,
voice_processing_model: String,
embedding_model: String, embedding_model: String,
embedding_dimensions: Option<u32>, embedding_dimensions: Option<u32>,
} }
@@ -159,6 +160,7 @@ pub async fn update_model_settings(
query_model: input.query_model, query_model: input.query_model,
processing_model: input.processing_model, processing_model: input.processing_model,
image_processing_model: input.image_processing_model, image_processing_model: input.image_processing_model,
voice_processing_model: input.voice_processing_model,
embedding_model: input.embedding_model, embedding_model: input.embedding_model,
// Use new dimensions if provided, otherwise retain the current ones. // Use new dimensions if provided, otherwise retain the current ones.
embedding_dimensions: input embedding_dimensions: input
+16
View File
@@ -107,6 +107,22 @@
</p> </p>
</div> </div>
<!-- Voice Processing Model:
     dropdown for the `voice_processing_model` form field. One option is
     rendered per entry in `available_models.data`, and the option whose id
     equals `settings.voice_processing_model` is pre-selected. The stored
     value is also shown below the dropdown. -->
<div class="form-control mb-4">
<label class="label">
<span class="label-text">Voice Processing Model</span>
</label>
<select name="voice_processing_model" class="select select-bordered w-full">
{% for model in available_models.data %}
<option value="{{model.id}}" {% if settings.voice_processing_model==model.id %} selected {% endif %}>{{model.id}}</option>
{% endfor %}
</select>
<p class="text-xs text-gray-500 mt-1">
Current used:
<span class="font-mono">{{settings.voice_processing_model}}</span>
</p>
</div>
<!-- Embedding Model --> <!-- Embedding Model -->
<div class="form-control mb-4"> <div class="form-control mb-4">
<label class="label"> <label class="label">
+5 -1
View File
@@ -22,7 +22,7 @@ use std::io::{Seek, SeekFrom};
use tempfile::NamedTempFile; use tempfile::NamedTempFile;
use tracing::{error, info}; use tracing::{error, info};
use crate::utils::image_parsing::extract_text_from_image; use crate::utils::{audio_transcription::transcribe_audio_file, image_parsing::extract_text_from_image};
pub async fn to_text_content( pub async fn to_text_content(
ingestion_payload: IngestionPayload, ingestion_payload: IngestionPayload,
@@ -231,6 +231,10 @@ async fn extract_text_from_file(
let content = tokio::fs::read_to_string(&file_info.path).await?; let content = tokio::fs::read_to_string(&file_info.path).await?;
Ok(content) Ok(content)
} }
"audio/mpeg" | "audio/mp3" | "audio/wav" | "audio/x-wav" | "audio/webm" | "audio/mp4" | "audio/ogg" | "audio/flac" => {
transcribe_audio_file(&file_info.path, db_client, openai_client).await
}
// Handle other MIME types as needed // Handle other MIME types as needed
_ => Err(AppError::NotFound(file_info.mime_type.clone())), _ => Err(AppError::NotFound(file_info.mime_type.clone())),
} }
@@ -0,0 +1,32 @@
use async_openai::types::{CreateTranscriptionRequestArgs, AudioResponseFormat};
use common::{
error::AppError,
storage::{
db::SurrealDbClient,
types::system_settings::SystemSettings,
},
};
/// Produces a text transcript of the audio file at `file_path`.
///
/// The transcription model name is taken from the `voice_processing_model`
/// field of the current [`SystemSettings`], and the request is sent through
/// `openai_client`'s audio transcription API asking for a JSON response.
///
/// # Errors
///
/// Propagates any failure from loading the system settings or from building
/// the transcription request. A failed transcription call is returned as
/// [`AppError::Processing`] carrying the underlying error's message.
pub async fn transcribe_audio_file(
    file_path: &str,
    db_client: &SurrealDbClient,
    openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
) -> Result<String, AppError> {
    // The model is read from the stored settings on each call rather than cached.
    let voice_model = SystemSettings::get_current(db_client)
        .await?
        .voice_processing_model;

    let transcription_request = CreateTranscriptionRequestArgs::default()
        .file(file_path)
        .model(voice_model)
        .response_format(AudioResponseFormat::Json)
        .build()?;

    match openai_client.audio().transcribe(transcription_request).await {
        Ok(transcription) => Ok(transcription.text),
        Err(e) => Err(AppError::Processing(format!(
            "Audio transcription failed: {}",
            e
        ))),
    }
}
+1
View File
@@ -1,5 +1,6 @@
pub mod image_parsing; pub mod image_parsing;
pub mod llm_instructions; pub mod llm_instructions;
pub mod audio_transcription;
use common::error::AppError; use common::error::AppError;
use std::collections::HashMap; use std::collections::HashMap;