mirror of
https://github.com/perstarkse/minne.git
synced 2026-04-28 19:57:07 +02:00
feat: handles submitted audio
This commit is contained in:
@@ -10,7 +10,7 @@
|
|||||||
|
|
||||||
## Demo deployment
|
## Demo deployment
|
||||||
|
|
||||||
To try *Minne* out, visit [this](https://minne-demo.stark.pub) read-only demo deployment to view and explore its functionality.
|
To try _Minne_ out, visit [this](https://minne-demo.stark.pub) read-only demo deployment to view and explore its functionality.
|
||||||
|
|
||||||
## The "Why" Behind Minne
|
## The "Why" Behind Minne
|
||||||
|
|
||||||
@@ -70,7 +70,7 @@ This is a great way to manage Minne and its SurrealDB dependency together.
|
|||||||
1. Create a `docker-compose.yml` file:
|
1. Create a `docker-compose.yml` file:
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
version: '3.8'
|
version: "3.8"
|
||||||
services:
|
services:
|
||||||
minne:
|
minne:
|
||||||
image: ghcr.io/perstarkse/minne:latest # Pulls the latest pre-built image
|
image: ghcr.io/perstarkse/minne:latest # Pulls the latest pre-built image
|
||||||
@@ -177,7 +177,7 @@ Binaries for Windows, macOS, and Linux (combined `main` version) are available o
|
|||||||
```bash
|
```bash
|
||||||
cargo run --release --bin worker
|
cargo run --release --bin worker
|
||||||
```
|
```
|
||||||
The compiled binaries will be in `target/release/`.
|
The compiled binaries will be in `target/release/`.
|
||||||
|
|
||||||
## Configuration
|
## Configuration
|
||||||
|
|
||||||
@@ -229,7 +229,7 @@ Once Minne is running:
|
|||||||
|
|
||||||
1. Access the web interface at `http://localhost:3000` (or your configured port).
|
1. Access the web interface at `http://localhost:3000` (or your configured port).
|
||||||
1. On iOS, consider setting up the [Minne iOS Shortcut](https://www.icloud.com/shortcuts/9aa960600ec14329837ba4169f57a166) for effortless content sending. **Add the shortcut, replace the [insert_url] and the [insert_api_key] snippets**.
|
1. On iOS, consider setting up the [Minne iOS Shortcut](https://www.icloud.com/shortcuts/9aa960600ec14329837ba4169f57a166) for effortless content sending. **Add the shortcut, replace the [insert_url] and the [insert_api_key] snippets**.
|
||||||
1. Start adding notes, URLs and explore your growing knowledge graph.
|
1. Add notes, URLs, **audio files**, and explore your growing knowledge graph.
|
||||||
1. Engage with the chat interface to query your saved content.
|
1. Engage with the chat interface to query your saved content.
|
||||||
1. Try the experimental visual graph explorer to see connections.
|
1. Try the experimental visual graph explorer to see connections.
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,5 @@
|
|||||||
|
# Migration: declares the string field `voice_processing_model` on the
# `system_settings` table; IF NOT EXISTS leaves an already-defined field alone.
DEFINE FIELD IF NOT EXISTS voice_processing_model ON system_settings TYPE string;
|
||||||
|
|
||||||
|
# Backfill: sets the field to "whisper-1" on the `system_settings:current`
# record, but only when that record has no value for it yet (== NONE).
UPDATE system_settings:current SET
|
||||||
|
voice_processing_model = "whisper-1"
|
||||||
|
WHERE voice_processing_model == NONE;
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
{"schemas":"--- original\n+++ modified\n@@ -160,6 +160,7 @@\n DEFINE FIELD IF NOT EXISTS query_system_prompt ON system_settings TYPE string;\n DEFINE FIELD IF NOT EXISTS ingestion_system_prompt ON system_settings TYPE string;\n DEFINE FIELD IF NOT EXISTS image_processing_prompt ON system_settings TYPE string;\n+DEFINE FIELD IF NOT EXISTS voice_processing_model ON system_settings TYPE string;\n\n # Defines the schema for the 'text_chunk' table.\n\n","events":null}
|
||||||
@@ -13,3 +13,4 @@ DEFINE FIELD IF NOT EXISTS embedding_dimensions ON system_settings TYPE int;
|
|||||||
DEFINE FIELD IF NOT EXISTS query_system_prompt ON system_settings TYPE string;
|
DEFINE FIELD IF NOT EXISTS query_system_prompt ON system_settings TYPE string;
|
||||||
DEFINE FIELD IF NOT EXISTS ingestion_system_prompt ON system_settings TYPE string;
|
DEFINE FIELD IF NOT EXISTS ingestion_system_prompt ON system_settings TYPE string;
|
||||||
DEFINE FIELD IF NOT EXISTS image_processing_prompt ON system_settings TYPE string;
|
DEFINE FIELD IF NOT EXISTS image_processing_prompt ON system_settings TYPE string;
|
||||||
|
DEFINE FIELD IF NOT EXISTS voice_processing_model ON system_settings TYPE string;
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ pub struct SystemSettings {
|
|||||||
pub ingestion_system_prompt: String,
|
pub ingestion_system_prompt: String,
|
||||||
pub image_processing_model: String,
|
pub image_processing_model: String,
|
||||||
pub image_processing_prompt: String,
|
pub image_processing_prompt: String,
|
||||||
|
pub voice_processing_model: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl StoredObject for SystemSettings {
|
impl StoredObject for SystemSettings {
|
||||||
|
|||||||
@@ -128,6 +128,7 @@ pub struct ModelSettingsInput {
|
|||||||
query_model: String,
|
query_model: String,
|
||||||
processing_model: String,
|
processing_model: String,
|
||||||
image_processing_model: String,
|
image_processing_model: String,
|
||||||
|
voice_processing_model: String,
|
||||||
embedding_model: String,
|
embedding_model: String,
|
||||||
embedding_dimensions: Option<u32>,
|
embedding_dimensions: Option<u32>,
|
||||||
}
|
}
|
||||||
@@ -159,6 +160,7 @@ pub async fn update_model_settings(
|
|||||||
query_model: input.query_model,
|
query_model: input.query_model,
|
||||||
processing_model: input.processing_model,
|
processing_model: input.processing_model,
|
||||||
image_processing_model: input.image_processing_model,
|
image_processing_model: input.image_processing_model,
|
||||||
|
voice_processing_model: input.voice_processing_model,
|
||||||
embedding_model: input.embedding_model,
|
embedding_model: input.embedding_model,
|
||||||
// Use new dimensions if provided, otherwise retain the current ones.
|
// Use new dimensions if provided, otherwise retain the current ones.
|
||||||
embedding_dimensions: input
|
embedding_dimensions: input
|
||||||
|
|||||||
@@ -107,6 +107,22 @@
|
|||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- Voice Processing Model -->
|
||||||
|
<div class="form-control mb-4">
|
||||||
|
<label class="label">
|
||||||
|
<span class="label-text">Voice Processing Model</span>
|
||||||
|
</label>
|
||||||
|
<select name="voice_processing_model" class="select select-bordered w-full">
|
||||||
|
{% for model in available_models.data %}
|
||||||
|
<option value="{{model.id}}" {% if settings.voice_processing_model==model.id %} selected {% endif %}>{{model.id}}</option>
|
||||||
|
{% endfor %}
|
||||||
|
</select>
|
||||||
|
<p class="text-xs text-gray-500 mt-1">
|
||||||
|
Current used:
|
||||||
|
<span class="font-mono">{{settings.voice_processing_model}}</span>
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
<!-- Embedding Model -->
|
<!-- Embedding Model -->
|
||||||
<div class="form-control mb-4">
|
<div class="form-control mb-4">
|
||||||
<label class="label">
|
<label class="label">
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ use std::io::{Seek, SeekFrom};
|
|||||||
use tempfile::NamedTempFile;
|
use tempfile::NamedTempFile;
|
||||||
use tracing::{error, info};
|
use tracing::{error, info};
|
||||||
|
|
||||||
use crate::utils::image_parsing::extract_text_from_image;
|
use crate::utils::{audio_transcription::transcribe_audio_file, image_parsing::extract_text_from_image};
|
||||||
|
|
||||||
pub async fn to_text_content(
|
pub async fn to_text_content(
|
||||||
ingestion_payload: IngestionPayload,
|
ingestion_payload: IngestionPayload,
|
||||||
@@ -231,6 +231,10 @@ async fn extract_text_from_file(
|
|||||||
let content = tokio::fs::read_to_string(&file_info.path).await?;
|
let content = tokio::fs::read_to_string(&file_info.path).await?;
|
||||||
Ok(content)
|
Ok(content)
|
||||||
}
|
}
|
||||||
|
"audio/mpeg" | "audio/mp3" | "audio/wav" | "audio/x-wav" | "audio/webm" | "audio/mp4" | "audio/ogg" | "audio/flac" => {
|
||||||
|
|
||||||
|
transcribe_audio_file(&file_info.path, db_client, openai_client).await
|
||||||
|
}
|
||||||
// Handle other MIME types as needed
|
// Handle other MIME types as needed
|
||||||
_ => Err(AppError::NotFound(file_info.mime_type.clone())),
|
_ => Err(AppError::NotFound(file_info.mime_type.clone())),
|
||||||
}
|
}
|
||||||
|
|||||||
32
ingestion-pipeline/src/utils/audio_transcription.rs
Normal file
32
ingestion-pipeline/src/utils/audio_transcription.rs
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
use async_openai::types::{CreateTranscriptionRequestArgs, AudioResponseFormat};
|
||||||
|
use common::{
|
||||||
|
error::AppError,
|
||||||
|
storage::{
|
||||||
|
db::SurrealDbClient,
|
||||||
|
types::system_settings::SystemSettings,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Transcribes an audio file using the configured OpenAI Whisper model.
|
||||||
|
pub async fn transcribe_audio_file(
|
||||||
|
file_path: &str,
|
||||||
|
db_client: &SurrealDbClient,
|
||||||
|
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||||
|
) -> Result<String, AppError> {
|
||||||
|
let system_settings = SystemSettings::get_current(db_client).await?;
|
||||||
|
let model = system_settings.voice_processing_model;
|
||||||
|
|
||||||
|
let request = CreateTranscriptionRequestArgs::default()
|
||||||
|
.file(file_path)
|
||||||
|
.model(model)
|
||||||
|
.response_format(AudioResponseFormat::Json)
|
||||||
|
.build()?;
|
||||||
|
|
||||||
|
let response = openai_client
|
||||||
|
.audio()
|
||||||
|
.transcribe(request)
|
||||||
|
.await
|
||||||
|
.map_err(|e| AppError::Processing(format!("Audio transcription failed: {}", e)))?;
|
||||||
|
Ok(response.text)
|
||||||
|
}
|
||||||
|
|
||||||
@@ -1,5 +1,6 @@
|
|||||||
pub mod image_parsing;
|
pub mod image_parsing;
|
||||||
pub mod llm_instructions;
|
pub mod llm_instructions;
|
||||||
|
pub mod audio_transcription;
|
||||||
|
|
||||||
use common::error::AppError;
|
use common::error::AppError;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|||||||
Reference in New Issue
Block a user