chore: harden common storage bootstrap and slim embedded db assets

Unify embedding config, build providers from system settings, and fail
startup when index builds error or time out. Move Surreal assets under
common/db so embeds exclude crate source, and read storage via streams.
This commit is contained in:
Per Stark
2026-05-29 12:26:26 +02:00
parent 93d11b66eb
commit e3bb2935d0
62 changed files with 672 additions and 443 deletions
+136
View File
@@ -0,0 +1,136 @@
mod startup;
pub mod wiring;
pub use startup::prepare_embedding_runtime;
use std::sync::Arc;
use anyhow::Context;
use async_openai::Client;
use common::{
storage::{
db::SurrealDbClient,
store::StorageManager,
types::system_settings::SystemSettings,
},
utils::{
config::{get_config, AppConfig},
embedding::EmbeddingProvider,
},
};
use retrieval_pipeline::reranking::RerankerPool;
use tracing_subscriber::{fmt, prelude::*, EnvFilter};
/// Bundle of long-lived services produced by [`init`] / `init_with_config` and
/// handed to the routing layer when building router state.
pub struct SharedServices {
    /// Shared handle to the SurrealDB client.
    pub db: Arc<SurrealDbClient>,
    /// Shared OpenAI-compatible API client.
    pub openai_client: Arc<Client<async_openai::config::OpenAIConfig>>,
    /// Shared embedding provider.
    pub embedding_provider: Arc<EmbeddingProvider>,
    /// Storage manager built from the application configuration.
    pub storage: StorageManager,
    /// Reranker pool; `None` when `RerankerPool::maybe_from_config` yields none.
    pub reranker_pool: Option<Arc<RerankerPool>>,
    /// Application configuration the services were built from.
    pub config: AppConfig,
}
/// Sets up tracing, loads the application configuration, and builds the shared
/// services from it.
///
/// Log output is written to stderr and filtered by the default environment
/// filter.
///
/// # Errors
/// Returns an error when `get_config` fails or when [`init_with_config`] fails.
pub async fn init() -> anyhow::Result<SharedServices> {
    let stderr_layer = fmt::layer().with_writer(std::io::stderr);
    let env_filter = EnvFilter::from_default_env();

    // The outcome of `try_init` is discarded on purpose: a failure to install
    // the subscriber must not abort service initialization.
    let _ = tracing_subscriber::registry()
        .with(stderr_layer)
        .with(env_filter)
        .try_init();

    init_with_config(get_config()?).await
}
/// Builds every shared service from an already-loaded configuration.
///
/// Steps, in order: connect to SurrealDB, apply migrations, load the current
/// system settings, create the OpenAI client, build the embedding provider from
/// the system settings, optionally create the reranker pool, and initialize the
/// storage manager.
///
/// # Errors
/// Returns the first failure encountered; each fallible step attaches a short
/// context message naming the step that failed.
pub(crate) async fn init_with_config(config: AppConfig) -> anyhow::Result<SharedServices> {
    let db = Arc::new(
        SurrealDbClient::new(
            &config.surrealdb_address,
            &config.surrealdb_username,
            &config.surrealdb_password,
            &config.surrealdb_namespace,
            &config.surrealdb_database,
        )
        .await
        .context("connect to surrealdb")?,
    );

    db.apply_migrations()
        .await
        .context("apply database migrations")?;

    // Settings are read after migrations have been applied to the database.
    let settings = SystemSettings::get_current(&db)
        .await
        .context("load system settings")?;

    let openai_client = Arc::new(Client::with_config(
        async_openai::config::OpenAIConfig::new()
            .with_api_key(&config.openai_api_key)
            .with_api_base(&config.openai_base_url),
    ));

    let embedding_provider = Arc::new(
        EmbeddingProvider::from_system_settings(
            &settings,
            &config,
            Some(Arc::clone(&openai_client)),
        )
        .await
        .context("initialize embedding provider")?,
    );

    // Attach context here as well so a reranker failure is attributable to this
    // step, consistent with every other fallible step in this function.
    let reranker_pool =
        RerankerPool::maybe_from_config(&config).context("initialize reranker pool")?;

    let storage = StorageManager::new(&config)
        .await
        .context("initialize storage manager")?;

    Ok(SharedServices {
        db,
        openai_client,
        embedding_provider,
        storage,
        reranker_pool,
        config,
    })
}
#[cfg(test)]
pub(crate) mod tests {
    use std::path::{Path, PathBuf};

    use anyhow::Context;
    use common::utils::config::{AppConfig, EmbeddingBackend, PdfIngestMode, StorageKind};
    use uuid::Uuid;

    /// Builds an `AppConfig` for smoke tests: in-memory SurrealDB address, local
    /// storage, hashed embedding backend, and the given namespace, database and
    /// data directory. All remaining fields take their `Default` values.
    pub fn smoke_test_config(namespace: &str, database: &str, data_dir: &Path) -> AppConfig {
        let data_dir = data_dir.to_string_lossy().into_owned();

        AppConfig {
            // Database connection.
            surrealdb_address: "mem://".into(),
            surrealdb_username: "root".into(),
            surrealdb_password: "root".into(),
            surrealdb_namespace: namespace.into(),
            surrealdb_database: database.into(),
            // OpenAI-compatible endpoint placeholders.
            openai_api_key: "test-key".into(),
            openai_base_url: "https://example.com".into(),
            // Local runtime settings.
            data_dir,
            http_port: 0,
            storage: StorageKind::Local,
            pdf_ingest_mode: PdfIngestMode::LlmFirst,
            embedding_backend: EmbeddingBackend::Hashed,
            ..Default::default()
        }
    }

    /// Initializes the shared services against a uniquely named test database
    /// and a freshly created temporary data directory.
    ///
    /// Returns the services together with the data directory path.
    pub async fn init_smoke_services() -> anyhow::Result<(super::SharedServices, PathBuf)> {
        let db_name = format!("test_db_{}", Uuid::new_v4());
        let scratch_dir = std::env::temp_dir().join(format!("minne_smoke_{}", Uuid::new_v4()));

        tokio::fs::create_dir_all(&scratch_dir)
            .await
            .context("create temp data directory")?;

        let cfg = smoke_test_config("test_ns", &db_name, &scratch_dir);
        super::init_with_config(cfg)
            .await
            .map(|services| (services, scratch_dir))
    }
}
+66
View File
@@ -0,0 +1,66 @@
use anyhow::Context;
use common::{
storage::{
db::SurrealDbClient,
indexes::ensure_runtime,
types::{
knowledge_entity::KnowledgeEntity, system_settings::SystemSettings,
text_chunk::TextChunk,
},
},
utils::embedding::EmbeddingProvider,
};
use tracing::{info, warn};
use super::SharedServices;
/// Aligns persisted system settings with the active embedding provider, then
/// brings stored vectors and runtime indexes in line with them.
///
/// When the sync reports that the embedding dimensions changed, all stored
/// embeddings are regenerated before the runtime indexes are ensured for the
/// current dimension.
///
/// # Errors
/// Returns an error if the settings sync, the re-embedding pass, or the index
/// setup fails.
pub async fn prepare_embedding_runtime(services: &SharedServices) -> anyhow::Result<SystemSettings> {
    let db = &services.db;
    let provider = &services.embedding_provider;

    let (settings, dimensions_changed) = SystemSettings::sync_from_embedding_provider(db, provider)
        .await
        .context("sync system settings from embedding provider")?;

    let dimensions = settings.embedding_dimensions;

    if dimensions_changed {
        re_embed_all(db, provider, dimensions).await?;
    }

    ensure_runtime(db, dimensions as usize)
        .await
        .context("ensure runtime indexes")?;

    Ok(settings)
}
/// Regenerates the embeddings of all text chunks and then all knowledge
/// entities using the given provider, logging progress along the way.
///
/// `embedding_dimensions` is only recorded in the warning log entry.
///
/// # Errors
/// Stops at and returns the first failing re-embedding pass.
async fn re_embed_all(
    db: &SurrealDbClient,
    embedding_provider: &EmbeddingProvider,
    embedding_dimensions: u32,
) -> anyhow::Result<()> {
    warn!(
        embedding_dimensions,
        "Embedding configuration changed; re-embedding existing data"
    );

    // Text chunks first, knowledge entities second.
    info!("Re-embedding TextChunks");
    let chunk_pass = TextChunk::update_all_embeddings_with_provider(db, embedding_provider).await;
    chunk_pass.context("re-embed text chunks after embedding dimension change")?;

    info!("Re-embedding KnowledgeEntities");
    let entity_pass =
        KnowledgeEntity::update_all_embeddings_with_provider(db, embedding_provider).await;
    entity_pass.context("re-embed knowledge entities after embedding dimension change")?;

    info!("Re-embedding complete");
    Ok(())
}
+54
View File
@@ -0,0 +1,54 @@
use std::sync::Arc;
use anyhow::Context;
use api_router::{api_routes_v1, api_state::ApiState};
use axum::{extract::FromRef, Router};
use html_router::{
html_routes,
html_state::{HtmlState, StateResources},
};
use super::SharedServices;
/// Assembles the Minne route tree: the v1 API nested under `/api/v1` and the
/// HTML routes merged at the top level.
///
/// The outer Axum state type `S` is left open, so SaaS consumers can merge
/// additional routers and attach their own `AppState`, provided `ApiState` and
/// `HtmlState` both implement `FromRef<S>`.
pub fn minne_routes<S>(api_state: &ApiState, html_state: &HtmlState) -> Router<S>
where
    S: Clone + Send + Sync + 'static,
    ApiState: FromRef<S>,
    HtmlState: FromRef<S>,
{
    let with_api: Router<S> = Router::new().nest("/api/v1", api_routes_v1(api_state));
    with_api.merge(html_routes(html_state))
}
pub fn build_api_state(services: &SharedServices) -> ApiState {
ApiState {
db: Arc::clone(&services.db),
config: services.config.clone(),
storage: services.storage.clone(),
}
}
/// Creates the HTML router state from the shared services.
///
/// A session store is created from the database client first; the remaining
/// resources are shared or cloned from `services`. No template engine is
/// supplied (`template_engine` is `None`).
///
/// # Errors
/// Returns an error when the session store cannot be created.
pub async fn build_html_state(services: &SharedServices) -> anyhow::Result<HtmlState> {
    let store = services
        .db
        .create_session_store()
        .await
        .context("create session store")?;

    let resources = StateResources {
        db: Arc::clone(&services.db),
        openai_client: Arc::clone(&services.openai_client),
        session_store: Arc::new(store),
        storage: services.storage.clone(),
        config: services.config.clone(),
        reranker_pool: services.reranker_pool.clone(),
        embedding_provider: Arc::clone(&services.embedding_provider),
        template_engine: None,
    };

    Ok(HtmlState::new_with_resources(resources))
}