mirror of
https://github.com/perstarkse/minne.git
synced 2026-05-30 03:10:45 +02:00
chore: additional clippy fixes after rebasing
This commit is contained in:
@@ -4,6 +4,7 @@ use std::{
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use async_openai::Client;
|
||||
use common::{
|
||||
storage::{
|
||||
@@ -26,6 +27,7 @@ use crate::{
|
||||
slice, snapshot,
|
||||
};
|
||||
|
||||
#[allow(clippy::struct_excessive_bools)]
|
||||
pub(super) struct EvaluationContext<'a> {
|
||||
dataset: &'a ConvertedDataset,
|
||||
config: &'a Config,
|
||||
@@ -119,41 +121,39 @@ impl<'a> EvaluationContext<'a> {
|
||||
self.config
|
||||
}
|
||||
|
||||
pub fn slice(&self) -> &slice::ResolvedSlice<'a> {
|
||||
self.slice.as_ref().expect("slice has not been prepared")
|
||||
pub fn slice(&self) -> Result<&slice::ResolvedSlice<'a>> {
|
||||
self.slice.as_ref().ok_or_else(|| anyhow!("slice has not been prepared"))
|
||||
}
|
||||
|
||||
pub fn db(&self) -> &SurrealDbClient {
|
||||
self.db.as_ref().expect("database connection missing")
|
||||
pub fn db(&self) -> Result<&SurrealDbClient> {
|
||||
self.db.as_ref().ok_or_else(|| anyhow!("database connection missing"))
|
||||
}
|
||||
|
||||
pub fn descriptor(&self) -> &snapshot::Descriptor {
|
||||
pub fn descriptor(&self) -> Result<&snapshot::Descriptor> {
|
||||
self.descriptor
|
||||
.as_ref()
|
||||
.expect("snapshot descriptor unavailable")
|
||||
.ok_or_else(|| anyhow!("snapshot descriptor unavailable"))
|
||||
}
|
||||
|
||||
pub fn embedding_provider(&self) -> &EmbeddingProvider {
|
||||
pub fn embedding_provider(&self) -> Result<&EmbeddingProvider> {
|
||||
self.embedding_provider
|
||||
.as_ref()
|
||||
.expect("embedding provider not initialised")
|
||||
.ok_or_else(|| anyhow!("embedding provider not initialised"))
|
||||
}
|
||||
|
||||
pub fn openai_client(&self) -> Arc<Client<async_openai::config::OpenAIConfig>> {
|
||||
self.openai_client
|
||||
.as_ref()
|
||||
.expect("openai client missing")
|
||||
.clone()
|
||||
pub fn openai_client(&self) -> Result<Arc<Client<async_openai::config::OpenAIConfig>>> {
|
||||
Ok(Arc::clone(self.openai_client.as_ref().ok_or_else(|| anyhow!("openai client missing"))?))
|
||||
}
|
||||
|
||||
pub fn corpus_handle(&self) -> &corpus::CorpusHandle {
|
||||
self.corpus_handle.as_ref().expect("corpus handle missing")
|
||||
pub fn corpus_handle(&self) -> Result<&corpus::CorpusHandle> {
|
||||
self.corpus_handle.as_ref().ok_or_else(|| anyhow!("corpus handle missing"))
|
||||
}
|
||||
|
||||
pub fn evaluation_user(&self) -> &User {
|
||||
self.eval_user.as_ref().expect("evaluation user missing")
|
||||
pub fn evaluation_user(&self) -> Result<&User> {
|
||||
self.eval_user.as_ref().ok_or_else(|| anyhow!("evaluation user missing"))
|
||||
}
|
||||
|
||||
#[allow(clippy::arithmetic_side_effects)]
|
||||
pub fn record_stage_duration(&mut self, stage: EvalStage, duration: Duration) {
|
||||
let elapsed = duration.as_millis();
|
||||
match stage {
|
||||
@@ -167,8 +167,8 @@ impl<'a> EvaluationContext<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_summary(self) -> EvaluationSummary {
|
||||
self.summary.expect("evaluation summary missing")
|
||||
pub fn into_summary(self) -> Result<EvaluationSummary> {
|
||||
self.summary.ok_or_else(|| anyhow!("evaluation summary missing"))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -184,7 +184,7 @@ pub(super) enum EvalStage {
|
||||
}
|
||||
|
||||
impl EvalStage {
|
||||
pub fn label(&self) -> &'static str {
|
||||
pub fn label(self) -> &'static str {
|
||||
match self {
|
||||
EvalStage::PrepareSlice => "prepare-slice",
|
||||
EvalStage::PrepareDb => "prepare-db",
|
||||
|
||||
@@ -23,5 +23,5 @@ pub async fn run_evaluation(
|
||||
let machine = stages::summarize(machine, &mut ctx).await?;
|
||||
let _ = stages::finalize(machine, &mut ctx).await?;
|
||||
|
||||
Ok(ctx.into_summary())
|
||||
ctx.into_summary()
|
||||
}
|
||||
|
||||
@@ -55,5 +55,5 @@ pub(crate) async fn finalize(
|
||||
|
||||
machine
|
||||
.finalize()
|
||||
.map_err(|(_, guard)| map_guard_error("finalize", guard))
|
||||
.map_err(|(_, guard)| map_guard_error("finalize", &guard))
|
||||
}
|
||||
|
||||
@@ -19,7 +19,7 @@ use state_machines::core::GuardError;
|
||||
|
||||
use super::state::EvaluationMachine;
|
||||
|
||||
fn map_guard_error(event: &str, guard: GuardError) -> anyhow::Error {
|
||||
fn map_guard_error(event: &str, guard: &GuardError) -> anyhow::Error {
|
||||
anyhow::anyhow!("invalid evaluation pipeline transition during {event}: {guard:?}")
|
||||
}
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ use super::super::{
|
||||
};
|
||||
use super::{map_guard_error, StageResult};
|
||||
|
||||
#[allow(clippy::too_many_lines)]
|
||||
pub(crate) async fn prepare_corpus(
|
||||
machine: EvaluationMachine<(), DbReady>,
|
||||
ctx: &mut EvaluationContext<'_>,
|
||||
@@ -24,13 +25,13 @@ pub(crate) async fn prepare_corpus(
|
||||
|
||||
let config = ctx.config();
|
||||
let cache_settings = corpus::CorpusCacheConfig::from(config);
|
||||
let embedding_provider = ctx.embedding_provider().clone();
|
||||
let openai_client = ctx.openai_client();
|
||||
let slice = ctx.slice();
|
||||
let embedding_provider = ctx.embedding_provider()?.clone();
|
||||
let openai_client = ctx.openai_client()?;
|
||||
let slice = ctx.slice()?;
|
||||
let window = slice::select_window(slice, ctx.config().slice_offset, ctx.config().limit)
|
||||
.context("selecting slice window for corpus preparation")?;
|
||||
|
||||
let descriptor = snapshot::Descriptor::new(config, slice, ctx.embedding_provider());
|
||||
let descriptor = snapshot::Descriptor::new(config, slice, ctx.embedding_provider()?);
|
||||
let ingestion_config = corpus::make_ingestion_config(config);
|
||||
let expected_fingerprint = corpus::compute_ingestion_fingerprint(
|
||||
ctx.dataset(),
|
||||
@@ -47,7 +48,7 @@ pub(crate) async fn prepare_corpus(
|
||||
if !config.reseed_slice {
|
||||
let requested_cases = window.cases.len();
|
||||
if can_reuse_namespace(
|
||||
ctx.db(),
|
||||
ctx.db()?,
|
||||
&descriptor,
|
||||
&ctx.namespace,
|
||||
&ctx.database,
|
||||
@@ -81,7 +82,7 @@ pub(crate) async fn prepare_corpus(
|
||||
|
||||
return machine
|
||||
.prepare_corpus()
|
||||
.map_err(|(_, guard)| map_guard_error("prepare_corpus", guard));
|
||||
.map_err(|(_, guard)| map_guard_error("prepare_corpus", &guard));
|
||||
}
|
||||
info!(
|
||||
cache = %base_dir.display(),
|
||||
@@ -137,5 +138,5 @@ pub(crate) async fn prepare_corpus(
|
||||
|
||||
machine
|
||||
.prepare_corpus()
|
||||
.map_err(|(_, guard)| map_guard_error("prepare_corpus", guard))
|
||||
.map_err(|(_, guard)| map_guard_error("prepare_corpus", &guard))
|
||||
}
|
||||
|
||||
@@ -117,5 +117,5 @@ pub(crate) async fn prepare_db(
|
||||
|
||||
machine
|
||||
.prepare_db()
|
||||
.map_err(|(_, guard)| map_guard_error("prepare_db", guard))
|
||||
.map_err(|(_, guard)| map_guard_error("prepare_db", &guard))
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@ use super::super::{
|
||||
};
|
||||
use super::{map_guard_error, StageResult};
|
||||
|
||||
#[allow(clippy::too_many_lines)]
|
||||
pub(crate) async fn prepare_namespace(
|
||||
machine: EvaluationMachine<(), CorpusReady>,
|
||||
ctx: &mut EvaluationContext<'_>,
|
||||
@@ -39,9 +40,9 @@ pub(crate) async fn prepare_namespace(
|
||||
.to_string();
|
||||
let namespace = ctx.namespace.clone();
|
||||
let database = ctx.database.clone();
|
||||
let embedding_provider = ctx.embedding_provider().clone();
|
||||
let embedding_provider = ctx.embedding_provider()?.clone();
|
||||
|
||||
let corpus_handle = ctx.corpus_handle();
|
||||
let corpus_handle = ctx.corpus_handle()?;
|
||||
let base_manifest = &corpus_handle.manifest;
|
||||
let manifest_for_seed =
|
||||
if ctx.window_offset == 0 && ctx.window_length >= base_manifest.questions.len() {
|
||||
@@ -60,10 +61,10 @@ pub(crate) async fn prepare_namespace(
|
||||
let mut namespace_reused = false;
|
||||
if !config.reseed_slice {
|
||||
namespace_reused = {
|
||||
let slice = ctx.slice();
|
||||
let slice = ctx.slice()?;
|
||||
can_reuse_namespace(
|
||||
ctx.db(),
|
||||
ctx.descriptor(),
|
||||
ctx.db()?,
|
||||
ctx.descriptor()?,
|
||||
&namespace,
|
||||
&database,
|
||||
dataset.metadata.id.as_str(),
|
||||
@@ -78,19 +79,19 @@ pub(crate) async fn prepare_namespace(
|
||||
let mut namespace_seed_ms = None;
|
||||
if !namespace_reused {
|
||||
ctx.must_reapply_settings = true;
|
||||
if let Err(err) = reset_namespace(ctx.db(), &namespace, &database).await {
|
||||
if let Err(err) = reset_namespace(ctx.db()?, &namespace, &database).await {
|
||||
warn!(
|
||||
error = %err,
|
||||
namespace,
|
||||
database = %database,
|
||||
"Failed to reset namespace before reseeding; continuing with existing data"
|
||||
);
|
||||
} else if let Err(err) = ctx.db().apply_migrations().await {
|
||||
} else if let Err(err) = ctx.db()?.apply_migrations().await {
|
||||
warn!(error = %err, "Failed to reapply migrations after namespace reset");
|
||||
}
|
||||
|
||||
{
|
||||
let slice = ctx.slice();
|
||||
let slice = ctx.slice()?;
|
||||
info!(
|
||||
slice = slice.manifest.slice_id.as_str(),
|
||||
window_offset = ctx.window_offset,
|
||||
@@ -113,10 +114,10 @@ pub(crate) async fn prepare_namespace(
|
||||
"Seeding ingestion corpus into SurrealDB"
|
||||
);
|
||||
}
|
||||
let indexes_disabled = remove_all_indexes(ctx.db()).await.is_ok();
|
||||
let indexes_disabled = remove_all_indexes(ctx.db()?).await.is_ok();
|
||||
|
||||
let seed_start = Instant::now();
|
||||
corpus::seed_manifest_into_db(ctx.db(), &manifest_for_seed)
|
||||
corpus::seed_manifest_into_db(ctx.db()?, &manifest_for_seed)
|
||||
.await
|
||||
.context("seeding ingestion corpus from manifest")?;
|
||||
namespace_seed_ms = Some(seed_start.elapsed().as_millis());
|
||||
@@ -124,15 +125,15 @@ pub(crate) async fn prepare_namespace(
|
||||
// Recreate indexes AFTER data is loaded (correct bulk loading pattern)
|
||||
if indexes_disabled {
|
||||
info!("Recreating indexes after seeding data");
|
||||
recreate_indexes(ctx.db(), embedding_provider.dimension())
|
||||
recreate_indexes(ctx.db()?, embedding_provider.dimension())
|
||||
.await
|
||||
.context("recreating indexes with correct dimension")?;
|
||||
warm_hnsw_cache(ctx.db(), embedding_provider.dimension()).await?;
|
||||
warm_hnsw_cache(ctx.db()?, embedding_provider.dimension()).await?;
|
||||
}
|
||||
{
|
||||
let slice = ctx.slice();
|
||||
let slice = ctx.slice()?;
|
||||
record_namespace_state(
|
||||
ctx.descriptor(),
|
||||
ctx.descriptor()?,
|
||||
dataset.metadata.id.as_str(),
|
||||
slice.manifest.slice_id.as_str(),
|
||||
expected_fingerprint.as_str(),
|
||||
@@ -145,17 +146,17 @@ pub(crate) async fn prepare_namespace(
|
||||
}
|
||||
|
||||
if ctx.must_reapply_settings {
|
||||
let mut settings = SystemSettings::get_current(ctx.db())
|
||||
let mut settings = SystemSettings::get_current(ctx.db()?)
|
||||
.await
|
||||
.context("reloading system settings after namespace reset")?;
|
||||
settings =
|
||||
enforce_system_settings(ctx.db(), settings, embedding_provider.dimension(), config)
|
||||
enforce_system_settings(ctx.db()?, settings, embedding_provider.dimension(), config)
|
||||
.await?;
|
||||
ctx.settings = Some(settings);
|
||||
ctx.must_reapply_settings = false;
|
||||
}
|
||||
|
||||
let user = ensure_eval_user(ctx.db()).await?;
|
||||
let user = ensure_eval_user(ctx.db()?).await?;
|
||||
ctx.eval_user = Some(user);
|
||||
|
||||
let total_manifest_questions = manifest_for_seed.questions.len();
|
||||
@@ -199,5 +200,5 @@ pub(crate) async fn prepare_namespace(
|
||||
|
||||
machine
|
||||
.prepare_namespace()
|
||||
.map_err(|(_, guard)| map_guard_error("prepare_namespace", guard))
|
||||
.map_err(|(_, guard)| map_guard_error("prepare_namespace", &guard))
|
||||
}
|
||||
|
||||
@@ -68,5 +68,5 @@ pub(crate) async fn prepare_slice(
|
||||
|
||||
machine
|
||||
.prepare_slice()
|
||||
.map_err(|(_, guard)| map_guard_error("prepare_slice", guard))
|
||||
.map_err(|(_, guard)| map_guard_error("prepare_slice", &guard))
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::{collections::HashSet, sync::Arc, time::Instant};
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::{anyhow, Context};
|
||||
use common::storage::types::StoredObject;
|
||||
use futures::stream::{self, StreamExt};
|
||||
use tracing::{debug, info};
|
||||
@@ -21,6 +21,7 @@ use super::super::{
|
||||
};
|
||||
use super::{map_guard_error, StageResult};
|
||||
|
||||
#[allow(clippy::too_many_lines, clippy::arithmetic_side_effects)]
|
||||
pub(crate) async fn run_queries(
|
||||
machine: EvaluationMachine<(), NamespaceReady>,
|
||||
ctx: &mut EvaluationContext<'_>,
|
||||
@@ -37,7 +38,7 @@ pub(crate) async fn run_queries(
|
||||
let slice_settings = ctx
|
||||
.slice_settings
|
||||
.as_ref()
|
||||
.expect("slice settings missing during query stage");
|
||||
.ok_or_else(|| anyhow!("slice settings missing during query stage"))?;
|
||||
let total_cases = ctx.cases.len();
|
||||
let cases_iter = std::mem::take(&mut ctx.cases).into_iter().enumerate();
|
||||
|
||||
@@ -115,9 +116,9 @@ pub(crate) async fn run_queries(
|
||||
chunk_rrf_fts_weight = active_tuning.chunk_rrf_fts_weight,
|
||||
chunk_rrf_use_vector = active_tuning.flags.chunk_rrf_use_vector.as_bool(),
|
||||
chunk_rrf_use_fts = active_tuning.flags.chunk_rrf_use_fts.as_bool(),
|
||||
embedding_backend = ctx.embedding_provider().backend_label(),
|
||||
embedding_backend = ctx.embedding_provider()?.backend_label(),
|
||||
embedding_model = ctx
|
||||
.embedding_provider()
|
||||
.embedding_provider()?
|
||||
.model_code()
|
||||
.as_deref()
|
||||
.unwrap_or("<default>"),
|
||||
@@ -125,11 +126,11 @@ pub(crate) async fn run_queries(
|
||||
);
|
||||
|
||||
let retrieval_config = Arc::new(retrieval_config);
|
||||
ctx.rerank_pool = rerank_pool.clone();
|
||||
ctx.retrieval_config = Some(retrieval_config.clone());
|
||||
ctx.rerank_pool.clone_from(&rerank_pool);
|
||||
ctx.retrieval_config = Some(Arc::clone(&retrieval_config));
|
||||
|
||||
ctx.evaluation_start = Some(Instant::now());
|
||||
let user_id = ctx.evaluation_user().id.clone();
|
||||
let user_id = ctx.evaluation_user()?.id.clone();
|
||||
let concurrency = config.concurrency.max(1);
|
||||
let diagnostics_enabled = ctx.diagnostics_enabled;
|
||||
|
||||
@@ -141,20 +142,20 @@ pub(crate) async fn run_queries(
|
||||
"Starting evaluation with staged query execution"
|
||||
);
|
||||
|
||||
let embedding_provider_for_queries = ctx.embedding_provider().clone();
|
||||
let embedding_provider_for_queries = ctx.embedding_provider()?.clone();
|
||||
let rerank_pool_for_queries = rerank_pool.clone();
|
||||
let db = ctx.db().clone();
|
||||
let openai_client = ctx.openai_client();
|
||||
let db = ctx.db()?.clone();
|
||||
let openai_client = ctx.openai_client()?;
|
||||
|
||||
let raw_results = stream::iter(cases_iter)
|
||||
.map(move |(idx, case)| {
|
||||
let db = db.clone();
|
||||
let openai_client = openai_client.clone();
|
||||
let openai_client = Arc::clone(&openai_client);
|
||||
let user_id = user_id.clone();
|
||||
let retrieval_config = retrieval_config.clone();
|
||||
let retrieval_config = Arc::clone(&retrieval_config);
|
||||
let embedding_provider = embedding_provider_for_queries.clone();
|
||||
let rerank_pool = rerank_pool_for_queries.clone();
|
||||
let semaphore = query_semaphore.clone();
|
||||
let semaphore = Arc::clone(&query_semaphore);
|
||||
let diagnostics_enabled = diagnostics_enabled;
|
||||
|
||||
async move {
|
||||
@@ -374,9 +375,10 @@ pub(crate) async fn run_queries(
|
||||
|
||||
machine
|
||||
.run_queries()
|
||||
.map_err(|(_, guard)| map_guard_error("run_queries", guard))
|
||||
.map_err(|(_, guard)| map_guard_error("run_queries", &guard))
|
||||
}
|
||||
|
||||
#[allow(clippy::arithmetic_side_effects, clippy::cast_precision_loss)]
|
||||
fn calculate_reciprocal_rank(rank: Option<usize>) -> f64 {
|
||||
match rank {
|
||||
Some(r) if r > 0 => 1.0 / (r as f64),
|
||||
@@ -384,6 +386,7 @@ fn calculate_reciprocal_rank(rank: Option<usize>) -> f64 {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::arithmetic_side_effects, clippy::cast_precision_loss)]
|
||||
fn calculate_ndcg(retrieved: &[RetrievedSummary], k: usize) -> f64 {
|
||||
let mut dcg = 0.0;
|
||||
let mut relevant_count = 0;
|
||||
|
||||
@@ -13,6 +13,7 @@ use super::super::{
|
||||
};
|
||||
use super::{map_guard_error, StageResult};
|
||||
|
||||
#[allow(clippy::too_many_lines, clippy::arithmetic_side_effects, clippy::cast_precision_loss)]
|
||||
pub(crate) async fn summarize(
|
||||
machine: EvaluationMachine<(), QueriesFinished>,
|
||||
ctx: &mut EvaluationContext<'_>,
|
||||
@@ -34,8 +35,8 @@ pub(crate) async fn summarize(
|
||||
.unwrap_or_default();
|
||||
let config = ctx.config();
|
||||
let dataset = ctx.dataset();
|
||||
let slice = ctx.slice();
|
||||
let corpus_handle = ctx.corpus_handle();
|
||||
let slice = ctx.slice()?;
|
||||
let corpus_handle = ctx.corpus_handle()?;
|
||||
let total_cases = summaries.len();
|
||||
|
||||
let mut correct = 0usize;
|
||||
@@ -176,7 +177,7 @@ pub(crate) async fn summarize(
|
||||
slice_total_paragraphs: slice.manifest.total_paragraphs,
|
||||
slice_negative_multiplier: slice.manifest.negative_multiplier,
|
||||
namespace_reused: ctx.namespace_reused,
|
||||
corpus_paragraphs: ctx.corpus_handle().manifest.metadata.paragraph_count,
|
||||
corpus_paragraphs: ctx.corpus_handle()?.manifest.metadata.paragraph_count,
|
||||
ingestion_cache_path: corpus_handle.path.display().to_string(),
|
||||
ingestion_reused: corpus_handle.reused_ingestion,
|
||||
ingestion_embeddings_reused: corpus_handle.reused_embeddings,
|
||||
@@ -189,9 +190,9 @@ pub(crate) async fn summarize(
|
||||
negative_paragraphs_reused: corpus_handle.negative_reused,
|
||||
latency_ms: latency_stats,
|
||||
perf: perf_timings,
|
||||
embedding_backend: ctx.embedding_provider().backend_label().to_string(),
|
||||
embedding_model: ctx.embedding_provider().model_code(),
|
||||
embedding_dimension: ctx.embedding_provider().dimension(),
|
||||
embedding_backend: ctx.embedding_provider()?.backend_label().to_string(),
|
||||
embedding_model: ctx.embedding_provider()?.model_code(),
|
||||
embedding_dimension: ctx.embedding_provider()?.dimension(),
|
||||
rerank_enabled: config.retrieval.rerank,
|
||||
rerank_pool_size: ctx
|
||||
.rerank_pool
|
||||
@@ -228,5 +229,5 @@ pub(crate) async fn summarize(
|
||||
|
||||
machine
|
||||
.summarize()
|
||||
.map_err(|(_, guard)| map_guard_error("summarize", guard))
|
||||
.map_err(|(_, guard)| map_guard_error("summarize", &guard))
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user