mirror of
https://github.com/perstarkse/minne.git
synced 2026-06-30 01:51:43 +02:00
e9d8654324
Collapse the multi-strategy entity engine into one benchmarked chunk retrieval path, derive entities from retrieved chunks, and update consumers, docs, and clippy fixes across the workspace.
499 lines
15 KiB
Rust
499 lines
15 KiB
Rust
use std::{
|
|
env,
|
|
path::{Path, PathBuf},
|
|
};
|
|
|
|
use anyhow::{anyhow, Context, Result};
|
|
use clap::{Args, Parser, ValueEnum};
|
|
|
|
use crate::datasets::DatasetKind;
|
|
|
|
fn workspace_root() -> PathBuf {
|
|
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
|
manifest_dir.parent().unwrap_or(&manifest_dir).to_path_buf()
|
|
}
|
|
|
|
fn default_report_dir() -> PathBuf {
|
|
workspace_root().join("evaluations/reports")
|
|
}
|
|
|
|
fn default_cache_dir() -> PathBuf {
|
|
workspace_root().join("evaluations/cache")
|
|
}
|
|
|
|
fn default_ingestion_cache_dir() -> PathBuf {
|
|
workspace_root().join("evaluations/cache/ingested")
|
|
}
|
|
|
|
pub const DEFAULT_SLICE_SEED: u64 = 0x5eed_2025;
|
|
|
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Default)]
|
|
#[value(rename_all = "lowercase")]
|
|
pub enum EmbeddingBackend {
|
|
Hashed,
|
|
#[default]
|
|
FastEmbed,
|
|
}
|
|
|
|
impl std::fmt::Display for EmbeddingBackend {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
match self {
|
|
Self::Hashed => write!(f, "hashed"),
|
|
Self::FastEmbed => write!(f, "fastembed"),
|
|
}
|
|
}
|
|
}
|
|
|
|
#[derive(Debug, Clone, Args)]
|
|
pub struct RetrievalSettings {
|
|
/// Override chunk vector candidate cap
|
|
#[arg(long)]
|
|
pub chunk_vector_take: Option<usize>,
|
|
|
|
/// Override chunk FTS candidate cap
|
|
#[arg(long)]
|
|
pub chunk_fts_take: Option<usize>,
|
|
|
|
/// Override maximum chunks attached per entity
|
|
#[arg(long)]
|
|
pub max_chunks_per_entity: Option<usize>,
|
|
|
|
/// Enable the `FastEmbed` reranking stage
|
|
#[arg(long = "rerank", action = clap::ArgAction::SetTrue, default_value_t = false)]
|
|
pub rerank: bool,
|
|
|
|
/// Reranking engine pool size / parallelism
|
|
#[arg(long, default_value_t = 4)]
|
|
pub rerank_pool_size: usize,
|
|
|
|
/// Keep top-N chunks after reranking
|
|
#[arg(long, default_value_t = 10)]
|
|
pub rerank_keep_top: usize,
|
|
|
|
/// Cap the number of chunks returned by retrieval
|
|
#[arg(long, default_value_t = 5)]
|
|
pub chunk_result_cap: usize,
|
|
|
|
/// Reciprocal rank fusion k value for chunk merging
|
|
#[arg(long)]
|
|
pub chunk_rrf_k: Option<f32>,
|
|
|
|
/// Weight for vector ranks in RRF
|
|
#[arg(long)]
|
|
pub chunk_rrf_vector_weight: Option<f32>,
|
|
|
|
/// Weight for chunk FTS ranks in RRF
|
|
#[arg(long)]
|
|
pub chunk_rrf_fts_weight: Option<f32>,
|
|
|
|
/// Include vector ranks in RRF (default: true)
|
|
#[arg(long)]
|
|
pub chunk_rrf_use_vector: Option<bool>,
|
|
|
|
/// Include chunk FTS ranks in RRF (default: true)
|
|
#[arg(long)]
|
|
pub chunk_rrf_use_fts: Option<bool>,
|
|
|
|
/// Require verified chunks (disable with --llm-mode)
|
|
#[arg(skip = true)]
|
|
pub require_verified_chunks: bool,
|
|
}
|
|
|
|
impl Default for RetrievalSettings {
|
|
fn default() -> Self {
|
|
Self {
|
|
chunk_vector_take: None,
|
|
chunk_fts_take: None,
|
|
max_chunks_per_entity: None,
|
|
rerank: false,
|
|
rerank_pool_size: 4,
|
|
rerank_keep_top: 10,
|
|
chunk_result_cap: 5,
|
|
chunk_rrf_k: None,
|
|
chunk_rrf_vector_weight: None,
|
|
chunk_rrf_fts_weight: None,
|
|
chunk_rrf_use_vector: None,
|
|
chunk_rrf_use_fts: None,
|
|
require_verified_chunks: true,
|
|
}
|
|
}
|
|
}
|
|
|
|
#[derive(Debug, Clone, Args)]
|
|
pub struct IngestConfig {
|
|
/// Directory for ingestion corpora caches
|
|
#[arg(long, default_value_os_t = default_ingestion_cache_dir())]
|
|
pub ingestion_cache_dir: PathBuf,
|
|
|
|
/// Minimum tokens per chunk for ingestion
|
|
#[arg(long, default_value_t = 256)]
|
|
pub ingest_chunk_min_tokens: usize,
|
|
|
|
/// Maximum tokens per chunk for ingestion
|
|
#[arg(long, default_value_t = 512)]
|
|
pub ingest_chunk_max_tokens: usize,
|
|
|
|
/// Overlap between chunks during ingestion (tokens)
|
|
#[arg(long, default_value_t = 50)]
|
|
pub ingest_chunk_overlap_tokens: usize,
|
|
|
|
/// Run ingestion in chunk-only mode (skip analyzer/graph generation)
|
|
#[arg(long)]
|
|
pub ingest_chunks_only: bool,
|
|
|
|
/// Number of paragraphs to ingest concurrently
|
|
#[arg(long, default_value_t = 10)]
|
|
pub ingestion_batch_size: usize,
|
|
|
|
/// Maximum retries for ingestion failures per paragraph
|
|
#[arg(long, default_value_t = 3)]
|
|
pub ingestion_max_retries: usize,
|
|
|
|
/// Recompute embeddings for cached corpora without re-running ingestion
|
|
#[arg(long, alias = "refresh-embeddings")]
|
|
pub refresh_embeddings_only: bool,
|
|
|
|
/// Delete cached paragraph shards before rebuilding the ingestion corpus
|
|
#[arg(long)]
|
|
pub slice_reset_ingestion: bool,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Args)]
|
|
pub struct DatabaseArgs {
|
|
/// `SurrealDB` server endpoint
|
|
#[arg(long, default_value = "ws://127.0.0.1:8000", env = "EVAL_DB_ENDPOINT")]
|
|
pub db_endpoint: String,
|
|
|
|
/// `SurrealDB` root username
|
|
#[arg(long, default_value = "root_user", env = "EVAL_DB_USERNAME")]
|
|
pub db_username: String,
|
|
|
|
/// `SurrealDB` root password
|
|
#[arg(long, default_value = "root_password", env = "EVAL_DB_PASSWORD")]
|
|
pub db_password: String,
|
|
|
|
/// Override the namespace used on the `SurrealDB` server
|
|
#[arg(long, env = "EVAL_DB_NAMESPACE")]
|
|
pub db_namespace: Option<String>,
|
|
|
|
/// Override the database used on the `SurrealDB` server
|
|
#[arg(long, env = "EVAL_DB_DATABASE")]
|
|
pub db_database: Option<String>,
|
|
|
|
/// Path to inspect DB state
|
|
#[arg(long)]
|
|
pub inspect_db_state: Option<PathBuf>,
|
|
}
|
|
|
|
#[derive(Parser, Debug, Clone)]
|
|
#[command(author, version, about, long_about = None)]
|
|
#[allow(clippy::struct_excessive_bools)]
|
|
pub struct Config {
|
|
/// Convert the selected dataset and exit
|
|
#[arg(long)]
|
|
pub convert_only: bool,
|
|
|
|
/// Regenerate the converted dataset even if it already exists
|
|
#[arg(long, alias = "refresh")]
|
|
pub force_convert: bool,
|
|
|
|
/// Dataset to evaluate
|
|
#[arg(long, default_value_t = DatasetKind::default())]
|
|
pub dataset: DatasetKind,
|
|
|
|
/// Enable LLM-assisted evaluation features (includes unanswerable cases)
|
|
#[arg(long)]
|
|
pub llm_mode: bool,
|
|
|
|
/// Cap the slice corpus size (positives + negatives)
|
|
#[arg(long)]
|
|
pub corpus_limit: Option<usize>,
|
|
|
|
/// Path to the raw dataset (defaults per dataset)
|
|
#[arg(long)]
|
|
pub raw: Option<PathBuf>,
|
|
|
|
/// Path to write/read the converted dataset (defaults per dataset)
|
|
#[arg(long)]
|
|
pub converted: Option<PathBuf>,
|
|
|
|
/// Directory to write evaluation reports
|
|
#[arg(long, default_value_os_t = default_report_dir())]
|
|
pub report_dir: PathBuf,
|
|
|
|
/// Precision@k cutoff
|
|
#[arg(long, default_value_t = 5)]
|
|
pub k: usize,
|
|
|
|
/// Limit the number of questions evaluated (0 = all)
|
|
#[arg(long = "limit", default_value_t = 200)]
|
|
pub limit_arg: usize,
|
|
|
|
/// Number of mismatches to surface in the Markdown summary
|
|
#[arg(long, default_value_t = 5)]
|
|
pub sample: usize,
|
|
|
|
/// Disable context cropping when converting datasets (ingest entire documents)
|
|
#[arg(long)]
|
|
pub full_context: bool,
|
|
|
|
#[command(flatten)]
|
|
pub retrieval: RetrievalSettings,
|
|
|
|
/// Concurrency level
|
|
#[arg(long, default_value_t = 1)]
|
|
pub concurrency: usize,
|
|
|
|
/// Embedding backend
|
|
#[arg(long, default_value_t = EmbeddingBackend::FastEmbed)]
|
|
pub embedding_backend: EmbeddingBackend,
|
|
|
|
/// `FastEmbed` model code
|
|
#[arg(long)]
|
|
pub embedding_model: Option<String>,
|
|
|
|
/// Directory for embedding caches
|
|
#[arg(long, default_value_os_t = default_cache_dir())]
|
|
pub cache_dir: PathBuf,
|
|
|
|
#[command(flatten)]
|
|
pub ingest: IngestConfig,
|
|
|
|
/// Include entity descriptions and categories in JSON reports
|
|
#[arg(long)]
|
|
pub detailed_report: bool,
|
|
|
|
/// Use a cached dataset slice by id or path
|
|
#[arg(long)]
|
|
pub slice: Option<String>,
|
|
|
|
/// Ignore cached corpus state and rebuild the slice's `SurrealDB` corpus
|
|
#[arg(long)]
|
|
pub reseed_slice: bool,
|
|
|
|
/// Slice seed
|
|
#[arg(skip = DEFAULT_SLICE_SEED)]
|
|
pub slice_seed: u64,
|
|
|
|
/// Grow the slice ledger to contain at least this many answerable cases, then exit
|
|
#[arg(long)]
|
|
pub slice_grow: Option<usize>,
|
|
|
|
/// Evaluate questions starting at this offset within the slice
|
|
#[arg(long, default_value_t = 0)]
|
|
pub slice_offset: usize,
|
|
|
|
/// Target negative-to-positive paragraph ratio for slice growth
|
|
#[arg(long, default_value_t = crate::slice::DEFAULT_NEGATIVE_MULTIPLIER)]
|
|
pub negative_multiplier: f32,
|
|
|
|
/// Annotate the run; label is stored in JSON/Markdown reports
|
|
#[arg(long)]
|
|
pub label: Option<String>,
|
|
|
|
/// Write per-query chunk diagnostics JSONL to the provided path
|
|
#[arg(long, alias = "chunk-diagnostics")]
|
|
pub chunk_diagnostics_path: Option<PathBuf>,
|
|
|
|
/// Inspect an ingestion cache question and exit
|
|
#[arg(long)]
|
|
pub inspect_question: Option<String>,
|
|
|
|
/// Path to an ingestion cache manifest JSON for inspection mode
|
|
#[arg(long)]
|
|
pub inspect_manifest: Option<PathBuf>,
|
|
|
|
/// Override the `SurrealDB` system settings query model
|
|
#[arg(long)]
|
|
pub query_model: Option<String>,
|
|
|
|
/// Write structured performance telemetry JSON to the provided path
|
|
#[arg(long)]
|
|
pub perf_log_json: Option<PathBuf>,
|
|
|
|
/// Directory that receives timestamped perf JSON copies
|
|
#[arg(long)]
|
|
pub perf_log_dir: Option<PathBuf>,
|
|
|
|
/// Print per-stage performance timings to stdout after the run
|
|
#[arg(long, alias = "perf-log")]
|
|
pub perf_log_console: bool,
|
|
|
|
#[command(flatten)]
|
|
pub database: DatabaseArgs,
|
|
|
|
// Computed fields (not arguments)
|
|
#[arg(skip)]
|
|
pub raw_dataset_path: PathBuf,
|
|
#[arg(skip)]
|
|
pub converted_dataset_path: PathBuf,
|
|
#[arg(skip)]
|
|
pub limit: Option<usize>,
|
|
#[arg(skip)]
|
|
pub summary_sample: usize,
|
|
}
|
|
|
|
impl Config {
|
|
#[allow(clippy::unused_self)]
|
|
pub fn context_token_limit(&self) -> Option<usize> {
|
|
None
|
|
}
|
|
|
|
#[allow(clippy::too_many_lines)]
|
|
pub fn finalize(&mut self) -> Result<()> {
|
|
// Handle dataset paths
|
|
if let Some(raw) = &self.raw {
|
|
self.raw_dataset_path = raw.clone();
|
|
} else {
|
|
self.raw_dataset_path = self.dataset.default_raw_path();
|
|
}
|
|
|
|
if let Some(converted) = &self.converted {
|
|
self.converted_dataset_path = converted.clone();
|
|
} else {
|
|
self.converted_dataset_path = self.dataset.default_converted_path();
|
|
}
|
|
|
|
// Handle limit
|
|
if self.limit_arg == 0 {
|
|
self.limit = None;
|
|
} else {
|
|
self.limit = Some(self.limit_arg);
|
|
}
|
|
|
|
// Handle sample
|
|
self.summary_sample = self.sample.max(1);
|
|
|
|
// Handle retrieval settings
|
|
self.retrieval.require_verified_chunks = !self.llm_mode;
|
|
|
|
if self.dataset == DatasetKind::Beir {
|
|
self.negative_multiplier = 9.0;
|
|
}
|
|
|
|
// Validations
|
|
if self.ingest.ingest_chunk_min_tokens == 0
|
|
|| self.ingest.ingest_chunk_min_tokens >= self.ingest.ingest_chunk_max_tokens
|
|
{
|
|
return Err(anyhow!(
|
|
"--ingest-chunk-min-tokens must be greater than zero and less than --ingest-chunk-max-tokens (got {} >= {})",
|
|
self.ingest.ingest_chunk_min_tokens,
|
|
self.ingest.ingest_chunk_max_tokens
|
|
));
|
|
}
|
|
|
|
if self.ingest.ingest_chunk_overlap_tokens >= self.ingest.ingest_chunk_min_tokens {
|
|
return Err(anyhow!(
|
|
"--ingest-chunk-overlap-tokens ({}) must be less than --ingest-chunk-min-tokens ({})",
|
|
self.ingest.ingest_chunk_overlap_tokens,
|
|
self.ingest.ingest_chunk_min_tokens
|
|
));
|
|
}
|
|
|
|
if self.retrieval.rerank && self.retrieval.rerank_pool_size == 0 {
|
|
return Err(anyhow!(
|
|
"--rerank-pool must be greater than zero when reranking is enabled"
|
|
));
|
|
}
|
|
|
|
if let Some(k) = self.retrieval.chunk_rrf_k {
|
|
if k <= 0.0 || !k.is_finite() {
|
|
return Err(anyhow!(
|
|
"--chunk-rrf-k must be a positive, finite number (got {k})"
|
|
));
|
|
}
|
|
}
|
|
if let Some(weight) = self.retrieval.chunk_rrf_vector_weight {
|
|
if weight < 0.0 || !weight.is_finite() {
|
|
return Err(anyhow!(
|
|
"--chunk-rrf-vector-weight must be a non-negative, finite number (got {weight})"
|
|
));
|
|
}
|
|
}
|
|
if let Some(weight) = self.retrieval.chunk_rrf_fts_weight {
|
|
if weight < 0.0 || !weight.is_finite() {
|
|
return Err(anyhow!(
|
|
"--chunk-rrf-fts-weight must be a non-negative, finite number (got {weight})"
|
|
));
|
|
}
|
|
}
|
|
|
|
if self.concurrency == 0 {
|
|
return Err(anyhow!("--concurrency must be greater than zero"));
|
|
}
|
|
|
|
if self.embedding_backend == EmbeddingBackend::Hashed && self.embedding_model.is_some() {
|
|
return Err(anyhow!(
|
|
"--embedding-model cannot be used with the 'hashed' embedding backend"
|
|
));
|
|
}
|
|
|
|
if let Some(query_model) = &self.query_model {
|
|
if query_model.trim().is_empty() {
|
|
return Err(anyhow!("--query-model requires a non-empty model name"));
|
|
}
|
|
}
|
|
|
|
if let Some(grow) = self.slice_grow {
|
|
if grow == 0 {
|
|
return Err(anyhow!("--slice-grow must be greater than zero"));
|
|
}
|
|
}
|
|
|
|
if self.negative_multiplier <= 0.0 || !self.negative_multiplier.is_finite() {
|
|
return Err(anyhow!(
|
|
"--negative-multiplier must be a positive finite number"
|
|
));
|
|
}
|
|
|
|
// Handle corpus limit logic
|
|
if let Some(limit) = self.limit {
|
|
if let Some(corpus_limit) = self.corpus_limit {
|
|
if corpus_limit < limit {
|
|
self.corpus_limit = Some(limit);
|
|
}
|
|
} else {
|
|
let default_multiplier = 10usize;
|
|
let mut computed = limit.saturating_mul(default_multiplier);
|
|
if computed < limit {
|
|
computed = limit;
|
|
}
|
|
let max_cap = 1_000usize;
|
|
if computed > max_cap {
|
|
computed = max_cap;
|
|
}
|
|
self.corpus_limit = Some(computed);
|
|
}
|
|
}
|
|
|
|
// Handle perf log dir env var fallback
|
|
if self.perf_log_dir.is_none() {
|
|
if let Ok(dir) = env::var("EVAL_PERF_LOG_DIR") {
|
|
if !dir.trim().is_empty() {
|
|
self.perf_log_dir = Some(PathBuf::from(dir));
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
pub struct ParsedArgs {
|
|
pub config: Config,
|
|
}
|
|
|
|
pub fn parse() -> Result<ParsedArgs> {
|
|
let mut config = Config::parse();
|
|
config.finalize()?;
|
|
Ok(ParsedArgs { config })
|
|
}
|
|
|
|
pub fn ensure_parent(path: &Path) -> Result<()> {
|
|
if let Some(parent) = path.parent() {
|
|
std::fs::create_dir_all(parent)
|
|
.with_context(|| format!("creating parent directory for {}", path.display()))?;
|
|
}
|
|
Ok(())
|
|
}
|